#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <ctype.h>
#include <malloc.h>
#include <time.h>
#include <limits.h>
#include <libgen.h>
#include "fdr_pwm.h"
#include "fdr_defines.h"
#include "alloc.h"

/*
 * fdrMotif command-line driver.
 *
 * Parses the options, reads the input (motif) sequences and the starting
 * PWM(s), prepares the background sequences (generated by background() or
 * taken from the -fb file), then refines every starting PWM in turn: score
 * all subsequences, pick the top ones, compute the FDR bound, rebuild the
 * PWM, and repeat until the PWM stops changing and the bound is met or the
 * iteration limit is reached. Accepted motifs are written to
 * <outputDir>/<pwm>_fdrMotif.mx and <outputDir>/<pwm>_fdrMotif.txt, a run
 * log goes to the -fo file and a one-line-per-motif table to the -fM file.
 *
 * Exit status: EXIT_SUCCESS after a complete run, EXIT_FAILURE on invalid
 * options or when a required file cannot be used. The usage screen (shown
 * when fewer than four arguments are given) exits with status 0.
 */
int main(int argc,char **argv) {
   
   register int ii,i,j;

   enum { nameBufLen = 500 };      // capacity of the heap-allocated file-name buffers

   char *motifFileName;            // motif sequences file name
   int numSeq1,*seqLen1;           // motif sequences
   char **seq1,**rseq1;            // motif sequences
   double **score1,**rscore1;      // subsequence score for motif sequences
   int maxSeqLen1,aveSeqLen1;      // motif sequences
   int totalTopSubseq1;            // number of subsequences, at most Cmax per sequence
   Subseq *topSubSeq1;             // all highest scoring subsequences defined above
   int *numSitePerSeq1;            // number of binding sites per seq
   int numSite1;                   // total number of binding sites
   int minSites;                   // minimal number of sites in a motif
   int minSitesGiven;              // set once -minN has been seen, so -fm does not override it
   int seqHavingSite1;             // number of sequences having at least one site
   double *bfreq;                  // base frequencies

   int nonMotifSeqProvided;        // indicator if seq2 file provided or not
   char *nonMotifFileName;         // non-motif sequences file name
   int numSeq2,*seqLen2;           // non-motif sequences 
   int bMarkovOrder;               // background markov order
   char **seq2;                    // non-motif sequences

   int numSeq0,*seqLen0;           // background sequences from either seq2 or Markov sequences from seq1 or seq2
   char **seq0,**rseq0;            // background sequences
   double **score0,**rscore0;      // subsequence score for backg sequences
   int maxSeqLen0,aveSeqLen0;      // background sequences
   int totalTopSubseq0;            // number of subsequences, at most Cmax per sequence
   Subseq *topSubSeq0;             // all highest scoring subsequences defined above
   int *numSitePerSeq0;            // number of binding sites per seq
   double numSite0;                // average number of binding sites in backg sequences
   int seqHavingSite0;             // number of sequences having at least one site
   int numBackgSets;               // number of sets of backg sequences
 
   int Cmax;                       // maximal number of binding sites per sequence
   
   char OutputDIR[PATH_MAX];       // Holds the output directory for all files - determined from parameter
   char chScratch[PATH_MAX];       // scratch copy of the output name; dirname() may modify its argument
   
   int numPWM;                     // number of input starting pwms
   int *pwmLen;                    // lengths of these pwms
   double ***originalPWM;          // input starting pwms;
   double **tmpPWM;                // standardized pwm
   double **t1PWM;                 // pwm at (t)th iteration
   double **t2PWM;                 // pwm at (t+1)th iteration
   double maxPWMdiff;              // maximal difference between two pwms
   double **opwm;                  // observed pwm from final aligned binding sites - for e value calc.
   
   double specifiedFDR;            // specified FDR bound
   double actualBound;             // computed FDR bound
   double tmpFDR;                  // working FDR bound, relaxed and then tightened again
   double convergenceRate;         // pwm convergence criterion

   char *oconsensus,*fconsensus;   // starting and converged PWM consensuses 
   int motifCn;                    // number of motifs accepted so far
   double m,e;                     // evalue - two components
   Motifs *motif;

   // others
   int maxNumIteration;
   int numTry,seq2cn,numSite1ForNorm;
   char *oFileName,foundRequiredFile[3],*motifName;
   char *motifSummaryFileName;
   char **geneID,**pwmName;
   int numBase,iterationCn,goodArgument,generateMarkovBackg; 
   size_t stemLen;                 // length of a pwm name without its 3-character extension
   seed_t seed;
   FILE *fp,*f1,*f2;
   time_t t;

   if (argc<5) {
      printf("\n\nUSAGE:\n\n");
      printf("   ./fdrMotif -fm inputSeqFile -fpwm initialPWM(s) [optional arguments]\n\n");
  
      printf("  -fm      inputSeqFile       Input (motif) sequences file name, e.g., ChIP file\n");
      printf("  -fpwm    initialPWM(s)      Starting PWM(s) (e.g., your_start.mx or *.mx)\n");
      printf("\n\n  Optional arguments\n\n");
      printf("  -fdrb    FDRbound           Upper bound of false discovery rate (default: 0.1)\n");
      printf("  -g       maxNumIteration    Maximal number of iterations (default: 100)\n");
      printf("  -nbsets  NumBackgSets       Number sets of background sequences (chose 5-10) (default: 10\n");
      printf("  -Cmax    maxNumSites        Maximal number of occurrences of a motif per sequence (default: 10)\n");
      printf("  -bOrder  bMarkovOrder       User-specified background Markov order (default: 0)\n");
      printf("  -fb      nonMotifSeqFile    Non-motif sequences for backg or estimating backg Markov model (default: input seqs\n");
      printf("  -Imb     indicator          Indicator for generate Markov background sequences (1 - yes) (0 - no) (default: 1)\n");
      printf("  -minN    minSites           minimal number of sites in a motif (default: #bases/5000\n");
      printf("  -fo      outputFile         Name of the output file (default: fdrMotif.txt)\n\n");
      printf("  -fM      motifSummaryFile   Name of the motif summary file (default: fdrMotifSummary.txt)\n\n");
      exit(0);
   }
   motifFileName=alloc_char(nameBufLen);
   nonMotifFileName=alloc_char(nameBufLen);
   oFileName=alloc_char(nameBufLen);
   motifSummaryFileName=alloc_char(nameBufLen);

   seqLen1=alloc_int(MAX_NUM_SEQ);
   geneID=alloc_char_char(MAX_NUM_SEQ,500);
   bfreq=alloc_double(4);
   
   // default settings
   maxNumIteration=100; specifiedFDR=0.1; numBackgSets=10; Cmax=10; bMarkovOrder=0;
   minSites=-1; minSitesGiven=0;
   nonMotifSeqProvided=0; generateMarkovBackg=1;
   snprintf(oFileName,nameBufLen,"%s","fdrMotif.txt");
   snprintf(motifSummaryFileName,nameBufLen,"%s","fdrMotifSummary.txt");
   motifFileName[0]='\0';
   nonMotifFileName[0]='\0';

   // everything that is only set by an optional argument starts in a known state
   seq1=NULL; numSeq1=0; aveSeqLen1=0;
   seq2=NULL; seqLen2=NULL; numSeq2=0; aveSeqLen0=0;
   pwmName=NULL; pwmLen=NULL; originalPWM=NULL; numPWM=0;

   for (ii=0;  ii<2; ii++) foundRequiredFile[ii]='0';

   // first pass: reject anything that looks like an option but is not a known one
   for (ii=1;  ii<argc; ii++) {
      // ctype functions need an unsigned char value
      if (argv[ii][0]=='-' && isalpha((unsigned char)argv[ii][1])) {
         goodArgument=0;
         if (strcmp(argv[ii],"-g")==0      ||
             strcmp(argv[ii],"-fm")==0     ||
             strcmp(argv[ii],"-fb")==0     ||
             strcmp(argv[ii],"-fo")==0     ||
             strcmp(argv[ii],"-fM")==0     ||
             strcmp(argv[ii],"-Imb")==0    ||
             strcmp(argv[ii],"-fpwm")==0   ||
             strcmp(argv[ii],"-fdrb")==0   ||
             strcmp(argv[ii],"-Cmax")==0   ||
             strcmp(argv[ii],"-minN")==0   ||
             strcmp(argv[ii],"-bOrder")==0 ||
             strcmp(argv[ii],"-nbsets")==0) goodArgument=1;
         
         if (!goodArgument) { printf("argument: %s unknown\n",argv[ii]); exit(EXIT_FAILURE);  }
      }
   }

   // second pass: act on each option; argv[argc] is NULL, so argv[ii+1] is always readable
   for (ii=1;  ii<argc; ii++) {
      if (strcmp(argv[ii],"-fm")==0 && argv[ii+1]!=NULL) {
         foundRequiredFile[0]='1';
         snprintf(motifFileName,nameBufLen,"%s",argv[ii+1]);
         // read input (motif) sequence file
         printf("\nReading input (motif) sequences file...");
         seq1=read_seq(&numSeq1,seqLen1,geneID,MAX_NUM_SEQ,MAX_SEQ_LENGTH,argv[ii+1]);
         if (numSeq1<=0) {
            printf("\nNo sequences could be read from %s\n",argv[ii+1]); exit(EXIT_FAILURE);
         }
         numBase=count_base(numSeq1,seqLen1,seq1);
         aveSeqLen1=(int)((double)numBase/(double)numSeq1);
         printf("done\n");
         printf("Number of sequences: %5d average length: %5d\n\n",numSeq1,aveSeqLen1);
         // default only; an explicit -minN wins regardless of argument order
         if (!minSitesGiven) minSites=(int)(numBase/5000);
      }
      else if (strcmp(argv[ii],"-fpwm")==0 && argv[ii+1]!=NULL) {
         foundRequiredFile[1]='1';
         pwmName=alloc_char_char(argc-1,200);
         pwmLen=alloc_int(argc-1);
         // read_initial_pwm receives the whole argument vector and fills name/length/count
         originalPWM=read_initial_pwm(pwmLen,pwmName,&numPWM,argc,argv);
      }
      else if (strcmp(argv[ii],"-fb")==0 && argv[ii+1]!=NULL) {
         printf("\nReading non-motif sequences file...");
         seqLen2=alloc_int(MAX_NUM_BSEQ);
         seq2=read_seq0(&numSeq2,seqLen2,MAX_NUM_BSEQ,MAX_SEQ_LENGTH,argv[ii+1]);
         if (numSeq2<=0) {
            printf("\nNo sequences could be read from %s\n",argv[ii+1]); exit(EXIT_FAILURE);
         }
         numBase=count_base(numSeq2,seqLen2,seq2);
         aveSeqLen0=(int)((double)numBase/(double)numSeq2);
         printf("Number of sequences: %5d average length: %5d\n",numSeq2,aveSeqLen0);
         nonMotifSeqProvided=1;
         snprintf(nonMotifFileName,nameBufLen,"%s",argv[ii+1]);
      }
      else if (strcmp(argv[ii],"-g")==0      && argv[ii+1]!=NULL) maxNumIteration=atoi(argv[ii+1]);
      else if (strcmp(argv[ii],"-fo")==0     && argv[ii+1]!=NULL) snprintf(oFileName,nameBufLen,"%s",argv[ii+1]);
      else if (strcmp(argv[ii],"-Imb")==0    && argv[ii+1]!=NULL) generateMarkovBackg=atoi(argv[ii+1]);
      else if (strcmp(argv[ii],"-fdrb")==0   && argv[ii+1]!=NULL) specifiedFDR=atof(argv[ii+1]);
      else if (strcmp(argv[ii],"-Cmax")==0   && argv[ii+1]!=NULL) Cmax=atoi(argv[ii+1]);
      else if (strcmp(argv[ii],"-minN")==0   && argv[ii+1]!=NULL) { minSites=atoi(argv[ii+1]); minSitesGiven=1; }
      else if (strcmp(argv[ii],"-nbsets")==0 && argv[ii+1]!=NULL) numBackgSets=atoi(argv[ii+1]);
      else if (strcmp(argv[ii],"-bOrder")==0 && argv[ii+1]!=NULL) bMarkovOrder=atoi(argv[ii+1]);
      else if (strcmp(argv[ii],"-fM")==0     && argv[ii+1]!=NULL) {
         snprintf(motifSummaryFileName,nameBufLen,"%s",argv[ii+1]);
      }
      else { }
   }
   if (foundRequiredFile[0]=='0') {
      printf("\nPlease specify file containing sequences to be searched with the -fm argument.\n"); exit(EXIT_FAILURE);
   }
   if (foundRequiredFile[1]=='0') {
      printf("\nPlease specify file(s) containing starting pwm(s) with the -fpwm argument.\n"); exit(EXIT_FAILURE);
   }

   if ((bMarkovOrder < 0) || (bMarkovOrder > 5)) {
      printf("The Markov order must lie between 0 and 5, inclusive\n");
      exit(EXIT_FAILURE);
   } 

   if (maxNumIteration<=0) {
      printf("\nMaximal number of iteration must be >0\n"); exit(EXIT_FAILURE);
   }
   if (specifiedFDR<=0) {
      printf("\nError: False Discovery Rate must be non-zero\n"); exit(EXIT_FAILURE);
   }
   if (numBackgSets<=0) {
      printf("\nError: Number sets of background sequences must not be zero\n"); exit(EXIT_FAILURE);
   }
   if (Cmax<=0) {
      printf("\nMaximal number of binding sites in a single sequence can not be zero\n"); exit(EXIT_FAILURE);
   }
   // without generated backgrounds the -fb sequences are the only possible source
   if (generateMarkovBackg!=1 && !nonMotifSeqProvided) {
      printf("\nError: non-motif sequences (-fb) are required when Markov background generation is switched off\n");
      exit(EXIT_FAILURE);
   }
   if (numBackgSets<=2) {
      printf("\nWarning: the number of background sequence sets [%d] may be too small.\n",numBackgSets);
      printf("The result may not be stable. We recommend at least 5 sets (default is 10)\n\n");   
   }
   if (Cmax<10) {
      printf("\nNote: there might be more than 10 occurrences of a motif in\n"); 
      printf("any of the sequences, one should set Cmax larger accordingly.\n\n"); 
   }

   numSeq0=numBackgSets*numSeq1;
   seqLen0=alloc_int(numSeq0);
   
   // generate many sets of background sequences from a Markov model
   if (generateMarkovBackg==1) {
      //seed=time(0); 
      seed=123456789;    // fixed seed: repeated runs generate the same background sequences
      sgenrand(seed);

      printf("\nGenerating background sequences...");

      if (!nonMotifSeqProvided) {
         printf("from Markov model that is estimated from input sequences[%s]\n\n",motifFileName);
         seq0=background(numSeq1,seq1,seqLen1,aveSeqLen1,numBackgSets,numBackgSets*numSeq1,bfreq,bMarkovOrder);
      }
      else { 
         printf("from Markov model that is estimated from non-motif sequences[%s]\n\n",nonMotifFileName);
         seq0=background(numSeq2,seq2,seqLen2,aveSeqLen1,numBackgSets,numBackgSets*numSeq1, bfreq,bMarkovOrder);
      }
      for (i=0;  i<numSeq0; i++) seqLen0[i]=aveSeqLen1; 
   }
   else {
      printf("\nUsing non-motif sequences as the background sequences\n\n");
      // only non-motif sequences at least as long as the average input sequence are usable
      seq2cn=0;
      for (i=0; i<numSeq2; i++) {
         if (seqLen2[i]>=aveSeqLen1) seq2cn++; 
      }
      if (seq2cn<numBackgSets*numSeq1) {
         printf("\n\nNumber of nom-motif sequences of enough length: %d\n",seq2cn);
         printf("Not enough to split into %d set of %d backg sequences\n",numBackgSets,numSeq1); 
         exit(EXIT_FAILURE);
      }
      seq0=alloc_char_char(numSeq0,MAX_SEQ_LENGTH+1);
      seq2cn=0;
      for (i=0; i<numSeq2; i++) {
         if (seqLen2[i]>=aveSeqLen1) {
            // keep the first aveSeqLen1 bases so every background sequence has the same length
            strncpy(seq0[seq2cn], seq2[i], (size_t) aveSeqLen1); 
            seq0[seq2cn][aveSeqLen1]='\0'; 
            seqLen0[seq2cn]=aveSeqLen1;  
            seq2cn++;
            if (seq2cn==numSeq0) break;
         }
      }
      base_frequency(seq0,numSeq0,seqLen0,bfreq);
   }
   printf("\nMaximal number of iterations:\t\t\t\t\t%d\n",maxNumIteration);
   printf("Upper bound of FDR is controlled at:\t\t\t\t%5.4f\n",specifiedFDR);
   printf("Number sets of background sequences:\t\t\t\t%d\n",numBackgSets);
   printf("Number of sequences in each set:\t\t\t\t%d\n",numSeq0/numBackgSets);
   printf("Maximal number of binding sites in a single sequence:\t\t%d\n",Cmax);
   printf("Minimal number of binding sites in a motif\t\t\t%d\n\n",minSites);

   // find the longest sequence
   maxSeqLen1=0;
   for(i=0; i<numSeq1; i++) { 
      if(seqLen1[i]>maxSeqLen1) maxSeqLen1=seqLen1[i];
   } 
   maxSeqLen0=0;
   for(i=0; i<numSeq0; i++) { 
      if(seqLen0[i]>maxSeqLen0) maxSeqLen0=seqLen0[i];
   } 

   // reverse input and background sequences
   // NOTE(review): one spare byte per row in case reverse_seq NUL-terminates its output - confirm
   rseq1=alloc_char_char(numSeq1,maxSeqLen1+1);
   rseq0=alloc_char_char(numSeq0,maxSeqLen0+1);

   reverse_seq(seq1,rseq1,numSeq1,seqLen1);
   reverse_seq(seq0,rseq0,numSeq0,seqLen0);

   tmpPWM=alloc_double_double(MAX_MATRIX_LENGTH,4);  // pwm transformed/standardized
   t1PWM=alloc_double_double(MAX_MATRIX_LENGTH,4);  // pwm at (t) iteration
   t2PWM=alloc_double_double(MAX_MATRIX_LENGTH,4);  // pwm at (t+1) iteration
   opwm=alloc_double_double(MAX_MATRIX_LENGTH,4);
   
   // store r see manuscript for details
   numSitePerSeq1=alloc_int(numSeq1); 
   numSitePerSeq0=alloc_int(numSeq0); 
   score1 =alloc_double_double(numSeq1,maxSeqLen1); // pwm scores for all plus  subseqs in input sequences
   rscore1=alloc_double_double(numSeq1,maxSeqLen1); // pwm scores for all minus subseqs in input sequences
   score0 =alloc_double_double(numSeq0,maxSeqLen0); // pwm scores for all plus  subseqs in backg sequences
   rscore0=alloc_double_double(numSeq0,maxSeqLen0); // pwm scores for all minus subseqs in backg sequences

   topSubSeq1 = calloc((size_t)numSeq1*(size_t)Cmax, sizeof *topSubSeq1); // all info for K*Cmax subseqs in input
   topSubSeq0 = calloc((size_t)numSeq0*(size_t)Cmax, sizeof *topSubSeq0); // all info for K*Cmax subseqs in backg
   if (topSubSeq1==NULL || topSubSeq0==NULL) {
      printf("\nUnable to allocate memory for the top scoring subsequences\n"); exit(EXIT_FAILURE);
   }

   /* setting the PWM convergence criterion */
   convergenceRate=10e-5;

   // per-motif files are written next to the main output file
   snprintf(chScratch,sizeof chScratch,"%s",oFileName);
   snprintf(OutputDIR,sizeof OutputDIR,"%s",dirname(chScratch));
   
   fp=fopen(oFileName,"w");
   if (fp==NULL) {
      printf("\nUnable to open output file %s for writing\n",oFileName); exit(EXIT_FAILURE);
   }
   fprintf(fp,"==========================================================================\n\n");
   fprintf(fp,"command line: ");
   for (i=0;  i<argc; i++) fprintf(fp,"%s ",argv[i]);
   fprintf(fp,"\n\n");
   fprintf(fp,"Input (motif) sequence file:\t\t\t\t%s\n",motifFileName);
   fprintf(fp,"Number of sequences in input sequence file:\t\t%d\n",numSeq1);
   fprintf(fp,"Average input sequence length:\t\t\t\t%d\n",aveSeqLen1);
   fprintf(fp,"Number of background sequence sets:\t\t\t%d\n",numBackgSets);
   fprintf(fp,"Number of background sequence in each set:\t\t%d\n",numSeq0/numBackgSets);
   // aveSeqLen0 is only meaningful when a non-motif file was actually read
   if (nonMotifSeqProvided) fprintf(fp,"Average backg sequence length:\t\t\t\t%d\n",aveSeqLen0);
   fprintf(fp,"Number of starting PMWs:\t\t\t\t\t%d\n",numPWM);
   fprintf(fp,"Maximal number of iterations:\t\t\t\t%d\n",maxNumIteration);
   fprintf(fp,"Upper bound of FDR is set at:\t\t\t\t%6.4f\n",specifiedFDR);
   fprintf(fp,"Maximal number of binding sits in a single sequence:\t%d\n",Cmax);
   fprintf(fp,"Termination convergence criterion:\t\t\t%e\n",convergenceRate);
   if (nonMotifSeqProvided) fprintf(fp,"non-motif sequence file:\t\t\t\t%s\n",nonMotifFileName); 
   if (generateMarkovBackg==1) {

#if SIZEOF_LONG == 8
      fprintf(fp,"Random seed number:\t\t\t\t\t%lld\n",seed);
#else
      fprintf(fp,"Random seed number:\t\t\t\t\t%lu\n",seed);
#endif
      
      if (!nonMotifSeqProvided)
         fprintf(fp,"Background sequences are generated from Markov model that is estimated from motif sequences\n\n");
      else
         fprintf(fp,"Background sequences are generated from Markov model that is estimated from non-motif sequences\n\n");
   }
   else fprintf(fp,"Use other sequences as the background sequences\n");
   fprintf(fp,"The background nucleotide frequency: a:%8.7f\tc:%8.7f\tg:%8.7f\tt:%8.7f\n", bfreq[0],bfreq[1],bfreq[2],bfreq[3]);

   time(&t);
   fprintf(fp,"\njob started: %s\n", asctime(localtime(&t)));

   fprintf(fp,"=========================================================================\n\n");
   fflush(fp);

   // motif_consensus() writes a terminating NUL at index pwmLen, hence the extra byte
   oconsensus=alloc_char(MAX_MATRIX_LENGTH+1);
   fconsensus=alloc_char(MAX_MATRIX_LENGTH+1);
   motif= calloc((size_t) numPWM, sizeof *motif);
   if (numPWM>0 && motif==NULL) {
      printf("\nUnable to allocate memory for the motif summary\n"); exit(EXIT_FAILURE);
   }

   motifCn=0;

   for (ii=0; ii<numPWM; ii++) {
      printf("\nStarting PWM[%4d]: %s\n",ii+1,pwmName[ii]);
      for (i=0;  i<pwmLen[ii]; i++) {
         for (j=0; j<4; j++) t1PWM[i][j]=originalPWM[ii][i][j];
      }
      motif_consensus(t1PWM,pwmLen[ii],oconsensus);
      /// standardize initial PWM
      transform_mat(t1PWM,tmpPWM,pwmLen[ii]);
   
      // compute the score for each subsequence based on the PWM
      score_subsequences(numSeq1,seq1,rseq1,seqLen1,tmpPWM,pwmLen[ii],score1,rscore1,bfreq);
      score_subsequences(numSeq0,seq0,rseq0,seqLen0,tmpPWM,pwmLen[ii],score0,rscore0,bfreq);
      
      // select Cmax non-overlapping subseqes in input sequences
      totalTopSubseq1=select_top_subseqs(topSubSeq1,score1,rscore1,numSeq1,seqLen1,pwmLen[ii],Cmax,maxSeqLen1);
      sort_subseq(topSubSeq1,totalTopSubseq1);
   
      // select Cmax non-overlapping subseqes in background sequences
      totalTopSubseq0=select_top_subseqs(topSubSeq0,score0,rscore0,numSeq0,seqLen0,pwmLen[ii],Cmax,maxSeqLen0);
      sort_subseq(topSubSeq0,totalTopSubseq0);
   
      tmpFDR=specifiedFDR;
      // compute the actual FDR bound 
      actualBound=false_discovery(topSubSeq1,numSeq1,topSubSeq0,numBackgSets,tmpFDR,&numSite1,&numSite0,totalTopSubseq1,totalTopSubseq0); 

      // while false_discovery reports 1, relax the working bound in steps of 0.05 (at most 50 tries)
      numTry=0;
      while (actualBound==1) {
         tmpFDR=min(1.0,tmpFDR+0.05);
         printf("initial FDR bound: %5.3f\n",tmpFDR);
         actualBound=false_discovery(topSubSeq1,numSeq1,topSubSeq0,numBackgSets,tmpFDR,&numSite1,&numSite0,totalTopSubseq1,totalTopSubseq0); 
         numTry++;
         if (numTry==50) break;
      }
     
      printf("checking PWM[%4d] initial FDR bound: %5.3f\n",ii+1,tmpFDR);
   
      if (actualBound>1.0) continue; 

      // count the number of binding sites in each sequence in the input data
      // NOTE(review): this count is not clamped to totalTopSubseq1 - confirm entries beyond it are safe to read
      numSite1ForNorm=min(Cmax*numSeq1,(int)(1.5*numSeq1));
      //numSite1ForNorm=numSite1; 
      count_num_sites_per_seq(topSubSeq1,numSite1ForNorm,numSeq1,numSitePerSeq1);
   
      // normalize probabilities so that the sum over all s in a sequence is equal to r (see manuscript)
      normalize_per_seq(score1,rscore1,seqLen1,pwmLen[ii],numSeq1,numSitePerSeq1);
   
      // M-step, construct PWM
      construct_pwm(t2PWM,score1,rscore1,seq1,rseq1,seqLen1,numSeq1,pwmLen[ii]);

      // check convergence
      maxPWMdiff=check_convergence(t1PWM,t2PWM,pwmLen[ii]);

      iterationCn=1;
      // iterate until the PWM is stable AND the bound is within 5% of the requested FDR
      while(fabs(maxPWMdiff)>convergenceRate || actualBound-specifiedFDR>specifiedFDR*0.05) {

         if(iterationCn>=maxNumIteration) break;
         else {
            // use the updated pwm 
            for(i=0; i<pwmLen[ii]; i++) {
               for(j=0; j<4; j++) t1PWM[i][j]=t2PWM[i][j];
            }

            transform_mat(t1PWM,tmpPWM,pwmLen[ii]);
            score_subsequences(numSeq1,seq1,rseq1,seqLen1,tmpPWM,pwmLen[ii],score1,rscore1,bfreq);
            score_subsequences(numSeq0,seq0,rseq0,seqLen0,tmpPWM,pwmLen[ii],score0,rscore0,bfreq);
 
            totalTopSubseq1=select_top_subseqs(topSubSeq1,score1,rscore1,numSeq1,seqLen1,pwmLen[ii],Cmax,maxSeqLen1);
            sort_subseq(topSubSeq1,totalTopSubseq1);
            totalTopSubseq0=select_top_subseqs(topSubSeq0,score0,rscore0,numSeq0,seqLen0,pwmLen[ii],Cmax,maxSeqLen0);
            sort_subseq(topSubSeq0,totalTopSubseq0);

            // gradually decrease FDR to the specified value   
            if (tmpFDR>specifiedFDR) { tmpFDR-=0.025; tmpFDR=max(tmpFDR,specifiedFDR); }

            actualBound=false_discovery(topSubSeq1,numSeq1,topSubSeq0,numBackgSets,tmpFDR,&numSite1,&numSite0,totalTopSubseq1,totalTopSubseq0);

            if (numSite1==0) { 
               printf("\nAbandon - no binding sites found for %s\n",pwmName[ii]); break; 
            }

            printf("Iteration[%3d]: %4d binding sites in %4d input sequences, %4.1f sites in %4d backg sequences\n",
               iterationCn,numSite1,numSeq1,numSite0,numSeq0/numBackgSets);
            printf("Max difference in PWM between two consecutive iterations: %5.4f. Specified FDR bound %5.4f actual FDR bound %5.4f\n\n",
               maxPWMdiff,specifiedFDR,actualBound);
            fprintf(fp,"\nNumber of binding sites in %d input sequences:\t\t\t\t%d\n",numSeq1,numSite1);
            fprintf(fp,"Average number of binding sites in %d sets of %d backg sequences:\t%4.1f\n",numBackgSets,numSeq0/numBackgSets,numSite0);
            fprintf(fp,"Maximal difference in PWM between two consecutive iterations:\t\t%5.4f\n",maxPWMdiff);
            fprintf(fp,"Specified and actual computed FDR bounds:\t\t\t\t%5.4f %5.4f\n\n",specifiedFDR,actualBound);
            fflush(fp);
  
            count_num_sites_per_seq(topSubSeq1,numSite1,numSeq1,numSitePerSeq1);
            normalize_per_seq(score1,rscore1,seqLen1,pwmLen[ii],numSeq1,numSitePerSeq1);
            construct_pwm(t2PWM,score1,rscore1,seq1,rseq1,seqLen1,numSeq1,pwmLen[ii]);

            maxPWMdiff=check_convergence(t1PWM,t2PWM,pwmLen[ii]);
            iterationCn++;
         }
      }
      // reject the PWM if the bound is still too loose, too few sites were found, or none at all
      // (the explicit zero test matters when minSites itself is 0)
      if ((actualBound-specifiedFDR>0.05*specifiedFDR) || numSite1<minSites || numSite1<=0) continue;

      printf("Number of sites1: %d\n",numSite1);

      // strip the last three characters (e.g. ".mx") from the pwm name; shorter names are used whole
      stemLen=strlen(pwmName[ii]);
      if (stemLen>=3) stemLen-=3;

      motifName=alloc_char(PATH_MAX);
      snprintf(motifName,PATH_MAX,"%s/%.*s_fdrMotif.mx",OutputDIR,(int)stemLen,pwmName[ii]);
      
      f1=fopen(motifName,"w");
      if (f1==NULL) {
         printf("Unable to open %s for writing\n",motifName);
      }
      else {
         // matrix file: header line, then one row per base (a,c,g,t) across all positions
         fprintf(f1,"4\t%d\n",pwmLen[ii]);
         for(j=0; j<4; j++){
            for (i=0;  i<pwmLen[ii]; i++) fprintf(f1,"%5.4f ",t2PWM[i][j]);
            fprintf(f1,"\n"); 
         }
         fclose(f1);
      }

      count_num_sites_per_seq(topSubSeq1,numSite1,numSeq1,numSitePerSeq1);
      seqHavingSite1=0;
      for (i=0; i<numSeq1; i++) {
         if (numSitePerSeq1[i]!=0) seqHavingSite1++; 
      }
   
      // numSite0 is an average per background set; scale back to a total over all sets
      count_num_sites_per_seq(topSubSeq0,numBackgSets*numSite0,numSeq0,numSitePerSeq0);
      seqHavingSite0=0;
      for (i=0; i<numSeq0; i++) {
         if (numSitePerSeq0[i]!=0) seqHavingSite0++; 
      }
      if ((double)seqHavingSite0/(double)numSeq0<0.05) {
         fprintf(fp,"If one would like to select more binding sites in input sequences, one may increase the FDR bound and re-run fdrMotif\n");
      }
   
      if (fabs(maxPWMdiff)<=convergenceRate)  printf("\n\npwm converged...\n"); 
      if (iterationCn>=maxNumIteration)       printf("\n\nmaximal number of iterations reached\n");
 
      // the summary keeps the path without suffix; the report file gets "_fdrMotif.txt"
      snprintf(motifName,PATH_MAX,"%s/%.*s",OutputDIR,(int)stemLen,pwmName[ii]);
      // NOTE(review): capacity of Motifs.name is not visible here - confirm it can hold PATH_MAX bytes
      strcpy(motif[motifCn].name, motifName);
      strncat(motifName,"_fdrMotif.txt",(size_t)PATH_MAX-strlen(motifName)-1);

      f2=fopen(motifName,"w");
      if (f2==NULL) {
         printf("Unable to open %s for writing\n",motifName);
         free(motifName); motifName=NULL;
         continue;
      }

      fprintf(f2,"\nNumber of binding sites in %d input sequences:\t\t\t\t%d\n",numSeq1,numSite1);
      fprintf(f2,"Average number of binding sites in %d sets of %d backg sequences:\t%4.1f\n",numBackgSets,numSeq0/numBackgSets,numSite0);
      fprintf(f2,"Maximal difference in PWM between two consecutive iterations:\t\t%5.4f\n",maxPWMdiff);
      fprintf(f2,"Specified and actual computed FDR bounds:\t\t\t\t%5.4f %5.4f\n\n",specifiedFDR,actualBound);
      fprintf(f2,"\nThe %4d binding sites are distributed in %5d input sequences (%5.3f)\n\n",numSite1,numSeq1,(double)seqHavingSite1/(double)numSeq1);
      fprintf(f2,"The %4.0f binding sites are distributed in %5d backg sequences (%5.3f)\n\n",numSite0*numBackgSets,numSeq0,(double)seqHavingSite0/(double)numSeq0);
      fprintf(f2,"--------------------------------------------------------------------------------------------\n");

      binding_site_location(topSubSeq1,seq1,rseq1,geneID,seqLen1,numSeq1,pwmLen[ii],numSite1,f2);

      // construct pwm from the observed binding sites 
      binding_site_base_freq(topSubSeq1,seq1,rseq1,seqLen1,pwmLen[ii],numSite1,opwm);
      motif_consensus(opwm,pwmLen[ii],fconsensus);

      // evaluate motif usign MEME's llr subroutine
      // The return value is log(e_value)
      motif[motifCn].evalue = E_value(opwm, numSite1, bfreq, pwmLen[ii], numSeq1,
                                      seqLen1, &m, &e, f2);

      motif[motifCn].numSite=numSite1;
      motif[motifCn].m=m;
      motif[motifCn].e=e;
      motif[motifCn].specifiedFDR=specifiedFDR;
      motif[motifCn].actualBound=actualBound;
      strcpy(motif[motifCn].oconsensus,oconsensus);
      strcpy(motif[motifCn].fconsensus,fconsensus);
      motifCn++;
      
      fclose(f2);
      free(motifName); motifName=NULL;
   }

   sort_motif(motif, motifCn);
   
   /* Reusing f2 ... too many variables */   
   if ((f2 = fopen(motifSummaryFileName, "w"))) {
      for (i=0; i<motifCn; i++) {
         fprintf(f2, "%s\t%s\t%s\t%5.4f\t%5.4f\t%d\t%3.1fe%+04.0f\n",
                      motif[i].name,motif[i].oconsensus,motif[i].fconsensus,
                      motif[i].specifiedFDR,motif[i].actualBound,
                      motif[i].numSite,motif[i].m,motif[i].e); 
      }
      
      fclose(f2);
   } else printf("Unable to open motif summary file!\n");
   
   time(&t);
   fprintf(fp,"\nfinished: %s\n", asctime(localtime(&t))); 

   fclose(fp);
   
   // two-level arrays are released as row block ([0]) followed by the pointer table
   if (seqLen1)        { free(seqLen1);        seqLen1=NULL;        }
   if (seqLen0)        { free(seqLen0);        seqLen0=NULL;        }
   if (seq1[0])        { free(seq1[0]);        seq1[0]=NULL;        }
   if (seq1)           { free(seq1);           seq1=NULL;           }
   if (rseq1[0])       { free(rseq1[0]);       rseq1[0]=NULL;       }
   if (rseq1)          { free(rseq1);          rseq1=NULL;          }
   if (seq0[0])        { free(seq0[0]);        seq0[0]=NULL;        }
   if (seq0)           { free(seq0);           seq0=NULL;           }
   if (rseq0[0])       { free(rseq0[0]);       rseq0[0]=NULL;       }
   if (rseq0)          { free(rseq0);          rseq0=NULL;          }
   if (geneID[0])      { free(geneID[0]);      geneID[0]=NULL;      }
   if (geneID)         { free(geneID);         geneID=NULL;         }
   if (t1PWM[0])       { free(t1PWM[0]);       t1PWM[0]=NULL;       }
   if (t1PWM)          { free(t1PWM);          t1PWM=NULL;          }
   if (tmpPWM[0])      { free(tmpPWM[0]);      tmpPWM[0]=NULL;      }
   if (tmpPWM)         { free(tmpPWM);         tmpPWM=NULL;         }
   if (t2PWM[0])       { free(t2PWM[0]);       t2PWM[0]=NULL;       }
   if (t2PWM)          { free(t2PWM);          t2PWM=NULL;          }
   if (score1[0])      { free(score1[0]);      score1[0]=NULL;      }
   if (score1)         { free(score1);         score1=NULL;         }
   if (rscore1[0])     { free(rscore1[0]);     rscore1[0]=NULL;     }
   if (rscore1)        { free(rscore1);        rscore1=NULL;        }
   if (score0[0])      { free(score0[0]);      score0[0]=NULL;      }
   if (score0)         { free(score0);         score0=NULL;         }
   if (rscore0[0])     { free(rscore0[0]);     rscore0[0]=NULL;     }
   if (rscore0)        { free(rscore0);        rscore0=NULL;        }
   if (topSubSeq1)     { free(topSubSeq1);     topSubSeq1=NULL;     }
   if (topSubSeq0)     { free(topSubSeq0);     topSubSeq0=NULL;     }
   if (numSitePerSeq0) { free(numSitePerSeq0); numSitePerSeq0=NULL; }
   if (numSitePerSeq1) { free(numSitePerSeq1); numSitePerSeq1=NULL; }
   if (bfreq)          { free(bfreq);          bfreq=NULL;          }
   if (opwm[0])        { free(opwm[0]);        opwm[0]=NULL;        }
   if (opwm)           { free(opwm);           opwm=NULL;           }

   // single-level buffers that were previously left allocated
   free(motif);      motif=NULL;
   free(oconsensus); oconsensus=NULL;
   free(fconsensus); fconsensus=NULL;
   free(seqLen2);    seqLen2=NULL;
   free(pwmLen);     pwmLen=NULL;
   
   free(motifSummaryFileName);
   free(motifFileName);
   free(nonMotifFileName);
   free(oFileName);
   
   return EXIT_SUCCESS;
}

/*
 * Write the IUPAC consensus of a PWM into `consensus`.
 *
 * opwm      - matrix indexed [position][base], bases in the order a, c, g, t
 * pwmLen    - number of positions to read from opwm
 * consensus - output buffer; receives pwmLen lower-case characters plus a
 *             terminating NUL, so it must hold at least pwmLen+1 bytes
 *
 * For each position the four values are ranked from largest to smallest
 * (equal values keep a,c,g,t order) and one code is chosen:
 *   - the top base itself when its value is above 0.5 and at least twice
 *     the second value;
 *   - otherwise the two-base code of the top two bases when their values
 *     sum to more than 0.75;
 *   - otherwise the three-base code excluding the lowest base when that
 *     base's value is below 0.05;
 *   - otherwise 'n'.
 *
 * The ranking works on base indices directly, so every position always
 * receives a code, even when a value cannot be matched by equality (NaN).
 */
void motif_consensus(double **opwm,int pwmLen,char *consensus) {

   static const char baseLetter[4]={'a','c','g','t'};
   // two-base codes indexed by the two base indices; the diagonal is never used
   static const char pairCode[4][4]={
      {'n','m','r','w'},
      {'m','n','s','y'},
      {'r','s','n','k'},
      {'w','y','k','n'}
   };
   // three-base codes indexed by the base that is left out
   static const char tripleCode[4]={'b','d','h','v'};

   int pos,k,slot;
   int rank[4];
   double first,second,last;

   for (pos=0; pos<pwmLen; pos++) {
      const double *col=opwm[pos];

      // stable insertion sort of the base indices, largest value first
      for (k=0; k<4; k++) rank[k]=k;
      for (k=1; k<4; k++) {
         int cur=rank[k];
         for (slot=k; slot>0 && col[rank[slot-1]]<col[cur]; slot--) rank[slot]=rank[slot-1];
         rank[slot]=cur;
      }

      first=col[rank[0]];
      second=col[rank[1]];
      last=col[rank[3]];

      if (first>0.5 && first>=2*second) {
         // single nucleotide
         consensus[pos]=baseLetter[rank[0]];
      }
      else if (first+second>0.75) {
         // double degenerate code
         consensus[pos]=pairCode[rank[0]][rank[1]];
      }
      else if (last<0.05) {
         // triple degenerate code
         consensus[pos]=tripleCode[rank[3]];
      }
      else {
         consensus[pos]='n';
      }
   }
   consensus[pwmLen]='\0';
}
