#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <malloc.h>
#include <time.h>
#include "ga_pwm.h"

int main(int argc,char **argv) {

   register int ii,jj,i,j;
   int numMotifSeq,numBackgSeq,matrixLen,numConstraints,goodArgument;
   int populationSize,numGeneration;
   int *motifSeqLen,*backgSeqLen;
   char **motifSeq,**backgSeq,**rmotifSeq,**rbackgSeq,**geneID,*ch,foundRequiredFile[4];
   char *mFileName,*bFileName,*cFileName,*pFileName,*oFileName;
   double **originalPWM,***PWM,**tPWM,*scoreMotif,*scoreBackg;
   double sumMin,sumMax,specificity,mutationRate,aveLenMotif,aveLenBackg;
   double greaterMutationRate, lesserMutationRate;
   unsigned long long  seed;
   Constraint *fix;
   Fitness *fitness;
   Wheel *wheel;
   FILE *fp;
   time_t t;

   if (argc<7) { 
      printf("\n\nUSAGE:\n\n"); 
      printf("   ./gapwm -fm motifSeqFile -fb backgSeqFile -fpwm initialPWMFile -g maxNumGeneration -p populationSize -rg greaterMutationRate -rl lesserMutationRate -sp specificity -fo outputFile -fc optionalFile\n\n");
      printf("  -fm   motifSeqFile           File containing motif sequences in FASTA format, e.g., ChIP file\n");
      printf("  -fb   backgSeqFile           File containing background sequences in FASTA format\n");
      printf("  -fpwm initialPWMFile         File containing initial PWM\n");
      printf("  -g    maxGeneration          Maximal number of generations for a GA run (default: 1000)\n");
      printf("  -p    populationSize         Population size (default: 100)\n");
      printf("  -rg   greaterMutationRate    Mutation rate for first 100 generations (default: 0.05)\n");
      printf("  -rl   lesserMutationRate     Mutation rate for all generations after 100 (default: 0.02)\n");
      printf("  -sp   specificity            For ROC integration from 0 to 1.0-specificity (default: 0.9)\n");
      printf("  -fo   outputFile             Name of the output file (default: output.txt)\n\n");
      printf("  -fc   optionalFile           Optional file containing position constraints\n\n");
      exit(0);
   }

   mFileName=alloc_char(200); bFileName=alloc_char(200); pFileName=alloc_char(200);
   cFileName=alloc_char(200); oFileName=alloc_char(200);
   mFileName[0]='\0'; bFileName[0]='\0'; pFileName[0]='\0'; cFileName[0]='\0'; 
   motifSeqLen=alloc_int(MAX_NUM_SEQ);
   backgSeqLen=alloc_int(MAX_NUM_SEQ);
   geneID=alloc_char_char(MAX_NUM_SEQ,500);
   numMotifSeq=0; numBackgSeq=0;
   numConstraints=0; numGeneration=1000; populationSize=100; specificity=0.9;
   greaterMutationRate = 0.05; lesserMutationRate = 0.02;
   strcpy(oFileName,"output.txt"); oFileName[10]='\0';
   for (ii=0; ii<3; ii++) foundRequiredFile[ii]='0';

   for (ii=0; ii<argc; ii++) {
      if (argv[ii][0]=='-') {
         goodArgument=0;
         if (strncmp(argv[ii],"-fm",3)==0||strncmp(argv[ii],"-fb",3)==0||strncmp(argv[ii],"-fpwm",5)==0||
             strncmp(argv[ii],"-fc",3)==0||strncmp(argv[ii],"-g",2)==0 ||strncmp(argv[ii],"-p",2)==0||
             strncmp(argv[ii],"-sp",3)==0||strncmp(argv[ii],"-r",2)==0 ||strncmp(argv[ii],"-fo",3)==0) {
            goodArgument=1; 
         }
         if (goodArgument==0) { printf("argument: %s unknown\n",argv[ii]); exit(0);  }
      }
   }

   for (ii=0; ii<argc-1; ii++) {
      if (strncmp(argv[ii],"-fm",3)==0   && argv[ii+1]!=NULL) {
         foundRequiredFile[0]='1';
         strcpy(mFileName,argv[ii+1]);
         motifSeq=read_seq(&numMotifSeq,motifSeqLen,geneID,MAX_NUM_SEQ,MAX_SEQ_LENGTH,argv[ii+1]);
         aveLenMotif=0; for (i=0; i<numMotifSeq; i++) aveLenMotif +=motifSeqLen[i]; aveLenMotif /=(double)numMotifSeq;
         printf("\ntrue set - number of sequences: %4d, average sequence length: %5.1f\n",numMotifSeq,aveLenMotif);
      }
      else if (strncmp(argv[ii],"-fb",3)==0   && argv[ii+1]!=NULL) {
         foundRequiredFile[1]='1';
         strcpy(bFileName,argv[ii+1]);
         backgSeq=read_seq(&numBackgSeq,backgSeqLen,geneID,MAX_NUM_SEQ,MAX_SEQ_LENGTH,argv[ii+1]);
         aveLenBackg=0; for (i=0; i<numBackgSeq; i++) aveLenBackg +=backgSeqLen[i]; aveLenBackg /=(double)numBackgSeq;
         printf("backg set - number of sequences: %4d, average sequence length: %5.1f\n",numBackgSeq,aveLenBackg);
      }
      else if (strncmp(argv[ii],"-fpwm",5)==0 && argv[ii+1]!=NULL) {
         foundRequiredFile[2]='1';
         strcpy(pFileName,argv[ii+1]);
         originalPWM=read_initial_pwm(&matrixLen,argv[ii+1]);
      }
      else if (strncmp(argv[ii],"-fc",3)==0    && argv[ii+1]!=NULL) {
         strcpy(cFileName,argv[ii+1]);
         fix=read_constraints(cFileName,matrixLen,&numConstraints,matrixLen,numGeneration);
      }
      else if (strncmp(argv[ii],"-g",2)==0   && argv[ii+1]!=NULL) numGeneration=atoi(argv[ii+1]); 
      else if (strncmp(argv[ii],"-p",2)==0   && argv[ii+1]!=NULL) populationSize=atoi(argv[ii+1]); 
      else if (strncmp(argv[ii],"-sp",3)==0  && argv[ii+1]!=NULL) specificity=atof(argv[ii+1]); 
      else if (strncmp(argv[ii],"-rg",3)==0   && argv[ii+1]!=NULL) greaterMutationRate=atof(argv[ii+1]); 
      else if (strncmp(argv[ii],"-rl",3)==0   && argv[ii+1]!=NULL) lesserMutationRate=atof(argv[ii+1]); 
      else if (strncmp(argv[ii],"-fo",3)==0  && argv[ii+1]!=NULL) strcpy(oFileName,argv[ii+1]); 
      else { }
   }
   if (foundRequiredFile[0]=='0') {
      printf("\nPlease specify file containing motif sequences with the -fm argument\n"); exit(0); 
   }
   else if (foundRequiredFile[1]=='0') {
      printf("\nPlease specify file containing background sequences with the -fb argument\n"); exit(0); 
   }
   else if (foundRequiredFile[2]=='0') {
      printf("\nPlease specify file containing the initial pwm to be optimized with the -fpwm argument\n"); exit(0); 
   }
   else { }

   if (numGeneration<=0) {
      printf("\nMaximal numbe of generaton < 1\n"); exit(0);
   }
   else if (populationSize<=0) {
      printf("\nPopulation size < 1\n"); exit(0);
   }
   else if (greaterMutationRate<=0) {
      printf("\nGreater mutation rate should be >0 and <1\n"); exit(0);
   }
   else if (lesserMutationRate<=0) {
      printf("\nLesser mutation rate should be >0 and <1\n"); exit(0);
   }
   if (cFileName[0]=='\0') printf("\nNo constraints read...\n");

   printf("\nMax number of generations:\t\t\t%4d\n",numGeneration);
   printf("Population size:\t\t\t\t%4d\n",populationSize);
   printf("Specificity cutoff for ROC integration:\t\t%4.2f\n\n",specificity);
   printf("================================================================\n\n");

   /* reverse complementary sequences */
   rmotifSeq=alloc_char_char(numMotifSeq,MAX_SEQ_LENGTH);
   rbackgSeq=alloc_char_char(numBackgSeq,MAX_SEQ_LENGTH);
   reverse_seq(motifSeq,rmotifSeq,numMotifSeq,motifSeqLen);
   reverse_seq(backgSeq,rbackgSeq,numBackgSeq,backgSeqLen);


   PWM=alloc_double_double_double(populationSize,matrixLen,4);
   tPWM=alloc_double_double(matrixLen,4);
   scoreMotif=alloc_double(numMotifSeq); /* store motif sequence scores, one per sequence */
   scoreBackg=alloc_double(numBackgSeq); /* store backg sequence scores, one per sequence */
   fitness=(Fitness *)calloc(populationSize,sizeof(Fitness));
   wheel=(Wheel *)calloc(populationSize,sizeof(Wheel));

   /* make many 'clones' of the initial PWM to form a 'population' */
   for (ii=0; ii<populationSize; ii++) {
      for (i=0; i<matrixLen; i++) {
         for (j=0; j<4; j++) PWM[ii][i][j]=originalPWM[i][j];
      }
   }
   /* information weighted */
   transform_pwm(PWM[0],tPWM,matrixLen);
   sum_position_min(&(sumMin),tPWM,matrixLen); /* column sum of min */
   sum_position_max(&(sumMax),tPWM,matrixLen); /* column sum of max */

   /* score motif and background sequences */
   score_seq(numMotifSeq,motifSeq,rmotifSeq,motifSeqLen,tPWM,matrixLen,sumMin,sumMax,scoreMotif);
   score_seq(numBackgSeq,backgSeq,rbackgSeq,backgSeqLen,tPWM,matrixLen,sumMin,sumMax,scoreBackg);
   fitness[0].value=cal_ROC(scoreMotif,numMotifSeq,scoreBackg,numBackgSeq,1.0-specificity);
   fitness[0].index=0;
   for (ii=1; ii<populationSize; ii++) {
      fitness[ii].value=fitness[0].value; fitness[ii].index=ii;
   }

   ch=(char *)calloc(50,sizeof(char));
   t=time(NULL);
   ch=asctime(localtime(&t));

   seed=time(0);
   sgenrand(seed);

   fp=fopen(oFileName,"w");
   fprintf(fp,"==========================================================================\n\n");
   fprintf(fp,"command line: ");
   for (i=0; i<argc; i++) fprintf(fp,"%s ",argv[i]); fprintf(fp,"\n\n");
   fprintf(fp,"motif sequence file:\t\t\t\t\t%s\n",mFileName);
   fprintf(fp,"background sequence file:\t\t\t\t%s\n",bFileName);
   fprintf(fp,"number of sequences in motif sequence file:\t\t%d\n",numMotifSeq);
   fprintf(fp,"average length:\t\t\t\t\t\t%5.1f\n",aveLenMotif);
   fprintf(fp,"number of sequences in background sequence file:\t%d\n",numBackgSeq);
   fprintf(fp,"average length:\t\t\t\t\t\t%5.1f\n",aveLenBackg);
   fprintf(fp,"initial PWM:\t\t\t\t\t\t%s\n",pFileName);
   fprintf(fp,"\nmutation rates: %4.3f for the 1st 100 generations and %4.3f for the remaining generations\n",greaterMutationRate, lesserMutationRate);
   if (cFileName[0]!='\0') {
      fprintf(fp,"constraint file:\t\t\t\t\t%s\n",cFileName);
      fprintf(fp,"\n\nYou specified the following constraints:\n\n");
      for (i=0; i<numConstraints; i++) {
         fprintf(fp,"   Generations %4d-%4d:\n",fix[i].startGen,fix[i].endGen);
         for (j=0; j<matrixLen; j++) {
            switch (fix[i].s1[j]) {
               case 'a': fprintf(fp,"      position[%2d]:  no mutation allowed\n",j+1); break;
               case 'c': fprintf(fp,"      position[%2d]:  no mutation allowed\n",j+1); break;
               case 'g': fprintf(fp,"      position[%2d]:  no mutation allowed\n",j+1); break;
               case 't': fprintf(fp,"      position[%2d]:  no mutation allowed\n",j+1); break;
               case 'w': fprintf(fp,"      position[%2d]:  mutation restricted to 'A' or 'T'\n",j+1); break;
               case 'r': fprintf(fp,"      position[%2d]:  mutation restricted to 'A' or 'G'\n",j+1); break;
               case 'k': fprintf(fp,"      position[%2d]:  mutation restricted to 'G' or 'T'\n",j+1); break;
               case 's': fprintf(fp,"      position[%2d]:  mutation restricted to 'C' or 'G'\n",j+1); break;
               case 'y': fprintf(fp,"      position[%2d]:  mutation restricted to 'C' or 'T'\n",j+1); break;
               case 'm': fprintf(fp,"      position[%2d]:  mutation restricted to 'A' or 'C'\n",j+1); break;
               case 'b': fprintf(fp,"      position[%2d]:  mutation restricted to 'C', 'G', or 'T'\n",j+1); break;
               case 'd': fprintf(fp,"      position[%2d]:  mutation restricted to 'A', 'G', or 'T'\n",j+1); break;
               case 'h': fprintf(fp,"      position[%2d]:  mutation restricted to 'A', 'C', or 'T'\n",j+1); break;
               case 'v': fprintf(fp,"      position[%2d]:  mutation restricted to 'A', 'C', or 'G'\n",j+1); break;
               case 'n': fprintf(fp,"      position[%2d]:  mutation without any constraints\n",j+1); break;
               default: break;
            }
         }
         fprintf(fp,"\n");
      }
      fprintf(fp,"\n");
      fprintf(fp,"   No constraints after generations %4d.\n\n",fix[numConstraints-1].endGen+1);
   }
   else fprintf(fp,"No constraints specified.\n\n");

   fprintf(fp,"max number of generations:\t\t\t\t%d\n",numGeneration);
   fprintf(fp,"population size:\t\t\t\t\t%d\n",populationSize);
   fprintf(fp,"random seed:\t\t\t\t\t\t%lld\n\n",seed);
   fprintf(fp,"ROC integration from 0 to %3.2f\t\t\t\t\n",1.0-specificity);
   fprintf(fp,"job started: %s\n",ch);
   fprintf(fp,"=========================================================================\n\n");
   fprintf(fp,"generation: %4d fitness: %7.5f\n",0,fitness[0].value);
   fprintf(fp,"original PWM:\n");
   for (j=0; j<4; j++) {
      for (i=0; i<matrixLen; i++) {
         if (i<matrixLen-1) fprintf(fp,"%7.5f\t",PWM[0][i][j]);
         else               fprintf(fp,"%7.5f\n",PWM[0][i][j]);
      }
   }
   fprintf(fp,"\n");
   fflush(fp);

   printf("starting GAPWM... this may take several hours to complete\n");
   printf("type: ctrl/z, then, bg, to run it in background\n\n");

   for (mutationRate = greaterMutationRate, jj=1; jj<numGeneration+1; jj++) {
      if (jj>100) mutationRate=lesserMutationRate;

      roulett_wheel(fitness,populationSize,wheel,jj);
      mutation(PWM,matrixLen,wheel,populationSize,fitness[0].index,mutationRate,jj,fix,numConstraints);
      for (ii=0; ii<populationSize; ii++) {
         transform_pwm(PWM[ii],tPWM,matrixLen);
         sum_position_min(&(sumMin),tPWM,matrixLen);
         sum_position_max(&(sumMax),tPWM,matrixLen);

         score_seq(numMotifSeq,motifSeq,rmotifSeq,motifSeqLen,tPWM,matrixLen,sumMin,sumMax,scoreMotif);
         score_seq(numBackgSeq,backgSeq,rbackgSeq,backgSeqLen,tPWM,matrixLen,sumMin,sumMax,scoreBackg);

         fitness[ii].value=cal_ROC(scoreMotif,numMotifSeq,scoreBackg,numBackgSeq,1.0-specificity);
         fitness[ii].index=ii;
      }
      /* sort struct <fitness> by values in descending order */
      sort_fitness(fitness,populationSize);
      printf("generation %5d fitness score: %8.5f\n",jj,fitness[0].value);
      fprintf(fp,"generation: %4d fitness: %7.5f\n",jj,fitness[0].value);
      fprintf(fp,"mutation rate: %5.3f\n",mutationRate);
      for (j=0; j<4; j++) {
         for (i=0; i<matrixLen; i++) {
            if (i<matrixLen-1) fprintf(fp,"%7.5f\t",PWM[fitness[0].index][i][j]);
            else fprintf(fp,"%7.5f\n",PWM[fitness[0].index][i][j]);
         }
      }
      fprintf(fp,"\n");
      fflush(fp);
   }
   t=time(NULL);
   ch=asctime(localtime(&t));
   fprintf(fp,"finished: %s\n",ch);
   fclose(fp);

   if (motifSeqLen)    { free(motifSeqLen);    motifSeqLen=NULL;    }
   if (backgSeqLen)    { free(backgSeqLen);    backgSeqLen=NULL;    }
   if (motifSeq[0])    { free(motifSeq[0]);    motifSeq[0]=NULL;    }
   if (motifSeq)       { free(motifSeq);       motifSeq=NULL;       }
   if (backgSeq[0])    { free(backgSeq[0]);    backgSeq[0]=NULL;    }
   if (backgSeq)       { free(backgSeq);       backgSeq=NULL;       }
   if (rmotifSeq[0])   { free(rmotifSeq[0]);   rmotifSeq[0]=NULL;   }
   if (rmotifSeq)      { free(rmotifSeq);      rmotifSeq=NULL;      }
   if (rbackgSeq[0])   { free(rbackgSeq[0]);   rbackgSeq[0]=NULL;   }
   if (rbackgSeq)      { free(rbackgSeq);      rbackgSeq=NULL;      }
   if (geneID[0])      { free(geneID[0]);      geneID[0]=NULL;      }
   if (geneID)         { free(geneID);         geneID=NULL;         }
   if (originalPWM[0]) { free(originalPWM[0]); originalPWM[0]=NULL; }
   if (originalPWM)    { free(originalPWM);    originalPWM=NULL;    }
   if (PWM[0][0])      { free(PWM[0][0]);      PWM[0][0]=NULL;      }
   if (PWM[0])         { free(PWM[0]);         PWM[0]=NULL;         }
   if (PWM)            { free(PWM);            PWM=NULL;            }
   if (tPWM[0])        { free(tPWM[0]);        tPWM[0]=NULL;        }
   if (tPWM)           { free(tPWM);           tPWM=NULL;           }
   if (scoreMotif)     { free(scoreMotif);     scoreMotif=NULL;     }
   if (scoreBackg)     { free(scoreBackg);     scoreBackg=NULL;     }
   if (wheel)          { free(wheel);          wheel=NULL;          }
   if (fitness)        { free(fitness);        fitness=NULL;        }
   if (bFileName)      { free(bFileName);      bFileName=NULL;      }
   if (mFileName)      { free(mFileName);      mFileName=NULL;      }
   if (cFileName)      { free(cFileName);      cFileName=NULL;      }
   if (oFileName)      { free(oFileName);      oFileName=NULL;      }
   if (pFileName)      { free(pFileName);      pFileName=NULL;      }
   return (1);
}

