#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <malloc.h>
#include "fitness_pwm.h"

/* Forward declaration of read_pwm. No call to it appears in the visible part
 * of this file; NOTE(review): presumably defined in another translation
 * unit -- confirm it is still needed before removing. */
double **read_pwm(int *,char *);
enum {
   FITPWM_LINE_MAX = 500,  /* size of the line buffer used for the GA output file */
   FITPWM_ROWS     = 4     /* rows per PWM record; equals the second dimension of pwm/tpwm */
};

/* Read one complete line into `line` (capacity FITPWM_LINE_MAX).
 * Returns 1 on success, 0 at end-of-file/read error or when the line did not
 * fit in the buffer (which would otherwise be silently split in two). */
static int read_full_line(FILE *fp,char *line) {
   if (!fgets(line,FITPWM_LINE_MAX,fp)) return 0;
   /* A missing newline is only acceptable on the last line of the file. */
   if (!strchr(line,'\n') && !feof(fp)) return 0;
   return 1;
}

/* Extract the generation number from a header line of the form
 * "generation: N". The line is tokenised in place on spaces and the second
 * token is converted with atoi. Returns 1 on success, 0 if there is no
 * second token. */
static int parse_generation_id(char *line,int *generation) {
   char *tok;

   tok=strtok(line," ");
   if (tok) tok=strtok(NULL," ");
   if (!tok) return 0;
   *generation=atoi(tok);
   return 1;
}

/* Read the body of one PWM record that follows a "generation" header:
 * one line that is discarded, then FITPWM_ROWS lines of pwmLen tab-separated
 * numbers. The value in row r, column c is stored in pwm[c][r].
 * Returns 1 on success, 0 if a line is missing, too long, or has fewer than
 * pwmLen fields. */
static int load_pwm_record(FILE *fp,char *line,double **pwm,int pwmLen) {
   char *tok;
   int row,col;

   if (!read_full_line(fp,line)) return 0;       /* line after the header is skipped */
   for (row=0; row<FITPWM_ROWS; row++) {
      if (!read_full_line(fp,line)) return 0;
      tok=strtok(line,"\t");
      for (col=0; col<pwmLen; col++) {
         if (!tok) return 0;                     /* fewer than pwmLen fields */
         pwm[col][row]=atof(tok);
         tok=strtok(NULL,"\t");
      }
   }
   return 1;
}

/* fitnessPWM driver.
 *
 * Usage: fitnessPWM MotifSet BackgSet outputFromGA pwmLen specificity output
 *
 * Reads the motif and background sequence sets, then walks the PWM records in
 * the GA output file (argv[3]). The first record is always scored; later
 * records are scored only when their generation number is a multiple of 5.
 * For each scored record the PWM is transformed, both sequence sets are
 * scored, and the value returned by cal_ROC is written to the output file
 * (argv[6]) as "generation<TAB>score".
 *
 * Returns EXIT_SUCCESS when the file was processed, EXIT_FAILURE on bad
 * arguments, unreadable/unwritable files, or a malformed PWM record.
 * (The previous code returned 1 on success and 0 on errors, which inverted
 * the conventional process exit status.)
 */
int main(int argc,char **argv) {

   int numSeq1=0,numSeq0=0,pwmLen,generation=0;
   int *seqLen1=NULL,*seqLen0=NULL;
   char **seq1=NULL,**rseq1=NULL,**seq0=NULL,**rseq0=NULL,**geneID=NULL,*buffer=NULL;
   double **pwm=NULL,**tpwm=NULL,*score1=NULL,*score0=NULL;
   double sumMin,sumMax,fitnessScore,specificity;
   int first=1,status=EXIT_FAILURE;
   FILE *fp=NULL,*fq=NULL;

   printf("  \n\nThis program calculates fitness score (area under ROC) for each PWM\n");
   printf("in the output file from gapwm using a motif set and a background set.\n\n");

   if (argc!=7) {
      printf("USAGE: fitnessPWM MotifSet BackgSet outputFromGA pwmLen specificity output\n\n");
      printf("  MotifSet:      file containing motif sequences\n");
      printf("  BackgSet:      file containing background sequences\n");
      printf("  outputFromGA:  file containing the best performing PWMs from <gapwm>\n");
      printf("  pwmLen:        length of the PWM\n");
      printf("  specificity:   specificity as used in <gapwm>, e.g., 0.9\n");
      printf("  output:        output file containing the scores\n\n");
      return EXIT_FAILURE;
   }

   pwmLen=atoi(argv[4]);
   specificity=atof(argv[5]);

   /* pwm/tpwm hold MAX_PWM_LENGTH columns; a larger pwmLen would write past them. */
   if (pwmLen<1 || pwmLen>MAX_PWM_LENGTH) {
      fprintf(stderr,"Error: pwmLen must be between 1 and %d\n",(int)MAX_PWM_LENGTH);
      return EXIT_FAILURE;
   }

   /* The allocation helpers come from fitness_pwm.h; their failure behaviour
    * is not visible here, so their results are used as the original code did. */
   seqLen1=alloc_int(MAX_NUM_SEQ);
   seqLen0=alloc_int(MAX_NUM_SEQ);
   geneID=alloc_char_char(MAX_NUM_SEQ,500);
   pwm=alloc_double_double(MAX_PWM_LENGTH,FITPWM_ROWS);
   tpwm=alloc_double_double(MAX_PWM_LENGTH,FITPWM_ROWS);
   buffer=alloc_char(FITPWM_LINE_MAX);

   /* read motif sequence set */
   seq1=read_seq(&numSeq1,seqLen1,geneID,MAX_NUM_SEQ,MAX_SEQ_LENGTH,argv[1]);
   /* read background sequence set (geneID is reused and overwritten) */
   seq0=read_seq(&numSeq0,seqLen0,geneID,MAX_NUM_SEQ,MAX_SEQ_LENGTH,argv[2]);

   /* An ROC curve needs at least one positive and one negative example. */
   if (numSeq1<1 || numSeq0<1) {
      fprintf(stderr,"Error: motif and background sets must each contain at least one sequence\n");
      goto cleanup;
   }

   rseq1=alloc_char_char(numSeq1,MAX_SEQ_LENGTH);
   rseq0=alloc_char_char(numSeq0,MAX_SEQ_LENGTH);
   score1=alloc_double(numSeq1);
   score0=alloc_double(numSeq0);

   reverse_seq(seq1,rseq1,numSeq1,seqLen1);
   reverse_seq(seq0,rseq0,numSeq0,seqLen0);

   fp=fopen(argv[3],"r");
   if (!fp) { perror(argv[3]); goto cleanup; }

   /* Skip forward to the first "generation:" header. Testing the fgets result
    * (instead of feof beforehand) avoids comparing a buffer that was never filled. */
   do {
      if (!fgets(buffer,FITPWM_LINE_MAX,fp)) {
         printf("Error: no data in %s???\n",argv[3]);
         goto cleanup;
      }
   } while (strncmp(buffer,"generation:",11)!=0);

   /* On entry to each iteration `buffer` holds a "generation" header line. */
   for (;;) {
      if (!parse_generation_id(buffer,&generation) ||
          !load_pwm_record(fp,buffer,pwm,pwmLen)) {
         fprintf(stderr,"Error: malformed or truncated PWM record in %s\n",argv[3]);
         goto cleanup;
      }

      /* The first record is always scored; later ones only every 5th generation. */
      if (first || generation%5==0) {
         if (!first) printf("scanning pwm: %d\n",generation);

         transform_pwm(pwm,tpwm,pwmLen);
         sum_position_min(&(sumMin),tpwm,pwmLen);
         sum_position_max(&(sumMax),tpwm,pwmLen);

         score_seq(numSeq0,seq0,rseq0,seqLen0,tpwm,pwmLen,sumMin,sumMax,score0);
         score_seq(numSeq1,seq1,rseq1,seqLen1,tpwm,pwmLen,sumMin,sumMax,score1);
         fitnessScore=cal_ROC(score1,numSeq1,score0,numSeq0,1.0-specificity);

         if (first) {
            /* The output file is created once the first PWM has been scored. */
            fq=fopen(argv[6],"w");
            if (!fq) { perror(argv[6]); goto cleanup; }
            fprintf(fq,"#==========================================================================\n");
            fprintf(fq,"#train motif sequence file:\t%s\n",argv[1]);
            fprintf(fq,"#train backg sequence file:\t%s\n",argv[2]);
            fprintf(fq,"#specificity:\t\t\t%4.2f\n",specificity);
            fprintf(fq,"#pwm file:\t\t\t%s\n",argv[3]);
            fprintf(fq,"#=========================================================================\n\n");
         }
         fprintf(fq,"%3d\t%7.5f\n",generation,fitnessScore);
         fflush(fq);
      }
      first=0;

      /* Records are separated by one line; the line after it must be the next
       * "generation" header, otherwise (or at end of file) processing stops. */
      if (!fgets(buffer,FITPWM_LINE_MAX,fp)) break;
      if (!fgets(buffer,FITPWM_LINE_MAX,fp)) break;
      if (strncmp(buffer,"generation",10)!=0) break;
   }
   status=EXIT_SUCCESS;

cleanup:
   if (fp) fclose(fp);
   if (fq && fclose(fq)!=0) { perror(argv[6]); status=EXIT_FAILURE; }

   /* Two-level arrays are released as element [0] followed by the pointer
    * array itself, exactly as the original code released them. */
   free(seqLen1);
   free(seqLen0);
   if (seq1)   { free(seq1[0]);   free(seq1);   }
   if (seq0)   { free(seq0[0]);   free(seq0);   }
   if (rseq1)  { free(rseq1[0]);  free(rseq1);  }
   if (rseq0)  { free(rseq0[0]);  free(rseq0);  }
   if (geneID) { free(geneID[0]); free(geneID); }
   free(buffer);
   if (pwm)    { free(pwm[0]);    free(pwm);    }
   if (tpwm)   { free(tpwm[0]);   free(tpwm);   }
   free(score1);
   free(score0);

   return status;
}

