#include "config.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif

#include "defines.h"

/* Allocation helpers declared here and defined outside this file.
 * Only alloc_char() and alloc_char_char() are called below; buffers obtained
 * from them are released with free() by the functions in this file. */
int **alloc_int_int(int ,int );
char *alloc_char(int );
char **alloc_char_char(int, int);
double *alloc_double(int );

/*
 * read_seq: load '>'-headed sequence records from fileName into data[].
 *
 * For every header line (first character '>') the header text up to the
 * first space -- or up to the end of the line when it contains no space --
 * is stored in data[i].name (the leading '>' is kept).  The following lines
 * are concatenated into data[i].seq until the next header or end of file;
 * lines starting with '/' or '#' are skipped and at most MAX_SEQ_LENGTH
 * characters are kept per record.  Records with fewer than two characters
 * are discarded.  Finally every stored character is folded to lower-case
 * a/c/g/t/n, with anything unrecognised becoming 'n'.
 *
 * Lines that appear before the first header are ignored (previously they
 * made the reader spin forever), trailing "\n" / "\r\n" is stripped only
 * when actually present so an unterminated last line keeps its final base.
 *
 * Returns the number of records stored.  Terminates the process if the file
 * cannot be opened or if MAX_NUM_SEQ records are reached.
 *
 * NOTE(review): the capacity of data[i].name and data[i].seq is not visible
 * here; the header token is copied unchecked and seq is assumed to hold
 * MAX_SEQ_LENGTH+1 characters -- confirm against the SAMPLE definition.
 * NOTE(review): failures exit with status 0, kept as in the original.
 */
int read_seq(SAMPLE *data,char *fileName) {

   FILE *fp;
   int i,j,len,numSeq,cn,haveLine;
   char *buffer;

   printf("reading %s...\n",fileName);

   fp=fopen(fileName,"r");
   if (!fp) { perror(fileName); exit(0); }

   buffer=alloc_char(MAX_BUFFER_LENGTH);

   i=0;
   haveLine=(fgets(buffer,MAX_BUFFER_LENGTH,fp)!=NULL);
   while (haveLine) {
      if (buffer[0]!='>') {
         // not inside a record yet: skip ahead to the next line
         haveLine=(fgets(buffer,MAX_BUFFER_LENGTH,fp)!=NULL);
         continue;
      }

      // header: drop the line ending, then cut the name at the first space
      len=(int)strlen(buffer);
      while (len>0 && (buffer[len-1]=='\n' || buffer[len-1]=='\r')) len--;
      buffer[len]='\0';
      len=(int)strcspn(buffer," ");
      memcpy(data[i].name,buffer,(size_t)len);
      data[i].name[len]='\0';

      // body: accumulate lines until the next header or end of file
      cn=0;
      while ((haveLine=(fgets(buffer,MAX_BUFFER_LENGTH,fp)!=NULL))) {
         if (buffer[0]=='>') break;                              // next record; handled by the outer loop
         if (buffer[0]=='/' || buffer[0]=='#') continue;         // comment line

         len=(int)strlen(buffer);
         while (len>0 && (buffer[len-1]=='\n' || buffer[len-1]=='\r')) len--;

         for (j=0; j<len && cn<MAX_SEQ_LENGTH; j++) data[i].seq[cn++]=buffer[j];
      }
      data[i].seq[cn]='\0';
      data[i].length=cn;

      // keep the record only if it has at least two characters
      if (cn>1) {
         i++;
         if (i>=MAX_NUM_SEQ) {
            printf("\n\nErro: maximal number of seqences reached!\n");
            printf("Please reset MAX_NUM_SEQ in gadem.h and rebuild (see installation)\n\n");
            exit(0);
         }
      }
   }
   fclose(fp);
   free(buffer);
   buffer=NULL;

   // normalise the alphabet: lower-case a/c/g/t/n, everything else becomes 'n'
   numSeq=i;
   for (i=0; i<numSeq; i++) {
      for (j=0; j<data[i].length; j++) {
         switch (data[i].seq[j]) {
            case 'A': case 'a': data[i].seq[j]='a'; break;
            case 'C': case 'c': data[i].seq[j]='c'; break;
            case 'G': case 'g': data[i].seq[j]='g'; break;
            case 'T': case 't': data[i].seq[j]='t'; break;
            default:            data[i].seq[j]='n'; break;
         }
      }
   }

   return (numSeq);
}

/*
 * max_seq_length: return the largest `length` field among the first numSeq
 * entries of data, or 0 when numSeq is not positive (or no length exceeds 0).
 */
int max_seq_length(int numSeq,SAMPLE *data) {

   int longest=0;
   int idx=0;

   while (idx<numSeq) {
      if (longest<data[idx].length) {
         longest=data[idx].length;
      }
      idx++;
   }
   return longest;
}

/*
 * average_seq_length: mean of the `length` fields of the first numSeq
 * entries of data, truncated toward zero.
 *
 * Returns 0 when numSeq is not positive; the original divided 0.0 by 0.0 in
 * that case and converted the resulting NaN to int, which is undefined.
 * The sum is accumulated in a long long so that many long sequences cannot
 * overflow an int total.
 */
int average_seq_length(int numSeq,SAMPLE *data) {

   int i;
   long long total=0;

   if (numSeq<=0) return 0;

   for (i=0; i<numSeq; i++) total+=data[i].length;

   return ((int)(total/numSeq));
}

/*
 * base_count: tally lower-case a/c/g/t occurrences per sequence.
 *
 * count[s][0..3] receives the number of 'a', 'c', 'g' and 't' characters
 * found in the first data[s].length characters of data[s].seq, for each of
 * the numSeq sequences.  Any other character is ignored.
 */
void base_count(int **count,int numSeq,SAMPLE *data) {

   int s,p,b;

   // first pass: clear every row
   for (s=0; s<numSeq; s++) {
      int *row=count[s];
      for (b=0; b<4; b++) {
         row[b]=0;
      }
   }

   // second pass: count bases
   for (s=0; s<numSeq; s++) {
      int *row=count[s];
      for (p=0; p<data[s].length; p++) {
         char base=data[s].seq[p];
         if      (base=='a') row[0]++;
         else if (base=='c') row[1]++;
         else if (base=='g') row[2]++;
         else if (base=='t') row[3]++;
      }
   }
}

/*
 * print_seq: debugging aid that writes each sequence's name and seq, one per
 * line, to the file "debug.seq" in the current directory (overwriting it).
 *
 * If the file cannot be opened the error is reported with perror() and
 * nothing is written; the original passed the NULL stream to fprintf().
 */
void print_seq(int numSeq,SAMPLE *data) {

   int i;
   FILE *fp;

   fp=fopen("debug.seq","w");
   if (!fp) { perror("debug.seq"); return; }

   for (i=0; i<numSeq; i++) {
      fprintf(fp,"%s\n",data[i].name);
      fprintf(fp,"%s\n",data[i].seq);
   }
   fclose(fp);
}

/*
 * copy_data: duplicate the first numSeq entries of data into odata.
 * For each entry the seq and name strings are copied with strcpy() and the
 * length field is assigned; odata's buffers must already be large enough.
 */
void copy_data(SAMPLE *odata,SAMPLE *data,int numSeq) {

   int n;

   for (n=0; n<numSeq; n++) {
      SAMPLE *dst=&odata[n];
      SAMPLE *src=&data[n];

      strcpy(dst->seq,src->seq);
      strcpy(dst->name,src->name);
      dst->length=src->length;
   }
}

/*
 * mask_kmer_runs: one masking pass over all sequences for a set of k-mers.
 *
 * At every start position j, and for every k-mer l, the window is advanced
 * by step[l] for as long as it still equals kmer[l].  If at least one match
 * occurred, the span starting at j and covering pos+klen-1 characters (pos =
 * total advance) is optionally logged to fp as "name:start-end<TAB>bases"
 * and then overwritten with 'n'.  For steps greater than one this span
 * reaches step-1 characters past the last matching window, exactly as the
 * original code did; it is now clamped to the sequence length so that the
 * string terminator is never overwritten or printed.
 *
 * Comparing directly against the sequence with strncmp() stops at the
 * terminator, so no characters beyond it are examined.
 * Assumes data[i].seq is NUL-terminated at index data[i].length.
 * fp may be NULL, in which case nothing is logged.
 */
static void mask_kmer_runs(SAMPLE *data,int numSeq,const char *const *kmer,
                           const int *step,int numKmer,int klen,FILE *fp) {

   int i,j,k,l,pos,end;

   for (i=0; i<numSeq; i++) {
      for (j=0; j<data[i].length-klen+1; j++) {
         for (l=0; l<numKmer; l++) {
            pos=0;
            while (strncmp(data[i].seq+j+pos,kmer[l],(size_t)klen)==0) pos+=step[l];
            if (pos==0) continue;

            // one-past-the-end index of the masked span, kept inside the sequence
            end=j+pos+klen-1;
            if (end>data[i].length) end=data[i].length;

            if (fp) {
               fprintf(fp,"%s:%d-%d\t",data[i].name,j+1,end);
               for (k=j; k<end; k++) fputc(data[i].seq[k],fp);
               fputc('\n',fp);
            }
            for (k=j; k<end; k++) data[i].seq[k]='n';
         }
      }
   }
}

/*
 * mask_repetitive: replace simple-repeat runs in every sequence with 'n'.
 *
 * Three passes are made: mono-/di-nucleotide 8-mers, tri-nucleotide 12-mers
 * and the 15-mer "cagcagcagcagcag".  Each masked region is also written to
 * "<basename of fileName>.mask" in the current directory, where the basename
 * is the part of fileName after its last '/'.
 *
 * data     - sequences to mask in place (seq is modified, length is not)
 * numSeq   - number of entries in data
 * fileName - input file name used to derive the log file name
 *
 * If the log file cannot be created the problem is reported with perror()
 * and masking still takes place without logging.
 */
void mask_repetitive(SAMPLE *data,int numSeq,char *fileName) {

   static const char *const kmer8[]={
      "aaaaaaaa","tttttttt","cacacaca","tgtgtgtg","tatatata"
   };
   static const int step8[]={1,1,2,2,2};

   static const char *const kmer12[]={
      "ggaggaggagga","gaggaggaggag","agaagaagaaga","ctcctcctcctc","tcctcctcctcc",
      "tcttcttcttct","tagtagtagtag","aataataataat","attattattatt","ataataataata"
   };
   static const int step12[]={3,3,3,3,3,3,3,3,3,3};   // three letter repeats

   static const char *const kmer15[]={"cagcagcagcagcag"};
   static const int step15[]={3};                     // three letter repeats

   const char *base;
   char *maskedFileName;
   FILE *fp=NULL;

   // strip any directory part so the log lands in the current directory
   base=strrchr(fileName,'/');
   base=base ? base+1 : fileName;

   // size the name buffer from the actual input instead of a fixed 500 bytes
   maskedFileName=malloc(strlen(base)+sizeof(".mask"));
   if (maskedFileName) {
      strcpy(maskedFileName,base);
      strcat(maskedFileName,".mask");
      fp=fopen(maskedFileName,"w");
      if (!fp) perror(maskedFileName);
   }
   else perror("mask_repetitive");

   mask_kmer_runs(data,numSeq,kmer8, step8, (int)(sizeof(kmer8)/sizeof(kmer8[0])),   8,fp);
   mask_kmer_runs(data,numSeq,kmer12,step12,(int)(sizeof(kmer12)/sizeof(kmer12[0])),12,fp);
   mask_kmer_runs(data,numSeq,kmer15,step15,(int)(sizeof(kmer15)/sizeof(kmer15[0])),15,fp);

   if (fp) fclose(fp);
   free(maskedFileName);
}

