/* This software is written by Leping Li and Tracy Xu at NIEHS/NIH */
#include "config.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <ctype.h>
#include <sys/types.h>
#include <unistd.h>

#include "defines.h"
#include "evalue_meme.h"
#include "header.h"

// last modification: May 25, 2011

int main(int argc, char **argv) {

   if(argc<2) {
      printf("\n");
      printf("             +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~+\n");
      printf("             +                                                                                              +\n");
      printf("             +     coMOTIF: a mixture model with an EM algorithm for motif and co-factor motif discovery    +\n");
      printf("             +                                           v1.0                                               +\n");
      printf("             +                                       May 25, 2011                                           +\n");
      printf("             +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~+\n");
      printf("\n   Usage: ./coMOTIF -fseq seqFile -fpwm0 pwmFile -model modelType optional arguments\n");
      printf("\n");
      printf("   -fseq    string   File name for sequence data in FASTA format (case insensitive).\n");
      printf("                     Example:\n");
      printf("                     >200C_17C_chr1:91356950-91357349\n");
      printf("                     GTCGGTTTTCTTGCCAACACATACTTTATTTTCTCTTTATGGCTAAATAA\n");
      printf("                     AACTCCTGTGTGTGTATGTACCACATTTCACCCGTTCATCTGCTGCCAGA\n");
      printf("                     CGCTGGGCTGCAGTGAACAGCGATGCGGTAGGCATGAACGAGCAGCATGA\n");
      printf("                     ACGAGCAGGTGTCCCTGCGGTGTGCTTAGGCCTTTGGGTAGATTCCCAGG\n");
      printf("                     TACGGGTGGGTCATGTGATCTTGCCAATTGTTTTAAATTGAAGCCAGGTT\n");
      printf("                     TTTGGTTGACTCATTCTCATCAGCCACCAGTAGATGGAGGGAGTGAGACA\n");
      printf("                     TGCAAACAGAGTGCTGTCCCCACTGCCCGGAGTCTGTGACATCCATCCCT\n");
      printf("                     AAAGATGTGTGTTCATATTGTTCCGTGTGGATGTGCCCGAGTGTGTGTAG\n"); 
      printf("\n");
      printf("-model 1PWM or 2PWM  Specify which model to run: 1PWM - one-motif model or 2PWM - two-motif model (default).\n");
      printf("                     The 1PWM model consists of background and one motif (two components) whereas the 2PWM model\n");
      printf("                     consists of background, motif1, and co-factor motif2 (three components). The one-motif model\n");
      printf("                     identifies one motif at a time whereas the two-motif model identifies two motifs simultaneously.\n");
      printf("\n"); 
      printf("   -fpwm0   string   File name for the seed PWMs. All PWMs must be placed in a single file (see below). Each PWM must\n"); 
      printf("                     be represented by a) PWM identifier (with or without #); b) number of rows (4) & columns in each PWM;\n");
      printf("                     c) PWM matrix either in integer counts OR decimal frequencies. The first PWM is always taken as the\n");
      printf("                     PWM for the primary motif. Depending on the model type (1PWM or 2PWM), the remaining PWMs are ether\n");
      printf("                     used as the starting PWMs for the co-factor motifs or the starting PWMs for the one-motif model.\n\n");
      printf("                     In the example below, there are three PWMs. For the two-motif model, coMOTIF automatically carries\n");
      printf("                     out two independent runs: HNF4A/HNF1A and HNF4A/Foxa2. For the one-motif model, coMOTIF carries\n");
      printf("                     out three independent runs with each of the three PWMs as the starting PWM.\n");
      printf("\n");
      printf("                     Example1:\n");
      printf("                     #HNF4A.mx\n");
      printf("                     4       13\n");
      printf("                     28      2       12      5       3       59      53      56      4       6       3       4       42\n");
      printf("                     7       2       4       23      51      1       2       1       4       2       22      49      7\n");
      printf("                     27      56      35      20      4       3       10      8       58      33      11      5       10\n");
      printf("                     5       7       16      19      9       4       2       2       1       26      31      9       8\n");
      printf("                     #HNF1A.mx\n");
      printf("                     4       14\n");
      printf("                     5       1       1       1       20      16      1       8       14      2       0       13      8       5\n");
      printf("                     0       0       0       0       0       2       0       2       0       0       4       1       8       13\n");
      printf("                     14      20      0       0       0       1       0       4       1       0       0       3       3       0\n");
      printf("                     2       0       20      20      1       2       20      7       6       19      17      4       2       3\n");
      printf("                     Foxa2\n");
      printf("                     4       10\n");
      printf("                     0.4761 0.5951 0.5951 0.3095 0.0477 0.8807 0.9521 0.9759 0.0001 0.8807\n"); 
      printf("                     0.0953 0.1429 0.0715 0.0953 0.4999 0.1191 0.0477 0.0001 0.5237 0.0001\n"); 
      printf("                     0.0477 0.2143 0.1905 0.5475 0.0239 0.0001 0.0001 0.0239 0.0239 0.0715\n"); 
      printf("                     0.3809 0.0477 0.1429 0.0477 0.4285 0.0001 0.0001 0.0001 0.4523 0.0477\n");
      printf("\n");
      printf("                     If the co-factor motif is not abundant, it may be difficult to identify - meaning that the starting PWM\n");
      printf("                     may converge to a different PWM. In that case, one might want to consider: 1) set -full to 0. This allows\n");
      printf("                     coMOTIF to run the mixture model with the PWM for the co-factor motif fixed (without being updating)\n");
      printf("                     (preferred option); 2) alternatively, list the PWM for the co-factor motif multiple times in the PWM file\n");
      printf("                     (see example below) and set -maskS to 1 (see below). This allows coMOTIF iteratively masks the motif\n");
      printf("                     instances once found. Masking is not ideal and it does not guarantee finding desired motif.\n");
      printf("\n");
      printf("                     Example2: \n");
      printf("                     #HNF4A.mx\n");
      printf("                     4       13\n");
      printf("                     28      2       12      5       3       59      53      56      4       6       3       4       42\n");
      printf("                     7       2       4       23      51      1       2       1       4       2       22      49      7\n");
      printf("                     27      56      35      20      4       3       10      8       58      33      11      5       10\n");
      printf("                     5       7       16      19      9       4       2       2       1       26      31      9       8\n");
      printf("                     #Foxa2_a\n");
      printf("                     4       10\n");
      printf("                     20     25      25      13      2       37      40      41      0       37\n");
      printf("                     4      6       3       4       21      5       2       0       22      0\n");
      printf("                     2      9       8       23      1       0       0       1       1       3\n");
      printf("                     16     2       6       2       18      0       0       0       19      2\n");
      printf("                     #Foxa2_b\n");
      printf("                     4       10\n");
      printf("                     20     25      25      13      2       37      40      41      0       37\n");
      printf("                     4      6       3       4       21      5       2       0       22      0\n");
      printf("                     2      9       8       23      1       0       0       1       1       3\n");
      printf("                     16     2       6       2       18      0       0       0       19      2\n");
      printf("\n");
      printf("   -full    1 or 0   Indicator for whether or not to update the PWM for the co-factor motif in EM (1-yes[default], 0-no)\n");
      printf("                     This argument allows one to turn on or off the optimization for the co-factor PWM and allow coMOTIF\n");
      printf("                     to identify low-abundant co-factor motifs. However, this depends on the quality of the starting PWMs.\n");
      printf("\n"); 
      printf("                     PRIOR PARAMETERS FOR PROPORTIONS\n\n");
      printf("                     A sequence may not contain a binding site for either motif (pure 'noise'). A binding site may be\n");
      printf("                     present in plus or reverse complementary strand of a sequence. Thus, there are nine possible states:\n");
      printf("                     no motif (only noise), motif 1 on plus strand, motif 1 on minus strand, motif 2 on plus strand\n");
      printf("                     motif 2 on minus strand, both motifs 1 and 2 on plus strand, motif 1 on plus and motif 2 on minus, and\n");
      printf("                     motif1 on plus and motif 2 on minus strand. coMOTIF estimates the nine probabilities for each sequence.\n");
      printf("                     Summing these probabilities across all sequences give the nine proportions in the data.\n\n");
      printf("                     By default, coMOTIF uses a flat prior (equal proportion for all nine or three states). If you choose\n");
      printf("                     to use different initial values (priors). These probabilities can be specified in the parameter file\n");
      printf("                     with the argument -fparm (below).\n");
      printf("\n");
      printf("                            | backg  motif2+  motif2-\n");
      printf("                     -----------------------------------\n");
      printf("                     backg  |   p00    p01     p02 (background, motif2 in plus, motif2 in minus)\n");
      printf("                     motif1+|   p10    p1l     p12\n");
      printf("                     motif1-|   p20    p21     p22\n");
      printf("\n"); 
      printf("                     For a one-motif (two-component) model, there are only three such prior probabilities\n");
      printf("                            |\n");
      printf("                     -----------------------------------\n");
      printf("                     backg  |   p00\n");
      printf("                     motif1+|   p10\n");
      printf("                     motif1-|   p20\n");
      printf("\n"); 
      printf("  -fparm    string   File name for the prior parameters.\n");
      printf("                     Examples\n\n");
      printf("                     # flat prior motif proportions for two-motif model:\n");
      printf("                     3  3\n");
      printf("                     0.1111 0.1111 0.1111\n");
      printf("                     0.1111 0.1111 0.1111\n");
      printf("                     0.1111 0.1111 0.1111\n");
      printf("\n");
      printf("                     # flat prior motif proportions for one-motif model:\n");
      printf("                     3  1\n");
      printf("                     0.3333\n");
      printf("                     0.3333\n");
      printf("                     0.3333\n");
      printf("\n");
      printf("                     BACKGROUND MODEL\n\n");
      printf("                     To compute the probability of a sequence being generated by the background model, coMOTIF either uses\n");
      printf("                     the [A,C,G,T] frequencies in the input data as the parameters for the 0th-order background model or reads\n");
      printf("                     in the background model parameters from a user-specified file with -fbackg argument. When a user-specified\n" );
      printf("                     background model is used, coMOTIF automatically chooses the highest possible order as the order for the\n");
      printf("                     background model. The order of the background model can be changed using -bOrder argument (below).\n");
      printf("\n");
      printf("   -fbackg   string  File name for higher order Markov background model. Up to 9th order (nonamer + 1nt = octamer) is allowed.\n\n");
      printf("                     #monomer frequency\n");
      printf("                     a       0.20850000001660\n");
      printf("                     c       0.29149999998340\n");
      printf("                     g       0.29149999998340\n");
      printf("                     t       0.20850000001660\n");
      printf("                     #dimer frequency\n");
      printf("                     aa      0.04800960194357\n");
      printf("                     ac      0.05151030207800\n");
      printf("                     ag      0.08171634323790\n");
      printf("                     at      0.02720544114470\n");
      printf("                                 ....\n");
      printf("                     ta      0.03460692142891\n");
      printf("                     tc      0.05361072215865\n");
      printf("                     tg      0.07231446287687\n");
      printf("                     tt      0.04800960194357\n");
      printf("                     #trimer frequency\n");
      printf("                     aaa     0.01200480194395\n");
      printf("                     aac     0.01550620248175\n");
      printf("                     aag     0.01420568228200\n");
      printf("                     aat     0.00630252106809\n");
      printf("                     ....\n");
      printf("                     tta     0.01190476192858\n");
      printf("                     ttc     0.01180472191322\n");
      printf("                     ttg     0.01230492199004\n");
      printf("                     ttt     0.01200480194395\n");
      printf("                                 ....\n");
      printf("\n");
      printf("  -bOrder   0-9      The order of the Markov background model (0-9). This argument should only be used with -fbackg.\n");
      printf("\n");
      printf("Other arguments:\n");
      printf("\n"); 
      printf("  -em       integer  Maximal number of EM steps (default: 1000).\n"); 
      printf("  -detail   0 or 1   Print out all nine probabilities for each sequence (0-no[default], 1-yes.\n");
      printf("  -minF     float    Minimal fraction of sequences containing a site required for a motif to be reported\n");
      printf("                     (default: 0.05).\n");
      printf("  -posWt    1 or 0   Motif location prior [1-Gaussian(default), 0-uniform]. If you expect central enrichment as\n");
      printf("                     in ChIP-seq, use the default. The Gaussian priors are applied to both the primary motif and\n");
      printf("                     the co-factor motif, but with difference variances (25 for primary and 75 for co-factor).\n");
      printf("                     The joint location prior is uniform (L-w1-w2+1)*(L-w1-w2+2)\n");
      printf("  -sigma1   float    standard deviation for the Gaussian prior for motif 1 (primary)\n");
      printf("  -sigma2   float    standard deviation for the Gaussian prior for motif 2 (co-factor)\n");
      printf("  -maskR    0 or 1   Indicator for whether to mask simple repetive elements such as aaaaaaaa, ggaggaggaggagga\n");
      printf("                     before running the EM algorithm (0 -no [default], 1 - yes)\n\n");
      printf("  -maskS    0 or 1   Indicator for whether to mask motif instances once or not once found [0-no(default), 1-yes].\n");
      printf("                     For a low abundant motif, the initial PWM may converge to a different solution (PWM).\n");
      printf("                     Iteratively masking those motifs may lead to the identification of the motif. Note that\n");
      printf("                     this argument only applies to the co-factor motif.\n");
      printf("  -extTrim  0 or 1   Base extension and trimming (0 -no [default], 1 - yes).\n\n");
      printf("---------------------------------------------------------------------------------------------------------------------\n");
      printf("Examples:\n");
      printf("\n");
      printf("1. Identify one motif at a time using each of the PWMs in a file as the starting PWM with expected motif central\n");
      printf("   enrichment.\n\n");
      printf("   coMOTIF -fseq input.seq -fpwm0 pwmFileName -model 1PWM\n");
      printf("\n");
      printf("2. Identify one motif at a time using each of the PWMs in a file as the starting PWM with expected motif central\n");
      printf("   enrichment.\n\n");
      printf("   coMOTIF -fseq input.seq -fpwm0 pwmFileName -model 1PWM\n");
      printf("\n");
      printf("3. Identify a primary and cofactor motifs using the first and each of the remaining PWMs as the starting PWMs\n");
      printf("   for the primary and co-factor motifs, respectively.\n\n");
      printf("   coMOTIF -fseq input.seq -fpwm0 pwmFileName -model 2PWM\n");
      printf("\n");
      printf("4. Identify a primary and co-factor motifs in ChIP-seq data with a high-order Markov background model.\n");
      printf("   enrichment\n\n");
      printf("   coMOTIF -fseq input.seq -fpwm0 pwmFileName -model 2PWM -fbackg backgroundFileName (see file in <examples> directory)\n");
      printf("\n");
      printf("---------------------------------------------------------------------------------------------------------------------\n");
      printf("Description of Output Files\n\n");
      printf("   1) info.txt\n");
      printf("      This file contains summary information for the run, e.g., command line, parameters, etc.\n\n");
      printf("   2) <PWM1_PWM2>.txt or <PWM1>.txt\n");
      printf("       The PWM1 and PWM2 are the names of the primary and co-factor PWM names. Each contains the summary\n");
      printf("       results, estimated PWMs, and the predicted locations of the binding sites.\n\n");
      printf("   3) <PWM1_PWM2>.loc or <PWM1>.loc\n");
      printf("      This companion file containing the predicted locations of the binding sites. There are four columns in\n");
      printf("      <PWM1_PWM2>.loc. The last two columns list only locations where both motif1 and motif2 are found on the\n");
      printf("      same sequences for plotting the joint distribution.\n\n"); 
      printf("   4) estimatedPWM1.txt\n");
      printf("      This file contains the estimated PWMs for the primary motif from all runs.\n\n");
      printf("   5) estimatedPWM2a.txt, estimatedPWM2b.txt, estimatedPWM2c.txt\n");
      printf("      The estimated PWMs for the co-factor motifs from all runs are placed in three different files. The PWMs\n");
      printf("      are in STAMP format and the files can be loaded on STAMP (http://www.benoslab.pitt.edu/stamp/) for\n");
      printf("      similarity search.\n\n");
      printf("         A) estimatedPWM2a.txt contains the estimated PWMs that are different from the starting PWM and the\n");
      printf("            estimated PWM is also degenerate (less than 1/4 of its position having 1 bit or more information on\n");
      printf("            a 2-bit scale). Presumably the corresponding results (e.g., in files 2,3) may not be interesting.\n\n");
      printf("         B) estimatedPWM2b.txt contains the estimated PWMs that are different from the starting PWMs but not\n");
      printf("            degenerate (based on above criterion). This set may be informative.\n\n");
      printf("         C) estimatedPWM2c.txt contains the estimated PWMs that did not diverge and are not degenerate. The\n");
      printf("            corresponding results (files 2,3) may be the most interesting.\n\n"); 
      printf("---------------------------------------------------------------------------------------------------------------------\n");
      printf("Software Download\n");
      printf("   http://www.niehs.nih.gov/research/resources/software/comotif\n\n");
      printf("Contact: li3@niehs.nih.gov\n\n");
      printf("---------------------------------------------------------------------------------------------------------------------\n");
      exit(1);  
   }

   register int ii,i,j;
   int goodArgument;

   for (ii=1; ii<argc-1; ii++) {
      if (argv[ii][0]=='-' && isalpha(argv[ii][1])) {
         goodArgument=0;
         if (
             (strncmp(argv[ii],"-em",3)==0         && strlen(argv[ii])==3) ||
             (strncmp(argv[ii],"-fseq",5)==0       && strlen(argv[ii])==5) ||
             (strncmp(argv[ii],"-full",5)==0       && strlen(argv[ii])==5) ||
             (strncmp(argv[ii],"-minF",5)==0       && strlen(argv[ii])==5) ||
             (strncmp(argv[ii],"-model",6)==0      && strlen(argv[ii])==6) ||
             (strncmp(argv[ii],"-maskS",6)==0      && strlen(argv[ii])==6) ||
             (strncmp(argv[ii],"-maskR",6)==0      && strlen(argv[ii])==6) ||
             (strncmp(argv[ii],"-fpwm0",6)==0      && strlen(argv[ii])==6) ||
             (strncmp(argv[ii],"-fparm",6)==0      && strlen(argv[ii])==6) ||
             (strncmp(argv[ii],"-posWt",6)==0      && strlen(argv[ii])==6) ||
             (strncmp(argv[ii],"-fbackg",7)==0     && strlen(argv[ii])==7) ||
             (strncmp(argv[ii],"-sigma1",7)==0     && strlen(argv[ii])==7) ||
             (strncmp(argv[ii],"-sigma2",7)==0     && strlen(argv[ii])==7) ||
             (strncmp(argv[ii],"-bOrder",7)==0     && strlen(argv[ii])==7) ||
             (strncmp(argv[ii],"-detail",7)==0     && strlen(argv[ii])==7) ||
             (strncmp(argv[ii],"-extTrim",8)==0    && strlen(argv[ii])==8) 
            ) { goodArgument=1; }
         if (!goodArgument) { printf("argument: %s unknown\n",argv[ii]); exit(0);  }
      }
   }

   char *seqFileName=NULL;
   char *parameterFileName=NULL;
   char *pwmFileName=NULL;
   char *backgFileName=NULL;

   FILE *fq1;                     // estimated PWM1 (primary motif)
   FILE *fq2a,*fq2b,*fq2c;        // estimated PWM2 (co-factor motif) categorized into three classes
   FILE *fq3;                     // file containing PWM similarity
   FILE *f3;                      // summary information
   // FILE *f1;

   int numSeq,maxSeqLen,aveSeqLen; // sequence info
   int numPWM=0;                   // number of initial PWM2
   
   SAMPLE *data=NULL;            // data - sites may be masked 
   MODEL *initia=NULL;             // initial models
   MODEL *model0=NULL;             // starting models in EM
   MODEL *model1=NULL;             // updated models in EM
   WSCORE *wscore=NULL;            // sequence weights
   PRIOR *prior=NULL;              // motif location priors
   B_LIKELIHOOD *ll_b;             // background likelihood
   BACKGROUND_Model *back=NULL;    // background model
   M_SITE *site=NULL;              // motif sites
   P_SIM *sim=NULL;                // pwm similarity

   int *seqLen=NULL;
   int nsites[2];
   int numSeqHavingSites; 
   int step;
   double *logSeqBackgLL=NULL;
   int **seqBaseCount=NULL;
   double maxDiff;
   double logev[2];
   double bfreq[5];
   double p1=0;
   double p2=0;
   int maxCandidateSites;

   // default settings
   char modelType[10]="2PWM";// two-pwm model
   int numEM=200;
   int two_motif_model=1;    // indicator for whether to run one- or two-motif model, default two-motif
   int full=1;               // optimize co-factor motif if running 2PWM model. The PWM for primary motif is always optimized
   int maskS=0;              // no masking of motif sites once found
   int maskR=0;              // no masking of simple repetitive elements before running EM
   int extTrim=0;            // no motif extension and trimming
   int printProb=0;          // don't output all nine probabilities for each sequence
   int useMarkovBackg=0;     // use 0th-order background model estimated from data 
   int bOrder=9;             // if high order Markov order background model available, uses the hightest order: 9th
   int priorType=1;          // Gaussian motif location prior
   double minF=0.1;          // minimal fraction of sequences containing site to be reported
   int maxMarkovOrder=9;     // max background Markov order
   double sigma1=SIGMA1;
   double sigma2=SIGMA2;
 
   int proportionFileFound=0;
   int *siteCn;
   double maxDist[2];        // maximal L1 distance between PWMs
   int similarToStartPWM,degeneratePWM,pwmType,minPWMlen;

   time_t start,finish;

   // memory allocations
   initia=alloc_model(MAX_NUM_PWM);
   model0=alloc_model(2);
   model1=alloc_model(2);
   seqFileName=alloc_char(200);
   parameterFileName=alloc_char(200);
   pwmFileName=alloc_char(200);
   backgFileName=alloc_char(200);
   siteCn=alloc_int(2);

   numSeq=-1; maxSeqLen=-1; aveSeqLen=0;
   for (ii=1; ii<argc; ii++) {
      if (strncmp(argv[ii],"-fseq",5)==0 && argv[ii+1]!=NULL) {
         data=alloc_data(MAX_NUM_SEQ);
         numSeq=read_seq(data,argv[ii+1]);
         strcpy(seqFileName,argv[ii+1]);
         // print_seq(numSeq,data);
         maxSeqLen=max_seq_length(numSeq,data);
         seqBaseCount=alloc_int_int(numSeq,4);
         base_count(seqBaseCount,numSeq,data);
         aveSeqLen=average_seq_length(numSeq,data);
         logSeqBackgLL=alloc_double(numSeq); 
      }
      if (strncmp(argv[ii],"-fparm",6)==0 && argv[ii+1]!=NULL) {
         strcpy(parameterFileName,argv[ii+1]);
         read_parameters(parameterFileName,initia->p);
         proportionFileFound=1;
      } 
      if (strncmp(argv[ii],"-fpwm0",6)==0 && argv[ii+1]!=NULL) {
         strcpy(pwmFileName,argv[ii+1]);
         numPWM=read_pwm0(pwmFileName,initia);
         // print_all_pwm(initia,numPWM);
         // print_pwm(0,initia);
      }
      if (strncmp(argv[ii],"-em",3)==0 && argv[ii+1]!=NULL) {
         numEM=atoi(argv[ii+1]); 
         printf("number of EM step: %d\n",numEM);
      }
      if (strncmp(argv[ii],"-fbackg",7)==0 && argv[ii+1]!=NULL) {
         strcpy(backgFileName,argv[ii+1]);
         useMarkovBackg=1; 
         back=alloc_background();
         maxMarkovOrder=read_userBackgModel(argv[ii+1],back);
      }
      if (strncmp(argv[ii],"-model",6)==0 && argv[ii+1]!=NULL) {
         strcpy(modelType,argv[ii+1]);
         printf("\nmodel type: %s\n",modelType);
         if      (strncmp(modelType,"1PWM",4)==0) {
            printf("running a one-motif model\n");
            two_motif_model=0;
         }
         else if (strncmp(modelType,"2PWM",4)==0) {
            printf("running a two-motif model\n");
            two_motif_model=1;
         }
         else { 
            printf("\nmodel type %s unknown!",argv[ii+1]);
            printf("Please use -model 1PWM if you want to run one motif one-at-a-time OR\n");
            printf("  -model 2PWM to run two motifs simultaneously\n");
            exit(0); 
         }
      }
      if (strncmp(argv[ii],"-full",5)==0    && argv[ii+1]!=NULL) full=atoi(argv[ii+1]);
      if (strncmp(argv[ii],"-minF",5)==0    && argv[ii+1]!=NULL) minF=atof(argv[ii+1]);
      if (strncmp(argv[ii],"-posWt",6)==0   && argv[ii+1]!=NULL) priorType=atoi(argv[ii+1]);
      if (strncmp(argv[ii],"-maskS",6)==0   && argv[ii+1]!=NULL) maskS=atoi(argv[ii+1]);
      if (strncmp(argv[ii],"-maskR",6)==0   && argv[ii+1]!=NULL) maskR=atoi(argv[ii+1]);
      if (strncmp(argv[ii],"-bOrder",7)==0  && argv[ii+1]!=NULL) bOrder=atoi(argv[ii+1]);
      if (strncmp(argv[ii],"-detail",7)==0  && argv[ii+1]!=NULL) printProb=atoi(argv[ii+1]);
      if (strncmp(argv[ii],"-sigma1",7)==0  && argv[ii+1]!=NULL) sigma1=atof(argv[ii+1]);
      if (strncmp(argv[ii],"-sigma2",7)==0  && argv[ii+1]!=NULL) sigma2=atof(argv[ii+1]);
      if (strncmp(argv[ii],"-extTrim",8)==0 && argv[ii+1]!=NULL) extTrim=atoi(argv[ii+1]);
   }
   if (numPWM<=0)                    { printf("No initial PWMs found - exit!\n");       exit(0); }
   if (priorType!=0 && priorType!=1) { printf("-priorType must be 0 or 1\n");           exit(0); }
   if (extTrim!=0 && extTrim!=1)     { printf("-extTrim must follow by 0 or 1\n");      exit(0); }
   if (full!=0 && full!=1)           { printf("-full must take 0 or 1\n");              exit(0); }
   if (minF<=0 && minF>1)            { printf("Error: minF must be between 0 and 1\n"); exit(0); }
   if (numPWM==1 && two_motif_model) { 
      printf("\nError: Can't run a two-motif model since only one PWM found in %s\n",pwmFileName); 
      printf("\nset -model to 1PWM\n\n");
      exit(0); 
   }

   if (numPWM==1) two_motif_model=0;

   maxCandidateSites=maxSeqLen;

   // memory allocation
   ll_b=alloc_blikelihood(numSeq,maxSeqLen);
   prior=alloc_prior(numSeq,maxSeqLen);
   site=alloc_motif_site(numSeq);
   seqLen=alloc_int(numSeq);
   wscore=alloc_weight(1,maxCandidateSites);
   alloc_data_additional(data,numSeq,maxCandidateSites);
   sim=alloc_sim(numPWM);

   // i/o
   fq1=fopen("estimatedPWM1.txt","w");
   if (strncmp(modelType,"2PWM",4)==0) {
      fq2a=fopen("estimatedPWM2a.txt","w");
      fq2b=fopen("estimatedPWM2b.txt","w");
      fq2c=fopen("estimatedPWM2c.txt","w");
   }
   fq3=fopen("PWM2Similarity.txt","w");
   // f1=fopen("proportion.txt","w");
   f3=fopen("info.txt","w");
  
   if (!proportionFileFound) {
      if (two_motif_model) {
         for (i=0; i<3; i++) {
            for (j=0; j<3; j++) initia->p[i][j]=0.11111; 
         } 
      }
      else {
         for (i=0; i<3; i++) initia->p[i][0]=0.33333; 
      } 
   }
 
   // print_parameters(initia->p,two_motif_model);
   // PWM standardization
   adjust_parameters(initia,numPWM,two_motif_model);
 
   // parameters for 0th-order Markov background model from input data
   base_frequency(numSeq,data,initia->bfreq,initia->logbfreq);

   if (useMarkovBackg) {
      bOrder=min(bOrder,maxMarkovOrder);
      for (i=0; i<5; i++) bfreq[i]=exp(back->monomerFreq[i]);
   }
   else {
      for (i=0; i<5; i++) bfreq[i]=initia->bfreq[i];
      printf("\n-bOrder is ignored (no background model file is specified). Use 0th-order Markov\n");
      printf("background model with the [A,C,G,T] frequencies in the input data as its parameters.\n\n"); 
   }

   if (!full) {
      printf("\n\nEM is carried out on the primary PWM (PWM1) only\n"); 
   }

   time(&start);
   fprintf(f3,"------------------------------------------------------------------------------------------\n");
   fprintf(f3,"processor ID: %d\n",getpid());
   fprintf(f3,"\ncommand line: ");
   for (i=0; i<argc; i++) fprintf(f3,"%s ",argv[i]); fprintf(f3,"\n\n");

   fprintf(f3,"Data files\n");
   fprintf(f3,"   sequence file name:\t\t\t%s\n",seqFileName);
   fprintf(f3,"   Number of sequences:\t\t\t%d\n",numSeq);
   fprintf(f3,"   Average sequence length:\t\t%d\n",aveSeqLen);
   fprintf(f3,"   [A,C,G,T] frequencies:\t\t%5.4f %5.4f %5.4f %5.4f\n",
      initia->bfreq[0],initia->bfreq[1],initia->bfreq[2],initia->bfreq[3]);
   if (useMarkovBackg) {
       fprintf(f3,"   As a comparison, genome-wide [A,C,G,T] frequencies:\t%5.4f %5.4f %5.4f %5.4f\n",
          exp(back->monomerFreq[0]),exp(back->monomerFreq[1]),exp(back->monomerFreq[2]),exp(back->monomerFreq[3])); 
   }
   fprintf(f3,"\nMixture model\n");
   if (two_motif_model && full)  {
     fprintf(f3,"   Running a two-motif (three-component/nine states) mixture model with the two\n");
     fprintf(f3,"   starting PWMs for the two motifs and a background model for the background.\n");
     fprintf(f3,"   Both PWMs are optimized by the EM algorithm.\n");
   }
   else if (two_motif_model) {
     fprintf(f3,"   Running a two-motif (three-component/nine states) mixture model with two two\n");
     fprintf(f3,"   starting PWMs for the two motifs and a background model for the background.\n");
     fprintf(f3,"   The PWM for the co-regulator motif is fixed (without EM optimization).\n");
   }
   else {
      fprintf(f3,"   Running a one-motif (two-component/three states) mixture model\n");
      fprintf(f3,"   This runs one-motif-at-a-time as MEME.\n"); 
   }
   fprintf(f3,"\nMotif models\n");
   fprintf(f3,"   Starting PWMs taken from %s\n",pwmFileName);
   fprintf(f3,"   the first PWM is always the starting PWM for the primary motif whereas each of the\n");
   fprintf(f3,"   remaining PWMs is starting PWM for the co-factor motif.\n");
   fprintf(f3,"   Total number of PWMs:\t%d\n",numPWM);

   fprintf(f3,"\nBackground Model:\n");
   if (useMarkovBackg)  {
      fprintf(f3,"   a %d-th order Markov model estimated from the respective genome.\n",bOrder);
      fprintf(f3,"   background file name:\t\t%s\n",backgFileName);
   }
   else { 
      fprintf(f3,"   a 0th-order background model with the [A,C,G,T] frequencies in the input data\n");
      fprintf(f3,"   as its parameters.\n"); 
   }

   if (maskR) fprintf(f3,"simple repetitive elements (see documentation) are masked before running the software\n");

   fprintf(f3,"\nParameters\n");
   if (proportionFileFound)
      fprintf(f3,"   proportion priors taken from file:\t%s\n",parameterFileName);
   else {
      if (two_motif_model)
         fprintf(f3,"   flat proportion priors (0.11111) for all nine states\n");
      else 
         fprintf(f3,"   flat proportion priors (0.33333) for all three states\n");
   }
   fprintf(f3,"   Maximal number of EM steps:\t\t%d\n",numEM);
   fprintf(f3,"   EM convergence criterion\t\t%e\n",CONVERGENCE);
   if (priorType==0) fprintf(f3,"   uniform priors for motif locations\n");
   else              {
      fprintf(f3,"   Gaussian prior for motif locations\n");
      fprintf(f3,"   Mean: sequence center, standard deviations: %3.0f(motif1) and %3.0f(motif2)\n\n",sigma1,sigma2); 
   }

   if (!full) {
      fprintf(f3,"EM is carried out on the primary PWM (PWM1) only. The PWM for the co-factor\n");
      fprintf(f3,"  is taken as it is without being updated in the EM algorithm.\n"); 
   }
   fprintf(f3,"\nMinimal fraction of sequences containing a site required for a motif to be reported:\t%5.3f\n",minF);
   if (extTrim) fprintf(f3,"\nBase extension and trimming carried out for motifs\n");
   else         fprintf(f3,"\nNo base extension and trimming carried out for motifs\n");

   fprintf(f3,"\njob started:\t\t\t%s\n", asctime(localtime(&start)));
   fprintf(f3,"------------------------------------------------------------------------------------------\n");
   fflush(f3);

   fprintf(fq3,"#This file lists at most top 5 PWMs in your %s that are most similar to the estimated PWM for the co-factor motif. ",pwmFileName);
   fprintf(fq3,"This allows one to check if the estimated PWM(s) differ from the initial PWM(s), e.g. ");
   fprintf(fq3,"converged to a different PWM.\n\n");
   fflush(fq3);
   //fprintf(f1,"#This file lists the estimated proportions at every 5 steps for all PWMs\n\n");
   //fprintf(f1,"step\tbackground\tmotif1\tmotif2\tmotif12_joint\n");
   //fflush(f1);

   if (maskR) {
      printf("mask simple repeats with 'n' - ");
      mask_repetitive(data,numSeq,seqFileName);
      printf("done\n\n");
   }

   for (i=0; i<numPWM; i++) {
      consensus_pwm(initia->pwm[i],initia->pwmLen[i],initia->consensus0[i]);
   }
   print_all_pwm(initia,numPWM);

   for (i=0; i<numSeq; i++) seqLen[i]=data[i].length;
   // copy_data(odata,data,numSeq);

   if (!two_motif_model)  numPWM=numPWM+1; // to allow the loop below to run
   
   printf("\nmax number of EM: %d\n",numEM);
   for (ii=0; ii<numPWM-1; ii++) {

      copy_and_reverse_model(model0,initia,ii+1,two_motif_model);

      if (two_motif_model) {
         printf("\n*****running a two-motif model*****\n");
         printf("           PWMs: 1 and %d (%s -%s)\n\n",ii+2,model0->name[0],model0->name[1]);
         fprintf(f3,"PWM1 and PWM%d (%s-%s)\n\n",ii+2,model0->name[0],model0->name[1]);
         minPWMlen=min(model0->pwmLen[0],model0->pwmLen[1]); 
      }
      else {
         printf("\n*****running an one-motif model*****\n");
         printf("           PWM: %d (%s)\n\n",ii+1,model0->name[0]);
         fprintf(f3,"PWM%d (%s)\n",ii+1,model0->name[0]);
         minPWMlen=model0->pwmLen[0];
      }

      // print_pwm(ii,model0,two_motif_model);
 
      log_transforma(model0,two_motif_model);
      motif_prior(data,numSeq,model0->pwmLen[0],model0->pwmLen[1],prior,priorType,sigma1,sigma2);

      if (useMarkovBackg) weightscore_backg_markov(numSeq,data,model0,ll_b,back,min(bOrder,minPWMlen-1),two_motif_model);
      else                weightscore_backg(numSeq,data,model0,ll_b,two_motif_model);

      printf("\nStarting EM...\n");
      for (step=0; step<numEM; step++) {
         if ((step+1)%5==0) printf("EM step: %d\n", step+1);
   
         //1 E-step : given data, model0, compute sequence likelihood scores (weightscores) - ws
         //2 M-step : update the count matrix and model1 parameters
   
         // print_pwm(step,model0,two_motif_model);
         /*------------------------------------------------------------------------
         // initialize model1                                                    //
         // copy the following from model0 to model1:                            //
         //      a) pwm lengths                                                  //
         //      b) pwm names                                                    //
         //      c) pwm consensus0                                               //
         -------------------------------------------------------------------------*/
         zero_model1_copy_name(model1,model0,two_motif_model);

         if (!full && two_motif_model) fix_pwm2(model1,initia,ii+1);

         for(i=0; i<numSeq; i++) {
            weightscore_motif(wscore, &data[i], model0, &ll_b[i],&prior[i],maxCandidateSites,two_motif_model,sigma1);
            normalization(wscore,&data[i],model0,two_motif_model);
            seq_probabilities(wscore,&data[i],model0,two_motif_model);
            if (full) update_model_full   (two_motif_model,model1, &data[i], wscore);
            else      update_model_partial(two_motif_model,model1, &data[i], wscore);
         }
         calModel(model1,two_motif_model);
         // print_pwm(step,model1,two_motif_model); // testing

         if ((step+1)%10==0) {
            print_pwm2(step,model1,two_motif_model,f3);
            /*---------------------------------------------------------------------------
            if (two_motif_model)  
               fprintf(f1,"%d\t%5.4f\t%5.4f\t%5.4f\t%5.4f\n",step+1,model1->p[0][0],model1->p[1][0]+model1->p[2][0],model1->p[0][1]+model1->p[0][2],
                  model1->p[1][1]+model1->p[1][2]+model1->p[2][1]+model1->p[2][2]);
            else 
               fprintf(f1,"%d\t%5.4f\t%5.4f\t%5.4f\n",step+1,model1->p[0][0],model1->p[1][0],model1->p[2][0]);
            fflush(f1);
            ---------------------------------------------------------------------------*/
         } 
         maxDiff=check_convergence(model0,model1,two_motif_model);

         if (maxDiff<CONVERGENCE || step==numEM-1) {
            if (maxDiff<=CONVERGENCE) {
               printf("EM converged...\n"); 
               fprintf(f3,"EM converged...\n"); 
               fprintf(f3,"maximal PWM difference:\t%6.5f\n",maxDiff);
            }
            else {
               printf("max EM steps reached...\n"); 
               fprintf(f3,"max EM steps reached...\n");
               fprintf(f3,"maximal PWM difference:\t%6.5f\n",maxDiff);
            }
            updateModelbyModel(model0, model1,two_motif_model);
            break; 
         }
         // update model0 from model1 and use model0 as the start for EM

 
         /*----------------------------------------------------------------------------
         // model1 is updated in EM, after each EM step, copy model1 to model0 as start
         //    a) proportions
         //    b) PWMs both plus and reverse complementary
         //    c) PWM lengths
         //    d) PWM names
         //    e) PWM consensus
         ----------------------------------------------------------------------------*/
         updateModelbyModel(model0, model1,two_motif_model);
         log_transforma(model0,two_motif_model);
      }
      // end of EM
      print_pwm(0,model0,two_motif_model);

      // maximal distance between each starting PWM (initia) and its PWM after EM (model0)
      if (two_motif_model) {
         maxDist[0]=max_distance(initia->pwm[0],model0->pwm[0],model0->pwmLen[0]);
         maxDist[1]=max_distance(initia->pwm[ii+1],model0->pwm[1],model0->pwmLen[1]);
         similarity_to_all_pwms(sim,model0,initia,numPWM);
      }
      else {
         maxDist[0]=max_distance(initia->pwm[ii],model0->pwm[0],model0->pwmLen[0]);
         maxDist[1]=-1;
      }
 
      numSeqHavingSites=classification(numSeq,data,model0,site,nsites);
      predict_state(numSeq,data);

      pwmType=-1;
      if (two_motif_model) {
         p1=model0->p[1][0]+model0->p[2][0]+model0->p[1][1]+model0->p[1][2]+model0->p[2][1]+model0->p[2][2];
         p2=model0->p[0][1]+model0->p[0][2]+model0->p[1][1]+model0->p[1][2]+model0->p[2][1]+model0->p[2][2];
         printf("motif1 and motif2 proportions: %5.3f\t%5.3f\n",p1,p2); 
         if (p1<minF || p2<minF) continue;

         
         similarToStartPWM=similarity_to_original_pwm(initia->pwm[ii+1],model0->pwm[1],model0->pwmLen[1]);
         degeneratePWM=degenerate_pwm(model0->pwm[1],model0->pwmLen[1]);

         if (!similarToStartPWM && degeneratePWM) pwmType=0; 
         else if (!similarToStartPWM)             pwmType=1;
         else                                     pwmType=2; 
      }
      else {
         p1=model0->p[1][0]+model0->p[2][0];
         printf("motif1 proportion: %5.3f\n",p1); 
         if (p1<minF) continue;

         similarToStartPWM=similarity_to_original_pwm(initia->pwm[ii],model0->pwm[0],model0->pwmLen[0]);
         degeneratePWM=degenerate_pwm(model0->pwm[0],model0->pwmLen[0]);

         if (!similarToStartPWM && degeneratePWM) pwmType=0; 
         else if (!similarToStartPWM)             pwmType=1;
         else                                     pwmType=2; 
      }

      extention_trimming(numSeq,data,model0,numSeqHavingSites,site,extTrim,two_motif_model);

      printf("\nCompute motif E-value and write output\n");
      if (two_motif_model) {
          logev[0]=E_value(model0->opwm[0],nsites[0],bfreq,model0->pwmLenNew[0],numSeq,seqLen);
          consensus_pwm(model0->opwm[0],model0->pwmLenNew[0],model0->consensus[0]);
          logev[1]=E_value(model0->opwm[1],nsites[1],bfreq,model0->pwmLenNew[1],numSeq,seqLen);
          consensus_pwm(model0->opwm[1],model0->pwmLenNew[1],model0->consensus[1]);
      }
      else {
          logev[0]=E_value(model0->opwm[0],nsites[0],bfreq,model0->pwmLenNew[0],numSeq,seqLen);
          consensus_pwm(model0->opwm[0],model0->pwmLenNew[0],model0->consensus[0]);
      }

      // print_model(ii,maxDiff,model1,two_motif_model,fo);
      if (pwmType==0) 
         siteCn=print_result(ii,numSeq,data,model0,site,numSeqHavingSites,logev,printProb,maxDist,sim,initia,numPWM,two_motif_model,fq1,fq2a,fq3);
      else if (pwmType==1) 
         siteCn=print_result(ii,numSeq,data,model0,site,numSeqHavingSites,logev,printProb,maxDist,sim,initia,numPWM,two_motif_model,fq1,fq2b,fq3);
      else if (pwmType==2) 
         siteCn=print_result(ii,numSeq,data,model0,site,numSeqHavingSites,logev,printProb,maxDist,sim,initia,numPWM,two_motif_model,fq1,fq2c,fq3);

      if (maskS) mask_site(numSeqHavingSites,site,data,model0,two_motif_model);

      fprintf(f3,"motif1:\n"); 
      fprintf(f3,"   starting PWM consensus:\t%s\n",model0->consensus0[0]);
      fprintf(f3,"   number of sites:\t\t%d\n",siteCn[0]);
      fprintf(f3,"   motif consensus:\t\t%s\n",model0->consensus[0]);
      fprintf(f3,"   log(E-value):\t\t%5.2f\n",logev[0]);
      if (two_motif_model) {
         fprintf(f3,"motif2:\n"); 
         fprintf(f3,"   starting PWM consensus:\t%s\n",model0->consensus0[1]);
         fprintf(f3,"   number of sites:\t\t%d\n",siteCn[1]);
         fprintf(f3,"   motif consensus:\t\t%s\n",model0->consensus[1]);
         fprintf(f3,"   log(E-value):\t\t%5.2f\n",logev[1]);
      }
      fprintf(f3,"----------------------------------------------------------------------\n\n");
   }
   // fclose(f1); 
   fclose(fq1); 
   if (strncmp(modelType,"2PWM",4)==0) {
      fclose(fq2a); fclose(fq2b); fclose(fq2c);
   }
   time(&finish);
   fprintf(f3,"\nfinished: %s\n", asctime(localtime(&finish)));
   fprintf(f3,"approximated processor time in seconds: %f\n",difftime(finish,start));
   fclose(f3);

   return (0);
}
