#include "config.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif

#include <math.h>
#include <time.h>
#include <ctype.h>
#include <sys/types.h>
#include <unistd.h>
#include "gadem.h"

#include "defines.h"
#include "random.h"
#include "evalue_meme.h"

/*---------------------------------------------------------------------------
// v1.3.1: last modifications 5/14/2011
// added user-input background model
// added masking simple repetitive elements and list them in .mask
// added a new argument -nmotifs: the maximal number of motifs sought
// removed high order background model (to be added back later)
// edited usage and info.txt printout
// simplified usage by eliminating several options or setting them as defaults
---------------------------------------------------------------------------*/

// previous modification 9/07/2009
// modifications:
//   1) fixed a minor bug in scan_sites.c
//   2) remove 6-mers, reduce search space
//   3) added C function getpid (process id)
//   4) added C function for cpu running time
//   5) set equal mutation rate for maxp and spaced dyads.
//      An optimal maxp may be important as it may affect how the EM converges
//   6) some cosmetic changes in output 
//   7) set default number of generations to 10
//   8) allowed a user-specified "seed" PWM
//   9) allowed a user-specified background model
//  10) included enrichment analysis
//  11) re-wrote pgf function (Staden's pgf method for llr null distribution)
//  12) fixed a bug in computing marginal probabilities for subsequences containing non-[a,c,g,t]
//  13) allowed motif sites to overlap as an option

int main(int argc,char **argv) {

   register int jjj,ii,jj,i,j,k;
   // basic settings/info
   int numSeq,maxSeqLen,*seqLen;          // sequence info
   double aveSeqLen;                      // sequence info
   char **seq,**rseq,**geneID;            // sequence info
   char **oseq,**orseq;                   // copy of the original sequences
   char **sseq,**rsseq;                   // simulated seqs.
   double *bfreq1,*bfreq0;                // base frequencies
   double *ChIPScore;                     // chip score
   int maskR;                             // mask simple repeats before running the algorithm

   // pwms
   double ***pwm;                         // initial population of PWMs from spaced dyads
   int *pwmLen;                           // initial pwm lengths 
   double **t1pwm,**t2pwm;                // two pwms before and after EM steps
   double **opwm2;                        // EM-derived PWM 
   double **logpwm;                       // log-transformed EM-derived EM
   int **ipwm;                            // integer pwm for computing llr score distribution
   double ***opwm;                        // observed PWMs from identified sites
   double ***epwm;                        // em-optimized PWMs
   double **logepwm;                      // log(em-optimized PWM)
   int *pwmnewLen;                        // final motif length after extending to both ends

   // llr score distr.
   Pgfs *llrDist;                         // llr distribution from pgf method
   int llrDim;                            // llr distribution dimension

   // EM, motif, sites
   double pvalueCutoff;                   // user input, used to determine score cutoff based on ipwm
   int *scoreCutoff;                      // pwm score cutoff for the corresponding p-value cutoff
   double **score,**rscore;               // subsequence score, plus and minus strands
   double logev;                          // log of E-value of a motif;
   int useChIPscore;                      // indicator for using ChIP-seq score for seq. selection for EM
   int numEM;                             // number of EM steps
   double E_valueCutoff;                  // log E-value cutoff
   int nsitesEM;                          // number of binding sites in sequences subjected to EM
   int minsitesEM;                        // minimal number of sites in a motif in EM sequences
   Sites *siteEM;                         // binding sites in EM sequences
   int *nsites;                           // number of binding sites in full data
   int minsites;                          // minimal number of sites in a motif in full data
   Sites **site;                          // binding sites in all sequences
   int motifCn;                           // number of motifs sought and found
   int nmotifs;                           // maximal number of motifs sought
   int extTrim;
   int noMotifFound;                      // none of the dyads in the population resulted in a motif
   char **pwmConsensus;                   // consensus sequences of motifs
   double pwmDistCutoff;                  // test statistic for motif pwm similarity
   char *uniqMotif;                       // motifs in a population unique or not
   int numUniq;                           // number of unique motifs in a population
   int slideWinPWM;                       // sliding window for comparing pwm similarity
   int widthWt;                           // window width in which nucleotides are given large weights for PWM optimization 
   int fullScan;                          // scan scan on the original sequences or masked sequences

   // background
   int numBackgSets;

   // weights 
   double **posWeight;                    // spatial weights
   int weightType;                        // four weight types 0, 1, 2

   // words for spaced dyad
   Words *word;                           // top-ranked k-mers as the words for spaced dyads
   int numTop3mer,numTop4mer,numTop5mer;  // No. of top-ranked k-mers as words for dyads
   int maxWordSize;                       // max of the above three
   int numWordGroup;                      // number of non-zero k-mer groups
   int minSpaceWidth,maxSpaceWidth;       // min and max width of spacer of the spaced dyads
   Chrs **dyad;                           // initial population of "chromosomes"
   char **sdyad;                          // char of spaced dyads

   // GA
   int populationSize,numGeneration;      // GA parameters
   double maxpMutationRate;
   Fitness *fitness;                      // "chromosome" fitness
   Wheel *wheel;                          // roulette-wheel selection

   // to speed up only select a subset of sequences for EM algorithm
   double fEM;                            // percentage of sequences used in EM algorithm
   int numSeqEM;                          // number of sequences subject to EM
   char *Iseq;                            // Indicator if a sequence is used in EM or not
   int *emSeqLen;                         // length of sequences used in EM
   double pwmDiff;                        // pwm convergence
   int maxp;                              // initial setting for all motifs maxp=numSeqEM 
   double *maxpFactor;

   int numCycle;                          // number of GADEM cycles
   int generationNoMotif;                 // maximal number of GA generations in a GADEM cycle resulted in no motifs

   // mis.
   seed_t  seed;                          // random seed
   int goodArgument,motifCn2,id,numCycleNoMotif,verbose,minminSites;
   int startPWMfound,stopCriterion;
   char *mFileName,*oFileName,*pwmFileName,*pwm0FileName,*sFileName,*bFileName;
   FILE *fp,*fq,*fpwm;
   time_t start,finish;
   int cn[4],bcn[4],*seqCn,*bseqCn,avebnsites,avebnsiteSeq,totalSitesInput;

   if (argc<3) {
      printf("\n");
      printf("                 *-------------------------------------------------------------------------\n");
      printf("                 |                                                                        |\n");
      printf("                 |      GADEM: a motif discovery tool for large-scale sequence data       |\n");
      printf("                 |                                 v1.3.1                                 |\n");
      printf("                 |                  Last modification: May 14, 2011                     |\n");
      printf("                 |                                                                        |\n");
      printf("                 |         Multiple runs are recommended for 'unseeded' analysis.         |\n");
      printf("                 |      Each unseeded run automatically uses a different random seed.     |\n");
      printf("                 |       'Seeded' runs are deterministic; no repeat runs are needed.      |\n");
      printf("                 *-------------------------------------------------------------------------\n");
      printf("\n");
      printf("\n");
      printf("Usage: gadem -fseq seqFile [optional arguments]\n");
      printf("\n");
      printf("Optional arguments that need attention:\n");
      printf("\n");
      printf("  -posWt    0,1, or 2  Weight profile for positions on the sequence (see documentation).\n");
      printf("                       0 - no weight (uniform spatial prior), 1 or 2 - small or zero weights for\n");
      printf("                       the ends and large weights for the center (e.g. the center 50 bp) (1 -\n");
      printf("                       gaussian prior, 2 - triangular prior). If you expect strong central\n");
      printf("                       enrichment for motifs (as in ChIP-seq) and your sequences are long\n");
      printf("                       (e.g. >100 bp), choose type 1 (default).\n");
      printf("\n");
      printf("  -widthWt  integer    For -posWt 1 or 2, width of central sequence region with large EM weights\n");
      printf("                       for PWM optimization (default: 50). This argument is ignored when -posWt\n");
      printf("                       is 0 (uniform prior).\n");
      printf("\n");
      printf("  -ev       decimal    ln(E-value) cutoff for selecting MOTIFS (default: 0.0).\n");
      printf("                       If a seeded analysis fails to identify the expected motif, run GADEM with\n");
      printf("                       -verbose 1 to show motif ln(E-value)s on screen, then re-run with a larger\n");
      printf("                       ln(E-value) cutoff. This can help in identifying short and/or low abundance\n");
      printf("                       motifs, for which the default log(E-value) threshold may be too low. A larger\n");
      printf("                       value (e.g., 10000) can return groups of closely related motifs (possible \n");
      printf("                       variants) whereas a smaller value (<0) returns fewer (but conserved) motifs.\n\n");
      printf("                       To identify potential motif variants, set -ev large, e.g., 10000 and -minN small\n");
      printf("                       e.g., numSeq/10. This allows gadem to identify motif variants that are present\n");
      printf("                       in at least 10 percent of the sequences. A large log(E-value) cutoff enures\n");
      printf("                       such motifs are found.\n\n");
      printf("                       The subroutine for E-value calculation is adapted from the MEME package.\n");
      printf("\n");
      printf("  -pv       decimal    P-value cutoff for declaring BINDING SITES (default: 0.00025).\n");
      printf("                       Depending on data size and the motif, you might want to assess more than one\n");
      printf("                       value. For ChIP-seq data (e.g., 10 thousand +/-200-bp max-center peak 'cores'),\n");
      printf("                       p=0.00025 often seems appropriate. \n");
      printf("\n");
      printf("                       Given a subsequence s of length w, GADEM computes the log likelihood (llr)\n");
      printf("                       score, log{p(s|M)/p(s|B)}, where M is the EM-derived motif model, B is the\n");
      printf("                       0-th order Markov background model and w is the motif length. GADEM uses the\n");
      printf("                       [a,c,g,t] frequencies from the foreground data as the parameters for the\n");
      printf("                       0th Makrov background model. The subsequence is declared a binding site if \n");
      printf("                       its llr score is at or above the llr score corresponding to the p-value cutoff.\n");
      printf("                       This requires knowing the distribution of the llr score (under the null), and\n");
      printf("                       GADEM first integerizes the log(Mij/Bj) llr score matrix, where i=1,...,w and\n");
      printf("                       j=1,2,3,4, by multiplying it with a large constant (200) followed by rounding\n");
      printf("                       the real numbers to their closest integers. The null distribution of the\n");
      printf("                       integerized llr scores is then determined using the Staden probability\n");
      printf("                       generating function method (Comput. Appl. Biosci,5,89,1989). See Hertz,GZ &\n");
      printf("                       Stormo GD (Bioinformatics 1999, 15:563-577) for a review.\n");
      printf("\n");
      printf("  -minN     integer    Minimal number of sites required for a motif to be reported (default: numSeq/20).\n");
      printf("\n");
      printf("  -fpwm0    string     File name for the seed PWM, when a 'seeded' approach is used, in which a PWM \n");
      printf("                       (format below) is used as the starting PWM for the EM algorithm. This is an \n");
      printf("                       effective way of testing for an 'expected' motif, because it is focused, robust \n");
      printf("                       to noise, and much faster than 'unseeded' de novo discovery. Also, when a seed \n");
      printf("                       PWM is specified, the run results are deterministic, so only a single run is \n");
      printf("                       needed (repeat runs with the same settings will give identical results). In \n");
      printf("                       contrast, unseeded runs are stochastic, and we recommend comparing results \n");
      printf("                       from several repeat runs.\n");
      printf("\n");
      printf("                       Format: number of rows & columns followed by integer counts OR decimal freq.\n");
      printf("                       Example: PWM (CREB, JASPAR MA0018) in two acceptable representations:\n");
      printf("\n");
      printf("                       4     12\n");
      printf("                       0     3     0     2     5     0     0    16     0     0     1     5\n");
      printf("                       7     5     3     3     1     0     0     0    16     0     5     6\n");
      printf("                       5     4     6    11     7     0    15     0     0    16     0     3\n");
      printf("                       4     4     7     0     3    16     1     0     0     0    10     2\n");
      printf("\n");
      printf("                       4     12\n");
      printf("                       0.000 0.188 0.000 0.125 0.312 0.000 0.000 1.000 0.000 0.000 0.062 0.312\n");
      printf("                       0.438 0.312 0.188 0.188 0.062 0.000 0.000 0.000 1.000 0.000 0.312 0.375\n");
      printf("                       0.312 0.250 0.375 0.688 0.438 0.000 0.938 0.000 0.000 1.000 0.000 0.188\n");
      printf("                       0.250 0.250 0.438 0.000 0.188 1.000 0.062 0.000 0.000 0.000 0.625 0.125\n");
      printf("\n");
      printf("  -bfile    string     File name for background model. The current version only supports 0-th Markov\n");
      printf("                       background model\n");
      printf("                       Format:\n");
      printf("                       #monomer frequencies (this line is optional)\n");
      printf("                       a	0.2834\n");
      printf("                       c	0.2215\n");
      printf("                       g	0.2333\n");
      printf("                       t	0.2618\n");
      printf("\n");
      printf("Other optional arguments:\n");
      printf("\n");
      printf("  -gen      integer    Number of genetic algorithm (GA) generations (default: 5).\n");
      printf("  -pop      integer    GA population size (default: 100).\n");
      printf("                       Both default settings should work well for most datasets (ChIP-chip and\n");
      printf("                       ChIP-seq). The above two arguments are ignored in a seeded analysis,\n");
      printf("                       because spaced dyads and GA are no longer needed (-gen is set to 1 and\n");
      printf("                       -pop is set to 10 internally, corresponding to the 10 maxp choices).\n");
      printf("\n");
      printf("  -fullScan 0 or 1     GADEM keeps two copies of the input sequences internally: one (D) for\n");
      printf("                       discovering PWMs and one (S) for scanning for binding sites using the PWMs.\n");
      printf("                       Once a motif is identified, its instances in set D are always masked by Ns.\n");
      printf("                       However, masking motif instances in set S is optional, and scanning unmasked\n");
      printf("                       sequences allows sites of discovered motifs to overlap.\n");
      printf("\n");
      printf("                       0 (default) - scan masked sequences in S (disallow motif site overlap).\n");
      printf("                       1 - scan unmasked sequences in S (allow motif sites to overlap) (was default\n");
      printf("                       in v1.3).\n");
      printf("\n");
      printf("  -maskR    0 or 1     Mask repetitive, low-complexity sequences as below(default: 0-no masking,1-masking):\n");
      printf("                       a) 'aaaaaaaa', 'tttttttt', 'cacacaca', 'tgtgtgtg', or 'tatatatat', each of which\n");
      printf("                          is at least 8 nucleotides long.\n");
      printf("                       b) 'ggaggaggagga','gaggaggaggag','agaagaagaaga','ctcctcctcctc','tcctcctcctcc',\n");
      printf("                          'tcttcttcttct','tagtagtagtag','aataataataat','attattattatt','ataataataata'\n");
      printf("                           each of which is at least 12 nucleotides long.\n");
      printf("                       c) 'cagcagcagcagcag' that is at least 15 nucleotides long.\n");
      printf("                       no other subsequences are masked. The sequences with masked sites are outputed.\n");
      printf("\n");
      printf("  -em       integer    Number of EM steps (default: 40). One might want to set it to a larger value\n");
      printf("                       (e.g. 80) in a seeded run, because such runs are fast.\n\n");
      printf("                       If you wish to scan the sequences with a PWM, set -em to 0.\n");
      printf("\n");
      printf("  -fEM      decimal    Fraction of sequences used in EM to obtain PWMs in an unseeded analysis\n");
      printf("                       (default: 0.5). For unseeded motif discovery in a large dataset (e.g. >10\n");
      printf("                       million nt), one might want to set -fEM to a smaller value (e.g., 0.3 or 0.4)\n");
      printf("                       to reduce run time.\n");
      printf("\n");
      printf("                       Note that when only partial input data are used in EM and verbose is set to 1, the\n");
      printf("                       number of binding sites printed on screen is the number of sites found only\n");
      printf("                       in the fraction of sequences that are used in EM optimization[GR1].\n");
      printf("\n");
      printf("                       This argument is ignored in a seeded analysis, which uses all sequences in EM.\n");
      printf("\n");
      printf("  -extTrim  1 or 0     Base extension and trimming (1 -yes, 0 -no) (default: 1).\n");
      printf("  -nmotifs  integer    Maximal number of motifs sought (default: 25).\n");
      printf("\n");
      printf("  -maxw3    integer    Number of top-ranked trimers for spaced dyads (default: 20).\n");
      printf("  -maxw4    integer    Number of top-ranked tetramers for spaced dyads (default: 40).\n");
      printf("  -maxw5    integer    Number of top-ranked pentamers for spaced dyads (default: 60).\n");
      printf("\n");
      printf("  -mingap   integer    Minimal number of unspecified nucleotides in spaced dyads (default: 0).\n");
      printf("  -maxgap   integer    Maximal number of unspecified nucleotides in spaced dyads (default: 10).\n");
      printf("                       -mingap and -maxgap control the lengths of spaced dyads, and, with -extrim,\n");
      printf("                       control motif lengths. Longer motifs can be discovered by setting -maxgap to\n");
      printf("                       larger values (e.g. 50).\n");
      printf("                       To identify short motifs (6-10 bps), set both -maxgap and -maxw5 to 0\n");
      printf("\n");
      printf("  -useScore 0 or 1     Use top-scoring sequences for deriving PWMs. Sequence (quality) scores should\n");
      printf("                       be stored in input sequence FA headers (see documentation).\n");
      printf("                       0 - no (default, randomly select sequences), 1 - yes.\n");
      printf("\n");
      printf("  -fpwm     string     Name of output PWM file in STAMP format (http://www.benoslab.pitt.edu/stamp).\n");
      printf("                       (default: observedPWMs.txt). This file can be loaded into STAMP to compare\n");
      printf("                       each PWM with PWMs in databases for similarity.\n");
      printf("\n");
      printf("  -fout     string     Name of main GADEM output file (see documentation for description) (default:\n");
      printf("                       gadem.txt).\n");
      printf("\n");
      printf("  -nbs      integer    Number of sets of randomly simulated sequences (default: 10) using the [a,c,g,t]\n");
      printf("                       frequencies in the input sequences, with length matched between the two sets.\n");
      printf("                       Those sequences are used as the random sequences for assessing motif enrichment\n");
      printf("                       in the input data.\n");
      printf("\n");
      printf("\n");
      printf("  -verbose  1 or 0     Print immediate results on screen [1-yes (default), 0-no]. These results\n");
      printf("                       include the motif consensus sequence, number of sites (in the subset of sequences\n");
      printf("                       subjected to EM optimization, see -fEM, above), and ln(E-value).\n");
      printf("\n");
      printf("-------------------------------------------------------------------------------------------\n");
      printf("Examples:\n");
      printf("\n");
      printf("1. Unseeded analysis for ChIP-seq regions in which motifs are expected to be centrally enriched\n");
      printf("   gadem -fseq input.seq -minN 1000 -verbose 1\n");
      printf("\n");
      printf("2. Seeded analysis for regions in which the expected motif is centrally enriched \n");
      printf("   gadem -fseq input.seq -minN 1000 -fpwm0 user_startPWM.mx -verbose 1\n");
      printf("\n");
      printf("3. Seeded analysis with a user-specified background model\n");
      printf("   gadem -fseq input.seq -minN 1000 -fpwm0 user_startPWM.mx -verbose 1 -bfile bfile.txt\n");
      printf("\n");
      printf("4. Seeded analysis for regions,in which the expected motif is centrally enriched, controlled p-value threshold\n");
      printf("   gadem -fseq input.seq -minN 1000 -fpwm0 startPWM.mx -pv 0.00025 -verbose 1\n");
      printf("\n");
      printf("5. Seeded analysis for regions,in which the expected motif is centrally enriched, controlled p-value and log(E-value) thresholds\n");
      printf("   gadem -fseq input.seq -minN 1000 -fpwm0 startPWM.mx -pv 0.00025 -ev 0 -verbose 1\n");
      printf("\n");
      exit(0);
   }

   mFileName=alloc_char(500);     mFileName[0]='\0';
   oFileName=alloc_char(500);     oFileName[0]='\0';
   pwmFileName=alloc_char(500);   pwmFileName[0]='\0';
   sFileName=alloc_char(500);     sFileName[0]='\0';
   bFileName=alloc_char(500);     bFileName[0]='\0';
   seq=NULL; aveSeqLen=0; maxSeqLen=0; numSeq=0; 
   minsites=-1; 
   
   // default settings
   numWordGroup=3;
   numTop3mer=20; numTop4mer=40; numTop5mer=60;
   numGeneration=5; populationSize=100;
   pvalueCutoff=0.00025;
   E_valueCutoff=0.0; 
   extTrim=1;
   minSpaceWidth=0; maxSpaceWidth=10;
   useChIPscore=0; 
   numEM=40; fEM=0.5; widthWt=80; fullScan=0;
   slideWinPWM=6; numUniq=populationSize;
   strcpy(oFileName,"gadem.txt"); 
   strcpy(pwmFileName,"observedPWMs.txt"); 
   stopCriterion=NUM_NO_MOTIF;  
   numBackgSets=10; 
   weightType=1;
   verbose=0;
   startPWMfound=0;
   maskR=0;
   nmotifs=25;

   for (ii=1; ii<argc-1; ii++) {
      if (argv[ii][0]=='-' && isalpha(argv[ii][1])) {
         goodArgument=0;
         if (
             (strncmp(argv[ii],"-ev",3)==0       && strlen(argv[ii])==3) ||
             (strncmp(argv[ii],"-pv",3)==0       && strlen(argv[ii])==3) ||
             (strncmp(argv[ii],"-em",3)==0       && strlen(argv[ii])==3) ||
             (strncmp(argv[ii],"-pop",4)==0      && strlen(argv[ii])==4) ||
             (strncmp(argv[ii],"-gen",4)==0      && strlen(argv[ii])==4) ||
             (strncmp(argv[ii],"-fEM",4)==0      && strlen(argv[ii])==4) ||
             (strncmp(argv[ii],"-nbs",4)==0      && strlen(argv[ii])==4) ||
             (strncmp(argv[ii],"-fseq",5)==0     && strlen(argv[ii])==5) ||
             (strncmp(argv[ii],"-fout",5)==0     && strlen(argv[ii])==5) ||
             (strncmp(argv[ii],"-fpwm",5)==0     && strlen(argv[ii])==5) ||
             (strncmp(argv[ii],"-minN",5)==0     && strlen(argv[ii])==5) ||
             (strncmp(argv[ii],"-bfile",6)==0    && strlen(argv[ii])==6) ||
             (strncmp(argv[ii],"-maxw3",6)==0    && strlen(argv[ii])==6) ||
             (strncmp(argv[ii],"-maxw4",6)==0    && strlen(argv[ii])==6) ||
             (strncmp(argv[ii],"-maxw5",6)==0    && strlen(argv[ii])==6) ||
             (strncmp(argv[ii],"-fpwm0",6)==0    && strlen(argv[ii])==6) ||
             (strncmp(argv[ii],"-posWt",6)==0    && strlen(argv[ii])==6) ||
             (strncmp(argv[ii],"-maskR",6)==0    && strlen(argv[ii])==6) ||
             (strncmp(argv[ii],"-mingap",7)==0   && strlen(argv[ii])==7) ||
             (strncmp(argv[ii],"-maxgap",7)==0   && strlen(argv[ii])==7) ||
             (strncmp(argv[ii],"-nmotifs",8)==0  && strlen(argv[ii])==8) ||
             (strncmp(argv[ii],"-extTrim",8)==0  && strlen(argv[ii])==8) ||
             (strncmp(argv[ii],"-verbose",8)==0  && strlen(argv[ii])==8) ||
             (strncmp(argv[ii],"-widthWt",8)==0  && strlen(argv[ii])==8) ||
             (strncmp(argv[ii],"-fullScan",9)==0 && strlen(argv[ii])==9) ||
             (strncmp(argv[ii],"-useScore",9)==0 && strlen(argv[ii])==9)
            ) { goodArgument=1; }
         if (!goodArgument) { printf("argument: %s unknown\n",argv[ii]); exit(0);  }
      }
   }

   ChIPScore=alloc_double(MAX_NUM_SEQ);
   seqLen=alloc_int(MAX_NUM_SEQ); 
   geneID=alloc_char_char(MAX_NUM_SEQ,500);

   for (ii=1; ii<argc; ii++) {
      if (strncmp(argv[ii],"-fseq",5)==0 && argv[ii+1]!=NULL) {
         //printf("\nreading input sequences file: %s\n",argv[ii+1]);
         strcpy(mFileName,argv[ii+1]);
         seq=read_seq(&numSeq,seqLen,geneID,MAX_NUM_SEQ,MAX_SEQ_LENGTH,ChIPScore,argv[ii+1]);
         aveSeqLen=0; for (i=0; i<numSeq; i++) aveSeqLen +=seqLen[i]; aveSeqLen /=(double)numSeq;
         maxSeqLen=0; 
         for (i=0; i<numSeq; i++) {
            if (seqLen[i]>maxSeqLen) maxSeqLen=seqLen[i]; 
         }
 
         rseq=alloc_char_char(numSeq,maxSeqLen+1);
         oseq=alloc_char_char(numSeq,maxSeqLen+1);
         orseq=alloc_char_char(numSeq,maxSeqLen+1);

         reverse_seq(seq,rseq,numSeq,seqLen);
         // make a copy of the original sequences both strands
         for (i=0; i<numSeq; i++) {
            for (j=0; j<seqLen[i]; j++) { oseq[i][j]=seq[i][j]; orseq[i][j]=rseq[i][j]; } 
            oseq[i][seqLen[i]]='\0'; orseq[i][seqLen[i]]='\0'; 
         }
      }
      else if (strncmp(argv[ii],"-fpwm0",6)==0 && argv[ii+1]!=NULL) {
         strcpy(sFileName,argv[ii+1]);
         printf("\n|------------------------------------------------------------------|\n");
         printf("|                                                                  |\n");
         printf("|               *** Running a seeded analysis ***                  |\n");
         printf("|                                                                  |\n");
         printf("|------------------------------------------------------------------|\n\n");
         // printf("reading user-specified seed pwm:\t%s\n",sFileName);
         populationSize=FIXED_POPULATION;
         dyad  =alloc_chrs(populationSize,4);
         sdyad =alloc_char_char(populationSize,MAX_PWM_LENGTH+1);
         pwmLen=alloc_int(populationSize);
         pwm=alloc_double_double_double(populationSize,MAX_PWM_LENGTH,4);
         pwmLen[0]=read_pwm0(argv[ii+1],pwm[0]);
         for (i=1; i<populationSize; i++) {
            for (j=0; j<pwmLen[0]; j++) {
               for (k=0; k<4; k++) pwm[i][j][k]=pwm[0][j][k]; 
            }
            pwmLen[i]=pwmLen[0]; 
         }
         pwm0FileName=alloc_char(200);
         strcpy(pwm0FileName,argv[ii+1]);
         startPWMfound=1;
      }
      else if (strncmp(argv[ii],"-bfile",6)==0 && argv[ii+1]!=NULL) {
         strcpy(bFileName,argv[ii+1]);
         bfreq0=alloc_double(5);
         read_background(bFileName,bfreq0); 
      } 
      else if (strncmp(argv[ii],"-pv",3)==0       && strlen(argv[ii])==3  && argv[ii+1]!=NULL) pvalueCutoff=atof(argv[ii+1]); 
      else if (strncmp(argv[ii],"-em",3)==0       && strlen(argv[ii])==3  && argv[ii+1]!=NULL) numEM=atoi(argv[ii+1]);
      else if (strncmp(argv[ii],"-ev",3)==0       && strlen(argv[ii])==3  && argv[ii+1]!=NULL) E_valueCutoff=atof(argv[ii+1]);
      else if (strncmp(argv[ii],"-pop",4)==0      && strlen(argv[ii])==4  && argv[ii+1]!=NULL) populationSize=atoi(argv[ii+1]); 
      else if (strncmp(argv[ii],"-gen",4)==0      && strlen(argv[ii])==4  && argv[ii+1]!=NULL) numGeneration=atoi(argv[ii+1]); 
      else if (strncmp(argv[ii],"-fEM",4)==0      && strlen(argv[ii])==4  && argv[ii+1]!=NULL) fEM=atof(argv[ii+1]);
      else if (strncmp(argv[ii],"-nbs",4)==0      && strlen(argv[ii])==4  && argv[ii+1]!=NULL) numBackgSets=atoi(argv[ii+1]);
      else if (strncmp(argv[ii],"-fout",5)==0     && strlen(argv[ii])==5  && argv[ii+1]!=NULL) strcpy(oFileName,argv[ii+1]); 
      else if (strncmp(argv[ii],"-fpwm",5)==0     && strlen(argv[ii])==5  && argv[ii+1]!=NULL) strcpy(pwmFileName,argv[ii+1]); 
      else if (strncmp(argv[ii],"-minN",5)==0     && strlen(argv[ii])==5  && argv[ii+1]!=NULL) minsites=atoi(argv[ii+1]);
      else if (strncmp(argv[ii],"-maxw3",6)==0    && strlen(argv[ii])==6  && argv[ii+1]!=NULL) numTop3mer=atoi(argv[ii+1]);
      else if (strncmp(argv[ii],"-maxw4",6)==0    && strlen(argv[ii])==6  && argv[ii+1]!=NULL) numTop4mer=atoi(argv[ii+1]);
      else if (strncmp(argv[ii],"-maxw5",6)==0    && strlen(argv[ii])==6  && argv[ii+1]!=NULL) numTop5mer=atoi(argv[ii+1]);
      else if (strncmp(argv[ii],"-posWt",6)==0    && strlen(argv[ii])==6  && argv[ii+1]!=NULL) weightType=atoi(argv[ii+1]);
      else if (strncmp(argv[ii],"-maskR",6)==0    && strlen(argv[ii])==6  && argv[ii+1]!=NULL) maskR=atoi(argv[ii+1]);
      else if (strncmp(argv[ii],"-mingap",7)==0   && strlen(argv[ii])==7  && argv[ii+1]!=NULL) minSpaceWidth=atoi(argv[ii+1]);
      else if (strncmp(argv[ii],"-maxgap",7)==0   && strlen(argv[ii])==7  && argv[ii+1]!=NULL) maxSpaceWidth=atoi(argv[ii+1]);
      else if (strncmp(argv[ii],"-nmotifs",8)==0  && strlen(argv[ii])==8  && argv[ii+1]!=NULL) nmotifs=atoi(argv[ii+1]);
      else if (strncmp(argv[ii],"-extTrim",8)==0  && strlen(argv[ii])==8  && argv[ii+1]!=NULL) extTrim=atoi(argv[ii+1]);
      else if (strncmp(argv[ii],"-verbose",8)==0  && strlen(argv[ii])==8  && argv[ii+1]!=NULL) verbose=atoi(argv[ii+1]);
      else if (strncmp(argv[ii],"-widthWt",8)==0  && strlen(argv[ii])==8  && argv[ii+1]!=NULL) widthWt=atoi(argv[ii+1]);
      else if (strncmp(argv[ii],"-fullScan",9)==0 && strlen(argv[ii])==9  && argv[ii+1]!=NULL) fullScan=atoi(argv[ii+1]);
      else if (strncmp(argv[ii],"-useScore",9)==0 && strlen(argv[ii])==9  && argv[ii+1]!=NULL) useChIPscore=atoi(argv[ii+1]);
      else { }
   }

   // check for input parameters
   if (numGeneration<1)  { printf("\nError: numbe of generaton < 1.\n"); exit(0); }
   if (populationSize<1) { printf("\nError: population size < 1.\n");    exit(0); }
   if (minSpaceWidth<0)  { 
      printf("\nError: minimal number of unspecified bases in spaced dyads <0.\n"); 
      printf("   check -mingap setting\n\n"); exit(0);
   }
   if (maxSpaceWidth<0)  { 
      printf("\nError: maximal number of unspecified bases in spaced dyads <0.\n"); 
      printf("   check -maxgap setting\n\n"); exit(0);
   }
   if (minSpaceWidth>maxSpaceWidth) {
      printf("\nError: mingap setting must <= to maxgap setting.\n\n"); exit(0); 
   }
   if (maxSpaceWidth+12>MAX_PWM_LENGTH) {
      printf("\nError: maxgap setting plus word lengths exceed <MAX_PWM_LENGTH>.\n");
      printf("   For very long motifs, please set <MAX_PWM_LENGTH> in 'defines.h' accordingly.\n\n"); exit(0); 
   }
   if (numEM<0) {
      printf("\nError: number of EM steps is zero.\n"); exit(0); 
   }
   if (numEM==0) {
      printf("\nNote: number of EM steps = 0, no EM optimization is carried out.\n");  
   }

   if (fullScan!=0 && fullScan!=1) fullScan=0;

   maxWordSize=0;
   if (numTop3mer>maxWordSize) maxWordSize=numTop3mer;
   if (numTop4mer>maxWordSize) maxWordSize=numTop4mer;
   if (numTop5mer>maxWordSize) maxWordSize=numTop5mer;

   // any one, two or three: tetramer, pentamer, hexamer
   if (numTop3mer==0 && numTop4mer==0 && numTop5mer==0) {
      printf("\nError: maxw3, maxw4, and maxw5 all zero - no words for spaced dyads.\n"); exit(0);
   }

   if (startPWMfound && fEM!=0.5 && fEM!=1.0) {
      printf("\n***Note: -fEM argument is ignored in a seeded analysis***\n\n");
   }

   if (startPWMfound && numEM!=0) {
      if (populationSize!=10 && populationSize!=100) printf("\n***Note: -pop argument is ignored in a seeded analysis, -pop is set to 10.***\n\n");
      if (numGeneration!=1 && numGeneration!=5)      printf("\n***Note: -gen argument is ignored in a seeded analysis, -gen is set to 1.***\n\n");
      fEM=1.0;
      populationSize=FIXED_POPULATION; numGeneration=1;
   }
   else if (startPWMfound && numEM==0) {
      populationSize=1;  numGeneration=1;
      fEM=1.0;
   }

   // number of sequences for EM
   if (fEM>1.0 || fEM<=0.0) { 
      printf("\nError: the fraction of sequences subject to EM is %3.2f.\n",fEM); exit(0); 
   } 
   numSeqEM=(int)(fEM*numSeq);

   // memory callocations
   if (!startPWMfound) {
      printf("\n|------------------------------------------------------------------|\n");
      printf("|                                                                  |\n");
      printf("|              *** Running an unseeded analysis ***                |\n");
      printf("|                                                                  |\n");
      printf("|------------------------------------------------------------------|\n\n");
      dyad  =alloc_chrs(populationSize,4);
      pwm   =alloc_double_double_double(populationSize,MAX_PWM_LENGTH,4);
      pwmLen=alloc_int(populationSize);
      sdyad =alloc_char_char(populationSize,MAX_PWM_LENGTH+1);
      word  =alloc_word(numWordGroup,maxWordSize);
   }
   Iseq  =alloc_char(numSeq+1); 
   opwm  =alloc_double_double_double(populationSize,MAX_PWM_LENGTH,4);
   opwm2 =alloc_double_double(MAX_PWM_LENGTH,4);
   t1pwm =alloc_double_double(MAX_PWM_LENGTH,4);
   t2pwm =alloc_double_double(MAX_PWM_LENGTH,4);
   ipwm  =alloc_int_int(MAX_PWM_LENGTH,4);
   logpwm=alloc_double_double(MAX_PWM_LENGTH,4);

   score =alloc_double_double(numSeq,maxSeqLen);
   rscore=alloc_double_double(numSeq,maxSeqLen);

   epwm=alloc_double_double_double(populationSize,MAX_PWM_LENGTH,4);
   logepwm=alloc_double_double(MAX_PWM_LENGTH,4);

   wheel =alloc_wheel(populationSize);
   siteEM=alloc_site(MAX_SITES);
   fitness=alloc_fitness(populationSize);
   emSeqLen=alloc_int(numSeqEM);
   maxpFactor=alloc_double(populationSize);
   pwmConsensus=alloc_char_char(populationSize,MAX_PWM_LENGTH+1);
   uniqMotif=alloc_char(populationSize+1);
   scoreCutoff=alloc_int(populationSize);
   llrDist=alloc_distr(MAX_DIMENSION);
   posWeight=alloc_double_double(numSeq,maxSeqLen);
   sseq=alloc_char_char(MAX_NUM_SEQ,maxSeqLen+1);
   rsseq=alloc_char_char(MAX_NUM_SEQ,maxSeqLen+1);

   bfreq1=base_frequency(numSeq,seq,seqLen);
 
   if (bFileName[0]=='\0') {
      bfreq0=alloc_double(5);
      for (i=0; i<4; i++) bfreq0[i]=bfreq1[i]; 
   }
 
   // if minN not specified, set the defaults accordingly
   if (minsites==-1) minsites =max(2,(int)(numSeq/20)); 
   minsitesEM=(int)(fEM*minsites);

   maxpMutationRate=MAXP_MUTATION_RATE;

   seed=time(0);
   sgenrand(seed);

   // determine the distribution and critical cut point
   pwmDistCutoff=vector_similarity();

   /*---------- select a subset of sequences for EM only --------------*/
   if (useChIPscore==1) {
      select_high_scoring_seq_for_EM (ChIPScore,numSeq,numSeqEM,Iseq,fEM);
   }
   else {
      sample_without_replacement(Iseq,numSeqEM,numSeq);
   }
   /*-------------------- end of selection --------------------------*/

   if (maskR==1) {
      printf("mask simple repeats with 'n' - ");
      mask_repetitive(geneID,seq,numSeq,seqLen,mFileName);
      printf("done\n\n"); 
   }
   if (widthWt<20) {
      printf("\n***Note: the window width of sequence centered on the nucleotides having large weights\n");
      printf("   in EM for PWM optimization is small: %d\n",widthWt);
      printf("   Motif longer than %d will not be discovered\n\n",widthWt); 
   }

   time(&start);
   fp=fopen("info.txt","w");
   fprintf(fp,"processor ID: %d\n",getpid());
   fprintf(fp,"==========================================================================\n");
   if (startPWMfound) {
      fprintf(fp,"\n|------------------------------------------------------------------|\n");
      fprintf(fp,"|                                                                  |\n");
      fprintf(fp,"|              *** Running a seeded analysis ***                   |\n");
      fprintf(fp,"|                                                                  |\n");
      fprintf(fp,"|------------------------------------------------------------------|\n\n");
   }
   else {
      fprintf(fp,"\n|------------------------------------------------------------------|\n");
      fprintf(fp,"|                                                                  |\n");
      fprintf(fp,"|              *** Running an unseeded analysis ***                |\n");
      fprintf(fp,"|                                                                  |\n");
      fprintf(fp,"|------------------------------------------------------------------|\n\n");
   }
   fprintf(fp,"command line: ");
   for (i=0; i<argc; i++) fprintf(fp,"%s ",argv[i]); fprintf(fp,"\n\n");
   fprintf(fp,"Data:\n");
   fprintf(fp,"  input (ChIP) sequence file:\t\t\t\t%s\n",mFileName);
   fprintf(fp,"  number of sequences in input file:\t\t\t%d\n",numSeq);
   fprintf(fp,"  average sequence length:\t\t\t\t%d\n",(int)aveSeqLen);
   fprintf(fp,"  total number of nucleotides:\t\t\t\t%d\n",(int)(aveSeqLen*numSeq));
   fprintf(fp,"  [a,c,g,t] frequencies:\t\t\t\t%5.4f %5.4f %5.4f %5.4f\n\n",bfreq1[0],bfreq1[1],bfreq1[2],bfreq1[3]); 

   fprintf(fp,"Motif model:\n"); 
   if (!startPWMfound) {
      fprintf(fp,"  starting PWMs come from spaced dyads generated by gadem\n");
      fprintf(fp,"  spaced Dyads:\n");
      fprintf(fp,"    number of top-ranked 3-, 4-, 5-mer:\t\t\t%d %d %d\n",numTop3mer,numTop4mer,numTop5mer);
      fprintf(fp,"    minimal & maximal spacer(d) in dyads\t\t%d %d\n",minSpaceWidth,maxSpaceWidth);
      fprintf(fp,"    this setting identifies motifs from 6 to at least 20-25 bps long\n\n"); 
   }
   else {
      fprintf(fp,"  use the user-specified pwm as the starting PWM\t%s\n\n",sFileName);
      fprintf(fp,"  This pwm is repeatedly used as the starting PWMf for the EM algorithm. Similar\n");
      fprintf(fp,"  (motif variants) or different motifs may be identified. To identify motif vaiants\n");
      fprintf(fp,"  set -ev large, e.g., 10000 and -minN small, e.g., numSeq/10. This allows gadem\n");
      fprintf(fp,"  to identify motif variants that are present in at least 10 percent of the\n");
      fprintf(fp,"  sequences. A large log(E-value) cutoff enures such motifs are found.\n\n");
   }

   fprintf(fp,"Background model:\n");
   if (bFileName[0]=='\0') {
      fprintf(fp,"  background model estimated from the input data:\t%s\n",mFileName);
      fprintf(fp,"  \t\t\t\t\t\t\t%f %f %f %f\n",bfreq1[0],bfreq1[1],bfreq1[2],bfreq1[3]); 
   }
   else {
      fprintf(fp,"  background model read from file:\t\t\t%s\n",bFileName);
      fprintf(fp,"  \t\t\t\t\t\t\t%f %f %f %f\n",bfreq0[0],bfreq0[1],bfreq0[2],bfreq0[3]); 
   }
   fprintf(fp,"  background Markov order:\t\t\t\t0th\n\n");
   
   fprintf(fp,"Declaring BINDING SITES:\n");
   fprintf(fp,"  pwm score p-value cutoff for declaring binding site:\t%e\n",pvalueCutoff);
   fprintf(fp,"  null pwm log-likelihood ratio score distribution deterimined using:\n");
   fprintf(fp,"    Staden probability generating function method (Comput. Appl. Biosci., 5,89,1989).\n\n"); 

   fprintf(fp,"Declaring MOTIFS:\n");
   fprintf(fp,"  motif p-value (significance of alignment) is computed using the subroutine from MEME\n");
   fprintf(fp,"  log(E-value) cutoff:\t\t\t\t\t%5.2f\n\n",E_valueCutoff);

   fprintf(fp,"Motif prior probability type:\n");
   fprintf(fp,"  motif prior probability type (see documentation):\t%1d ",weightType);
   if      (weightType==1) fprintf(fp,"(gaussian motif location prior)\n\n");
   else if (weightType==2) fprintf(fp,"(triangle motif location prior)\n\n");
   else                    fprintf(fp,"(uniform motif location prior)\n\n"); 

   fprintf(fp,"Genetic Algorithm (GA):\n"); 
   if (!startPWMfound) {
      fprintf(fp,"  GA is used to optimize the spaced dyads which are subsequently converted into starting PWMs.\n");
      fprintf(fp,"  max number of GA generations:\t\t\t\t%d\n",numGeneration);
      fprintf(fp,"  GA population size:\t\t\t\t\t%d\n\n",populationSize);
   }
   else {
      fprintf(fp,"  running a seeded analysis - GA not needed.\n\n");
   }

   fprintf(fp,"EM:\n");
   fprintf(fp,"  maximal number of EM steps:\t\t\t\t%d\n",numEM);
   fprintf(fp,"  EM convergence criterion:\t\t\t\t\%e\n",PWM_CONVERGENCE);
   fprintf(fp,"  fraction (number) input sequences subject to EM\t%3.2f (%d)\n\n",fEM,numSeqEM);
   
   fprintf(fp,"MAXP value:\n");
   if (startPWMfound) {
      fprintf(fp,"  run EM on the starting pwm %s %d times, each with a different maxp:\n",pwm0FileName,FIXED_POPULATION);
      for (i=0; i<FIXED_POPULATION; i++) fprintf(fp,"%3.2f*numSeq ",FIXED_MAXPF*(i+1)); fprintf(fp,"\n");
      fprintf(fp,"  no spaced dyads are generated and used. pop=%d gen=1 (no GA).\n\n",FIXED_POPULATION);
   }
   else {
      fprintf(fp,"  GADEM determines maxps by choosing ");
      fprintf(fp,"  one of the five values (factor*numSeq):\n ");
      fprintf(fp,"   ");
      for (i=0; i<MAXP_SCALE; i++) fprintf(fp,"%3.2f*numSeq, ",MAXP_BASE+(double)i*MAXP_FACTOR); fprintf(fp,"\n");
      fprintf(fp,"  These factors are optimized along with dyads by the GA and reported.\n\n");
   }

   fprintf(fp,"Other parameters:\n");
   if (maskR==1) fprintf(fp,"  simple repeats such as 'aaaaaaaa' (see usage) are masked before running GADEM\n");
   else          fprintf(fp,"  simple repeats such as 'aaaaaaaa' (see usage) not masked before running GADEM\n");
   if (fullScan) {
      fprintf(fp,"  alway scan original unmaksed sequences (allow motif sites to overlap).\n");
   }
   fprintf(fp,"  minimal no. sites for each motif:\t\t\t%d\n",minsites);
   if (extTrim) fprintf(fp,"  base extension and trimming?\t\t\t\tyes\n");
   else         fprintf(fp,"  base extension and trimming?\t\t\t\tno\n");
   if (!startPWMfound) fprintf(fp,"  random seed:\t\t\t\t\t\t%lld\n\n",seed);
   fprintf(fp,"\njob started: %s\n", asctime(localtime(&start)));
   fprintf(fp,"=========================================================================\n\n");
   fflush(fp);

   printf("==============================================================================================\n");
   printf("input sequence file:  %s\n",mFileName);
   printf("number of sequences and average length:\t\t\t\t%d %5.1f\n",numSeq,aveSeqLen);
   if (startPWMfound) {
      printf("\nseed PWM: %s\n",sFileName);
      for (j=0; j<4; j++) {
         for (i=0; i<pwmLen[0]; i++) {
            if (i<pwmLen[0]-1) printf("%5.2f ", pwm[0][i][j]);
            else               printf("%5.2f\n",pwm[0][i][j]);
         }
      }
   }

   if (weightType==0) printf("uniform weight applies to each sequence - type:\t\t%d\n",weightType);
   else if (weightType==1) printf("gaussian weight applies to each sequence - type:\t\t%d\n",weightType);
   else if (weightType==2) printf("triangular weight applies to each sequence - type:\t\t%d\n",weightType);
   printf("number of GA generations & population size:\t\t\t%d %d\n\n",numGeneration,populationSize);
   printf("PWM score p-value cutoff for binding site declaration:\t\t%e\n",pvalueCutoff);
   printf("ln(E-value) cutoff for motif declaration:\t\t\t%f\n\n",E_valueCutoff);
   printf("number (percentage) of sequences selected for EM:\t\t%d(%4.1f\%)\n",numSeqEM,100.0*(double)numSeqEM/(double)numSeq);
   printf("number of EM steps:\t\t\t\t\t\t%d\n",numEM);
   printf("minimal no. sites considered for a motif:\t\t\t%d\n\n",minsites);
   printf("[a,c,g,t] frequencies in input data:\t\t\t\t%f %f %f %f\n",bfreq1[0],bfreq1[1],bfreq1[2],bfreq1[3]);
   if (bFileName[0]!='\0') printf("background [a,c,g,t] frequencies:\t\t\t\t%f %f %f %f\n",bfreq0[0],bfreq0[1],bfreq0[2],bfreq0[3]);
   printf("==============================================================================================\n");

   if (startPWMfound && fEM!=1.0) {
      printf("\n***Note: -fEM argument is ignored in a seeded analysis***\n\n");
   }

   printf("\nSarting GADEM... this may take a few hours to complete\n");
   printf("type: ctrl/z, then, bg, to run it in background\n\n");

   // determine seq length by counting only [a,c,g,t], seqLen is used in E-value calculation 
   effect_seq_length(seq,numSeq,seqLen,Iseq,emSeqLen);
   // determine the distribution and critical cut point
   pwmDistCutoff=vector_similarity();

   if      (weightType==0) assign_weight_uniform(seqLen,numSeq,posWeight);
   else if (weightType==1) assign_weight_normal(seqLen,numSeq,posWeight);
   else if (weightType==2) assign_weight_triangular(seqLen,numSeq,posWeight);
   else {
      printf("Motif prior probability type not found - please choose: 0, 1, or 2\n");
      printf("Consider: -posWt 1 (gaussian prior) or 2 (triangular prior) for strong central enrichment as in ChIP-seq\n");
      printf("          -posWt 0 (uniform prior) for others\n\n");
      exit(0);
   }

   if (startPWMfound) minminSites=minsites;
   else               minminSites=(int)(0.40*minsitesEM);

   fq=fopen(oFileName,"w");
   fpwm=fopen(pwmFileName,"w");
   motifCn=0; noMotifFound=0; numCycle=0; numCycleNoMotif=0; 

   if (startPWMfound) { 
      for (i=0; i<populationSize; i++) {
         standardize_pwm(pwm[i],pwmLen[i]);
         consensus_pwm(pwm[i],pwmLen[i],pwmConsensus[i]);
         strcpy(sdyad[i],pwmConsensus[i]); 
      }
   }
 
   do {
      if (!startPWMfound) {
         // identify top-ranked k-mers (k=3,4,5) for spaced dyads
         printf("\nGADEM cycle %2d: enumerate and count k-mers...   ",numCycle+1);
         numWordGroup=word_for_dyad(word,seq,rseq,numSeq,seqLen,bfreq1,&numTop3mer,&numTop4mer,&numTop5mer);
         printf("done.\n");

         // generating a "population" of spaced dyads
         printf("\ninitializing GA...   ");
         initialisation(dyad,populationSize,numWordGroup,word,minSpaceWidth,maxSpaceWidth,maxpFactor);
         printf("done.\n\n");
      }
      else {
         for (i=0; i<populationSize; i++) maxpFactor[i]=FIXED_MAXPF*(i+1); 
      }

      generationNoMotif=0;
      for (jjj=0; jjj<numGeneration; jjj++) {

         // convert spaced dyads to letter probability matrix
         if (!startPWMfound) dyad_to_pwm(word,populationSize,dyad,pwm,pwmLen);

         for (ii=0; ii<populationSize; ii++) {
            // to see from which spaced dyad a motif is derived
            if (!startPWMfound) pwm_profile(pwm[ii],pwmLen[ii],sdyad[ii]);

            // make a copy and then subject the copy to EM
            copy_pwm(pwm[ii],t1pwm,pwmLen[ii]); 
            // standarize pwm
            if (!startPWMfound) standardize_pwm(t1pwm,pwmLen[ii]);
            // for (j=0; j<4; j++) { for (m=0; m<pwmLen[0]; m++) printf("%4.3f ",t1pwm[m][j]); printf("\n"); } printf("\n"); 
           
            // EM on randomly selected sequences
            maxp=(int)(maxpFactor[ii]*numSeqEM); 

            for (jj=0; jj<numEM; jj++) {
               log_pwm(t1pwm,logpwm,pwmLen[ii]);
               // compute ll score of each w-mer | motif model
               ll_score_motif_model(numSeq,seq,rseq,seqLen,logpwm,pwmLen[ii],score,rscore,Iseq,bfreq0);
               // compute p(zij|y=1) probability of binding sites started at position j on seq i
               normalize(score,rscore,seqLen,pwmLen[ii],numSeq,Iseq,maxp,posWeight,weightType);
               // E-step
               construct_pwm(t2pwm,score,rscore,seq,rseq,seqLen,numSeq,pwmLen[ii],Iseq);
               // M-step
               standardize_pwm(t2pwm,pwmLen[ii]);
               // printf("EM: %2d\n",jj+1);
               // for (j=0; j<4; j++) { for (m=0; m<pwmLen[0]; m++) printf("%5.4f ",t2pwm[m][j]); printf("\n"); } printf("\n"); 
               pwmDiff=check_convergence(t1pwm,t2pwm,pwmLen[ii]);
               // copy t2pwm to t1pwm
               copy_pwm(t2pwm,t1pwm,pwmLen[ii]); 
               if (pwmDiff<=PWM_CONVERGENCE)  break;
            }
            copy_pwm(t1pwm,epwm[ii],pwmLen[ii]); // from to
            //for (j=0; j<4; j++) { for (i=0; i<pwmLen[ii]; i++) printf("%4.3f ",t1pwm[i][j]); printf("\n"); } printf("\n\n");  

            // log(PWM), then (double)log(PWM) to (int)PWM for determine score distribution
            log_ratio_to_int(epwm[ii],ipwm,pwmLen[ii],bfreq0);
            // for (j=0; j<4; j++) { for (i=0; i<pwmLen[ii]; i++) printf("%5d ",ipwm[i][j]); printf("\n"); } printf("\n\n"); 
            // compute score distribution of the (int)PWM using Staden's method 
            // printf("determining the PWM score distribution--\n");
            llrDim=pwm_score_dist(ipwm,pwmLen[ii],llrDist,bfreq0);
            // print_ptable(llrDist,llrDim);

            scoreCutoff[ii]=determine_cutoff(llrDist,llrDim,pvalueCutoff);

            // test each w-mer to see if a motif site - test statistic: ll, distribution: Staden method, cutoff: user-specified
            nsitesEM=scan_em_seq_ptable(llrDist,llrDim,siteEM,numSeq,seq,rseq,seqLen,ipwm,pwmLen[ii],scoreCutoff[ii],bfreq0,Iseq);
            // printf("maxp %d nsitesEM: %d pcutoff: %e scorecutoff: %d\n",maxp,nsitesEM,pvalueCutoff,scoreCutoff[ii]);
   
            // loose threshould at this step, as em only on a subset of sequences

            if (nsitesEM>=max(2,minminSites)) {
               // construct pwm from the identified sites
               align_sites_count(siteEM,seq,rseq,nsitesEM,pwmLen[ii],opwm[ii]);
               standardize_pwm(opwm[ii],pwmLen[ii]);
               consensus_pwm(opwm[ii],pwmLen[ii],pwmConsensus[ii]);
               // compute E-value of the relative entroy score of each motif, use it as fitness
               fitness[ii].value=E_value(opwm[ii],nsitesEM,bfreq0,pwmLen[ii],numSeqEM,emSeqLen);
            }
            else {
               // if too few sites in a motif
               align_sites_count(siteEM,seq,rseq,nsitesEM,pwmLen[ii],opwm[ii]);
               standardize_pwm(opwm[ii],pwmLen[ii]);
               consensus_pwm(opwm[ii],pwmLen[ii],pwmConsensus[ii]);
               fitness[ii].value=DUMMY_FITNESS;
               // for (i=0; i<pwmLen[ii]; i++) pwmConsensus[ii][i]='n'; pwmConsensus[ii][pwmLen[ii]]='\0'; 
            }
            fitness[ii].index=ii;
            if (verbose) { 
               printf("cyc.[%3d] gen.[%3d] pop.[%3d] ",numCycle+1,jjj+1,ii+1);
               if (!startPWMfound) printf(" spacedDyad: %s ",sdyad[ii]);
               for (j=strlen(sdyad[ii]); j<maxSpaceWidth+10; j++) printf(" ");
               printf(" motifConsensus: %s",pwmConsensus[ii]);
               for (j=strlen(sdyad[ii]); j<maxSpaceWidth+10; j++) printf(" ");
               //printf(" fitness: %7.2f\n",fitness[ii].value);
               printf(" maxpf: %3.2f fitness: %7.2f nsitesEM: %d\n",maxpFactor[ii],fitness[ii].value,nsitesEM);
            }
         }
         if (populationSize>1) sort_fitness(fitness,populationSize);

         /*-----------------------------------------------------------------------------
         printf("generation %3d top %d:\n", jjj+1, populationSize);
         for (i=0; i<populationSize; i++) 
            printf("   pwmConsens: %s\tlogev: %6.2f\n",pwmConsensus[fitness[i].index],fitness[i].value);
         printf("\n");  
         -----------------------------------------------------------------------------*/
        
         numUniq=check_pwm_uniqueness_dist(opwm,pwmLen,populationSize,fitness,pwmDistCutoff,E_valueCutoff,uniqMotif,slideWinPWM);

         printf("\nGADEM cycle[%3d] generation[%3d] number of unique motif: %d\n",numCycle+1,jjj+1,numUniq);
         fprintf(fp,"GADEM cycle[%3d] generation[%3d] number of unique motif(s): %d\n",numCycle+1,jjj+1,numUniq);
         for (i=0; i<populationSize; i++) {
            if (uniqMotif[i]=='1') {
               if (!startPWMfound) printf("   spacedDyad: %s ",sdyad[fitness[i].index]);
               for (j=strlen(sdyad[fitness[i].index]); j<maxSpaceWidth+10; j++) printf(" ");
               printf("motifConsensus: %s ",pwmConsensus[fitness[i].index]);
               for (j=strlen(sdyad[fitness[i].index]); j<maxSpaceWidth+10; j++) printf(" ");
               printf(" %3.2f fitness: %7.2f\n",maxpFactor[fitness[i].index],fitness[i].value);

               if (!startPWMfound) fprintf(fp,"   spacedDyad: %s ",sdyad[fitness[i].index]);
               for (j=strlen(sdyad[fitness[i].index]); j<maxSpaceWidth+10; j++) fprintf(fp," ");
               fprintf(fp,"motifConsensus: %s ",pwmConsensus[fitness[i].index]);
               for (j=strlen(sdyad[fitness[i].index]); j<maxSpaceWidth+10; j++) fprintf(fp," ");
               // fprintf(fp," fitness: %7.2f\n",fitness[i].value);
               fprintf(fp," %3.2f fitness: %7.2f\n",maxpFactor[fitness[i].index],fitness[i].value);
               //for (j=0; j<4; j++) {
               //   for (m=0; m<pwmLen[fitness[i].index]; m++) printf("%4.2f ",opwm[fitness[i].index][m][j]); printf("\n");
               //} printf("\n\n");
            }
         }
         printf("\n"); fprintf(fp,"\n"); fflush(fp);

         if (jjj<numGeneration-1) {

            // fitness based selection with replacement 
            roulett_wheel_fitness(fitness,populationSize,wheel);

            // mutation and crossover operations
            if (populationSize>1) {
               if (genrand()>=0.5) {
                  mutation (dyad,numWordGroup,word,minSpaceWidth,maxSpaceWidth,wheel,populationSize,fitness,uniqMotif,
                     maxpFactor,maxpMutationRate); 
               }
               else { 
                  crossover(dyad,numWordGroup,word,minSpaceWidth,maxSpaceWidth,wheel,populationSize,fitness,uniqMotif,
                     maxpFactor,maxpMutationRate); 
               }
            }
            else { 
               mutation (dyad,numWordGroup,word,minSpaceWidth,maxSpaceWidth,wheel,populationSize,fitness,uniqMotif,
                 maxpFactor,maxpMutationRate); 
            }
         }
      }
      numCycle++;

      site  =alloc_site_site(numUniq+1,MAX_SITES);
      nsites=alloc_int(numUniq+1);
      pwmnewLen=alloc_int(numUniq+1); // after base extension and trimming
      seqCn=alloc_int(MAX_NUM_SEQ);
      bseqCn=alloc_int(MAX_NUM_SEQ);

      // final step user-specified background model is used
      motifCn2=0; // motifCn per GADEM cycle
      for (ii=0; ii<populationSize; ii++) {

         id=fitness[ii].index;
         // for (j=0; j<4; j++) { for (i=0; i<pwmLen[id]; i++) printf("%4.3f ",epwm[id][i][j]); printf("\n"); } printf("\n");  exit(0);

         if (uniqMotif[ii]=='0') continue;

         // approximate the exact llr distribution using Staden's method
         printf("\nApproximate the exact pwm llr score distribution using the pgf method.\n");
         log_ratio_to_int(epwm[id],ipwm,pwmLen[id],bfreq0);

         // compute score distribution of the (int)PWM using Staden's method 
         llrDim=pwm_score_dist(ipwm,pwmLen[id],llrDist,bfreq0);
         scoreCutoff[id]=determine_cutoff(llrDist,llrDim,pvalueCutoff);
         if (fullScan) {
            nsites[motifCn2]=scan_llr_pgf(llrDist,llrDim,site[motifCn2],numSeq,oseq,orseq,seqLen,ipwm,pwmLen[id],scoreCutoff[id],bfreq0);
         }
         else {
            nsites[motifCn2]=scan_llr_pgf(llrDist,llrDim,site[motifCn2],numSeq,seq,rseq,seqLen,ipwm,pwmLen[id],scoreCutoff[id],bfreq0);
         }

         if (nsites[motifCn2]>=max(2,minsites)) {
            for (j=0; j<numSeq; j++) seqCn[j]=0;
            for (j=0; j<nsites[motifCn2]; j++) seqCn[site[motifCn2][j].seq]++;
      
            for (j=0; j<4; j++) cn[j]=0;
            for (j=0; j<numSeq; j++) {
               if (seqCn[j]==0) cn[0]++;
               if (seqCn[j]==1) cn[1]++;
               if (seqCn[j]==2) cn[2]++;
               if (seqCn[j]>2)  cn[3]++;
            }

            totalSitesInput=nsites[motifCn2];
            if (extTrim) {
               if (fullScan) {
                  extend_alignment(site[motifCn2],numSeq,oseq,orseq,seqLen,nsites[motifCn2],pwmLen[id],&(pwmnewLen[motifCn2]));
               }
               else {
                  extend_alignment(site[motifCn2],numSeq,seq,rseq,seqLen,nsites[motifCn2],pwmLen[id],&(pwmnewLen[motifCn2]));
               }
            }
            else { pwmnewLen[motifCn2]=pwmLen[id]; } 

            if (fullScan) {
               align_sites_count(site[motifCn2],oseq,orseq,nsites[motifCn2],pwmnewLen[motifCn2],opwm2);
            }
            else {
               align_sites_count(site[motifCn2],seq,rseq,nsites[motifCn2],pwmnewLen[motifCn2],opwm2);
            }
            standardize_pwm(opwm2,pwmnewLen[motifCn2]);

            logev=E_value(opwm2,nsites[motifCn2],bfreq0,pwmnewLen[motifCn2],numSeq,seqLen);
            if (logev<=E_valueCutoff) {
               consensus_pwm(opwm2,pwmnewLen[motifCn2],pwmConsensus[id]);
               if (fullScan) {
                  print_result_2(site[motifCn2],nsites[motifCn2],numSeq,oseq,orseq,seqLen,geneID,logev,opwm2,pwmnewLen[motifCn2],
                      motifCn+1,sdyad[id],pwmConsensus[id],numCycle,pvalueCutoff,maxpFactor[id],fq,fpwm);
                  print_motif(site[motifCn2],nsites[motifCn2],oseq,orseq,seqLen,pwmnewLen[motifCn2],motifCn+1,opwm2);
               }
               else {
                  print_result_2(site[motifCn2],nsites[motifCn2],numSeq,seq,rseq,seqLen,geneID,logev,opwm2,pwmnewLen[motifCn2],
                      motifCn+1,sdyad[id],pwmConsensus[id],numCycle,pvalueCutoff,maxpFactor[id],fq,fpwm);
                  print_motif(site[motifCn2],nsites[motifCn2],seq,rseq,seqLen,pwmnewLen[motifCn2],motifCn+1,opwm2);
               }
               // print_bed(site[motifCn2],nsites[motifCn2],geneID,seqLen,pwmnewLen[motifCn2],motifCn+1);
               mask_sites(nsites[motifCn2],seq,rseq,seqLen,site[motifCn2],pwmnewLen[motifCn2]);

               /* ----------------------compute the average number of sites in background sequences ----------------------*/
               avebnsites=0; avebnsiteSeq=0;
               for (i=0; i<numBackgSets; i++) {

                  simulate_background_seq(bfreq0,numSeq,seqLen,sseq);
                  reverse_seq(sseq,rsseq,numSeq,seqLen);

                  nsites[motifCn2]=scan_llr_pgf(llrDist,llrDim,site[motifCn2],numSeq,sseq,rsseq,seqLen,ipwm,pwmLen[id],scoreCutoff[id],bfreq0);

                  for (j=0; j<numSeq; j++) bseqCn[j]=0;
                  for (j=0; j<nsites[motifCn2]; j++) bseqCn[site[motifCn2][j].seq]++;
      
                  for (j=0; j<4; j++) bcn[j]=0;
                  for (j=0; j<numSeq; j++) {
                     if (bseqCn[j]==0) bcn[0]++;
                     if (bseqCn[j]==1) bcn[1]++;
                     if (bseqCn[j]==2) bcn[2]++;
                     if (bseqCn[j]>2)  bcn[3]++;
                  }
                  fprintf(fq,"background set[%2d] Seqs with 0,1,2,>2 sites: %d %d %d %d\n",i+1,bcn[0],bcn[1],bcn[2],bcn[3]);
                  avebnsites+=nsites[motifCn2]; avebnsiteSeq+=(numSeq-bcn[0]);
               } 
               avebnsites/=numBackgSets; avebnsiteSeq/=numBackgSets;
               fprintf(fq,"average number of sites in background sequences: %d, fold enrichment: %5.3f.\n",
                  avebnsites,(double)totalSitesInput/(double)avebnsites);
               fprintf(fq,"average number of background sequences that contain at least one site: %d, fold enrichment: %5.3f.\n",
                  avebnsiteSeq,(double)(cn[1]+cn[2]+cn[3])/(double)(bcn[1]+bcn[2]+bcn[3]));
               fprintf(fq,"-------------------------------------------------------\n");
               fflush(fq); 
               /* -----------------end compute the average number of sites in background sequences ----------------------*/
               motifCn++; motifCn2++; numCycleNoMotif=0;
            } 
         }
      }
      for (i=0; i<motifCn2; i++) {
         // mask_sites(nsites[i],seq,rseq,seqLen,site[i],pwmnewLen[i]); 
         // fq=fopen("tmp.seq","w");
         // for (i=0; i<numSeq; i++) {
         //   fprintf(fq,"%s\n",geneID[i]); fprintf(fq,"%s\n",seq[i]);
         //}
         //fclose(fq);
         //exit(0);
      }
      if (site[0])   { free(site[0]);   site[0]=NULL;   }
      if (site)      { free(site);      site=NULL;      }
      if (nsites)    { free(nsites);    nsites=NULL;    }
      if (pwmnewLen) { free(pwmnewLen); pwmnewLen=NULL; }
      
      if (motifCn2==0) numCycleNoMotif++;
      if (motifCn==nmotifs) { printf("Maximal number of motifs (%d) reached\n",nmotifs); break;  }
      if (numCycleNoMotif==stopCriterion) noMotifFound=1;

   } while (!noMotifFound);
   fclose(fq); fclose(fpwm);

   time(&finish);
   fprintf(fp,"\nfinished: %s\n", asctime(localtime(&finish)));
   fprintf(fp,"approximated processor time in seconds: %f\n",difftime(finish,start));
   fclose(fp);
   system("mv info.txt info.done.txt");

   if (!startPWMfound) {  
      if (dyad[0])      { free(dyad[0]);         dyad[0]=NULL;    }
      if (dyad)         { free(dyad);            dyad=NULL;       }
   }
   if (seqLen)          { free(seqLen);          seqLen=NULL;     }
   if (pwm[0][0])       { free(pwm[0][0]);       pwm[0][0]=NULL;  }
   if (pwm[0])          { free(pwm[0]);          pwm[0]=NULL;     }
   if (pwm)             { free(pwm);             pwm=NULL;        }
   if (opwm2[0])        { free(opwm2[0]);        opwm2[0]=NULL;   }
   if (opwm2)           { free(opwm2);           opwm2=NULL;      }
   if (opwm[0][0])      { free(opwm[0][0]);      opwm[0][0]=NULL; }
   if (opwm[0])         { free(opwm[0]);         opwm[0]=NULL;    }
   if (opwm)            { free(opwm);            opwm=NULL;       }
   if (t1pwm[0])        { free(t1pwm[0]);        t1pwm[0]=NULL;   }
   if (t1pwm)           { free(t1pwm);           t1pwm=NULL;      }
   if (t2pwm[0])        { free(t2pwm[0]);        t2pwm[0]=NULL;   }
   if (t2pwm)           { free(t2pwm);           t2pwm=NULL;      }
   if (logpwm[0])       { free(logpwm[0]);       logpwm[0]=NULL;  }
   if (logpwm)          { free(logpwm);          logpwm=NULL;     }
   if (ipwm[0])         { free(ipwm[0]);         ipwm[0]=NULL;    }
   if (ipwm)            { free(ipwm);            ipwm=NULL;       }
   if (pwmLen)          { free(pwmLen);          pwmLen=NULL;     }
   if (seq[0])          { free(seq[0]);          seq[0]=NULL;     }
   if (seq)             { free(seq);             seq=NULL;        }
   if (rseq[0])         { free(rseq[0]);         rseq[0]=NULL;    }
   if (rseq)            { free(rseq);            rseq=NULL;       }
   if (geneID[0])       { free(geneID[0]);       geneID[0]=NULL;  }
   if (oseq[0])         { free(oseq[0]);         oseq[0]=NULL;    }
   if (oseq)            { free(oseq);            oseq=NULL;       }
   if (orseq[0])        { free(orseq[0]);        orseq[0]=NULL;   }
   if (orseq)           { free(orseq);           orseq=NULL;      }
   if (geneID)          { free(geneID);          geneID=NULL;     }
   if (score[0])        { free(score[0]);        score[0]=NULL;   }
   if (score)           { free(score);           score=NULL;      }
   if (rscore[0])       { free(rscore[0]);       rscore[0]=NULL;  }
   if (rscore)          { free(rscore);          rscore=NULL;     }
   if (bfreq0)          { free(bfreq0);          bfreq0=NULL;     }
   if (bfreq1)          { free(bfreq1);          bfreq1=NULL;     }
   if (wheel)           { free(wheel);           wheel=NULL;      }
   if (fitness)         { free(fitness);         fitness=NULL;    }
   if (mFileName)       { free(mFileName);       mFileName=NULL;  }
   if (oFileName)       { free(oFileName);       oFileName=NULL;  }
   if (pwmFileName)     { free(pwmFileName);     pwmFileName=NULL;}
   if (sdyad[0])        { free(sdyad[0]);        sdyad[0]=NULL;   }
   if (sdyad)           { free(sdyad);           sdyad=NULL;      }
   if (siteEM)          { free(siteEM);          siteEM=NULL;     }
   if (pwmConsensus[0]) { free(pwmConsensus[0]); pwmConsensus[0]=NULL; }
   if (pwmConsensus)    { free(pwmConsensus);    pwmConsensus=NULL;    }
   if (!startPWMfound && word) destroy_word(word,numWordGroup);

   return (1);
}

// Dump a score/probability table to "ptable.txt" in the current directory.
//   llrDist: array of at least llrDim entries; the .score and .prob fields
//            of each entry are written
//   llrDim:  number of entries to write
// Output is one "<score>\t<prob>" line per entry (prob in %e notation).
// If the file cannot be opened, the reason is reported on stderr and
// nothing is written.
void print_ptable(Pgfs *llrDist,int llrDim) {

   FILE *fp;
   int i;

   fp=fopen("ptable.txt","w");
   // fprintf/fclose on a NULL stream is undefined behavior; bail out instead
   if (!fp) { perror("ptable.txt"); return; }

   for (i=0; i<llrDim; i++) fprintf(fp,"%d\t%e\n",llrDist[i].score,llrDist[i].prob);
   fclose(fp);
}

// Flag the sequences to be used, writing '1' (selected) or '0' (not selected)
// into Iseq[0..numSeq-1].
//   ChIPScore: per-sequence scores (numSeq entries). NOTE: this buffer is
//              free()d before the function returns, so the caller must not
//              use or free it afterwards.
//   numSeq:    number of sequences
//   numSeqEM:  maximal number of sequences to flag by score ranking
//   Iseq:      output flags, one char per sequence
//   fEM:       fraction of sequences; (int)(fEM*numSeq) is both the index of
//              the cutoff in the sorted scores and the threshold that selects
//              between the two strategies below
//
// Strategy 1 - the number of sequences with a positive score does not exceed
//   (int)(fEM*numSeq): every positive-score sequence is flagged, then
//   non-positive ones are added in input order until numSeqEM are flagged.
// Strategy 2 - otherwise: sequences scoring at or above the cutoff are
//   flagged in input order until numSeqEM are flagged.
void select_high_scoring_seq_for_EM (double *ChIPScore,int numSeq,int numSeqEM,char *Iseq,double fEM) {

   register int i;
   int numSeqWithQualityScore,numSelected,cutoffRank,cutoffIdx;
   double *tmpScore;
   double ChIPscoreCutoff;

   // nothing to rank; still honor the contract of releasing ChIPScore
   if (numSeq<=0) {
      if (ChIPScore) { free(ChIPScore); ChIPScore=NULL; }
      return;
   }

   numSeqWithQualityScore=0;
   for (i=0; i<numSeq; i++) {
      if (ChIPScore[i]>0) numSeqWithQualityScore++;
      Iseq[i]='0';
   }

   // work on a sorted copy so the caller's ordering is left untouched
   // (allocated once; the original allocated this buffer twice and leaked one)
   tmpScore=alloc_double(numSeq);
   for (i=0; i<numSeq; i++) tmpScore[i]=ChIPScore[i];
   // NOTE(review): presumably sorts in descending order - confirm in sort_double
   sort_double(tmpScore,numSeq);

   // keep the lookup inside the array: fEM>=1.0 would index one past the end
   cutoffRank=(int)(fEM*numSeq);
   cutoffIdx=cutoffRank;
   if (cutoffIdx<0)        cutoffIdx=0;
   if (cutoffIdx>numSeq-1) cutoffIdx=numSeq-1;
   ChIPscoreCutoff=tmpScore[cutoffIdx];

   numSelected=0;
   if (numSeqWithQualityScore<=cutoffRank) {
      for (i=0; i<numSeq; i++) {
         if (ChIPScore[i]>0) {
            Iseq[i]='1'; numSelected++;
         }
      }
      // top up with non-positive sequences; testing "<" before each addition
      // (rather than "==" after it) prevents flagging every sequence when the
      // target has already been reached by the positive-score ones
      for (i=0; i<numSeq && numSelected<numSeqEM; i++) {
         if (ChIPScore[i]<=0) {
            Iseq[i]='1'; numSelected++;
         }
      }
   }
   else {
      for (i=0; i<numSeq; i++) {
         if (ChIPScore[i]>=ChIPscoreCutoff) {
            Iseq[i]='1'; numSelected++;
            if (numSelected==numSeqEM) break;
         }
      }
   }
   if (tmpScore)  { free(tmpScore);  tmpScore=NULL;  }
   if (ChIPScore) { free(ChIPScore); ChIPScore=NULL; }

}

// Read a 0th-order background model (single-nucleotide frequencies) from a
// text file.
//   filename: path of the background file
//   bfreq:    output, 4 doubles filled in the order a, c, g, t
//
// File format, one entry per line:
//   <letter><TAB or space><frequency>
// - lines starting with '#' are comments and are skipped
// - blank lines are skipped
// - lines whose first field is longer than one character are skipped
// - a line is split on tabs if it contains any tab, otherwise on spaces
// - letters are case-insensitive; the first entry found for a letter wins
//
// The program is terminated via exit(0), after printing a message, when the
// file cannot be opened, a single-letter field is not one of A/C/G/T, a
// frequency value is missing, a letter has no entry, or the four
// frequencies do not sum to 1.0 within 0.001.
void read_background(char *filename,double *bfreq) {

   FILE *fp;
   char buffer[250],*tok,*val;
   const char *delim;
   int i,idx;
   double sum;

   fp=fopen(filename,"r");
   if (!fp) { perror(filename); exit(0); }

   // -1 marks "not read yet"
   for (i=0; i<4; i++) bfreq[i]=-1;

   while (fgets(buffer,sizeof(buffer),fp)!=NULL) {
      if (buffer[0]=='#') continue;

      // cut at the line terminator only; the last line of a file may not have
      // one, and blindly dropping the final character would truncate its value
      buffer[strcspn(buffer,"\r\n")]='\0';

      delim=(strchr(buffer,'\t')!=NULL)? "\t" : " ";

      tok=strtok(buffer,delim);
      if (tok==NULL || strlen(tok)>1) continue;   // blank line or multi-letter field

      switch (tok[0]) {
         case 'A': case 'a': idx=0; break;
         case 'C': case 'c': idx=1; break;
         case 'G': case 'g': idx=2; break;
         case 'T': case 't': idx=3; break;
         default:
            printf("Error reading %s: non-[A,C,G,T]\n",filename); exit(0);
      }
      if (bfreq[idx]!=-1) continue;               // keep the first value seen

      val=strtok(NULL,delim);
      if (val==NULL) {
         printf("Error reading %s: no frequency given for '%c'\n",filename,tok[0]);
         exit(0);
      }
      bfreq[idx]=atof(val);
   }
   fclose(fp);

   for (i=0; i<4; i++) {
      if (bfreq[i]==-1) {
         switch (i) {
            case 0: printf("freq. for 'a' not found in %s\n",filename); break; 
            case 1: printf("freq. for 'c' not found in %s\n",filename); break; 
            case 2: printf("freq. for 'g' not found in %s\n",filename); break; 
            case 3: printf("freq. for 't' not found in %s\n",filename); break; 
            default: break; 
         }
         exit(0); 
      } 
   }
   sum=0; for (i=0; i<4; i++) sum +=bfreq[i];
   if (fabs(sum-1.0)>0.001) {
      printf("Warning: frequenices do not add to 1.0\n");
      printf("Please check %s\n",filename);
      exit(0); 
   }
}

