#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
#include "background.h"
#include "defines.h"
#include <ctype.h>
#include <time.h>
/* -- bcs 2009 */
/* as always, A=0, C=1, G=2, T=3. (base 4) */
#include <assert.h>
/* Printable names for oligomers, indexed by length: NMerNames[k] is the name
 * of a k-mer.  Slot 0 is an empty string.  Lengths 11 through 32 use the
 * plain "N-mer" spelling; Greek names could be substituted there if wanted.
 * Every entry has to fit in 20 bytes, terminator included. */
const char NMerNames[][20] = {
    /*  0 */ "",
    /*  1 */ "monomer",
    /*  2 */ "dimer",
    /*  3 */ "trimer",
    /*  4 */ "tetramer",
    /*  5 */ "pentamer",
    /*  6 */ "hexamer",
    /*  7 */ "heptamer",
    /*  8 */ "octamer",
    /*  9 */ "nonamer",
    /* 10 */ "decamer",
    /* 11 */ "11-mer",
    /* 12 */ "12-mer",
    /* 13 */ "13-mer",
    /* 14 */ "14-mer",
    /* 15 */ "15-mer",
    /* 16 */ "16-mer",
    /* 17 */ "17-mer",
    /* 18 */ "18-mer",
    /* 19 */ "19-mer",
    /* 20 */ "20-mer",
    /* 21 */ "21-mer",
    /* 22 */ "22-mer",
    /* 23 */ "23-mer",
    /* 24 */ "24-mer",
    /* 25 */ "25-mer",
    /* 26 */ "26-mer",
    /* 27 */ "27-mer",
    /* 28 */ "28-mer",
    /* 29 */ "29-mer",
    /* 30 */ "30-mer",
    /* 31 */ "31-mer",
    /* 32 */ "32-mer"
};

/* Helper functions for ACGT<->Integer interconversions. Int->ACGT is not
 * reentrant! Don't use it in a threaded program.  */
/* Base-4 digit -> nucleotide letter (lower case): a=0, c=1, g=2, t=3. */
const char ACGT[5] = "acgt";

/* Nucleotide letter -> base-4 digit.  The low three bits of the character
 * code differ between A, C, G and T, so the tables are indexed with
 * (character & 7).  Slots that are not named below hold 0. */
const int NVAL[8] = {
    ['A' & 7] = 0,
    ['C' & 7] = 1,
    ['G' & 7] = 2,
    ['T' & 7] = 3
};

/* Same indexing, but the stored digit is that of the complementary base:
 * A -> T (3), C -> G (2), G -> C (1), T -> A (0). */
const int NNVAL[8] = {
    ['A' & 7] = 3,
    ['C' & 7] = 2,
    ['G' & 7] = 1,
    ['T' & 7] = 0
};
/* Render n as a string of lower-case nucleotide letters (base 4, most
 * significant digit first).
 *
 * n:      value to convert.
 * length: number of letters wanted, counted from the least significant end.
 *         0 asks for the "natural" length, i.e. all leading 'a's are
 *         dropped (so n == 0 yields an empty string).  A length larger than
 *         the buffer can hold (INT2NUC_BUFFER_LENGTH - 1 letters) is clamped
 *         to that capacity; previously it produced a pointer in front of the
 *         buffer.
 *
 * Returns a pointer into a static buffer: the text is overwritten by the
 * next call, and the function is neither reentrant nor thread-safe. */
const char *int2nuc(unsigned long n, size_t length)
{
    static char buffer[INT2NUC_BUFFER_LENGTH] = "";
    // ^--- WARNING, NOT THREADSAFE ---^

    // Number of letters the buffer holds; the last byte is never written by
    // the loop below and stays the '\0' left there by the initializer.
    const size_t max_digits = INT2NUC_BUFFER_LENGTH - 1;

    // Fill from the right so the least significant digit ends up last.
    for (int i = INT2NUC_BUFFER_LENGTH - 2; i >= 0; --i) {
        buffer[i] = ACGT[n & 3];
        n >>= 2;
    }

    if (!length) {
        // Natural length: skip leading 'a's.  The scan stops at the
        // terminator at the latest because '\0' != 'a'.
        while (buffer[length] == 'a')
            ++length;
        return buffer + length;
    }

    if (length > max_digits)
        length = max_digits;
    return buffer + (max_digits - length);
}

/* Encode `length` letters of a char array as a base-4 integer.
 *
 *              0123456
 * Given        GATACCA
 *               ^
 *               ptr
 * nuc2intn(ptr, 5) encodes "ATACC", i.e. 03011 in base 4, which is 197.
 *
 * The first letter read becomes the most significant digit.  If an 'n' or
 * 'N' is met inside the window the function gives up and returns
 * 0xFFFFFFFFFFFFFFFF.  Letters are translated through the NVAL table (an
 * earlier shift-based lookup using the constant 0x8340 was measured to be
 * slower than the array, so the array is used). */
unsigned long nuc2intn(const char *nuc, size_t length)
{
    unsigned long value = 0;
    const char *const stop = nuc + length;

    for (const char *p = nuc; p != stop; ++p) {
        if ((*p | 0x20) == 'n')
            return 0xFFFFFFFFFFFFFFFFl;
        value <<= 2;
        value |= NVAL[*p & 7];
    }
    return value;
}

/* Reverse-complement counterpart of nuc2intn.  It walks backward through
 * memory: pass a pointer to the LAST letter of the window, not the first.
 * Letters nuc[0], nuc[-1], ..., nuc[-(length-1)] are read in that order and
 * each contributes the digit of its complementary base (NNVAL table).
 * Returns 0xFFFFFFFFFFFFFFFF as soon as an 'n' or 'N' is met. */
unsigned long cmpl_nuc2intn(const char *nuc, size_t length)
{
    unsigned long value = 0;

    for (size_t back = 0; back < length; ++back) {
        const char base = *(nuc - back);

        if ((base | 0x20) == 'n')
            return 0xFFFFFFFFFFFFFFFFl;
        value <<= 2;
        value |= NNVAL[base & 7];
    }
    return value;
}

// Encode a whole NUL-terminated string as a base-4 integer, first letter
// most significant.  The string must contain only A, C, G or T (either
// case); no check for 'n' is made here.
unsigned long nuc2int(const char *nuc)
{
    unsigned long value = 0;

    for (const char *p = nuc; *p != '\0'; ++p)
        value = (value << 2) | NVAL[*p & 7];
    return value;
}

/* nucs2intn counts, in a single pass, every oligomer of length 1..k in a
 * sequence together with its reverse complement (a significant speed
 * improvement over encoding each window separately).  It effectively runs
 * nuc2intn and cmpl_nuc2intn for all lengths in parallel.
 *
 * Lookup tables, indexed with (character & 7):
 *   NVALS  - the base-4 digit of the letter, in the low two bits.
 *   CNVALS - the digit of the complementary base, in the top two bits.
 * Slots that do not correspond to A, C, G or T hold the all-ones marker. */
				/* +0			+1 */
const unsigned long NVALS[]  = {0xFFFFFFFFFFFFFFFFl, 0x0000000000000000l,
			/*2*/	0xFFFFFFFFFFFFFFFFl, 0x0000000000000001l,
			/*4*/	0x0000000000000003l, 0xFFFFFFFFFFFFFFFFl,
			/*6*/	0xFFFFFFFFFFFFFFFFl, 0x0000000000000002l};

const unsigned long CNVALS[] = {0xFFFFFFFFFFFFFFFFl, 0xC000000000000000l,
				0xFFFFFFFFFFFFFFFFl, 0x8000000000000000l,
				0x0000000000000000l, 0xFFFFFFFFFFFFFFFFl,
				0xFFFFFFFFFFFFFFFFl, 0x4000000000000000l};

/* Parameters:
 *   nuc:    start of the sequence in memory.
 *   length: number of characters in nuc.
 *   k:      longest oligomer to consider.
 *   counts: indexed as [oligomer length][encoded oligomer]; counts[m] needs
 *           4^m entries for every m in 1..k.  Entries are incremented.
 *   sum_wc: sum_wc[j] is increased by 2 for every (j+1)-mer window counted
 *           (one forward, one reverse complement), so it needs k entries and
 *           element j belongs to counts[j+1].
 *
 * For each start position the window is grown one letter at a time.  Growth
 * stops at the end of the sequence, after k letters, or at the first
 * character that is not A, C, G or T.  The latter used to be checked for
 * 'n'/'N' only; any other such character put the all-ones marker into the
 * index and wrote far outside counts[].
 *
 * Always returns 0. */
int nucs2intn (const char *nuc, size_t length, unsigned int k, double **counts,
	long *sum_wc) {
    const unsigned long invalid = 0xFFFFFFFFFFFFFFFFl;
    /* Shift that brings the top two bits down to the bottom. */
    const size_t top_shift = sizeof(long) * 8 - 2;

    for (size_t i = 0; i < length; ++i) {
        unsigned long forward = 0;
        unsigned long reverse = 0;
        const size_t window = (i + k < length) ? k : length - i;

        for (size_t j = 0; j < window; ++j) {
            const unsigned long digit = NVALS[nuc[i + j] & 7];

            if (digit == invalid)
                break;

            /* Forward strand: append the new digit on the right. */
            forward = (forward << 2) | digit;
            /* Reverse complement: the new complementary digit becomes the
             * most significant one, so build it left-aligned at the top of
             * the word and shift down to j+1 digits when indexing. */
            reverse = (reverse >> 2) | CNVALS[nuc[i + j] & 7];

            ++counts[j + 1][forward];
            ++counts[j + 1][reverse >> (top_shift - j * 2)];
            sum_wc[j] += 2;
        }
    }

    return 0;
}

/* Interesting note:
	A = 0b00
	C = 0b01
	G = 0b10
	T = 0b11
   So, we can use the bitwise complement to get the complementary base.
   Whoever chose the order A,C,G,T picked very well indeed...
*/

/* Allocate a new, zeroed background model for oligomers up to length max_k.
 *
 * The returned model has max_k set and kmer_freq pointing at max_k + 1
 * arrays of double:
 *   kmer_freq[0]  - max_k entries; there are no 0-mers, the slot is meant
 *                   for storing totals during processing.
 *   kmer_freq[i]  - 4^i entries, one per i-mer, for i in 1..max_k.
 * All arrays come from calloc and therefore start out as zero.  The struct
 * itself is zero-filled too, so fields not set here (such as filename, which
 * other functions in this file read) do not start out indeterminate.
 *
 * Allocation failures are reported the way the rest of this file does it,
 * through assert().  Every allocation is checked now; previously the
 * per-order arrays were not.
 *
 * Note: there is no matching free function.  That is tolerable while the
 * program only ever creates a constant number of models (e.g. 1); with more
 * than that a corresponding free_bg_model would be needed to avoid leaking
 * memory. */
bg_model *alloc_bg_model(size_t max_k)
{
    /* 4^max_k is computed as 1 << (2 * max_k) and must fit in a size_t. */
    assert(2 * max_k < sizeof(size_t) * 8);

    bg_model *tmp = calloc(1, sizeof *tmp);
    assert(NULL != tmp);

    tmp->max_k = max_k;

    tmp->kmer_freq = calloc(max_k + 1, sizeof(double *));
    assert(NULL != tmp->kmer_freq);

    /* calloc may legitimately return NULL for a zero-sized request. */
    tmp->kmer_freq[0] = calloc(max_k, sizeof(double));
    assert(0 == max_k || NULL != tmp->kmer_freq[0]);

    for (size_t i = 1; i <= max_k; ++i) {
        /* Integer arithmetic instead of pow(): no floating-point rounding
         * in an allocation size. */
        const size_t n_kmers = (size_t) 1 << (2 * i);

        tmp->kmer_freq[i] = calloc(n_kmers, sizeof(double));
        assert(NULL != tmp->kmer_freq[i]);
    }

    return tmp;
}

/* Count every oligomer of length 1..max_k in seq, on both strands, by
 * encoding each window separately with nuc2intn / cmpl_nuc2intn.
 *
 * counts: indexed as [k][encoded k-mer]; incremented once per window.
 * seq:    the sequence; seqlen is its length in characters.
 * max_k:  longest oligomer to count.
 * sum_wc: sum_wc[k] is incremented once per counted k-mer window, so it
 *         needs max_k + 1 entries (index 0 is not touched).
 *
 * For every position i two windows are considered:
 *   forward  seq[i .. i+k-1]   - exists when i + k <= seqlen
 *   reverse  seq[i-k+1 .. i]   - exists when i + 1 >= k, read backward and
 *                                complemented
 * The bounds used to be one too strict on each side (k < seqlen - i and
 * i >= k), so the last forward window and the first reverse window of every
 * sequence were never counted, unlike in nucs2intn.
 *
 * Windows for which the encoder returns 0xFFFFFFFFFFFFFFFF (they contain an
 * 'n') are skipped.
 *
 * The loops run over k first and i second.  Although counter-intuitive,
 * that may be faster for large k because counts[k] (4^k entries) is large
 * compared to seq, which affects locality / swapping.
 *
 * Always returns 0. */
int count_kmer_occurances(double **counts, const char *seq, size_t seqlen,
			  int max_k, long *sum_wc)
{
    const unsigned long invalid = 0xFFFFFFFFFFFFFFFFl;

    for (int k = 1; k <= max_k; ++k) {
        const size_t width = (size_t) k;

        for (size_t i = 0; i < seqlen; ++i) {
            if (width <= seqlen - i) {
                /* Encode once and reuse the result as the index. */
                const unsigned long code = nuc2intn(seq + i, width);

                if (code != invalid) {
                    ++counts[k][code];
                    ++sum_wc[k];
                }
            }
            if (i + 1 >= width) {
                const unsigned long code = cmpl_nuc2intn(seq + i, width);

                if (code != invalid) {
                    ++counts[k][code];
                    ++sum_wc[k];
                }
            }
        }
    }
    return 0;
}

/* Convert raw counts into frequencies, so that each order sums to 1.
 *
 * counts:               indexed as [k][encoded k-mer]; counts[k] has 4^k
 *                       entries and is divided in place.
 * max_k:                highest order to process (orders 1..max_k).
 * sum_of_window_counts: element k is the divisor for counts[k], so the
 *                       array needs max_k + 1 entries.
 *
 * An order whose total is 0 is left untouched instead of being divided by
 * zero (which would fill it with NaN).
 *
 * Progress is written to stderr.  Always returns 0. */
int compute_kmer_frequencies(double **counts, int max_k,
			     long *sum_of_window_counts)
{
    for (int i = 1; i <= max_k; ++i) {
        /* 4^i, computed once per order with integer arithmetic rather than
         * by calling pow() on every pass of the inner loop. */
        const unsigned long n_kmers = 1UL << (2 * i);
        const long total = sum_of_window_counts[i];

        fprintf(stderr,
                "\r[%6.2f%%] Computing %2d-mer frequencies... (%ld items)",
                100.0 * i / max_k, i, (long) n_kmers);

        if (total == 0)
            continue;

        for (unsigned long j = 0; j < n_kmers; ++j)
            counts[i][j] /= total;
    }
    fputs("\n", stderr);
    return 0;
}

/* Fill a background model from all sequences of a FASTA file.
 *
 * target: model to fill; its kmer_freq arrays receive the counts for orders
 *         1..max_k and, when fflag is set, are turned into frequencies.
 * f:      the sequences (f->seqs[0 .. f->sequence_count-1]).
 * fflag:  non-zero to convert counts to frequencies (and, if SHOW_FP_ERROR
 *         is set, to print how far each order's sum is from 1).
 *
 * Both strands are counted, and the per-order totals used as divisors count
 * both as well; otherwise the frequencies would sum to 2 rather than 1.
 *
 * Progress and a rough time estimate are written to stderr.
 * Always returns 0. */
int build_bg_model(bg_model * target, FASTA_file *f, int fflag)
{
    time_t tm = 0;
    double time_estimate = 0;
    long tstart = 0;

    /* Sum of window counts; sowc[k] is the total for k-mers, k in 1..max_k
     * (slot 0 stays unused).  That is the indexing compute_kmer_frequencies
     * divides by.  The array used to have only max_k entries, so the total
     * for the highest order was read from beyond its end. */
    long sowc[target->max_k + 1];
    for (size_t k = 0; k < sizeof sowc / sizeof sowc[0]; ++k)
        sowc[k] = 0;

    for (long i = 0; i < f->sequence_count; ++i) {
        /* Count the occurrences over all sequences.  The timing window is
         * restarted every 10000 sequences. */
        if (!(i % 10000))
            tm = 0;
        if (!tm) {
            tm = time(NULL);
            tstart = i;
        }

        // Enable one counting engine or the other...
        // nucs2intn adds the k-mer total to element k-1 of the array it is
        // given, hence sowc + 1.  count_kmer_occurances uses element k and
        // takes sowc as is.
        nucs2intn(f->seqs[i].sequence, f->seqs[i].seq_len, target->max_k,
                  target->kmer_freq, sowc + 1);
        //count_kmer_occurances(target->kmer_freq, f->seqs[i].sequence,
        //	f->seqs[i].seq_len, target->max_k, sowc);

        fprintf(stderr, "[%6.2f%%] Counting %5ld/%5ld sequences,",
                (100.0 * i) / f->sequence_count, i, f->sequence_count);

        /* Only extrapolate once at least a second has passed and the window
         * spans more than its first sequence; the latter avoids a division
         * by zero when one sequence alone takes a second or more. */
        const time_t elapsed = time(NULL) - tm;
        if (elapsed >= 1 && i > tstart)
            time_estimate = (f->sequence_count - i) *
                (elapsed / (double) (i - tstart));

        if (!time_estimate)
            fprintf(stderr, " (Estimating Time...)\r");
        else
            fprintf(stderr, " %6.0f second%s remaining.\r", time_estimate,
                    (long) time_estimate == 1 ? " " : "s");
    }
    //65minutes - counting mm9

    fprintf(stderr, "\nComputing K-Mer Frequencies...\n");
    /* Now, do the divisions to get the frequencies... */

    if (fflag)
        compute_kmer_frequencies(target->kmer_freq, target->max_k, sowc);

    /* Check our math */
    if (SHOW_FP_ERROR && fflag) {
        fprintf(stderr, " - Floating Point Error Report -\n"
                "  (Note: Seeing this is normal.)\n");

        /* Covers every order including max_k (the highest one used to be
         * left out). */
        for (int k = 1; k <= target->max_k; ++k) {
            /* 4^k; an unsigned long counter also avoids overflowing an int
             * from k = 16 on. */
            const unsigned long n_kmers = 1UL << (2 * k);
            long double total = 0;

            // The summation itself introduces some floating point error.
            // Sorting the list would minimize it, but there are no huge
            // differences in the frequencies so it should not be an issue.
            for (unsigned long j = 0; j < n_kmers; ++j)
                total += target->kmer_freq[k][j];

            fprintf(stderr, "%d-mer: Sum = %lg, ~Error = %lg\n",
                    k, (double) total,
                    (double) fabsl(1 - total) / (double) n_kmers);
        }
    }

    return 0;
}

/* Write a background model to `file`.
 *
 * model:  the model; orders 1..max_k of kmer_freq are printed.
 * file:   destination stream.
 * format: non-zero for the annotated form: a header line naming the source
 *         file, and every frequency preceded by its oligomer as text.
 *         Zero prints the bare frequencies, one per line.
 *
 * In both forms each order starts with a "#<name> frequency" line, the name
 * being taken from NMerNames. */
void print_bg_model(bg_model * model, FILE * file, int format)
{
    if (format)
        fprintf(file, "# 1 to %d-mer frequencies for file %s\n",
                model->max_k, model->filename);

    for (int k = 1; k <= model->max_k; ++k) {
        /* 4^k entries for this order.  Computed once, as an unsigned long:
         * an int counter compared against pow(4, k) overflows from k = 16
         * on. */
        const unsigned long n_kmers = 1UL << (2 * k);

        fprintf(file, "#%s frequency\n", NMerNames[k]);
        for (unsigned long j = 0; j < n_kmers; ++j) {
            if (format)
                fprintf(file, "%12s %.16f\n",
                        int2nuc(j, k), model->kmer_freq[k][j]);
            else
                fprintf(file, "%.16f\n", model->kmer_freq[k][j]);
        }
    }
}

/* Load a background model from a text file.
 *
 * filename: file to read; a copy of the name is stored in the model.
 * order:    highest oligomer length to load.
 *
 * Returns a newly allocated model, or NULL if the file cannot be opened
 * (nothing is allocated in that case).
 *
 * File format, line by line:
 *   - lines starting with '#' and empty lines are ignored;
 *   - any other line holds an oligomer, exactly one separator character and
 *     a frequency that begins with a digit, e.g. "        acgt 0.25".
 *     Leading non-letters are skipped.
 * Reading stops at the first oligomer longer than `order`.  Lines that do
 * not contain a letter followed by a digit, that would give an empty
 * oligomer, or whose oligomer contains an 'n' are skipped rather than used
 * as an array index. */
bg_model *load_bg_model(const char *filename, int order) {
	char buffer[128];

	/* Open the file before allocating anything, so that a missing file
	 * does not leak a half-built model. */
	FILE *file = fopen(filename, "r");
	if (NULL == file) return NULL;

	bg_model *m = alloc_bg_model(order);
	assert(NULL != m);
	m->filename = malloc(strlen(filename)+1);
	assert(NULL != m->filename);
	strcpy(m->filename, filename);

	/* Loop on fgets itself: testing feof() first processes the last line
	 * twice and reads an unset buffer when the file is empty. */
	while (fgets(buffer, sizeof buffer, file) != NULL) {
		if (buffer[0] == '#' || buffer[0] == '\n') continue;

		/* Start of the oligomer: first letter on the line. */
		const char *seqp = buffer;
		while (*seqp != '\0' && !isalpha((unsigned char) *seqp))
			++seqp;
		if (*seqp == '\0') continue;

		/* Start of the frequency: first digit after that. */
		const char *valp = seqp;
		while (*valp != '\0' && !isdigit((unsigned char) *valp))
			++valp;
		if (*valp == '\0') continue;

		/* One separator character sits between the two. */
		const int seqlen = (int) (valp - seqp) - 1;
		if (seqlen > order) break;
		if (seqlen < 1) continue;

		const double freq = atof(valp);
		const unsigned long seq = nuc2intn(seqp, seqlen);
		if (seq == 0xFFFFFFFFFFFFFFFFl) continue;

		m->kmer_freq[seqlen][seq] = freq;
	}
	fclose(file);

	fprintf(stderr, "Loaded <%s>, order %d/%d %g\n", filename, order,
		m->max_k, m->kmer_freq[order][0]);

	return m;	
}
