#ifndef __BACKGROUND_H__
#define __BACKGROUND_H__
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include "defines.h"

// Bryce Schroeder (bryce.schroeder@gmail.com) modified this code provided
// to me by Dr. Leping Li.

/* bcs 2009 */
/* The big difference between this and the old version of the background model
 * structure is that we no longer store the actual k-mers as strings. Rather,
 * the index is interpreted as the k-mer, by considering the base-4 
 * representation of the index, using the digits a,c,g,t. For example,
 * (A = 0, C = 1, G = 2, T = 3)
 *	aa = 0, ac = 1, ag = 2, at = 3, 
 *	ca = 4, cc = 5, cg = 6, ct = 7,
 *	ga = 8, gc = 9, ...,    tt = 15
 * As an additional example, if we want to find the frequency of
 * the heptamer GATACCA, we would look in kmer_freq[7][8980]:
 *       gatacca = 2030110 (base 4) = 8980
 * Inversely, if we are printing out the data in the background model, we can
 * uniquely determine the sequence that goes with the index we are currently at
 * in much the same way. Suppose i=3120:
 * 3120 = 300300 (base 4) = TAATAA.
 * This greatly reduces the memory requirements and overhead of generating all
 * the sequences as strings, which should make analysis with k > 8 more 
 * practical.
 *
 * Note that a key implication here is that, since we use unsigned long for the
 * index (two bits per base), on a platform where unsigned long is 64 bits we
 * are limited to 32-mers. Practically, this is not a problem, as the computer
 * and memory resources required would be so vast that they would make it
 * impossible long before that limitation. (Also, the validity of the model for
 * large k would be questionable at best.) */

/* Background model: per-k tables of k-mer counts/frequencies, where each
 * k-mer is identified by its base-4 index rather than stored as a string. */
typedef struct BG_model {
	/* The revised structure supports any reasonable k. kmer_freq is an
	 * array of arrays: the first index is k and the second is the sequence
	 * of interest, interpreted as a base-4 number (A = 0, C = 1, G = 2,
	 * T = 3), as explained in the comment at the top of this file.
	 * N.B. the first index is k, not k-1; position zero is unused. */

	/* A comment is in order regarding kmer_freq. While the building of the
	 * model is in progress, it is used to count the occurrences of the
	 * sequence to which it refers. Only at the end is it divided to yield
	 * the frequency. */
	/* NOTE(review): each kmer_freq[k] presumably holds 4^k entries (one per
	 * possible k-mer); confirm against the allocation code. */
	double **kmer_freq;
	//double **transition_n;
	/* Presumably the largest k for which kmer_freq has a table (compare
	 * the max_k argument of alloc_bg_model) -- TODO confirm. */
	unsigned int max_k;
	/* Presumably the name of the file associated with this model; who
	 * owns/frees the string is not visible from this header -- TODO
	 * confirm. */
	char *filename;
} bg_model;
/* Public interface of the background model. The implementations are not part
 * of this header; the notes below are inferred from names and signatures only
 * and should be confirmed against the implementation file. */

/* Presumably writes the model `target` to the given stream. The meaning of
 * the trailing int argument is not visible from here -- TODO confirm. */
void print_bg_model(bg_model *target, FILE *, int);
/* Presumably allocates a bg_model able to hold tables up to `max_k`.
 * Ownership of the returned pointer and the failure return value are not
 * visible from here -- TODO confirm. */
bg_model *alloc_bg_model(size_t max_k);
/* Presumably fills `target` from the sequences in `f`. FASTA_file is not
 * declared in this header (presumably it comes from defines.h). The meaning
 * of `fflag` and of the int return value is not visible here -- TODO
 * confirm. */
int build_bg_model(bg_model *target, FASTA_file *f, int fflag);
/* Presumably reads a model from `filename`. How `order` relates to k is not
 * visible from here -- TODO confirm. */
bg_model *load_bg_model(const char *filename, int order);
/* Presumably converts the first `length` characters of `nuc` to the base-4
 * index described at the top of this file -- TODO confirm, including the
 * handling of characters other than a/c/g/t. */
unsigned long nuc2intn(const char *nuc, size_t length);
/*char **seq, long nseq, int *seqlen);*/
/* --- end bcs --- */


/* Presumably the inverse mapping: renders an index as a nucleotide string of
 * the given length. The lifetime of the returned string (static buffer vs.
 * heap) is not visible from here -- TODO confirm before storing or freeing
 * it. */
const char *int2nuc(unsigned long, size_t);
/* Presumably like nuc2intn, but taking the length from the string itself --
 * TODO confirm. */
unsigned long nuc2int(const char *nuc);

#endif
