/* Written by Bryce Schroeder for Dr. Leping Li, NIEHS in 2009.
	bryce.schroeder@gmail.com / bryce.schroeder@nih.gov
	li3@nih.gov 

	usage:
	simulateMarkovSeq Background.model Order num_Seq Sequence-length Output.fa
	Generate num_Seq sequences of Sequence-length each,
	writing them to Output.fa, using the Markov model of order Order 
	contained in Background.model.

	FIXME / TODO:
		0. Find multiples bug!!!
			This only seems to occur when k_max is even, and
			only affects k_max-mers.
		1. Modify read_seq to also write FASTA files.
		2-- Consider other data structures for the markov model
			(I don't think there are any...)
		2. Write Markov Model function. Apparently extraneous
			divisions are justified by memory concerns and, for
			large k, by the fact that most will be unused.
		3. Testing.
*/

#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "background.h"
#include "defines.h"

/* Draw one nucleotide at random, weighted by the background model.
 *
 * m       - model; the four entries kmer_freq[length+1][4*idx .. 4*idx+3]
 *           are used as the weights for extending the context with
 *           A, C, G and T, in that order.
 * context - the nucleotides preceding the position being drawn; ignored
 *           (and may be NULL) when length is 0.
 * length  - number of context nucleotides handed to nuc2intn().
 *
 * Returns one of 'A', 'C', 'G' or 'T'.
 */
char mpick(bg_model *m, char *context, int length) {
	const char nu[] = "ACGT";

	/* Row offset for this context; with no context the first four
	 * entries of the row are used.
	 * NOTE(review): presumably nuc2intn() encodes the first `length`
	 * characters of context as a base-4 index -- confirm. */
	int idx = 0;
	if (NULL != context && length)
		idx = nuc2intn(context, length);

	double total = 0, nfreq[4];
	for (int i = 0; i < 4; ++i)
		total += nfreq[i] = m->kmer_freq[length+1][(idx<<2) + i];

	/* Normalise and turn the weights into a cumulative distribution.
	 * If all four weights are zero the quotients are NaN (IEEE-754),
	 * every comparison below is false and 'A' is returned. */
	for (int i = 0; i < 4; ++i)
		nfreq[i] = (nfreq[i] / total) + (i? nfreq[i-1] : 0);

	/* random() yields 0 .. 2^31-1 regardless of RAND_MAX, so divide by
	 * 2^31 to obtain a uniform value in [0, 1). */
	double rn = random() / 2147483648.0;

	/* Stop at the last symbol unconditionally: rounding can leave
	 * nfreq[3] a hair below 1.0, and an unbounded scan would then run
	 * off the end of both nfreq[] and nu[]. */
	int i = 0;
	while (i < 3 && rn >= nfreq[i])
		++i;
	return nu[i];
}

/* Add a new simulated sequence of exactly len nucleotides to target,
 * drawn from the specified background model.
 *
 * Each nucleotide is picked by mpick() conditioned on the nucleotides
 * immediately before it: as many as are available at the start of the
 * sequence, and at most model->max_k - 1 thereafter.
 *
 * Returns 0 on success, or -1 if the working buffer cannot be allocated. */
int genseq(bg_model *model, FASTA_file *target, size_t len) {
	/* len + 1 would wrap to 0 for the largest size_t. */
	if (len == (size_t)-1)
		return -1;

	/* Heap rather than a variable-length array: len comes straight from
	 * the command line and may be far larger than the stack. */
	char *buffer = malloc(len + 1);
	if (NULL == buffer)
		return -1;

	/* Longest context the model can condition on. */
	size_t order = model->max_k > 0 ? (size_t)model->max_k - 1 : 0;

	for (size_t i = 0; i < len; ++i) {
		/* The context is the ctx characters ending at buffer[i-1]. */
		size_t ctx = i < order ? i : order;
		buffer[i] = mpick(model, buffer + i - ctx, (int)ctx);
	}
	buffer[len] = '\0';

	int nsq = fasta_add_seq(target);
	char namebuffer[80];
	/* snprintf truncates instead of overflowing on a long filename. */
	snprintf(namebuffer, sizeof namebuffer,
		"Simulated sequence from <%s>, order %d.",
		model->filename, model->max_k-1);
	fasta_set_name(target, nsq, namebuffer);
	fasta_add_data(target, nsq, buffer);
	fasta_finish(target, nsq);

	/* NOTE(review): assumes fasta_add_data() copies the data rather than
	 * retaining the pointer -- confirm. */
	free(buffer);

	return 0;
}


/* Parse text as a non-negative decimal integer.
 * Returns 1 and stores the result in *value on success; returns 0 if text
 * is empty, has trailing characters, is negative or is out of range. */
static int parse_nonneg(const char *text, long *value) {
	char *end;

	errno = 0;
	long parsed = strtol(text, &end, 10);
	if (end == text || *end != '\0' || errno == ERANGE || parsed < 0)
		return 0;
	*value = parsed;
	return 1;
}

/* Entry point: simulateMarkovSeq model MarkovOrder numSeq seqlen out.fa
 *
 * Loads the background model from argv[1] with a maximum k of
 * MarkovOrder + 1, generates numSeq sequences of seqlen nucleotides each,
 * and writes them to argv[5]. Exits with status 1 on bad arguments or if
 * a step fails. */
int main (int argc, char *argv[]) {
	srandom(time(NULL));

	if (argc != 6) {
		fprintf(stderr, 
			"Usage:\n./simulateMarkovSeq model(e.g., frequency_file from kmerFreq) MarkovOrder numSeq seqlen out.fa\n");
		exit(1);
	}

	long order, seqcount, seqlen;
	/* order + 1 is passed on as an int, so it must not overflow. */
	if (!parse_nonneg(argv[2], &order) || order >= INT_MAX) {
		fprintf(stderr, "Invalid Markov order: %s\n", argv[2]);
		exit(1);
	}
	if (!parse_nonneg(argv[3], &seqcount)) {
		fprintf(stderr, "Invalid number of sequences: %s\n", argv[3]);
		exit(1);
	}
	if (!parse_nonneg(argv[4], &seqlen)) {
		fprintf(stderr, "Invalid sequence length: %s\n", argv[4]);
		exit(1);
	}

	bg_model *model = load_bg_model(argv[1], (int)order + 1);
	if (NULL == model) {
		fprintf(stderr, "Could not load background model: %s\n", argv[1]);
		exit(1);
	}

	FASTA_file *out = alloc_fasta();
	if (NULL == out) {
		fprintf(stderr, "Could not allocate output FASTA data.\n");
		exit(1);
	}

	for (long i = 0; i < seqcount; ++i) {
		if (genseq(model, out, (size_t)seqlen) != 0) {
			fprintf(stderr, "Failed to generate sequence %ld.\n", i);
			exit(1);
		}
	}
	write_fasta(out, argv[5]);

	return 0;
}
