/*
ClusterExplorer.c
Copyright (C) 2007, 2008 Tong Zhou <tong.zhou@mail.utexas.edu>

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
*/

//GNU Scientific Library (GSL) -- library and development package need to be installed
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <gsl/gsl_sf_gamma.h>
#include <gsl/gsl_randist.h>
#include <gsl/gsl_rng.h>
#include <gsl/gsl_cdf.h>
#include <gsl/gsl_sort.h>
/*Capacity of each input sequence buffer and of the mutation position array*/
#define MAX 20000
/*Number of sequences simulated per batch in simulateP()*/
#define SIMULATION_ROUND 10000
/*simulateP() keeps running batches until at least this many simulated clusters
  have a Q-value not larger than the observed one (or its batch limit is hit)*/
#define MIN_COUNT 10

/*Seed state of simuMutationS(): seed is the value handed to the generator for the
  next simulated sequence; seed_flag becomes nonzero once seed has been initialised*/
long seed, seed_flag = 0;

/*Store mutation information for one sequence*/
struct MutationPattern
{
	//Zero-based sequence positions of the mutations; only the first mut_num entries are set
	int mutation[MAX];
	//seq_len: sequence length; mut_num: number of mutations;
	//bury_num / expose_num: mutations located on 'b' (buried) / 'e' (exposed) positions
	int seq_len, mut_num, bury_num, expose_num;
};

/*Store single cluster information*/
struct Cluster
{
	//Q-value of the cluster as computed by calcBinomialCumuProbability()
	double p;
	//start_i: index (into the mutation list) of the first mutation of the cluster;
	//mut_k: number of consecutive mutations in the cluster
	int start_i, mut_k;
	//cluster_len: number of sequence positions spanned from the first to the last mutation;
	//expose_len / bury_len: how many of those positions are 'e' / 'b'
	int cluster_len, expose_len, bury_len;
};

/*Store all clusters for one sequence*/
struct ClusterArray
{
	//Heap-allocated array of clusters; the receiver releases it with free()
	struct Cluster * arr;
	//Number of valid entries in arr
	int num;
};

/*Store P-value related data produced by simulateP()*/
struct ClusterP
{
	//Simulation-corrected P-value (-1 when the simulation produced no cluster at all)
	double p;
	//k_counter: simulated clusters whose Q-value is not larger than the observed one;
	//total_counter: all simulated clusters that were examined
	int k_counter, total_counter;
};

/*Prototypes of the functions defined below main()*/
struct MutationPattern getMutationPattern(char * bury_pattern, char * seq, int seq_counter);
struct ClusterP simulateP(struct MutationPattern mutation_pattern, struct Cluster cluster_candidate, char * bury_pattern);
struct ClusterArray filtClusterCandidate(struct MutationPattern mutation_pattern, char * bury_pattern);
double calcBinomialCumuProbability(struct MutationPattern mutation_pattern, int bury_seq_len, int expose_seq_len, int n_b, int n_e, int k);
int * simuMutationS(int seq_len, int mut_num, int bury_num, int expose_num, char * bury_pattern);
int isInList(int element, int * list, int len);

/*
Program entry point.

Usage: Command [Input filename] [Output filename]

The input file holds two strings separated by whitespace, both of the same length
(shorter than MAX). Every character of the first string must be 'b' (buried) or
'e' (exposed); in the second string 'm' marks a mutated position.
When no output filename is given, the last three characters of the input filename
are replaced by "out".
For every cluster one tab-separated line is written to the output file:
sequence length, mutation #, i, k, start, end, buried fraction, q, p.

NOTE: the exit status is kept as in earlier versions of this program
(0 on any input error, 1 on success) so that existing callers keep working.
*/
int main(int argc, char *argv[])
{
	int i;
	if (argc != 2 && argc != 3)
	{
		printf("Input error!\n");
		printf("Correct format:\n");
		printf("Command  [Input filename] [Output filename]\n");
		return 0;
	}
	char * align_filename, * out_filename, tmpstr[FILENAME_MAX];
	align_filename = argv[1];
	FILE * fp1 = fopen(align_filename, "r");
	if (fp1 == NULL)
	{
		printf("No such file or directory: %s\n", align_filename);
		return 0;
	}
	if (argc == 3)
		out_filename = argv[2];
	else
	{
		//Derive the output name by overwriting the last three characters with "out"
		size_t name_len = strlen(align_filename);
		if (name_len < 3 || name_len >= sizeof tmpstr)
		{
			printf("Cannot derive an output filename from: %s\n", align_filename);
			fclose(fp1);
			return 0;
		}
		memcpy(tmpstr, align_filename, name_len + 1);
		memcpy(tmpstr + name_len - 3, "out", 3);
		out_filename = tmpstr;
	}
	//Read the first two runs of printable characters (codes 33..126) as the two sequences
	char seq[2][MAX];
	int current_char, seq_counter = 0, seqlen[2] = {0 ,0}, too_long = 0;
	i = 0;
	while (i < 2 && (current_char = fgetc(fp1)) != EOF)
	{
		if (current_char >= 33 && current_char <= 126)
		{
			//Keep one byte free for the terminator
			if (seq_counter >= MAX - 1)
			{
				too_long = 1;
				break;
			}
			seq[i][seq_counter] = (char)current_char;
			seq_counter++;
		}
		else if (seq_counter != 0)
		{
			seq[i][seq_counter] = '\0';
			seqlen[i] = seq_counter;
			i++;
			seq_counter = 0;
		}
	}
	//A sequence that ends at end-of-file without a trailing separator
	if (!too_long && i < 2 && seq_counter != 0)
	{
		seq[i][seq_counter] = '\0';
		seqlen[i] = seq_counter;
		i++;
	}
	fclose(fp1);
	if (too_long || i != 2 || seqlen[0] != seqlen[1])
	{
		printf("Format error in alignment file\n");
		return 0;
	}
	FILE * fp2 = fopen(out_filename, "w");
	if (fp2 == NULL)
	{
		printf("Cann't create such a file: %s\n", out_filename);
		return 0;
	}
	printf("\nInput:\n");
	puts(seq[0]);
	puts(seq[1]);
	for (i = 0; i < seqlen[1]; i++)
	{
		if (seq[0][i] != 'b' && seq[0][i] != 'e')
		{
			printf("Illegal character in input file\n");
			fclose(fp2);
			return 0;
		}
	}
	struct MutationPattern pattern;
	struct ClusterArray cluster_list;
	struct ClusterP p_rec;
	int mut_k, start_i, seq_start, seq_end;
	float bury_fraction;
	double pb; //P-value before simulation correction (Qs)
	pattern = getMutationPattern(seq[0], seq[1], seqlen[1]);
	printf("\nSequence length: %d\nMutation #: %d\nBuried mutation #: %d\nExposed mutation #: %d\n", pattern.seq_len, pattern.mut_num, pattern.bury_num, pattern.expose_num);
	cluster_list = filtClusterCandidate(pattern, seq[0]);
	printf("\n          \ti\tk\tstart\tend\tbury\tq\tp\n");
	for (i = 0; i < cluster_list.num; i++)
	{
		//All indice start from zero
		mut_k = cluster_list.arr[i].mut_k;
		start_i = cluster_list.arr[i].start_i;
		//Never index the mutation list with a cluster that does not lie inside it
		if (start_i < 0 || mut_k < 1 || start_i + mut_k > pattern.mut_num)
			continue;
		seq_start = pattern.mutation[start_i];
		seq_end = pattern.mutation[start_i + mut_k - 1];
		bury_fraction = (float)(cluster_list.arr[i].bury_len) / (float)(cluster_list.arr[i].cluster_len);
		p_rec = simulateP(pattern, cluster_list.arr[i], seq[0]);
		pb = cluster_list.arr[i].p;
		fprintf(fp2, "%d\t%d\t%d\t%d\t%d\t%d\t%g\t%g\t%g\n", pattern.seq_len, pattern.mut_num, start_i, mut_k, seq_start, seq_end, bury_fraction, pb, p_rec.p);
		fflush(fp2);
		printf("  Cluster %d\t%d\t%d\t%d\t%d\t%.2g\t%.1e\t%.1e\n", i, start_i, mut_k, seq_start, seq_end, bury_fraction, pb, p_rec.p);
	}
	free(cluster_list.arr);
	if (fclose(fp2) != 0)
	{
		printf("Error while writing: %s\n", out_filename);
		return 0;
	}
	return 1;
}

/*Get structure related mutation data for a given sequence*/
struct MutationPattern getMutationPattern(char * bury_pattern, char * seq, int seq_counter)
{
	int i, j, count = 0, bury_num = 0, expose_num = 0;
	struct MutationPattern pattern;
	for (i = 0; i < seq_counter; i++)
	{
		if (seq[i] == 'm')
		{
			pattern.mutation[count] = i;
			if (bury_pattern[i] == 'b')
				bury_num++;
			if (bury_pattern[i] == 'e')
				expose_num++;
			count++;
		}
	}
	pattern.mut_num = count;
	pattern.bury_num = bury_num;
	pattern.expose_num = expose_num;
	pattern.seq_len = seq_counter;
	return pattern;
}

/*Calculated the P-value by simulation correction*/
struct ClusterP simulateP(struct MutationPattern mutation_pattern, struct Cluster cluster_candidate, char * bury_pattern)
{
	int * mutation = NULL;
	int i, j, while_counter = 0;
	struct ClusterArray cluster_list;
	struct MutationPattern simu_pattern;
	double p, current_p;
	int current_mut_k, current_bury_len, current_cluster_len;
	long counter = 0, p_counter = 0;
	struct ClusterP p_rec;
	while (p_counter < MIN_COUNT)
	{
		for (i = 0; i < SIMULATION_ROUND; i++)
		{
			if ((i + 1) % (SIMULATION_ROUND / 10) == 0)
				printf("     \t\t.\t.\t.\t.\t.\t.\t.\n");
			mutation = simuMutationS(mutation_pattern.seq_len, mutation_pattern.mut_num, mutation_pattern.bury_num, mutation_pattern.expose_num, bury_pattern);
			for (j = 0; j < mutation_pattern.mut_num; j++)
				simu_pattern.mutation[j] = mutation[j];
			simu_pattern.seq_len = mutation_pattern.seq_len;
			simu_pattern.mut_num = mutation_pattern.mut_num;
			simu_pattern.bury_num = mutation_pattern.bury_num;
			simu_pattern.expose_num = mutation_pattern.expose_num;
			cluster_list = filtClusterCandidate(simu_pattern, bury_pattern);
			for (j = 0; j < cluster_list.num; j++)
			{
				counter++;
				current_p = cluster_list.arr[j].p;
				if (current_p <= cluster_candidate.p)
					p_counter++;
			}
			free(mutation);
			free(cluster_list.arr);
		}
		while_counter++;
		if (while_counter > 100)
			break;
	}
	if ((counter != 0) && (p_counter != 0))
		p = (double)(p_counter) / (double)(counter);
	else
	{
		if ((counter != 0) && (p_counter == 0)) //No simulated Qs is smaller than real Qs
			p = 1 / (float)(counter);
		else
			p = -1;
	}
	p_rec.p = p;
	p_rec.k_counter = p_counter;
	p_rec.total_counter = counter;
	return p_rec;
}

/*Get cluster candidate with local minimum Q-value*/
struct ClusterArray filtClusterCandidate(struct MutationPattern mutation_pattern, char * bury_pattern)
{
	int i, j, k, min_i, min_k, min_bury_cluster_len, min_expose_cluster_len, min_cluster_len, start_k = 2, start_i, mut_k;
	float p, min_p, current_p, next_p, last_p;
	int bury_seq_len = 0, expose_seq_len = 0, bury_cluster_len = 0, expose_cluster_len = 0;
	struct Cluster * candidate_list = NULL;
	candidate_list = malloc((mutation_pattern.mut_num-1) * sizeof(struct Cluster));
	for (i = 0; i < mutation_pattern.seq_len; i++)
	{
		if (bury_pattern[i] == 'b')
			bury_seq_len++;
		if (bury_pattern[i] == 'e')
			expose_seq_len++;
	}
	//Calculate Q-value alone the mutation list
	for (i = 0; i < mutation_pattern.mut_num - 1; i++)
	{
		min_p = 1;
		min_k = -1;
		min_bury_cluster_len = -1;
		min_expose_cluster_len = -1;
		for (k = start_k; k < mutation_pattern.mut_num - i +1; k++)
		{
			bury_cluster_len = 0;
			expose_cluster_len = 0;
			for (j = mutation_pattern.mutation[i]; j <= mutation_pattern.mutation[i + k -1]; j++)
			{
				if (bury_pattern[j] == 'b')
					bury_cluster_len++;
				if (bury_pattern[j] == 'e')
					expose_cluster_len++;
			}
			p = calcBinomialCumuProbability(mutation_pattern, bury_seq_len, expose_seq_len, bury_cluster_len, expose_cluster_len, k);
			if (p <= min_p)
			{
				min_p = p;
				min_k = k;
				min_bury_cluster_len = bury_cluster_len;
				min_expose_cluster_len = expose_cluster_len;
			}
		}
		candidate_list[i].start_i = i;
		candidate_list[i].mut_k = min_k;
		candidate_list[i].bury_len = min_bury_cluster_len;
		candidate_list[i].expose_len = min_expose_cluster_len;
		candidate_list[i].cluster_len = mutation_pattern.mutation[i + min_k - 1] - mutation_pattern.mutation[i] + 1;
		candidate_list[i].p = min_p;
	}
	//Detect the clusters with local minimum Q-value
	struct Cluster * extreme_p_list = NULL;
	extreme_p_list = malloc((mutation_pattern.mut_num-1) * sizeof(struct Cluster));
	int extreme_p_num = 0;
	if (mutation_pattern.mut_num == 2)
	{
		extreme_p_list[0] = candidate_list[0];
		extreme_p_num = 1;
	}	
	if (mutation_pattern.mut_num >= 3)
	{
		for (i = 0; i < mutation_pattern.mut_num - 1; i++)
		{
			if (i == 0)
			{
				current_p = candidate_list[i].p;
				next_p = candidate_list[i+1].p;
				if (current_p <= next_p)
				{
					extreme_p_list[extreme_p_num] = candidate_list[i];
					extreme_p_num++;
				}
				continue;
			}
			if (i == mutation_pattern.mut_num - 2)
			{
				current_p = candidate_list[i].p;
				last_p = candidate_list[i-1].p;
				if (current_p <= last_p)
				{
					extreme_p_list[extreme_p_num] = candidate_list[i];
					extreme_p_num++;
				}
				continue;
			}
			if ((i > 0) && (i < mutation_pattern.mut_num - 2))
			{
				current_p = candidate_list[i].p;
				last_p = candidate_list[i-1].p;
				next_p = candidate_list[i+1].p;
				if ((current_p <= last_p) && (current_p <= next_p))
				{
					extreme_p_list[extreme_p_num] = candidate_list[i];
					extreme_p_num++;
				}
				continue;
			}
			
		}
	}
	//Determine if there is any overlap between different clusters with local minimun Q-value
	struct Cluster * current_group = NULL;
	current_group = malloc((mutation_pattern.mut_num-1) * sizeof(struct Cluster));
	int cluster_count = 0;
	struct ClusterArray * group_list = NULL;
	group_list = malloc((mutation_pattern.mut_num-1) * sizeof(struct ClusterArray));
	int group_count = 0;
	int group_start = -1, group_end = -1;
	int overlap_flag;
	for (i = 0; i < extreme_p_num; i++)
	{
		start_i = extreme_p_list[i].start_i;
		mut_k = extreme_p_list[i].mut_k;
		overlap_flag = 0;
		for (j = start_i; j < start_i + mut_k; j++)
		{
			if ((j >= group_start) && (j <= group_end))
			{
				overlap_flag = 1;
				break;
			}
		}
		if (overlap_flag == 1)
		{
			current_group[cluster_count] = extreme_p_list[i];
			cluster_count++;
			group_end = start_i + mut_k - 1;
		}
		else
		{
			if (cluster_count == 0)
			{
				current_group[cluster_count] = extreme_p_list[i];
				cluster_count++;
			}
			else
			{
				struct Cluster * temp = NULL;
				temp = malloc(cluster_count * sizeof(struct Cluster));
				for (k = 0; k < cluster_count; k++)
					temp[k] = current_group[k];
				group_list[group_count].arr = temp;
				group_list[group_count].num = cluster_count;
				group_count++;
				cluster_count = 0;
				current_group[cluster_count] = extreme_p_list[i];
				cluster_count++;
			}
			group_start = start_i;
			group_end = start_i + mut_k - 1;
		}
	}
	if (cluster_count != 0)
	{
		struct Cluster * temp = NULL;
		temp = malloc(cluster_count * sizeof(struct Cluster));
		for (k = 0; k < cluster_count; k++)
			temp[k] = current_group[k];
		group_list[group_count].arr = temp;
		group_list[group_count].num = cluster_count;
		group_count++;
	}
	struct Cluster * final_list = NULL;
	final_list = malloc(group_count * sizeof(struct Cluster));
	for (i = 0; i < group_count; i++)
	{
		min_p = 1;
		min_i = -1;
		min_k = -1;
		min_bury_cluster_len = -1;
		min_expose_cluster_len = -1;
		min_cluster_len = -1;
		for (j = 0; j < group_list[i].num; j++)
		{
			p = group_list[i].arr[j].p;
			if (p < min_p)
			{
				min_p = p;
				min_i = group_list[i].arr[j].start_i;
				min_k = group_list[i].arr[j].mut_k;
				min_bury_cluster_len = group_list[i].arr[j].bury_len;
				min_expose_cluster_len = group_list[i].arr[j].expose_len;
				min_cluster_len = group_list[i].arr[j].cluster_len;
			}
		}
		final_list[i].p = min_p;
		final_list[i].start_i = min_i;
		final_list[i].mut_k = min_k;
		final_list[i].bury_len = min_bury_cluster_len;
		final_list[i].expose_len = min_expose_cluster_len;
		final_list[i].cluster_len = min_cluster_len;
	}
	struct ClusterArray result;
	result.arr = final_list;
	result.num = group_count;
	free(candidate_list);
	free(extreme_p_list);
	free(current_group);
	for (i = 0; i < group_count; i++)
		free(group_list[i].arr);
	free(group_list);
	return result;
}

/*
Calculate q-value (Eqa. 3)

The per-position mutation rates are P_b = bury_num / bury_seq_len for buried and
P_e = expose_num / expose_seq_len for exposed positions. With X_b ~ Binomial(n_b, P_b)
and X_e ~ Binomial(n_e, P_e), the loop sums P(X_e = i) * P(X_b <= k - i - 1) for
i = 0 .. k - 1, i.e. P(X_e + X_b <= k - 1), and the function returns one minus that sum:
the probability of at least k mutations in a window with n_b buried and n_e exposed
positions.
A class without any position in the sequence gets rate 0 instead of dividing by zero.
NOTE(review): the result is computed as 1 - sum, so values far below 1e-16 cannot be
resolved.
*/
double calcBinomialCumuProbability(struct MutationPattern mutation_pattern, int bury_seq_len, int expose_seq_len, int n_b, int n_e, int k)
{
	double P_b = 0, P_e = 0, sum = 0;
	int i;
	if (bury_seq_len > 0)
		P_b = (double)(mutation_pattern.bury_num) / (double)(bury_seq_len);
	if (expose_seq_len > 0)
		P_e = (double)(mutation_pattern.expose_num) / (double)(expose_seq_len);
	for (i = 0; i < k; i++)
		sum += gsl_ran_binomial_pdf((unsigned int)(i), P_e, (unsigned int)(n_e)) * gsl_cdf_binomial_P((unsigned int)(k - i - 1), P_b, (unsigned int)(n_b));
	return 1 - sum;
}

/*
Do mutation simulation for a sequence with given length, mutation number, buried
mutation number, exposed mutation number and buried/exposed information for each residue.

Distinct positions are drawn uniformly from 0 .. seq_len - 1; a draw is accepted while
the quota of its class ('b': bury_num, 'e': expose_num) is not yet filled. Every call
uses a freshly seeded generator: the global seed starts at seq_len * mut_num, is
increased by one per call and restarts at 1 once it exceeds 100000000.
The loop ends only when mut_num positions were accepted, so the quotas must add up to
at least mut_num and must be satisfiable by bury_pattern.
Returns a malloc()ed array of mut_num positions in ascending order which the caller
must free(). The program is terminated when memory cannot be allocated.
*/
int * simuMutationS(int seq_len, int mut_num, int bury_num, int expose_num, char * bury_pattern)
{
	int * mutation = NULL;
	int pos, curr_len = 0;
	int bury_count = 0, expose_count = 0;
	gsl_rng * r;
	//At least one element is requested so that NULL always means failure
	mutation = malloc((size_t)(mut_num > 0 ? mut_num : 1) * sizeof *mutation);
	r = gsl_rng_alloc (gsl_rng_taus);
	if (mutation == NULL || r == NULL)
	{
		fprintf(stderr, "Out of memory while simulating mutations\n");
		exit(EXIT_FAILURE);
	}
	if (seed_flag == 0)
	{
		seed = seq_len * mut_num;
		seed_flag = 1;
	}
	if (seed > 100000000)
		seed = 1;
	gsl_rng_set(r, seed);
	seed++;
	while (curr_len < mut_num)
	{
		//Draw until a position is found that was not accepted before
		pos = (int)(gsl_rng_uniform_int(r, seq_len));
		while (isInList(pos, mutation, curr_len))
			pos = (int)(gsl_rng_uniform_int(r, seq_len));
		if (bury_pattern[pos] == 'b' && bury_count < bury_num)
		{
			mutation[curr_len] = pos;
			curr_len++;
			bury_count++;
		}
		else if (bury_pattern[pos] == 'e' && expose_count < expose_num)
		{
			mutation[curr_len] = pos;
			curr_len++;
			expose_count++;
		}
	}
	gsl_sort_int(mutation, 1, mut_num);
	gsl_rng_free (r);
	return mutation;
}

/*Return 1 when element occurs among the first len entries of list, otherwise 0*/
int isInList(int element, int *list, int len)
{
	int *cursor = list;
	int remaining = len;
	while (remaining-- > 0)
	{
		if (*cursor++ == element)
			return 1;
	}
	return 0;
}
