#include <iostream>
#include <fstream>
#include <string>
#include <cmath>
#include <stdlib.h> 
#include <conio.h>
#include <sstream>
#include <list>
#include <iterator>
#include <iomanip> //to print double with desired precision
#include <time.h> //use timer function to generate actual random indices
#include "config.h" //include configuration file that helps in global settings

using namespace std;

long cepcoecount=0; //count the number of cepstral coefficients
double distortion=0,olddistortion=0; //distortion-->current distortion
int iterations=0;
string line;
double w[cepsize];//to store tokhura weights given by sir
int clustrep[cbsize]={0}; //word represented by the cluster
long wordendindex[wordcount];//to store wordendindex' ending indices from the mapping file
string words[wordcount]; //to store the words from mapping file.
long wordcluster[cbsize][wordcount];//to count the most number of word type vectors in a cluster, to decide which word that cluster represents

ifstream inpt,logmap;
ofstream logcep,logdist,cb, logclustersize;



struct node
{
    long cluster;
    double data[cepsize];
};

list < struct node *>nodelist,codebook; //to store the universe
list < struct node *>::iterator iter; //to iterate trhough list
list < struct node *>::iterator iternl;//to iterate through test data
list < struct node *>::iterator itercb; // to iterate through codebook vectors
struct node* centers[cbsize]; //to store sum of vectors of a cluster
//struct node* centersquare[cbsize]; //to store sum of squares of vectors of a cluster


void readceps() // this method reads all the universe of cepstral coefficients into list called nodelist
{
	string item;
    inpt.open(universe,ios::in);
    if(!inpt) //display error message if we cant open the file
    {
        cout << "file cant be open" << endl;
	    inpt.close();
		system("pause");
    }

	struct node* temp; //temp node to take each vector from textfile into nodelist
    while(!inpt.eof())
    {
        getline(inpt,line);//get the line by reading until newline is encountered
		cepcoecount++; //increment the number of cepstral coefficients by one
		stringstream ss(line); // convert the line to string stream so that it can be splitted easily
		int i=0;
		temp=new node;
		while(getline(ss,item,delim))//split w.r.t delim and splitted substrings lies in item
		{
			temp->data[i++]=stod(item.c_str());//c.str()-->to get a pointer to a "null-terminated character array with data equivalent to those stored in the string"
			//cout << temp->data[i-1] << endl;
		}
		temp->cluster=-1; //indicates not yet clustered
		nodelist.push_back(temp);
    }
	nodelist.pop_back(); //to eliminate the last line from text file which is nothing but new line and hence stores garbage results
	cepcoecount--;//so decrement the count by 1
    cout << "number of cepstral coefficients = " << cepcoecount << endl;
    inpt.close();
}

void initialization() //to select initial cluster centers
{
	time_t timer;
	long ini;
	struct node* tempcopy,*tempstore;
	for(int i=0;i < cbsize;i++)
	{
		time(&timer); //get the time in ticks
		ini=(rand()+timer)%(cepcoecount-0+1)+0;//min-->0, max-->cepcoecount for random number generation between two numbers
		//cout << ini << endl;
		iter = nodelist.begin();
		for(long j=1;(j < ini) && (iter!=nodelist.end());j++) //skip the initial vectors to reach the desired vector
			iter++;
		tempcopy=(*iter);
		tempcopy->cluster=i; //update the cluster number
		tempstore=new node;
		tempstore->cluster=i; //assign cluster number
		for(int j=0;j < cepsize;j++) //copy the data values
			tempstore->data[j]=tempcopy->data[j];
		codebook.push_back(tempstore); //store it in codebook
		//cout << "stored in code book " << i << endl;
	}
}

void initializecentroid()
{
	for(int i=0;i < cbsize;i++)
	{
		(centers[i])->cluster=0;//use cluster to store number of elements in the cluster as index already indicates the cluster number
		//(centersquare[i])->cluster=0;//use cluster to store number of elements in the cluster as index already indicates the cluster number
		for(int j=0;j < cepsize;j++)
		{
			(centers[i])->data[j]=0; //set all the data values to zero
			//(centersquare[i])->data[j]=0; //set all the data values to zero
		}
	}
}

void centroid(struct node *temp)
{
	((centers[temp->cluster])->cluster)++; //increment the number of elements in the cluster by 1
	for(int i=0;i < cepsize;i++) //add the corresponding dimensional elements which later deviding with cluster size gives centroid
		(centers[temp->cluster])->data[i]=(centers[temp->cluster])->data[i] + temp->data[i];
}

void eucledian() //classification based on eucledian distance
{
	double refdist=0,testdist=0;
	int index=0; //index of the cluster
	distortion=0;
	iternl=nodelist.begin(); //get the first vector from the universe
	while(iternl!=nodelist.end()) //iterate through universe of the vectors until all vectors are explored
	{
		itercb=codebook.begin(); //get the first code vector of the codebook
		refdist=0; //initialize reference distance
		for(int j=0;j < cepsize;j++) //assume the distance from first vector as reference distance
		{
			refdist+=((*iternl)->data[j]-(*itercb)->data[j])*((*iternl)->data[j]-(*itercb)->data[j]); //compute distance from first code vector as reference distance
		}
		index=0; //assume the vector belongs to first cluster
		for(int i=1;i < cbsize;i++)
		{
			itercb++; // go to next codebook vector
			testdist=0;
			for(int j=0;j < cepsize;j++)//find the distance from other vectors
			{
				testdist+=((*iternl)->data[j]-(*itercb)->data[j])*((*iternl)->data[j]-(*itercb)->data[j]); // compute the distance from the center of a cluster
			}
			if(testdist < refdist) //make the shortest distance as reference distance and save the correpsonding codebook vector index
			{
				refdist=testdist; //make the current lowest distance as the reference distance
				index=i; //and the vector may belongs to corresponding cluster
			}
		}
		distortion+=refdist; //add the distance to distortion
		(*iternl)->cluster=index; //associate the vector with the correpsonding cluster with lowest distance
		centroid(*iternl);//send it for centroid calculation
		//cout << "distortion = " << distortion << "  cluster = " << (*iternl)->cluster << endl;
		iternl++; //get next vector from universe
	}
}

void tokhura()
{
	double refdist=0,testdist=0;
	int index=0; //index of the cluster
	distortion=0;
	long nodecounter=0; //to count the nodes
	memset(wordcluster, 0, sizeof wordcluster); //initialize whole 2D array to zero
	iternl=nodelist.begin(); //get the first vector from the universe
	while(iternl!=nodelist.end()) //iterate through universe of the vectors until all vectors are explored
	{
		itercb=codebook.begin(); //get the first code vector of the codebook
		refdist=0; //initialize reference distance
		for(int j=0;j<cepsize;j++) //assume the distance from first vector as reference distance
		{
			refdist+=w[j]*((*iternl)->data[j]-(*itercb)->data[j])*((*iternl)->data[j]-(*itercb)->data[j]); //compute distance from first code vector as reference distance
		}
		refdist/=cepsize;//as per tokhura distance
		index=0; //assume the vector belongs to first cluster
		for(int i=1;i < cbsize;i++)
		{
			itercb++; // go to next codebook vector
			testdist=0;
			for(int j=0;j < cepsize;j++)//find the distance from other vectors
			{
				testdist+=w[j]*((*iternl)->data[j]-(*itercb)->data[j])*((*iternl)->data[j]-(*itercb)->data[j]); // compute the distance from the center of a cluster
			}
			testdist/=cepsize; //as per tokhura distance formula
			if(testdist < refdist) //make the shortest distance as reference distance and save the correpsonding codebook vector index
			{
				refdist=testdist; //make the current lowest distance as the reference distance
				index=i; //and the vector may belongs to corresponding cluster
			}
		}
		distortion+=refdist; //add the distance to distortion
		(*iternl)->cluster=index; //associate the vector with the correpsonding cluster with lowest distance
		for(int i=0;i < wordcount;i++)
		{
			if(nodecounter <= wordendindex[i])
			{
				wordcluster[index][i]++;
				break;
			}
		}
		nodecounter++; //increment the node counter
		centroid(*iternl);//send it for centroid calculation
		//cout << "distortion = " << distortion << "  cluster = " << (*iternl)->cluster << endl;
		iternl++; //get next vector from universe
	}
}

void tokhuraweightsbyme()
{
	struct node* temp, *xmean, *xsquare,*variance;
	temp=new node;
	xmean=new node; // to store sum of the data elements
	xsquare=new node; //to sum of the squares of the elements
	variance=new node; //to store the variance
	for(int i=0;i<cepsize;i++) //initialize the values to zero
	{
		xmean->data[i]=0;
		xsquare->data[i]=0;
	}
	iter = nodelist.begin(); //get the first vector from universe

	while(iter != nodelist.end()) //iterate till the last vector in the universe is obtained
	{
		temp=(*iter);
		for(int i=0;i<cepsize;i++) //get each data element of a vector
		{
			(xmean->data[i])+=temp->data[i]; //add the data element
			(xsquare->data[i])+=(temp->data[i])*(temp->data[i]); //add the square of the data element
		}
		iter++; //go to next vector of the universe
	}
	cout << "tokhura weights are" << endl;
	for(int i=0;i<cepsize;i++) //get each data element of a vector
	{
		variance->data[i]=(xsquare->data[i])/cepcoecount-((xmean->data[i])*(xmean->data[i]))/(cepcoecount*cepcoecount);
		cout  <<  fixed  <<  setprecision(precision); // to print with desired precision
		cout << "SIGMA" << i << "=" << variance->data[i] << "\t";
		cout << "w[" << i << "] = " << 1/variance->data[i] << endl;
		w[i]=1/variance->data[i]; //reciprocal of the variance gives tokhura weight
	}
}

void classification()
{
	//eucledian(); //classification based on eucledian distance
	tokhura(); //classification based on tokhura distance
}

void updatecodevector() // to update the code vectors to centroids of the clusters
{
	long i=0;
	iterations++;
	logclustersize << "Iteration number: " << iterations << endl;
	itercb=codebook.begin(); //get the first code vector of the codebook
	while(itercb!=codebook.end()) //iterate till last code vector is obtained
	{
		if(centers[i]->cluster==0) //indicates the cluster do not have any vectors associated with it
		{
			cout << "cluster " << i << " is empytcell" << endl;
			logclustersize << "cluster " << i << " is empytcell" << endl;
			itercb++; //go to next code vector
			i++;
			continue;
		}
		for(int j=0;j<cepsize;j++) //get each data element from the code vector
			(*itercb)->data[j]=(centers[i]->data[j])/(centers[i]->cluster);

		itercb++; //get next code vector
		logclustersize << "cluster " << i << " ---> " << centers[i]->cluster << " vectors " << endl;
		i++;
	}
	logclustersize << endl;
	initializecentroid();//erase the old values by initializing the values to zeros
}

bool termination() //to check the termination condition
{
	if(olddistortion==0) //executed for the first iteration of the algo.
	{
		olddistortion=distortion; //assign current distortion to old distortion
		return false;
	}
	if((olddistortion-distortion)*100/distortion<thresholdist) //check the threshold condition
		return true;
	olddistortion=distortion; //store the current distortion in olddistortion for later comparison
	return false;
}

void displayvectors() //to display all vectors of cepstral coefficients
{
	struct node* temp;
	iter = nodelist.begin(); //get the first vector from universe

	while(iter != nodelist.end()) //iterate till the last vector in the universe is obtained
	{
		temp=(*iter);
		for(int i=0;i<cepsize;i++) //get each data element of a vector
			cout  <<  temp->data[i]  <<  " ";
		cout << endl;
		iter++; // move to next vector
	}

}

void displaycodebook() // to display code vectors in code book
{
	struct node* temp;
	iter = codebook.begin();
	cout << "vectors in codebook" << endl;
	cb.open("codebook.txt");
	if(!cb) //if we cant open the file notify with error message
	{
		cout << "can't open codebook.txt file" << endl;
		cb.close();
		return;
	}
	while(iter != codebook.end()) //iterate till the end of the codebook
	{
		temp=(*iter);
		for(int i=0;i<cepsize;i++) //get each data element from the vector
		{
			cout  <<  fixed  <<  setprecision(precision); // to print with desired precision
			cout  <<  temp->data[i]  <<  " "; //write to standard output
			cb  <<  fixed  <<  setprecision(precision); //to store with desired precision
			cb << temp->data[i]  <<  " "; //write to file
		}
		cout << endl;
		cb << endl;
		iter++; //move to next code vector
	}
	cb.close(); //close the file
}

void disp_cluster_repres() //display which word each cluster represents
{
	cout  <<  "\n\n    cluster\tword\t"  << endl;
	for(int i=0;i<cbsize;i++)
	{
		int max=0;
		for(int j=0;j &wordcount;j++)
		{
			if(wordcluster[i][max]<wordcluster[i][j])
				max=j;
		}
		if(wordcluster[i][max]==0)
		{
			cout << "\t" << i  <<  "\tempty cell" << endl;
				continue;
		}
		cout  << "\t" <<  i << "\t"  << words[max] << endl;
	}
}

void tokhuraweightsbysir()
{
	w[0]=1;//these are tokhura weights given by sir
	w[1]=3;
	w[2]=7;
	w[3]=13;
	w[4]=19;
	w[5]=22;
	w[6]=25;
	w[7]=33;
	w[8]=42;
	w[9]=50;
	w[10]=56;
	w[11]=61;
}

void algo()
{
	for(int i=0;i<cbsize;i++) //create it once and initialize after every iteration
	{
		centers[i]=new node;
		//centersquare[i]=new node;
	}
	initialization(); //to select initial cluster centers
	initializecentroid(); // to initialize all centroid values to zeros

	logdist.open("logdistortion.txt"); //log file to store the distortion after each iteration
	logclustersize.open("logclustersize.txt");
	while(!termination())
	{
		classification(); // classify the vectors based on nearest cluster center
		updatecodevector(); //update the cluster centers to new centroids
		cout << "distortion = " << distortion << "\titeration = " << iterations << endl;
		logdist << distortion/cbsize << endl; //average distortion per codebook vector or per cluster
		//logdist << abs((olddistortion-distortion))*100/distortion << endl;
	}
	logdist.close();
	logclustersize.close();
}

void kmeans()
{
	readceps(); //to read the cepstral coefficient vectors from available universe
	tokhuraweightsbysir(); //use the tokhura weights given by sir
	//tokhuraweightsbyme(); //compute the tokhura weights from universe
	algo(); // apply algorithm on universe
	//displayvectors(); //diplay vectors of universe
	//displaycodebook(); //store and display generated codebook
}

int main()
{
	string splitmap;//used to split each line in mapping file with respect to mapdelim delimiter
	logmap.open("mapping.txt",ios::in);//read the mapping from vowels to vectors
	if(!logmap) //display error message if we cant open the file
    {
        cout << "file can not be open" << endl;
	    logmap.close();
		system("pause");
		return 0;
    }
	getline(logmap, line); //read and ignore first line
	int temp=0; //to count the lines in mapping file
	cout << "reading mapping file..." << endl;
	while(!logmap.eof())
	{
		string tempstring; //to read temporary string which will further be converted into long
		getline(logmap,line);
		stringstream ss(line);
		int j=0;
		while(getline(ss,splitmap,mapdelim))//split w.r.t delim and splitted substrings lies in splitmap
		{
			if(j==0)
				words[temp++]=splitmap;
			if(j!=2) //ending index of each word is stored in second column of mapping file.
			{
				j++;
				continue;
			}
			j++;
			wordendindex[temp]=atol(splitmap.c_str());
			//cout << splitmap->data[i-1] << endl;
			cout << temp-1 << "th word ending index is " << wordendindex[temp-1] << endl;
		}
	}
	logmap.close();
	kmeans(); //call k-means
	disp_cluster_repres(); //display which word each cluster represents
	system("pause");
	//delete []wordcluster;
	return 0;
}