/* DNA序列分类:Fisher判别法 */

#include <stdio.h>
#include <string.h>
#include <math.h>

#define DNA_A 'a'
#define DNA_T 't'
#define DNA_C 'c'
#define DNA_G 'g'

#define DNA_AN 0
#define DNA_TN 1
#define DNA_CN 2
#define DNA_GN 3

/* Map one lowercase nucleotide character to its base-4 digit (DNA_AN..DNA_GN).
 * Any other character contributes 0, exactly like the switch statements this
 * helper replaces, which had no default branch. */
static int dnaBaseDigit(int ch)
{
	switch(ch)
	{
	case 'a':
		return DNA_AN;
	case 't':
		return DNA_TN;
	case 'c':
		return DNA_CN;
	case 'g':
		return DNA_GN;
	default:
		return 0;
	}
}

/* Encode the three bases codon[0..2] as an index in 0..63.
 * The first base is the least significant base-4 digit. */
static int dnaCodonIndex(const char *codon)
{
	return dnaBaseDigit(codon[0]) + 4 * dnaBaseDigit(codon[1]) + 16 * dnaBaseDigit(codon[2]);
}

/* Return 1 when the base is 'a' or 't', otherwise 0. */
static int dnaIsAT(int ch)
{
	return ch == 'a' || ch == 't';
}

/* fopen() wrapper that reports the failing path on stderr; returns NULL on failure. */
static FILE *dnaOpenFile(const char *path, const char *mode)
{
	FILE *fp = fopen(path, mode);
	if(fp == NULL)
		perror(path);
	return fp;
}

/*
 * Fisher linear discriminant on 20 built-in DNA sequences
 * (sequences 0-9 are class A, 10-19 are class B).
 *
 * Steps:
 *   1. Build a codon -> group table from the codon lists in AminoAcids.
 *   2. For every sequence and each of 3 reading frames, count codon groups
 *      and A/T bases, then convert the counts to fractions (22 features).
 *   3. For each sequence keep the reading frame whose feature vector is
 *      closest to a frame of another sequence of the same class (sel[]).
 *   4. Rank the 22 features by summed absolute difference between the two
 *      classes and keep the top 8.
 *   5. Compute class means and the total within-class scatter matrix,
 *      write it to stanZZ.txt, and read its inverse from stanInv.txt.
 *      The inverse is NOT computed here: stanInv.txt must be prepared
 *      externally from stanZZ.txt before the run can complete.
 *   6. Compute the projection vector omega and the threshold, then print
 *      the classification (1/0) of the 20 training sequences.
 *
 * Output files: FisherNum.txt, FisherPercent.txt, feature.txt, stanZZ.txt,
 * InfoFun.txt, Eight.txt.
 *
 * Returns 0 on success, 1 when a file cannot be opened or stanInv.txt
 * cannot be parsed.
 */
int main()
{
	enum
	{
		SEQ_COUNT     = 20,	/* number of training sequences */
		CLASS_SIZE    = 10,	/* sequences per class */
		FRAME_COUNT   = 3,	/* reading frames per sequence */
		ROW_COUNT     = 60,	/* SEQ_COUNT * FRAME_COUNT feature rows */
		CODON_COUNT   = 64,
		GROUP_COUNT   = 21,	/* codon groups listed in AminoAcids */
		FEATURE_COUNT = 22,	/* 21 group fractions + 1 A/T fraction */
		AT_FEATURE    = 21,	/* column holding the A/T count / fraction */
		PICK_COUNT    = 8	/* features kept for the discriminant */
	};
	const float DIST_SENTINEL = 60000.0f;	/* larger than any reachable squared distance */

	/* Codon lists: entry i is the concatenation of the codons assigned to group i. */
	static const char *const AminoAcids[GROUP_COUNT] =
	{
		"aaaaag",
		"aataacgaagacgatgag",
		"taatagtat",
		"tac",
		"caacagcatcac",
		"agaagcagtaggtcgtca",
		"ggaggcggtggg",
		"tgatgctgttgg",
		"cgacgccgtcgg",
		"ataatg",
		"gtagtg",
		"gttgtc",
		"ttattg",
		"tttttc",
		"ctactg",
		"cttctc",
		"acaacg",
		"acc",
		"gcagccgctgcgtcttcc",
		"ccaccgcctccc",
		"attatcact"
	};

	/* Training DNA sequences. */
	static const char *const dnaArray[SEQ_COUNT] =
	{
		"aggcacggaaaaacgggaataacggaggaggacttggcacggcattacacggaggacgaggtaaaggaggcttgtctacggccggaagtgaagggggatatgaccgcttgg",
		"cggaggacaaacgggatggcggtattggaggtggcggactgttcggggaattattcggtttaaacgggacaaggaaggcggctggaacaaccggacggtggcagcaaagga",
		"gggacggatacggattctggccacggacggaaaggaggacacggcggacatacacggcggcaacggacggaacggaggaaggagggcggcaatcggtacggaggcggcgga",
		"atggataacggaaacaaaccagacaaacttcggtagaaatacagaagcttagatgcatatgttttttaaataaaatttgtattattatggtatcataaaaaaaggttgcga",
		"cggctggcggacaacggactggcggattccaaaaacggaggaggcggacggaggctacaccaccgtttcggcggaaaggcggagggctggcaggaggctcattacggggag",
		"atggaaaattttcggaaaggcggcaggcaggaggcaaaggcggaaaggaaggaaacggcggatatttcggaagtggatattaggagggcggaataaaggaacggcggcaca",
		"atgggattattgaatggcggaggaagatccggaataaaatatggcggaaagaacttgttttcggaaatggaaaaaggactaggaatcggcggcaggaaggatatggaggcg",
		"atggccgatcggcttaggctggaaggaacaaataggcggaattaaggaaggcgttctcgcttttcgacaaggaggcggaccataggaggcggattaggaacggttatgagg",
		"atggcggaaaaaggaaatgtttggcatcggcgggctccggcaactggaggttcggccatggaggcgaaaatcgtgggcggcggcagcgctggccggagtttgaggagcgcg",
		"tggccgcggaggggcccgtcgggcgcggatttctacaagggcttcctgttaaggaggtggcatccaggcgtcgcacgctcggcgcggcaggaggcacgcgggaaaaaacg",
		"gttagatttaacgttttttatggaatttatggaattataaatttaaaaatttatattttttaggtaagtaatccaacgtttttattactttttaaaattaaatatttatt",
		"gtttaattactttatcatttaatttaggttttaattttaaatttaatttaggtaagatgaatttggttttttttaaggtagttatttaattatcgttaaggaaagttaaa",
		"gtattacaggcagaccttatttaggttattattattatttggattttttttttttttttttttaagttaaccgaattattttctttaaagacgttacttaatgtcaatgc",
		"gttagtcttttttagattaaattattagattatgcagtttttttacataagaaaatttttttttcggagttcatattctaatctgtctttattaaatcttagagatatta",
		"gtattatatttttttatttttattattttagaatataatttgaggtatgtgtttaaaaaaaatttttttttttttttttttttttttttttttaaaatttataaatttaa",
		"gttatttttaaatttaattttaattttaaaatacaaaatttttactttctaaaattggtctctggatcgataatgtaaacttattgaatctatagaattacattattgat",
		"gtatgtctatttcacggaagaatgcaccactatatgatttgaaattatctatggctaaaaaccctcagtaaaatcaatccctaaacccttaaaaaacggcggcctatccc",
		"gttaattatttattccttacgggcaattaattatttattacggttttatttacaattttttttttttgtcctatagagaaattacttacaaaacgttattttacatactt",
		"gttacattatttattattatccgttatcgataattttttacctcttttttcgctgagtttttattcttactttttttcttctttatataggatctcatttaatatcttaa",
		"gtatttaactctctttactttttttttcactctctacattttcatcttctaaaactgtttgatttaaacttttgtttctttaaggattttttttacttatcctctgttat"
	};

	int Clover[CODON_COUNT];			/* codon index -> group index, -1 if unassigned */
	int num[ROW_COUNT][FEATURE_COUNT] = {{0}};	/* raw counts per (sequence, frame) row */
	int ClovNum[ROW_COUNT] = {0};			/* codons counted per row */
	float percent[ROW_COUNT][FEATURE_COUNT] = {{0}};	/* counts converted to fractions */
	int length[SEQ_COUNT];
	int sel[SEQ_COUNT];				/* chosen feature row for each sequence */
	double JD[FEATURE_COUNT], temp;			/* per-feature between-class distance */
	int dSite[FEATURE_COUNT], tempInt;		/* feature indices, sorted along with JD */
	int featureSel[PICK_COUNT];
	float feat[SEQ_COUNT][PICK_COUNT];		/* selected features of the selected rows */
	float dev[SEQ_COUNT][PICK_COUNT];		/* feat minus its class mean */
	float evgM[2][PICK_COUNT], evgMinus[PICK_COUNT], tildeM[2];
	float stanZZ[PICK_COUNT][PICK_COUNT], stanInv[PICK_COUNT][PICK_COUNT];
	float omega[PICK_COUNT], thresY, percentY[SEQ_COUNT];
	float dis, disMin, best, diff, sumA, sumB;
	FILE *file;
	int i, j, k, p, len, row, first, group, lo, cls;

	/* Build the codon -> group lookup table. */
	for(i=0; i<CODON_COUNT; i++)
	{
		Clover[i] = -1;
	}
	for(i=0; i<GROUP_COUNT; i++)
	{
		len = (int)strlen(AminoAcids[i]);
		for(j=0; j+2<len; j+=3)
		{
			Clover[dnaCodonIndex(&AminoAcids[i][j])] = i;
		}
	}

	/* Count codon groups and A/T bases for each sequence in 3 reading frames. */
	for(i=0; i<SEQ_COUNT; i++)
	{
		const char *seq = dnaArray[i];
		length[i] = (int)strlen(seq);
		for(k=0; k<FRAME_COUNT; k++)
		{
			row = FRAME_COUNT*i + k;
			/* j is the index of the LAST base of a codon (bases j-2, j-1, j).
			 * Frames 0 and 1 must therefore start one codon later, otherwise
			 * the first window would read before the start of the sequence. */
			first = (k < 2) ? k + 3 : k;
			for(j=first; j<length[i]; j+=3)
			{
				group = Clover[dnaCodonIndex(&seq[j-2])];
				if(group >= 0)
					num[row][group]++;
				ClovNum[row]++;
				num[row][AT_FEATURE] += dnaIsAT(seq[j-2]) + dnaIsAT(seq[j-1]) + dnaIsAT(seq[j]);
			}
		}
	}

	/* Convert counts to fractions. */
	for(i=0; i<ROW_COUNT; i++)
	{
		for(j=0; j<GROUP_COUNT; j++)
		{
			percent[i][j] = (ClovNum[i] > 0) ? (float)(num[i][j] * 1.0 / ClovNum[i]) : 0.0f;
		}
		len = length[i/FRAME_COUNT];
		percent[i][AT_FEATURE] = (len > 0) ? (float)(num[i][AT_FEATURE] * 1.0 / len) : 0.0f;
	}

	file = dnaOpenFile("FisherNum.txt", "w");
	if(file == NULL)
		return 1;
	for(i=0; i<ROW_COUNT; i++)
	{
		fprintf(file, "%d  ", ClovNum[i]);
		for(j=0; j<FEATURE_COUNT; j++)
		{
			fprintf(file, "%d  ", num[i][j]);
		}
		fprintf(file, "\n");
	}
	fclose(file);

	file = dnaOpenFile("FisherPercent.txt", "w");
	if(file == NULL)
		return 1;
	for(i=0; i<ROW_COUNT; i++)
	{
		for(j=0; j<FEATURE_COUNT; j++)
		{
			fprintf(file, "%f  ", percent[i][j]);
		}
		fprintf(file, "\n");
	}
	fclose(file);

	/* For each sequence pick the reading frame that lies closest (squared
	 * Euclidean distance over all 22 features) to any frame of a DIFFERENT
	 * sequence in the same class. */
	for(i=0; i<SEQ_COUNT; i++)
	{
		lo = (i < CLASS_SIZE) ? 0 : CLASS_SIZE*FRAME_COUNT;	/* first row of this class */
		sel[i] = FRAME_COUNT*i;		/* fallback so sel[i] is always a valid row */
		best = DIST_SENTINEL;
		for(k=0; k<FRAME_COUNT; k++)
		{
			row = FRAME_COUNT*i + k;
			disMin = DIST_SENTINEL;
			for(j=lo; j<lo+CLASS_SIZE*FRAME_COUNT; j++)
			{
				if(j/FRAME_COUNT == i)
					continue;	/* skip the sequence's own frames */
				dis = 0;
				for(p=0; p<FEATURE_COUNT; p++)
				{
					diff = percent[row][p] - percent[j][p];
					dis += diff * diff;
				}
				if(dis < disMin)
					disMin = dis;
			}
			if(disMin < best)
			{
				best = disMin;
				sel[i] = row;
			}
		}
	}
	for(i=0; i<SEQ_COUNT; i++)
	{
		printf("%d ", sel[i]);
	}
	printf("\n");

	/* Score each feature by the summed absolute difference over all
	 * (class A, class B) pairs of selected rows. */
	for(p=0; p<FEATURE_COUNT; p++)
	{
		JD[p] = 0;
	}
	for(i=0; i<CLASS_SIZE; i++)
	{
		for(j=CLASS_SIZE; j<SEQ_COUNT; j++)
		{
			for(p=0; p<FEATURE_COUNT; p++)
			{
				JD[p] += fabs(percent[sel[i]][p] - percent[sel[j]][p]);
			}
		}
	}
	for(i=0; i<FEATURE_COUNT; i++)
	{
		printf("%f  ", JD[i]);
		dSite[i] = i;
	}
	printf("\n");

	/* Bubble-sort JD in descending order, carrying the feature indices along. */
	for(i=0; i<FEATURE_COUNT-1; i++)
	{
		for(j=FEATURE_COUNT-1; j>i; j--)
		{
			if(JD[j] > JD[j-1])
			{
				temp = JD[j];
				JD[j] = JD[j-1];
				JD[j-1] = temp;

				tempInt = dSite[j];
				dSite[j] = dSite[j-1];
				dSite[j-1] = tempInt;
			}
		}
	}

	/* Keep the 8 highest-scoring feature columns. */
	for(i=0; i<PICK_COUNT; i++)
	{
		featureSel[i] = dSite[i];
		printf("%d  ", dSite[i]);
	}
	printf("\n");

	/* Gather the 8 selected features of each sequence's selected row.
	 * The row index is sel[i]; dSite[] holds feature indices and must not
	 * be used to address sample rows. */
	for(i=0; i<SEQ_COUNT; i++)
	{
		for(j=0; j<PICK_COUNT; j++)
		{
			feat[i][j] = percent[sel[i]][featureSel[j]];
		}
	}

	/* Dump the feature vectors with class labels (1 for A, -1 for B). */
	file = dnaOpenFile("feature.txt", "w");
	if(file == NULL)
		return 1;
	for(i=0; i<SEQ_COUNT; i++)
	{
		if(i == CLASS_SIZE)
			fprintf(file, "\n");	/* blank line between the two classes */
		fprintf(file, "%2d  %2d  ", i, (i < CLASS_SIZE) ? 1 : -1);
		for(j=0; j<PICK_COUNT; j++)
		{
			fprintf(file, "%f  ", feat[i][j]);
		}
		fprintf(file, "\n");
	}
	fclose(file);

	/* Fisher linear discriminant */

	/* Class mean vectors. */
	for(j=0; j<PICK_COUNT; j++)
	{
		evgM[0][j] = 0;
		evgM[1][j] = 0;
	}
	for(i=0; i<SEQ_COUNT; i++)
	{
		cls = (i < CLASS_SIZE) ? 0 : 1;
		for(j=0; j<PICK_COUNT; j++)
		{
			evgM[cls][j] += feat[i][j];
		}
	}
	for(j=0; j<PICK_COUNT; j++)
	{
		evgM[0][j] /= CLASS_SIZE;
		evgM[1][j] /= CLASS_SIZE;
		/* Printed only after both means are normalised. */
		printf("%f  %f\n", evgM[0][j], evgM[1][j]);
	}

	/* Within-class scatter of each class and their sum. */
	for(i=0; i<SEQ_COUNT; i++)
	{
		cls = (i < CLASS_SIZE) ? 0 : 1;
		for(j=0; j<PICK_COUNT; j++)
		{
			dev[i][j] = feat[i][j] - evgM[cls][j];
		}
	}
	for(i=0; i<PICK_COUNT; i++)
	{
		for(j=0; j<PICK_COUNT; j++)
		{
			sumA = 0;
			sumB = 0;
			for(p=0; p<CLASS_SIZE; p++)
			{
				sumA += dev[p][i] * dev[p][j];
				sumB += dev[p+CLASS_SIZE][i] * dev[p+CLASS_SIZE][j];
			}
			stanZZ[i][j] = sumA + sumB;
		}
	}
	file = dnaOpenFile("stanZZ.txt", "w");
	if(file == NULL)
		return 1;
	for(i=0; i<PICK_COUNT; i++)
	{
		for(j=0; j<PICK_COUNT; j++)
		{
			fprintf(file, "%f  ", stanZZ[i][j]);
		}
		fprintf(file, "\n");
	}
	fclose(file);

	/* Read the inverse of the scatter matrix (prepared outside this program). */
	file = fopen("stanInv.txt", "r");
	if(file == NULL)
	{
		printf("Wrong!\n");
		return 1;	/* nothing to read from; continuing would dereference NULL */
	}
	for(i=0; i<PICK_COUNT; i++)
	{
		for(j=0; j<PICK_COUNT; j++)
		{
			if(fscanf(file, "%f", &stanInv[i][j]) != 1)
			{
				printf("Wrong!\n");
				fclose(file);
				return 1;
			}
			printf("%f  ", stanInv[i][j]);
		}
		printf("\n");
	}
	fclose(file);

	/* Projection vector: omega = stanInv * (meanA - meanB). */
	for(i=0; i<PICK_COUNT; i++)
	{
		evgMinus[i] = evgM[0][i] - evgM[1][i];
	}
	printf("判别函数参数:\n");
	for(i=0; i<PICK_COUNT; i++)
	{
		omega[i] = 0;
		for(j=0; j<PICK_COUNT; j++)
		{
			omega[i] += stanInv[i][j] * evgMinus[j];
		}
		printf("%f\n", omega[i]);
	}

	/* Project every sample onto omega (8 dimensions -> 1). */
	for(i=0; i<SEQ_COUNT; i++)
	{
		percentY[i] = 0;
		for(j=0; j<PICK_COUNT; j++)
		{
			percentY[i] += feat[i][j] * omega[j];
		}
	}

	/* Threshold: midpoint between the two projected class means. */
	tildeM[0] = tildeM[1] = 0;
	for(i=0; i<SEQ_COUNT; i++)
	{
		tildeM[(i < CLASS_SIZE) ? 0 : 1] += percentY[i];
	}
	tildeM[0] /= CLASS_SIZE;
	tildeM[1] /= CLASS_SIZE;
	thresY = (tildeM[0] + tildeM[1])/2;
	printf("门限Y = %f\n", thresY);

	file = dnaOpenFile("InfoFun.txt", "w");
	if(file == NULL)
		return 1;
	for(i=0; i<PICK_COUNT; i++)
	{
		fprintf(file, "%f  ", omega[i]);
	}
	fprintf(file, "\n%f", thresY);
	fclose(file);

	/* Classify the training samples: 1 when the projection exceeds the threshold. */
	for(i=0; i<SEQ_COUNT; i++)
	{
		if(percentY[i] > thresY)
			printf("1\n");
		else
			printf("0\n");
	}

	file = dnaOpenFile("Eight.txt", "w");
	if(file == NULL)
		return 1;
	for(i=0; i<SEQ_COUNT; i++)
	{
		for(j=0; j<PICK_COUNT; j++)
		{
			fprintf(file, "%f  ", feat[i][j]);
		}
		fprintf(file, "\n");
	}
	fclose(file);

	return 0;
}

/* 你可能感兴趣的:(DNA序列分类:Fisher判别法) */