#include <stdio.h> #include <string.h> #include <math.h> #define DNA_A 'a' #define DNA_T 't' #define DNA_C 'c' #define DNA_G 'g' #define DNA_AN 0 #define DNA_TN 1 #define DNA_CN 2 #define DNA_GN 3 int main() { int Clover[64]; char AminoAcids[21][25]; int i, j, count, ich, length[20], len, k, p, q; char dnaArray[20][200]; //DNA序列 int num[60][22], ClovNum[60]; float percent[60][22]; FILE *file; float distance[60], disMin[3], dis; int sel[20]; double JD[22], temp; //平均距离 int dSite[22], tempInt, featureSel[8]; float percentY[20], evgM[2][8], tildeM[2], evgMinus[8]; float perMinusA[10][8], perMinusB[10][8], stanA[8][8], stanB[8][8], stanZZ[8][8], stanInv[8][8]; float disperS[2][8], disperSW[8]; float omega[8], thresY; float testYY[20]; int testInt[20]; strcpy(AminoAcids[0], "aaaaag"); strcpy(AminoAcids[1], "aataacgaagacgatgag"); strcpy(AminoAcids[2], "taatagtat"); strcpy(AminoAcids[3], "tac"); strcpy(AminoAcids[4], "caacagcatcac"); strcpy(AminoAcids[5], "agaagcagtaggtcgtca"); strcpy(AminoAcids[6], "ggaggcggtggg"); strcpy(AminoAcids[7], "tgatgctgttgg"); strcpy(AminoAcids[8], "cgacgccgtcgg"); strcpy(AminoAcids[9], "ataatg"); strcpy(AminoAcids[10], "gtagtg"); strcpy(AminoAcids[11], "gttgtc"); strcpy(AminoAcids[12], "ttattg"); strcpy(AminoAcids[13], "tttttc"); strcpy(AminoAcids[14], "ctactg"); strcpy(AminoAcids[15], "cttctc"); strcpy(AminoAcids[16], "acaacg"); strcpy(AminoAcids[17], "acc"); strcpy(AminoAcids[18], "gcagccgctgcgtcttcc"); strcpy(AminoAcids[19], "ccaccgcctccc"); strcpy(AminoAcids[20], "attatcact"); strcpy(dnaArray[0], "aggcacggaaaaacgggaataacggaggaggacttggcacggcattacacggaggacgaggtaaaggaggcttgtctacggccggaagtgaagggggatatgaccgcttgg"); strcpy(dnaArray[1], "cggaggacaaacgggatggcggtattggaggtggcggactgttcggggaattattcggtttaaacgggacaaggaaggcggctggaacaaccggacggtggcagcaaagga"); strcpy(dnaArray[2], "gggacggatacggattctggccacggacggaaaggaggacacggcggacatacacggcggcaacggacggaacggaggaaggagggcggcaatcggtacggaggcggcgga"); strcpy(dnaArray[3], "atggataacggaaacaaaccagacaaacttcggtagaaatacagaagcttagatgcatatgttttttaaataaaatttgtattattatggtatcataaaaaaaggttgcga"); strcpy(dnaArray[4], "cggctggcggacaacggactggcggattccaaaaacggaggaggcggacggaggctacaccaccgtttcggcggaaaggcggagggctggcaggaggctcattacggggag"); strcpy(dnaArray[5], "atggaaaattttcggaaaggcggcaggcaggaggcaaaggcggaaaggaaggaaacggcggatatttcggaagtggatattaggagggcggaataaaggaacggcggcaca"); strcpy(dnaArray[6], "atgggattattgaatggcggaggaagatccggaataaaatatggcggaaagaacttgttttcggaaatggaaaaaggactaggaatcggcggcaggaaggatatggaggcg"); strcpy(dnaArray[7], "atggccgatcggcttaggctggaaggaacaaataggcggaattaaggaaggcgttctcgcttttcgacaaggaggcggaccataggaggcggattaggaacggttatgagg"); strcpy(dnaArray[8], "atggcggaaaaaggaaatgtttggcatcggcgggctccggcaactggaggttcggccatggaggcgaaaatcgtgggcggcggcagcgctggccggagtttgaggagcgcg"); strcpy(dnaArray[9], "tggccgcggaggggcccgtcgggcgcggatttctacaagggcttcctgttaaggaggtggcatccaggcgtcgcacgctcggcgcggcaggaggcacgcgggaaaaaacg"); strcpy(dnaArray[10], "gttagatttaacgttttttatggaatttatggaattataaatttaaaaatttatattttttaggtaagtaatccaacgtttttattactttttaaaattaaatatttatt"); strcpy(dnaArray[11], "gtttaattactttatcatttaatttaggttttaattttaaatttaatttaggtaagatgaatttggttttttttaaggtagttatttaattatcgttaaggaaagttaaa"); strcpy(dnaArray[12], "gtattacaggcagaccttatttaggttattattattatttggattttttttttttttttttttaagttaaccgaattattttctttaaagacgttacttaatgtcaatgc"); strcpy(dnaArray[13], "gttagtcttttttagattaaattattagattatgcagtttttttacataagaaaatttttttttcggagttcatattctaatctgtctttattaaatcttagagatatta"); strcpy(dnaArray[14], "gtattatatttttttatttttattattttagaatataatttgaggtatgtgtttaaaaaaaatttttttttttttttttttttttttttttttaaaatttataaatttaa"); strcpy(dnaArray[15], "gttatttttaaatttaattttaattttaaaatacaaaatttttactttctaaaattggtctctggatcgataatgtaaacttattgaatctatagaattacattattgat"); strcpy(dnaArray[16], "gtatgtctatttcacggaagaatgcaccactatatgatttgaaattatctatggctaaaaaccctcagtaaaatcaatccctaaacccttaaaaaacggcggcctatccc"); strcpy(dnaArray[17], "gttaattatttattccttacgggcaattaattatttattacggttttatttacaattttttttttttgtcctatagagaaattacttacaaaacgttattttacatactt"); strcpy(dnaArray[18], "gttacattatttattattatccgttatcgataattttttacctcttttttcgctgagtttttattcttactttttttcttctttatataggatctcatttaatatcttaa"); strcpy(dnaArray[19], "gtatttaactctctttactttttttttcactctctacattttcatcttctaaaactgtttgatttaaacttttgtttctttaaggattttttttacttatcctctgttat"); //判断各个密码子的值,以及归于哪一个氨基酸 for(i=0; i<64; i++) { Clover[i] = -1; } for(i=0; i<21; i++) { len = strlen(AminoAcids[i]); for(j=0; j<len; j+=3) { count = 0; ich = AminoAcids[i][j]; switch(ich) { case 97: count += DNA_AN; break; case 116: count += DNA_TN; break; case 99: count += DNA_CN; break; case 103: count += DNA_GN; break; } ich = AminoAcids[i][j+1]; switch(ich) { case 97: count += DNA_AN * 4; break; case 116: count += DNA_TN * 4; break; case 99: count += DNA_CN * 4; break; case 103: count += DNA_GN * 4; break; } ich = AminoAcids[i][j+2]; switch(ich) { case 97: count += DNA_AN * 16; break; case 116: count += DNA_TN * 16; break; case 99: count += DNA_CN * 16; break; case 103: count += DNA_GN * 16; break; } Clover[count] = i; } } for(i=0; i<64; i++) { //printf("%d\n", Clover[i]); } //提取20*3组频率信息 for(i=0; i<60; i++) { ClovNum[i] = 0; for(j=0; j<22; j++) { num[i][j] = 0; percent[i][j] = 0; } } for(i=0; i<20; i++) { length[i] = strlen(dnaArray[i]); for(k=0; k<3; k++) { for(j=k; j<length[i]; j+=3) { count = 0; ich = dnaArray[i][j-2]; switch(ich) { case 97: count += DNA_AN; break; case 116: count += DNA_TN; break; case 99: count += DNA_CN; break; case 103: count += DNA_GN; break; } ich = dnaArray[i][j-1]; switch(ich) { case 97: count += DNA_AN * 4; break; case 116: count += DNA_TN * 4; break; case 99: count += DNA_CN * 4; break; case 103: count += DNA_GN * 4; break; } ich = dnaArray[i][j]; switch(ich) { case 97: count += DNA_AN * 16; break; case 116: count += DNA_TN * 16; break; case 99: count += DNA_CN * 16; break; case 103: count += DNA_GN * 16; break; } num[3*i+k][Clover[count]] ++; ClovNum[3*i+k]++; if(dnaArray[i][j-2] == 'a' || dnaArray[i][j-2] == 't') { num[3*i+k][21] ++; } if(dnaArray[i][j-1] == 'a' || dnaArray[i][j-1] == 't') { num[3*i+k][21] ++; } if(dnaArray[i][j] == 'a' || dnaArray[i][j] == 't') { num[3*i+k][21] ++; } } } } //求百分比 for(i=0; i<60; i++) { for(j=0; j<21; j++) { percent[i][j] = num[i][j] * 1.0 / ClovNum[i]; } percent[i][21] = num[i][21] * 1.0 / length[i/3]; } file = fopen("FisherNum.txt", "w"); for(i=0; i<60; i++) { fprintf(file, "%d ", ClovNum[i]); for(j=0; j<22; j++) { fprintf(file, "%d ", num[i][j]); } fprintf(file, "\n"); } fclose(file); file = fopen("FisherPercent.txt", "w"); for(i=0; i<60; i++) { for(j=0; j<22; j++) { fprintf(file, "%f ", percent[i][j]); } fprintf(file, "\n"); } fclose(file); //确定代表A、B两类的氨基酸序列 for(i=0; i<20; i++) { sel[i] = -1; } for(i=0; i<10; i++) { for(k=0; k<3; k++) { disMin[k] = 60000; for(j=0; j<30; j++) { if(i == j/3) continue; dis = 0; for(p=0; p<22; p++) { dis += (percent[3*i+k][p]-percent[j][p]) * (percent[3*i+k][p]-percent[j][p]); } if(disMin[k] > dis) disMin[k] = dis; } } dis = 60000; for(k=0; k<3; k++) { if(disMin[k] < dis) { dis = disMin[k]; sel[i] = 3*i+k; } } } for(i=10; i<20; i++) { for(k=0; k<3; k++) { disMin[k] = 60000; for(j=30; j<60; j++) { if(i == j/3) continue; dis = 0; for(p=0; p<22; p++) { dis += (percent[3*i+k][p]-percent[j][p]) * (percent[3*i+k][p]-percent[j][p]); } if(disMin[k] > dis) disMin[k] = dis; } } dis = 60000; for(k=0; k<3; k++) { if(disMin[k] < dis) { dis = disMin[k]; sel[i] = 3*i+k; } } } for(i=0; i<20; i++) { printf("%d ", sel[i]); } printf("\n"); //求取特征向量 for(i=0; i<22; i++) { JD[i] = 0; } for(i=0; i<10; i++) { for(j=10; j<20; j++) { for(p=0; p<22; p++) { JD[p] += sqrt((percent[sel[i]][p]-percent[sel[j]][p]) * (percent[sel[i]][p]-percent[sel[j]][p])); } } } for(i=0; i<22; i++) { printf("%f ", JD[i]); dSite[i] = i; } printf("\n"); //对JD数组进行排序,求最大的8项 for(i=0; i<21; i++) for(j=21; j>=i+1; j--) if(JD[j] > JD[j-1]) { temp = JD[j]; JD[j] = JD[j-1]; JD[j-1] = temp; tempInt = dSite[j]; dSite[j] = dSite[j-1]; dSite[j-1] = tempInt; }; //提取前8个特征位置 for(i=0; i<8; i++) { featureSel[i] = dSite[i]; printf("%d ", dSite[i]); } printf("\n"); //将8个特征向量数据输入到txt中 file = fopen("feature.txt", "w"); for(i=0; i<10; i++) { fprintf(file, "%2d %2d ", i, 1); for(j=0; j<8; j++) { fprintf(file, "%f ", percent[dSite[i]][featureSel[j]]); } fprintf(file, "\n"); } fprintf(file, "\n"); for(i=10; i<20; i++) { fprintf(file, "%2d %2d ", i, -1); for(j=0; j<8; j++) { fprintf(file, "%f ", percent[dSite[i]][featureSel[j]]); } fprintf(file, "\n"); } fclose(file); //Fisher线性判别法 //计算样本均值向量 for(i=0; i<2; i++) { for(j=0; j<8; j++) evgM[i][j] = 0; } for(i=0; i<10; i++) { for(j=0; j<8; j++) { evgM[0][j] += percent[dSite[i]][featureSel[j]]; } } for(i=10; i<20; i++) { for(j=0; j<8; j++) { evgM[1][j] += percent[dSite[i]][featureSel[j]]; } } for(i=0; i<2; i++) { for(j=0; j<8; j++) { evgM[i][j] /= 10; printf("%f %f\n", evgM[0][j], evgM[1][j]); } } //计算样本类内离散度矩阵和总类内离散度矩阵 for(i=0; i<10; i++) { for(j=0; j<8; j++) { perMinusA[i][j] = percent[dSite[i]][featureSel[j]] - evgM[0][j]; } } for(i=10; i<20; i++) { for(j=0; j<8; j++) { perMinusB[i-10][j] = percent[dSite[i]][featureSel[j]] - evgM[1][j]; } } for(i=0; i<8; i++) { for(j=0; j<8; j++) { stanA[i][j] = 0; for(p=0; p<10; p++) stanA[i][j] += perMinusA[p][i] * perMinusA[p][j]; } } for(i=0; i<8; i++) { for(j=0; j<8; j++) { stanB[i][j] = 0; for(p=0; p<10; p++) stanB[i][j] += perMinusB[p][i] * perMinusB[p][j]; } } for(i=0; i<8; i++) { for(j=0; j<8; j++) { stanZZ[i][j] = stanA[i][j] + stanB[i][j]; } } file = fopen("stanZZ.txt", "w"); for(i=0; i<8; i++) { for(j=0; j<8; j++) { fprintf(file, "%f ", stanZZ[i][j]); } fprintf(file, "\n"); } fclose(file); //读取stanZZ矩阵的逆矩阵 file = fopen("stanInv.txt", "r"); if(file == NULL) printf("Wrong!\n"); for(i=0; i<8; i++) { for(j=0; j<8; j++) { fscanf(file, "%f", &stanInv[i][j]); printf("%f ", stanInv[i][j]); } printf("\n"); } fclose(file); //求判别函数之参数w for(i=0; i<8; i++) { evgMinus[i] = evgM[0][i] - evgM[1][i]; } printf("判别函数参数:\n"); for(i=0; i<8; i++) { omega[i] = 0; for(j=0; j<8; j++) { omega[i] += stanInv[i][j] * evgMinus[j]; } printf("%f\n", omega[i]); } //求门限y //d维映射到1维 for(j=0; j<20; j++) percentY[j] = 0; for(i=0; i<10; i++) { for(j=0; j<8; j++) { percentY[i] += percent[dSite[i]][featureSel[j]] * omega[j]; } } for(i=10; i<20; i++) { for(j=0; j<8; j++) { percentY[i] += percent[dSite[i]][featureSel[j]] * omega[j]; } } //求某一类的样本均值 tildeM[0] = tildeM[1] = 0; for(i=0; i<10; i++) { tildeM[0] += percentY[i]; } for(i=10; i<20; i++) { tildeM[1] += percentY[i]; } tildeM[0] /= 10; tildeM[1] /= 10; thresY = (tildeM[0] + tildeM[1])/2; printf("门限Y = %f\n", thresY); file = fopen("InfoFun.txt", "w"); for(i=0; i<8; i++) { fprintf(file, "%f ", omega[i]); } fprintf(file, "\n%f", thresY); fclose(file); for(i=0; i<20; i++) { testYY[i] = 0; for(j=0; j<8; j++) { testYY[i] += percent[dSite[i]][featureSel[j]] * omega[j]; } if(testYY[i] > thresY) printf("1\n"); else printf("0\n"); } file = fopen("Eight.txt", "w"); for(i=0; i<20; i++) { for(j=0; j<8; j++) { fprintf(file, "%f ", percent[dSite[i]][featureSel[j]]); } fprintf(file, "\n"); } fclose(file); return 0; }