This is a C++ demo for pan-genome analysis, by bbsunchen:
/* start:2012/06/11 by sunchen amend: 1.2012/06/12 by sunchen construct a array of 2^n conculate 2.2112/06/12 by sunchen introduce multithread model 3.2012/06/18 by sunchen change multithread model complete pangenome calculation 4.2012/06/19 by sunchen complete newgene calculation mission completed */ #include <iostream> #include <fstream> #include <cstring> #include <cstdlib> #include <vector> #include <time.h> #include <pthread.h> #include <sstream> using namespace std; struct NumRange { int index; long long startNum; long long endNum; }; struct panGenomeNum { int SampleNum[101];//the index is genomeNumber, start from 1 long long panNum[101];//the index is genomeNumber, start from 1 }; //##################public data################################### long long refdig[]= { //n=0 1,2,4,8, //n=4 16,32,64,128, //n=8 256,512,1024,2048, //n=12 4096,8192,16384,32768, //n=16 65536,131072,262144,524288, //n=20 1048576,2097154,4194304,8388608, //n=24 16777216,33554432,67108864,134217728, //n=28 268435456,536870912,1073741824,2147483648, //n=32 4294967296,8589934592,17179869184,34359738368, //n=36 68719476736,137438953472,274877906944,549755813888, //n=40 1099511627776,2199023255552,4398046511104,8796093022208, //n=44 17592186044416,35184372088832,70368744177664,140737488355328, //n=48 281474976710656,562949953421312,1125899906842624,2251799813685248, //n=52 4503599627370496,9007199254740992,18014398509481984,36028797018363968, //n=56 72057594036727936,144115188073455872,228230376146911744,576460752293823488, //n=60 1152921504587646976,2305843009175293952,4611686018350587904 }; int genome_genesize[101] = {0};//record gene num of specific genome, start from 0 vector< vector<bool> > m;//matrix int m_line_num = 0; int m_column_num = 0; //char clusterPath[] = "/home/sun/zhao/1.Orthologs_Cluster.txt"; char* clusterPath; char tempPath[1001] = ""; panGenomeNum pN[101];//the index is threadId, start from 0 ofstream TEMP[101]; //#################################public data end################################### //convert long_long number to "01"string to find out which is 1 vector<int> whichGenome(long long genomeCombination) { vector<int> genomeIdVector; for(int k = 62; k >= 0; k--) { if(genomeCombination >= refdig[k]) { genomeCombination -= refdig[k]; genomeIdVector.push_back(k); } if(genomeCombination == 0) { break; } } return genomeIdVector; } long long genomeNum2LongLong(int genomeSize) { long long genomeIndicator = 0; for(int i = 0; i < genomeSize;i++) { genomeIndicator += refdig[i]; } return genomeIndicator; } char* getTempFilePath(int index) { char* pathSegment[50]; char* temp_num = strtok(clusterPath,"/");//split string int e_num = 0; while(temp_num != NULL) { pathSegment[e_num] = temp_num; temp_num = strtok(NULL,"/"); e_num++; } char tempPath[1001] = ""; for(int i = 0; i < e_num -1 ; i++) { strcat(tempPath, "/"); strcat(tempPath, pathSegment[i]); } stringstream stream; string s; stream << tempPath << "/" << index << "_temp.dat"; stream >> s; cout << tempPath << endl; //char *path =const_cast<char*>(s.c_str()); //get the path char* path=const_cast<char*>(s.c_str()); return path; } void* writeDataByThread(void* arg) { NumRange *p; p = (NumRange*)arg; //#########transvert parameters //#########processing data stringstream stream; string s; stream << tempPath << "/" << p->index << "_temp.dat"; stream >> s; char* filepath=const_cast<char*>(s.c_str()); //cout << filepath << endl; //######################getpath TEMP[p->index].open(filepath); if(!TEMP[p->index].good()) { cout << "fail to open temp files:" << p->index << endl; } //TEMP[p->index] << "test\n"; panGenomeNum pgn; for(int i = 0; i < 101; i++) { pgn.SampleNum[i] = 0; pgn.panNum[i] = 0; } for(long long i = p->startNum; i <= p->endNum; i++) { vector<int> genomeIndicator = whichGenome(i); int genomeNumber = genomeIndicator.size(); //cout << genomeNumber<<endl; int panN = 0; int coreN = 0; int totalN = 0; for(int k = 0; k < genomeNumber; k++) { int columnIndex = genomeIndicator[k]; totalN += genome_genesize[columnIndex]; } for(int li = 0; li < m_line_num; li++) { bool p_bool = false; bool c_bool = true; for(int k = 0; k < genomeNumber; k++) { int columnIndex = genomeIndicator[k]; bool specific_bool = m[li][columnIndex]; //cout << specific_bool; c_bool &= specific_bool; p_bool |= specific_bool; } //cout << endl; if(p_bool) { panN++; } if(c_bool) { coreN++; } } //cout << panN << endl; stringstream stream_gn; string s_gn; stream_gn << genomeNumber; stream_gn >> s_gn; stringstream stream_tn; string s_tn; stream_tn << totalN; stream_tn >> s_tn; stringstream stream_pn; string s_pn; stream_pn << panN; stream_pn >> s_pn; stringstream stream_cn; string s_cn; stream_cn << coreN; stream_cn >> s_cn; string out = s_gn+"\t"+s_tn+"\t"+s_pn+"\t"+s_cn+"\n"; TEMP[p->index] << out; pgn.SampleNum[genomeNumber] ++;//the index is genomeNumber, start from 1 pgn.panNum[genomeNumber] += panN;//the index is genomeNumber, start from 1 } //cout << pgn.panNum[1] << endl; pN[p->index] = pgn; //cout << p->index << endl; //cout << pN[p->index].panNum[2] << endl; TEMP[p->index].close(); //#########exit pthread_exit((void*)0); } int readData() { ifstream CLUSTER; CLUSTER.open(clusterPath); if(!CLUSTER.good()) { cout << "ERROR: illegal input file path: " << clusterPath <<endl; cout << "Input format:\n" << "program_name \n"; exit(0); } char* genome_name[101];//id start from 0 int e_num = 0;//e_num equals to the num of char* //which means that e_num-1 equals to the number of genome //i.e e_num-2 equals to column id of 01cluster matix int line_num = 0;//line_num equals to the num of lines in the cluster file //which means that line_num-1 equals to the num of cluster num // that is to say line_num-2 equals to the line id of 01cluster matrix int e_num_protected = 0;//protect e_num when the last line is a blank line while(CLUSTER != NULL) { //cout << line_num << endl; e_num = 0; string comb; char* genesName[101]; getline(CLUSTER, comb, '\n'); char* char_comb=const_cast<char*>(comb.c_str());//const char* to char* char* temp_num = strtok(char_comb,"\t");//split string while(temp_num != NULL) { genesName[e_num] = temp_num; temp_num = strtok(NULL,"\t"); e_num++; } if(e_num == 0) { break; } vector<bool> m_line; if(line_num == 0) { for(int i = 1; i <=e_num; i++) { genome_name[i-1] = genesName[e_num]; // so the size of genome_num = e_num; } e_num_protected = e_num; }else { for(int i = 1;i < e_num; i++) { if(strcmp(genesName[i], "-") == 0)//if equal, return 0 { //cout << num[i] << endl; m_line.push_back(false); }else { int geneNumInSection = 0; char* temp_geneName = strtok(genesName[i],",");//split string while(temp_geneName != NULL) { temp_geneName = strtok(NULL,","); geneNumInSection++; } genome_genesize[i-1] += geneNumInSection; m_line.push_back(true); } } } if(line_num == 0) { line_num++; }else { m.push_back(m_line); line_num++; } } CLUSTER.close(); //true&false matrix, (line_num-1)*(e_num-1) m_line_num = line_num - 1; m_column_num = e_num_protected - 1; char* pathSegment[50]; char* temp_num_2 = strtok(clusterPath,"/");//split string int e_num_2 = 0; while(temp_num_2 != NULL) { pathSegment[e_num_2] = temp_num_2; temp_num_2 = strtok(NULL,"/"); e_num_2++; } for(int i = 0; i < e_num_2 -1 ; i++) { strcat(tempPath, "/"); strcat(tempPath, pathSegment[i]); } return m_column_num; } int main(int argc,char *argv[]) { if(argc != 3) { cout << "ERROR: illegal argument number: " << argc << endl; cout << "Input format:\n" << "program_name inputfile threadNum\n" << "i.e.\n" << "./main sun.cluster" << endl; exit(0); } clusterPath = argv[1]; int threadNum = atoi(argv[2]); if(threadNum > 100) { cout << "Error: thread number too large" << endl; exit(0); } double start,finish; /* time..*/ start=(double)clock(); /* 我的time.h内没有CLOCKS_PER_SEC */ int genomeSize = readData(); //cout << genomeSize << endl; long long totalLong = genomeNum2LongLong(genomeSize); long long span = totalLong / threadNum; long long last_end = 0; pthread_t threadId[101];//thread id strat from 0 NumRange p[101]; for(int i = 0; i < threadNum; i++) { long long start = last_end + 1; long long end = start + span; if(end > totalLong) { end = totalLong; } last_end = end; p[i].index = i; p[i].startNum = start; p[i].endNum = end; } for(int i = 0; i < threadNum; i++) { //cout << "strat:" << p[i].startNum << "~~" << p[i].endNum << endl; //pthread_t id; //threadId.push_back(id); int ret = pthread_create(&threadId[i],NULL,writeDataByThread, (void*)&p[i]); //pthread_join(threadId[i], NULL); sleep(0.01);//important so that the CPU will have time to run pthread_create completely //in the insprion1420, it costs at least 0.001 second to transport parameters //pthread_join(id, NULL); } for(int i = 0; i < threadNum; i++) { pthread_join(threadId[i], NULL); } stringstream stream_path; string s_path; stream_path << tempPath << "/panGenome.txt"; stream_path >> s_path; char* panPath=const_cast<char*>(s_path.c_str()); //cout << filepath << endl; //######################getpath ofstream PANGENOME; PANGENOME.open(panPath); if(!PANGENOME.good()) { cout << "open panGenome file failed!" << endl; exit (0); } PANGENOME << "ClusterConservation" << "\t" << "TotalGeneNumber" << "\t" << "PanGenome" << "\t" << "CoreGenome" << endl; for(int i = 0; i < threadNum; i++) { stringstream stream; string s; stream << tempPath << "/" << i << "_temp.dat"; stream >> s; char* filepath=const_cast<char*>(s.c_str()); //cout << filepath << endl; //######################getpath ifstream TEMPFILE; TEMPFILE.open(filepath); if(!TEMPFILE.good()) { cout << "fail to open temp files:" << p->index << endl; } while(TEMPFILE != NULL) { string matr; getline(TEMPFILE,matr); //cout << matr.length() << endl; if(matr != "\n" && matr != "") { PANGENOME << matr << endl; } } } PANGENOME.close(); long long panTotal[101] = {0};//index stands for genome number, start from 1; int SampleTotal[101] = {0};//index stands for genome number, start from 1; for(int i = 0; i < threadNum; i++) { for(int k = 1; k <= genomeSize; k++) { //panGenomeNum temp_pgn = pN[threadNum]; panTotal[k] += pN[i].panNum[k]; //cout <<"thread:" << i<< " genome:"<< k << " "<<pN[i].panNum[k] << endl; SampleTotal[k] += pN[i].SampleNum[k]; } } for(int k = 1; k <= genomeSize; k++) { //cout << SampleTotal[k] << endl; panTotal[k] /= SampleTotal[k]; } //cout << "right" << endl; stringstream stream_new; string s_new; stream_new << tempPath << "/newGene.txt"; stream_new >> s_new; char* newPath=const_cast<char*>(s_new.c_str()); //cout << filepath << endl; //######################getpath ofstream NEWGENE; NEWGENE.open(newPath); if(!NEWGENE.good()) { cout << "open panGenome file failed!" << endl; exit (0); } NEWGENE << "GenomeNumber" << "\t" << "NewGene" << endl; for(int k = 2; k <=genomeSize; k++) { NEWGENE << k << "\t" << panTotal[k] - panTotal[k-1] << endl; } NEWGENE.close(); for(int i = 0; i < threadNum; i++) { stringstream stream_delete; string s_delete; stream_delete << tempPath << "/" << i << "_temp.dat"; stream_delete >> s_delete; char commend[1001] = "rm "; char* deletepath=const_cast<char*>(s_delete.c_str()); strcat(commend, deletepath); system(commend); } finish=(double)clock(); //cout << (finish - start)/CLOCKS_PER_SEC << endl; return 0; }