pan-genome analysis sample code

This is a C++ demo for pan-genome analysis, by bbsunchen:

/*
start:2012/06/11 by sunchen
amend:
	1.2012/06/12 by sunchen
	construct an array of 2^n
	calculate
	2.2012/06/12 by sunchen
	introduce multithread model
	3.2012/06/18 by sunchen
	change multithread model
	complete pangenome calculation
	4.2012/06/19 by sunchen
	complete newgene calculation
	mission completed
	
*/

#include <pthread.h>
#include <time.h>

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
using namespace std;



// Work item handed to one worker thread: an inclusive range of
// genome-combination codes plus the slot the worker reports into.
struct NumRange
{
    int index;//worker slot; used as the index into the TEMP and pN arrays
    long long startNum;//first combination code to process (inclusive)
    long long endNum;//last combination code to process (inclusive)
};
// Per-worker accumulator, bucketed by how many genomes a combination contains.
// Slot 0 of each array is unused by the loops that read the totals.
struct panGenomeNum
{
    int SampleNum[101];//SampleNum[k]: number of processed combinations made of k genomes (k starts from 1)
    long long panNum[101];//panNum[k]: sum of the pan-genome sizes of those combinations (k starts from 1)
};
//##################public data###################################
// Powers of two: refdig[k] == 2^k for k = 0..62 (63 entries).
// Bit k of a combination code stands for genome column k, so every entry
// must be an exact power of two; a single wrong digit makes the decoding
// of combinations silently wrong.
long long refdig[]=
{
    //n=0
    1LL,2LL,4LL,8LL,
    //n=4
    16LL,32LL,64LL,128LL,
    //n=8
    256LL,512LL,1024LL,2048LL,
    //n=12
    4096LL,8192LL,16384LL,32768LL,
    //n=16
    65536LL,131072LL,262144LL,524288LL,
    //n=20
    1048576LL,2097152LL,4194304LL,8388608LL,
    //n=24
    16777216LL,33554432LL,67108864LL,134217728LL,
    //n=28
    268435456LL,536870912LL,1073741824LL,2147483648LL,
    //n=32
    4294967296LL,8589934592LL,17179869184LL,34359738368LL,
    //n=36
    68719476736LL,137438953472LL,274877906944LL,549755813888LL,
    //n=40
    1099511627776LL,2199023255552LL,4398046511104LL,8796093022208LL,
    //n=44
    17592186044416LL,35184372088832LL,70368744177664LL,140737488355328LL,
    //n=48
    281474976710656LL,562949953421312LL,1125899906842624LL,2251799813685248LL,
    //n=52
    4503599627370496LL,9007199254740992LL,18014398509481984LL,36028797018963968LL,
    //n=56
    72057594037927936LL,144115188075855872LL,288230376151711744LL,576460752303423488LL,
    //n=60
    1152921504606846976LL,2305843009213693952LL,4611686018427387904LL
};

// Gene count per genome column (0-based index); readData() adds the number of
// comma-separated gene names found in every field that is not "-".
int genome_genesize[101] = {0};
// Presence/absence matrix filled by readData(): one row per cluster line, one
// entry per genome column; true when the field is not "-".
vector< vector<bool> > m;
int m_line_num = 0;// number of rows of m, set by readData()
int m_column_num = 0;// number of genome columns, set by readData() (also its return value)
//char clusterPath[] = "/home/sun/zhao/1.Orthologs_Cluster.txt";
// Path of the input cluster file; main() points it at argv[1].
char* clusterPath;
// Directory used for the temporary and result files; filled in by readData().
char tempPath[1001] = "";
panGenomeNum pN[101];// per-worker statistics; the index is the worker slot (NumRange::index), starting from 0
ofstream TEMP[101];// per-worker temporary output streams, same indexing as pN
//#################################public data end###################################

// Decode a combination code into the genome column ids it contains.
//
// Bit k of genomeCombination (k = 0..62) stands for genome column k.
// Returns the ids of all set bits, highest id first. A zero or negative
// code yields an empty vector.
//
// The bits are tested directly instead of being subtracted against a
// lookup table, so the result cannot be affected by a wrong table entry.
vector<int> whichGenome(long long genomeCombination)
{
    vector<int> genomeIdVector;
    if(genomeCombination <= 0)
    {
        return genomeIdVector;
    }
    for(int k = 62; k >= 0; k--)
    {
        if((genomeCombination >> k) & 1LL)
        {
            genomeIdVector.push_back(k);
        }
    }
    return genomeIdVector;
}
// Return the combination code with the lowest genomeSize bits set, i.e.
// 2^genomeSize - 1, which is the largest code that has to be enumerated
// for genomeSize genomes.
//
// genomeSize <= 0 yields 0. Values above 63 are clamped to 63 (the widest
// mask a signed 64-bit code can hold) instead of reading past the end of a
// lookup table.
long long genomeNum2LongLong(int genomeSize)
{
    if(genomeSize <= 0)
    {
        return 0;
    }
    if(genomeSize > 63)
    {
        genomeSize = 63;
    }
    // Computed in unsigned arithmetic so that the 63-bit case cannot overflow.
    return static_cast<long long>((1ULL << genomeSize) - 1ULL);
}

// Build the path of the temporary file for worker `index`:
//   <directory of clusterPath>/<index>_temp.dat
// When clusterPath contains no '/', the current directory "." is used.
//
// The directory is also printed to stdout (kept from the original code).
//
// Returns a pointer to a function-local static buffer. The pointer stays
// valid until the next call; the function is therefore not safe to call
// from several threads at once.
char* getTempFilePath(int index)
{
    // Work on a copy so the global clusterPath is left intact (tokenising it
    // in place with strtok() would cut it at the first '/').
    const string inputPath = (clusterPath != NULL) ? clusterPath : "";
    const string::size_type lastSlash = inputPath.find_last_of('/');
    const string directory =
        (lastSlash == string::npos) ? string(".") : inputPath.substr(0, lastSlash);

    stringstream stream;
    stream << directory << "/" << index << "_temp.dat";
    cout << directory << endl;

    // str() keeps the whole path; extracting with >> would stop at the first
    // whitespace character. The static string outlives the call, so the
    // returned pointer does not dangle.
    static string pathStorage;
    pathStorage = stream.str();
    return const_cast<char*>(pathStorage.c_str());
}
void* writeDataByThread(void* arg)
{
    NumRange *p;
    p = (NumRange*)arg;
    //#########transvert parameters
    
    //#########processing data
    stringstream stream;
    string s;
    stream << tempPath << "/" << p->index << "_temp.dat";
    stream >> s;
    char* filepath=const_cast<char*>(s.c_str());
    //cout << filepath << endl;
    //######################getpath
    TEMP[p->index].open(filepath);
    if(!TEMP[p->index].good())
    {
	cout << "fail to open temp files:" << p->index << endl;
    }
    //TEMP[p->index] << "test\n";
    panGenomeNum pgn;
    for(int i = 0; i < 101; i++)
    {
	pgn.SampleNum[i] = 0;
	pgn.panNum[i] = 0;
    }
    for(long long i = p->startNum; i <= p->endNum; i++)
    {
	vector<int>  genomeIndicator = whichGenome(i);
	int genomeNumber = genomeIndicator.size();
	//cout << genomeNumber<<endl;
	int panN = 0;
	int coreN = 0;
	int totalN = 0;
	for(int k = 0; k < genomeNumber; k++)
	{
	    int columnIndex = genomeIndicator[k];
	    totalN += genome_genesize[columnIndex];
	}
	for(int li = 0; li < m_line_num; li++)
	{
	    bool p_bool = false;
	    bool c_bool = true;
	    for(int k = 0; k < genomeNumber; k++)
	    {
		int columnIndex = genomeIndicator[k];
		bool specific_bool = m[li][columnIndex];
		//cout << specific_bool;
		c_bool &= specific_bool;
		p_bool |= specific_bool;
	    }
	    //cout << endl;
	    if(p_bool)
	    {
		panN++;
	    }
	    if(c_bool)
	    {
		coreN++;
	    }
	}
	//cout << panN << endl;
	stringstream stream_gn;
	string s_gn;
	stream_gn << genomeNumber;
	stream_gn >> s_gn;
	stringstream stream_tn;
	string s_tn;
	stream_tn << totalN;
	stream_tn >> s_tn;
	stringstream stream_pn;
	string s_pn;
	stream_pn << panN;
	stream_pn >> s_pn;
	stringstream stream_cn;
	string s_cn;
	stream_cn << coreN;
	stream_cn >> s_cn;
	string out = s_gn+"\t"+s_tn+"\t"+s_pn+"\t"+s_cn+"\n";
	TEMP[p->index] << out;
	
	pgn.SampleNum[genomeNumber] ++;//the index is genomeNumber, start from 1
	pgn.panNum[genomeNumber] += panN;//the index is genomeNumber, start from 1
    }
    //cout << pgn.panNum[1] << endl;
    pN[p->index] = pgn;
    //cout << p->index << endl;
    //cout << pN[p->index].panNum[2] << endl;
    
    TEMP[p->index].close();
    //#########exit
    pthread_exit((void*)0);
    
}

int readData()
{
    ifstream CLUSTER;
    CLUSTER.open(clusterPath);
    if(!CLUSTER.good())  
    {  
        cout << "ERROR: illegal input file path: " << clusterPath <<endl;  
        cout <<  
        "Input format:\n" <<  
        "program_name  \n";  
        exit(0);  
    }


    char* genome_name[101];//id start from 0

    int e_num = 0;//e_num equals to the num of char*
    //which means that e_num-1 equals to the number of genome
    //i.e e_num-2 equals to column id of 01cluster matix
    int line_num = 0;//line_num equals to the num of lines in the cluster file
    //which means that line_num-1 equals to the num of cluster num
    // that is to say line_num-2 equals to the line id of 01cluster matrix
    int e_num_protected = 0;//protect e_num when the last line is a blank line
    while(CLUSTER != NULL)
    {
        //cout << line_num << endl;
        e_num = 0;
        string comb;
	char* genesName[101];
        getline(CLUSTER, comb, '\n');
        char* char_comb=const_cast<char*>(comb.c_str());//const char* to char*
        char* temp_num = strtok(char_comb,"\t");//split string

        while(temp_num != NULL)
        {
            genesName[e_num] = temp_num;
            temp_num = strtok(NULL,"\t");
            e_num++;
        }
        if(e_num == 0)
        {
            break;
        }
	
        vector<bool> m_line;
        if(line_num == 0)
        {
            for(int i = 1; i <=e_num; i++)
            {
                genome_name[i-1] = genesName[e_num];
                // so the size of genome_num = e_num;
            }
	    e_num_protected = e_num;
        }else
        {
            for(int i = 1;i < e_num; i++)
            {
                if(strcmp(genesName[i], "-") == 0)//if equal, return 0
                {
                    //cout << num[i] << endl;
                    m_line.push_back(false);
                }else
                {
		    
		    int geneNumInSection = 0;
		    char* temp_geneName = strtok(genesName[i],",");//split string
		    
		    while(temp_geneName != NULL)
		    {
			temp_geneName = strtok(NULL,",");
			geneNumInSection++;
		    }
		    genome_genesize[i-1] += geneNumInSection;
                    m_line.push_back(true);
                }
            }
        }
        if(line_num == 0)
        {
            line_num++;
        }else
        {
            m.push_back(m_line);
            line_num++;
        }
    }
    CLUSTER.close();
    //true&false matrix, (line_num-1)*(e_num-1)

    m_line_num = line_num - 1;
    m_column_num = e_num_protected - 1;
    
    char* pathSegment[50];
    char* temp_num_2 = strtok(clusterPath,"/");//split string
    int e_num_2 = 0;
    while(temp_num_2 != NULL)
    {
        pathSegment[e_num_2] = temp_num_2;
        temp_num_2 = strtok(NULL,"/");
        e_num_2++;
    }
    for(int i = 0; i < e_num_2 -1 ; i++)
    {
        strcat(tempPath, "/");
        strcat(tempPath, pathSegment[i]);
    }
    
    return m_column_num;
}
int main(int argc,char *argv[])
{
    if(argc != 3)  
    {  
        cout << "ERROR: illegal argument number: " << argc << endl;  
        cout <<
	"Input format:\n" <<
	"program_name inputfile threadNum\n" <<
	"i.e.\n" <<
	"./main sun.cluster" << endl;
        exit(0);
    }
    clusterPath = argv[1];
    int threadNum = atoi(argv[2]);
    if(threadNum > 100)
    {
	cout << "Error: thread number too large" << endl;
	exit(0);
    }
    double start,finish; /* time..*/
    start=(double)clock(); /* 我的time.h内没有CLOCKS_PER_SEC */
    
    int genomeSize = readData();
    //cout << genomeSize << endl;
    long long totalLong = genomeNum2LongLong(genomeSize);
    long long span = totalLong / threadNum;
    long long last_end = 0;
    pthread_t threadId[101];//thread id strat from 0    
    NumRange p[101];
    for(int i = 0; i < threadNum; i++)
    {
	long long start = last_end + 1;
        long long end = start + span;
        if(end > totalLong)
        {
            end = totalLong;
        }
        last_end = end;
        p[i].index = i;
        p[i].startNum = start;
        p[i].endNum = end;
    }
    for(int i = 0; i < threadNum; i++)
    {
	//cout << "strat:" << p[i].startNum << "~~" << p[i].endNum << endl;
        //pthread_t id;
        //threadId.push_back(id);
        int ret = pthread_create(&threadId[i],NULL,writeDataByThread, (void*)&p[i]);
	//pthread_join(threadId[i], NULL);
	sleep(0.01);//important so that the CPU will have time to run pthread_create completely
	//in the insprion1420, it costs at least 0.001 second to transport parameters
	//pthread_join(id, NULL);
    }
    for(int i = 0; i < threadNum; i++)
    {
        pthread_join(threadId[i], NULL);
    }
    
    stringstream stream_path;
    string s_path;
    stream_path << tempPath << "/panGenome.txt";
    stream_path >> s_path;
    char* panPath=const_cast<char*>(s_path.c_str());
    //cout << filepath << endl;
    //######################getpath
    ofstream PANGENOME;
    PANGENOME.open(panPath);
    if(!PANGENOME.good())
    {
	cout << "open panGenome file failed!" << endl;
	exit (0);
    }
    PANGENOME << "ClusterConservation" << "\t" << "TotalGeneNumber" << "\t" << "PanGenome" << "\t" << "CoreGenome" << endl;

    for(int i = 0; i < threadNum; i++)
    {
	stringstream stream;
	string s;
	stream << tempPath << "/" << i << "_temp.dat";
	stream >> s;
	char* filepath=const_cast<char*>(s.c_str());
	//cout << filepath << endl;
	//######################getpath
	ifstream TEMPFILE;
	TEMPFILE.open(filepath);
	if(!TEMPFILE.good())
	{
	    cout << "fail to open temp files:" << p->index << endl;
	}
	while(TEMPFILE != NULL)  
	{  
	    string matr;  
	    getline(TEMPFILE,matr);  
	    //cout << matr.length() << endl;
	    if(matr != "\n" && matr != "")
	    {
		PANGENOME << matr << endl;
	    }
	}  
    }
    PANGENOME.close();
    long long panTotal[101] = {0};//index stands for genome number, start from 1;
    int SampleTotal[101] = {0};//index stands for genome number, start from 1;
    for(int i = 0; i < threadNum; i++)
    {
	for(int k = 1; k <= genomeSize; k++)
	{
	    //panGenomeNum temp_pgn = pN[threadNum];
	    panTotal[k] += pN[i].panNum[k];
	    //cout <<"thread:" << i<< " genome:"<< k << "   "<<pN[i].panNum[k] << endl;
	    SampleTotal[k] += pN[i].SampleNum[k];
	}
    }
    
    for(int k = 1; k <= genomeSize; k++)
    {
	//cout << SampleTotal[k] << endl;
	panTotal[k] /= SampleTotal[k];
    }
    //cout << "right" << endl;
    stringstream stream_new;
    string s_new;
    stream_new << tempPath << "/newGene.txt";
    stream_new >> s_new;
    char* newPath=const_cast<char*>(s_new.c_str());
    //cout << filepath << endl;
    //######################getpath
    ofstream NEWGENE;
    NEWGENE.open(newPath);
    if(!NEWGENE.good())
    {
	cout << "open panGenome file failed!" << endl;
	exit (0);
    }
    NEWGENE << "GenomeNumber" << "\t" << "NewGene" << endl;
    for(int k = 2; k <=genomeSize; k++)
    {
	NEWGENE << k << "\t" << panTotal[k] - panTotal[k-1] << endl;
    }
    NEWGENE.close();
    

    for(int i = 0; i < threadNum; i++)
    {
	stringstream stream_delete;
	string s_delete;
	stream_delete << tempPath << "/" << i << "_temp.dat";
	stream_delete >> s_delete;
	char commend[1001] = "rm ";
	char* deletepath=const_cast<char*>(s_delete.c_str());
	strcat(commend, deletepath);
	system(commend);
    }
    finish=(double)clock();
    //cout << (finish - start)/CLOCKS_PER_SEC << endl;
    return 0;
}
 

你可能感兴趣的:(sample)