T-test处理cafe的结果

记录一下,主要是写了一个t检验的部分,在整个代码的后半部分,前面主要是为了处理cafe文件,生成csv格式的文件,方便pandas读取
生成的文件形如下方:

GFID    <0>gN   <2>gN   <4>gN   <6>gN   <5>gN   <3>gN   <8>gN   <10>gN  <12>gN  <14>gN  <16>g
2       82      33      33      32      32      31      26      26      25      24      25
4       5       14      7       10      9       10      3       26      11      6       5
7       1       5       0       2       2       3       3       3       2       0       0
8       0       1       12      0       6       6       35      47      0       1       40
9       12      18      7       16      12      12      15      9       4       16      18
10      5       2       1       8       6       6       7       6       5       5       6
11      4       0       5       3       4       4       3       6       8       12      7
12      13      17      6       7       8       11      12      10      13      7       6
13      0       4       7       4       6       6       13      12      12      15      14
14      3       8       7       6       7       7       5       11      6       12      10
15      1       7       1       3       3       6       21      17      9       23      23
16      0       4       5       3       5       5       13      3       11      7       3
17      0       1       1       10      2       2       0       11      7       5       1
18      0       14      6       2       6       7       16      2       10      12      6
19      1       1       0       0       0       1       1       3       2       2       1
20      10      8       7       6       7       7       7       7       7       7       7
21      1       8       6       9       7       7       9       7       0       7       4
:

这是代码部分

import sys, re

###!!! Parse the CAFE report. Header lines (before the "'ID'" row) describe
### the tree/node layout; each later line holds one gene family's results.
f1 = open(sys.argv[1], "r")  # input: report_run.cafe
fw = open(sys.argv[2], "w")  # output: tab-separated result table
flag = 0  # 0 while reading header lines, 1 once the data section starts
trenumlist, treelist, tplist, nplist = [], [], [], []
for i in f1:
	if flag == 0:
		if re.match("'ID'", i):
			# The column-header row marks the start of per-family data.
			flag = 1
			continue
		elif re.match("# IDs of nodes:", i):
			nodetree = i.strip().split(":")[1]
			# All node labels in tree order, e.g. ["<0>", "<20>", ...]
			# (raw string avoids the invalid-escape warning on "\d").
			nodelist = re.findall(r"<\d+>", nodetree)
		elif re.match("# Output format for:", i):
			# e.g. "... : (0,2) (4,6) ..." -> idlist = ["0","2","4","6",...]
			idstring = i.strip().split(":")[2].strip(" ")
			idstring = idstring.replace(")", "")
			idstring = idstring.replace("(", "")
			idstring = idstring.replace(" ", ",")
			idlist = idstring.split(",")
	else:
		# Data row: family id, family tree, family p-value, node p-values.
		wholeLine = i.strip().split()
		trenumlist.append(wholeLine[0])
		treelist.append(wholeLine[1])
		tplist.append(wholeLine[2])
		nplist.append(wholeLine[3])
###找到字符串所有"("和")"的位置
### Locate every occurrence of a pattern inside a string.
def findStringAllIndex(s, pattern):
	"""Return the start index of every match of *pattern* in *s*."""
	return [m.start() for m in re.finditer(pattern, s)]
#bracket_index = findStringAllIndex(nodetree,"\(|\)")
### 把树按照括号和,分割成列表((D19A_33:24.509,D1A_32:24.509)_32:17.5479) to [D19A_33:24.509,D1A_32:24.509,_32:17.5479]
def SplitTree(tree):
	"""Split a Newick-style tree string on parentheses and commas.

	Example: "((D19A_33:24.509,D1A_32:24.509)_32:17.5479)"
	      -> ["D19A_33:24.509", "D1A_32:24.509", "_32:17.5479"]
	"""
	# Fold every bracket into a comma, then drop the empty fragments.
	normalized = tree.replace(")", "(").replace("(", ",")
	return [piece for piece in normalized.split(",") if piece != ""]
strain_index = SplitTree(nodetree)
# Map each strain name to its node label, e.g. {"D20A": "<0>", "D15A": "<2>"}.
strain_node_dict = {}
for entry in strain_index:
	# Leaf entries look like "D20A<0>"; bare internal nodes like "<5>" are skipped.
	if re.match(r"D\d+A<\d+>", entry):
		name, _, node_id = entry.partition("<")
		strain_node_dict[name] = "<" + node_id

### This part processes the tree data, producing node-labelled trees.
def addnodes(tree1, nodetree):
	"""Copy CAFE node labels (e.g. "<3>") from *nodetree* into *tree1*.

	tree1:    a family tree string whose internal nodes end in ")_" and whose
	          leaves end in "A_" (strain names all end with "A").
	nodetree: the reference tree carrying the "<id>" labels in the same
	          left-to-right order.
	Returns *tree1* with every ")" and every leaf suffixed by its label.
	"""
	# Label internal nodes: the k-th ")_" receives the k-th ")<id>" label.
	# (The original also computed an unused `strains` list here.)
	for label in re.findall(r"\)<\d+>", nodetree):
		tree1 = tree1.replace(")_", label + "_", 1)
	# Label leaves: the k-th "A_" receives the k-th "A<id>" label.
	for label in re.findall(r"A<\d+>", nodetree):
		tree1 = tree1.replace("A_", label + "_", 1)
	return tree1

def Get_node_gene_num_dick(tree):
	"""Build {node_label: gene_count} for one CAFE family tree string.

	The raw tree lacks a branch length on the root node, so ":0" is
	appended first to make it a standard Newick string; the node labels
	are then copied in from the module-level *nodetree*.
	"""
	labeled = addnodes(tree + ":0", nodetree)
	counts = {}
	for piece in SplitTree(labeled):
		# Each piece looks like "D19A<4>_33:24.509" -> label "<4>", count "33".
		node_id = re.search(r"<\d+>", piece).group(0)
		counts[node_id] = piece.split("_")[1].split(":")[0]
	return counts


#### 这部分是处理节点p值数据的内容,按照csv格式输出对应节点的p值信息
#for nodePvalue in nplist:
def Get_node_Pvalue_dick(node_pvalue_string):
	"""Build {node_label: p_value} from one CAFE node-p-value string.

	Pairs the p-values (in file order) with the module-level *idlist*
	of node ids parsed from the report header.
	"""
	tokens = (node_pvalue_string.replace("(", " ")
	                            .replace(")", " ")
	                            .replace(",", " ")
	                            .split())
	return {"<" + node_id + ">": pv for node_id, pv in zip(idlist, tokens)}


### Write the results out as a tab-separated table.
### Header row: GFID, then one gene-count column and one p-value column per node.
fw.write("GFID")
for node in nodelist:
	fw.write("\t" + node + "gN")
for node in nodelist:
	fw.write("\t" + node + "Pv")
fw.write("\n")
# Header row done.

# One row per gene family: family id, gene counts per node, p-values per node.
# zip walks the four parallel lists filled while parsing the report.
for gfid, tree, family_p, node_p in zip(trenumlist, treelist, tplist, nplist):
	fw.write(gfid)  # gene family id
	node_gene_num_dick = Get_node_gene_num_dick(tree)  # {node: gene count}
	node_Pvalue_dick = Get_node_Pvalue_dick(node_p)    # {node: p-value}
	for node in nodelist:
		fw.write("\t" + node_gene_num_dick[node])
	for node in nodelist:
		if node != '<1>':
			fw.write("\t" + node_Pvalue_dick[node])
		else:
			# Node <1> is the last/root node; CAFE reports no per-node p-value
			# for it, so substitute the whole-family p-value. If your tree's
			# final node has a different id, adjust the "<1>" literal above.
			fw.write("\t" + family_p)
	fw.write("\n")
f1.close()
fw.close()
#### Done: the CAFE result file is now flattened into a CSV/TSV table.



### T-test section: compare per-strain gene counts between two lineages.
blacklist = ["D1A","D5A","D8A","D13A","D14A","D15A","D19A","D23A"]  # "black" lineage strains
yellowlist = ["D2A","D3A","D4A","D7A","D9A","D10A","D16A","D17A","D18A","D24A","D29A"]  # "yellow" lineage strains
import pandas as pd
from scipy import stats

# Pass the path so pandas manages the file handle itself (the original
# opened an explicit handle that was never closed).
fcsv = pd.read_csv(sys.argv[2], sep="\t")
# strain_node_dict maps strain -> node label; used to build column names.
blackBig, yellowBig = [], []
for i in range(fcsv.shape[0]):
	# Gene counts for every strain of each lineage in this family.
	# (The "...Pv" p-value columns exist too but are not used by the test.)
	blackNum = [fcsv.loc[i, strain_node_dict[s] + "gN"] for s in blacklist]
	yellowNum = [fcsv.loc[i, strain_node_dict[s] + "gN"] for s in yellowlist]
	# Levene's test checks equality of variances: pvalue > 0.05 means we may
	# assume equal variances; otherwise use Welch's t-test (equal_var=False).
	levene = stats.levene(blackNum, yellowNum)
	t, p = stats.ttest_ind(blackNum, yellowNum, equal_var=levene.pvalue > 0.05)
	if p < 0.05:  # significant difference between the two lineages
		if t < 0:  # first group (black) mean is the smaller one
			yellowBig.append(fcsv.loc[i, "GFID"])
		else:
			blackBig.append(fcsv.loc[i, "GFID"])

print(blackBig)
print(yellowBig)
print(len(blackBig))
print(len(yellowBig))


你可能感兴趣的:(Python,生物信息学,python,开发语言)