记录一下:脚本的前半部分用于处理 CAFE 的结果文件,生成 csv 格式(以制表符分隔)的文件,方便 pandas 读取;后半部分是这次主要新写的 t 检验。
生成的文件格式如下:
GFID <0>gN <2>gN <4>gN <6>gN <5>gN <3>gN <8>gN <10>gN <12>gN <14>gN <16>gN
2 82 33 33 32 32 31 26 26 25 24 25
4 5 14 7 10 9 10 3 26 11 6 5
7 1 5 0 2 2 3 3 3 2 0 0
8 0 1 12 0 6 6 35 47 0 1 40
9 12 18 7 16 12 12 15 9 4 16 18
10 5 2 1 8 6 6 7 6 5 5 6
11 4 0 5 3 4 4 3 6 8 12 7
12 13 17 6 7 8 11 12 10 13 7 6
13 0 4 7 4 6 6 13 12 12 15 14
14 3 8 7 6 7 7 5 11 6 12 10
15 1 7 1 3 3 6 21 17 9 23 23
16 0 4 5 3 5 5 13 3 11 7 3
17 0 1 1 10 2 2 0 11 7 5 1
18 0 14 6 2 6 7 16 2 10 12 6
19 1 1 0 0 0 1 1 3 2 2 1
20 10 8 7 6 7 7 7 7 7 7 7
21 1 8 6 9 7 7 9 7 0 7 4
:
这是代码部分
import sys,re
###!!! store the line informations
f1 = open(sys.argv[1],"r")#report_run.cafe
fw = open(sys.argv[2],"w")#结果输出文件
flag = 0
trenumlist,treelist,tplist,nplist = [],[],[],[]
for i in f1:
if flag == 0:
if re.match("'ID'",i):
flag = 1
continue
elif re.match("# IDs of nodes:",i):
nodetree = i.strip().split(":")[1]
nodelist = re.findall("<\d+>",nodetree)#存储所有节点编号的列表["<0>","<20>"...]
elif re.match("# Output format for:",i):
idstring = i.strip().split(":")[2].strip(" ")
idstring = idstring.replace(")","")
idstring = idstring.replace("(","")
idstring = idstring.replace(" ",",")
idlist = idstring.split(",")
else:
wholeLine = i.strip().split()
trenumlist.append(wholeLine[0])
treelist.append(wholeLine[1])
tplist.append(wholeLine[2])
nplist.append(wholeLine[3])
###找到字符串所有"("和")"的位置
def findStringAllIndex(s,pattern):
pattern_index = []
matches = re.finditer(pattern,s)
for match in matches:
pattern_index.append(match.start())
return pattern_index
#bracket_index = findStringAllIndex(nodetree,"\(|\)")
### 把树按照括号和,分割成列表((D19A_33:24.509,D1A_32:24.509)_32:17.5479) to [D19A_33:24.509,D1A_32:24.509,_32:17.5479]
def SplitTree(tree):
tree = tree.replace(")","(")
tree = tree.replace("(",",")
listbybrack = tree.split(",")
listAllNode = []
for i in listbybrack:
if i != "":
listAllNode.append(i)
return listAllNode
strain_index = SplitTree(nodetree)
#print(strain_index) #['D20A<0>', 'D15A<2>', 'D19A<4>', 'D1A<6>', '<5>', '<3>', 'D8A<8>', 'D5A<10>', 'D23A<12>', 'D14A<14>', 'D13A<16>', '<15>', '<13>', '<11>', '<9>', '<7>', 'D17A<18>', 'D10A<20>', 'D2A<22>', 'D4A<24>', '<23>', 'D3A<26>', 'D18A<28>', 'D16A<30>', '<29>', 'D7A<32>', 'D9A<34>', 'D29A<36>', 'D24A<38>', '<37>', '<35>', '<33>', '<31>', '<27>', '<25>', '<21>', '<19>', '<17>', '<1>']
strain_node_dict = {} #创建节点和物种对应的字典{物种:节点}
for i in strain_index:
if re.match(r"D\d+A<\d+>",i):
a = i.split("<")
strain_node_dict[a[0]]="<"+a[1]
#print(strain_node_dict) #{'D20A': '<0>', 'D15A': '<2>', 'D19A': '<4>', 'D1A': '<6>', 'D8A': '<8>', 'D5A': '<10>', 'D23A': '<12>', 'D14A': '<14>', 'D13A': '<16>', 'D17A': '<18>', 'D10A': '<20>', 'D2A': '<22>', 'D4A': '<24>', 'D3A': '<26>', 'D18A': '<28>', 'D16A': '<30>', 'D7A': '<32>', 'D9A': '<34>', 'D29A': '<36>', 'D24A': '<38>'}
### 这部分是处理树的数据,输出对应节点的基因数目的字典 {"<0>","1"}
def addnodes(tree1,nodetree): #把树的节点加上的函数,tree1为字符串型树,treenode为字符串型节点数
strains = re.findall("\)_",tree1)
nodes = re.findall("\)<\d+>",nodetree)
for i in nodes:
tree1 = tree1.replace(")_",i+"_",1)
nodes = re.findall(r"A<\d+>",nodetree)
for i in nodes:
tree1 = tree1.replace("A_",i+"_",1)
return tree1
def Get_node_gene_num_dick(tree):
tree = tree+":0"#原始树缺少最后节点的枝长,添加枝长为0,将树变成标准树
tree = addnodes(tree,nodetree)#把树的节点加上
node_list = SplitTree(tree)#调用函数 SplitTree() 把树按照括号和,分割成列表
node_gene_num_dick = {}#将节点的基因数目存入字典node_gene_num_dick {"<0>":1}
for j in node_list:
nodeID = re.search("<\d+>",j).group(0)
nodeGeneNum = j.split("_")[1].split(":")[0]
node_gene_num_dick[nodeID] = nodeGeneNum
return node_gene_num_dick
#### 这部分是处理节点p值数据的内容,按照csv格式输出对应节点的p值信息
#for nodePvalue in nplist:
def Get_node_Pvalue_dick(node_pvalue_string):
node_Pvalue_dick = {}#先分割,然后zip,然后存入字典
# idlist #原来的idlist存入此列表中了
nodePvalue = node_pvalue_string.replace("("," ").replace(")"," ").replace(","," ").split()
node_to_Pvalue = zip(idlist,nodePvalue)
for j in node_to_Pvalue:#将节点对应的p值存入字典{node:pvalue}
node_Pvalue_dick["<"+j[0]+">"] = j[1]
return node_Pvalue_dick
###将结果输出
###首先输出第一行
fw.write("GFID")
for i in nodelist:
fw.write("\t"+i+"gN")
for i in nodelist:
fw.write("\t"+i+"Pv")
fw.write("\n")
#第一行输出完成
#输出节点对应基因数目和p值
for i in range(len(trenumlist)):
fw.write(trenumlist[i])#输出基因家族编号
node_gene_num_dick = Get_node_gene_num_dick(treelist[i])#节点基因数目字典
node_Pvalue_dick = Get_node_Pvalue_dick(nplist[i])#节点p值字典
for j in nodelist:
fw.write("\t"+node_gene_num_dick[j])
for j in nodelist:
if j != '<1>':
fw.write("\t"+node_Pvalue_dick[j])
else:
fw.write("\t"+tplist[i])#因为<1>节点是在最后,如果你们的树的最后的节点是其他数字,请在前三行作相应的修改,将最后节点的p值替换为整个树的p值
fw.write("\n")
f1.close()
fw.close()
####自此,对cafe结果文件的标准化展示成csv文件完成
###接下来将是t检验部分
blacklist = ["D1A","D5A","D8A","D13A","D14A","D15A","D19A","D23A"]
yellowlist = ["D2A","D3A","D4A","D7A","D9A","D10A","D16A","D17A","D18A","D24A","D29A"]
import pandas as pd
from scipy import stats
f = open(sys.argv[2],"r")
fcsv = pd.read_csv(f,sep="\t")
#strain_node_dict 整个是存储物种对应的节点的字典{strain:node}
blackBig,yellowBig = [],[]
for i in range(fcsv.shape[0]):
blackNum,yellowNum,blackPv,yellowPv = [],[],[],[]
for j in blacklist:#获得黑系全部的基因数目和p值
blackNum.append(fcsv.loc[i,strain_node_dict[j]+"gN"])
blackPv.append(fcsv.loc[i,strain_node_dict[j]+"Pv"])
for j in yellowlist:#获得黄系全部的基因数目和对应p值
yellowNum.append(fcsv.loc[i,strain_node_dict[j]+"gN"])
yellowPv.append(fcsv.loc[i,strain_node_dict[j]+"Pv"])
#t检验
levene = stats.levene(blackNum,yellowNum)#levene检验查看方差死否相当,levene.pvalue>0.05认为两组方差相等,不相等则参数 equal_var 需要设置成 False
if levene.pvalue <= 0.05:#t检验
t, p = stats.ttest_ind(blackNum,yellowNum,equal_var=False)
else:
t, p = stats.ttest_ind(blackNum,yellowNum,equal_var=True)
if p < 0.05: #认为相差明显
if t < 0: #第一组数据小于第二组
yellowBig.append(fcsv.loc[i,"GFID"])
else:
blackBig.append(fcsv.loc[i,"GFID"])
print(blackBig)
print(yellowBig)
print(len(blackBig))
print(len(yellowBig))
#for i in blacklist:#获得黑系全部的基因数目
#print(fcsv.loc[2,"GFID"]) #通过loc函数获得对应行列