IGV算是非常权威的bam文件查看器(igvtools是它配套的命令行工具),市面上就这么一款软件,暂时没发现功能表现类似的。
上面的图片通常是你打开bam文件的可视化结果,当你聚焦到某一个位置的时候,igvtools会告诉你这一列有多少A,T,C,G,N。现在你想保存这个信息到txt文件,什么,你想先找找软件有没有这个功能?对不起,没有。通常你在这里截个图就算完成任务了。不过如果你想把信息保存到txt文件,其实是有办法的,只是需要用到igvtools的命令行模式来完成:
./igvtools count -w 1 --bases bam result.wig fasta
运行的时候你只需把上面命令里的bam替换为你的bam文件名,fasta替换为你的参考序列(如hg19,或者特定的参考文件)。运行后会生成一个result.wig文件,名字可变,但扩展名是.wig。
打开result.wig文件,内容如下:
第一列是位置,后面依次是A,C,G,T,N的数量。这样你在igvtools里看到的可视化信息就保存到这个.wig文件里了,不放心的话可以打开igvtools的可视化界面再核对一下。
接下来你可能想统计一下每个位置的突变频率,并显示参考fasta上对应的碱基是什么。这个有点复杂,代码里被注释掉的部分还有几个其他统计信息,直接给出下面的代码吧(脚本依次接收四个命令行参数:参考fasta、bam文件名、频率阈值、输出文件名):
import os
import sys
import re
"""Report candidate variant positions from an ``igvtools count --bases`` wig.

Usage::

    python this_script.py <fasta> <bam> <threshold> <output.txt> [wig]

The wig file must have been produced beforehand, e.g. with
``./igvtools count -w 1 --bases <bam> result.wig <fasta>``. The ``<bam>``
argument is accepted for command-line compatibility but is not read here.

For every wig data row the script looks up the reference base in the FASTA,
computes ``max(non-reference base count) / (A + T + C + G + N)`` and writes
the row to the output table when that frequency is above the threshold AND
the row directly before or after it in the wig is above the threshold too.

NOTE: the original script also carried commented-out scaffolding for
per-position CpG / non-CpG counters and two alternative ways of writing the
same table. The CpG part was incomplete; none of it is implemented here.
"""

# Non-reference bases examined for each reference base. Reference bases that
# are not keys here (e.g. N) never produce a frequency.
ALT_BASES = {
    'A': ['T', 'C', 'G'],
    'T': ['A', 'G', 'C'],
    'C': ['A', 'T', 'G'],
    'G': ['A', 'C', 'T'],
}

# Wig file read when no fifth command-line argument is given.
DEFAULT_WIG = 'result.wig'

OUTPUT_HEADER = 'seqName\tpos\tref\tA\tT\tC\tG\tN\tfrequency\n'


def read_fasta(path):
    """Parse a FASTA file into an ordered list of ``(header, sequence)``.

    ``header`` is the complete, stripped ``>`` line (the ``>`` is kept, since
    that is what ends up in the ``seqName`` output column). ``sequence`` is
    every following non-blank line, stripped and concatenated, up to the next
    header. Blank lines and any text before the first header are ignored.
    """
    records = []
    header = None
    chunks = []
    with open(path, 'r') as handle:
        for raw in handle:
            text = raw.strip()
            if not text:
                continue
            if text.startswith('>'):
                if header is not None:
                    records.append((header, ''.join(chunks)))
                header = text
                chunks = []
            elif header is not None:
                chunks.append(text)
    if header is not None:
        records.append((header, ''.join(chunks)))
    return records


def sequence_id(header):
    """Return the text between ``>`` and the first whitespace of a header."""
    parts = header[1:].split()
    return parts[0] if parts else ''


def read_wig(path):
    """Read the data rows of a wig file.

    Returns a list of ``(section, chrom, pos, a, c, g, t, n)`` tuples, all
    strings except ``section``, which is the zero-based index of the
    ``variableStep`` line the row belongs to. ``chrom`` is the value of that
    line's ``chrom=`` token, or ``None`` when it has none. Lines whose first
    field is neither ``variableStep`` nor all digits are skipped.

    Raises:
        ValueError: a data row precedes the first ``variableStep`` line or
            has fewer than six columns.
    """
    rows = []
    section = -1
    chrom = None
    with open(path, 'r') as handle:
        for line_no, raw in enumerate(handle, 1):
            fields = raw.split()
            if not fields:
                continue
            if fields[0] == 'variableStep':
                section += 1
                chrom = None
                for token in fields[1:]:
                    if token.startswith('chrom='):
                        chrom = token[len('chrom='):]
                continue
            if not fields[0].isdigit():
                continue
            if section < 0:
                raise ValueError(
                    '%s line %d: data row before any variableStep line'
                    % (path, line_no))
            if len(fields) < 6:
                raise ValueError(
                    '%s line %d: expected position plus A, C, G, T, N counts'
                    % (path, line_no))
            rows.append((section, chrom) + tuple(fields[:6]))
    return rows


def resolve_reference(records, by_id, section, chrom):
    """Pick the FASTA record that a wig section refers to.

    The record is looked up by name first. When the name is unknown, the
    section index is used as a position into ``records`` (which is how the
    script originally paired the two files) and a warning goes to stderr.

    Raises:
        ValueError: neither the name nor the section index matches a record.
    """
    record = by_id.get(chrom)
    if record is not None:
        return record
    if 0 <= section < len(records):
        sys.stderr.write(
            'warning: wig sequence %r not found in FASTA by name; '
            'falling back to FASTA record number %d (%s)\n'
            % (chrom, section + 1, records[section][0]))
        return records[section]
    raise ValueError('wig sequence %r has no matching FASTA record' % (chrom,))


def alt_frequency(ref, counts):
    """Return the highest non-reference count divided by total depth.

    ``counts`` maps ``'A'``, ``'T'``, ``'C'``, ``'G'``, ``'N'`` to floats.
    Returns ``None`` when ``ref`` is not one of A/C/G/T, and ``0.0`` when the
    depth is zero (instead of dividing by zero).
    """
    alts = ALT_BASES.get(ref)
    if alts is None:
        return None
    depth = (counts['A'] + counts['T'] + counts['C']
             + counts['G'] + counts['N'])
    if depth == 0:
        return 0.0
    return max(counts[base] for base in alts) / depth


def main(argv=None):
    """Run the report; ``argv`` defaults to ``sys.argv``.

    Arguments (after the program name): FASTA path, BAM name (unused),
    frequency threshold, output path, and optionally the wig path.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 5:
        raise SystemExit(
            'usage: %s <fasta> <bam> <threshold> <output.txt> [wig]'
            % (args[0] if args else 'script'))
    fasta = args[1]
    par = float(args[3])
    out_path = args[4]
    wig_path = args[5] if len(args) > 5 else DEFAULT_WIG

    records = read_fasta(fasta)
    by_id = {}
    for record in records:
        # First record wins if an identifier is repeated.
        by_id.setdefault(sequence_id(record[0]), record)

    # One entry per wig data row, so the frequency always stays aligned with
    # its row (rows with an uncallable reference carry None).
    table = []
    resolved = {}
    for section, chrom, pos, a, c, g, t, n in read_wig(wig_path):
        if section not in resolved:
            resolved[section] = resolve_reference(
                records, by_id, section, chrom)
        header, seq = resolved[section]
        index = int(pos) - 1  # wig positions are used as 1-based offsets
        if not 0 <= index < len(seq):
            raise ValueError(
                'position %s is outside sequence %s (length %d)'
                % (pos, header, len(seq)))
        ref = seq[index].upper()
        counts = {'A': float(a), 'T': float(t), 'C': float(c),
                  'G': float(g), 'N': float(n)}
        table.append((header, pos, ref, a, t, c, g, n,
                      alt_frequency(ref, counts)))

    def above(position):
        # A missing neighbour (before the first / after the last row) or one
        # without a callable reference never counts as above the threshold.
        if position < 0 or position >= len(table):
            return False
        freq = table[position][-1]
        return freq is not None and freq > par

    with open(out_path, 'w') as out:
        out.write(OUTPUT_HEADER)
        for i, row in enumerate(table):
            freq = row[-1]
            # Skip uncallable rows, rows with no non-reference reads, and
            # rows at or below the threshold.
            if freq is None or freq == 0 or not freq > par:
                continue
            # NOTE(review): "neighbour" means the adjacent wig row, as in the
            # original script; it is not checked to be the adjacent genomic
            # position or even on the same sequence.
            if above(i - 1) or above(i + 1):
                out.write('\t'.join(row[:-1]) + '\t' + str(freq) + '\n')


if __name__ == '__main__':
    main()