1、c处理vcf文件,初步统计snp位置
位置信息有5种
down_stream是gene下游2k以内,up_stream 是gene上游2k以内,gene_exon是snp在外显子内,gene_intron是gene在内含子内,uninfluncial是gene不在上诉所有区域
SnpType.py
内容:
#这个脚本是用来看snp哪些是在基因中(分为exon和intron),哪些是在基因的上游、下游
#最终输出结果为:scaffold position 位置 geneid
#输入文件1:snpeff结果文件(/public/home/wangwen_lab/zhangjiexiong/Ageing/SNP/snpeff/snpeffC_F/02.runC/mor_imp_strictestfilter_Snp_AncRef_selected_anno4.vcf)
#输入文件2:gff3文件(/public/home/wangwen_lab/zhangjiexiong/Ageing/SNP/genome.gff3)
import sys,re
file1 = sys.argv[1]
file2 = sys.argv[2]
file3 = sys.argv[3]
f1 = open(file1,'r')
f2 = open(file2,'r')
f3 = open(file3,'w')
flag = 'fuck'
f1dick = {}
arr1 = []
for i in f1:
if re.match("#",i):
continue
a = i.split("\t")
if flag != a[0] and flag is not 'fuck':
f1dick[flag] = arr1
flag = a[0]
arr1 = []
# print(a[0])
arr1.append(a[1])
elif flag is 'fuck':
flag = a[0]
else:
arr1.append(a[1])
f1dick[flag] = arr1
#for i in f1dick:
# print(i,f1dick[i])
sc = "fuck"
f2dick1 = {}
f2dick2 = {}
f2arr1 = []
for i in f2:
if re.match("M",i) == None:
f2dick1[geneid] = f2arr1
continue
a = i.split("\t")
if a[2] == 'gene':
f2arr1 = []
f2arr1.append(a[3:5])
if sc != a[0]:
f2dick2[sc] = f2dick1
f2dick1 = {}
sc = a[0]
geneid = a[8].split(";")[0].split("=")[1]
if a[2] == 'exon':
f2arr1.append(a[3:5])
f2dick1[geneid] = f2arr1
f2dick2[sc] = f2dick1
del f2dick2['fuck']
f2.close()
for i in f1dick.keys():
for j in f1dick[i]:#取到scaffold内的所有snp位点
try:
taq = 0
for k in f2dick2[i].keys():#对gff3文件中所有的scaffold为i的gene进行遍历
a = f2dick2[i][k]
amax = max(a[0])
amin = min(a[0])
flag = 0
if int(amin) <= int(j) <= int(amax):#判断在gene中后判断在exon还是在intron
for u in a[1:]:
if int(min(u))<int(j)<int(max(u)):
flag = "gene_exon"
if flag != "gene_exon":
flag = "gene_intron"
elif int(min(a[0]))-2000 <= int(j) <=int(min(a[0])):#判断是不是在gene下游
flag = 'down_stream'
elif int(max(a[0]))+2000 >= int(j) >= int(max(a[0])):#判断是不是gene上游
flag = "up_stream"
else:
pass
if flag != 0:
f3.write(i+"\t"+j+"\t"+k+"\t"+flag+"\t"+max(a[0])+"\t"+min(a[0])+"\n")
taq = 1
if taq == 0:
f3.write(i+"\t"+j+"\t"+"\t"+"Uninfluncial"+"\n")
except KeyError:
print("KeyError",sys.exc_info())
f3.close()
输入文件为vcf和gff3
vcf内容如下:
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##reference=file:///public/home/wangwen_lab/zhangjiexiong/Ageing/SNP/genome.fasta
##source=SelectVariants
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Mor_importuna
Morimp01GS001 1387 . C T 969.80 PASS AC=2;AF=1.00;AN=2;BaseQRankSum=1.56;ClippingRankSum=-4.910e-01;DP=28;ExcessHet=
Morimp01GS001 1418 . C T 1427.78 PASS AC=2;AF=1.00;AN=2;BaseQRankSum=-1.073e+00;ClippingRankSum=-2.400e-02;DP=43;Exce
Morimp01GS001 8939 . A C 282.77 PASS AC=1;AF=0.500;AN=2;BaseQRankSum=1.67;ClippingRankSum=-5.330e-01;DP=13;ExcessHet
Morimp01GS001 9251 . C T 1538.77 PASS AC=1;AF=0.500;AN=2;BaseQRankSum=0.460;ClippingRankSum=0.631;DP=88;ExcessHet=3.0
Morimp01GS001 12787 . T G 2514.77 PASS AC=1;AF=0.500;AN=2;BaseQRankSum=2.25;ClippingRankSum=0.383;DP=109;ExcessHet=3.0
Morimp01GS001 13324 . C G 2177.77 PASS AC=1;AF=0.500;AN=2;BaseQRankSum=2.17;ClippingRankSum=1.50;DP=88;ExcessHet=3.010
Morimp01GS001 13446 . G C 1158.77 PASS AC=1;AF=0.500;AN=2;BaseQRankSum=0.994;ClippingRankSum=-1.990e-01;DP=58;ExcessHe
Morimp01GS001 13866 . A C 2370.77 PASS AC=1;AF=0.500;AN=2;BaseQRankSum=-1.094e+00;ClippingRankSum=-6.080e-01;DP=100;Ex
Morimp01GS001 14181 . T C 1455.77 PASS AC=1;AF=0.500;AN=2;BaseQRankSum=0.710;ClippingRankSum=-6.500e-01;DP=71;ExcessHe
Morimp01GS001 14320 . A G 1473.77 PASS AC=1;AF=0.500;AN=2;BaseQRankSum=1.42;ClippingRankSum=-8.810e-01;DP=64;ExcessHet
Morimp01GS001 14344 . C T 1471.77 PASS AC=1;AF=0.500;AN=2;BaseQRankSum=1.48;ClippingRankSum=-1.880e-01;DP=69;ExcessHet
Morimp01GS001 14606 . T G 2329.77 PASS AC=1;AF=0.500;AN=2;BaseQRankSum=-1.280e-01;ClippingRankSum=-1.583e+00;DP=94;Exc
Morimp01GS001 14869 . A G 2209.77 PASS AC=1;AF=0.500;AN=2;BaseQRankSum=-4.340e-01;ClippingRankSum=-2.854e+00;DP=99;Exc
Morimp01GS001 14897 . C T 2048.77 PASS AC=1;AF=0.500;AN=2;BaseQRankSum=0.603;ClippingRankSum=1.03;DP=88;ExcessHet=3.01
Morimp01GS001 15056 . A T 1930.77 PASS AC=1;AF=0.500;AN=2;BaseQRankSum=0.870;ClippingRankSum=-3.810e-01;DP=88;ExcessHe
Morimp01GS001 15384 . A G 1690.77 PASS AC=1;AF=0.500;AN=2;BaseQRankSum=-1.241e+00;ClippingRankSum=0.431;DP=86;ExcessHe
Morimp01GS001 15564 . C T 2023.77 PASS AC=1;AF=0.500;AN=2;BaseQRankSum=0.198;ClippingRankSum=0.014;DP=85;ExcessHet=3.0
Morimp01GS001 15729 . C T 2458.77 PASS AC=1;AF=0.500;AN=2;BaseQRankSum=
gff3如下:
Morimp01GS001 AUGUSTUS gene 16797 18718 0.81 + . ID=MIM04M26Gene00001;Name=MIM04M26Gene00001
Morimp01GS001 AUGUSTUS mRNA 16797 18718 0.81 + . ID=MIM04M26Gene00001.t1;Parent=MIM04M26Gene00001
Morimp01GS001 AUGUSTUS exon 16797 16949 . + . ID=MIM04M26Gene00001.t1.exon1;Parent=MIM04M26Gene00001.t1
Morimp01GS001 AUGUSTUS CDS 16797 16949 0.82 + 0 ID=MIM04M26Gene00001.t1.CDS1;Parent=MIM04M26Gene00001.t1
Morimp01GS001 AUGUSTUS exon 17024 17219 . + . ID=MIM04M26Gene00001.t1.exon2;Parent=MIM04M26Gene00001.t1
Morimp01GS001 AUGUSTUS CDS 17024 17219 0.99 + 0 ID=MIM04M26Gene00001.t1.CDS2;Parent=MIM04M26Gene00001.t1
Morimp01GS001 AUGUSTUS exon 17298 18718 . + . ID=MIM04M26Gene00001.t1.exon3;Parent=MIM04M26Gene00001.t1
Morimp01GS001 AUGUSTUS CDS 17298 18718 0.98 + 2 ID=MIM04M26Gene00001.t1.CDS3;Parent=MIM04M26Gene00001.t1
Morimp01GS001 AUGUSTUS gene 45970 47158 0.55 - . ID=MIM04M26Gene00002;Name=MIM04M26Gene00002
Morimp01GS001 AUGUSTUS mRNA 45970 47158 0.55 - . ID=MIM04M26Gene00002.t1;Parent=MIM04M26Gene00002
Morimp01GS001 AUGUSTUS exon 47073 47158 . - . ID=MIM04M26Gene00002.t1.exon1;Parent=MIM04M26Gene00002.t1
Morimp01GS001 AUGUSTUS CDS 47073 47158 0.6 - 0 ID=MIM04M26Gene00002.t1.CDS1;Parent=MIM04M26Gene00002.t1
Morimp01GS001 AUGUSTUS exon 46887 46999 . - . ID=MIM04M26Gene00002.t1.exon2;Parent=MIM04M26Gene00002.t1
Morimp01GS001 AUGUSTUS CDS 46887 46999 0.92 - 1 ID=MIM04M26Gene00002.t1.CDS2;Parent=MIM04M26Gene00002.t1
Morimp01GS001 AUGUSTUS exon 46644 46784 . - . ID=MIM04M26Gene00002.t1.exon3;Parent=MIM04M26Gene00002.t1
Morimp01GS001 AUGUSTUS CDS 46644 46784 0.91 - 2 ID=MIM04M26Gene00002.t1.CDS3;Parent=MIM04M26Gene00002.t1
Morimp01GS001 AUGUSTUS exon 45970 46559 . - . ID=MIM04M26Gene00002.t1.exon4;Parent=MIM04M26Gene00002.t1
Morimp01GS001 AUGUSTUS CDS 45970 46559 0.92 - 2 ID=MIM04M26Gene00002.t1.CDS4;Parent=MIM04M26Gene00002.t1
输入结果为c.txt
对结果再进行排序
sort -k1 -k2 -V c.txt > snp_type_F.list
snp_type_F.list结果如下:
Morimp01GS001 1418 Uninfluncial
Morimp01GS001 1476 Uninfluncial
Morimp01GS001 8939 Uninfluncial
Morimp01GS001 9251 Uninfluncial
Morimp01GS001 12787 Uninfluncial
Morimp01GS001 13324 Uninfluncial
Morimp01GS001 13446 Uninfluncial
Morimp01GS001 13866 Uninfluncial
Morimp01GS001 14181 Uninfluncial
Morimp01GS001 14320 Uninfluncial
Morimp01GS001 14344 Uninfluncial
Morimp01GS001 14606 Uninfluncial
Morimp01GS001 14869 MIM04M26Gene00001 down_stream 18718 16797
Morimp01GS001 14897 MIM04M26Gene00001 down_stream 18718 16797
Morimp01GS001 15056 MIM04M26Gene00001 down_stream 18718 16797
Morimp01GS001 15384 MIM04M26Gene00001 down_stream 18718 16797
Morimp01GS001 15564 MIM04M26Gene00001 down_stream 18718 16797
Morimp01GS001 15729 MIM04M26Gene00001 down_stream 18718 16797
Morimp01GS001 15747 MIM04M26Gene00001 down_stream 18718 16797
Morimp01GS001 15943 MIM04M26Gene00001 down_stream 18718 16797
Morimp01GS001 15981 MIM04M26Gene00001 down_stream 18718 16797
Morimp01GS001 16351 MIM04M26Gene00001 down_stream 18718 16797
Morimp01GS001 16374 MIM04M26Gene00001 down_stream 18718 16797
Morimp01GS001 16376 MIM04M26Gene00001 down_stream 18718 16797
Morimp01GS001 16472 MIM04M26Gene00001 down_stream 18718 16797
Morimp01GS001 16496 MIM04M26Gene00001 down_stream 18718 16797
Morimp01GS001 16640 MIM04M26Gene00001 down_stream 18718 16797
Morimp01GS001 16761 MIM04M26Gene00001 down_stream 18718 16797
Morimp01GS001 17077 MIM04M26Gene00001 gene_exon 18718 16797
Morimp01GS001 17340 MIM04M26Gene00001 gene_exon 18718 16797
Morimp01GS001 17470 MIM04M26Gene00001 gene_exon 18718 16797
Morimp01GS001 17478 MIM04M26Gene00001 gene_exon 18718 16797
Morimp01GS001 17963 MIM04M26Gene00001 gene_exon 18718 16797
Morimp01GS001 18077 MIM04M26Gene00001 gene_exon 18718 16797
Morimp01GS001 18153 MIM04M26Gene00001 gene_exon 18718 16797
Morimp01GS001 18448 MIM04M26Gene00001 gene_exon 18718 16797
Morimp01GS001 18449 MIM04M26Gene00001 gene_exon 18718 16797
Morimp01GS001 18798 MIM04M26Gene00001 up_stream 18718 16797
Morimp01GS001 18802 MIM04M26Gene00001 up_stream 18718 16797
Morimp01GS001 18970 MIM04M26Gene00001 up_stream 18718 16797
Morimp01GS001 19137 MIM04M26Gene00001 up_stream 18718 16797
Morimp01GS001 19292 MIM04M26Gene00001 up_stream 18718 16797
Morimp01GS001 19367 MIM04M26Gene00001 up_stream 18718 16797
Morimp01GS001 19394 MIM04M26Gene00001 up_stream 18718 16797
Morimp01GS001 19401 MIM04M26Gene00001 up_stream 18718 16797
Morimp01GS001 19446 MIM04M26Gene00001 up_stream 18718 16797
Morimp01GS001 19785 MIM04M26Gene00001 up_stream 18718 16797
Morimp01GS001 19832 MIM04M26Gene00001 up_stream 18718 16797
Morimp01GS001 19885 MIM04M26Gene00001 up_stream 18718 16797
次文件position存在重复(eg: 既在A gene内,又在B gene的上游)
处理一下
display_SnpType.py
内容如下:
#这个是用来处理SnpType.py
import sys,re,math
file1 = sys.argv[1]
file2 = sys.argv[2]
f1 = open(file1,'r')
f2 = open(file2,'w')
la = [0,0]
a = []
fuck = []
arr =[]
arrline = []
for i in f1:
if re.search("Uninfluncial",i):
f2.write(i)
else:
line = i.strip()
a = line.split("\t")
if a[1] == la[1]:
arr.append(la[4])
arr.append(la[5])
arrline.append(la)
else:
if la[1] == 0:
la = a
continue
arrline.append(la)
arr.append(la[4])
arr.append(la[5])
for j in arr:
c = abs(int(j)-int(la[1]))
fuck.append(c)
t = min(fuck)
zjx = fuck.index(t)
x = zjx//2
fuck = []
for zz in arrline[x]:
f2.write(zz+"\t")
f2.write("\n")
arrline = []
arr = []
la = a
arrline.append(la)
arr.append(la[4])
arr.append(la[5])
for j in arr:
c = abs(int(j)-int(la[1]))
fuck.append(c)
t = min(fuck)
zjx = fuck.index(t)
x = zjx//2
fuck = []
for zz in arrline[x]:
f2.write(zz+"\t")
f2.write("\n")
arrline = []
arr = []
f1.close()
f2.close()
python display_SnpType.py snp_type_F.list snp_type_F_clear.list
结果和处理之前差不多,就是根据距离筛选了与SNP的position关联最强的gene,一个position对应一个gene