Extracting 4DTv sites from a snpEff-annotated VCF for building a phylogenetic tree. Reposted from https://blog.csdn.net/u012110870/article/details/105507476
Xu Zhougeng (徐州更) mentioned the 2019 Nature Genetics resequencing study of 414 watermelon accessions.
In its analysis methods, 4DTv sites were selected from the resequencing SNPs to build the phylogenetic tree.
The code comes from Xu Zhougeng:
python3 calc_4dTv_in_eff_vcf.py input.vcf output.vcf ref.fa
It extracts a VCF containing only 4DTv sites from a snpEff-annotated VCF.
calc_4dTv_in_eff_vcf.py
The code is as follows:
#!/usr/bin/env python3
from sys import argv

from pysam import VariantFile
from pysam import FastaFile

file_in = argv[1]   # snpEff-annotated input VCF
file_out = argv[2]  # output VCF containing only 4DTv sites
fafile = argv[3]    # reference genome FASTA

# first two bases of every fourfold-degenerate codon family
codon = set(["TC", "CT", "CC", "CG", "AC", "GT", "GC", "GG"])
rev_dict = dict(A='T', T='A', C='G', G='C')

bcf_in = VariantFile(file_in)
bcf_out = VariantFile(file_out, "w", header=bcf_in.header)
fa_in = FastaFile(fafile)

for rec in bcf_in.fetch():
    # first snpEff annotation, split into its fields
    info = rec.info['ANN'][0].split('|')
    # only use synonymous variants
    if info[1] != "synonymous_variant":
        continue
    # info[9] is the HGVS.c string, e.g. "c.123A>G"; only the 3rd codon position can be 4DTv
    if int(info[9][2:-3]) % 3 != 0:
        continue
    # determine the strand by comparing the REF column with the annotated reference base
    if rec.ref == info[9][-3]:
        # forward strand: the two preceding bases are the first two codon positions
        pre = fa_in.fetch(rec.chrom, rec.pos - 3, rec.pos - 1).upper()
    else:
        # reverse strand: take the two following bases and reverse-complement them
        tmp = fa_in.fetch(rec.chrom, rec.pos, rec.pos + 2).upper()
        pre = rev_dict[tmp[1]] + rev_dict[tmp[0]]
    if pre not in codon:
        continue
    bcf_out.write(rec)

bcf_out.close()
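A quick way to run it and sanity-check the result (the file names here are placeholders; it is safest to index the reference FASTA with samtools faidx beforehand):
python3 calc_4dTv_in_eff_vcf.py sample.ann.vcf sample.out.4DTv.vcf genome.fa
# number of 4DTv records kept
grep -vc "^#" sample.out.4DTv.vcf
# peek at the INFO column (contains the snpEff ANN annotation) of one kept record
grep -v "^#" sample.out.4DTv.vcf | head -n 1 | cut -f 8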
Convert the 4DTv VCF into the .phy format required by PHYLIP, using vcf2phylip.
vcf2phylip.py
The code is as follows:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
The script converts a collection of SNPs in VCF format into a PHYLIP, FASTA,
NEXUS, or binary NEXUS file for phylogenetic analysis. The code is optimized
to process VCF files with sizes >1GB. For small VCF files the algorithm slows
down as the number of taxa increases (but is still fast).
Any ploidy is allowed, but binary NEXUS is produced only for diploid VCFs.
"""
__author__ = "Edgardo M. Ortiz"
__credits__ = "Juan D. Palacio-Mejía"
__version__ = "2.4"
__email__ = "[email protected]"
__date__ = "2020-10-04"
import argparse
import gzip
import os
import random
import sys
# Dictionary of IUPAC ambiguities for nucleotides
# '*' is a deletion in GATK, deletions are ignored in consensus, lowercase consensus is used when an
# 'N' or '*' is part of the genotype. Capitalization is used by some software but ignored by Geneious
# for example
ambiguities = {"*" :"-", "A" :"A", "C" :"C", "G" :"G", "N" :"N", "T" :"T",
"*A" :"a", "*C" :"c", "*G" :"g", "*N" :"n", "*T" :"t",
"AC" :"M", "AG" :"R", "AN" :"a", "AT" :"W", "CG" :"S",
"CN" :"c", "CT" :"Y", "GN" :"g", "GT" :"K", "NT" :"t",
"*AC" :"m", "*AG" :"r", "*AN" :"a", "*AT" :"w", "*CG" :"s",
"*CN" :"c", "*CT" :"y", "*GN" :"g", "*GT" :"k", "*NT" :"t",
"ACG" :"V", "ACN" :"m", "ACT" :"H", "AGN" :"r", "AGT" :"D",
"ANT" :"w", "CGN" :"s", "CGT" :"B", "CNT" :"y", "GNT" :"k",
"*ACG" :"v", "*ACN" :"m", "*ACT" :"h", "*AGN" :"r", "*AGT" :"d",
"*ANT" :"w", "*CGN" :"s", "*CGT" :"b", "*CNT" :"y", "*GNT" :"k",
"ACGN" :"v", "ACGT" :"N", "ACNT" :"h", "AGNT" :"d", "CGNT" :"b",
"*ACGN":"v", "*ACGT":"N", "*ACNT":"h", "*AGNT":"d", "*CGNT":"b", "*ACGNT":"N"}
# Dictionary for translating biallelic SNPs into SNAPP, only for diploid VCF
# 0 is homozygous reference
# 1 is heterozygous
# 2 is homozygous alternative
gen_bin = {"./.":"?",
".|.":"?",
"0/0":"0",
"0|0":"0",
"0/1":"1",
"0|1":"1",
"1/0":"1",
"1|0":"1",
"1/1":"2",
"1|1":"2"}
def extract_sample_names(vcf_file):
    """
    Extract sample names from VCF file
    """
    if vcf_file.endswith(".gz"):
        opener = gzip.open
    else:
        opener = open
    sample_names = []
    with opener(vcf_file, "rt") as vcf:
        for line in vcf:
            line = line.strip("\n")
            if line.startswith("#CHROM"):
                record = line.split("\t")
                sample_names = [record[i].replace("./", "") for i in range(9, len(record))]
                break
    return sample_names
def is_anomalous(record, num_samples):
    """
    Determine if the number of samples in current record corresponds to number of samples described
    in the line '#CHROM'
    """
    return bool(len(record) != num_samples + 9)
def is_snp(record):
    """
    Determine if current VCF record is a SNP (single nucleotide polymorphism) as opposed to MNP
    (multinucleotide polymorphism)
    """
    return bool(len(record[3]) == 1
                and len(record[4]) - record[4].count(",") == record[4].count(",") + 1)
def num_genotypes(record, num_samples):
    """
    Get number of genotypes in VCF record, total number of samples - missing genotypes
    """
    missing = 0
    for i in range(9, num_samples + 9):
        if record[i].startswith("."):
            missing += 1
    return num_samples - missing
def get_matrix_column(record, num_samples, resolve_IUPAC):
    """
    Transform a VCF record into a phylogenetic matrix column with nucleotides instead of numbers
    """
    nt_dict = {str(0): record[3].replace("-","*"), ".": "N"}
    alt = record[4].replace("-", "*")
    alt = alt.split(",")
    for n in range(len(alt)):
        nt_dict[str(n+1)] = alt[n]
    column = ""
    for i in range(9, num_samples + 9):
        genotype = record[i].split(":")[0].replace("/", "").replace("|", "")
        if resolve_IUPAC:
            column += nt_dict[random.choice(genotype)]
        else:
            column += ambiguities["".join(sorted(set([nt_dict[j] for j in genotype])))]
    return column
def get_matrix_column_bin(record, num_samples):
    """
    If VCF is diploid, return an alignment column in NEXUS binary from a VCF record
    """
    column = ""
    for i in range(9, num_samples + 9):
        genotype = record[i].split(":")[0]
        if len(genotype) == 3:
            column += gen_bin[genotype]
        else:
            column += "?"
    return column
def main():
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("-i", "--input",
                        action = "store",
                        dest = "filename",
                        required = True,
                        help = "Name of the input VCF file, can be gzipped")
    parser.add_argument("-m", "--min-samples-locus",
                        action = "store",
                        dest = "min_samples_locus",
                        type = int,
                        default = 4,
                        help = "Minimum of samples required to be present at a locus (default=4)")
    parser.add_argument("-o", "--outgroup",
                        action = "store",
                        dest = "outgroup",
                        default = "",
                        help = "Name of the outgroup in the matrix. Sequence will be written as first taxon in the "
                               "alignment.")
    parser.add_argument("-p", "--phylip-disable",
                        action = "store_true",
                        dest = "phylipdisable",
                        help = "A PHYLIP matrix is written by default unless you enable this flag")
    parser.add_argument("-f", "--fasta",
                        action = "store_true",
                        dest = "fasta",
                        help = "Write a FASTA matrix, disabled by default")
    parser.add_argument("-n", "--nexus",
                        action = "store_true",
                        dest = "nexus",
                        help = "Write a NEXUS matrix, disabled by default")
    parser.add_argument("-b", "--nexus-binary",
                        action = "store_true",
                        dest = "nexusbin",
                        help = "Write a binary NEXUS matrix for analysis of biallelic SNPs in SNAPP, only diploid "
                               "genotypes will be processed, disabled by default.")
    parser.add_argument("-r", "--resolve-IUPAC",
                        action = "store_true",
                        dest = "resolve_IUPAC",
                        help = "Randomly resolve heterozygous genotypes to avoid IUPAC ambiguities in the matrices")
    parser.add_argument("-v", "--version",
                        action = "version",
                        version = "%(prog)s {version}".format(version=__version__))
    args = parser.parse_args()
    filename = args.filename
    min_samples_locus = args.min_samples_locus
    outgroup = args.outgroup.split(",")[0].split(";")[0]
    phylipdisable = args.phylipdisable
    fasta = args.fasta
    nexus = args.nexus
    nexusbin = args.nexusbin
    resolve_IUPAC = args.resolve_IUPAC
    # Get samples names and number of samples in VCF
    sample_names = extract_sample_names(filename)
    num_samples = len(sample_names)
    if len(sample_names) == 0:
        print("\nSample names not found in VCF, your file may be corrupt or missing the header.\n")
        sys.exit()
    print("\nConverting file '{}':\n".format(filename))
    print("Number of samples in VCF: {:d}".format(len(sample_names)))
    # If the 'min_samples_locus' is larger than the actual number of samples in VCF readjust it
    min_samples_locus = min(num_samples, min_samples_locus)
    # Output filename will be the same as input file, indicating the minimum of samples specified
    if filename.endswith(".gz"):
        outfile = filename.replace(".vcf.gz",".min"+str(min_samples_locus))
    else:
        outfile = filename.replace(".vcf",".min"+str(min_samples_locus))
    # We need to create an intermediate file to hold the sequence data vertically and then transpose
    # it to create the matrices
    if fasta or nexus or not phylipdisable:
        temporal = open(outfile+".tmp", "w")
    # If binary NEXUS is selected also create a separate temporal
    if nexusbin:
        temporalbin = open(outfile+".bin.tmp", "w")
    ##########################
    # PROCESS GENOTYPES IN VCF
    if filename.endswith(".gz"):
        opener = gzip.open
    else:
        opener = open
    with opener(filename, "rt") as vcf:
        # Initialize line counter
        snp_num = 0
        snp_accepted = 0
        snp_shallow = 0
        mnp_num = 0
        snp_biallelic = 0
        while 1:
            # Load large chunks of file into memory
            vcf_chunk = vcf.readlines(50000)
            if not vcf_chunk:
                break
            for line in vcf_chunk:
                line = line.strip()
                if line and not line.startswith("#"): # skip empty and commented lines
                    # Split line into columns
                    record = line.split("\t")
                    # Keep track of number of genotypes processed
                    snp_num += 1
                    # Print progress every 500000 lines
                    if snp_num % 500000 == 0:
                        print("{:d} genotypes processed.".format(snp_num))
                    if is_anomalous(record, num_samples):
                        print("Skipped potentially malformed line: {}".format(line))
                        continue
                    else:
                        # Check if the SNP has the minimum number of samples required
                        if num_genotypes(record, num_samples) < min_samples_locus:
                            # Keep track of loci rejected due to exceeded missing data
                            snp_shallow += 1
                            continue
                        else:
                            # Check that neither REF nor ALT contain MNPs
                            if is_snp(record):
                                # Add to running sum of accepted SNPs
                                snp_accepted += 1
                                # If nucleotide matrices are requested
                                if fasta or nexus or not phylipdisable:
                                    # Transform VCF record into an alignment column
                                    site_tmp = get_matrix_column(record, num_samples, resolve_IUPAC)
                                    # Uncomment for debugging
                                    # print(site_tmp)
                                    # Write entire row of single nucleotide genotypes to temp file
                                    temporal.write(site_tmp+"\n")
                                # Write binary NEXUS for SNAPP if requested
                                if nexusbin:
                                    # Check that the SNP only has two alleles
                                    if len(record[4]) == 1:
                                        # Add to running sum of biallelic SNPs
                                        snp_biallelic += 1
                                        # Translate genotype into 0 for homozygous REF, 1 for
                                        # heterozygous, and 2 for homozygous ALT
                                        binsite_tmp = get_matrix_column_bin(record, num_samples)
                                        # Write entire row to temporary file
                                        temporalbin.write(binsite_tmp+"\n")
                            else:
                                # Keep track of loci rejected due to multinucleotide genotypes
                                mnp_num += 1
    # Print useful information about filtering of SNPs
    print("Total of genotypes processed: {:d}".format(snp_num))
    print("Genotypes excluded because they exceeded the amount "
          "of missing data allowed: {:d}".format(snp_shallow))
    print("Genotypes that passed missing data filter but were "
          "excluded for being MNPs: {:d}".format(mnp_num))
    print("SNPs that passed the filters: {:d}".format(snp_accepted))
    if nexusbin:
        print("Biallelic SNPs selected for binary NEXUS: {:d}".format(snp_biallelic))
    print("")
    if fasta or nexus or not phylipdisable:
        temporal.close()
    if nexusbin:
        temporalbin.close()
    #######################
    # WRITE OUTPUT MATRICES
    if not phylipdisable:
        output_phy = open(outfile+".phy", "w")
        output_phy.write("{:d} {:d}\n".format(len(sample_names), snp_accepted))
    if fasta:
        output_fas = open(outfile+".fasta", "w")
    if nexus:
        output_nex = open(outfile+".nexus", "w")
        output_nex.write("#NEXUS\n\nBEGIN DATA;\n\tDIMENSIONS NTAX={:d} NCHAR={:d};\n\tFORMAT "
                         "DATATYPE=DNA MISSING=N GAP=- ;\nMATRIX\n".format(len(sample_names),
                                                                           snp_accepted))
    if nexusbin:
        output_nexbin = open(outfile+".bin.nexus", "w")
        output_nexbin.write("#NEXUS\n\nBEGIN DATA;\n\tDIMENSIONS NTAX={:d} NCHAR={:d};\n\tFORMAT "
                            "DATATYPE=SNP MISSING=? GAP=- ;\nMATRIX\n".format(len(sample_names),
                                                                              snp_biallelic))
    # Get length of longest sequence name
    len_longest_name = 0
    for name in sample_names:
        if len(name) > len_longest_name:
            len_longest_name = len(name)
    # Write outgroup as first sequence in alignment if the name is specified
    idx_outgroup = None
    if outgroup in sample_names:
        idx_outgroup = sample_names.index(outgroup)
        if fasta or nexus or not phylipdisable:
            with open(outfile+".tmp") as tmp_seq:
                seqout = ""
                # This is where the transposing happens
                for line in tmp_seq:
                    seqout += line[idx_outgroup]
                # Write FASTA line
                if fasta:
                    output_fas.write(">"+sample_names[idx_outgroup]+"\n"+seqout+"\n")
                # Pad sequences names and write PHYLIP or NEXUS lines
                padding = (len_longest_name + 3 - len(sample_names[idx_outgroup])) * " "
                if not phylipdisable:
                    output_phy.write(sample_names[idx_outgroup]+padding+seqout+"\n")
                if nexus:
                    output_nex.write(sample_names[idx_outgroup]+padding+seqout+"\n")
                # Print current progress
                print("Outgroup, '{}', added to the matrix(ces).".format(outgroup))
        if nexusbin:
            with open(outfile+".bin.tmp") as bin_tmp_seq:
                seqout = ""
                # This is where the transposing happens
                for line in bin_tmp_seq:
                    seqout += line[idx_outgroup]
                # Write line of binary SNPs to NEXUS
                padding = (len_longest_name + 3 - len(sample_names[idx_outgroup])) * " "
                output_nexbin.write(sample_names[idx_outgroup]+padding+seqout+"\n")
                # Print current progress
                print("Outgroup, '{}', added to the binary matrix.".format(outgroup))
    # Write sequences of the ingroup
    for s in range(0, len(sample_names)):
        if s != idx_outgroup:
            if fasta or nexus or not phylipdisable:
                with open(outfile+".tmp") as tmp_seq:
                    seqout = ""
                    # This is where the transposing happens
                    for line in tmp_seq:
                        seqout += line[s]
                    # Write FASTA line
                    if fasta:
                        output_fas.write(">"+sample_names[s]+"\n"+seqout+"\n")
                    # Pad sequences names and write PHYLIP or NEXUS lines
                    padding = (len_longest_name + 3 - len(sample_names[s])) * " "
                    if not phylipdisable:
                        output_phy.write(sample_names[s]+padding+seqout+"\n")
                    if nexus:
                        output_nex.write(sample_names[s]+padding+seqout+"\n")
                    # Print current progress
                    print("Sample {:d} of {:d}, '{}', added to the nucleotide matrix(ces).".format(
                        s+1, len(sample_names), sample_names[s]))
            if nexusbin:
                with open(outfile+".bin.tmp") as bin_tmp_seq:
                    seqout = ""
                    # This is where the transposing happens
                    for line in bin_tmp_seq:
                        seqout += line[s]
                    # Write line of binary SNPs to NEXUS
                    padding = (len_longest_name + 3 - len(sample_names[s])) * " "
                    output_nexbin.write(sample_names[s]+padding+seqout+"\n")
                    # Print current progress
                    print("Sample {:d} of {:d}, '{}', added to the binary matrix.".format(
                        s+1, len(sample_names), sample_names[s]))
    if not phylipdisable:
        output_phy.close()
    if fasta:
        output_fas.close()
    if nexus:
        output_nex.write(";\nEND;\n")
        output_nex.close()
    if nexusbin:
        output_nexbin.write(";\nEND;\n")
        output_nexbin.close()
    if fasta or nexus or not phylipdisable:
        os.remove(outfile+".tmp")
    if nexusbin:
        os.remove(outfile+".bin.tmp")
    print("\nDone!\n")

if __name__ == "__main__":
    main()
Note: by default a site is kept only if at least 4 samples have a genotype there (-m/--min-samples-locus, default 4), so the VCF should contain at least four samples.
Note: sample names in the VCF should be at most 10 characters long; longer names will be truncated to 10 characters in the downstream (strict PHYLIP format) analysis.
python3 vcf2phylip.py -i sample.out.4DTv.vcf
(The output name is derived from the input file and the -m threshold, e.g. sample.out.4DTv.min4.phy. Note that -o sets the outgroup sample, not an output file name.)
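For the second caveat, a quick check of the sample names (assuming bcftools is installed; the file name matches the command above):
# print any sample name longer than 10 characters (these would be truncated by strict PHYLIP tools)
bcftools query -l sample.out.4DTv.vcf | awk 'length($0) > 10'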
Building the phylogenetic tree
Install PHYLIP:
http://evolution.genetics.washington.edu/phylip.html
The compiled programs live in the exe directory; add it to your PATH.
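A minimal build sketch, assuming the phylip-3.697 source tarball was downloaded from the page above (adjust the version to whatever was actually downloaded):
tar -xzf phylip-3.697.tar.gz
cd phylip-3.697/src
make -f Makefile.unx install        # compiled programs are placed in ../exe
export PATH="$PWD/../exe:$PATH"     # exposes seqboot, dnadist, neighbor, consense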
Run the script phylip_tree.sh to automatically generate the constree (consensus tree) file, then visualize the tree with the FigTree desktop application.
Usage:
bash phylip_tree.sh sample.phy sample_name
phylip_tree.sh
The content is as follows:
# Purpose: automatically generate the .par files needed by PHYLIP
# Usage:   bash phylip_tree.sh sample.phy sample_name
# sample_name is the prefix of the output files
if [ $# -eq 0 ] || [ $# -eq 1 ]; then
    echo "Usage:
bash phylip_tree.sh sample.phy sample_name"
    exit 1
fi
# input files
sample=$1   # the .phy file
simple=$2   # prefix of the output files
# function that writes the .par files
function make_par(){
    # seqboot.par
    echo "$sample
R
1000
Y
9" > $simple.seqboot.par
    # dnadist.par
    echo "$simple.seqboot.out
T
2.3628
M
D
1000
2
Y" > $simple.dnadist.par
    # neighbor.par
    echo "$simple.dnadist.out
M
1000
9
Y" > $simple.neighbor.par
    # consense.par
    echo "$simple.nei.tree
Y" > $simple.consense.par
}
### Explanation of the .par file parameters
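As far as I understand PHYLIP's interactive menus, the .par files above simply answer its prompts in order: the input file name, menu toggles (R 1000 in seqboot for 1000 bootstrap replicates; T 2.3628 in dnadist for the transition/transversion ratio and M D 1000 for the 1000 bootstrapped data sets; M 1000 in neighbor), Y to accept the settings, and an odd random-number seed (9) where required. The script above only defines make_par, so a hedged sketch of how the pipeline might continue (PHYLIP writes to outfile/outtree, which are renamed between steps; the final consensus tree is the constree file mentioned earlier):
make_par
seqboot  < $simple.seqboot.par  && mv outfile $simple.seqboot.out
dnadist  < $simple.dnadist.par  && mv outfile $simple.dnadist.out
neighbor < $simple.neighbor.par && mv outfile $simple.nei.out && mv outtree $simple.nei.tree
consense < $simple.consense.par && mv outfile $simple.cons.out && mv outtree $simple.constree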
Alternatively, build the tree with MEGA7 (reference 1, reference 2, and a guide on using MEGA).
Studies have shown that genome size (GS) is strongly related to the amount of transposable elements (TEs): during selfing, TEs are purged, which makes the genome smaller. (NC reference)
maize TE annotation (Nature)