pixy用于从vcf变异文件中计算pi(π),Dxy和Fst的值。
1.使用conda安装pixy(项目主页见GitHub)
# Install pixy from the conda-forge channel and htslib from the bioconda channel.
conda install -c conda-forge pixy
conda install -c bioconda htslib
2.用法示例(详见官方文档)
一键计算pi,fst,dxy
# Compute pi, fst and dxy in a single pixy run on the given VCF and populations
# file, passing 10000 as --window_size and 8 as --n_cores.
pixy --stats pi fst dxy \
--vcf data/vcf/ag1000/chrX_36Ag_allsites.vcf.gz \
--populations Ag1000_sampleIDs_popfile.txt \
--window_size 10000 \
--n_cores 8
通过指定bed来计算
# Compute the same three statistics on the same VCF and populations file, but
# pass --bed_file genomic_windows.bed instead of --window_size.
pixy --stats pi fst dxy \
--vcf data/vcf/ag1000/chrX_36Ag_allsites.vcf.gz \
--populations Ag1000_sampleIDs_popfile.txt \
--bed_file genomic_windows.bed
参数 --chromosomes
可以指定染色体;有多条染色体时使用逗号分隔,例如:--chromosomes Chr1,Chr3,Chr10
--populations
Ag1000_sampleIDs_popfile.txt 文件的格式如下:
ERS223827 BFS
ERS223759 BFS
ERS223750 BFS
ERS223967 AFS
ERS223970 AFS
ERS223924 AFS
ERS224300 AFS
ERS224168 KES
ERS224314 KES
第一列是vcf文件中的样本名称,第二列是分组信息,两列之间用制表符分隔。
3.可视化
使用ggplot2(以下代码还用到readr、dplyr和tidyr,可通过library(tidyverse)一并加载)
#' Combine pixy output files into one long-format data frame
#'
#' Reads each tab-delimited file, reshapes every column other than the
#' population, chromosome and window-position columns into `statistic` /
#' `value` pairs, and row-binds the per-file results.
#'
#' The statistic type of a file is derived from its path: everything up to
#' the last underscore and a trailing ".txt" are stripped. A file whose type
#' is "pi" must have a `pop` column, which is renamed to `pop1` and paired
#' with a missing `pop2`; any other file must already have `pop1` and `pop2`.
#'
#' @param pixy_files Character vector of paths to pixy output files. Must
#'   contain at least one path.
#' @return A data frame with the id columns plus `statistic` and `value`,
#'   sorted by pop1, pop2, chromosome, window_pos_1 and statistic.
pixy_to_long <- function(pixy_files){
  # Fail early with a clear message instead of the cryptic error that
  # looping over 1:0 used to produce.
  if (length(pixy_files) == 0) {
    stop("pixy_files must contain at least one file path", call. = FALSE)
  }

  read_one <- function(pixy_file) {
    # The dot is escaped and anchored so only a real ".txt" suffix is removed.
    stat_file_type <- gsub("^.*_|\\.txt$", "", pixy_file)
    df <- read_delim(pixy_file, delim = "\t")

    if (stat_file_type == "pi") {
      df %>%
        pivot_longer(cols = -c(pop, window_pos_1, window_pos_2, chromosome),
                     names_to = "statistic", values_to = "value") %>%
        rename(pop1 = pop) %>%
        # Logical NA on purpose: it combines with whatever type pop2 has in
        # the two-population files when the rows are bound together.
        mutate(pop2 = NA)
    } else {
      df %>%
        pivot_longer(cols = -c(pop1, pop2, window_pos_1, window_pos_2,
                               chromosome),
                     names_to = "statistic", values_to = "value")
    }
  }

  # pivot_longer() emits rows in a different order than gather() did; the
  # final arrange() makes the returned order deterministic either way.
  bind_rows(lapply(pixy_files, read_one)) %>%
    arrange(pop1, pop2, chromosome, window_pos_1, statistic)
}
绘制各统计量在所有染色体上的分布图
# Labeller for the statistic facet strips: maps each pixy statistic column
# name to a label string, which label_parsed turns into a parsed expression
# (so the bracketed parts are drawn as subscripts).
pixy_labeller <- as_labeller(
  x = c(
    "avg_pi" = "pi",
    "avg_dxy" = "D[XY]",
    "avg_wc_fst" = "F[ST]"
  ),
  default = label_parsed
)
# Plot the summary statistics across all chromosomes: one facet row per
# statistic, one facet column per chromosome, one point per window placed at
# the window midpoint.
# NOTE(review): `pixy_df` is not assigned at top level in this file; it is
# presumably the result of pixy_to_long(pixy_files) -- confirm before running.
pixy_df %>%
  # Alternate two colour groups between neighbouring chromosomes so adjacent
  # facets are visually distinct. "X" is grouped with the odd-numbered
  # chromosomes; every other name falls into the "even" group.
  # as.numeric() yields NA (with a warning) for non-numeric names such as
  # "X"; only that coercion warning is silenced here.
  mutate(chrom_color_group = case_when(
    suppressWarnings(as.numeric(chromosome)) %% 2 != 0 ~ "odd",
    chromosome == "X" ~ "odd",
    TRUE ~ "even"
  )) %>%
  # NOTE(review): the levels are hard-coded to 1-22, X and Y; chromosome
  # names outside this set become NA -- adjust for other naming schemes.
  mutate(chromosome = factor(chromosome, levels = c(1:22, "X", "Y"))) %>%
  filter(statistic %in% c("avg_pi", "avg_dxy", "avg_wc_fst")) %>%
  ggplot(aes(x = (window_pos_1 + window_pos_2) / 2, y = value,
             color = chrom_color_group)) +
  geom_point(size = 0.5, alpha = 0.5, stroke = 0) +
  facet_grid(statistic ~ chromosome,
             scales = "free_y", switch = "x", space = "free_x",
             labeller = labeller(statistic = pixy_labeller,
                                 value = label_value)) +
  xlab("Chromosome") +
  ylab("Statistic Value") +
  # Named values keep the same colours as before the group labels were
  # corrected: odd-numbered chromosomes and X grey, the rest black.
  scale_color_manual(values = c(odd = "grey50", even = "black")) +
  theme_classic() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        panel.spacing = unit(0.1, "cm"),
        strip.background = element_blank(),
        strip.placement = "outside",
        legend.position = "none") +
  scale_x_continuous(expand = c(0, 0)) +
  # NOTE(review): the lower y limit is fixed at 0, so any negative values
  # fall outside the scale -- confirm this is intended for avg_wc_fst.
  scale_y_continuous(expand = c(0, 0), limits = c(0, NA))