12月份月计划(改)--全外显子测序分析方法构建

参考流程:
生信技能树
(全)WES(全外显子测序)分析流程
WES流程文件解读1
http://userweb.eng.gla.ac.uk/cosmika.goswami/snp_calling/SNPCalling.html

xxx、流程搭建
文章:
数据集:SRP067248

1、数据下载

# Project root for this dataset; every later step starts with `cd "$my_env"`.
my_env=/mnt/d/WHQ/WGS_SRP067248
# Create the directory at the path my_env points to (instead of relative to
# whatever the current directory happens to be), so the two always agree.
mkdir -p "$my_env"

1)登录网站获取下载列表

下载列表

获取aspera地址

该地址保存为ena_fastq.txt(注:两列变一列)

2)数据批量下载

# Build one ascp (Aspera) command per entry of ena_fastq.txt and run them all
# in the background. ena_fastq.txt must sit in the fastq directory and hold one
# fasp path per line.
cd "$my_env"
mkdir -p fastq
cd fastq

# Generator script (the original notes typed this into vim; a here-doc makes
# the step reproducible by copy-paste).
cat > ascp_com1 <<'EOF'
#!/bin/bash
# Print one ascp download command per fasp path in ena_fastq.txt.
# tr removes the carriage returns (shown as ^M) that a list saved with
# Windows CRLF line endings carries; otherwise they end up inside the
# generated commands and make them fail.
tr -d '\r' < ./ena_fastq.txt | while read -r id
do
  # Skip blank lines so no command with an empty address is produced.
  [ -n "$id" ] || continue
  echo "ascp -QT -l 300m -P33001 -i ~/.aspera/connect/etc/asperaweb_id_dsa.openssh era-fasp@${id} ./"
done
EOF
chmod +x ascp_com1
./ascp_com1 > ascp_com2
less ascp_com2
# Safety net: remove any carriage return (^M) still present in ascp_com2.
# Why they appear: https://blog.csdn.net/kehana/article/details/90766084
# How to remove them: https://www.jb51.net/article/142224.htm
# GNU sed understands \r, so the literal Ctrl+V Ctrl+M sequence is not needed.
sed -i 's/\r//g' ascp_com2
chmod +x ascp_com2
# Run the downloads in the background, immune to hangups.
nohup ./ascp_com2 &

3)整理数据

# Record the sample accessions: cut each downloaded file name at its first "_"
# (SRRxxxx_1.fastq.gz -> SRRxxxx) and de-duplicate.
for f in SRR*; do
  [ -e "$f" ] || continue   # glob matched nothing
  printf '%s\n' "${f%%_*}"
done | sort -u > samplelist

# Create one directory per sample and move its paired-end FASTQ files into it.
# The "_" in the glob keeps the new directory itself out of the mv arguments.
while read -r id; do
  mkdir -p "$id"
  mv -- "$id"_* "./$id/"
done < samplelist

# Find incomplete downloads (names containing "partial") and look up
# their fasp paths again so that they can be re-downloaded.
# NOTE(review): the original looked them up in ena_ftp_dir.txt, a file these
# notes never create; ena_fastq.txt (the saved download list) is assumed to be
# the intended source — confirm.
while read -r id; do
  ls "./$id/"
done < samplelist | grep partial | while read -r f; do
  # ${f%%.*} drops every extension: SRRxxxx_1.fastq.gz.partial -> SRRxxxx_1
  grep -F -- "${f%%.*}" ena_fastq.txt
done > faildownloadSRR
# Re-download the files whose transfer did not finish (listed in
# faildownloadSRR, one fasp path per line).
# Generator script (typed into vim in the original notes).
cat > ascp_com3 <<'EOF'
#!/bin/bash
# Print one ascp download command per fasp path in faildownloadSRR.
# tr removes carriage returns (^M) inherited from a CRLF-encoded list.
tr -d '\r' < ./faildownloadSRR | while read -r id
do
  # Skip blank lines so no command with an empty address is produced.
  [ -n "$id" ] || continue
  echo "ascp -QT -l 300m -P33001 -i ~/.aspera/connect/etc/asperaweb_id_dsa.openssh era-fasp@${id} ./"
done
EOF
chmod +x ascp_com3
./ascp_com3 > ascp_com4
# Safety net: remove any carriage return (^M) still present in ascp_com4
# before running it (the original ran the script once before this clean-up
# and once after; only the run after the clean-up is kept).
# Why they appear: https://blog.csdn.net/kehana/article/details/90766084
# How to remove them: https://www.jb51.net/article/142224.htm
# GNU sed understands \r, so the literal Ctrl+V Ctrl+M sequence is not needed.
sed -i 's/\r//g' ascp_com4
chmod +x ascp_com4
# Run the downloads in the background, immune to hangups.
nohup ./ascp_com4 &

4)质控

# Install fastp first: conda install fastp
cd "$my_env"
mkdir -p fastp
cd fastp
# Run fastp on every sample's read pair.
# Outputs: <id>_clean_1.fastq.gz / <id>_clean_2.fastq.gz plus a per-sample
# JSON/HTML report (without -j/-h every sample would overwrite the default
# fastp.json / fastp.html).
while read -r id; do
  in1=(../fastq/"$id"/*_1.fastq.gz)
  in2=(../fastq/"$id"/*_2.fastq.gz)
  # ${id} needs braces: "$id_clean_1" would expand the unset variable
  # "id_clean_1" and every sample would be written to ".fastq.gz".
  fastp -i "${in1[0]}" -I "${in2[0]}" \
    -o "${id}_clean_1.fastq.gz" -O "${id}_clean_2.fastq.gz" \
    -j "${id}_fastp.json" -h "${id}_fastp.html"
done < ../fastq/samplelist

5)构建索引
参考基因组下载(gatk)

cd "$my_env"
mkdir -p index
cd index
# Download the GATK resource bundle (annotation files). Very slow.
# The URLs are quoted so that wget, not the shell, expands the "*".
# NOTE(review): availability of this FTP server should be checked before
# relying on it — confirm.
wget 'ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/*'
mkdir -p beta
cd beta
wget 'ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/beta/*'
cd ..
# Download the reference genome itself (not an index) — do this first.
# It is stored as index/genome_index/gatk_human_hg38.fasta, the path the
# mapping and variant-calling steps read.
mkdir -p genome_index
cd genome_index
wget 'ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/Homo_sapiens_assembly38.fasta.gz'
gunzip Homo_sapiens_assembly38.fasta.gz
mv Homo_sapiens_assembly38.fasta gatk_human_hg38.fasta
bwa index gatk_human_hg38.fasta
# Indexing took about 1 hour.

6)比对

cd "$my_env"
mkdir -p mapping
cd mapping
# Following the tutorial: align a small subset first to find workable settings.
mkdir -p mapping_sub
cd mapping_sub
# Take the first 10000 lines (2500 reads) of each mate and re-compress them,
# so the .gz suffix matches the file content.
zcat ../../fastp/SRR3022344_clean_1.fastq.gz | head -10000 | gzip > SRR3022344_1_10000sub.gz
zcat ../../fastp/SRR3022344_clean_2.fastq.gz | head -10000 | gzip > SRR3022344_2_10000sub.gz
sample="SRR3022344"
# -R passes the read-group line to bwa mem; "-" makes samtools sort read the
# alignments from the pipe explicitly.
bwa mem -R "@RG\tID:$sample\tSM:$sample\tLB:WGS\tPL:Illumina" ../../index/genome_index/gatk_human_hg38.fasta SRR3022344_1_10000sub.gz SRR3022344_2_10000sub.gz \
  | samtools sort -@ 5 -o SRR3022344_10000sub.bam -
samtools view SRR3022344_10000sub.bam | tail -10 | less
# Explanation of the BAM format: https://www.jianshu.com/p/edb814e72270

# Subsample every sample, for practice while the full-size jobs are running.
# Starting directory: $my_env/mapping/mapping_sub, so the fastp directory is
# two levels up (the original "../fastp/" pointed at mapping/fastp).
cd ../../fastp/
mkdir -p fastp_sub
cd fastp_sub
while read -r id; do
  # -maxdepth 1 restricts the search to the fastp directory itself, so files
  # inside fastp_sub are never picked up; patterns are quoted so that find,
  # not the shell, expands them.
  fq1=$(find ../ -maxdepth 1 -name "${id}*_1.*")
  fq2=$(find ../ -maxdepth 1 -name "${id}*_2.*")
  # First 1000 lines (250 reads) of each mate, re-compressed so the .gz
  # suffix matches the content.
  zcat "$fq1" | head -1000 | gzip > "${id}_1_sub1000.gz"
  zcat "$fq2" | head -1000 | gzip > "${id}_2_sub1000.gz"
done < ../../fastq/samplelist

# Align the subsampled reads.
cd ../../mapping/mapping_sub
while read -r id; do
  fq1=$(find ../../fastp/fastp_sub/ -name "${id}_1_*")
  fq2=$(find ../../fastp/fastp_sub/ -name "${id}_2_*")
  bwa mem -t 15 -R "@RG\tID:$id\tSM:$id\tLB:WGS\tPL:Illumina" ../../index/genome_index/gatk_human_hg38.fasta "$fq1" "$fq2" \
    | samtools sort -@ 15 -o "$id.bam" -
done < ../../fastq/samplelist


# Align the complete read sets of all samples.
# Starting directory: $my_env/mapping/mapping_sub. Step up first so that
# mapping_all becomes a sibling of mapping_sub; the ../../ paths below only
# resolve from $my_env/mapping/mapping_all.
cd ..
mkdir -p mapping_all
cd mapping_all
while read -r id; do
  sample=$id
  # Patterns are quoted so that find, not the shell, expands them.
  R1=$(find ../../fastp/ -name "${id}*clean_1*")
  R2=$(find ../../fastp/ -name "${id}*clean_2*")
  bwa mem -t 15 -R "@RG\tID:$sample\tSM:$sample\tLB:WGS\tPL:Illumina" ../../index/genome_index/gatk_human_hg38.fasta "$R1" "$R2" \
    | samtools sort -@ 15 -o "$sample.bam" -
done < ../../fastq/samplelist



bwa mem -R 解释:-R 会在 BAM 文件头中写入一行 @RG(read group)信息,并给每条比对记录加上对应的 RG 标签;如果缺少 read group,GATK 会报错,而 samtools 不会。


image.png

7)简单寻找variation

# Call variants on the small (subsampled) alignments first.
# NOTE(review): the relative paths assume the current directory is two levels
# below $my_env (a sibling of the later ../easy_all); these notes never create
# that directory — confirm and create it before running.
ref=../../index/genome_index/gatk_human_hg38.fasta
# bcftools mpileup replaces the deprecated BCF output of
# "samtools mpileup -ug"; -Ou streams uncompressed BCF into bcftools call.
time bcftools mpileup -Ou -f "$ref" ../../mapping/mapping_sub/*.bam | bcftools call -vmO z -o sub.vcf.gz
# Index the BAM files so they can be viewed in IGV.
for bam in ../../mapping/mapping_sub/*.bam; do
  samtools index "$bam"
done

# Call variants on the full alignments.
mkdir -p ../easy_all
cd ../easy_all
ref=../../index/genome_index/gatk_human_hg38.fasta
# Reads the BAM files from mapping_all and writes all.vcf.gz (the original
# re-used the mapping_sub inputs and the sub.vcf.gz name from the small run).
# bcftools mpileup replaces the deprecated BCF output of
# "samtools mpileup -ug"; -Ou streams uncompressed BCF into bcftools call.
time bcftools mpileup -Ou -f "$ref" ../../mapping/mapping_all/*.bam | bcftools call -vmO z -o all.vcf.gz

你可能感兴趣的:(12月份月计划(改)--全外显子测序分析方法构建)