生信探索

转录组实战01: 从数据下载到定量fastp+STAR

生信交流与合作请关注公众号@生信探索

01.建立工作目录

# Create the project root under the home directory and enter it.
cd ~
mkdir -p Project/Human_16_Asthma_Bulk
cd Project/Human_16_Asthma_Bulk

# Data directories: raw FASTQ downloads and fastp-cleaned reads.
mkdir -p data/rawdata data/cleandata/fastp

# Output directory for STAR; -p keeps this step re-runnable when it already exists.
mkdir -p STAR

02. 环境准备

# Create a dedicated environment named RNA (-y: no confirmation prompt) and switch to it.
micromamba create -y -n RNA
micromamba activate RNA
# Aspera client from the hcc channel (ascp is used for the FASTQ download step).
micromamba install -y -c hcc aspera-cli
# QC, trimming, alignment and BAM tools. Bioconda packages resolve their
# dependencies from conda-forge, so that channel is listed first.
micromamba install -y -c conda-forge -c bioconda fastqc multiqc fastp samtools star

03.准备文件

https://www.gencodegenes.org/human/

mkdir -p ~/DataHub/Genomics/GENCODE/release_42
cd ~/DataHub/Genomics/GENCODE/release_42

# Download the reference genome and its annotation (GENCODE human release 42).
#>>>downGENCODE.sh
# Abort on the first failing command so a broken download is never renamed or unpacked.
set -e
base_url=https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_42

for f in gencode.v42.annotation.gtf.gz GRCh38.p13.genome.fa.gz
do
  # -c resumes a partial file left behind by an interrupted run.
  wget -c "${base_url}/${f}"
  # Prefix with HS. (the names the STAR index step expects).
  mv "${f}" "HS.${f}"
  # -k keeps the .gz, -d decompresses, -f overwrites a stale output from a previous run.
  gzip -kdf "HS.${f}"
done
#<<

nohup zsh downGENCODE.sh &> downGENCODE.sh.log &

04.建立索引

构建star的索引

mkdir -p ~/DataHub/Genomics/star_index/human
cd ~/DataHub/Genomics/star_index/human
#>>>star_index.sh>>>
# n_jobs is defined inside the script: the script is launched in a separate
# `sh` process, which does not see variables set in the interactive shell.
n_jobs=12
index_dir=~/DataHub/Genomics/star_index/human
gencode_dir=~/DataHub/Genomics/GENCODE/release_42

# Build the STAR genome index from the uncompressed FASTA and GTF.
# --sjdbOverhang 100 is STAR's default; the manual suggests read length - 1
# as the ideal value (TODO confirm against the read length of this dataset).
STAR \
  --runMode genomeGenerate \
  --genomeDir "${index_dir}" \
  --genomeFastaFiles "${gencode_dir}/HS.GRCh38.p13.genome.fa" \
  --sjdbOverhang 100 \
  --sjdbGTFfile "${gencode_dir}/HS.gencode.v42.annotation.gtf" \
  --runThreadN "${n_jobs}"
#<<

nohup sh star_index.sh &> star_index.sh.log &

05.下载原始数据

在 https://www.ebi.ac.uk/ena/browser/view/PRJNA229998 页面下载 TSV 文件（filereport_read_run_PRJNA229998_tsv.txt），并放到 data/rawdata 目录下

cd ~/Project/Human_16_Asthma_Bulk/data/rawdata
# Column 9 of the ENA report is split on ';' into one path per line
# (presumably the fastq_aspera column - confirm against the TSV header).
# Only the paired-end files *_1.fastq.gz / *_2.fastq.gz are kept; the dots are
# escaped and the pattern anchored so nothing else can match.
cut -f 9 filereport_read_run_PRJNA229998_tsv.txt \
  | tr ';' '\n' \
  | grep '_[12]\.fastq\.gz$' > fq.txt

#>>>downloadFQ.sh
# Fetch every path listed in fq.txt into the current directory with Aspera.
# read -r keeps backslashes literal; IFS= preserves the line exactly as written.
while IFS= read -r fq_path
do
  ascp -QT -l 300m -P33001 \
    -i ~/micromamba/envs/RNA/etc/asperaweb_id_dsa.openssh \
    "era-fasp@${fq_path}" \
    .
done < fq.txt
#<<
nohup sh downloadFQ.sh &> downloadFQ.log &

# Build check.md5. Column 7 of the ENA report is taken as the ';'-separated
# MD5 list (presumably fastq_md5 - confirm against the TSV header); the last
# two entries of each row are the checksums of the _1/_2 files.
cut -f 7 filereport_read_run_PRJNA229998_tsv.txt \
  | sed '1d' \
  | awk -F ';' '{print $(NF-1)"\n"$NF}' > md5.txt
# Pair each checksum with the file name (7th '/'-separated field of the path)
# and write the canonical "<md5>  <file>" layout that md5sum -c reads.
cut -f 7 -d '/' fq.txt | paste md5.txt - | awk '{print $1 "  " $2}' > check.md5
rm md5.txt

# Verify the checksums to confirm that every file was downloaded completely.
md5sum -c check.md5

05.质控

质量评估

# FastQC on every downloaded run: six threads, reports written to the current directory.
nohup fastqc --threads 6 --outdir . SRR*.fastq.gz > fastqc.log &
# Once FastQC has finished, merge its zip archives into a single MultiQC report.
multiqc *.zip

fastp数据过滤

#>>>fastp.sh>>>
# Trim and filter every paired-end run with fastp.
rawdata_dir=~/Project/Human_16_Asthma_Bulk/data/rawdata
cleandata_dir=~/Project/Human_16_Asthma_Bulk/data/cleandata/fastp
n_jobs=8
# Column 4 of the ENA report, minus the header row, gives one run accession per line.
cut -f 4 "${rawdata_dir}/filereport_read_run_PRJNA229998_tsv.txt" \
  | sed '1d' > "${rawdata_dir}/run_accession.txt"

# The loop must use the *_dir variables defined above; the undefined names
# ${rawdata}/${cleandata} would expand to nothing and point every path at "/".
while IFS= read -r run
do
  fastp \
    --in1 "${rawdata_dir}/${run}_1.fastq.gz" \
    --in2 "${rawdata_dir}/${run}_2.fastq.gz" \
    --out1 "${cleandata_dir}/${run}_1.fastp.fq.gz" \
    --out2 "${cleandata_dir}/${run}_2.fastp.fq.gz" \
    --json "${cleandata_dir}/${run}.fastp.json" \
    --html "${cleandata_dir}/${run}.fastp.html" \
    --report_title "${cleandata_dir}/${run}" \
    --thread "${n_jobs}"
done < "${rawdata_dir}/run_accession.txt"
#<<

cd ~/Project/Human_16_Asthma_Bulk/data/cleandata/fastp
nohup sh fastp.sh &> fastp.sh.log &

06.STAR数据比对和定量

统计比对结果

#>>>star.sh>>>
# Align the fastp-cleaned read pairs of every run with STAR (two-pass mode)
# and produce per-gene counts plus a transcriptome BAM.
rawdata_dir=~/Project/Human_16_Asthma_Bulk/data/rawdata
index_dir=~/DataHub/Genomics/star_index/human
input_dir=~/Project/Human_16_Asthma_Bulk/data/cleandata/fastp
out_dir=~/Project/Human_16_Asthma_Bulk/STAR
n_jobs=8

# run_accession.txt is written by fastp.sh (one run accession per line).
while IFS= read -r run
do
  # The read group carries the run accession, so BAMs from different samples
  # stay distinguishable (a constant "sample" would tag every BAM identically).
  # The output prefix has no trailing separator: files are named
  # <run>Aligned.sortedByCoord.out.bam, <run>ReadsPerGene.out.tab, ...
  STAR \
    --readFilesIn "${input_dir}/${run}_1.fastp.fq.gz" "${input_dir}/${run}_2.fastp.fq.gz" \
    --outSAMattrRGline "ID:${run}" "SM:${run}" PL:ILLUMINA \
    --genomeDir "${index_dir}" \
    --readFilesCommand zcat \
    --runThreadN "${n_jobs}" \
    --twopassMode Basic \
    --outFilterMultimapNmax 20 \
    --alignSJoverhangMin 8 \
    --alignSJDBoverhangMin 1 \
    --outFilterMismatchNmax 999 \
    --outFilterMismatchNoverLmax 0.1 \
    --alignIntronMin 20 \
    --alignIntronMax 1000000 \
    --alignMatesGapMax 1000000 \
    --outFilterType BySJout \
    --outFilterScoreMinOverLread 0.33 \
    --outFilterMatchNminOverLread 0.33 \
    --limitSjdbInsertNsj 1200000 \
    --outFileNamePrefix "${out_dir}/${run}" \
    --outSAMstrandField intronMotif \
    --outFilterIntronMotifs None \
    --alignSoftClipAtReferenceEnds Yes \
    --quantMode TranscriptomeSAM GeneCounts \
    --outSAMtype BAM SortedByCoordinate \
    --outSAMunmapped Within \
    --genomeLoad NoSharedMemory \
    --chimSegmentMin 15 \
    --chimJunctionOverhangMin 15 \
    --chimOutType Junctions SeparateSAMold WithinBAM SoftClip \
    --chimOutJunctionFormat 1 \
    --chimMainSegmentMultNmax 1 \
    --outSAMattributes NH HI AS nM NM ch
done < "${rawdata_dir}/run_accession.txt"
#<<

nohup sh star.sh &> star.sh.log &

整理count数据：用Linux命令合并数据框不够优雅，所以这里还是用Python

from pathlib import Path
import pandas as pd
import datatable as dt

# Merge the per-sample STAR gene-count tables into one matrix:
# one row per gene, one column per sample, written to star_count.csv.gz.

# Directory holding the STAR outputs, relative to the current working directory.
star_dir = Path("STAR")

# Sorted so the sample columns come out in a reproducible order
# (Path.glob yields files in arbitrary order).
count_files = sorted(star_dir.glob("*ReadsPerGene.out.tab"))
if not count_files:
    raise FileNotFoundError(f"no *ReadsPerGene.out.tab files found in {star_dir}")

gene_ids = None
sample_counts = []
for path in count_files:
    # The first 4 rows are skipped (per the STAR manual they hold the N_* summary
    # counters). Column 0 is the gene id and column 1 the count that is kept
    # (the unstranded count according to the STAR manual).
    table = pd.read_csv(path, usecols=[0, 1], sep='\t', skiprows=4, header=None)
    if gene_ids is None:
        # Gene ids are taken from the first file only; all files are assumed to
        # list the same genes in the same order (same index/GTF for every sample).
        gene_ids = table[0].rename('Geneid')
    # Sample name = file name up to "ReadsPerGene...".
    sample_counts.append(table[1].rename(path.name.split('Reads')[0]))

count_df = pd.concat([gene_ids, *sample_counts], axis=1)
dt.Frame(count_df).to_csv('star_count.csv.gz')

比对质量

#>>>star_qc.sh>>>
# Summarise alignment quality: samtools flagstat per sorted BAM, then MultiQC.
# The STAR outputs live under ~/Project/... (the project root created in step 01);
# stop immediately if the directory is missing rather than scanning the wrong place.
cd ~/Project/Human_16_Asthma_Bulk/STAR || exit 1
n_jobs=12
# Glob directly instead of parsing `ls` output.
for bam in *sortedByCoord.out.bam
do
  # <name>.out.bam -> <name>.out.flagstat
  samtools flagstat -@ "${n_jobs}" "${bam}" > "${bam%bam}flagstat"
done
multiqc -o ./ *.flagstat
#<<
nohup zsh star_qc.sh &> star_qc.sh.log &

Reference

https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/

https://github.com/OpenGene/fastp

https://samtools.github.io/

https://www.jianshu.com/p/2fa4f5b870f5

https://www.jianshu.com/p/6da36135e2d1

https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/

https://zhuanlan.zhihu.com/p/360427232