ChIP-seq双端测序数据分析

Download fastq file

#method 1: sratoolkits
# srr.txt holds one SRA accession per line; each is fetched into <wkd>/sra.
# wkd falls back to the current directory when it is unset, so the
# downloads never end up in /sra by accident.
while read -r line
do
    [ -n "$line" ] || continue    # ignore blank lines
    prefetch -O "${wkd:-.}/sra" "$line"
done < srr.txt
#method 2: axel
#find the links from ebi (srr)
# link.txt holds one download URL per line; axel uses 30 connections each.
while read -r line
do
    [ -n "$line" ] || continue    # ignore blank lines
    axel -n 30 "$line"
done < link.txt

Define

# Thread count handed to the multi-threaded tools further down.
nThread=32
# Bowtie2 index basename (the value given to `bowtie2 -x`).
# Built from $HOME because a "~" inside quotes is never expanded by the
# shell, which would leave bowtie2 looking for a directory literally
# named "~".
bowtie2Index="$HOME/reference.and.annotations/Homo_sapiens/UCSC/hg38/Sequence/Bowtie2Index/hg38"

mkdir

# Create the output directories for trimming, alignment, coverage tracks
# and peak calls; directories that already exist are left alone.
for dir in trim bam bw peak
do
    [ -d "$dir" ] || mkdir -p "$dir"
done

QC

# Optional FastQC step, kept disabled. For every fastq/*.fq.gz file it
# runs fastqc unless fastqc/<name>_fastqc.html already exists.
# NOTE(review): no visible step creates the fastqc/ directory - create it
# before enabling this block.
#ls fastq/*.fq.gz | while read line
#do
#       aa=$(basename $line .fq.gz)
#       if [ ! -e fastqc/${aa}_fastqc.html ];then
#               fastqc -o fastqc -t $nThread $line
#       fi
#done

Loop

# Trim and align every paired-end sample found in fastq/.
# Expected file names: fastq/<sample>_1.fq.gz and fastq/<sample>_2.fq.gz
# Reads the variables bowtie2Index and nThread; writes into trim/ and bam/.
# A step is skipped when its output file already exists.

# Print the sample name for one FASTQ path: the directory and the
# ".fq.gz" suffix are removed, then the trailing "_<mate>" part is dropped.
sample_prefix() {
    local stem
    stem=$(basename "$1" .fq.gz)
    printf '%s\n' "${stem%_*}"
}

for file in fastq/*.fq.gz
do
    [ -e "$file" ] || continue    # an unmatched glob stays literal; skip it
    sample_prefix "$file"
done | sort -u | while read -r sample
do
    if [ ! -e "trim/${sample}_1_val_1.fq.gz" ];then
        trim_galore --paired "fastq/${sample}_1.fq.gz" "fastq/${sample}_2.fq.gz" --gzip -o trim
    fi
    if [ ! -e "bam/${sample}.bam" ];then
        # samtools needs "$nThread" with the "$"; without it the literal
        # word "nThread" was passed as the thread count.
        # The trailing "-" makes samtools sort read the SAM stream from stdin.
        bowtie2 -I 15 -X 1000 -x "$bowtie2Index" -p "$nThread" \
            -1 "trim/${sample}_1_val_1.fq.gz" -2 "trim/${sample}_2_val_2.fq.gz" \
            | samtools sort -@ "$nThread" -O bam -o "bam/${sample}.bam" -
    fi
done

一些可选操作

samtools merge

# Optional: pool replicate BAM files into one BAM per condition.
# samtools merge takes the output file first, then the inputs.
# NOTE(review): 1.bam ... 4.bam look like placeholder names - replace them
# with the real replicate files (the alignment step writes into bam/).
samtools merge WT.bam 1.bam 2.bam
samtools merge KO.bam 3.bam 4.bam

bam2bw

# Without input subtraction: index every BAM in bam/ and write one
# coverage track per BAM to bw/<name>.bw.
for file in bam/*bam
do
    [ -e "$file" ] || continue    # nothing matched the glob
    samtools index -@ "$nThread" "$file"
    id=$(basename "$file" .bam)
    bamCoverage -b "$file" -o "bw/${id}.bw" -p "$nThread"
done
# Compare the ChIP track with its input track. Both tracks are read from
# bw/, which is where the loop above wrote them.
bigwigCompare -p "$nThread" -b1 bw/SMARCC1_arid1a-KO.bw -b2 bw/input_arid1a-KO.bw -o bw/SMARCC1_arid1a-KO-exInput.bw

# With input subtraction: bamCompare needs indexed BAMs, so index
# everything in bam/ first, then compare the ChIP BAM against its input.
for file in bam/*bam
do
    [ -e "$file" ] || continue    # nothing matched the glob
    samtools index -@ "$nThread" "$file"
done
bamCompare -p "$nThread" --bamfile1 bam/SMARCA4_H.bam --bamfile2 bam/input_H.bam --outFileName bw/SMARCA4_H.bw

macs2 callpeak

# Call peaks on every BAM in bam/ in paired-end mode (BAMPE) with the
# human genome size preset; results go to peak/<name>_*.
# The stderr log is written to the current directory; its file name
# (including the "masc2" spelling) is kept exactly as before.
for file in bam/*.bam
do
    [ -e "$file" ] || continue    # nothing matched the glob
    id=$(basename "$file" .bam)
    macs2 callpeak -t "$file" -m 10 30 -p 1e-5 -f BAMPE -g hs -n "${id}" --outdir peak 2>"${id}.masc2.log"
done

te

# deepTools plots written to pic/.
mkdir -p pic    # -p: do not fail when pic/ is left over from an earlier run
nThread=16
size=5000       # flank (bp) on each side of a region / reference point
# Label used in the file names of the TSS plots below. It was never set
# before, so the plots looked for "./pic/.5000.mat.gz" while the matrix
# had been written to "./pic/5000.mat.gz". Override by setting samp first.
samp=${samp:-RNF2}

# One scaled-region matrix, heatmap and profile per BED file in bed/.
for line in bed/*.bed
do
    [ -e "$line" ] || continue    # nothing matched the glob
    id=$(basename "$line" ".bed")
    computeMatrix scale-regions -p "$nThread" -S bw/*.bw -R "$line" -a "$size" -b "$size" -o "pic/${id}.computeMatrix"
    plotHeatmap --matrixFile "pic/${id}.computeMatrix" -o "pic/${id}.WT_KO.point.heatmap.pdf" --yAxisLabel "${id}.KO-WT.signal"
    plotProfile --dpi 720 -m "pic/${id}.computeMatrix" -out "pic/${id}.WT_KO.heatmap.merge.pdf" --plotTitle "${id}" --plotFileFormat pdf --perGroup --yAxisLabel "${id}.Seq.KO-WT.signal"
done

# TSS-centred matrix over bed/RNF2.bed; the matrix file name now matches
# the name the two plotting commands read.
computeMatrix reference-point -p "$nThread" --referencePoint TSS -b "$size" -a "$size" -S bw/*.bw -R bed/RNF2.bed --skipZeros -o "./pic/${samp}.${size}.mat.gz"
plotHeatmap -m "./pic/${samp}.${size}.mat.gz" -out "./pic/${samp}.${size}.heat.png" --colorList green,yellow,red
plotProfile -m "./pic/${samp}.${size}.mat.gz" -out "./pic/${samp}.${size}.profile.png" --perGroup

# H2A119ub tracks (bw/Rif1*H2A119ub.bw) around the reference point of the
# regions in bed/Rif1.bed, 5000 bp on each side; heatmap and profile as PNG.
(
    matrix=./pic/H2A119ub.mat.gz
    computeMatrix reference-point -p 16 --referencePoint TSS -b 5000 -a 5000 -S bw/Rif1*H2A119ub.bw -R bed/Rif1.bed --skipZeros -o "$matrix"
    plotHeatmap -m "$matrix" -out ./pic/H2A119ub.heat.png
    plotProfile -m "$matrix" -out ./pic/H2A119ub.profile.png --perGroup
)

deeptools 可视化TSS上下游5K,peaks位于TSS下游一点

# Signal of every bigWig in bw/ within 5000 bp on each side of the TSS,
# drawn as heatmap and profile, each as PNG and as 720-dpi PDF.
(
    matrix=5000.mat.gz
    computeMatrix reference-point -p 16 --referencePoint TSS -b 5000 -a 5000 -S bw/*.bw -R ~/maos/annotation/chipseq/mm9/ucsc.refseq.jimmy.bed --skipZeros -o "$matrix"
    plotHeatmap -m "$matrix" -out 5000.heat.png
    plotProfile -m "$matrix" -out 5000.profile.png --perGroup
    plotHeatmap --dpi 720 -m "$matrix" -out 5000.heat.pdf --plotFileFormat pdf
    plotProfile --dpi 720 -m "$matrix" -out 5000.profile.pdf --plotFileFormat pdf --perGroup
)
5000.profile.png

5000.heat.png

deeptools 可视化TSS上下游2K,peaks位于TSS下游一点

# Signal of every bigWig in bw/ within 2000 bp on each side of the TSS,
# drawn as heatmap and profile, each as PNG and as 720-dpi PDF.
(
    matrix=2000.mat.gz
    computeMatrix reference-point -p 16 --referencePoint TSS -b 2000 -a 2000 -S bw/*.bw -R ~/maos/annotation/chipseq/mm9/ucsc.refseq.jimmy.bed --skipZeros -o "$matrix"
    plotHeatmap -m "$matrix" -out 2000.heat.png
    plotProfile -m "$matrix" -out 2000.profile.png --perGroup
    plotHeatmap --dpi 720 -m "$matrix" -out 2000.heat.pdf --plotFileFormat pdf
    plotProfile --dpi 720 -m "$matrix" -out 2000.profile.pdf --plotFileFormat pdf --perGroup
)
2000.profile.png

2000.heat.png

ChIPseeker包注释与可视化

ChIPseeker的功能分为三类:

  1. 注释:提取peak附近最近的基因, 注释peak所在区域。
  2. 比较:估计ChIP peak数据集中重叠部分的显著性;整合GEO数据集,以便于将当前结果和已知结果比较。
  3. 可视化:peak的覆盖情况;TSS区域结合的peak的平均表达谱和热图;基因组注释;TSS距离;peak和基因的重叠。

download packages

# Install the Bioconductor packages used below.
# biocLite() has been retired; BiocManager::install() is its replacement.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
}
BiocManager::install(c(
    "ChIPseeker",
    "org.Mm.eg.db",
    "TxDb.Mmusculus.UCSC.mm9.knownGene",
    "clusterProfiler",
    "ReactomePA",
    "DOSE"
))

loading packages

# Attach the analysis packages in the original order, then bind the mm9
# knownGene transcript database to `txdb` for the calls below.
for (pkg in c("ChIPseeker", "org.Mm.eg.db", "clusterProfiler",
              "TxDb.Mmusculus.UCSC.mm9.knownGene")) {
    library(pkg, character.only = TRUE)
}
txdb <- TxDb.Mmusculus.UCSC.mm9.knownGene

code body

# narrowPeak files of the four samples, keyed by the short object names
# used in the rest of the script.
peakFiles <- c(
    rkr = "./peak/Rif1-KO-RNF2_peaks.narrowPeak",
    rwr = "./peak/Rif1-WT-RNF2_peaks.narrowPeak",
    rkh = "./peak/Rif1-KO-H2A119ub_peaks.narrowPeak",
    rwh = "./peak/Rif1-WT-H2A119ub_peaks.narrowPeak"
)
rkr <- readPeakFile(peakFiles[["rkr"]])
rwr <- readPeakFile(peakFiles[["rwr"]])
rkh <- readPeakFile(peakFiles[["rkh"]])
rwh <- readPeakFile(peakFiles[["rwh"]])

# Plots for a single peak set (rkr)
# ChIP peak coverage over the whole genome
covplot(rkr)
# ChIP peak coverage restricted to chr1-chr19, chrX and chrY
selectedChrs <- paste0("chr", c(1:19, "X", "Y"))
covplot(rkr, chrs = selectedChrs)
# Tag matrix and heatmap over promoter windows of 2000 bp on each side
tssFlank <- 2000
promoter <- getPromoters(TxDb = txdb, upstream = tssFlank, downstream = tssFlank)
tagMatrix <- getTagMatrix(rkr, windows = promoter)
tagHeatmap(tagMatrix, xlim = c(-tssFlank, tssFlank), color = "red")

# Annotation
# Annotate each sample on its own (single peak set analysis)
size <- 5000
peakAnnoRKR <- annotatePeak(rkr, tssRegion = c(-size, size), TxDb = txdb, annoDb = "org.Mm.eg.db", verbose = FALSE)
# The WT annotation must come from rwr; it was previously computed from
# rkr, which made it a copy of the KO annotation.
peakAnnoRWR <- annotatePeak(rwr, tssRegion = c(-size, size), TxDb = txdb, annoDb = "org.Mm.eg.db", verbose = FALSE)
#
plotAnnoBar(peakAnnoRKR)
vennpie(peakAnnoRKR)
upsetplot(peakAnnoRKR) # UpSet plots are suited to showing more than five sets.
plotDistToTSS(peakAnnoRKR, title = "Distribution of transcription factor-binding loci\nrelative to TSS")

# Comparison of several peak sets
# Heatmap of ChIP binding to TSS regions
# Each list name now matches its object (the third entry was labelled
# "wkh" although it holds rkh); these names appear as labels in the plots.
peaks <- list(rkr = rkr, rwr = rwr, rkh = rkh, rwh = rwh)
peaks[[1]]
size <- 2000
promoter <- getPromoters(TxDb = txdb, upstream = size, downstream = size)
tagMatrixList <- lapply(peaks, getTagMatrix, windows = promoter)
# Author's note, kept from the original: an out-of-memory failure was seen
# around here - "Error: cannot allocate vector of size 818.0 Mb"
plotAvgProf(tagMatrixList, xlim = c(-size, size))
plotAvgProf(tagMatrixList, xlim = c(-size, size), conf = 0.95, resample = 500, facet = "row")
tagHeatmap(tagMatrixList, xlim = c(-size, size), color = NULL)

# Annotate all peak sets in one pass
size <- 5000
peakAnnoList <- lapply(peaks, function(peakSet) {
    annotatePeak(peakSet, TxDb = txdb, tssRegion = c(-size, size), verbose = FALSE)
})
plotAnnoBar(peakAnnoList)
plotDistToTSS(peakAnnoList, title = "Distribution of transcription factor-binding loci\nrelative to TSS")
# Overlap of peaks and annotated genes
genes <- lapply(peakAnnoList, function(anno) {
    as.data.frame(anno)$geneId
})
vennplot(genes)

# KEGG enrichment compared across the peak sets
names(genes) <- sub("_", "\n", names(genes))
# organism = "mmu": the gene IDs come from the mouse annotation, while
# enrichKEGG defaults to human ("hsa") when no organism is given.
compKEGG <- compareCluster(geneCluster = genes, fun = "enrichKEGG", organism = "mmu",
                           pvalueCutoff = 0.05, pAdjustMethod = "BH")

Distribution of transcription factor-binding loci relative to TSS

Distribution of transcription factor-binding loci relative to TSS
2.png

Overlap of peaks and annotated genes

Overlap of peaks and annotated genes

你可能感兴趣的:(Chipseq双端测序数据分析)