处理NGS的新神器systemPipeR

首先附上软件链接:https://www.bioconductor.org/packages/release/bioc/html/systemPipeR.html
http://www.bioconductor.org/help/course-materials/2015/BioC2015/systemPipeR_Presentation.html
systemPipeR可以系统的,重头到尾的分析NGS数据
目前的Workflow templates有:
1.RNA-Seq Workflow
2.ChIP-Seq Workflow
3.VAR-Seq Workflow
4.Ribo-Seq Workflow
由于交作业是采用Chip-seq,所以这里分享我的收获心得,并且大部分代码都可以在这个包的readme上查到,那么我会对有些代码进行说明介绍和结果的展示

加载包

首先先安装包

options(BioC_mirror="http://mirrors.ustc.edu.cn/bioc/")
BiocManager::install("systemPipeRdata")
BiocManager::install("systemPipeR")
BiocManager::install("ChIPseeker")
BiocManager::install("ChIPpeakAnno")
BiocManager::install("seqLogo")
BiocManager::install("BCRANK")
install.packages("tidyr")
install.packages("graphlayouts")
install.packages("ggridges")

library(systemPipeR)
library(systemPipeRdata)

加载数据

然后我是利用包内的示例数据

setwd(choose.dir())
genWorkenvir(workflow = "chipseq")#在工作目录中自动创建了chipseq目录及子目录,并生成数据
setwd("chipseq")

这一步是生成示例数据


示例数据

数据质检

那么第一步,我们读取测序的fastq文件

targetspath <- system.file("extdata", "targets_chip.txt", package = "systemPipeR")
targets <- read.delim(targetspath, comment.char = "#")
targets[1:4, -c(5, 6)]

其中targets_chip.txt是数据信息


targets_chip.txt

下一步就是对测序的reads质量进行修剪

args <- systemArgs(sysma = "param/trim.param", mytargets = "targets_chip.txt")
filterFct <- function(fq, cutoff = 20, Nexceptions = 0) {
    qcount <- rowSums(as(quality(fq), "matrix") <= cutoff, na.rm = TRUE)
    fq[qcount <= Nexceptions]
    # 保留读取 Phred 分数 > = cutoff 为20的 reads,cutoff可以自定义
}
preprocessReads(args = args, Fct = "filterFct(fq, cutoff=20, Nexceptions=0)", 
    batchsize = 1e+05)
writeTargetsout(x = args, file = "targets_chip_trim.txt", overwrite = TRUE)

这一步相当于做fastqc和过滤
再来就是生成report

args <- systemArgs(sysma = "param/trim.param", mytargets = "targets_chip.txt")
library(BiocParallel)
library(batchtools)
f <- function(x) {
  library(systemPipeR)
  args <- systemArgs(sysma = "param/trim.param", mytargets = "targets_chip.txt")
  seeFastq(fastq = infile1(args)[x], batchsize = 1e+05, klength = 8)
}

resources <- list(walltime = 120, ntasks = 1, ncpus = 4, memory = 1024)
fqlist <- lapply(seq(along = trim), f)
pdf("./results/fastqReport.pdf", height = 18, width = 4 * length(fqlist))
seeFastqPlot(unlist(fqlist, recursive = FALSE))
dev.off()

结果

上图所示结果即为每个fastq文件的质检结果,与fastqc类似
而param/trim.param是一些参数
param/trim.param

比对

这一步建议在工作站上使用,过程较为长

rgs <- systemArgs(sysma = "param/bowtieSE.param", mytargets = "targets_chip_trim.txt")

sysargs(args)[1]  # 第一个 FASTQ 文件的命令行参数

system("bowtie2-build ./data/tair10.fasta ./data/tair10.fasta")
# 对基因组文件建立索引

resources <- list(walltime = 120, ntasks = 1, ncpus = cores(args), 
    memory = 1024)

reg <- clusterRun(args, conffile = ".batchtools.conf.R", Njobs = 18, 
    template = "batchtools.slurm.tmpl", runid = "01", resourceList = resources)

getStatus(reg = reg)
waitForJobs(reg = reg)
writeTargetsout(x = args, file = "targets_bam.txt", overwrite = TRUE)

#检测bam文件是否生成完
file.exists(outpaths(args))

其中param/bowtieSE.param,targets_chip_trim.txt和param/trim.param一样是一些参数,targets_chip_trim.txt与targets_chip.txt一样,只不过是修剪过的
而targets_bam.txt是输出的bam文件信息
接着我们可以统计下mapping的信息

read_statsDF <- alignStats(args = args)

write.table(read_statsDF, "results/alignStats.xls", row.names = FALSE, 
    quote = FALSE, sep = "\t")

read.delim("results/alignStats.xls")

peak calling

我们用MACS2来call peak
首先是要合并我们比对的bam文件

args <- systemArgs(sysma = NULL, mytargets = "targets_bam.txt")

args_merge <- mergeBamByFactor(args, overwrite = TRUE)

writeTargetsout(x = args_merge, file = "targets_mergeBamByFactor.txt", 
    overwrite = TRUE)

对于没有参考的call peaks

args <- systemArgs(sysma = "param/macs2_noinput.param", mytargets = "targets_mergeBamByFactor.txt")

sysargs(args)[1]  

runCommandline(args)
file.exists(outpaths(args))

writeTargetsout(x = args, file = "targets_macs.txt", overwrite = TRUE)

这是param/macs2_noinput.param的内容


param/macs2_noinput.param

那么有参的情况

writeTargetsRef(infile = "targets_mergeBamByFactor.txt", outfile = "targets_bam_ref.txt", 
    silent = FALSE, overwrite = TRUE)

args_input <- systemArgs(sysma = "param/macs2.param", mytargets = "targets_bam_ref.txt")

sysargs(args_input)[1]

runCommandline(args_input)
file.exists(outpaths(args_input))

writeTargetsout(x = args_input, file = "targets_macs_input.txt", 
    overwrite = TRUE)

我们注意观察到,无参和有参的区别是有没有这一句

writeTargetsRef(infile = "targets_mergeBamByFactor.txt", outfile = "targets_bam_ref.txt", 
    silent = FALSE, overwrite = TRUE)

有参的利用比对好的bam文件做成有参的bam_ref文件
第二个区别就是参数区别,无参采用param/macs2_noinput.param,有参采用param/macs2.param


targets_bam_ref.txt

利用ChIPpeakAnno注释

library(ChIPpeakAnno)
library(GenomicFeatures)

args <- systemArgs(sysma = "param/annotate_peaks.param", mytargets = "targets_macs.txt")

# txdb <- loadDb('./data/tair10.sqlite')
txdb <- makeTxDbFromGFF(file = "data/tair10.gff", format = "gff", 
    dataSource = "TAIR", organism = "Arabidopsis thaliana")

ge <- genes(txdb, columns = c("tx_name", "gene_id", "tx_type"))

for (i in seq(along = args)) {
    peaksGR <- as(read.delim(infile1(args)[i], comment = "#"), 
        "GRanges")
    annotatedPeak <- annotatePeakInBatch(peaksGR, AnnotationData = genes(txdb))
    df <- data.frame(as.data.frame(annotatedPeak), as.data.frame(values(ge[values(annotatedPeak)$feature, 
        ])))
    write.table(df, outpaths(args[i]), quote = FALSE, row.names = FALSE, 
        sep = "\t")
}

writeTargetsout(x = args, file = "targets_peakanno.txt", overwrite = TRUE)

这里的示例数据的注释是 Arabidopsis thaliana GFF文件

利用CHIPseeker注释

library(ChIPseeker)

args <- systemArgs(sysma = "param/annotate_peaks.param", mytargets = "targets_macs.txt")

# txdb <- loadDb('./data/tair10.sqlite')
txdb <- makeTxDbFromGFF(file = "data/tair10.gff", format = "gff", 
    dataSource = "TAIR", organism = "Arabidopsis thaliana")

ge <- genes(txdb, columns = c("tx_name", "gene_id", "tx_type"))

for (i in seq(along = args)) {
    peakAnno <- annotatePeak(infile1(args)[i], TxDb = txdb, verbose = FALSE)
    df <- as.data.frame(peakAnno)
    write.table(df, outpaths(args[i]), quote = FALSE, row.names = FALSE, 
        sep = "\t")
}

writeTargetsout(x = args, file = "targets_peakanno.txt", overwrite = TRUE)

#看一下注释情况,此处举个例子
peak <- readPeakFile(infile1(args)[1])

covplot(peak, weightCol = "X.log10.pvalue.")
peakHeatmap(outpaths(args)[1], TxDb = txdb, upstream = 1000, 
    downstream = 1000, color = "red")

plotAvgProf2(outpaths(args)[1], TxDb = txdb, upstream = 1000, 
    downstream = 1000, xlab = "Genomic Region (5'->3')", ylab = "Read Count Frequency")

对overlap区段的peak进行计数

library(GenomicRanges)

args <- systemArgs(sysma = "param/count_rangesets.param", mytargets = "targets_macs.txt")

args_bam <- systemArgs(sysma = NULL, mytargets = "targets_bam.txt")

bfl <- BamFileList(outpaths(args_bam), yieldSize = 50000, index = character())

countDFnames <- countRangeset(bfl, args, mode = "Union", ignore.strand = TRUE)

writeTargetsout(x = args, file = "targets_countDF.txt", overwrite = TRUE)

差异peaks分析

这里利用的是rundiff()这个函数,来比较实验组与对照组的差异peaks

args_diff <- systemArgs(sysma = "param/rundiff.param", mytargets = "targets_countDF.txt")

cmp <- readComp(file = args_bam, format = "matrix")

dbrlist <- runDiff(args = args_diff, diffFct = run_edgeR, targets = targetsin(args_bam), 
    cmp = cmp[[1]], independent = TRUE, dbrfilter = c(Fold = 2, 
        FDR = 1))

writeTargetsout(x = args_diff, file = "targets_rundiff.txt", 
    overwrite = TRUE)

注意可在dbrfilter = c(Fold = 2,
FDR = 1)这里设置fold change和FDR

GO富集

对peaks富集的基因做GO功能富集

args <- systemArgs(sysma = "param/macs2.param", mytargets = "targets_bam_ref.txt")

args_anno <- systemArgs(sysma = "param/annotate_peaks.param", 
    mytargets = "targets_macs.txt")

annofiles <- outpaths(args_anno)

gene_ids <- sapply(names(annofiles), function(x) unique(as.character(read.delim(annofiles[x])[, 
    "gene_id"])), simplify = FALSE)

load("data/GO/catdb.RData")

BatchResult <- GOCluster_Report(catdb = catdb, setlist = gene_ids, 
    method = "all", id_type = "gene", CLSZ = 2, cutoff = 0.9, 
    gocats = c("MF", "BP", "CC"), recordSpecGO = NULL)

Motif 分析

我们的目的是查看蛋白绑定在什么样的DNA区段,以及它有什么样的功能

library(Biostrings)
library(seqLogo)
library(BCRANK)

args <- systemArgs(sysma = "param/annotate_peaks.param", mytargets = "targets_macs.txt")

rangefiles <- infile1(args)

for (i in seq(along = rangefiles)) {
    df <- read.delim(rangefiles[i], comment = "#")
    peaks <- as(df, "GRanges")

    names(peaks) <- paste0(as.character(seqnames(peaks)), "_", 
        start(peaks), "-", end(peaks))

    peaks <- peaks[order(values(peaks)$X.log10.pvalue., decreasing = TRUE)]

    pseq <- getSeq(FaFile("./data/tair10.fasta"), peaks)
    names(pseq) <- names(peaks)
    writeXStringSet(pseq, paste0(rangefiles[i], ".fasta"))
}

#可视化
BCRANKout <- bcrank(paste0(rangefiles[1], ".fasta"), restarts = 25, 
    use.P1 = TRUE, use.P2 = TRUE)

toptable(BCRANKout)

topMotif <- toptable(BCRANKout, 1)

weightMatrix <- pwm(topMotif, normalize = FALSE)

weightMatrixNormalized <- pwm(topMotif, normalize = TRUE)

pdf("results/seqlogo.pdf")
seqLogo(weightMatrixNormalized)
dev.off()

新版的

当然,升级到R3.6以上,systemPipeR有一些新用法,原理是一样的

Before start, Make sure following packages are already installed:
#options(BioC_mirror="http://mirrors.ustc.edu.cn/bioc/")
#BiocManager::install("systemPipeRdata")
#BiocManager::install("systemPipeR")
#BiocManager::install("ChIPseeker")
#BiocManager::install("ChIPpeakAnno")
#BiocManager::install("seqLogo")
#BiocManager::install("BCRANK")
#install.packages("tidyr")
#install.packages("graphlayouts")
#install.packages("ggridges")

library(systemPipeR)
library(systemPipeRdata)
#####################################################
## 1. Generate workflow environment
#####################################################
setwd(choose.dir())
genWorkenvir(workflow = "chipseq")#在工作目录中自动创建了chipseq目录及子目录,并生成了实验数据
setwd("chipseq")

#####################################################
## 2. Read preprocessing
#####################################################
## 2.1 Experiment definition provided by targets file
## The targets file defines all FASTQ files and sample comparisons of ## the analysis workflow. 
targetspath <- system.file("extdata", "targets_chip.txt", package = "systemPipeR")
targets <- read.delim(targetspath, comment.char = "#")
targets[1:4, -c(5, 6)]

## 2.2 Read quality filtering and trimming
## The following example shows how one can design a custom read preprocessing
## function using utilities provided by the ShortRead package, and then apply it with
## preprocessReads in batch mode to all FASTQ samples referenced in the
## corresponding SYSargs2 instance (trim object below). More detailed information
## on read preprocessing is provided in systemPipeR's main vignette.
## First, we construct SYSargs2 object from cwl and yml param and targets files.
dir_path <- system.file("extdata/cwl/preprocessReads/trim-se", package = "systemPipeR")
trim <- loadWF(targets = targetspath, wf_file = "trim-se.cwl", input_file = "trim-se.yml", dir_path = dir_path)
trim <- renderWF(trim, inputvars = c(FileName = "_FASTQ_PATH1_", SampleName = "_SampleName_"))
output(trim)[1:2]
## Next, we execute the code for trimming all the raw data.
filterFct <- function(fq, cutoff = 20, Nexceptions = 0) {
  qcount <- rowSums(as(quality(fq), "matrix") <= cutoff, na.rm = TRUE)
  fq[qcount <= Nexceptions]
  # Retains reads where Phred scores are >= cutoff with N
  # exceptions
}
preprocessReads(args = trim, Fct = "filterFct(fq, cutoff=20, Nexceptions=0)", 
                batchsize = 1e+05)
writeTargetsout(x = trim, file = "targets_chip_trim.txt", step = 1, 
                new_col = "FileName", new_col_output_index = 1, overwrite = TRUE)

## 2.3 FASTQ quality report
## The following seeFastq and seeFastqPlot functions generate and plot a series of
## useful quality statistics for a set of FASTQ files including per cycle quality box 
## plots, base proportions, base-level quality trends, relative k-mer diversity, length
## and occurrence distribution of reads, number of reads above quality cutoffs and mean
## quality distribution. The results are written to a PDF file named fastqReport.pdf.
## Parallelization of FASTQ quality report via scheduler (e.g. Slurm) across several
## compute nodes.
library(BiocParallel)
library(batchtools)
f <- function(x) {
  targets <- system.file("extdata", "targets_chip.txt", package = "systemPipeR")
  dir_path <- system.file("extdata/cwl/preprocessReads/trim-se", 
                          package = "systemPipeR")
  trim <- loadWorkflow(targets = targets, wf_file = "trim-se.cwl", 
                       input_file = "trim-se.yml", dir_path = dir_path)
  trim <- renderWF(trim, inputvars = c(FileName = "_FASTQ_PATH1_", 
                                       SampleName = "_SampleName_"))
  seeFastq(fastq = infile1(trim)[x], batchsize = 1e+05, klength = 8)
}
resources <- list(walltime = 120, ntasks = 1, ncpus = 4, memory = 1024)
fqlist <- lapply(seq(along = trim), f)
pdf("./results/fastqReport.pdf", height = 18, width = 4 * length(fqlist))
seeFastqPlot(unlist(fqlist, recursive = FALSE))
dev.off()

#####################################################
## 3   Alignments 
#####################################################
3.1 Read mapping with Bowtie2
#The NGS reads of this project will be aligned with Bowtie2 against the reference
## genome sequence (Langmead and Salzberg 2012). The parameter settings of the
## aligner are defined in the bowtie2-index.cwl and bowtie2-index.yml files. In
## ChIP-Seq experiments it is usually more appropriate to eliminate reads mapping to
## multiple locations. To achieve this, users want to remove the argument setting
## -k 50 non-deterministic in the configuration files.
# Building the index:
dir_path <- system.file("extdata/cwl/bowtie2/bowtie2-idx", package = "systemPipeR")
idx <- loadWorkflow(targets = NULL, wf_file = "bowtie2-index.cwl",
input_file = "bowtie2-index.yml", dir_path = dir_path)
idx <- renderWF(idx)
idx
cmdlist(idx)
# Run in single machine
#runCommandline(idx, make_bam = FALSE)   # Skip, This must run on Linux
targets <- system.file("extdata", "targets_chip.txt", package = "systemPipeR")
dir_path <- system.file("extdata/cwl/bowtie2/bowtie2-se", package = "systemPipeR")
args <- loadWF(targets = targets, wf_file = "bowtie2-mapping-se.cwl",
input_file = "bowtie2-mapping-se.yml", dir_path = dir_path)
args <- renderWF(args, inputvars = c(FileName = "_FASTQ_PATH1_", SampleName = "_SampleName_"))
args
cmdlist(args)[1:2]
run the alignments sequentially on a single system.
args <- runCommandline(args, force = F)
#Check whether all BAM files have been created and write out the new targets file.
writeTargetsout(x = args, file = "targets_bam.txt", step = 1, new_col = "FileName", new_col_output_index = 1, overwrite = TRUE)
outpaths <- subsetWF(args, slot = "output", subset = 1, index = 1)
file.exists(outpaths)

# 3.2 Read and alignment stats
read_statsDF <- alignStats(args = args)
write.table(read_statsDF, "results/alignStats.xls", row.names = FALSE,quote = FALSE, sep = "\t")
read.delim("results/alignStats.xls")

#####################################################
## 4   Peak calling with MACS2 (Skip this step)
#####################################################
# Merging BAM files of technical and/or biological replicates can improve the
# sensitivity of the peak calling by increasing the depth of read coverage. The
# mergeBamByFactor function merges BAM files based on grouping information
# specified by a factor, here the Factor column of the imported targets file. It also
# returns an updated SYSargs2 object containing the paths to the merged BAM files
# as well as to any unmerged files without replicates. This step can be skipped if
# merging of BAM files is not desired.
# 4.1 Merge BAM files of replicates prior to peak calling
dir_path <- system.file("extdata/cwl/mergeBamByFactor", package = "systemPipeR")
args <- loadWF(targets = "targets_bam.txt", wf_file = "merge-bam.cwl", 
input_file = "merge-bam.yml", dir_path = dir_path)
args <- renderWF(args, inputvars = c(FileName = "_BAM_PATH_", SampleName = "_SampleName_"))
args_merge <- mergeBamByFactor(args = args, overwrite = TRUE)
writeTargetsout(x = args_merge, file = "targets_mergeBamByFactor.txt", 
step = 1, new_col = "FileName", new_col_output_index = 1, 
overwrite = TRUE)
# 4.2 Peak calling without input/reference sample
# MACS2 can perform peak calling on ChIP-Seq data with and without input samples
# (Zhang et al. 2008). The following performs peak calling without input on all samples
# specified in the corresponding args object. Note, due to the small size of the
# sample data, MACS2 needs to be run here with the nomodel setting. For real data
# sets, users want to remove this parameter in the corresponding *.param file(s).
dir_path <- system.file("extdata/cwl/MACS2/MACS2-noinput/", package = "systemPipeR")
args <- loadWF(targets = "targets_mergeBamByFactor.txt", wf_file = "macs2.cwl", 
input_file = "macs2.yml", dir_path = dir_path)
args <- renderWF(args, inputvars = c(FileName = "_FASTQ_PATH1_",                                     SampleName = "_SampleName_"))

runCommandline(args, make_bam = FALSE, force = T)
outpaths <- subsetWF(args, slot = "output", subset = 1, index = 1)
file.exists(outpaths)
writeTargetsout(x = args, file = "targets_macs.txt", step = 1, 
new_col = "FileName", new_col_output_index = 1, overwrite = TRUE)
# 4.3 Peak calling with input/reference sample
# To perform peak calling with input samples, they can be most conveniently specified
# in the SampleReference column of the initial targets file. The writeTargetsRef
# function uses this information to create a targets file intermediate for running
# MACS2 with the corresponding input samples.
writeTargetsRef(infile = "targets_mergeBamByFactor.txt", outfile = "targets_bam_ref.txt", 
silent = FALSE, overwrite = TRUE)
dir_path <- system.file("extdata/cwl/MACS2/MACS2-input/", package = "systemPipeR")
args_input <- loadWF(targets = "targets_bam_ref.txt", wf_file = "macs2-input.cwl", 
input_file = "macs2.yml", dir_path = dir_path)
args_input <- renderWF(args_input, inputvars = c(FileName1 = "_FASTQ_PATH1_", 
FileName2 = "_FASTQ_PATH2_", SampleName = "_SampleName_"))
cmdlist(args_input)[1]
## Run
args_input <- runCommandline(args_input, make_bam = FALSE, force = T)
outpaths_input <- subsetWF(args_input, slot = "output", subset = 1, 
index = 1)
file.exists(outpaths_input)
writeTargetsout(x = args_input, file = "targets_macs_input.txt", 
step = 1, new_col = "FileName", new_col_output_index = 1, 
overwrite = TRUE)
# The peak calling results from MACS2 are written for each sample to separate files in
# the results directory. They are named after the corresponding files with extensions
# used by MACS2.
# 4.4 Identify consensus peaks
# The following example shows how one can identify consensus preaks among two
# peak sets sharing either a minimum absolute overlap and/or minimum relative
# overlap using the subsetByOverlaps or olRanges functions, respectively. Note, the
# latter is a custom function imported below by sourcing it.
outpaths <- subsetWF(args, slot = "output", subset = 1, index = 1)  ## escolher um dos outputs index
peak_M1A <- outpaths["M1A"]
peak_M1A <-as(read.delim(peak_M1A, comment = "#")[, 1:3], "GRanges")
peak_A1A <- outpaths["A1A"]
peak_A1A <- as(read.delim(peak_A1A, comment = "#")[, 1:3], "GRanges")
(myol1 <- subsetByOverlaps(peak_M1A, peak_A1A, minoverlap = 1))
# Returns any overlap
myol2 <- olRanges(query = peak_M1A, subject = peak_A1A, output = "gr")
# Returns any overlap with OL length information
myol2[values(myol2)["OLpercQ"][, 1] >= 50]
# Returns only query peaks with a minimum overlap of 50%

## 5. Annotate peaks with genomic context
#####################################################
## 5.1 Annotation with ChIPpeakAnno package
## The following annotates the identified peaks with genomic context information
## using the ChIPpeakAnno and ChIPseeker packages, respectively (Zhu et al. 2010;
## Yu, Wang, and He 2015).
library(ChIPpeakAnno)
library(GenomicFeatures)
dir_path <- system.file("extdata/cwl/annotate_peaks", package = "systemPipeR")
# 将targets_macs.txt复制到chipseq目录中
args <- loadWF(targets = "targets_macs.txt", wf_file = "annotate-peaks.cwl", 
               input_file = "annotate-peaks.yml", dir_path = dir_path)
args <- renderWF(args, inputvars = c(FileName = "_FASTQ_PATH1_", 
                                     SampleName = "_SampleName_"))

txdb <- makeTxDbFromGFF(file = "data/tair10.gff", format = "gff", 
                        dataSource = "TAIR", organism = "Arabidopsis thaliana")
ge <- genes(txdb, columns = c("tx_name", "gene_id", "tx_type"))
# 将Peaks中的文件复制到chipseq/results目录中
for (i in seq(along = args)) {
  peaksGR <- as(read.delim(infile1(args)[i], comment = "#"), 
                "GRanges")
  annotatedPeak <- annotatePeakInBatch(peaksGR, AnnotationData = genes(txdb))
  df <- data.frame(as.data.frame(annotatedPeak), as.data.frame(values(ge[values(annotatedPeak)$feature, 
                                                                         ])))
  outpaths <- subsetWF(args, slot = "output", subset = 1, index = 1)
  write.table(df, outpaths[i], quote = FALSE, row.names = FALSE, 
              sep = "\t")
}
writeTargetsout(x = args, file = "targets_peakanno.txt", step = 1, 
                new_col = "FileName", new_col_output_index = 1, overwrite = TRUE)
## The peak annotation results are written for each peak set to separate files in the
## results directory. They are named after the corresponding peak files with
## extensions specified in the annotate_peaks.param file, here *.peaks.annotated.xls.

## 5.2 Annotation with ChIPseeker package
## Same as in previous step but using the ChIPseeker package for annotating the peaks.
library(ChIPseeker)
for (i in seq(along = args)) {
  peakAnno <- annotatePeak(infile1(args)[i], TxDb = txdb, verbose = FALSE)
  df <- as.data.frame(peakAnno)
  outpaths <- subsetWF(args, slot = "output", subset = 1, index = 1)
  write.table(df, outpaths[i], quote = FALSE, row.names = FALSE, 
              sep = "\t")
}
writeTargetsout(x = args, file = "targets_peakanno.txt", step = 1, 
                new_col = "FileName", new_col_output_index = 1, overwrite = TRUE)
# Summary plots provided by the ChIPseeker package. Here applied only to one sample for demonstration purposes.
peak <- readPeakFile(infile1(args)[1])
covplot(peak, weightCol = "X.log10.pvalue.")
outpaths <- subsetWF(args, slot = "output", subset = 1, index = 1)
peakHeatmap(outpaths[1], TxDb = txdb, upstream = 1000, downstream = 1000, 
            color = "red")
plotAvgProf2(outpaths[1], TxDb = txdb, upstream = 1000, downstream = 1000, 
             xlab = "Genomic Region (5'->3')", ylab = "Read Count Frequency")

#####################################################
## 6 Count reads overlapping peaks 
#####################################################
## The countRangeset function is a convenience wrapper to perform read counting
## iteratively over serveral range sets, here peak range sets. Internally, the read
## counting is performed with the summarizeOverlaps function from the
## GenomicAlignments package. The resulting count tables are directly saved to files,
## one for each peak set.
library(GenomicRanges)
dir_path <- system.file("extdata/cwl/count_rangesets", package = "systemPipeR")
args <- loadWF(targets = "targets_macs.txt", wf_file = "count_rangesets.cwl", 
               input_file = "count_rangesets.yml", dir_path = dir_path)
args <- renderWF(args, inputvars = c(FileName = "_FASTQ_PATH1_", 
                                     SampleName = "_SampleName_"))

## Bam Files
targets <- system.file("extdata", "targets_chip.txt", package = "systemPipeR")
dir_path <- system.file("extdata/cwl/bowtie2/bowtie2-se", package = "systemPipeR")
args_bam <- loadWF(targets = targets, wf_file = "bowtie2-mapping-se.cwl", 
                   input_file = "bowtie2-mapping-se.yml", dir_path = dir_path)
args_bam <- renderWF(args_bam, inputvars = c(FileName = "_FASTQ_PATH1_", 
                                             SampleName = "_SampleName_"))
args_bam <- output_update(args_bam, dir = FALSE, replace = TRUE, 
                          extension = c(".sam", ".bam"))
outpaths <- subsetWF(args_bam, slot = "output", subset = 1, index = 1)

# 将Bam中的文件复制到chipseq/results目录中
bfl <- BamFileList(outpaths, yieldSize = 50000, index = character())
countDFnames <- countRangeset(bfl, args, mode = "Union", ignore.strand = TRUE)
writeTargetsout(x = args, file = "targets_countDF.txt", step = 1, 
                new_col = "FileName", new_col_output_index = 1, overwrite = TRUE)

#####################################################
## 7 Differential binding analysis
#####################################################
## The runDiff function performs differential binding analysis in batch mode for
## several count tables using edgeR or DESeq2 (Robinson, McCarthy, and Smyth 2010;
## Love, Huber, and Anders 2014). Internally, it calls the functions run_edgeR and
## run_DESeq2. It also returns the filtering results and plots from the downstream
## filterDEGs function using the fold change and FDR cutoffs provided under the
## dbrfilter argument.
dir_path <- system.file("extdata/cwl/rundiff", package = "systemPipeR")
args_diff <- loadWF(targets = "targets_countDF.txt", wf_file = "rundiff.cwl", 
                    input_file = "rundiff.yml", dir_path = dir_path)
args_diff <- renderWF(args_diff, inputvars = c(FileName = "_FASTQ_PATH1_", 
                                               SampleName = "_SampleName_"))

cmp <- readComp(file = args_bam, format = "matrix")
dbrlist <- runDiff(args = args_diff, diffFct = run_edgeR, targets = targets.as.df(targets(args_bam)), 
                   cmp = cmp[[1]], independent = TRUE, dbrfilter = c(Fold = 2, 
                                                                     FDR = 1))
writeTargetsout(x = args_diff, file = "targets_rundiff.txt", 
                step = 1, new_col = "FileName", new_col_output_index = 1, 
                overwrite = TRUE)

#####################################################
## 8 GO term enrichment analysis
#####################################################
## The following performs GO term enrichment analysis for each annotated peak set.
dir_path <- system.file("extdata/cwl/annotate_peaks", package = "systemPipeR")
# 将targets_bam_ref.txt复制到chipseq目录中
args <- loadWF(targets = "targets_bam_ref.txt", wf_file = "annotate-peaks.cwl", 
               input_file = "annotate-peaks.yml", dir_path = dir_path)
args <- renderWF(args, inputvars = c(FileName1 = "_FASTQ_PATH1_", 
                                     FileName2 = "_FASTQ_PATH2_", SampleName = "_SampleName_"))

args_anno <- loadWF(targets = "targets_macs.txt", wf_file = "annotate-peaks.cwl", 
                    input_file = "annotate-peaks.yml", dir_path = dir_path)
args_anno <- renderWF(args_anno, inputvars = c(FileName = "_FASTQ_PATH1_", 
                                               SampleName = "_SampleName_"))
annofiles <- subsetWF(args_anno, slot = "output", subset = 1, 
                      index = 1)
gene_ids <- sapply(names(annofiles), function(x) unique(as.character(read.delim(annofiles[x])[, 
                                                                                              "geneId"])), simplify = FALSE)
load("data/GO/catdb.RData")
BatchResult <- GOCluster_Report(catdb = catdb, setlist = gene_ids, 
                                method = "all", id_type = "gene", CLSZ = 2, cutoff = 0.9, 
                                gocats = c("MF", "BP", "CC"), recordSpecGO = NULL)

#####################################################
## 9 Motif analysis
#####################################################
## 9.1 Parse DNA sequences of peak regions from genome
## Enrichment analysis of known DNA binding motifs or de novo discovery of novel
## motifs requires the DNA sequences of the identified peak regions. To parse the
## corresponding sequences from the reference genome, the getSeq function from
## the Biostrings package can be used. The following example parses the sequences
## for each peak set and saves the results to separate FASTA files, one for each peak
## set. In addition, the sequences in the FASTA files are ranked (sorted) by increasing ## p-values as expected by some motif discovery tools, such as BCRANK.
library(Biostrings)
library(seqLogo)
library(BCRANK)
dir_path <- system.file("extdata/cwl/annotate_peaks", package = "systemPipeR")
args <- loadWF(targets = "targets_macs.txt", wf_file = "annotate-peaks.cwl", 
               input_file = "annotate-peaks.yml", dir_path = dir_path)
args <- renderWF(args, inputvars = c(FileName = "_FASTQ_PATH1_", 
                                     SampleName = "_SampleName_"))

rangefiles <- infile1(args)
for (i in seq(along = rangefiles)) {
  df <- read.delim(rangefiles[i], comment = "#")
  peaks <- as(df, "GRanges")
  names(peaks) <- paste0(as.character(seqnames(peaks)), "_", 
                         start(peaks), "-", end(peaks))
  peaks <- peaks[order(values(peaks)$X.log10.pvalue., decreasing = TRUE)]
  pseq <- getSeq(FaFile("./data/tair10.fasta"), peaks)
  names(pseq) <- names(peaks)
  writeXStringSet(pseq, paste0(rangefiles[i], ".fasta"))
}

## 9.2 Motif discovery with BCRANK
## The Bioconductor package BCRANK is one of the many tools available for de novo
## discovery of DNA binding motifs in peak regions of ChIP-Seq experiments. The given
## example applies this method on the first peak sample set and plots the sequence
## logo of the highest ranking motif.
set.seed(0)
BCRANKout <- bcrank(paste0(rangefiles[1], ".fasta"), restarts = 25, 
                    use.P1 = TRUE, use.P2 = TRUE)
toptable(BCRANKout)
topMotif <- toptable(BCRANKout, 1)
weightMatrix <- pwm(topMotif, normalize = FALSE)
weightMatrixNormalized <- pwm(topMotif, normalize = TRUE)
pdf("results/seqlogo.pdf")
seqLogo(weightMatrixNormalized)
dev.off()

你可能感兴趣的:(处理NGS的新神器systemPipeR)