ChIPpeakAnno 注释peak

# 1. 初步使用

  • 运行下面几行代码,初步了解ChIPpeakAnno使用
library(ChIPpeakAnno)
## import the MACS output
macs <- system.file("extdata", "MACS_peaks.xls", package="ChIPpeakAnno")
macsOutput <- toGRanges(macs, format="MACS")
## annotate the peaks with precompiled ensembl annotation
data(TSS.human.GRCh38)
macs.anno <- annotatePeakInBatch(macsOutput, AnnotationData=TSS.human.GRCh38)
## add gene symbols
library(org.Hs.eg.db)
macs.anno <- addGeneIDs(annotatedPeak=macs.anno, 
                        orgAnn="org.Hs.eg.db", 
                        IDs2Add="symbol")

if(interactive()){## annotate the peaks with UCSC annotation
    library(GenomicFeatures)
    library(TxDb.Hsapiens.UCSC.hg38.knownGene)
    ucsc.hg38.knownGene <- genes(TxDb.Hsapiens.UCSC.hg38.knownGene)
    macs.anno <- annotatePeakInBatch(macsOutput, 
                                     AnnotationData=ucsc.hg38.knownGene)
    macs.anno <- addGeneIDs(annotatedPeak=macs.anno, 
                            orgAnn="org.Hs.eg.db", 
                            feature_id_type="entrez_id",
                            IDs2Add="symbol")
}

# 2. 详细使用例子

ChIPpeakAnno使用GRanges 保存peak数据。ChIPpeakAnno内置toGRanges 函数可以将BED, GFF等格式转换为GRanges。

  • toGRanges()函数
toGRanges(data, format=c("BED", "GFF", "GTF", 
                                  "MACS", "MACS2", "MACS2.broad", 
                                  "narrowPeak", "broadPeak",
                                  "others"), 
                   header=FALSE, comment.char="#", colNames=NULL, ...)
bed <- system.file("extdata", "MACS_output.bed", package="ChIPpeakAnno")
gr1 <- toGRanges(bed, format="BED", header=FALSE) 
## one can also try import from rtracklayer
library(rtracklayer)
gr1.import <- import(bed, format="BED")
identical(start(gr1), start(gr1.import))

# [1] TRUE
gr1[1:2]

GRanges object with 2 ranges and 1 metadata column:
              seqnames      ranges strand |     score
                       | 
  MACS_peak_1     chr1 28341-29610      * |    160.81
  MACS_peak_2     chr1 90821-91234      * |    133.12
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths
gr1.import[1:2] #note the name slot is different from gr1

GRanges object with 2 ranges and 2 metadata columns:
      seqnames      ranges strand |        name     score
               |  
  [1]     chr1 28341-29610      * | MACS_peak_1    160.81
  [2]     chr1 90821-91234      * | MACS_peak_2    133.12
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths
gff <- system.file("extdata", "GFF_peaks.gff", package="ChIPpeakAnno")
gr2 <- toGRanges(gff, format="GFF", header=FALSE, skip=3)
ol <- findOverlapsOfPeaks(gr1, gr2)
  • 韦恩图展示overlap结果
makeVennDiagram(ol,
                fill=c("#009E73", "#F0E442"), # circle fill color
                col=c("#D55E00", "#0072B2"), #circle border color
                cat.col=c("#D55E00", "#0072B2"))
image.png
  • 饼图展示结果
pie1(table(ol$overlappingPeaks[["gr1///gr2"]]$overlapFeature))
image.png
  • 找到overlap peak后,可以使用annotatePeakInBatch在maxgap使用AnnotationData中的基因组特征对peak进行注释,在以下示例maxgap=5kb。
overlaps <- ol$peaklist[["gr1///gr2"]]
## ============== old style ===========
## data(TSS.human.GRCh37) 
## overlaps.anno <- annotatePeakInBatch(overlaps, AnnotationData=annoData, 
##                                      output="overlapping", maxgap=5000L)
## overlaps.anno <- addGeneIDs(overlaps.anno, "org.Hs.eg.db", "symbol")
## ============== new style ===========
library(EnsDb.Hsapiens.v75) ##(hg19)
## create annotation file from EnsDb or TxDb
annoData <- toGRanges(EnsDb.Hsapiens.v75, feature="gene")
annoData[1:2]
GRanges object with 2 ranges and 1 metadata column:
                  seqnames      ranges strand |   gene_name
                           | 
  ENSG00000223972     chr1 11869-14412      + |     DDX11L1
  ENSG00000227232     chr1 14363-29806      - |      WASH7P
  -------
  seqinfo: 273 sequences from GRCh37 genome
  • annotatePeakInBatch进行注释
    获取距离最近的TSS、miRNA、外显子等的距离
overlaps.anno <- annotatePeakInBatch(overlaps, AnnotationData=annoData, 
                                    output="overlapping", maxgap=5000L)
overlaps.anno$gene_name <- 
    annoData$gene_name[match(overlaps.anno$feature,
                             names(annoData))]
head(overlaps.anno)
GRanges object with 6 ranges and 11 metadata columns:
                       seqnames        ranges strand |
                                  |
  X001.ENSG00000228327     chr1 713791-715578      * |
  X001.ENSG00000237491     chr1 713791-715578      * |
  X002.ENSG00000237491     chr1 724851-727191      * |
  X003.ENSG00000272438     chr1 839467-840090      * |
  X004.ENSG00000223764     chr1 856361-856999      * |
  X004.ENSG00000187634     chr1 856361-856999      * |
                                                 peakNames
                                           
  X001.ENSG00000228327 gr1__MACS_peak_13,gr2__001,gr2__002
  X001.ENSG00000237491 gr1__MACS_peak_13,gr2__001,gr2__002
  X002.ENSG00000237491          gr2__003,gr1__MACS_peak_14
  X003.ENSG00000272438          gr1__MACS_peak_16,gr2__004
  X004.ENSG00000223764          gr1__MACS_peak_17,gr2__005
  X004.ENSG00000187634          gr1__MACS_peak_17,gr2__005
                              peak         feature
                            
  X001.ENSG00000228327         001 ENSG00000228327
  X001.ENSG00000237491         001 ENSG00000237491
  X002.ENSG00000237491         002 ENSG00000237491
  X003.ENSG00000272438         003 ENSG00000272438
  X004.ENSG00000223764         004 ENSG00000223764
  X004.ENSG00000187634         004 ENSG00000187634
                       start_position end_position
                                
  X001.ENSG00000228327         700237       714006
  X001.ENSG00000237491         714150       745440
  X002.ENSG00000237491         714150       745440
  X003.ENSG00000272438         840214       851356
  X004.ENSG00000223764         852245       856396
  X004.ENSG00000187634         860260       879955
                       feature_strand insideFeature
                             
  X001.ENSG00000228327              -  overlapStart
  X001.ENSG00000237491              +  overlapStart
  X002.ENSG00000237491              +        inside
  X003.ENSG00000272438              +      upstream
  X004.ENSG00000223764              -  overlapStart
  X004.ENSG00000187634              +      upstream
                       distancetoFeature shortestDistance
                                       
  X001.ENSG00000228327               215              215
  X001.ENSG00000237491              -359              359
  X002.ENSG00000237491             10701            10701
  X003.ENSG00000272438              -747              124
  X004.ENSG00000223764                35               35
  X004.ENSG00000187634             -3899             3261
                       fromOverlappingOrNearest     gene_name
                                       
  X001.ENSG00000228327              Overlapping RP11-206L10.2
  X001.ENSG00000237491              Overlapping RP11-206L10.9
  X002.ENSG00000237491              Overlapping RP11-206L10.9
  X003.ENSG00000272438              Overlapping  RP11-54O7.16
  X004.ENSG00000223764              Overlapping   RP11-54O7.3
  X004.ENSG00000187634              Overlapping        SAMD11
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths
  • 绘制peak与最近特征(如转录起始位点TSS)的距离分布。
gr1.copy <- gr1
gr1.copy$score <- 1
binOverFeature(gr1, gr1.copy, annotationData=annoData,
               radius=5000, nbins=10, FUN=c(sum, length),
               ylab=c("score", "count"), 
               main=c("Distribution of aggregated peak scores around TSS", 
                      "Distribution of aggregated peak numbers around TSS"))
image.png
  • assignChromosomeRegion()可以统计 exon, intron, enhancer, proximal promoter, 5’ UTR 和3’ UTR在peak中的分布。
if(require(TxDb.Hsapiens.UCSC.hg19.knownGene)){
    aCR<-assignChromosomeRegion(gr1, nucleotideLevel=FALSE, 
                           precedence=c("Promoters", "immediateDownstream", 
                                         "fiveUTRs", "threeUTRs", 
                                         "Exons", "Introns"), 
                           TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene)
    barplot(aCR$percentage)
}
image.png

# 3. ChIPpeakAnno 各种运用场景

  • loading called peaks from BED, GFF, or other formats;
  • evaluating and visualizing the concordance among the biological replicates;
  • combining peaks from replicates; preparing genomic annotation(s) as GRanges;
  • associating/annotating peaks with the annotation(s); summarizing peak distributions over exon, intron, enhancer, proximal promoter, 5’UTR and 3’UTR regions;
  • retrieving the sequences around the peaks; and enrichment analysis of GO and biological pathway.
  • plot the heatmap of given peak ranges
  • perform permutation test to determine if there is a significant overlap between two sets of peaks.

## 3. 计算不同peak数据的overlap,并使用韦恩图展示

  • 通过寻找重复实验peak的overlap来评估实验之间的peak的一致性。也可以通过这种不同实验之间peak一致性来判断两个实验是不是存在一定的联系。
peaks1 <- GRanges(seqnames=c("1", "2", "3", "4", "5", "6", 
                              "2", "6", "6", "6", "6", "5"),
                   ranges=IRanges(start=c(967654, 2010897, 2496704, 3075869, 
                                          3123260, 3857501, 201089, 1543200, 
                                          1557200, 1563000, 1569800, 167889600),
                                  end= c(967754, 2010997, 2496804, 3075969, 
                                         3123360, 3857601, 201089, 1555199,
                                         1560599, 1565199, 1573799, 167893599),
                                  names=paste("Site", 1:12, sep="")),
                  strand="+")

peaks2 <- GRanges(seqnames=c("1", "2", "3", "4", "5", "6", "1", "2", "3", 
                                     "4", "5", "6", "6", "6", "6", "6", "5"),
                          ranges=IRanges(start=c(967659, 2010898, 2496700, 
                                                 3075866, 3123260, 3857500, 
                                                 96765, 201089, 249670, 307586, 
                                                 312326, 385750, 1549800, 
                                                 1554400, 1565000, 1569400,
                                                 167888600), 
                                         end=c(967869, 2011108, 2496920, 
                                               3076166,3123470, 3857780, 
                                               96985, 201299, 249890, 307796, 
                                               312586, 385960, 1550599, 1560799,
                                               1565399, 1571199, 167888999), 
                                         names=paste("t", 1:17, sep="")),
                          strand=c("+", "+", "+", "+", "+", "+", "-", "-", "-", 
                                   "-", "-", "-", "+", "+", "+", "+", "+"))

ol <- findOverlapsOfPeaks(peaks1, peaks2, maxgap=1000)
peaklist <- ol$peaklist
attributes(ol )
$names
[1] "venn_cnt"           "peaklist"           "uniquePeaks"       
[4] "mergedPeaks"        "peaksInMergedPeaks" "overlappingPeaks"  
[7] "all.peaks"         

$class
[1] "overlappingPeaks"

olpeaks1///peaks2中存放peak1和peak2的overlap peak

overlappingPeaks <- ol$overlappingPeaks
overlappingPeaks[["peaks1///peaks2"]][1:2, ]

##          peaks1 seqnames   start     end width strand peaks2 seqnames   start
## Site1_t1  Site1        1  967654  967754   101      +     t1        1  967659
## Site2_t2  Site2        2 2010897 2010997   101      +     t2        2 2010898
##              end width strand overlapFeature shortestDistance
## Site1_t1  967869   211      +   overlapStart                5
## Site2_t2 2011108   211      +   overlapStart                1
pie1(table(overlappingPeaks[["peaks1///peaks2"]]$overlapFeature))
image.png
  • 获取merged overlapping peaks
peaklist[["peaks1///peaks2"]]

## GRanges object with 11 ranges and 1 metadata column:
##        seqnames              ranges strand |
##                         |
##    [1]        1       967654-967869      + |
##    [2]        2       201089-201299      * |
##    [3]        2     2010897-2011108      + |
##    [4]        3     2496700-2496920      + |
##    [5]        4     3075866-3076166      + |
##    [6]        5     3123260-3123470      + |
##    [7]        5 167888600-167893599      + |
##    [8]        6     1543200-1560799      + |
##    [9]        6     1563000-1565399      + |
##   [10]        6     1569400-1573799      + |
##   [11]        6     3857500-3857780      + |
##                                        peakNames
##                                  
##    [1]                  peaks1__Site1,peaks2__t1
##    [2]                  peaks1__Site7,peaks2__t8
##    [3]                  peaks1__Site2,peaks2__t2
##    [4]                  peaks2__t3,peaks1__Site3
##    [5]                  peaks2__t4,peaks1__Site4
##    [6]                  peaks1__Site5,peaks2__t5
##    [7]                peaks2__t17,peaks1__Site12
##    [8] peaks1__Site8,peaks2__t13,peaks2__t14,...
##    [9]                peaks1__Site10,peaks2__t15
##   [10]                peaks2__t16,peaks1__Site11
##   [11]                  peaks2__t6,peaks1__Site6
##   -------
##   seqinfo: 6 sequences from an unspecified genome; no seqlengths
  • peaks1 中宇peaks2没有overlap的peak
peaklist[["peaks1"]]

## NULL
  • peaks2 中宇peaks1没有overlap的peak
peaklist[["peaks2"]]

## GRanges object with 5 ranges and 1 metadata column:
##       seqnames        ranges strand |       peakNames
##                  | 
##   [1]        1   96765-96985      - |      peaks2__t7
##   [2]        3 249670-249890      - |      peaks2__t9
##   [3]        4 307586-307796      - |     peaks2__t10
##   [4]        5 312326-312586      - |     peaks2__t11
##   [5]        6 385750-385960      - |     peaks2__t12
##   -------
##   seqinfo: 6 sequences from an unspecified genome; no seqlengths
  • 韦恩图
    并且输出overlapping 是否显著的p值
makeVennDiagram(ol, totalTest=1e+2,
                fill=c("#009E73", "#F0E442"), # circle fill color
                col=c("#D55E00", "#0072B2"), #circle border color
                cat.col=c("#D55E00", "#0072B2"))

$p.value
     peaks1 peaks2         pval
[1,]      1      1 5.890971e-12

$vennCounts
     peaks1 peaks2 Counts count.peaks1 count.peaks2
[1,]      0      0     83            0            0
[2,]      0      1      5            0            5
[3,]      1      0      0            0            0
[4,]      1      1     12           12           12
attr(,"class")
[1] "VennCounts"
image.png
  • 使用R 包Vernerable绘制韦恩图
#     install.packages("Vennerable", repos="http://R-Forge.R-project.org", 
#                     type="source")
#     library(Vennerable)
#     venn_cnt2venn <- function(venn_cnt){
#         n <- which(colnames(venn_cnt)=="Counts") - 1
#         SetNames=colnames(venn_cnt)[1:n]
#         Weight=venn_cnt[,"Counts"]
#         names(Weight) <- apply(venn_cnt[,1:n], 1, base::paste, collapse="")
#         Venn(SetNames=SetNames, Weight=Weight)
#     }
# 
#     v <- venn_cnt2venn(ol$venn_cnt)
#     plot(v)

  • findOverlapsOfPeaks 最多可以计算5 peak lists 的overlapping peak
peaks3 <- GRanges(seqnames=c("1", "2", "3", "4", "5", 
                             "6", "1", "2", "3", "4"),
                   ranges=IRanges(start=c(967859, 2010868, 2496500, 3075966,
                                          3123460, 3851500, 96865, 201189, 
                                          249600, 307386),
                                  end= c(967969, 2011908, 2496720, 3076166,
                                         3123470, 3857680, 96985, 201299, 
                                         249890, 307796),
                                  names=paste("p", 1:10, sep="")),
                  strand=c("+", "+", "+", "+", "+", 
                           "+", "-", "-", "-", "-"))

ol <- findOverlapsOfPeaks(peaks1, peaks2, peaks3, maxgap=1000, 
                          connectedPeaks="min")
makeVennDiagram(ol, totalTest=1e+2,
                fill=c("#CC79A7", "#56B4E9", "#F0E442"), # circle fill color
                col=c("#D55E00", "#0072B2", "#E69F00"), #circle border color
                cat.col=c("#D55E00", "#0072B2", "#E69F00"))
image.png

函数makeVennDiagram中的参数totalTest表示超几何检验中使用的潜在峰值总数。它应该大于最大peak数。设置得越小,测试越严格。用于计算p值的时间并不取决于totalTest的值。超几何检验要求用户输入给定TF的总潜在结合位点(峰值)的估计值。为了规避这一要求,ChIPpeakAnno 实现了一个名为permTest的置换检验。

## 生成注释数据

  • peak一半需要注释到基因组元件,ChIPpeakAnno 内置有TSS.human.NCBI36, TSS.human.GRCh37, TSS.human.GRCh38, TSS.mouse.NCBIM37, TSS.mouse.GRCm38, TSS.rat.RGSC3.4, TSS.rat.Rnor_5.0, TSS.zebrafish.Zv8, and TSS.zebrafish.Zv9。使用data()即可调用,data(TSS.human.GRCh38)。对于其他的注释信息,可以使用getAnnotation()函数设置参数featureType,“Exon” to obtain the nearest exon, “miRNA” to find the nearest miRNA, and “5utr” or “3utr” to locate the overlapping “5’UTR” or “3’UTR”. 更多的参数与biomaRt类似。
    类似使用TranscriptDb TxDb.Hsapiens.UCSC.hg19.knownGene 和toGRanges可以构建注释文件。
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
annoData <- toGRanges(TxDb.Hsapiens.UCSC.hg19.knownGene, feature="gene")
annoData

## GRanges object with 23056 ranges and 0 metadata columns:
##         seqnames              ranges strand
##                         
##       1    chr19   58858172-58874214      -
##      10     chr8   18248755-18258723      +
##     100    chr20   43248163-43280376      -
##    1000    chr18   25530930-25757445      -
##   10000     chr1 243651535-244006886      -
##     ...      ...                 ...    ...
##    9991     chr9 114979995-115095944      -
##    9992    chr21   35736323-35743440      +
##    9993    chr22   19023795-19109967      -
##    9994     chr6   90539619-90584155      +
##    9997    chr22   50961997-50964905      -
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome

## 注释

data(myPeakList)
annotatedPeak <- annotatePeakInBatch(myPeakList[1:6],
                                     AnnotationData = annoData)
annotatedPeak[1:3]

## GRanges object with 3 ranges and 9 metadata columns:
##                          seqnames        ranges strand |         peak
##                                     |  
##   X1_93_556427.100288069     chr1 556660-556760      * | X1_93_556427
##   X1_41_559455.100288069     chr1 559774-559874      * | X1_41_559455
##   X1_12_703729.100288069     chr1 703885-703985      * | X1_12_703729
##                              feature start_position end_position feature_strand
##                                        
##   X1_93_556427.100288069   100288069         700245       714068              -
##   X1_41_559455.100288069   100288069         700245       714068              -
##   X1_12_703729.100288069   100288069         700245       714068              -
##                          insideFeature distancetoFeature shortestDistance
##                                             
##   X1_93_556427.100288069    downstream            157408           143485
##   X1_41_559455.100288069    downstream            154294           140371
##   X1_12_703729.100288069        inside             10183             3640
##                          fromOverlappingOrNearest
##                                       
##   X1_93_556427.100288069          NearestLocation
##   X1_41_559455.100288069          NearestLocation
##   X1_12_703729.100288069          NearestLocation
##   -------
##   seqinfo: 24 sequences from an unspecified genome; no seqlengths

  • 使用自定义的注释文件
myPeak1 <- GRanges(seqnames=c("1", "2", "3", "4", "5", "6", 
                              "2", "6", "6", "6", "6", "5"),
                   ranges=IRanges(start=c(967654, 2010897, 2496704, 3075869, 
                                          3123260, 3857501, 201089, 1543200, 
                                          1557200, 1563000, 1569800, 167889600),
                                  end= c(967754, 2010997, 2496804, 3075969, 
                                         3123360, 3857601, 201089, 1555199,
                                         1560599, 1565199, 1573799, 167893599),
                                  names=paste("Site", 1:12, sep="")))

TFbindingSites <- GRanges(seqnames=c("1", "2", "3", "4", "5", "6", "1", "2", 
                                     "3", "4", "5", "6", "6", "6", "6", "6",
                                     "5"),
                          ranges=IRanges(start=c(967659, 2010898, 2496700, 
                                                 3075866, 3123260, 3857500, 
                                                 96765, 201089, 249670, 307586, 
                                                 312326, 385750, 1549800, 
                                                 1554400, 1565000, 1569400,
                                                 167888600), 
                                         end=c(967869, 2011108, 2496920, 
                                               3076166,3123470, 3857780, 
                                               96985, 201299, 249890, 307796, 
                                               312586, 385960, 1550599, 1560799,
                                               1565399, 1571199, 167888999), 
                                         names=paste("t", 1:17, sep="")),
                          strand=c("+", "+", "+", "+", "+", "+", "-", "-", "-", 
                                   "-", "-", "-", "+", "+", "+", "+", "+"))

annotatedPeak2 <- annotatePeakInBatch(myPeak1, AnnotationData=TFbindingSites)
annotatedPeak2[1:3]

## GRanges object with 3 ranges and 9 metadata columns:
##            seqnames          ranges strand |        peak     feature
##                         |  
##   Site1.t1        1   967654-967754      * |       Site1          t1
##   Site2.t2        2 2010897-2010997      * |       Site2          t2
##   Site3.t3        3 2496704-2496804      * |       Site3          t3
##            start_position end_position feature_strand insideFeature
##                            
##   Site1.t1         967659       967869              +  overlapStart
##   Site2.t2        2010898      2011108              +  overlapStart
##   Site3.t3        2496700      2496920              +        inside
##            distancetoFeature shortestDistance fromOverlappingOrNearest
##                                          
##   Site1.t1                -5                5          NearestLocation
##   Site2.t2                -1                1          NearestLocation
##   Site3.t3                 4                4          NearestLocation
##   -------
##   seqinfo: 6 sequences from an unspecified genome; no seqlengths
  • 饼图展示
pie1(table(as.data.frame(annotatedPeak2)$insideFeature))
image.png
  • GenomicFeatures::promoters可以获取启动子注释
library(ChIPpeakAnno)
data(myPeakList)
data(TSS.human.NCBI36)
annotationData <- promoters(TSS.human.NCBI36, upstream=5000, downstream=500)
annotatedPeak <- annotatePeakInBatch(myPeakList[1:6,], 
                                     AnnotationData=annotationData,
                                     output="overlapping")
annotatedPeak[1:3]

## GRanges object with 3 ranges and 9 metadata columns:
##                                seqnames        ranges strand |         peak
##                                           |  
##   X1_93_556427.ENSG00000209341     chr1 556660-556760      * | X1_93_556427
##   X1_93_556427.ENSG00000209344     chr1 556660-556760      * | X1_93_556427
##   X1_93_556427.ENSG00000209346     chr1 556660-556760      * | X1_93_556427
##                                        feature start_position end_position
##                                              
##   X1_93_556427.ENSG00000209341 ENSG00000209341         554314       559813
##   X1_93_556427.ENSG00000209344 ENSG00000209344         555569       561068
##   X1_93_556427.ENSG00000209346 ENSG00000209346         555643       561142
##                                feature_strand insideFeature distancetoFeature
##                                               
##   X1_93_556427.ENSG00000209341              -        inside              3153
##   X1_93_556427.ENSG00000209344              -        inside              4408
##   X1_93_556427.ENSG00000209346              -        inside              4482
##                                shortestDistance fromOverlappingOrNearest
##                                                     
##   X1_93_556427.ENSG00000209341             2346              Overlapping
##   X1_93_556427.ENSG00000209344             1091              Overlapping
##   X1_93_556427.ENSG00000209346             1017              Overlapping
##   -------
##   seqinfo: 24 sequences from an unspecified genome; no seqlengths
  • 除了将peak注释到最近的基因之外,ChIPpeakAnno还可以通过在注释PeakInBatch中设置output=“both”和maxgap来报告所有overlap和侧翼基因。例如,如果设置maxgap=5000和output=“both”,它输出5kb内所有重叠和侧翼基因,再加上最近的基因。
annotatedPeak <- annotatePeakInBatch(myPeakList[1:6],
                                     AnnotationData = annoData,
                                     output="both", maxgap=5000)
head(annotatedPeak)

## GRanges object with 6 ranges and 9 metadata columns:
##                          seqnames          ranges strand |          peak
##                                       |   
##   X1_93_556427.100288069     chr1   556660-556760      * |  X1_93_556427
##   X1_41_559455.100288069     chr1   559774-559874      * |  X1_41_559455
##   X1_12_703729.100288069     chr1   703885-703985      * |  X1_12_703729
##       X1_20_925025.84808     chr1   926058-926158      * |  X1_20_925025
##      X1_11_1041174.54991     chr1 1041646-1041746      * | X1_11_1041174
##       X1_14_1269014.1855     chr1 1270239-1270339      * | X1_14_1269014
##                              feature start_position end_position feature_strand
##                                        
##   X1_93_556427.100288069   100288069         700245       714068              -
##   X1_41_559455.100288069   100288069         700245       714068              -
##   X1_12_703729.100288069   100288069         700245       714068              -
##       X1_20_925025.84808       84808         910579       917473              -
##      X1_11_1041174.54991       54991        1017198      1051736              -
##       X1_14_1269014.1855        1855        1270658      1284492              -
##                          insideFeature distancetoFeature shortestDistance
##                                             
##   X1_93_556427.100288069    downstream            157408           143485
##   X1_41_559455.100288069    downstream            154294           140371
##   X1_12_703729.100288069        inside             10183             3640
##       X1_20_925025.84808      upstream             -8585             8585
##      X1_11_1041174.54991        inside             10090             9990
##       X1_14_1269014.1855    downstream             14253              319
##                          fromOverlappingOrNearest
##                                       
##   X1_93_556427.100288069          NearestLocation
##   X1_41_559455.100288069          NearestLocation
##   X1_12_703729.100288069          NearestLocation
##       X1_20_925025.84808          NearestLocation
##      X1_11_1041174.54991          NearestLocation
##       X1_14_1269014.1855              Overlapping
##   -------
##   seqinfo: 24 sequences from an unspecified genome; no seqlengths

## Add other feature IDs to the annotated peaks

data(annotatedPeak)
library(org.Hs.eg.db)
addGeneIDs(annotatedPeak[1:6], orgAnn="org.Hs.eg.db", IDs2Add=c("symbol"))

## GRanges object with 6 ranges and 9 metadata columns:
##                                   seqnames              ranges strand |
##                                                    |
##   X1_11_100272487.ENSG00000202254        1 100272801-100272900      + |
##   X1_11_108905539.ENSG00000186086        1 108906026-108906125      + |
##   X1_11_110106925.ENSG00000065135        1 110107267-110107366      + |
##   X1_11_110679983.ENSG00000197106        1 110680469-110680568      + |
##   X1_11_110681677.ENSG00000197106        1 110682125-110682224      + |
##   X1_11_110756560.ENSG00000116396        1 110756823-110756922      + |
##                                             peak         feature start_position
##                                                 
##   X1_11_100272487.ENSG00000202254 1_11_100272487 ENSG00000202254      100257218
##   X1_11_108905539.ENSG00000186086 1_11_108905539 ENSG00000186086      108918435
##   X1_11_110106925.ENSG00000065135 1_11_110106925 ENSG00000065135      110091233
##   X1_11_110679983.ENSG00000197106 1_11_110679983 ENSG00000197106      110693108
##   X1_11_110681677.ENSG00000197106 1_11_110681677 ENSG00000197106      110693108
##   X1_11_110756560.ENSG00000116396 1_11_110756560 ENSG00000116396      110753965
##                                   end_position insideFeature distancetoFeature
##                                                  
##   X1_11_100272487.ENSG00000202254    100257309    downstream             15582
##   X1_11_108905539.ENSG00000186086    109013624      upstream            -12410
##   X1_11_110106925.ENSG00000065135    110136975        inside             16033
##   X1_11_110679983.ENSG00000197106    110744824      upstream            -12640
##   X1_11_110681677.ENSG00000197106    110744824      upstream            -10984
##   X1_11_110756560.ENSG00000116396    110776666        inside              2857
##                                   shortestDistance fromOverlappingOrNearest
##                                                        
##   X1_11_100272487.ENSG00000202254            15491             NearestStart
##   X1_11_108905539.ENSG00000186086            12310             NearestStart
##   X1_11_110106925.ENSG00000065135            16033             NearestStart
##   X1_11_110679983.ENSG00000197106            12540             NearestStart
##   X1_11_110681677.ENSG00000197106            10884             NearestStart
##   X1_11_110756560.ENSG00000116396             2857             NearestStart
##                                        symbol
##                                   
##   X1_11_100272487.ENSG00000202254        
##   X1_11_108905539.ENSG00000186086       NBPF6
##   X1_11_110106925.ENSG00000065135       GNAI3
##   X1_11_110679983.ENSG00000197106     SLC6A17
##   X1_11_110681677.ENSG00000197106     SLC6A17
##   X1_11_110756560.ENSG00000116396       KCNC4
##   -------
##   seqinfo: 24 sequences from an unspecified genome; no seqlengths
addGeneIDs(annotatedPeak$feature[1:6], orgAnn="org.Hs.eg.db", 
           IDs2Add=c("symbol"))

##   ensembl_gene_id  symbol
## 1 ENSG00000065135   GNAI3
## 2 ENSG00000116396   KCNC4
## 3 ENSG00000197106 SLC6A17
## 4 ENSG00000186086   NBPF6
## 5 ENSG00000202254    

## 获取peak附近的序列

peaks <- GRanges(seqnames=c("NC_008253", "NC_010468"),
                 ranges=IRanges(start=c(100, 500), 
                                end=c(300, 600), 
                                names=c("peak1", "peak2")))
library(BSgenome.Ecoli.NCBI.20080805)
peaksWithSequences <- getAllPeakSequence(peaks, upstream=20, 
                                         downstream=20, genome=Ecoli)

保存为fa格式

write2FASTA(peaksWithSequences,"test.fa")

## Create heatmap for given feature/peak ranges

path <- system.file("extdata", package="ChIPpeakAnno")
files <- dir(path, "broadPeak")
data <- sapply(file.path(path, files), toGRanges, format="broadPeak")
names(data) <- gsub(".broadPeak", "", files)
ol <- findOverlapsOfPeaks(data)
#makeVennDiagram(ol)
features <- ol$peaklist[[length(ol$peaklist)]]
wid <- width(features)
feature.recentered <- feature.center <- features
start(feature.center) <- start(features) + floor(wid/2)
width(feature.center) <- 1
start(feature.recentered) <- start(feature.center) - 2000
end(feature.recentered) <- end(feature.center) + 2000
## here we also suggest importData function in bioconductor trackViewer package 
## to import the coverage.
## compare rtracklayer, it will save you time when handle huge dataset.
library(rtracklayer)
files <- dir(path, "bigWig")
if(.Platform$OS.type != "windows"){
    cvglists <- sapply(file.path(path, files), import, 
                       format="BigWig", 
                       which=feature.recentered, 
                       as="RleList")
}else{## rtracklayer can not import bigWig files on Windows
    load(file.path(path, "cvglist.rds"))
}
names(cvglists) <- gsub(".bigWig", "", files)
sig <- featureAlignedSignal(cvglists, feature.center, 
                            upstream=2000, downstream=2000) 
heatmap <- featureAlignedHeatmap(sig, feature.center, 
                                 upstream=2000, downstream=2000,
                                 upper.extreme=c(3,.5,4))
image.png
featureAlignedDistribution(sig, feature.center, 
                           upstream=2000, downstream=2000,
                           type="l")
image.png

## peak中motif 研究

peaks <- GRanges(seqnames=c("NC_008253", "NC_010468"),
                 ranges=IRanges(start=c(100, 500), 
                                end=c(300, 600), 
                                names=c("peak1", "peak2")))
filepath <- system.file("extdata", "examplePattern.fa", package="ChIPpeakAnno")
readLines(filepath)

## [1] ">ExamplePattern" "GGNCCK"          ">ExamplePattern" "AACCNM"
library(BSgenome.Ecoli.NCBI.20080805)
summarizePatternInPeaks(patternFilePath=filepath, format="fasta", skip=0L, 
                        BSgenomeName=Ecoli, peaks=peaks)

## peak注释的基因进行GO富集分析

library(org.Hs.eg.db)
over <- getEnrichedGO(annotatedPeak[1:500], orgAnn="org.Hs.eg.db", 
                    maxP=0.01, minGOterm=10, 
                    multiAdjMethod="BH",
                    condense=FALSE)
head(over[["bp"]][, -3])
##  [1] go.id               go.term             Ontology           
##  [4] pvalue              count.InDataset     count.InGenome     
##  [7] totaltermInDataset  totaltermInGenome   BH.adjusted.p.value
## [10] EntrezID           
## <0 rows> (or 0-length row.names)
head(over[["cc"]][, -3])

##  [1] go.id               go.term             Ontology           
##  [4] pvalue              count.InDataset     count.InGenome     
##  [7] totaltermInDataset  totaltermInGenome   BH.adjusted.p.value
## [10] EntrezID           
## <0 rows> (or 0-length row.names)
head(over[["mf"]][, -3])
egOrgMap("Mus musculus")
## [1] "org.Mm.eg.db"
egOrgMap("Homo sapiens")
## [1] "org.Hs.eg.db"

## Find peaks with bi-directional promoters

  • 双向启动子是位于两个相邻基因的转录起始位点(TSS)之间的DNA区域,它们在相反方向上转录,通常由该共享启动子区域共同调控。下面是一个示例,用于查找双向启动子附近的peak,并输出双向启动子周围peak的百分比。
data(myPeakList)
data(TSS.human.NCBI36)
seqlevelsStyle(TSS.human.NCBI36) <- seqlevelsStyle(myPeakList)
annotatedBDP <- peaksNearBDP(myPeakList[1:10,], 
                             AnnotationData=TSS.human.NCBI36, 
                             MaxDistance=5000)
annotatedBDP$peaksWithBDP

## GRangesList object of length 2:
## $`1`
## GRanges object with 2 ranges and 9 metadata columns:
##                seqnames        ranges strand |   bdp_idx         peak
##                           |   
##   X1_93_556427     chr1 556660-556760      * |         1 X1_93_556427
##   X1_93_556427     chr1 556660-556760      * |         1 X1_93_556427
##                        feature feature.ranges feature.strand  distance
##                                     
##   X1_93_556427 ENSG00000209349  556240-556304              -       355
##   X1_93_556427 ENSG00000209351  557933-557999              +      1172
##                insideFeature distanceToSite description
##                         
##   X1_93_556427      upstream            355            
##   X1_93_556427      upstream           1172            
##   -------
##   seqinfo: 24 sequences from an unspecified genome; no seqlengths
## 
## $`8`
## GRanges object with 2 ranges and 9 metadata columns:
##                 seqnames          ranges strand |   bdp_idx          peak
##                              |    
##   X1_14_1300250     chr1 1300503-1300603      * |         8 X1_14_1300250
##   X1_14_1300250     chr1 1300503-1300603      * |         8 X1_14_1300250
##                         feature  feature.ranges feature.strand  distance
##                                       
##   X1_14_1300250 ENSG00000175756 1298974-1300443              -        59
##   X1_14_1300250 ENSG00000218550 1303908-1304275              +      3304
##                 insideFeature distanceToSite            description
##                                     
##   X1_14_1300250      upstream             59 Aurora kinase A-inte..
##   X1_14_1300250      upstream           3304                       
##   -------
##   seqinfo: 24 sequences from an unspecified genome; no seqlengths
c(annotatedBDP$percentPeaksWithBDP, 
  annotatedBDP$n.peaks, 
  annotatedBDP$n.peaksWithBDP)

## [1]  0.2 10.0  2.0

## 置换检验置换测试计算两组peak之间的overlap的显著性

library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
cds <- unique(unlist(cdsBy(txdb)))
utr5 <- unique(unlist(fiveUTRsByTranscript(txdb)))
utr3 <- unique(unlist(threeUTRsByTranscript(txdb)))
set.seed(123)
utr3 <- utr3[sample.int(length(utr3), 1000)]
pt <- peakPermTest(utr3, 
             utr5[sample.int(length(utr5), 1000)], 
             maxgap=500,
             TxDb=txdb, seed=1,
             force.parallel=FALSE)
plot(pt)
## highly relevant peaks
ol <- findOverlaps(cds, utr3, maxgap=1)
pt1 <- peakPermTest(utr3,
             c(cds[sample.int(length(cds), 500)], 
                cds[queryHits(ol)][sample.int(length(ol), 500)]), 
             maxgap=500, 
             TxDb=txdb, seed=1,
             force.parallel=FALSE)
plot(pt1)
}

if(interactive()){
    data(HOT.spots)
    data(wgEncodeTfbsV3)
    hotGR <- reduce(unlist(HOT.spots))
    removeOl <- function(.ele){
        ol <- findOverlaps(.ele, hotGR)
        if(length(ol)>0) .ele <- .ele[-unique(queryHits(ol))]
        .ele
    }
    temp <- tempfile()
    download.file(file.path("http://hgdownload.cse.ucsc.edu", 
                            "goldenPath", "hg19", "encodeDCC", 
                            "wgEncodeRegTfbsClustered", 
                            "wgEncodeRegTfbsClusteredV3.bed.gz"), temp)
    data <- toGRanges(gzfile(temp, "r"), header=FALSE, format="others", 
                      colNames = c("seqnames", "start", "end", "TF"))
    unlink(temp)
    data <- split(data, data$TF)
    TAF1 <- removeOl(data[["TAF1"]])
    TEAD4 <- removeOl(data[["TEAD4"]])
    pool <- new("permPool", grs=GRangesList(wgEncodeTfbsV3), N=length(TAF1))
    pt <- peakPermTest(TAF1, TEAD4, pool=pool, ntimes=1000)
    plot(pt)
}

#原文

The ChIPpeakAnno user’s guide

你可能感兴趣的:(ChIPpeakAnno 注释peak)