计算ChIP-seq的CPM值

首先,准备callpeak后的narrowpeak文件,和bam文件

利用bedtools multicov先计算每一个peak的count数。得到


然后R开始


# load packages ------

library(ChIPseeker)

library(org.Mm.eg.db)

library(gggenes)

library(ggplot2)

library(GenomeInfoDb)

library(GenomicFeatures)

library(diffloop)

library(GenomicRanges)

library(rio)

library(patchwork)

library(tidyverse)

library(venn)

library(ggsci)

library(scales)

library(edgeR)

library(stringi)

cols.use <- pal_d3("category10")(10)

show_col(cols.use)

options(scipen = 20)

options(ChIPseeker.ignore_1st_exon = T)

options(ChIPseeker.ignore_1st_intron = T)

options(ChIPseeker.ignore_downstream = T)

options(ChIPseeker.ignore_promoter_subcategory = F)

# prepera annotation files-------

mus102.anno <- makeTxDbFromGFF("C:/High-throughput sequencing/ensmble基因组/Mus_musculus.GRCm38.102.chr.gff3.gz",organism='Mus musculus')

seqlevels(mus102.anno)

promoter_ens <- getPromoters(TxDb=mus102.anno, upstream=3000, downstream=3000)

# load data-------

## chip reads counts

h3k4_0h_count  <- read.table('C:/High-throughput sequencing/!!!!GSE167940/ChIP-seq/计算reads/bedtools_multicov/H3K4me3_hCG_0h.peaks.reads.xls',header = F,sep = '\t')

h3k4_4h_count  <- read.table('C:/High-throughput sequencing/!!!!GSE167940/ChIP-seq/计算reads/bedtools_multicov//H3K4me3_hCG_4h.peaks.reads.xls',header = F,sep = '\t')

h3k27_0h_count <- read.table('C:/High-throughput sequencing/!!!!GSE167940/ChIP-seq/计算reads/bedtools_multicov/H3K27ac_hCG_0h.peaks.reads.xls',header = F,sep = '\t')

h3k27_4h_count <- read.table('C:/High-throughput sequencing/!!!!GSE167940/ChIP-seq/计算reads/bedtools_multicov/H3K27ac_hCG_4h.peaks.reads.xls',header = F,sep = '\t')

colnames(h3k4_0h_count) <- c('chr','start', 'end', 'peak_id', '-log10p', 'counts')

colnames(h3k4_4h_count) <- colnames(h3k4_0h_count)

colnames(h3k27_0h_count) <- colnames(h3k4_0h_count)

colnames(h3k27_4h_count) <- colnames(h3k4_0h_count)

## chip peaks

h3k4_0h_peaks  <- readPeakFile('C:/High-throughput sequencing/!!!!GSE167940/ChIP-seq/H3K4ME3/H3K4me3_hCG_0h_peaks.narrowPeak')

h3k4_4h_peaks  <- readPeakFile('C:/High-throughput sequencing/!!!!GSE167940/ChIP-seq/H3K4ME3/H3K4me3_hCG_4h_peaks.narrowPeak')

h3k27_0h_peaks <- readPeakFile('C:/High-throughput sequencing/!!!!GSE167940/ChIP-seq/H3K27ac/H3K27ac_hCG_0h_peaks.narrowPeak')

h3k27_4h_peaks <- readPeakFile('C:/High-throughput sequencing/!!!!GSE167940/ChIP-seq/H3K27ac/H3K27ac_hCG_4h_peaks.narrowPeak')


# peaks anno-------

# k4

k4_peaks <- list(h3k4_0h_peaks = h3k4_0h_peaks, h3k4_4h_peaks = h3k4_4h_peaks)

k4_peaks <- lapply(k4_peaks, rmchr)

seqlevels(k4_peaks[[1]])

k4_peakAnnoList <- lapply(k4_peaks, annotatePeak, TxDb=mus102.anno,tssRegion=c(-3000, 3000), verbose=FALSE,addFlankGeneInfo=TRUE, flankDistance=5000,annoDb="org.Mm.eg.db")

k4.anno.0h <- data.frame(k4_peakAnnoList$h3k4_0h_peaks)

k4.anno.4h <- data.frame(k4_peakAnnoList$h3k4_4h_peaks)

# k27

k27_peaks <- list(h3k27_0h_peaks = h3k27_0h_peaks, h3k27_4h_peaks = h3k27_4h_peaks)

k27_peaks <- lapply(k27_peaks, rmchr)

seqlevels(k27_peaks[[2]])

k27_peakAnnoList <- lapply(k27_peaks, annotatePeak, TxDb=mus102.anno,overlap='all',annoDb="org.Mm.eg.db")

k27.anno.0h <- data.frame(k27_peakAnnoList$h3k27_0h_peaks)

k27.anno.4h <- data.frame(k27_peakAnnoList$h3k27_4h_peaks)

# get cpm for chip------

h3k4_0h_count$cpm <- cpm(h3k4_0h_count$counts)

h3k4_4h_count$cpm <- cpm(h3k4_4h_count$counts)

h3k27_0h_count$cpm <- cpm(h3k27_0h_count$counts)

h3k27_4h_count$cpm <- cpm(h3k27_4h_count$counts)

# combine cpm with peaks's gene

colnames(k4.anno.0h)

h3k4_0h <- h3k4_0h_count %>% left_join( y = k4.anno.0h[,c(4,6,13,17,21,23)], by = c('peak_id' = 'V4'))

h3k4_4h <- h3k4_4h_count %>% left_join( y = k4.anno.4h[,c(4,6,13,17,21,23)], by = c('peak_id' = 'V4'))

h3k4_0h_gene.unique <- h3k4_0h %>% group_by(SYMBOL) %>% summarise(gene.cpm = sum(cpm))

h3k4_4h_gene.unique <- h3k4_4h %>% group_by(SYMBOL) %>% summarise(gene.cpm = sum(cpm))

h3k4.gene.cpm <- list(h3k4_0h_gene.unique = h3k4_0h_gene.unique, h3k4_4h_gene.unique = h3k4_4h_gene.unique)

export(h3k4.gene.cpm, file = 'h3k4.gene.cpm.corrected.xlsx')

h3k27_0h <- h3k27_0h_count %>% left_join( y = k27.anno.0h[,c(4,6,13,17,21,23)], by = c('peak_id' = 'V4'))

h3k27_4h <- h3k27_4h_count %>% left_join( y = k27.anno.4h[,c(4,6,13,17,21,23)], by = c('peak_id' = 'V4'))

h3k27_0h_gene.unique <- h3k27_0h %>% group_by(SYMBOL) %>% summarise(gene.cpm = sum(cpm))

h3k27_4h_gene.unique <- h3k27_4h %>% group_by(SYMBOL) %>% summarise(gene.cpm = sum(cpm))

h3k27.gene.cpm <- list(h3k27_0h_gene.unique = h3k27_0h_gene.unique, h3k27_4h_gene.unique = h3k27_4h_gene.unique)

export(h3k27.gene.cpm, file = 'h3k27.gene.cpm_2.xlsx')

你可能感兴趣的:(计算ChIP-seq的CPM值)