③ valr实例计算ChIP-seq中Upstream + GeneBody + Downstream上下游reads的分布

这里只考虑正向的,如果有反向的可以先转为正向。

# Summarizing interval coverage across genomic features
bedfile <- valr_example("genes.hg19.chr22.bed.gz")
genomefile <- valr_example("hg19.chrom.sizes.gz")
bgfile  <- valr_example("hela.h3k4.chip.bg.gz")

genes <- read_bed(bedfile, n_fields = 6)
genome <- read_genome(genomefile)
y <- read_bedgraph(bgfile)
# generate 1 bp TSS intervals, "+" strand only
tss <- genes %>%
  filter(strand == "+") %>%
  mutate(end = start + 1)

# 1000 bp up and downstream
region_size <- 2000
# 50 bp windows
win_size <- 100
bin_number <- region_size *2 /win_size
# add slop to the TSS, break into windows and add a group
# bed_slop函数扩展区域空间,延长区间上下游,这里延长上下游1KB
# bed_makewindows 扩展后的bed区间以50bp为一个bin进行划分
x <- tss %>%
  bed_slop(genome, both = region_size) %>%
  bed_makewindows(genome, win_size)
# map signals to TSS regions and calculate summary statistics.
# 使用bed_map函数统计每一个bin中的总reads数目
res <- bed_map(x, y, win_sum = sum(value, na.rm = TRUE)) %>%
  group_by(.win_id) %>%
  summarize(win_mean = mean(win_sum, na.rm = TRUE),
            win_sd = sd(win_sum, na.rm = TRUE))

library(ggplot2)

x_labels <- seq(-region_size, region_size, by = win_size * 5)
x_breaks <- seq(1, bin_number + 1, by = 5)

sd_limits <- aes(ymax = win_mean + win_sd, ymin = win_mean - win_sd)

p <- ggplot(res, aes(x = .win_id, y = win_mean)) +
  geom_point(size = 0.25) + geom_pointrange(sd_limits, size = 0.1) +
  scale_x_continuous(labels = x_labels, breaks = x_breaks) +
  xlab("Position (bp from TSS)") + ylab("Signal") +
  theme_classic()

TSS_Plot <- ggplot(res, aes(x = .win_id, y = win_mean)) +
  geom_point(size = 0.25) + geom_line(size = 1) + 
  scale_x_continuous(labels = c("-2Kb","TSS", "2Kb"), breaks = seq(0,40, by = bin_number/2))  +
  xlab("Position (bp from TSS)") + ylab("Signal") +
  theme_bw() + 
  theme(legend.key=element_rect(linetype='dashed',color="white"),
        axis.text.y = element_text(size=20),axis.text.x = element_text(size=20),
        legend.title = element_text(size=15),legend.text = element_text(size=15),
        legend.key.height=unit(1.2,'cm')) 


## 绘制TSS + GeneBody + TES 
## 生成genebody的counts数
gene_body <- genes %>%
  filter(strand == "+")

gene_body_x <- gene_body %>%
  bed_makewindows(genome, num_win = bin_number/2)

gene_body_res <- bed_map(gene_body_x, y, win_sum = sum(value, na.rm = TRUE)) %>%
  group_by(.win_id) %>%
  summarize(win_mean = mean(win_sum, na.rm = TRUE),
            win_sd = sd(win_sum, na.rm = TRUE))
gene_body_res[,1] <- c((bin_number/2 + 1):bin_number)
up_stream <- res[1:bin_number/2,]
# down_stream <- matrix(data = NA, ncol = 3, nrow = 20)
down_stream <- res[(bin_number/2 + 1):bin_number,]
down_stream[, 1] <- c((bin_number + 1):(bin_number/2 + bin_number))
new_data <- rbind(up_stream , gene_body_res, down_stream)
## ggplot2进行可视化
# 1000 bp up and downstream
region_size <- 2000
# 50 bp windows
win_size <- 100


ggplot(new_data, aes(x = .win_id, y = win_mean)) +
  geom_point(size = 0.25) + geom_line(size = 1)  +
  scale_x_continuous(labels = c("-2Kb","TSS", "TES", "2Kb"), breaks = seq(0,bin_number*3/2, by = bin_number/2)) +
  theme_bw() + xlab("") + ylab("Signal") +
  theme(legend.key=element_rect(linetype='dashed',color="white"),
        axis.text.y = element_text(size=20),axis.text.x = element_text(size=20),
        legend.title = element_text(size=15),legend.text = element_text(size=15),
        legend.key.height=unit(1.2,'cm')) 


image.png

从上图明显可以看出此信号明显的富集在TSS区右侧

你可能感兴趣的:(③ valr实例计算ChIP-seq中Upstream + GeneBody + Downstream上下游reads的分布)