2019-10-14-周末班-表观调控(续)

昨天上完课没有整理完,今天继续

9.之前用的参考基因组不是文章中所用的需要进一步优化

进行基因组的坐标转换,这一步真的有点难,转换的方式比较多可以参考生信菜鸟团的博文https://cloud.tencent.com/developer/article/1054520
暂时先用曾老师代码,尝试理解,后面再深入学习

library(rtracklayer)
path='dm3ToDm6.over.chain'
ch = import.chain(path)
ch
 
require(ChIPseeker) 
require(TxDb.Dmelanogaster.UCSC.dm3.ensGene  )
txdb <- TxDb.Dmelanogaster.UCSC.dm3.ensGene
require(clusterProfiler) 
library(ChIPpeakAnno)
peak_dm3 <- readPeakFile('Pho_WT.narrowPeak.bed')  

seqlevelsStyle(peak_dm3) = "UCSC"  # necessary
peak_dm6 = liftOver(peak_dm3, ch)
class(peak_dm6)
peak_dm3
peak_dm6

他在演示的时候这个R脚本运行较慢,最后用shell脚本完成相关内容,还需要在消化一下

10. 再次重画韦恩图根据原文查看分别在H3K27me3 domain内外的peaks

rm(list=ls())
require(ChIPseeker) 
require(TxDb.Dmelanogaster.UCSC.dm3.ensGene  )
txdb <- TxDb.Dmelanogaster.UCSC.dm3.ensGene
require(clusterProfiler) 
(bedFiles=list.files(pattern = '*dm6'))
bedFiles=bedFiles[-2]
library(ChIPpeakAnno)
fs=lapply(bedFiles,function(x){
  peak <- readPeakFile( x )  
  keepChr= !grepl('Het',seqlevels(peak)) 
  seqlevels(peak, pruning.mode="coarse") <- seqlevels(peak)[keepChr]
  peak
})
library(stringr)
str_split(bedFiles,'_',simplify = T)[,1]
H3K27 <- readPeakFile('H3K27_peaks.narrowPeak')  
keepChr= seqlevels(H3K27) %in% c('2L','2R','3L','3R','4','X','Y','M')
seqlevels(H3K27, pruning.mode="coarse") <- seqlevels(H3K27)[keepChr]
seqlevels(H3K27, pruning.mode="coarse") <- paste0('chr',seqlevels(H3K27))
seqlevels(H3K27)
tmp1=findOverlapsOfPeaks(fs[[1]], H3K27)
tmp2=findOverlapsOfPeaks(fs[[2]], H3K27)
tmp3=findOverlapsOfPeaks(fs[[3]], H3K27)

ol <- findOverlapsOfPeaks(tmp1$peaklist$`fs..1..///H3K27`,
                          tmp2$peaklist$`fs..2..///H3K27`,
                          tmp3$peaklist$`fs..3..///H3K27`)
png('3_factors_overlapVenn_in_domain.png')
makeVennDiagram(ol,
                NameOfPeaks=str_split(bedFiles,'_',simplify = T)[,1],
                TxDb=txdb)
dev.off()

 
lapply(1:7,function(i){
  gr=ol$peaklist[[i]]
  dat=as.data.frame(gr)[,1:3]
  dat[,1]=gsub('chr','',dat[,1])
  write.table(dat,sep = '\t',
              col.names = F,file = paste0('G',i,'_in.bed')
               ,quote = F,row.names = F)
})

ol <- findOverlapsOfPeaks(tmp1$peaklist$fs..1..,
                          tmp2$peaklist$fs..2..,
                          tmp3$peaklist$fs..3..)
png('3_factors_overlapVenn_not_domain.png')
makeVennDiagram(ol,
                NameOfPeaks=str_split(bedFiles,'_',simplify = T)[,1],
                TxDb=txdb)
dev.off()

lapply(1:7,function(i){
  gr=ol$peaklist[[i]]
  dat=as.data.frame(gr)[,1:3]
  dat[,1]=gsub('chr','',dat[,1])
  write.table(dat,sep = '\t',
              col.names = F,file = paste0('G',i,'_out.bed')
              ,quote = F,row.names = F)
})

生成2个新的韦恩图,并且得到overlap后的peaks数据集的bed文件


3_factors_overlapVenn_in_domain

3_factors_overlapVenn_not_domain

这一步的完成需要上一步得到的转换参考基因组版本后的bed文件,这个内容需要再练习一下

12. 对之前的差异分析提取感兴趣的基因进行注释

load(file = 'deg_output.Rdata')
library(ggpubr)
colnames(DEG_PhoKO)
DEG_PhoKO$log=log(DEG_PhoKO$baseMean+1)
DEG_PhoKO$change=ifelse(DEG_PhoKO$padj>0.05,'stable',
                        ifelse(DEG_PhoKO$log2FoldChange > 0,'up','down'))
table(DEG_PhoKO$change)
ggscatter(DEG_PhoKO,x="log" ,y="log2FoldChange",color = 'change')

DEG_SppsKO$log=log(DEG_SppsKO$baseMean+1)
DEG_SppsKO$change=ifelse(DEG_SppsKO$padj>0.05,'stable',
                         ifelse(DEG_SppsKO$log2FoldChange > 0,'up','down'))
table(DEG_SppsKO$change)
ggscatter(DEG_SppsKO,x="log" ,y="log2FoldChange",color = 'change')

library(UpSetR)

SppsKO_up=rownames(DEG_SppsKO[DEG_SppsKO$change=='up',])
SppsKO_down=rownames(DEG_SppsKO[DEG_SppsKO$change=='down',])
PhoKO_up=rownames(DEG_PhoKO[DEG_PhoKO$change=='up',])
PhoKO_down=rownames(DEG_PhoKO[DEG_PhoKO$change=='down',])

allG=unique(c(SppsKO_up,SppsKO_down,PhoKO_up,PhoKO_down))

df=data.frame(allG=allG,
              SppsKO_up=as.numeric(allG %in% SppsKO_up),
              SppsKO_down=as.numeric(allG %in% SppsKO_down),
              PhoKO_up=as.numeric(allG %in% PhoKO_up),
              PhoKO_down=as.numeric(allG %in% PhoKO_down))

upset(df)

colnames(df)
# "SppsKO_up"   "SppsKO_down" "PhoKO_up"    "PhoKO_down" 
SppsKO_up_uniq=df[df[,2]==1 & df[,3]==0 & df[,4]==0 & df[,5]==0 ,1] 
SppsKO_down_uniq=df[df[,2]==0 & df[,3]==1 & df[,4]==0 & df[,5]==0 ,1]
PhoKO_up_uniq=df[df[,2]==0 & df[,3]==0 & df[,4]==1 & df[,5]==0 ,1]
PhoKO_down_uniq=df[df[,2]==0 & df[,3]==0 & df[,4]==0 & df[,5]==1 ,1]
SppsKO_up_PhoKO_up=df[df[,2]==1 & df[,3]==0 & df[,4]==1 & df[,5]==0 ,1]
SppsKO_down_PhoKO_down=df[df[,2]==0 & df[,3]==1 & df[,4]==0 & df[,5]==1 ,1]
  
library(org.Dm.eg.db)
t1=toTable(org.Dm.egCHRLOC)
t2=toTable(org.Dm.egCHRLOCEND)
t3=toTable(org.Dm.egSYMBOL)
 
# 作业 

require(TxDb.Dmelanogaster.UCSC.dm6.ensGene  )
txdb <- TxDb.Dmelanogaster.UCSC.dm6.ensGene
gs=as.data.frame(genes(txdb))
library(org.Dm.eg.db)
t100=toTable(org.Dm.egFLYBASE)
t101=toTable(org.Dm.egSYMBOL)
t200=merge(t100,t101,by='gene_id')
colnames(gs)[6]="flybase_id"
t300=merge(t200,gs,by="flybase_id")

SppsKO_up_uniq
SppsKO_up_bed=t300[match(SppsKO_up_uniq[SppsKO_up_uniq %in% t300$symbol ],
                      t300$symbol),4:6]
SppsKO_down_uniq
SppsKO_down_bed=t300[match(SppsKO_down_uniq[SppsKO_down_uniq %in% t300$symbol ],
                      t300$symbol),4:6]

这个作业是参考上面的代码完成peaks查找对应基因进行的注释并做下有分析

13. 在上一步得到的基因集加载进来进行kegg注释

rm(list = ls())
load(file = '6-genesets.Rdata')

library(ggplot2)
library(clusterProfiler)
library(org.Dm.eg.db)

kegg_anno <- function(gs,pro){
  #pro='SppsKO_up_uniq'
  #gs=SppsKO_up_uniq
  gene_up= bitr(gs, fromType = "SYMBOL",
                toType = c( "ENTREZID"),
                OrgDb = org.Dm.eg.db)[,2] 
  kk.up <- enrichKEGG(gene         = gene_up,
                      organism     = 'dme', 
                      keyType =  'ncbi-geneid',
                      pvalueCutoff = 0.9,
                      qvalueCutoff =0.9)
  head(kk.up)[,1:6]
  kk=kk.up
  dotplot(kk)
  ggsave(paste0(pro,'_kk.png'))
  kk=DOSE::setReadable(kk, OrgDb='org.Dm.eg.db',keyType='ENTREZID')
  write.csv(kk@result,paste0(pro,'_kk.up.csv'))
  
}
l=list(SppsKO_up_uniq,SppsKO_down_uniq,
       PhoKO_up_uniq, PhoKO_down_uniq,
       SppsKO_up_PhoKO_up,SppsKO_down_PhoKO_down)
names(l)=c('SppsKO_up_uniq','SppsKO_down_uniq',
           'PhoKO_up_uniq', 'PhoKO_down_uniq',
           'SppsKO_up_PhoKO_up','SppsKO_down_PhoKO_down')
lapply(1:length(l), function(i){
  kegg_anno(l[[i]],names(l)[i])
})

library(ggplot2)
library(clusterProfiler)
library(org.Dm.eg.db)
gcSample=lapply(l, function(x){
  return( bitr(x, fromType = "SYMBOL",
               toType = c( "ENTREZID"),
               OrgDb = org.Dm.eg.db)[,2] )
})
xx <- compareCluster(gcSample, fun="enrichKEGG",
                     keyType =  'ncbi-geneid',
                     organism="dme", pvalueCutoff=0.05)
dotplot(xx)
感兴趣的基因集挑选后kegg注释

注:这篇文章可以挖掘的地方进行peaks的通路的注释,也可以根据peaks对应的promoter区对应的调控基因进行注释和下游分析。
本次课程内容很丰富,关于相应分析软件的使用不熟练,需要进一步练习。今天下载了文章原文,后面就开始读文献,好好理解一下周末课程的内容争取消化掉。

你可能感兴趣的:(2019-10-14-周末班-表观调控(续))