接复盘总结(一)
methylkit R包上游处理
对甲基化位点进行注释、导出SPM分析上游文件
setwd("D:/DMP_mk_")
rm(list=ls())
library(methylKit)
load(file = "united_data.Rdata")
#得到甲基化比率
perc.meth=percMethylation(meth)
#注释
library(genomation)
meth_GRanges <- as(meth,"GRanges")
#注释基因区域信息
#gene.obj=readTranscriptFeatures("hg19",up.flank = 1000, down.flank = 0)
gene.obj=readTranscriptFeatures("hg19",up.flank = 250, down.flank = 250)
#gene.obj=readTranscriptFeatures("hg19",up.flank = 250, down.flank = 0)
Anno_gene = annotateWithGeneParts(meth_GRanges,gene.obj)
#注释启动子区域信息
cpg.obj=readFeatureFlank("cpgi",feature.flank.name=c("CpGi","shores"))
Anno_cpgi=annotateWithFeatureFlank(meth_GRanges,cpg.obj$CpGi,cpg.obj$shores,
feature.name="CpGi",
flank.name="shores")
#提取基因区域以及CpG岛的信息
generg <- Anno_gene@members# 基因区域
cpgi <- Anno_cpgi@members# CpG岛
ucsc_name <- [email protected][["feature.name"]]
vs <- read.csv(file="vs.csv")
table(ucsc_name %in% vs[,1])
gene_symbol <- vs[match(ucsc_name,vs[,1]),2]
#将注释信息保存到anno数据框中
anno <- as.data.frame(cbind(generg,cpgi,ucsc_name,gene_symbol))
#计算两个replicate的甲基化比率均值
sample_avrg <- data.frame(apply(perc.meth[,1:2],1,mean))
for (i in c(seq(3,15,2),seq(21,27,2)))
{
sample_avrg <- cbind(sample_avrg,apply(perc.meth[,i:(i+1)],1,mean))
}
sample_avrg <- cbind(sample_avrg,apply(perc.meth[,17:20],1,mean))
#将列名命名为组织
colnames(sample_avrg) <- c(
"Adrenal_Gland",
"Brain",
"Breast",
"Kidney",
"Left_Ventricle",
"Liver",
"Lung",
"Pancreas",
"Skin",
"Stomach",
"Testis",
"Uterus",
"Skeletal_Muscle"
)
#输出SPM上游文件
write.table(sample_avrg,file = "meth.txt",sep = "\t")
save(anno,file = "anno.Rdata")
几点说明:
1.启动子区域的选取:上下游250bp,参考文献中做法。当然也有选用上游1000bp以及上游250bp的,在这里我选用长度相对在他们中间的。启动子区域的选取与后面的分析一致。
2.如果同组两个样本均要包括,一共得到375931(376k)个位点,同一组的两个样本取了均值,相当于methylKit中的pool函数的功能。
3.如果同组最少包含一个样本,一共得到715765(716k)个位点。
3.含有GENE SYMBOL和UCSC NAME的文件在UCSC的官网下载。
得到启动子区域的平均甲基化
代码:
setwd("D:/DMP_mk_")
library(methylKit)
rm(list=ls())
#创建file list
#多个对象
if(T)
{
file.list <- list(
"adrenal_1.txt",
"adrenal_2.txt",
"brain_1.txt",
"brain_2.txt",
"Breast_1.txt",
"Breast_2.txt",
"Kidney_1.txt",
"Kidney_2.txt",
"Left_Ventricle_1.txt",
"Left_Ventricle_2.txt",
"Liver_1.txt",
"Liver_2.txt",
"Lung_1.txt",
"Lung_2.txt",
"Pancreas_1.txt",
"Pancreas_2.txt",
"SkeletalMuscle_1.txt",
"SkeletalMuscle_2.txt",
"SkeletalMuscle_3.txt",
"SkeletalMuscle_4.txt",
"Skin_1.txt",
"Skin_2.txt",
"Stomach_1.txt",
"Stomach_2.txt",
"Testis_1.txt",
"Testis_2.txt",
"Uterus_1.txt",
"Uterus_2.txt"
)
}
# read the files to a methylRawList object: myobj
#多个分组对象读取
if(T)
{
myobj <- methRead(
file.list,
sample.id=list(
"adrenal_1.txt",
"adrenal_2.txt",
"brain_1.txt",
"brain_2.txt",
"Breast_1.txt",
"Breast_2.txt",
"Kidney_1.txt",
"Kidney_2.txt",
"Left_Ventricle_1",
"Left_Ventricle_2",
"Liver_1.txt",
"Liver_2.txt",
"Lung_1.txt",
"Lung_2.txt",
"Pancreas_1.txt",
"Pancreas_2.txt",
"SkeletalMuscle_1.txt",
"SkeletalMuscle_2.txt",
"SkeletalMuscle_3.txt",
"SkeletalMuscle_4.txt",
"Skin_1.txt",
"Skin_2.txt",
"Stomach_1.txt",
"Stomach_2.txt",
"Testis_1.txt",
"Testis_2.txt",
"Uterus_1.txt",
"Uterus_2.txt"),
assembly="hg19",
sep = " ",
treatment=c(0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,8,8,9,9,10,10,11,11,12,12),
context="CpG",
mincov = 10
)
}
#去除覆盖率较低的reads
myobj=filterByCoverage(myobj,lo.count=10,lo.perc=NULL,hi.count=NULL,hi.perc=99.9)
#归一化
myobj <- normalizeCoverage(myobj)
#注释
library(genomation)
#注释基因区域信息
#gene.obj=readTranscriptFeatures("hg19",up.flank = 1000, down.flank = 0)
gene.obj=readTranscriptFeatures("hg19",up.flank = 250, down.flank = 250)
#gene.obj=readTranscriptFeatures("hg19",up.flank = 250, down.flank = 0)
#汇总启动子区域的DNA甲基化信息
#得到各个样本启动子的平均甲基化
promoters=regionCounts(myobj,gene.obj$promoters)
head(promoters[[1]])
for (i in 1:28){
prom_GRanges <- as(promoters[[i]],"GRanges")
Anno_gene_prom = annotateWithGeneParts(prom_GRanges,gene.obj)
tmp <- as.data.frame(cbind([email protected][["feature.name"]],
promoters[[i]][["numCs"]]/promoters[[i]][["coverage"]]))
tmp[,2] <- as.numeric(tmp[,2])
#循环时给变量命名,有重复基因名的情况,因此需要合并,后详述
assign(paste0("sample",i),aggregate(tmp,by=list(tmp[,1]), FUN=mean))
}
rm(tmp,Anno_gene_prom,prom_GRanges,i)
#合并样本共有的启动子(基因)
tmp <- merge(get(paste0("sample",1)),get(paste0("sample",2)),by="Group.1")
for (i in 3:28){
tmp <- merge(tmp,get(paste0("sample",i)),by="Group.1")
}
tmp_1 <- t(na.omit(t(tmp)))
ucsc_name <- tmp_1[,1]
vs <- read.csv(file="vs.csv")
table(tmp_1[,1] %in% vs[,1])
rownames(tmp_1) <- vs[match(tmp[,1],vs[,1]),2]
tmp_1 <- tmp_1[,-1]
samplerm <- paste0("sample",1:28)
rm(list = samplerm)
rm(samplerm,tmp,i)
#计算两个replicate的甲基化比率均值
tmp_2 <- tmp_1
class(tmp_2)
tmp_2 <- data.frame(tmp_2,stringsAsFactors=FALSE)#先把tmp_2转为数据框
tmp_2 <- as.data.frame(lapply(tmp_2,as.numeric))
class(tmp_2)
sample_prom_avrg <- data.frame(apply(tmp_2[,1:2],1,mean))
for (i in c(seq(3,15,2),seq(21,27,2)))
{
sample_prom_avrg <- cbind(sample_prom_avrg,apply(tmp_2[,i:(i+1)],1,mean))
}
sample_prom_avrg <- cbind(sample_prom_avrg,apply(tmp_2[,17:20],1,mean))
colnames(sample_prom_avrg) <- c(
"Adrenal_Gland",
"Brain",
"Breast",
"Kidney",
"Left_Ventricle",
"Liver",
"Lung",
"Pancreas",
"Skin",
"Stomach",
"Testis",
"Uterus",
"Skeletal_Muscle"
)
rm(tmp_1,tmp_2)
rownames(sample_prom_avrg) <- ucsc_name
write.table(sample_prom_avrg,"meth_prom_250updown.txt",sep = "\t")#改动的地方
#save(vs,sample_prom_avrg,file = "meth_prom.Rdata")
为什么会有重复的基因名:
> promoters[[1]]
methylRaw object with 17143 rows
--------------
chr start end strand coverage numCs numTs
1 chr1 860279 860779 + 856 8 848
2 chr1 860870 861370 + 608 11 597
3 chr1 861051 861551 + 608 11 597
4 chr1 874404 874904 + 765 300 465
5 chr1 894429 894929 - 937 28 909
6 chr1 895716 896216 + 3409 46 3363
--------------
sample.id: adrenal_1.txt
assembly: hg19
context: CpG
resolution: region
> tmp[8237:8239,]
V1 V2
8237 uc001jaa.4 0.00832342449464923
8238 uc001jaa.4 0.00832342449464923
8239 uc001jaa.4 0.00832342449464923
> promoters[[1]][8237:8239]
methylRaw object with 3 rows
--------------
chr start end strand coverage numCs numTs
8237 chr10 43048006 43048506 - 841 7 834
8238 chr10 43048015 43048515 - 841 7 834
8239 chr10 43048030 43048530 - 841 7 834
--------------
sample.id: adrenal_1.txt
assembly: hg19
context: CpG
resolution: region
在对启动子区域进行注释之后,发现有多个overlap的启动子区域对应到了一个基因上,这有可能是转录起始位点的差别导致的。
得到了12118个共同的启动子区域甲基化:
> length(unique(ucsc_name))
[1] 12118
> length(unique(rownames(sample_prom_avrg)))
[1] 12118
另外的说明:
1.Homo.sapiens和BSgenome.Hsapiens.UCSC.hg19包含有所有基因的注释信息以及人类整个基因组的序列,也可以用来注释分析。
2.data.frame()调用as.data.frame(),as.data.frame()是一种将其他对象强制转换为类data.frame的方法。data.frame既能建立数据框,又能将已存在的数据转为数据框格式。
as.data.frame只能将已存在的数据转为数据框格式。anyway,data.frame比as.data.frame强大就是了。
3.当选取了不同的启动子区域之后,得到的启动子区域的数目也会有细微的差别,这可能是由于在进行启动子区域计数的时候存在一些选取的标准比如说需要一定的位点包含在启动子区域。