这是我学习jimmyB站的GEO数据挖掘-记下的笔记
- 视频直达:B站GEO数据挖掘
- 代码直达:github - jmzeng1314
了解GEO数据库
- GSE,GSM,GPL,GDS
- 芯片基础知识打卡
- illumina的bead 系列表达芯片扫盲
- 了解GSE结构
- 了解一个GSE里面有多种芯片平台数据
- 生信技能树论坛-研究热点版块介绍(三)-芯片处理
下载表达矩阵的方法:
第一种:下载表达矩阵,利用R提取GSE42872
a=read.table('GSE42872_series_matrix.txt.gz',sep='\t',quote = "",fill = T,
comment.char = "!",header = T) # 提取表达矩阵
rownames(a)=a[,1] # 将第一列的ID设置为rownames
a=a[,-1] # 去除第一个列
第二种:getGEO下载表达矩阵GSE42872
if(F){
downGSE <- function(studyID = "GSE1009", destdir = ".") {
library(GEOquery)
eSet <- getGEO(studyID, destdir = destdir, getGPL = F)
exprSet = exprs(eSet[[1]])
pdata = pData(eSet[[1]])
write.csv(exprSet, paste0(studyID, "_exprSet.csv"))
write.csv(pdata, paste0(studyID, "_metadata.csv"))
return(eSet)
}
downGSE('GSE42872')
}
# 这是写的一个function,后续只需要把GSE42872改为你的目标GSE即可
ID转换
- 好文推荐:用R获取芯片探针与基因的对应关系三部曲-bioconductor
还是以上面的GSE42872为例进行学习
- 对应的GPL6244找到其注释包为:
67 GPL6244——Homo sapiens hugene10sttranscriptcluster
load(file='GSE42872_raw_exprSet.Rdata')
exprSet=raw_exprSet
# 加载上面找到的对应注释包
library(hugene10sttranscriptcluster.db)
ids=toTable(hugene10sttranscriptclusterSYMBOL)
length(unique(ids$symbol)) #查看基因多少
tail(sort(table(ids$symbol))) # 每个基因对应的探针个数进行
table(sort(table(ids$symbol))) #table以下上面的,查看大致情况
plot(table(sort(table(ids$symbol)))) #画个图看看,基因对应探针的情况。
> tail(sort(table(ids$symbol)))
RPL41 UBTFL1 CDK11B UBE2D3 IGKC LRRFIP1
6 6 8 8 10 10
> table(sort(table(ids$symbol)))
1 2 3 4 5 6 8 10
18074 600 132 16 5 6 2 2
> plot(table(sort(table(ids$symbol))))
表达矩阵对应注释包中对应情况
dim(exprSet)
table(rownames(exprSet) %in% ids$probe_id) # 查看对应情况
dim(exprSet)
exprSet=exprSet[rownames(exprSet) %in% ids$probe_id,] # 过滤其中能够对应的Gene-ID
dim(exprSet)
ids=ids[match(rownames(exprSet),ids$probe_id),]Gene-ID
> dim(exprSet)
[1] 33297 6
> table(rownames(exprSet) %in% ids$probe_id)
FALSE TRUE
13466 19831
> dim(exprSet)
[1] 33297 6
> exprSet=exprSet[rownames(exprSet) %in% ids$probe_id,]
> dim(exprSet)
[1] 19831 6
查看两者的情况
head(ids)
exprSet[1:5,1:5]
> head(ids)
probe_id symbol
1 7896759 LINC01128
2 7896761 SAMD11
3 7896779 KLHL17
4 7896798 PLEKHN1
5 7896817 ISG15
6 7896822 AGRN
> exprSet[1:5,1:5]
GSM1052615 GSM1052616 GSM1052617 GSM1052618 GSM1052619
7896759 8.75126 8.61650 8.81149 8.32067 8.41445
7896761 8.39069 8.52617 8.43338 9.17284 9.10216
7896779 8.20228 8.30886 8.18518 8.13322 8.06453
7896798 8.41004 8.37679 8.27521 8.34524 8.35557
7896817 7.72204 7.74572 7.78022 7.72308 7.53797
将exprset中的ID与symbol一一对应起来
jimmy <- function(exprSet,ids){
tmp = by(exprSet,
ids$symbol,
function(x) rownames(x)[which.max(rowMeans(x))] )
#根据id的symbol对表达矩阵进行分类,如果一个symbol对应多个探针,那么将会得到一个新的探针列表
probes = as.character(tmp) #将上面的新的探针列表,得到一个probe
print(dim(exprSet))
exprSet=exprSet[rownames(exprSet) %in% probes ,] #过滤有多个探针的基因
print(dim(exprSet))
rownames(exprSet)=ids[match(rownames(exprSet),ids$probe_id),2]
return(exprSet)
}
new_exprSet <- jimmy(exprSet,ids)
exprSet=exprSet[rownames(exprSet) %in% probes ,] #过滤有多个探针的基因
[1] 19831 6
[1] 18837 6
处理一个symbol对应多个探针的情况。
- 上面我们已经展示:
> tail(sort(table(ids$symbol)))
RPL41 UBTFL1 CDK11B UBE2D3 IGKC LRRFIP1
6 6 8 8 10 10
-
去ids中查找IGKC得到如下:
- 那么后面用R语言找出上面的10个IGKC对应的gene-id
### IGKC,童谣需要对exprSet及ids进行过滤
load(file='GSE42872_raw_exprSet.Rdata')
exprSet=raw_exprSet
library(hugene10sttranscriptcluster.db)
ids=toTable(hugene10sttranscriptclusterSYMBOL)
length(unique(ids$symbol))
tail(sort(table(ids$symbol)))
table(sort(table(ids$symbol)))
plot(table(sort(table(ids$symbol))))
dim(exprSet)
table(rownames(exprSet) %in% ids$probe_id)
dim(exprSet)
exprSet=exprSet[rownames(exprSet) %in% ids$probe_id,]
dim(exprSet)
ids=ids[match(rownames(exprSet),ids$probe_id),]
head(ids)
exprSet[1:5,1:5]
# 查找IGKC
table(ids[,2]=='IGKC') #ids[,2]=='IGKC' 返回的为逻辑值TRUE、FLASE
exprSet[ids[,2]=='IGKC',] # 返回其中exprSet中对应gene-id为TRUE的值
- 上面这个即为
x=exprSet[ids[,2]=='IGKC',]
,即IGKC对应的10个探针,在每个sample中的情况,需要筛选出其中6个sample中表达平均自最大的探针rowMeans(x)
x=exprSet[ids[,2]=='IGKC',]
rownames(x)
rowMeans(x)
which.max(rowMeans(x))
rownames(x)
jimmy <- function(exprSet,ids){
tmp = by(exprSet,
ids$symbol,
function(x) rownames(x)[which.max(rowMeans(x))] ) #根据id的symbol对表达矩阵进行分类,如果一个symbol对应多个探针,那么将会得到一个新的探针列表
probes = as.character(tmp) #将上面的新的探针列表,得到一个probe
print(dim(exprSet))
exprSet=exprSet[rownames(exprSet) %in% probes ,]
print(dim(exprSet))
rownames(exprSet)=ids[match(rownames(exprSet),ids$probe_id),2]
return(exprSet)
}
new_exprSet <- jimmy(exprSet,ids)
# jimmy这个函数要求较严格,需要表达矩阵为matrix,ids与其一一对应,过滤
某些GPL无法扎到对应的注释包
getwd()
library(GEOquery)
gpl=getGEO('GPL6480',destdir = getwd())
colnames(Table(gpl)) ##you need to check this,which column do you need
head(Table(gpl)[,c(1,6,7)])
write.csv(Table(gpl)[,c(1,6,7)],"GPL6400.csv")
- 上面代码运行如下:
> getwd()
[1] "F:/R studying/GEO-master/GEO-master/GSE42872"
> library(GEOquery)
Setting options('download.file.method.GEOquery'='auto')
Setting options('GEOquery.inmemory.gpl'=FALSE)
> gpl=getGEO('GPL6480',destdir = getwd())
File stored at:
F:/R studying/GEO-master/GEO-master/GSE42872/GPL6480.soft
Warning: 94 parsing failures.
row col expected actual file
41001 CONTROL_TYPE 1/0/T/F/TRUE/FALSE pos literal data
41002 CONTROL_TYPE 1/0/T/F/TRUE/FALSE pos literal data
41003 CONTROL_TYPE 1/0/T/F/TRUE/FALSE pos literal data
41004 CONTROL_TYPE 1/0/T/F/TRUE/FALSE pos literal data
41005 CONTROL_TYPE 1/0/T/F/TRUE/FALSE pos literal data
..... ............ .................. ...... ............
See problems(...) for more details.
> colnames(Table(gpl))
[1] "ID" "SPOT_ID" "CONTROL_TYPE" "REFSEQ"
[5] "GB_ACC" "GENE" "GENE_SYMBOL" "GENE_NAME"
[9] "UNIGENE_ID" "ENSEMBL_ID" "TIGR_ID" "ACCESSION_STRING"
[13] "CHROMOSOMAL_LOCATION" "CYTOBAND" "DESCRIPTION" "GO_ID"
[17] "SEQUENCE"
> head(Table(gpl)[,c(1,6,7)])
ID GENE GENE_SYMBOL
1 A_23_P100001 400451 FAM174B
2 A_23_P100011 10239 AP3S2
3 A_23_P100022 9899 SV2B
4 A_23_P100056 348093 RBPMS2
5 A_23_P100074 57099 AVEN
6 A_23_P100092 146050 ZSCAN29
了解你的表达矩阵
load(file='GSE42872_new_exprSet.Rdata')
exprSet=new_exprSet
# if(T){
# 数据转换
library(reshape2)
exprSet_L=melt(exprSet)
colnames(exprSet_L)=c('probe','sample','value')
exprSet_L$group=rep(group_list,each=nrow(exprSet))
head(exprSet_L)
查看每个数据的基本情况
### ggplot2
library(ggplot2)
p=ggplot(exprSet_L,aes(x=sample,y=value,fill=group))+geom_boxplot()
print(p) # boxplot,每个样本基本要在一条线上次才能进行比较
- 后面这些代码画出大致意义同上
p=ggplot(exprSet_L,aes(x=sample,y=value,fill=group))+geom_violin()
print(p)
p=ggplot(exprSet_L,aes(value,fill=group))+geom_histogram(bins = 200)+facet_wrap(~sample, nrow = 4)
print(p)
p=ggplot(exprSet_L,aes(value,col=group))+geom_density()+facet_wrap(~sample, nrow = 4)
print(p)
p=ggplot(exprSet_L,aes(value,col=group))+geom_density()
print(p)
p=ggplot(exprSet_L,aes(x=sample,y=value,fill=group))+geom_boxplot()
p=p+stat_summary(fun.y="mean",geom="point",shape=23,size=3,fill="red")
p=p+theme_set(theme_set(theme_bw(base_size=20)))
p=p+theme(text=element_text(face='bold'),axis.text.x=element_text(angle=30,hjust=1),axis.title=element_blank())
print(p)
hclust
## hclust
colnames(exprSet)=paste(group_list,1:ncol(exprSet),sep='_')
# Define nodePar
nodePar <- list(lab.cex = 0.6, pch = c(NA, 19),
cex = 0.7, col = "blue")
hc=hclust(dist(t(exprSet)))
plot(hc) # 查看每组之间样本是否存在outline的
par(mar=c(5,5,5,10))
png('hclust.png',res=120)
dev.off()
plot(as.dendrogram(hc), nodePar = nodePar, horiz = TRUE)
dev.off()
PCA
## PCA
library(ggfortify)
df=as.data.frame(t(exprSet))
df$group=group_list
png('pca.png',res=120)
dev.off()
autoplot(prcomp( df[,1:(ncol(df)-1)] ), data=df,colour = 'group')+theme_bw()
dev.off()
# }
差异分析
用limma对芯片数据做差异分析
- 根据分组信息做差异分析- 这个一文不够的
- 用limma包对芯片数据做差异分析 (这个教程一定要看看)
limma包的使用
使用这个包需要三个数据:
- 表达矩阵
- 分组矩阵
- 差异比较矩阵
而且总共也只有三个步骤:
- lmFit
- eBayes
- topTable
三个数据
1+2拿到分组矩阵design(表达矩阵已经存在)
load(file='GSE42872_new_exprSet.Rdata')
exprSet=new_exprSet
dim(exprSet)
group_list
library(limma)
# tmp=data.frame(case=c(0,0,0,1,1,1),control=c(1,1,1,0,0,0))
#design.factor = factor(group_list, levels=c("control","case"))
#design.matrix = model.matrix(~0+design.factor)
#colnames(design.matrix) = levels(design.factor)
#tmp=design.matrix
design <- model.matrix(~0+factor(group_list))
colnames(design)=levels(factor(group_list))
rownames(design)=colnames(exprSet)
design # 分组矩阵
-
分组矩阵需要对应好表达矩阵
3.比较矩阵contrast.matrix
contrast.matrix<-makeContrasts(paste0(unique(group_list),collapse = "-"),
levels = design)
contrast.matrix<-makeContrasts("case-control",
levels = design)
contrast.matrix #这个矩阵声明,我们要把progres.组跟stable进行差异分析比较
- 需要确保case1,control -1.
三个步骤
lmFit+eBayes+topTable
deg = function(exprSet,design,contrast.matrix){
##step1
fit <- lmFit(exprSet,design)
##step2
fit2 <- contrasts.fit(fit, contrast.matrix)
##这一步很重要,大家可以自行看看效果
fit2 <- eBayes(fit2) ## default no trend !!!
##eBayes() with trend=TRUE
##step3
tempOutput = topTable(fit2, coef=1, n=Inf)
nrDEG = na.omit(tempOutput)
#write.csv(nrDEG2,"limma_notrend.results.csv",quote = F)
head(nrDEG)
return(nrDEG)
}
re = deg(exprSet,design,contrast.matrix)
作图
- 差异分析得到的结果注释一文就够
热图-pheatmap
nrDEG=re
## heatmap
library(pheatmap)
choose_gene=head(rownames(nrDEG),25) ## 50 maybe better
choose_matrix=exprSet[choose_gene,]
choose_matrix=t(scale(t(choose_matrix)))
pheatmap(choose_matrix,filename = 'DEG_top100_heatmap.png')
火山图
library(ggplot2)
## volcano plot
colnames(nrDEG)
plot(nrDEG$logFC,-log10(nrDEG$P.Value))
DEG=nrDEG
logFC_cutoff <- with(DEG,mean(abs( logFC)) + 2*sd(abs( logFC)) )
# logFC_cutoff=1
DEG$change = as.factor(ifelse(DEG$P.Value < 0.05 & abs(DEG$logFC) > logFC_cutoff,
ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')
)
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
'\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) ,
'\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',])
)
g = ggplot(data=DEG,
aes(x=logFC, y=-log10(P.Value),
color=change)) +
geom_point(alpha=0.4, size=1.75) +
theme_set(theme_set(theme_bw(base_size=20)))+
xlab("log2 fold change") + ylab("-log10 p-value") +
ggtitle( this_tile ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
scale_colour_manual(values = c('blue','black','red')) ## corresponding to the levels(res$change)
print(g)
#ggsave(g,filename = 'volcano.png')
#save(new_exprSet,group_list,nrDEG,DEG, file='GSE42872_DEG.Rdata')
下游分析
学习包clusteprofile的过程,你不必一一运行
rm(list=ls())
load(file='GSE42872_DEG.Rdata')
gene=head(rownames(nrDEG),1000) #此处我们应该取到其上调的486个基因
gene
## KEGG pathway analysis,来自Y叔的clustrprofile
library(clusterProfiler)
## 包接受的gene为entrezid,那么需要将gene的symbol改为ENTREZID
### get the universal genes and sDEG
gene.df <- bitr(gene, fromType = "SYMBOL",
toType = c("ENSEMBL", "ENTREZID"),
OrgDb = org.Hs.eg.db)
head(gene.df)
# 此处包中接受的gene为ENTREZID,那我们将gene.df中的ENTREZID赋值给包中的gene
kk <- enrichKEGG(gene = gene.df$ENTREZID,
organism = 'hsa',
pvalueCutoff = 0.05)
head(kk)[,1:6]
- 运行如下:
> load(file='GSE42872_DEG.Rdata')
> gene=head(rownames(nrDEG),1000) #此处我们应该取到其上调的486个基因
> gene [1] "CD36" "DUSP6" "DCT" "SPRY2" "MOXD1"
[6] "ETV4" "ETV5" "ST6GALNAC2" "DTL" "NUPR1"
[11] "LDLR" "SPRY4" "FST" "TXNIP" "CCND1"
[16] "IER3" "SERPINF1" "FAM111B" "CD24" "RNF150"
...
[986] "SASS6" "NUP58" "TSPAN13" "MDGA2" "FJX1"
[991] "MMD" "CKS1B" "PSAP" "RMI1" "TRIM51"
[996] "EMG1" "CENPP" "LINC00346" "MAK16" "TMEM232"
> ## KEGG pathway analysis,来自Y叔的clustrprofile
> library(clusterProfiler)
> gene.df <- bitr(gene, fromType = "SYMBOL",
+ toType = c("ENSEMBL", "ENTREZID"),
+ OrgDb = org.Hs.eg.db)
'select()' returned 1:many mapping between keys and columns
Warning message:
In bitr(gene, fromType = "SYMBOL", toType = c("ENSEMBL", "ENTREZID"), :
1.3% of input gene IDs are fail to map...
> head(gene.df)
SYMBOL ENSEMBL ENTREZID
1 CD36 ENSG00000135218 948
2 DUSP6 ENSG00000139318 1848
3 DCT ENSG00000080166 1638
4 SPRY2 ENSG00000136158 10253
5 MOXD1 ENSG00000079931 26002
6 ETV4 ENSG00000175832 2118
> # 此处包中接受的gene为ENTREZID,那我们将gene.df中的ENTREZID赋值给包中的gene
> kk <- enrichKEGG(gene = gene.df$ENTREZID,
+ organism = 'hsa',
+ pvalueCutoff = 0.05)
> head(kk)[,1:6]
ID Description GeneRatio BgRatio pvalue
hsa03030 hsa03030 DNA replication 18/427 36/7493 9.927795e-14
hsa04110 hsa04110 Cell cycle 30/427 124/7493 6.292081e-12
hsa05322 hsa05322 Systemic lupus erythematosus 27/427 133/7493 4.890465e-09
hsa05034 hsa05034 Alcoholism 28/427 180/7493 9.561579e-07
hsa05203 hsa05203 Viral carcinogenesis 29/427 201/7493 2.958811e-06
hsa03460 hsa03460 Fanconi anemia pathway 13/427 54/7493 7.211884e-06
p.adjust
hsa03030 2.779783e-11
hsa04110 8.808914e-10
hsa05322 4.564434e-07
hsa05034 6.693106e-05
hsa05203 1.656934e-04
hsa03460 3.365546e-04
数据处理
rm(list=ls())
### ---------------
###
### Create: Jianming Zeng
### Date: 2018-07-09 20:11:07
### Email: [email protected]
### Blog: http://www.bio-info-trainee.com/
### Forum: http://www.biotrainee.com/thread-1376-1-1.html
### CAFS/SUSTC/Eli Lilly/University of Macau
### Update Log: 2018-07-09 First version
###
### ---------------
load(file='GSE42872_DEG.Rdata')
source('functions.R')
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
df <- bitr(rownames(DEG), fromType = "SYMBOL",
toType = c( "ENTREZID"),
OrgDb = org.Hs.eg.db) #SYMBOL转为ENTREZID
head(df)
head(DEG)
DEG$SYMBOL = rownames(DEG)
DEG=merge(DEG,df,by='SYMBOL')
head(DEG)
gene_up= DEG[DEG$change == 'UP','ENTREZID']
gene_down=DEG[DEG$change == 'DOWN','ENTREZID']
gene_diff=c(gene_up,gene_down)
gene_all=as.character(DEG[ ,'ENTREZID'] )
data(geneList, package="DOSE") #加载genelist这个包
head(geneList)
boxplot(geneList) # 是一个0往上下走的数值
boxplot(DEG$logFC) #那我此处DEG$logFC同genelist,就是我们所需要的数值
geneList=DEG$logFC #所以将DEG$logFC赋值给包中的genelist
names(geneList)=DEG$ENTREZID #命名?
geneList=sort(geneList,decreasing = T)
KEGG 分析
## KEGG pathway analysis
if(T){
### over-representation test,
kk.up <- enrichKEGG(gene = gene_up,
organism = 'hsa',
universe = gene_all,
pvalueCutoff = 0.9,
qvalueCutoff =0.9) #当中的pvalueCutoff、qvalueCutoff值是否需要更改?
head(kk.up)[,1:6]
kk.down <- enrichKEGG(gene = gene_down,
organism = 'hsa',
universe = gene_all,
pvalueCutoff = 0.05)
head(kk.down)[,1:6]
kk.diff <- enrichKEGG(gene = gene_diff,
organism = 'hsa',
pvalueCutoff = 0.05)
head(kk.diff)[,1:6]
kegg_diff_dt <- as.data.frame(kk.diff)
kegg_down_dt <- as.data.frame(kk.down)
kegg_up_dt <- as.data.frame(kk.up)
down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.05,];down_kegg$group=-1
up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.05,];up_kegg$group=1
g_kegg=kegg_plot(up_kegg,down_kegg)
print(g_kegg)
#ggsave(g_kegg,filename = 'kegg_up_down.png')
GSEA
### GSEA
kk_gse <- gseKEGG(geneList = geneList,
organism = 'hsa',
nPerm = 1000,
minGSSize = 120,
pvalueCutoff = 0.9, #后期使用根据自己的实际情况修改这个值
verbose = FALSE)
head(kk_gse)[,1:6]
dev.off()
gseaplot(kk_gse, geneSetID = rownames(kk_gse[1,]))
down_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore < 0,];down_kegg$group=-1
up_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore > 0,];up_kegg$group=1
g_kegg=kegg_plot(up_kegg,down_kegg)
print(g_kegg)
#ggsave(g_kegg,filename = 'kegg_up_down_gsea.png')
}
GO
### GO database analysis
g_list=list(gene_up=gene_up,
gene_down=gene_down,
gene_diff=gene_diff)
if(F){
go_enrich_results <- lapply( g_list , function(gene) {
lapply( c('BP','MF','CC') , function(ont) {
cat(paste('Now process ',ont ))
ego <- enrichGO(gene = gene,
universe = gene_all,
OrgDb = org.Hs.eg.db,
ont = ont ,
pAdjustMethod = "BH",
pvalueCutoff = 0.99,
qvalueCutoff = 0.99,
readable = TRUE)
print( head(ego) )
return(ego)
})
})
#save(go_enrich_results,file = 'go_enrich_results.Rdata')
}
load(file = 'go_enrich_results.Rdata')
n1= c('gene_up','gene_down','gene_diff')
n2= c('BP','MF','CC')
for (i in 1:3){
for (j in 1:3){
fn=paste0('dotplot_',n1[i],'_',n2[j],'.png')
cat(paste0(fn,'\n'))
png(fn,res=150,width = 1080)
print( dotplot(go_enrich_results[[i]][[j]] ))
dev.off()
}
}
# TODO:
# ~~~~~~
- 最后会出现如下这些图:
dotplot_gene_up_BP.png
dotplot_gene_up_MF.png
dotplot_gene_up_CC.png
dotplot_gene_down_BP.png
dotplot_gene_down_MF.png
dotplot_gene_down_CC.png
dotplot_gene_diff_BP.png
dotplot_gene_diff_MF.png
dotplot_gene_diff_CC.png
生存分析
- 对有临床信息的表达矩阵批量做生存分析
我就直接放代码了,自己运行一遍吧。
rm(list=ls())
### ---------------
###
### Create: Jianming Zeng
### Date: 2018-07-09 20:11:07
### Email: [email protected]
### Blog: http://www.bio-info-trainee.com/
### Forum: http://www.biotrainee.com/thread-1376-1-1.html
### CAFS/SUSTC/Eli Lilly/University of Macau
### Update Log: 2018-07-09 First version
###
### ---------------
# R里面实现生存分析非常简单!
# 用my.surv <- surv(OS_MONTHS,OS_STATUS=='DECEASED')构建生存曲线。
# 用kmfit2 <- survfit(my.surv~TUMOR_STAGE_2009)来做某一个因子的KM生存曲线。
# 用 survdiff(my.surv~type, data=dat)来看看这个因子的不同水平是否有显著差异,其中默认用是的logrank test 方法。
# 用coxph(Surv(time, status) ~ ph.ecog + tt(age), data=lung) 来检测自己感兴趣的因子是否受其它因子(age,gender等等)的影响。
load(file='GSE11121_new_exprSet.Rdata')
exprSet=new_exprSet
dim(exprSet)
colnames(phe)
colnames(phe)=c('event','grade','node','size','time')
colnames(phe)
phe = as.data.frame(apply(phe,2,as.numeric)) #2是随便写的一个
dev.off()
boxplot(phe$size) #用boxplot看看大size的大小
phe$size=ifelse(phe$size>median(phe$size),'big','small')#根据median(size)
library(survival)
library(survminer)
# 利用ggsurvplot快速绘制漂亮的生存曲线图
sfit <- survfit(Surv(time, event)~size, data=phe) #time、event与size的关系
sfit
summary(sfit)
ggsurvplot(sfit, conf.int=F, pval=TRUE)
ggsurvplot(sfit,palette = c("#E7B800", "#2E9FDF"),
risk.table =TRUE,pval =TRUE,
conf.int =TRUE,xlab ="Time in months",
ggtheme =theme_light(),
ncensor.plot = TRUE)
## 多个 ggsurvplots作图生存曲线代码合并代码公布
sfit1=survfit(Surv(time, event)~size, data=phe)
sfit2=survfit(Surv(time, event)~grade, data=phe)
splots <- list()
splots[[1]] <- ggsurvplot(sfit1,pval =TRUE, data = phe, risk.table = TRUE)
splots[[2]] <- ggsurvplot(sfit2,pval =TRUE, data = phe, risk.table = TRUE)
# Arrange multiple ggsurvplots and print the output
arrange_ggsurvplots(splots, print = TRUE, ncol = 2, nrow = 1, risk.table.height = 0.4)
## 挑选感兴趣的基因做差异分析
phe$CBX4=ifelse(exprSet['CBX4',]>median(exprSet['CBX4',]),'high','low')
table(phe$CBX4)
ggsurvplot(survfit(Surv(time, event)~CBX4, data=phe), conf.int=F, pval=TRUE)
phe$CBX6=ifelse(exprSet['CBX6',]>median(exprSet['CBX6',]),'high','low')
table(phe$CBX6)
ggsurvplot(survfit(Surv(time, event)~CBX6, data=phe), conf.int=F, pval=TRUE)
## 批量生存分析 使用 logrank test 方法
mySurv=with(phe,Surv(time, event))
log_rank_p <- apply(exprSet , 1 , function(gene){
#gene=exprSet[1,]
phe$group=ifelse(gene>median(gene),'high','low')
data.survdiff=survdiff(mySurv~group,data=phe)
p.val = 1 - pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1)
return(p.val)
})
log_rank_p=sort(log_rank_p)
head(log_rank_p)
boxplot(log_rank_p)
phe$H6PD=ifelse(exprSet['H6PD',]>median(exprSet['H6PD',]),'high','low')
table(phe$H6PD)
ggsurvplot(survfit(Surv(time, event)~H6PD, data=phe), conf.int=F, pval=TRUE)
## 批量生存分析 使用 coxph 回归方法
colnames(phe)
mySurv=with(phe,Surv(time, event))
cox_results <-apply(exprSet , 1 , function(gene){
group=ifelse(gene>median(gene),'high','low')
survival_dat <- data.frame(group=group,grade=phe$grade,size=phe$size,stringsAsFactors = F)
m=coxph(mySurv ~ grade + size + group, data = survival_dat)
beta <- coef(m)
se <- sqrt(diag(vcov(m)))
HR <- exp(beta)
HRse <- HR * se
#summary(m)
tmp <- round(cbind(coef = beta, se = se, z = beta/se, p = 1 - pchisq((beta/se)^2, 1),
HR = HR, HRse = HRse,
HRz = (HR - 1) / HRse, HRp = 1 - pchisq(((HR - 1)/HRse)^2, 1),
HRCILL = exp(beta - qnorm(.975, 0, 1) * se),
HRCIUL = exp(beta + qnorm(.975, 0, 1) * se)), 3)
return(tmp['grouplow',])
})
cox_results=t(cox_results)
table(cox_results[,4]<0.05)
cox_results[cox_results[,4]<0.05,]
length(setdiff(rownames(cox_results[cox_results[,4]<0.05,]),
names(log_rank_p[log_rank_p<0.05])
))
length(setdiff( names(log_rank_p[log_rank_p<0.05]),
rownames(cox_results[cox_results[,4]<0.05,])
))
length(unique( names(log_rank_p[log_rank_p<0.05]),
rownames(cox_results[cox_results[,4]<0.05,])
))
save(log_rank_p,cox_results,exprSet,phe,file = 'surviva.Rdata')
写在最后吧:
- 上面所有代码都来自jimmy大大的;
- 我这个只是跟着视频把GEO数据在现过程,这个教程只是帮组你参考;
- 不需要跟着我的教程走,跟着jimmy大大的视频走一遍就更完美了。