TCGA数据挖掘--神经胶质瘤(GBM)差异mRNA分析

文章目录

    • 1.TCGA-GBM数据下载
    • 2.提取mRNA数据
    • 3.ID转换,ensemblID转换为symbolID
    • 4.数据整理
    • 5.PCA
    • 6.差异表达
    • 7.富集分析

1.TCGA-GBM数据下载

{
  library(TCGAbiolinks)
  library(SummarizedExperiment)
  query <- GDCquery(project = 'TCGA-GBM', 
                    data.category = "Transcriptome Profiling", 
                    data.type = "Gene Expression Quantification", 
                    workflow.type = "HTSeq - Counts")
  
  GDCdownload(query, method = "api", files.per.chunk = 100)
  expdat <- GDCprepare(query = query)
  expr = assay(expdat)
  expr = as.data.frame(expr)
  save(expr,file = 'expr.Rdata')
}

2.提取mRNA数据


{
  library(rtracklayer)
  library(dplyr)
  gtf <- import('Homo_sapiens.GRCh38.97.chr.gtf') 
  gtf_df <- as.data.frame(gtf)  
  gene_df <- select(gtf_df,
                    c(gene_id,gene_name,gene_biotype))  
  index <- duplicated(gene_df$gene_id) 
  gene_df = gene_df[!index,]
  dim(gene_df)
  mRNA_df = gene_df[gene_df$gene_biotype == 'protein_coding',]  
  dim(mRNA_df)
  save(mRNA_df,file = 'mRNA_df.Rdata')

  load("D:/个人空间/文件保存/R-project/GBM/expr.Rdata")
  exprSet = expr[match(mRNA_df$gene_id,rownames(expr)),]
  dim(exprSet)
  exprSet = na.omit(exprSet)
  dim(exprSet)
  save(exprSet, file = 'exprSet.Rdata')
}

3.ID转换,ensemblID转换为symbolID

 {
  library(limma)
  load("D:/个人空间/文件保存/R-project/GBM/exprSet.Rdata")
  exprSet$names = rownames(exprSet)
  exprSet$names = mRNA_df[match(exprSet$names,mRNA_df$gene_id),2]
  dim(exprSet)
  table(duplicated(exprSet$names)) # 有3个重复的基因名
  # 对重复基因名取平均表达量,然后将基因名作为行名
  exprSet = avereps(exprSet[,-ncol(exprSet)],ID = exprSet$names) 
  dim(exprSet)
  save(exprSet, file = 'exprSet_names_by_symbol.Rdata')
}

4.数据整理

{
  # 4.1 去除低表达量的基因
  load("D:/个人空间/文件保存/R-project/GBM/exprSet_names_by_symbol.Rdata")
  pick_row <- apply(exprSet, 1, function(x){
    sum(x == 0) < 40
  })
  exprSet1 <- exprSet[pick_row,]

  # 4.2 分组(癌症组织和癌旁组织)
  library(stringr)
  tumor <- colnames(exprSet1)[as.integer(substr(colnames(exprSet1),14,15)) < 10]
  normal <- colnames(exprSet1)[as.integer(substr(colnames(exprSet1),14,15)) >= 10]

  tumor_sample <- exprSet1[,tumor]
  normal_sample <- exprSet1[,normal]

  exprSet_by_group <- cbind(tumor_sample,normal_sample)
  group_list <- c(rep('tumor',ncol(tumor_sample)),rep('normal',ncol(normal_sample)))

  save(exprSet_by_group, group_list, file = 'exprSet_by_group_list.Rdata')
}


5.PCA

{
  library(FactoMineR)
  library(factoextra)
  load("D:/个人空间/文件保存/R-project/GBM/exprSet_by_group_list.Rdata")
  data = as.data.frame(t(exprSet_by_group))
  data <- cbind(data,group = as.factor(group_list))

  pca <- PCA(data[,-ncol(data)], graph = FALSE)
  eig.val <- get_eigenvalue(pca)# 一列:特征值,二列:特征值的方差贡献度,三列:累计方差贡献度
  fviz_eig(pca, addlabels = TRUE, ylim = c(0, 100))
  fviz_pca_ind(pca,
               geom.ind = "point", 
               col.ind = data$group, 
               palette = "jco", 
               addEllipses = TRUE, 
               legend.title = "Groups",
               title = 'PCA')
    }

TCGA数据挖掘--神经胶质瘤(GBM)差异mRNA分析_第1张图片
TCGA数据挖掘--神经胶质瘤(GBM)差异mRNA分析_第2张图片

6.差异表达

  • 差异表达
 {
  library(limma)
  library(edgeR)
  DGElist <- DGEList( counts = exprSet_by_group, group = factor(group_list))

  # 挑选感兴趣基因
  keep_gene <- rowSums( cpm(DGElist) > 1 ) >= 2 
  table(keep_gene)
  DGElist <- DGElist[ keep_gene,keep.lib.sizes = FALSE]

  DGElist <- calcNormFactors(DGElist) # 计算归一化因子以对齐计数矩阵的列
  DGElist$samples
  design <- model.matrix( ~0 + factor(group_list))
  rownames(design) <- colnames(DGElist)
  colnames(design) <- levels(factor(group_list))

  # 转换RNA-Seq数据为线性建模做好准备
  v <- voom(DGElist, design, plot = TRUE, normalize = "quantile")
  # 给出一系列阵列,为每个基因拟合线性模型
  fit <- lmFit(v, design)
  # 构造对应于一组参数的指定对比的对比矩阵。
  cont.matrix <- makeContrasts(contrasts = c('tumor-normal'), levels = design)
  # 给定适合微阵列数据的线性模型,计算给定对比组的估计系数和标准误差
  fit2 <- contrasts.fit(fit, cont.matrix)
  # 差分表达的经验贝叶斯统计
  fit2 <- eBayes(fit2)
  nrDEG_limma_voom = topTable(fit2, coef = 'tumor-normal', n = Inf)
  nrDEG_limma_voom = na.omit(nrDEG_limma_voom)
  head(nrDEG_limma_voom)
  save(nrDEG_limma_voom,file = 'nrDEG.Rdata')
 }

  • 火山图
library(ggplot2)
  library(ggrepel)
  nrDEG <- nrDEG_limma_voom
  nrDEG$change <- ifelse(nrDEG$adj.P.Val < 0.01 & abs(nrDEG$logFC) > 2.5,
                         ifelse(nrDEG$logFC > 2.5,'UP','DOWN'),
                                  'NOT')
  table(nrDEG$change)
  save(nrDEG,file = 'nrDEG_by_group.Rdata')

  # 重点关注基因
  nrDEG$sign <- ifelse(nrDEG$adj.P.Val < 0.001 & abs(nrDEG$logFC) > 6.5,
                       rownames(nrDEG),
                       NA)
  table(nrDEG$sign)
  ggplot(data= nrDEG, aes(x = logFC, y = -log10(adj.P.Val), color = change)) +
    geom_point(alpha=0.8, size = 1) +
    theme_bw(base_size = 15) +
    theme(plot.title=element_text(hjust=0.5),   #  标题居中
          panel.grid.minor = element_blank(),
          panel.grid.major = element_blank()) + # 网格线设置为空白
    geom_hline(yintercept=2 ,linetype=4) +
    geom_vline(xintercept=c(-2.5,2.5) ,linetype=4 ) +
    scale_color_manual(name = "", 
                       values = c("red", "green", "black"),
                       limits = c("UP", "DOWN", "NOT")) +
    geom_label_repel(aes(label=sign), # 防止标签过多重叠
                     fontface="bold",
                     color="grey50",
                     box.padding=unit(0.35, "lines"),  # 文本框周边填充
                     point.padding=unit(0.5, "lines"), # 点周边填充
                     segment.colour = "grey50", # 连接点与标签的线段的颜色
                     force = T) + 
    labs(title = 'GBM DEG volcano')

TCGA数据挖掘--神经胶质瘤(GBM)差异mRNA分析_第3张图片

  • 热图
library( "pheatmap" )
  nrDEG_Z = nrDEG[ order( nrDEG$logFC ), ]
  nrDEG_F = nrDEG[ order( -nrDEG$logFC ), ]
  choose_gene = c( rownames( nrDEG_Z )[1:50], rownames( nrDEG_F )[1:50] )
  choose_matrix = exprSet_by_group[ choose_gene, ]
  choose_matrix = t( scale( t( choose_matrix ) ) )

  choose_matrix[choose_matrix > 2] = 2
  choose_matrix[choose_matrix < -2] = -2

  annotation_col = data.frame( group = factor( group_list ) )
  rownames( annotation_col ) = colnames( exprSet_by_group )

  pheatmap( fontsize_row = 4,
            choose_matrix, 
            annotation_col = annotation_col, 
            show_rownames = T,
            show_colnames = F,
            annotation_legend = T, 
            cluster_cols = T,
            filename = 'heatmap.png')

TCGA数据挖掘--神经胶质瘤(GBM)差异mRNA分析_第4张图片

7.富集分析

  • kegg_enrichment_analysis
  library( "clusterProfiler" )
  library( "org.Hs.eg.db" )
  load("nrDEG_by_group.Rdata")
  nrDEG$SYMBOL <- rownames(nrDEG)
  df <- bitr( rownames( nrDEG ), fromType = "SYMBOL", toType = c( "ENTREZID" ), 
              OrgDb = org.Hs.eg.db )
  head( df )
  nrDEG = merge( nrDEG, df, by = 'SYMBOL' )
  head( nrDEG )

  gene_up = nrDEG[ nrDEG$change == 'UP', 'ENTREZID' ] 
  gene_down = nrDEG[ nrDEG$change == 'DOWN', 'ENTREZID' ]
  gene_diff = c( gene_up, gene_down )
  gene_all = as.character(nrDEG[ ,'ENTREZID'] )
  g_list = list( gene_up = gene_up, gene_down = gene_down, gene_diff = gene_diff)

  kk.up <- enrichKEGG(gene = gene_up,
                      organism = 'hsa',
                      universe = gene_all,
                      pvalueCutoff = 0.01,
                      qvalueCutoff = 0.01)
  kk.dowm <- enrichKEGG(gene = gene_down,
                      organism = 'hsa',
                      universe = gene_all,
                      pvalueCutoff = 0.01,
                      qvalueCutoff = 0.01)
  
  kegg_down_dt <- as.data.frame(kk.dowm)
  kegg_up_dt <- as.data.frame( kk.up )
  down_kegg <- kegg_down_dt[ kegg_down_dt$pvalue < 0.05, ]
  down_kegg$group <- 'down_pathway'
  up_kegg <- kegg_up_dt[ kegg_up_dt$pvalue < 0.05, ]
  up_kegg$group <- 'up_pathway'
  dat = rbind(up_kegg,down_kegg)
  dat$pvalue = -log10(dat$pvalue)
  dat$group =  factor(dat$group)

  library(ggpubr)
  ggbarplot(dat,x = 'Description',y = 'pvalue',
            fill = 'group',
            color = 'white',
            palette = 'jco',
            sort.val = 'asc',
            xlab = 'Pathway names',
            ylab = '-log10 P-value',
            title = 'Pathway enrichment') +
    rotate() +
    theme_minimal()

TCGA数据挖掘--神经胶质瘤(GBM)差异mRNA分析_第5张图片

  • GO_enrichment_analysis
BP <- enrichGO( gene          =  gene_diff,
                  universe      =  gene_all,
                  OrgDb         =  org.Hs.eg.db,
                  keyType       = 'ENTREZID',
                  ont           =  'BP',
                  pAdjustMethod = "BH",
                  pvalueCutoff  =  0.01,
                  qvalueCutoff  =  0.01,
                  readable      =  TRUE)
  barplot(BP,showCategory=20)
  dotplot(BP,showCategory=20)

TCGA数据挖掘--神经胶质瘤(GBM)差异mRNA分析_第6张图片
TCGA数据挖掘--神经胶质瘤(GBM)差异mRNA分析_第7张图片

CC <- enrichGO( gene          =  gene_diff,
                  universe      =  gene_all,
                  OrgDb         =  org.Hs.eg.db,
                  keyType       = 'ENTREZID',
                  ont           =  'CC',
                  pAdjustMethod = "BH",
                  pvalueCutoff  =  0.01,
                  qvalueCutoff  =  0.01,
                  readable      =  TRUE)
 barplot(CC,showCategory=20)
 dotplot(CC,showCategory=20)

TCGA数据挖掘--神经胶质瘤(GBM)差异mRNA分析_第8张图片
TCGA数据挖掘--神经胶质瘤(GBM)差异mRNA分析_第9张图片

 MF <- enrichGO( gene          =  gene_diff,
                  universe      =  gene_all,
                  OrgDb         =  org.Hs.eg.db,
                  keyType       = 'ENTREZID',
                  ont           =  'MF',
                  pAdjustMethod = "BH",
                  pvalueCutoff  =  0.01,
                  qvalueCutoff  =  0.01,
                  readable      =  TRUE)
  barplot(MF,showCategory=20) 
  dotplot(MF,showCategory=20) +
    scale_x_continuous(limits = c(0,0.08), breaks = c(0.00,0.04,0.08))

TCGA数据挖掘--神经胶质瘤(GBM)差异mRNA分析_第10张图片
TCGA数据挖掘--神经胶质瘤(GBM)差异mRNA分析_第11张图片

  • DO_enrichment_analysis
library(DOSE)
  enrich.do <- enrichDO(gene = gene_diff,
                        universe = gene_all,
                        ont = 'DO',
                        pvalueCutoff = 0.05,
                        pAdjustMethod = 'BH',
                        minGSSize = 5,
                        maxGSSize = 500,
                        qvalueCutoff = 0.05,
                        readable = F)
  barplot(enrich.do)
  dotplot(enrich.do)

TCGA数据挖掘--神经胶质瘤(GBM)差异mRNA分析_第12张图片
TCGA数据挖掘--神经胶质瘤(GBM)差异mRNA分析_第13张图片

本博客内容将同步更新到个人微信公众号生信玩家。欢迎大家关注~~~
在这里插入图片描述

你可能感兴趣的:(转录组学,TCGA,data-mining)