2020-05-08 复现GSE137675 RNA-Seq结果(2):下游分析

  1. 将上游分析最后得到的counts.txt读入到R中
rm(list = ls())  # 魔幻操作,一键清空~
options(stringsAsFactors = F)  # 关闭字符串转因子选项
# 读入featureCounts的表达矩阵
data <- read.table("counts.txt",sep = "\t",header = T,row.names = 1)
data[1:3,]

# 查看低/零表达基因情况
dim(data)
## [1] 60662    12
table(rowSums(data==0)==ncol(data))
## FALSE  TRUE 
## 40892 19770 
# 去除低表达和不表达的基因
exprSet <- data[apply(data,1,sum)!=0,]
dim(exprSet)
## [1] 40892    12

#对exprSet行名进行重命名(去除点号和后面的数字,去除重复的基因名)
tmp0 <- exprSet
tmp0$mean <- apply(tmp0,1,mean)
tmp0 <- tmp0[order(tmp0$mean,decreasing = T),]

str <- stringr::str_split(rownames(tmp0),"\\.",simplify=T)
tmp0$id <- str[,1]
tmp0 <- tmp0[!duplicated(tmp0$id),]
rownames(tmp0) <- tmp0$id

exprSet <- tmp0[,-(ncol(tmp0)-1)]

# 保存预处理后的表达矩阵
save(exprSet,file = "data/Step01-featureCounts2exprSet.Rdata")
  1. 读入作者上传的表达矩阵GSE137675_Reads_count_featureCounts.xlsx
    并跟自己处理的结果做样本相关性分析。
    代码参考:https://mp.weixin.qq.com/s/b7UHpcicVu6yjj4aM_BGvw
rm(list = ls())  # 魔幻操作,一键清空~
options(stringsAsFactors = F)  # 关闭字符串转因子选项
# 读入作者上传的表达矩阵
tmp <- read.csv("GSE137675_Reads_count_featureCounts.csv",row.names = 1)
dim(tmp)
## [1] 40892    12
tmp2 <- tmp[apply(tmp,1,sum)!=0,]
## [1] 35904    12
tmp2$id <- rownames(tmp2)

# 合并作者上传的表达矩阵和自己走上游分析流程得到的表达矩阵
m <- merge(exprSet,tmp2,by='id')
rownames(m) <- m$id
m <- m[,-1]
dim(m)
## [1] 31523    24
pheatmap::pheatmap(cor(m))
pheatmap::pheatmap(cor(log2(m+1)))

cpmM <- log(edgeR::cpm(m+1))
pheatmap::pheatmap(cor(cpmM))

hg1 <- names(tail(sort(apply(cpmM[,1:12],1,mad)),500))
hg2 <- names(tail(sort(apply(cpmM[,13:24],1,mad)),500))
hg <- intersect(hg1,hg2)
hgM <- cpmM[hg,]
pheatmap::pheatmap(cor(hgM))
corrplot.png

对照表

CM2005.1.RNA-seq.rep1 SRR10139778
CM2005.1.RNA-seq.rep2 SRR10139779
CRMM1.RNA-seq.rep1 SRR10139780
CRMM1.RNA-seq.rep2 SRR10139781
OCM1.RNA-seq.rep1 SRR10139782
OCM1.RNA-seq.rep2 SRR10139783
OCM1a.RNA-seq.rep1 SRR10139784
OCM1a.RNA-seq.rep2 SRR10139785
OM431.RNA-seq.rep1 SRR10139786
OM431.RNA-seq.rep2 SRR10139787
PIG1.RNA-seq.rep1 SRR10139788
PIG1.RNA-seq.rep2 SRR10139789
  1. 使用R包DESeq2分析基因差异表达
rm(list = ls())  # 魔幻操作,一键清空~
options(stringsAsFactors = F)  # 关闭字符串转因子选项

# 读取基因表达矩阵信息
load(file = "data/Step01-featureCounts2exprSet.Rdata")

# 查看分组信息和表达矩阵数据
dim(exprSet)
## [1] 40884    13
colnames(exprSet)
##  [1] "SRR10139778" "SRR10139779" "SRR10139780" "SRR10139781" "SRR10139782"
##  [6] "SRR10139783" "SRR10139784" "SRR10139785" "SRR10139786" "SRR10139787"
## [11] "SRR10139788" "SRR10139789" "id" 
exprSet <- exprSet[,-ncol(exprSet)]
group_list <- c(rep("tumor",10),rep("normal",2))
names(group_list)<-colnames(exprSet)
group_list <- as.factor(group_list)

suppressMessages(library(DESeq2))
#### 第一步,构建DESeq2的DESeq对象
colData <- data.frame(row.names=colnames(exprSet),group_list=group_list)
dds <- DESeqDataSetFromMatrix(countData = exprSet,colData = colData,
                              design = ~ group_list)
#### 第二步,进行差异表达分析
f  <- 'data/Step03-DESeq2_dds2.Rdata'
if(!file.exists(f)){
  dds2 <- DESeq(dds)
  # 保存差异表达分析结果
  save(dds2, file = "data/Step03-DESeq2_dds2.Rdata")
}
#### 第三步,提取差异分析结果
load(file = "data/Step03-DESeq2_dds2.Rdata")
# 提取差异分析结果,tumor组对normal组的差异分析结果
res <- results(dds2,contrast=c("group_list","tumor","normal"))
resOrdered <- res[order(res$padj),]
head(resOrdered)

DEG <- as.data.frame(resOrdered)
# 去除差异分析结果中包含NA值的行
DESeq2_DEG = na.omit(DEG)

# 取出包含logFC和P值的两列
nrDEG=DESeq2_DEG[,c(2,6)]
colnames(nrDEG)=c('log2FoldChange','pvalue')  
save(nrDEG, DESeq2_DEG, file = "data/Step03-DESeq2_nrDEG.Rdata")

DEG <- nrDEG
logFC_cutoff <- 1.5
DEG$change = as.factor(ifelse(DEG$pvalue < 0.05 & abs(DEG$log2FoldChange) > logFC_cutoff,
                              ifelse(DEG$log2FoldChange > logFC_cutoff ,'UP','DOWN'),'STABLE'))
table(DEG$change)
##  DOWN STABLE     UP 
##  1555  20684   1964 
DOWN STABLE UP
1555 20684 1964

作者的差异基因分析结果:


DEG_res.png

你可能感兴趣的:(2020-05-08 复现GSE137675 RNA-Seq结果(2):下游分析)