Q1: 一行行的运行:https://github.com/jmzeng1314/5years/blob/master/learn-R/tasks/top50ggplot.Rmd 代码
Q2: 对RNAseq_expr挑选MAD值最大的100个基因的表达矩阵绘制热图
load('../view_new.Rdata')
tmp<- RNAseq_expr[rowSums(RNAseq_expr>0)>1,]
tmp <-tmp[names(tail(sort(apply(tmp,1,mad)),100)),]
tmp <- t(scale(t(tmp)))
pheatmap::pheatmap(tmp,filename = 'MAD_top100_heatmap.png')
M <- cor(log2(tmp))
####加上annotation col
annotation_col <- data.frame(group_ist = RNAseq_gl)
annotation_col$sample <- colnames(tmp)
rownames(annotation_col) <- colnames(tmp)
annotation_col
pheatmap::pheatmap(M,annotation_col = annotation_col,filename = 'correaltion_heatmap.png')
Q3: 对RNAseq_expr进行主成分分析并且绘图
q3 <- as.data.frame(t(e1))
q3[1:4,1:4]
q3$group <- RNAseq_gl
q3[,c(1,ncol(q3))]
qc_pca <- prcomp(q3)
#Error in colMeans(x, na.rm = TRUE) : 'x'必需为数值
q3_pca <- prcomp(q3[,1:(ncol(q3)-1)])
par(mar=c(2,2,2,2))
png(filename = 'pca.png',res = 100)
autoplot(q3_pca,data = q3,colour = 'group')+theme_classic()
dev.off()
# Q4: 对RNAseq_expr进行差异分析并且绘制火山图
setwd('limma /')
rm(list = ls())
suppressPackageStartupMessages(library(airway))
data(airway)
RNAseq_expr=assay(airway)
group_list=colData(airway)[,3]
dim(RNAseq_expr)
save(RNAseq_expr,group_list,file = 'rnaseq.Rdata')
rm(list = ls())
options(stringsAsFactors = F)
load('rnaseq.Rdata')
#### 三数据之一 ####
##### 表达矩阵 #####
colnames(RNAseq_expr)
pheatmap::pheatmap(cor(RNAseq_expr))
group_list
tmp=data.frame(group=group_list)
rownames(tmp)=colnames(RNAseq_expr)
# 组内的样本的相似性应该是要高于组间的!
pheatmap::pheatmap(cor(RNAseq_expr),annotation_col = tmp)
dim(RNAseq_expr)
RNAseq_expr=RNAseq_expr[apply(RNAseq_expr,1, function(x) sum(x>1) > 5),]
#RNAseq_expr=RNAseq_expr[rowSums(RNAseq_expr>1)>5,],等价于上一句
dim(RNAseq_expr)
RNAseq_expr=log(edgeR::cpm(RNAseq_expr)+1)
suppressPackageStartupMessages(library(limma))
########## QC ############
par(cex = 0.7)
n.sample=ncol(RNAseq_expr)
if(n.sample>40) par(cex = 0.5)
cols <- rainbow(n.sample*1.2)
boxplot(RNAseq_expr, col = cols,main="expression value",las=2)
########## QC ############
#### 三数据之二 ####
###### 分组矩阵 #####
?factor
?model.matrix
design <- model.matrix(~0+factor(group_list))
colnames(design)=levels(factor(group_list))
rownames(design)=colnames(RNAseq_expr)
design
#### 三数据之三 ####
##### 差异比较矩阵 #####
contrast.matrix<-makeContrasts(paste0(unique(group_list),collapse = "-"),levels = design)
contrast.matrix
### 三步骤 ###
##step1_________lmFit
fit <- lmFit(RNAseq_expr,design)
##step2_________eBayes
fit2 <- contrasts.fit(fit, contrast.matrix) ##这一步很重要,大家可以自行看看效果
fit2 <- eBayes(fit2) ## default no trend !!!
##eBayes() with trend=TRUE
##step3_________topTable
tempOutput = topTable(fit2, coef=1, n=Inf)
nrDEG = na.omit(tempOutput)
#
write.csv(nrDEG,"limma_notrend.results.csv",quote = F)
head(nrDEG)
source('../functions.R')###JM github 上下载
DEseq_DEG=nrDEG
nrDEG=DEseq_DEG[,c(1,4)]
colnames(nrDEG)=c('log2FoldChange','pvalue')
draw_h_v(RNAseq_expr,nrDEG,'limma')
limma_need_DEG_top50_heatmap.png
#####################DEseq2包进行差异分析 #######################
#差异分析套路与limma差不多,DEseq2针对count类型的数据
## 参考:http://www.bio-info-trainee.com/bioconductor_China/software/DESeq2.html
### 三个数据 ——表达矩阵,分组矩阵,差异比较矩阵
#### 表达矩阵,前面已经下载了,就直接载入了
rm(list = ls())
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(limma))
load('./limma /rnaseq.Rdata')
suppressPackageStartupMessages(library(airway))
#### 三个步骤
#### step1 构造DDS对象、####steo2 DESeq函数进行normalization处理、####step3 results函数来提取差异比较结果
#### step1 构造DDS对象
colData <- data.frame(row.names=colnames(RNAseq_expr),
group_list=group_list
)
dds <- DESeqDataSetFromMatrix(countData = RNAseq_expr,
colData = colData,
design = ~ group_list)
dds
# class: DESeqDataSet
# dim: 64102 8
# metadata(1): version
# assays(1): counts
# rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
# rowData names(0):
# colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
# colData names(1): group_list
#### 这里是jimmy提到的另一种构造dds的方法
# library(airway)
# data(airway)
# suppressMessages(library(DESeq2))
# dds <- DESeqDataSet(airway, design = ~ cell+ dex)
# design(dds) <- ~ dex
###
# ## 构造好的对象还可以更改分组信息
####steo2 DESeq函数进行normalization处理
suppressMessages(dds2 <- DESeq(dds))
##直接用DESeq函数即可
## 下面的代码如果你不感兴趣不需要运行,免得误导你
## 就是看看normalization前面的数据分布差异
# rld <- rlogTransformation(dds2) ## 得到经过DESeq2软件normlization的表达矩阵!
# RNAseq_expr_new=assay(rld)
# par(cex = 0.7)
# n.sample=ncol(RNAseq_expr)
# if(n.sample>40) par(cex = 0.5)
# cols <- rainbow(n.sample*1.2)
# par(mfrow=c(2,2))
# boxplot(RNAseq_expr, col = cols,main="expression value",las=2)
# boxplot(RNAseq_expr_new, col = cols,main="expression value",las=2)
# hist(RNAseq_expr)
# hist(RNAseq_expr_new)
####step3 results函数来提取差异比较结果
resultsNames(dds2)
#[1] "Intercept" "group_list_untrt_vs_trt"
## 其实如果只分成了两组,并没有必要指定这个比较矩阵!
#res <- results(dds2, contrast=c("group_list","treated","untreated"))
### 如果是第二种方法构建的dds,指定比较矩阵时,应注意对象
res <- results(dds2)
resOrdered <- res[order(res$padj),]
resOrdered=as.data.frame(resOrdered)
head(resOrdered)
DEG <- as.data.frame(resOrdered)
table(is.na(DEG))
DEG <- na.omit(DEG)
nrDEG <- DEG
DEseq_DEG <- nrDEG
write.csv(DEseq_DEG,file = 'DESeq_DEG.csv',quote = F)
###直接下载jimmyGitHub上的functions.R,调用后,直接出图
source('functions.R')
draw_h_v(RNAseq_expr,DEseq_DEG,'DESeq2')
### 对比后,可以发现,limma和DESeq2得到的差异基因还是有一些区别的,据此画出的火山图也有较大的区别
DESeq2_data_pre_nor vs data_post_nor.png
DEseq2_need_DEG_top50_heatmap.png
#nrDEG=DEseq_DEG[,c(2,6)]
#colnames(nrDEG)=c('log2FoldChange','pvalue')
## 土味火山图
###plot(DEseq_DEG$log2FoldChange, -log10(DEseq_DEG$pvalue))
Juicy_natural_volcano.png
#### 注入灵魂
logFC_cutoff <- with(nrDEG,mean(abs( log2FoldChange)) + 2*sd(abs( log2FoldChange)) )
nrDEG$change = as.factor(ifelse(nrDEG$pvalue < 0.05 & abs(nrDEG$log2FoldChange) > logFC_cutoff,
ifelse(nrDEG$log2FoldChange > logFC_cutoff ,'UP','DOWN'),'NOT'))
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
'\nThe number of up gene is ',nrow(nrDEG[nrDEG$change =='UP',]) ,
'\nThe number of down gene is ',nrow(nrDEG[nrDEG$change =='DOWN',]))
volcano = ggplot(data=nrDEG,
aes(x=log2FoldChange, y=-log10(pvalue),
color=change)) +
geom_point(alpha=0.4, size=1.75) +
xlab("log2 fold change") + ylab("-log10 p-value") +
ggtitle(this_tile) + theme(plot.title = element_text(size=15,hjust = 0.5)) +
scale_colour_manual(values = c('blue','black','red'))+
theme_classic()
Q5: 对RNAseq_expr进行差异分析并且绘制(平均值VS变化倍数)图
#DESeq2包里有这样一个函数 plotMA
#A simple helper function that makes a so-called "MA-plot", i.e. a scatter plot of log2 fold changes (on the y-axis)
#versus the mean of normalized counts (on the x-axis).
plotMA(res)
average_vs_foldchange.png
Q6: 绘制其中一个差异基因在两个分组的表达量boxplot并且添加统计学显著性指标
choose_gene <- rownames(nrDEG)[1]
suppressPackageStartupMessages(library(ggpubr))
suppressPackageStartupMessages(library(reshape2))
RNAseq_expr_m <- melt(RNAseq_expr)
RNAseq_expr_m$Group <- rep(levels(group_list),each = nrow(RNAseq_expr))
colnames(RNAseq_expr_m) <- c('Gene','Sample','Value', 'Group')
choose_gene_d <- RNAseq_expr_m[RNAseq_expr_m$Gene==choose_gene,]
?stat_compare_means
ggplot(choose_gene_d,aes(x=Group, y=Value,color=Group))+
geom_boxplot()+
theme_classic()+
labs(title = 'Expression boxplot',subtitle = choose_gene)
stat_compare_means(method = 't.test')
Q7: 通过org.Hs.eg.db包拿到RNAseq_expr所有基因的染色体信息,绘制染色体的基因数量条形图
ls('package:org.Hs.eg.db')
CHR <- toTable(org.Hs.egCHR)
colnames(CHR)
#[1] "gene_id" "chromosome"
ENSEMBL <- toTable(org.Hs.egENSEMBL)
colnames(ENSEMBL)
#[1] "gene_id" "ensembl_id"
ENSEMBL_ID <- data.frame(rownames(RNAseq_expr))
colnames(ENSEMBL_ID) <- "ensembl_id"
EID <- merge(ENSEMBL_ID,ENSEMBL,by="ensembl_id")
EIDC <- merge(EID,CHR,by="gene_id")
colnames(EIDC)
#[1] "gene_id" "ensembl_id" "chromosome"
suppressPackageStartupMessages(library(ggplot2))
ggplot(EIDC,aes(x=chromosome))+
geom_bar()
Q8: 在上面染色体的基因数量条形图并列叠加差异基因数量条形图
#### id的转换
DEG_ensembl <- data.frame(rownames(nrDEG))
colnames(DEG_ensembl) <- c("ensembl_id")
DEG_E <- merge(DEG_ensembl,ENSEMBL,by="ensembl_id")
DEG_EC <- merge(DEG_E,CHR,by="gene_id")
colnames(DEG_EC) <- c("DEG_gene_id","DEG_ensembl_id","Chromosome")
str(DEG_EC)
head(EIDC)
EIDC$DEG <- as.factor(ifelse(EIDC$ensembl_id %in% DEG_EC$DEG_ensembl_id,'DEG','NOT'))
ggplot(EIDC,aes(x=chromosome,fill=DEG)) +
geom_bar()+
theme_classic2()
Q9: 在oncolnc网页工具拿到GUL5基因在BRCA数据集的表达量及病人生存资料自行本地绘制生存分析图
##### 提示没有GUL5这个基因。。。。。以GULP1代替
rm(list = ls())
GULP1 <- read.csv('BRCA_51454_50_50.csv')
suppressPackageStartupMessages(library(survminer))
suppressPackageStartupMessages(library(survival))
suppressPackageStartupMessages(library(ggstatsplot))
colnames(GULP1)
ggbetweenstats(data = GULP1, x='Status', y='Expression')
vignette('survival')
vignette('survminer')
str(GULP1)
head(GULP1)
table(GULP1$Status)
# Alive Dead
# 871 135
GULP1$Status <- ifelse(GULP1$Status=='Dead',1,0)
head(GULP1)
table(GULP1$Status)
fit <- surv_fit(Surv(Days, Status) ~ Group,
data = GULP1)
b <- surv_pvalue(fit)
b$pval
ggsurvplot(fit,data = GULP1,pval =round(b$pval,3))
Q10: 在xena网页工具拿到GUL5基因在BRCA数据集的表达量及病人的PAM50分类并且绘制分类的boxplot
### 同样以GULP1代替
GULP1 <- read.table('limma /denseDataOnlyDownload.tsv',sep = '\t',header = T,na.strings = '')
colnames(GULP1)
table(GULP1$PAM50_mRNA_nature2012)
# Basal-like HER2-enriched Luminal A Luminal B Normal-like
# 98 58 231 127 8
tail(GULP1)
# sample samples PAM50_mRNA_nature2012 GULP1
# 1242 TCGA-E2-A1IO-11 TCGA-E2-A1IO-11 NA
# 1243 TCGA-E2-A1LI-11 TCGA-E2-A1LI-11 NA
# 1244 TCGA-E9-A1N8-11 TCGA-E9-A1N8-11 NA
# 1245 TCGA-E9-A1NE-11 TCGA-E9-A1NE-11 NA
# 1246 TCGA-E9-A1NH-11 TCGA-E9-A1NH-11 NA
# 1247 TCGA-XX-A899-11 TCGA-XX-A899-11 NA
table(complete.cases(GULP1))
GULP1<- GULP1[complete.cases(GULP1),]
ggplot(GULP1,aes(y=GULP1,x= PAM50_mRNA_nature2012,fill = PAM50_mRNA_nature2012))+
geom_boxplot()+
stat_boxplot(geom = 'errorbar',width = 0.1)+
theme_classic2()