题目:http://www.bio-info-trainee.com/4387.html
参考: https://www.jianshu.com/p/fab27c63af94
options(stringsAsFactors = F)
library(airway)
data(airway)
RNAseq_expr=assay(airway)
#RNAseq_expr 是一个数值型矩阵,属于连续性变量,可以探索众数、分位数和平均数 ,极差,方差和标准差等统计学指标
dex=colData(airway)[,3]
#coldata是样本信息,提取coldata中的第三列:药物dex treat&untreat
#Q1: 对RNAseq_expr的每一列绘制boxplot图
boxplot(RNAseq_expr,main = 'Boxplot of RNAseq-expr',
xlab = 'samples',ylab = 'expression')
#Q4: 对RNAseq_expr的每一列log2后绘制boxplot图,density图和条形图
#Q5: 对Q4的3个图里面添加 trt 和 untrt 组颜色区分开来
RNAseq_filter = RNAseq_expr[apply(RNAseq_expr,1,function(x) sum(x)>1),]
##过滤都是0的行
RNA_log2 <- log2(RNAseq_filter+1)
boxplot(RNA_log2,main = 'Boxplot of RNAseq_log2',
xlab = 'samples',ylab = 'expression',col= c("blue","red"),fill = as.integer(dex))
opar <- par(no.readonly=T)
par(mfrow=c(2,4))
for (i in c(1:8)) {
plot(density(RNA_log2[,i]),col=as.integer(dex)[i],main = "Density")}
par(opar)
#?如何给条形图不同组设置不同颜色
col = c("lightblue","lightgreen")
par(mfrow=c(1,1))
barplot(RNA_log2, main = 'Barplot of RNAseq-expr',
xlab = 'samples',ylab = 'expression',col = as.integer(dex))
#Q6:对RNAseq_expr的前两列画散点图并且计算线性回归方程
pairs(RNAseq_expr[,1:2])
RNAseq_df <- as.data.frame(RNAseq_expr)
fit <- lm(RNAseq_df[,1] ~ RNAseq_df[,2],data = RNAseq_df)
#lm()是最基本的拟合线性模型函数
plot(RNAseq_df[,1],RNAseq_df[,2],xlab="SRR1039508",ylab="SRR1039509")
abline(fit)
#Q7:对RNAseq_expr的所有列两两之间计算相关系数,并且热图可视化
cor = cor(RNAseq_filter)
heatmap(cor,symm = TRUE )
#Q8: 取RNAseq_expr第一行表达量绘制折线图
plot(RNAseq_filter[1,],type="l",xlab = "gene",ylab="expression",col="red")
#Q9: 取RNAseq_expr表达量最高的10个基因的行绘制多行折线图
max10 <- rownames(as.data.frame(sort(rowSums(RNAseq_expr),decreasing=T)[1:10]))
max10 <- RNAseq_filter[max10,]
plot(max10[1,],type="b",xlab = "samples",ylab="expression",pch = 1,col =1)
for (i in c(2:10)){
lines(max10[i,],type="b",xlab = "samples",ylab="expression",pch = i, col=i)
}
#ggplot
suppressMessages(library(reshape2))
suppressMessages(library(ggplot2))
suppressMessages(library("Rmisc"))
suppressMessages(library("plyr"))
#准备数据
RNAseq_L <- melt(RNA_log2)
colnames(RNAseq_L) <- c("gene","sample","expression")
RNAseq_L$group=rep(dex,each=nrow(RNA_log2))
#箱型图
box=ggplot(RNAseq_L,aes(x=sample,y=expression,fill=group))+geom_boxplot() +
theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1,size=6 ))
print(box)
# 密度图
d <- ggplot(RNAseq_L,aes(x=expression,col=sample)) +
geom_density() +
theme(legend.text = element_text(size = 8, face = "bold"),legend.key.size=unit(0.3,'cm'))
print(d)
d2 <- ggplot(RNAseq_L,aes(expression,col=group)) +
geom_density() +
theme(legend.text = element_text(size = 8, face = "bold"),legend.key.size=unit(0.3,'cm'))
print(d2)
# 条形图
bar <- ggplot(RNAseq_L,aes(x=sample,y=expression,fill= group)) +
geom_bar(stat="identity") +
theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1,size=6 ))
print(bar)
# 散点图
s <- ggplot(as.data.frame(RNAseq_expr[,1:2]),aes(x=SRR1039508,y=SRR1039509)) +
geom_point() +
geom_smooth(method = "lm")
print(s)
# 热图
melt_cor <- melt(cor)
h <- ggplot(melt_cor, aes(x=Var1, y=Var2, fill=value)) +
geom_tile()+
theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1,size= 4),
axis.text.y = element_text(size= 4)) + labs(x="",y="")
print(h)
#Q8:RNAseq_expr第一行表达量折线图
df1 = data.frame(expression = RNAseq_filter[1,])
df1$sample <- rownames(df1)
l1 <- ggplot(df1,aes(x=sample, y=expression,group =1)) + geom_line() +
geom_point() +
theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1,size=8 ))
print(l1)
#Q9: 取RNAseq_expr表达量最高的10个基因的行绘制多行折线图
df10 = melt(max10)
colnames(df10) <- c("gene","sample","expression")
l10 <- ggplot(df10,aes(x=sample,y=expression,colour=gene,group=gene)) + geom_line() + geom_point() +
theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1,size=8 ),
legend.text = element_text(size = 8, face = "bold"),legend.key.size=unit(0.3,'cm'))
print(l10)
#多图展示
multiplot(box,d,d2,bar,s,h, layout = matrix(c(1,2,3,4,5,6), nrow=3, byrow=TRUE))
#c(1, 1, 2, 3):“1, 1”代表一张图占了两格,“2”和“3”各代表一张图
multiplot(l1,l10)
生物信息学绘图
#Q2: 对RNAseq_expr挑选MAD值最大的100个基因的表达矩阵绘制热图
top100_mad <- RNAseq_expr[names(sort(apply(RNAseq_expr,1,mad),decreasing=T)[1:100]),]
top100_mad_standard = t(scale(t(top100_mad)))
pheatmap::pheatmap(top100_mad_standard)
#Q3: 对RNAseq_expr进行主成分分析并且绘图
library(ggfortify)
library(dplyr)
df = as.data.frame(t(RNAseq_expr))
df$group=dex
autoplot(prcomp( df[,1:(ncol(df)-1)] ), data=df,colour = 'group')+theme_bw()
#theme_bw()--ggtheme
#Q4: 对RNAseq_expr进行差异分析并且绘制火山图
## DEG by DESeq2
suppressMessages(library(DESeq2))
#step1: 导入表达矩阵并设置分组
dex_df=data.frame(dex=dex)
dds <- DESeqDataSetFromMatrix(countData = RNAseq_expr,
colData = dex_df,
design = ~ dex)
##表达矩阵RNAseq_expr,样本信息coldata,
##design 表示怎样设计样本的模型(这里考虑药物dex,这项是coldata的因子型变量)
dds$dex <- factor(dds$dex, levels = c("untrt","trt"))
##在design中设置factor level,如果不告诉DESeq2哪组作为对照组的话,它会自动根据字母表顺序设置!!!
##设置untrt为对照组!
#step2:Pre-filtering
##这不是必须,只是为了减少后面计算量,主要就是去除表达量非常少的行,比如设置阈值为每行表达量为10
dds_filter <- rowSums(counts(dds)) >= 10
dds <- dds[dds_filter,]
#step3
dds <- DESeq(dds)
res <- results(dds, contrast=c("dex","trt","untrt"))
#step4 shrink
##用lfcShrink收缩log2_fold_change
##log2FC estimates do not account for the large dispersion we observe with low read counts.
##As with the shrinkage of dispersion estimates, LFC shrinkage uses information from all genes to generate more accurate estimates.
library(apeglm)
resLFC <- lfcShrink(dds,coef=resultsNames(dds)[2],res=res)
#step5 logFC_cutoff
resOrdered <- res[order(res$padj),]
DEG =as.data.frame(resOrdered)
DEG = na.omit(DEG) ##返回删除NA后的向量a
nrDEG=DEG[,c(2,6)]
colnames(nrDEG)=c('log2FoldChange','pvalue')
logFC_cutoff <- with(nrDEG,mean(abs( log2FoldChange)) + 2*sd(abs( log2FoldChange)) )
nrDEG$change = as.factor(ifelse(nrDEG$pvalue < 0.05 & abs(nrDEG$log2FoldChange) > logFC_cutoff,
ifelse(nrDEG$log2FoldChange > logFC_cutoff ,'UP','DOWN'),'NOT'))
##step6 volcano map
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
'\nThe number of up gene is ',nrow(nrDEG[nrDEG$change =='UP',]) ,
'\nThe number of down gene is ',nrow(nrDEG[nrDEG$change =='DOWN',]))
volcano = ggplot(data=nrDEG,
aes(x=log2FoldChange, y=-log10(pvalue), color=change)) +
geom_point(alpha=0.4, size=1.75) +
xlab("log2 fold change") + ylab("-log10 p-value") +
ggtitle(this_tile) + theme(plot.title = element_text(size=15,hjust = 0.5)) +
scale_colour_manual(values = c('blue','black','red'))
volcano
ggsave(volcano,filename = paste0('DEseq2_volcano.png'))
##step7 平均值VS变化倍数图
##M表示log fold change,衡量基因表达量变化,上调还是下调。A表示每个基因的count的均值。
png("MA.png")
plotMA(res,ylim=c(-5,5))
#标记出表达差异最大的基因
topGene <- rownames(res)[which.min(res$padj)]
with(res[topGene, ], {
points(baseMean, log2FoldChange, col="dodgerblue", cex=2, lwd=2)
text(baseMean, log2FoldChange, topGene, pos=2, col="dodgerblue")
})
dev.off()
png("MA2.png")
MA2 <- plotMA(resLFC, ylim=c(-5,5))
topGene <- rownames(resLFC)[which.min(res$padj)]
with(resLFC[topGene, ], {
points(baseMean, log2FoldChange, col="dodgerblue", cex=2, lwd=2)
text(baseMean, log2FoldChange, topGene, pos=2, col="dodgerblue")})
#identify(),交互函数,在plot中点击后显示该点索引(行数)
idx <- identify(res$baseMean, res$log2FoldChange)
dev.off()
#Q6: 绘制其中一个差异基因在两个分组的表达量boxplot并且添加统计学显著性指标
suppressMessages(library(ggpubr))
choose_gene = rownames(nrDEG)[2]
choose_gene_d <- cbind(as.data.frame(RNAseq_expr[choose_gene,]),as.data.frame(dex))
choose_gene_d$sample = rownames(choose_gene_d)
colnames(choose_gene_d) = c("expression","dex","sample")
opar<-par(no.readonly=T)
ggplot(data = choose_gene_d,aes(y=expression,x=dex)) +
geom_boxplot() +
stat_compare_means(method = "t.test")
#Q7: 通过org.Hs.eg.db包拿到RNAseq_expr所有基因的染色体信息,绘制染色体的基因数量条形图
suppressMessages(library(org.Hs.eg.db))
CHR <- toTable(org.Hs.egCHR)
ensembl <- toTable(org.Hs.egENSEMBL)
ensembl_id <- data.frame(rownames(RNAseq_expr))
colnames(ensembl_id) <- c("ensembl_id")
eg <- merge(ensembl_id,ensembl,by="ensembl_id")
egc <- merge(eg,CHR,by="gene_id")
ggplot(data = egc,aes(x=chromosome)) + geom_bar(fill="lightblue")
#Q8: 在上面染色体的基因数量条形图并列叠加差异基因数量条形图
DEG_ensembl <- data.frame(rownames(nrDEG))
colnames(DEG_ensembl) <- c("ensembl_id")
DEG_eg <- merge(DEG_ensembl,ensembl,by="ensembl_id")
DEG_egc <- merge(DEG_eg,CHR,by="gene_id")
colnames(DEG_egc) <- c("DEG_gene_id","DEG_ensembl_id","chromosome")
egc$DEG <- as.factor(ifelse(egc$ensembl_id %in% DEG_egc$DEG_ensembl_id,'DEG','NOT'))
ggplot(egc,aes(x=chromosome,fill=DEG)) + geom_bar(stat ='count')
#Q9:在oncolnc网页工具拿到CUL5基因在BRCA数据集的表达量及病人生存资料自行本地绘制生存分析图
suppressMessages(library(ggstatsplot))
CUL5 <- read.csv(file = 'BRCA_8065_50_50.csv',header = T)
ggbetweenstats(data = CUL5, x='Status', y='Expression')
suppressMessages(library(survival))
suppressMessages(library(survminer))
CUL5$Status = ifelse(CUL5$Status=='Dead',1,0)
survf <- survfit(Surv(Days,Status)~Group,data = CUL5)
ggsurvplot(survf, conf.int = F, pval=T)
# complex survplot
ggsurvplot(survf,palette = c("orange", "blue"),
risk.table = T, pval = T,
conf.int = T, xlab = "Time of days",
ggtheme = theme_light(),
ncensor.plot = T)
#Q10. 在xena网页工具拿到CUL5基因在BRCA数据集的表达量及病人的PAM50分类并且绘制分类的boxplot
cul5 <- read.csv(file = 'denseDataOnlyDownload.tsv',sep="\t",header = T,na.strings="")
# 去除NA以后还有...读取的时候加上参数 na.strings=""
#剔除第3列为空的行
cul5<- cul5[complete.cases(cul5),]
library(RColorBrewer)
ggplot(data = cul5,aes(y=CUL5,x=PAM50_mRNA_nature2012,color=PAM50_mRNA_nature2012)) +
geom_boxplot() +
theme_light() + stat_boxplot(geom = "errorbar",width=0.2)