2019-06-29 R语言作业(中级)

R语言作业(中级)题目链接:http://www.bio-info-trainee.com/3750.html

作业1 根据R包org.Hs.eg.db找到下面ensembl 基因ID 对应的基因名(symbol)

#1.加载这个注释包org.Hs.eg.db
library(org.Hs.eg.db) 
#了解一下这个包 ?org.Hs.eg.db 发现里面含有一个函数可以查看org.Hs.eg.db包的信息 columns()
columns(org.Hs.eg.db)
#了解一个函数toTable(),
#toTable(x) turns Bimap object x into a data frame
g2s=toTable(org.Hs.egSYMBOL)
g2e=toTable(org.Hs.egENSEMBL)
head(g2s)
head(g2e)

作业2 根据R包hgu133a.db找到下面探针对应的基因名(symbol)

options(BioC_mirror='https://mirrors.ustc.edu.cn/bioc')
if(!require('hgu133a.db')) BiocManager::install('hgu133a.db',ask = F,update = F)
library(hgu133a.db)
columns(hgu133a.db)
ids=toTable(hgu133aSYMBOL)
head(ids)
#找到下面探针对应的基因名(symbol)
prb <- read.table('names.txt')
head(prb)
colnames(prb)='probe_id'
head(prb)
work2 <- merge(prb,ids,by='probe_id')

作业3 找到R包CLL内置的数据集的表达矩阵里面的TP53基因的表达量,并且绘制在 progres.-stable分组的boxplot图

library(CLL)
data("sCLLex")
#使用CLL包时,需要了解这个包 ?CLL  发现里面有一个data(sCLLex)
dim(exprSet)#表达矩阵的维度
pData(sCLLex)#sCLLex的分组情况
sampleNames(sCLLex)#sCLLex的样本名称
varMetadata(sCLLex)#sCLLex的标签描述
varLabels(sCLLex)#sCLLex的观测变量
#了解一下?sCLLex 发现里面有一个exprs()函数可以提取sCLLex的表达矩阵
library(CLL)
data("sCLLex")
exprSet=exprs(sCLLex)
#寻找TP53基因的表达量,并且绘制在 progres.-stable分组的boxplot图
head(exprSet)
#head一下发现exprSet里面是probe_id
library(hgu95av2.db)
columns(hgu95av2.db)
g3s=toTable(hgu95av2SYMBOL)
head(g3s)
which(g3s[,2]=='TP53')
g3s[c(966,997,1420),c(1,2)]
head(exprSet)
tmp=exprSet
rownames(exprSet)
class(rownames(exprSet))
tmp1 <- data.frame(rownames(exprSet),exprSet)
tmp1[1:4,1:4]
colnames(tmp1)[1] <-  'probe_id'
TP53_prob <- data.frame(g3s[c(966,997,1420),c(1,2)][,1])
names(TP53_prob) <- 'probe_id'
tmp2 <- merge(TP53_prob,tmp1,by='probe_id')
pData(sCLLex)
pData1 <- pData(sCLLex)
pData2 <- data.frame(rownames(pData1),pData1)
colnames(pData2)[1] <- 'sample'
rownames(tmp2)=tmp2[,1]
tmp2=tmp2[,-1]
tmp2_t=t(tmp2)
tmp2_t=data.frame(rownames(tmp2_t),tmp2_t)
colnames(tmp2_t)[1] <- 'sample'
#hebing
TP53_exprSet <- merge(tmp2_t,pData2,by='sample')
#使用apply求TP53三个探针的平均值
apply(TP53_exprSet[,c(2,3,4)], 1, mean)
TP53_exprSet$mean <- apply(TP53_exprSet[,c(2,3,4)], 1, mean)
#绘制在 progres.-stable分组的boxplot图
boxplot(TP53_exprSet$mean~TP53_exprSet$Disease)
t.test(TP53_exprSet$mean~TP53_exprSet$Disease)
#ggpubr美化图形
library(ggpubr)
p <- ggboxplot(TP53_exprSet, x = "Disease", y = "mean",
               color = "Disease", palette =c("#00AFBB", "#E7B800"),
               add = "jitter", shape = "Disease")
my_comparisons <- list( c("progres.", "stable"))
p <- p + stat_compare_means(comparisons = my_comparisons)+stat_compare_means(label.y = 50)+ylim(2.4,4)
p

作业4 作业4 找到BRCA1基因在TCGA数据库的乳腺癌数据集(Breast Invasive Carcinoma (TCGA, PanCancer Atlas))的表达情况

作业5 找到TP53基因在TCGA数据库的乳腺癌数据集的表达量分组看其是否影响生存

作业6 下载数据集GSE17215的表达矩阵并且提取下面的基因画热图

exprSet <- read.table('GSE17215_series_matrix.txt',header = T,comment.char = '!',stringsAsFactors = F)

g6id <- c('ACTR3B','ANLN','BAG1','BCL2' ,'BIRC5' ,'BLVRA', 'CCNB1', 'CCNE1' ,'CDC20' ,'CDC6', 'CDCA1', 'CDH3', 'CENPF', 'CEP55', 'CXXC5', 'EGFR', 'ERBB2', 'ESR1', 'EXO1', 'FGFR4', 'FOXA1', 'FOXC1', 'GPR160', 'GRB7', 'KIF2C', 'KNTC2', 'KRT14', 'KRT17', 'KRT5', 'MAPT', 'MDM2', 'MELK', 'MIA', 'MKI67', 'MLPH', 'MMP11', 'MYBL2', 'MYC', 'NAT1','ORC6L' ,'PGR', 'PHGDH', 'PTTG1', 'RRM2', 'SFRP1', 'SLC39A6', 'TMEM45B', 'TYMS', 'UBE2C', 'UBE2T')
g6id2 <- data.frame(g6id)
colnames(g6id2) <- c('symbol')
GSE17215_annotation <- data.table::fread('GPL3921.annot',header = T)
g6s <- GSE17215_annotation[,c(1,3)]
colnames(g6s) <- c('ID','symbol')
g6id_merge <- merge(g6id2,g6s,by='symbol')
colnames(exprSet)[1] <- 'REF_ID'
colnames(g6id_merge)[2] <- 'REF_ID'
g6_expression <- merge(g6id_merge,exprSet,by='REF_ID')
g6_expression <- g6_expression[,-1]
#绘制热图
library(pheatmap)
rownames(g6_expression) <- g6_expression[,1]
table(g6_expression[,1])
length(unique(g6_expression[,1]))
nrow(g6_expression)
#发现探针ID对应了多个基因ID,因此要对基因ID进行删除重复
library(dplyr)
tmp <- g6_expression %>% select(symbol,1:(length(g6_expression))) %>% mutate(rowMean=rowMeans(.[grep('GSM',names(.))])) %>% arrange(desc(rowMean)) %>% distinct(symbol,.keep_all = T) %>% select(-rowMean)
g6_exp <- g6_expression %>% select(symbol,1:(length(g6_expression))) %>% mutate(rowMean=rowMeans(.[grep('GSM',names(.))])) %>% arrange(desc(rowMean)) %>% distinct(symbol,.keep_all = T) %>% select(-rowMean)
rownames(g6_exp) <- g6_exp[,1]
g6_exp <- g6_exp[,-1]
pheatmap(g6_exp,scale = 'row')

作业7 下载数据集GSE24673的表达矩阵计算样本的相关性并且绘制热图,需要标记上样本分组信息

exprSet <- read.table('GSE24673_series_matrix.txt',header = T,comment.char = '!',row.names = 1)
cor(exprSet)
library(pheatmap)
pheatmap(cor(exprSet))
colnames(exprSet)
col <- colnames(exprSet)
paste0('RB_139_09_TR',c(1,2,3))
trea <- paste0(RB_139_09_TR,c(1,2,3))
#添加注释信息
group_list=c(rep('RB_139_09_TR',3),rep('RB_535_09_TR',3),rep('RB_984_09_TR',3),rep('Human_Healthy_Retina',2))
group <- data.frame(col,group_list)
pheatmap(cor(exprSet),annotation_col = group,annotation_legend = T)
rownames(group) <- group[,1]
group <- group[,-1]
group <- as.character(group)
group <- as.data.frame(group)
rownames(group) <- colnames(exprSet)
pheatmap(cor(exprSet),annotation_col = group,annotation_legend = T)

作业8 找到 GPL6244 platform of Affymetrix Human Gene 1.0 ST Array 对应的R的bioconductor注释包,并且安装它

options()$repos
options()$BioC_mirror 
if(length(getOption('BioC_mirror'))==0) options(BioC_mirror='https://mirror.ustc.edu.cn/bioc/')
if(!require('hugene10sttranscriptcluster.db')) BiocManager::install('hugene10sttranscriptcluster.db',ask = F,update = F)

作业9 下载数据集GSE42872的表达矩阵,并且分别挑选出 所有样本的(平均表达量/sd/mad/)最大的探针,并且找到它们对应的基因。

rm(list = ls())
options(stringsAsFactors = F)
exprSet <- read.table('GSE42872_series_matrix.txt',header = T,comment.char = '!',row.names = 1)
tmp_mean <- apply(exprSet, 1, mean)
tmp_mean_1 <- sort(tmp_mean,decreasing = T)[1]
tmp_mean_1
tmp_sd <- apply(exprSet, 1, sd)
tmp_sd_1 <- sort(tmp_sd,decreasing = T)[1]
tmp_sd_1
tmp_mad <- apply(exprSet, 1, mad)
tmp_mad_1 <- sort(tmp_mad,decreasing = T)[1]
tmp_mad_1
#寻找到探针对应的基因SYMBOL
library(hugene10sttranscriptcluster.db)
g9s <- toTable(hugene10sttranscriptclusterSYMBOL)
which(g9s$probe_id=='7978905')
which(g9s$probe_id=='8133876')
g9s[16473,]

作业10 下载数据集GSE42872的表达矩阵,并且根据分组使用limma做差异分析,得到差异结果矩阵

rm(list=ls())
exprSet <- read.table('GSE42872_series_matrix.txt',comment.char = '!',header = T)
library(hugene10sttranscriptcluster.db)
g10s <- toTable(hugene10sttranscriptclusterSYMBOL)
colnames(exprSet)[1] <- 'probe_id'
exprSet_merge <- merge(exprSet,g10s,by='probe_id')
nrow(exprSet_merge)
length(unique(exprSet_merge$symbol))
table(exprSet_merge$symbol)
##探针转化以及去重,得到最终的表达矩阵
library(dplyr)
exprSet_expression <- exprSet_merge %>% select(-probe_id) %>%
  select(symbol,1:(length(exprSet)-1)) %>% mutate(rowMean=rowMeans(.[grep('GSM',names(.))]))%>% arrange(desc(rowMean)) %>% distinct(symbol,.keep_all = T) %>% select(-rowMean)
rownames(exprSet_expression) <- exprSet_expression[,1]
exprSet_expression <- exprSet_expression[,-1]
#加载limma包进行差异表达分析
library(limma)
group <- c (rep('con',3),rep('treatment',3))
design <- model.matrix(~factor(group))
colnames(design) <- c('con','treatment')
design
fit <- lmFit(exprSet_expression,design)
fit2 <- eBayes(fit)
allDiff=topTable(fit2,adjust='fdr',coef=2,number = Inf)
diffLab <- subset(allDiff,abs(logFC)>1 & adj.P.Val<0.05)
save(diffLab,group,allDiff,file = 'diff.Rdata')

你可能感兴趣的:(2019-06-29 R语言作业(中级))