RNA-seq分析_DESeq2代码整理总结

#首先整理TCGA基因表达矩阵:肿瘤样本中有一个样本同时含有原位癌和转移灶的表达数据,需要删去其中的肿瘤转移表达数据

# Deliberately no rm(list = ls()) here: wiping the caller's global environment
# is a destructive side effect and still does not give a clean session
# (attached packages and options persist). Run this script in a fresh R
# session instead.
library(DESeq2) # DESeqDataSetFromMatrix(), estimateSizeFactors(), DESeq(), results()
library(limma)  # avereps()

#读入表达矩阵与样本信息,并查看表达矩阵与样本信息

# Load the raw count table and the clinical annotation table (both are
# tab-delimited text files with a header row).
# check.names = FALSE keeps the expression column names exactly as written in
# the file (sample barcodes such as "TCGA-JY-A6FA-01A" contain "-", which
# check.names = TRUE would rewrite).
# stringsAsFactors = FALSE makes the column types independent of the R version
# (the default changed in R 4.0), so text columns are always read as character.
genesymbol <- read.delim(
  "/Users/Desktop/ESCC_RNA_count_data/ESCC_count_cancer_expression.txt",
  header = TRUE,
  check.names = FALSE,
  stringsAsFactors = FALSE
)
condition <- read.delim(
  "/Users/Desktop/ESCC_RNA_count_data/clinical merge.txt",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Print the dimensions of both tables as a quick sanity check.
dim(genesymbol)
dim(condition)

# Collapse rows that share the same gene ID (column `id`, which is the first
# column of the table and is therefore dropped from the values) by averaging
# them with limma::avereps().
genesymbol <- avereps(genesymbol[, -1], ID = genesymbol$id)

# Print the number of expression samples and of clinical records.
ncol(genesymbol)
nrow(condition)

# From the clinical table keep only the sample ID and the alcohol-history
# columns.
condition <- condition[, c("submitter_id", "alcohol_history")]

# Make the expression sample names comparable with `submitter_id` by keeping
# only the first 12 characters of each barcode, e.g.
# "TCGA-JY-A6FA-01A" -> "TCGA-JY-A6FA". substr() is vectorised, so no loop is
# needed (and an empty matrix is handled correctly, unlike 1:ncol()).
colnames(genesymbol) <- substr(colnames(genesymbol), 1, 12)

# Truncation maps several samples of one patient (e.g. primary tumour and
# metastasis) onto the same ID. Such duplicates cannot be matched to a single
# clinical record, so stop with an explicit message instead of failing later.
if (anyDuplicated(colnames(genesymbol)) > 0) {
  duplicated_ids <- unique(colnames(genesymbol)[duplicated(colnames(genesymbol))])
  stop(
    "Duplicated sample IDs after truncating barcodes to 12 characters: ",
    paste(duplicated_ids, collapse = ", "),
    ". Remove the extra (e.g. metastatic) samples from the expression table.",
    call. = FALSE
  )
}

# Drop samples without usable alcohol history. NA must be tested explicitly:
# comparing NA with "not_reported" yields NA, and indexing a data frame with
# an NA logical produces an all-NA row instead of removing the sample.
has_alcohol <- !is.na(condition$alcohol_history) &
  condition$alcohol_history != "not_reported"
condition <- condition[has_alcohol, ]
dim(condition)

# Keep only samples that are present in BOTH tables, so that the sample table
# and the expression matrix describe exactly the same set of samples:
# first drop clinical records that have no expression data ...
condition <- condition[condition$submitter_id %in% colnames(genesymbol), ]

# ... then keep only the expression columns that have alcohol information.
# drop = FALSE keeps a matrix even if a single sample remains.
keep_samples <- colnames(genesymbol) %in% condition$submitter_id
genesymbol_alcohol <- genesymbol[, keep_samples, drop = FALSE]

# Index the sample table by sample ID and encode the grouping variable as a
# factor whose reference (baseline) level is "0".
# Author's note: relevel() cannot be applied to an already ordered factor.
rownames(condition) <- condition$submitter_id
condition$alcohol_history <- relevel(
  as.factor(condition$alcohol_history),
  ref = "0"
)

# Quality control: keep only genes whose mean count across the retained
# samples is greater than 1. drop = FALSE keeps a matrix even if only one
# gene passes the filter.
genesymbol_alcohol <- genesymbol_alcohol[
  rowMeans(genesymbol_alcohol) > 1, ,
  drop = FALSE
]

# DESeq2 needs the columns of the count matrix and the rows of the sample
# table in the same order. Sort both by sample ID (columns are ordered
# directly; no transpose round trip is required).
sample_order <- order(colnames(genesymbol_alcohol))
genesymbol_alcohol <- genesymbol_alcohol[, sample_order, drop = FALSE]
condition <- condition[order(condition$submitter_id), ]

# Sorting only aligns the two tables when they contain exactly the same
# sample IDs. DESeq2 checks the order only if the two ID sets are equal, so a
# mismatch of equal size would otherwise pair counts with the wrong group.
if (!identical(
  colnames(genesymbol_alcohol),
  as.character(condition$submitter_id)
)) {
  stop(
    "Sample IDs of the count matrix and of the sample table do not match; ",
    "both must contain exactly the same samples.",
    call. = FALSE
  )
}

# Preview the first rows of the aligned count matrix.
head(genesymbol_alcohol)

# Build the DESeq2 data set: counts per gene x sample, the sample table, and a
# design that models expression as a function of alcohol history.
# The counts are rounded because duplicated gene IDs were averaged with
# avereps() earlier in this script, which can produce fractional values, and
# DESeq2 rejects non-integer counts. round() is a no-op for integer counts.
dds <- DESeqDataSetFromMatrix(
  countData = round(genesymbol_alcohol),
  colData = condition,
  design = ~ alcohol_history
)

# Estimate the per-sample size factors used for normalisation. DESeq() reuses
# size factors that are already present, so this does not change the results;
# it is kept so that `dds` itself carries them.
dds <- estimateSizeFactors(dds)

# Differential expression: run the DESeq2 pipeline on the data set.
dds_DE <- DESeq(dds)

# Extract the results for alcohol_history level "1" versus level "0";
# alpha is the significance level passed to results().
res_DE <- results(
  dds_DE,
  contrast = c("alcohol_history", "1", "0"),
  alpha = 0.05
)

# Rank genes by ascending raw p-value.
DEG1 <- res_DE[order(res_DE$pvalue), ]

# Label genes as up-regulated, down-regulated or unchanged for several
# |log2 fold change| thresholds: 1, 2, and mean + 2 * sd of |log2FC|.

#' Classify genes by significance and fold change
#'
#' @param pvalue Numeric vector of p-values.
#' @param log2fc Numeric vector of log2 fold changes, same length as `pvalue`.
#' @param cutoff Non-negative threshold on the absolute log2 fold change.
#' @param alpha P-value threshold below which a gene counts as significant.
#' @return A factor with levels "down", "NOT", "up", the same length as
#'   `pvalue`. Genes with a missing p-value or fold change are labelled "NOT".
classify_change <- function(pvalue, log2fc, cutoff, alpha = 0.05) {
  # Test NA explicitly: otherwise a missing p-value gives "NOT" when the fold
  # change is small but NA when it is large, which is inconsistent.
  significant <- !is.na(pvalue) & !is.na(log2fc) &
    pvalue < alpha & abs(log2fc) > cutoff

  label <- rep("NOT", length(pvalue))
  # For a significant gene |log2fc| > cutoff >= 0, so the sign alone decides
  # the direction.
  label[significant & log2fc > 0] <- "up"
  label[significant & log2fc < 0] <- "down"

  # Fixed levels keep table() output stable even when a category is empty.
  factor(label, levels = c("down", "NOT", "up"))
}

# NOTE(review): significance is based on the raw p-value, not on the adjusted
# p-value (padj) reported by DESeq2 -- confirm that this is intended.

# |log2FC| > 1
logFC_cutoff_1 <- 1
DEG1$change_1 <- classify_change(
  DEG1$pvalue, DEG1$log2FoldChange, logFC_cutoff_1
)

# |log2FC| > 2
logFC_cutoff_2 <- 2
DEG1$change_2 <- classify_change(
  DEG1$pvalue, DEG1$log2FoldChange, logFC_cutoff_2
)

# |log2FC| > mean(|log2FC|) + 2 * sd(|log2FC|); na.rm = TRUE so that a single
# missing fold change does not turn the cutoff (and every label) into NA.
abs_log2fc <- abs(DEG1$log2FoldChange)
logFC_cutoff_sd <- mean(abs_log2fc, na.rm = TRUE) +
  2 * sd(abs_log2fc, na.rm = TRUE)
DEG1$change_sd <- classify_change(
  DEG1$pvalue, DEG1$log2FoldChange, logFC_cutoff_sd
)

# Count up-regulated, down-regulated and unchanged genes per threshold.
table(DEG1$change_1)
table(DEG1$change_2)
table(DEG1$change_sd)


DESeq2归一化算法详解
http://www.360doc.com/content/19/1224/14/68068867_881789451.shtml

你可能感兴趣的:(RNA-seq分析_DEseq2代码整理总结)