想拿TCGA-HNSC的数据集做一个生存分析。TCGA-HNSC的肿瘤类型较多。
options(stringsAsFactors = F)
library(survival)
project <- 'HNSC'
remove(list = ls())
#====================================读取和临床资料整理=======================
#读入临床数据,将临床特征转置为列名
clinical <- read.table('HNSC.merged_only_clinical_clin_format.txt',
header=T, row.names=1, sep='\t', fill = T)
clinical <- as.data.frame(t(clinical))
#构造一个新函数,从总表中提取生存时间和事件以及病理分级等信息
clinSub <- function(clin){
clin$IDs <- toupper(clin$patient.bcr_patient_barcode)
rownames(clin) <- clin$IDs
#一下一段代码为了获取new_tumor相关时间
#获得列, new tumor event相关
ind_keep <- grep('days_to_new_tumor_event_after_initial_treatment',colnames(clin))
#这个是为了获取最大的那个时间值,对应的是无病生存期,总生存时间等,有点无聊
new_tumor = as.matrix(clinical[,ind_keep])
#返回值为NA值,或者new_tumor行的最小值。
min_days = function(min_day){
minimal_day = ifelse(sum(is.na(min_day)) < length(min_day),
min(min_day, na.rm = T), NA)
return(minimal_day)
}
new_tumor_collapsed = apply(new_tumor, 1, min_days)
#获取生存时间,操作同new_tumor
ind_keep = grep('days_to_death',colnames(clinical))
death = as.matrix(clinical[,ind_keep])
#构造一个获取生存时间函数,不同于new tumor取最小值,这里取最大值
max_days = function(max_day){
maximal_day = ifelse(sum(is.na(max_day)) < length(max_day),
max(max_day, na.rm = T), NA)
return(maximal_day)
}
death_collapsed = apply(death, 1, max_days)
#获取随访截止时间
ind_keep = grep('days_to_last_followup',colnames(clinical))
follow = as.matrix(clinical[,ind_keep])
follow_collapsed = apply(follow, 1, max_days)
#表格整合
all_clin <- data.frame(new_tumor_collapsed,death_collapsed,follow_collapsed)
colnames(all_clin) = c('new_tumor_days', 'death_days', 'follow_days')
rownames(all_clin) <- clin$IDs
#创建新的列,包含肿瘤事件的时间和结局,1表示发生,0表示截尾
all_clin$new_tumor_time = ifelse (!is.na(all_clin$new_tumor_days),
as.numeric(all_clin$new_tumor_days),
ifelse(!is.na(all_clin$follow_days),
as.numeric(all_clin$follow_days),
as.numeric(all_clin$death_days)))
all_clin$new_tumor_event = ifelse(is.na(all_clin$new_tumor_days), 0, 1)
#同样创建新的列来储存生存事件的时间和结局,1表示死亡,0表示截尾
all_clin$death_time = ifelse(is.na(all_clin$death_days),
as.numeric(all_clin$follow_days),
as.numeric(all_clin$death_days))
all_clin$death_event = ifelse(clin$patient.vital_status == 'alive', 0,1)
#把放化疗吸烟饮酒史数据去除,形成一个新的临床数据,
#再提取年龄性别解剖位置和临床病理资料
clin_sub = clin[, -grep('radiation|drug|smoke|alcohol', colnames(clin))]
clin_sub = clin_sub[, grep('//.age|subdivision|gender|pathologic|clinical',
colnames(clin_sub))]
#把一般临床资料和生存资料整合到一起
all_clin <- cbind(all_clin, clin_sub)
#输出结果
write.csv(all_clin, 'clinical_data_with_tnm.csv', row.names = T)
#设置函数返回值
return(all_clin)
}
#用构造的函数对临床资料进行处理
clin_fil <- clinSub(clinical)
#从HNSC的样本中提取tongue cancer
tongue_clin <- clin_fil[grepl('tongue', clin_fil$patient.anatomic_neoplasm_subdivision),
c(4:9, 11, 13:20)]
#下面的代码是把长的列名截短,其中一个用到高级函数lapply
colnames(tongue_clin)[5:7] <- c('age', 'subdivision', 'gender')
colsub <- function(colName){
colSub = unlist(strsplit(colName, split = '\\.'))
colSub = colSub[length(colSub)]
return(colSub)
}
colnames(tongue_clin)[8:15] <- lapply(colnames(tongue_clin[8:15]), colsub)
remove(clinSub, colsub)
#经过分析,有一例ID为TCGA-CQ-A4CA的病例生存时间和DFS时间均是NA,需要剔除,剩159例
tongue_clin <- tongue_clin[-is.na(tongue_clin$new_tumor_time),]
#============================读取基因表达数据=================================
RNAseq.Rsem <- paste(project,
paste('rnaseqv2','illuminahiseq_rnaseqv2','unc_edu','Level_3',
'RSEM_genes_normalized','data.data.txt', sep = '__'),
sep = '.')
#读入RNASeq RSEM数据,数据来自FireHose官网
rna <- read.table(RNAseq.Rsem,
header=T,row.names=1,sep='\t',
stringsAsFactors = F)
remove(RNAseq.Rsem)
#去除第一行
rna1 <- rna[-1,]
#构造一个能够把基因名提取出来,并剔除无基因名的函数,以形成新的列表
geneSub <- function(geneRna){
#构造一个把基因名和Entrez ID分开的函数,便于后面lapply使用。
SymbExtract <- function(x){
sym = strsplit(x, split = '\\|')
sym = sym[[1]][1]
return(sym)
}
rnaTarget = geneRna[unlist(lapply(rownames(geneRna), SymbExtract)) != '?', ]
#这一行代码是为了找出rnaTarget的基因ID有重复的基因名,准备删除
geneDel <- rownames(which(table(unlist(lapply(rownames(rnaTarget), SymbExtract))) > 1,
arr.ind = T))
rnaTarget <- rnaTarget[!(unlist(lapply(rownames(rnaTarget), SymbExtract)) %in% geneDel),]
#把基因名提取出来,配合使用lapply和前面构造的一个小函数,向量化运算
rownames(rnaTarget) <- unlist(lapply(rownames(rnaTarget), SymbExtract))
#把样本名截短, 只要前15个字符
colnames(rnaTarget) <- gsub('\\.', '-', substr(colnames(rnaTarget), 1, 15))
return(rnaTarget)
}
#利用构造的函数对rna进行处理
rna1 <- geneSub(rna1)
#只保留第14到15位字符是01和11的样本,他们分别代表原发肿瘤样本和正常样本
rna1 <- rna1[, substr(colnames(rna1), 14, 15) %in% c('01', '11')]
tongue_rna <- rna1[, substr(colnames(rna1), 1, 12) %in% rownames(tongue_clin)]
#获得肿瘤或者正常样本的索引
n_index <- which(substr(colnames(tongue_rna),14,15) == '11')
t_index <- which(substr(colnames(tongue_rna),14,15) == '01')
# 先删除在50%以上的样本中表达值为0的基因
rem <- function(x){
x = as.matrix(x)
x = t(apply(x,1,as.numeric))
r = as.numeric(apply(x,1,function(i) sum(i == 0)))
remove = which(r > dim(x)[2]*0.5)
return(remove)
}
remove <- rem(tongue_rna)
#对tongue cancer的表达矩阵进行基因筛除
tongue_rna <- tongue_rna[-remove,]
library(limma)
# 用voom函数来标准化数据,以便用limma包进行分析
vm <- function(x){
#对样本进行标记,列名第14,15位为01的是肿瘤样本,标记为1
cond = factor(ifelse(substr(colnames(x), 14, 15) == '01', 1, 0))
d = model.matrix(~1+cond)
x = t(apply(x,1,as.numeric))
ex = voom(x,d,plot = F)
return(ex$E)
}
tongue_rna_vm <- vm(tongue_rna)
#colnames(tongue_rna_vm) <- gsub('\\.','-',substr(colnames(tongue_rna),1,12))
dim(tongue_rna_vm)
# 查看数据如何分布,应该呈正态
hist(tongue_rna_vm)
#z = (value gene X in tumor Y)-(mean gene X in normal)]/(standard deviation X in normal)
# 计算z分数
scal <- function(x,y){
mean_n = rowMeans(y) # mean of normal
sd_n = apply(y,1,sd) # SD of normal
# z score as (value - mean normal)/SD normal
res = matrix(nrow=nrow(x), ncol=ncol(x))
colnames(res) = colnames(x)
rownames(res) = rownames(x)
for(i in 1:dim(x)[1]){
for(j in 1:dim(x)[2]){
res[i,j] <- (x[i,j]-mean_n[i])/sd_n[i]
}
}
return(res)
}
z_rna <- scal(tongue_rna_vm[,t_index],tongue_rna_vm[,n_index])
rm(tongue_rna_vm) #不再需要
未完待续