TCGA子集做一个生存分析

想拿TCGA-HNSC的数据集做一个生存分析。TCGA-HNSC的肿瘤类型较多。

options(stringsAsFactors = F)
library(survival)
project <- 'HNSC'
remove(list = ls())

#====================================读取和临床资料整理=======================
#读入临床数据,将临床特征转置为列名
clinical <- read.table('HNSC.merged_only_clinical_clin_format.txt',
                                     header=T, row.names=1, sep='\t', fill = T)
clinical <- as.data.frame(t(clinical))

#构造一个新函数,从总表中提取生存时间和事件以及病理分级等信息
clinSub <- function(clin){
  clin$IDs <- toupper(clin$patient.bcr_patient_barcode)
  rownames(clin) <- clin$IDs
  
  #一下一段代码为了获取new_tumor相关时间
  #获得列, new tumor event相关
  ind_keep <- grep('days_to_new_tumor_event_after_initial_treatment',colnames(clin))
  #这个是为了获取最大的那个时间值,对应的是无病生存期,总生存时间等,有点无聊
  new_tumor = as.matrix(clinical[,ind_keep])
  #返回值为NA值,或者new_tumor行的最小值。
  min_days = function(min_day){
    minimal_day = ifelse(sum(is.na(min_day)) < length(min_day), 
                         min(min_day, na.rm = T), NA)
    return(minimal_day)
  }
  new_tumor_collapsed = apply(new_tumor, 1, min_days)
  
  #获取生存时间,操作同new_tumor
  ind_keep = grep('days_to_death',colnames(clinical))
  death = as.matrix(clinical[,ind_keep])
  #构造一个获取生存时间函数,不同于new tumor取最小值,这里取最大值
  max_days = function(max_day){
    maximal_day = ifelse(sum(is.na(max_day)) < length(max_day), 
                         max(max_day, na.rm = T), NA)
    return(maximal_day)
  }
  death_collapsed = apply(death, 1, max_days)
  
  #获取随访截止时间
  ind_keep = grep('days_to_last_followup',colnames(clinical))
  follow = as.matrix(clinical[,ind_keep])
  follow_collapsed = apply(follow, 1, max_days)
  
  #表格整合
  all_clin <- data.frame(new_tumor_collapsed,death_collapsed,follow_collapsed)
  colnames(all_clin) = c('new_tumor_days', 'death_days', 'follow_days')
  rownames(all_clin) <- clin$IDs
  #创建新的列,包含肿瘤事件的时间和结局,1表示发生,0表示截尾
  all_clin$new_tumor_time = ifelse (!is.na(all_clin$new_tumor_days),
                                    as.numeric(all_clin$new_tumor_days),
                                    ifelse(!is.na(all_clin$follow_days),
                                           as.numeric(all_clin$follow_days),
                                           as.numeric(all_clin$death_days)))
  all_clin$new_tumor_event = ifelse(is.na(all_clin$new_tumor_days), 0, 1)
  #同样创建新的列来储存生存事件的时间和结局,1表示死亡,0表示截尾
  all_clin$death_time = ifelse(is.na(all_clin$death_days),
                               as.numeric(all_clin$follow_days),
                               as.numeric(all_clin$death_days))
  all_clin$death_event = ifelse(clin$patient.vital_status == 'alive', 0,1)
  
  #把放化疗吸烟饮酒史数据去除,形成一个新的临床数据,
  #再提取年龄性别解剖位置和临床病理资料
  clin_sub = clin[, -grep('radiation|drug|smoke|alcohol', colnames(clin))]
  clin_sub = clin_sub[, grep('//.age|subdivision|gender|pathologic|clinical', 
                              colnames(clin_sub))]
  
  #把一般临床资料和生存资料整合到一起
  all_clin <- cbind(all_clin, clin_sub)
  #输出结果
  write.csv(all_clin, 'clinical_data_with_tnm.csv', row.names = T)
  #设置函数返回值
  return(all_clin)
}

#用构造的函数对临床资料进行处理
clin_fil <- clinSub(clinical)
#从HNSC的样本中提取tongue cancer
tongue_clin <- clin_fil[grepl('tongue', clin_fil$patient.anatomic_neoplasm_subdivision),
                       c(4:9, 11, 13:20)]

#下面的代码是把长的列名截短,其中一个用到高级函数lapply
colnames(tongue_clin)[5:7] <- c('age', 'subdivision', 'gender')
colsub <- function(colName){
  colSub = unlist(strsplit(colName, split = '\\.'))
  colSub = colSub[length(colSub)]
  return(colSub)
}
colnames(tongue_clin)[8:15] <- lapply(colnames(tongue_clin[8:15]), colsub)

remove(clinSub, colsub)

#经过分析,有一例ID为TCGA-CQ-A4CA的病例生存时间和DFS时间均是NA,需要剔除,剩159例 
tongue_clin <- tongue_clin[-is.na(tongue_clin$new_tumor_time),]
#============================读取基因表达数据=================================
RNAseq.Rsem <- paste(project,
                     paste('rnaseqv2','illuminahiseq_rnaseqv2','unc_edu','Level_3',
                           'RSEM_genes_normalized','data.data.txt', sep = '__'),
                     sep = '.')

#读入RNASeq RSEM数据,数据来自FireHose官网
rna <- read.table(RNAseq.Rsem,
                  header=T,row.names=1,sep='\t',
                  stringsAsFactors = F)
remove(RNAseq.Rsem)
#去除第一行
rna1 <- rna[-1,]
#构造一个能够把基因名提取出来,并剔除无基因名的函数,以形成新的列表
geneSub <- function(geneRna){
  #构造一个把基因名和Entrez ID分开的函数,便于后面lapply使用。
  SymbExtract <- function(x){
    sym = strsplit(x, split = '\\|')
    sym = sym[[1]][1]
    return(sym)
  }
  rnaTarget = geneRna[unlist(lapply(rownames(geneRna), SymbExtract)) != '?', ]
  #这一行代码是为了找出rnaTarget的基因ID有重复的基因名,准备删除
  geneDel <- rownames(which(table(unlist(lapply(rownames(rnaTarget), SymbExtract))) > 1, 
                            arr.ind = T))
  rnaTarget <- rnaTarget[!(unlist(lapply(rownames(rnaTarget), SymbExtract)) %in% geneDel),]
  #把基因名提取出来,配合使用lapply和前面构造的一个小函数,向量化运算
  rownames(rnaTarget) <- unlist(lapply(rownames(rnaTarget), SymbExtract))
  #把样本名截短, 只要前15个字符
  colnames(rnaTarget) <- gsub('\\.', '-', substr(colnames(rnaTarget), 1, 15))
  return(rnaTarget)
  
}
#利用构造的函数对rna进行处理
rna1 <- geneSub(rna1)
#只保留第14到15位字符是01和11的样本,他们分别代表原发肿瘤样本和正常样本
rna1 <- rna1[, substr(colnames(rna1), 14, 15) %in% c('01', '11')]

tongue_rna <- rna1[, substr(colnames(rna1), 1, 12) %in% rownames(tongue_clin)]
#获得肿瘤或者正常样本的索引
n_index <- which(substr(colnames(tongue_rna),14,15) == '11')
t_index <- which(substr(colnames(tongue_rna),14,15) == '01')

# 先删除在50%以上的样本中表达值为0的基因
rem <- function(x){
  x = as.matrix(x)
  x = t(apply(x,1,as.numeric))
  r = as.numeric(apply(x,1,function(i) sum(i == 0)))
  remove = which(r > dim(x)[2]*0.5)
  return(remove)
}
remove <- rem(tongue_rna)
#对tongue cancer的表达矩阵进行基因筛除
tongue_rna <- tongue_rna[-remove,]


library(limma)
# 用voom函数来标准化数据,以便用limma包进行分析
vm <- function(x){
  #对样本进行标记,列名第14,15位为01的是肿瘤样本,标记为1
  cond = factor(ifelse(substr(colnames(x), 14, 15) == '01', 1, 0))
  d = model.matrix(~1+cond)
  x = t(apply(x,1,as.numeric))
  ex = voom(x,d,plot = F)
  return(ex$E)
}

tongue_rna_vm  <- vm(tongue_rna)
#colnames(tongue_rna_vm) <- gsub('\\.','-',substr(colnames(tongue_rna),1,12))
dim(tongue_rna_vm)

# 查看数据如何分布,应该呈正态
hist(tongue_rna_vm)


#z = (value gene X in tumor Y)-(mean gene X in normal)]/(standard deviation X in normal)


# 计算z分数
scal <- function(x,y){
  mean_n = rowMeans(y)  # mean of normal
  sd_n = apply(y,1,sd)  # SD of normal
  # z score as (value - mean normal)/SD normal
  res = matrix(nrow=nrow(x), ncol=ncol(x))
  colnames(res) = colnames(x)
  rownames(res) = rownames(x)
  for(i in 1:dim(x)[1]){
    for(j in 1:dim(x)[2]){
      res[i,j] <- (x[i,j]-mean_n[i])/sd_n[i]
    }
  }
  return(res)
}
z_rna <- scal(tongue_rna_vm[,t_index],tongue_rna_vm[,n_index])

rm(tongue_rna_vm) #不再需要

未完待续

你可能感兴趣的:(TCGA子集做一个生存分析)