第二课题原始参考代码

机器学习部分
#Description
#The core R codes for statistical analyses of manuscript gut jnl-2019-318860
#If you have any question of our source code, please contact me at [email protected]
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
#The tsne method for Dimension reduction and visualization 
library(Rtsne)
tsne <- Rtsne(matrix, dims=3, pca = F, initial_dims = 10, perplexity = 100, max_iter = 500)

#the lasso and random forest algorithm for exLR-seq marker selection 
library(glmnet)
lasso.results <- c()
for (j in 1:1000) {
  n1 <- n[sample(1:nrow(n),ceiling(dim(n)[1]*0.5),replace=F),]
  h1 <- h[sample(1:nrow(h),ceiling(dim(h)[1]*0.5),replace=F),]
  s <- rbind.data.frame(n1,h1)
  ts <- apply(s,2,as.numeric)
  y <- as.matrix(ts[,1])
  x <- as.matrix(ts[,2:dim(ts)[1]])
  cv.fit <- cv.glmnet(x,y,family="binomial", type.measure = "auc", nfolds=5)
  co<-coef.cv.glmnet(cv.fit,s="lambda.1se")
  name <- rownames(co)[co[,1]!=0]
  lasso.results <- c(lasso.results, name)
}
freq.lasso.results <- table(lasso.results)

library(varSelRF)
facy <- factor(training.data$status)
x <- training.data[,-1]
step=varSelRF(x, facy, c.sd = 1, mtryFactor = 1, ntree = 500,
              ntreeIterat = 500, vars.drop.num = NULL, vars.drop.frac = 0.1, #
              whole.range = TRUE, recompute.var.imp = FALSE, verbose = FALSE,
              returnFirstForest = TRUE, fitted.rf = NULL, keep.forest = FALSE)
select.history <- step$selec.history


#The training diganostic model construction and SVM algorithm operation
set.seed(1)
library(pROC)
library(caret)
library(e1071)
library(kernlab)
fitControl <- trainControl(method = "repeatedcv",
                           number = 5,
                           repeats = 10,
                           ## Estimate class probabilities
                           classProbs = TRUE,
                           ## Evaluate performance using 
                           ## the following function
                           summaryFunction = twoClassSummary)

svmFit <- train(status ~ ., data = training.data, 
                method = "svmRadial", 
                trControl = fitControl, 
                preProc = c("center", "scale"),
                tuneLength = 8,
                metric = "ROC")

#The establishment of d-signature from three cohort
train.pred <- predict(svmFit,training.data)
validation.pred <- predict(svmFit,validation.data)
testing.pred <- predict(svmFit,testing.data)

#The caculation of diagnostic efficacy of d-signature
train.con <- confusionMatrix(train.pred, training.data$status)
validation.con <- confusionMatrix(validation.pred, validation.data$status)
testing.con <- confusionMatrix(testing.pred, testing.data$status)
tr.acc <- train.con$overall[1]
tr.se <- train.con$byClass[1]
tr.sp <- train.con$byClass[2]
va.acc <- validation.con$overall[1]
va.se <- validation.con$byClass[1]
va.sp <- validation.con$byClass[2]
te.acc <- testing.con$overall[1]
te.se <- testing.con$byClass[1]
te.sp <- testing.con$byClass[2]

#The Normfinder method for RT-qPCR
source("r.NormOldStab5.txt")
Result=Normfinder("Datafile.txt",ctVal=FALSE)
selected.exLRs.normfinder.results <- Result$Ordered

性别和年龄调整部分

####### Differential gene expression analysis using a linear mixed-effects model

load("./datExpr.rda") # Load expression data (normalized for library size and technical variables, log2 transformed)
load("./datTraits.rda") # Load metadata

library(nlme)

# Fixed effects:
Diagnosis = as.factor(datTraits$Diagnosis)
Diagnosis = relevel(Diagnosis,ref = "CTL") # Using control as reference level
Age = as.numeric(datTraits$Age)
Sex = as.factor(datTraits$Sex)
Region = as.factor(datTraits$Region)

# Random effect:
BrainID = as.factor(datTraits$Brain_ID)

DE_Diagnosis = data.frame(miRID = NA,Beta = NA,StdErr = NA,Pval = NA)
DE_Age = data.frame(miRID = NA,Beta = NA,StdErr = NA,Pval = NA)
DE_Sex = data.frame(miRID = NA,Beta = NA,StdErr = NA,Pval = NA)
DE_Region = data.frame(miRID = NA,Beta = NA,StdErr = NA,Pval = NA)

# Define a function that runs the linear mixed-effects model and extract Beta values, standard errors, and P values for the fixed effects
runlme = function(miR) {
  lme = lme(miR~Diagnosis + Age + Sex + Region, rand = ~1|BrainID,na.action = na.exclude)
  slme = summary(lme);
  
  Beta_Diagnosis = slme$tTable[2,1]
  Beta_Age = slme$tTable[3,1]
  Beta_Sex = slme$tTable[4,1]
  Beta_Region = slme$tTable[5,1]
  
  StdErr_Diagnosis = slme$tTable[2,2]
  StdErr_Age = slme$tTable[3,2]
  StdErr_Sex = slme$tTable[4,2]
  StdErr_Region = slme$tTable[5,2]
  
  Pval_Diagnosis = slme$tTable[2,5]
  Pval_Age = slme$tTable[3,5]
  Pval_Sex = slme$tTable[4,5]
  Pval_Region = slme$tTable[5,5]
  
  LME_Diagnosis = list(Beta = Beta_Diagnosis,StdErr = StdErr_Diagnosis,Pval = Pval_Diagnosis)
  LME_Age = list(Beta = Beta_Age,StdErr = StdErr_Age,Pval = Pval_Age)
  LME_Sex = list(Beta = Beta_Sex,StdErr = StdErr_Sex,Pval = Pval_Sex)
  LME_Region = list(Beta = Beta_Region,StdErr = StdErr_Region,Pval = Pval_Region)
  
  return(list(LME_Diagnosis = LME_Diagnosis,LME_Age = LME_Age,LME_Sex = LME_Sex,LME_Region = LME_Region));
}

# Run the linear mixed-effects model for all miRNAs in the data set
for (i in 1:nrow(datExpr)){
  miR = as.numeric(datExpr[i,])
  result = try(runlme(miR),silent = F)
  if (length(result) == 4) {
    Beta_Diagnosis = result[[1]]$Beta
    StdErr_Diagnosis = result[[1]]$StdErr
    Pval_Diagnosis = result[[1]]$Pval
    
    Beta_Age = result[[2]]$Beta
    StdErr_Age = result[[2]]$StdErr
    Pval_Age = result[[2]]$Pval
    
    Beta_Sex = result[[3]]$Beta
    StdErr_Sex = result[[3]]$StdErr
    Pval_Sex = result[[3]]$Pval
  
    Beta_Region = result[[4]]$Beta
    StdErr_Region = result[[4]]$StdErr
    Pval_Region = result[[4]]$Pval
  } else {
    cat('Error in LME for microRNA',rownames(datExpr)[i],'\n')
    cat(' Setting Beta value=0, SE=Inf, and P-value=1\n')
    Beta_Diagnosis = Beta_Age = Beta_Sex = Beta_Region = 0
    StdErr_Diagnosis = StdErr_Age = StdErr_Sex = StdErr_Region = Inf
    Pval_Diagnosis = Pval_Age = Pval_Sex = Pval_Region = 1
  }
  DE_Diagnosis[i,"miRID"] = rownames(datExpr)[i]; DE_Diagnosis[i,"Beta"] = Beta_Diagnosis; DE_Diagnosis[i,"StdErr"] = StdErr_Diagnosis; DE_Diagnosis[i,"Pval"] = Pval_Diagnosis
  DE_Age[i,"miRID"] = rownames(datExpr)[i]; DE_Age[i,"Beta"] = Beta_Age; DE_Age[i,"StdErr"] = StdErr_Age; DE_Age[i,"Pval"] = Pval_Age
  DE_Sex[i,"miRID"] = rownames(datExpr)[i]; DE_Sex[i,"Beta"] = Beta_Sex; DE_Sex[i,"StdErr"] = StdErr_Sex; DE_Sex[i,"Pval"] = Pval_Sex
  DE_Region[i,"miRID"] = rownames(datExpr)[i]; DE_Region[i,"Beta"] = Beta_Region; DE_Region[i,"StdErr"] = StdErr_Region; DE_Region[i,"Pval"] = Pval_Region
}

# Perform FDR adjustment using Benjamini-Hochberg correction
DE_Diagnosis$adjPval = p.adjust(DE_Diagnosis$Pval,method = "BH",n = nrow(DE_Diagnosis))
DE_Age$adjPval = p.adjust(DE_Age$Pval,method = "BH",n = nrow(DE_Age))
DE_Sex$adjPval = p.adjust(DE_Sex$Pval,method = "BH",n = nrow(DE_Sex))
DE_Region$adjPval = p.adjust(DE_Region$Pval,method = "BH",n = nrow(DE_Region))

write.csv(DE_Diagnosis,file = "./DGE_Diagnosis.csv")
write.csv(DE_Age,file = "./DGE_Age.csv")
write.csv(DE_Sex,file = "./DGE_Sex.csv")
write.csv(DE_Region,file = "./DGE_BrainRegion.csv")

WGCNA部分(未使用)

####### WGCNA analysis using bootstrapping

library(WGCNA)
library(flashClust)
options(stringsAsFactors  =  FALSE)

dir.create("./Network_resampled")
dir.create("./TOM_resampled")


### 1. Construct network using the original sample set
load("./datExpr.rda") # Load expression data (normalized for library size and technical variables, log2 transformed)
load("./datTraits.rda") # Load metadata

# Pick soft threshold
powers  =  c(c(1:10), seq(from = 12, to = 20, by = 2))
sft = pickSoftThreshold(datExpr,networkType = "signed",corFnc = "bicor",verbose = 5,powerVector = powers)

softPower = 8 # Choose based on fit to scale-free topology
adjacency = adjacency(datExpr, corFnc = "bicor", type = "signed", power = softPower)
TOM = TOMsimilarity(adjacency,TOMType = "signed", verbose = 0)
dissTOM = 1-TOM
geneTree = flashClust(as.dist(dissTOM), method = "average")

minModuleSize = 10
ds = 3
cutHeight = 0.99999
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM, method = "hybrid",
                            deepSplit = ds, pamRespectsDendro = T,pamStage = T,
                            minClusterSize = minModuleSize, cutHeight = cutHeight)
dynamicColors = labels2colors(dynamicMods)

# Calculate eigengenes
MEList = moduleEigengenes(datExpr, colors = dynamicColors,softPower = softPower)
MEs = MEList$eigengenes
# Merge modules whose module eigengenes are highly correlated
MEDissThres = 0.15
merge = mergeCloseModules(datExpr, dynamicColors, cutHeight = MEDissThres, verbose = 3)
moduleColors = merge$colors
MEs = merge$newMEs

save(MEs, TOM, moduleColors, geneTree, file = "./Network.RData")

### 2. Perform bootstrapping 
datTraits_ASD = datTraits[datTraits$Diagnosis == "ASD",]
datTraits_Ctrl = datTraits[datTraits$Diagnosis == "CTL",]
datExpr_ASD = datExpr[match(rownames(datTraits_ASD),rownames(datExpr)),]
datExpr_Ctrl = datExpr[match(rownames(datTraits_Ctrl),rownames(datExpr)),]

softPower = 8 
ds = 3
cutHeight = 0.99999
minModuleSize = 10

nSets = 200

for (i in 1:nSets){
  if (i%%10 == 0) { print(paste(i,"th resampling",sep = "")) }
  # Resample within ASD and CTL groups
  ASD_resample = sample(1:nrow(datExpr_ASD),size = nrow(datExpr_ASD),replace = T) 
  datExpr_ASD_resample = datExpr_ASD[ASD_resample,]
  Ctrl_resample = sample(1:nrow(datExpr_Ctrl),size = nrow(datExpr_Ctrl),replace = T)
  datExpr_Ctrl_resample = datExpr_Ctrl[Ctrl_resample,]
  datExpr_resample = rbind(datExpr_ASD_resample,datExpr_Ctrl_resample)
  
  adjacency = adjacency(datExpr_resample, corFnc = "bicor", type = "signed", power = softPower)
  TOM = TOMsimilarity(adjacency,TOMType = "signed")
  dissTOM = 1-TOM
  geneTree = flashClust(as.dist(dissTOM), method = "average")
  dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM, method="hybrid",
                              deepSplit = ds, pamRespectsDendro = T, pamStage=T,
                              minClusterSize = minModuleSize, cutHeight = cutHeight, verbose=0)
  dynamicColors = labels2colors(dynamicMods)
  
  MEList = moduleEigengenes(datExpr_resample, colors = dynamicColors,softPower = softPower)
  MEs = MEList$eigengenes
  
  # Merge modules whose module eigengenes are highly correlated
  MEDissThres = 0.15
  merge = mergeCloseModules(datExpr_resample, dynamicColors, cutHeight = MEDissThres, verbose = 0)
  moduleColors = merge$colors
  MEs = merge$newMEs
  
  save(MEs, TOM, moduleColors, geneTree, file = paste("./Network_resampled/Network_resampled",i,".RData",sep = ""))
}

### 3. Calculate consensus TO matrix 
nSets = 200
prob = 0.95 
nSubset = 20000

load("./Network.RData")
TOM.mat = as.matrix(TOM)
nGenes = nrow(TOM.mat)

subsetEntries = sample(nGenes*(nGenes-1)/2, size = nSubset)
TOMsubset = list()

TOMsubset.main = vectorizeMatrix(TOM.mat)[subsetEntries] 
quantile.TOM.main = quantile(TOMsubset.main,probs = prob,type = 8) 

quantile.TOM = rep(1, nSets) ## vector of quantiles of the individual TO matrices
beta.prob = rep(1, nSets) ## Scaling powers to calibrate TOM values                                        

for (set in 1:nSets) { 
  if (!file.exists(paste("./TOM_resampled/TOM_resampled",set,".RData",sep = ""))) {
    print(paste("On resampled TOM ",set,"...",sep = ""))
    load(paste("./Network_resampled/Network_resampled",set,".RData",sep = ""))
    tmpTOM = as.matrix(TOM)
    TOMsubset[[set]] = vectorizeMatrix(tmpTOM)[subsetEntries]  
    quantile.TOM[set] = quantile(TOMsubset[[set]],probs = prob,type = 8) 
    beta.prob[set] = log(quantile.TOM.main)/log(quantile.TOM[set]) ## calculate the power of the adjacency function
    print(paste("Scaling power for this TOM is ",signif(beta.prob[set],3),sep = ""))
    TOM = tmpTOM^(beta.prob[set]) ## use the power adjacency function for calibrating the TOM
    save(TOM,file = paste("./TOM_resampled/TOM_resampled",set,".RData",sep = ""))
  } else {
    print(paste("./Network_resampled/Network_resampled",set,".RData",sep = ""))
    print("Already processed!")
  }
}

rm(TOM.mat,tmpTOM)

consTOM = vector("list",nSets)
for (set in 1:nSets) {
  print(paste("loading TOM",set,sep = " "))
  load(paste("./TOM_resampled/TOM_resampled",set,".RData",sep = ""))
  consTOM[[set]]$TOM = TOM
}
print("Calculting the median quantile across the TOMs")
consensusTOM = matrix(NA,nrow = nrow(TOM),ncol = ncol(TOM))
for (i in 1:nrow(TOM) ) {
  if (i%%100 == 0) { print(paste("On gene",i,sep = " ")) }          
  for (j in 1:ncol(TOM)) {
    thisvec = vector(mode = "numeric",length = nSets)
    for (k in 1:nSets){
      thisvec[k]=consTOM[[k]]$TOM[i,j]
    }
    consensusTOM[i,j] = median(thisvec)
  }
}
save(consensusTOM,file = "./TOM_resampled/ConsensusTOM.RData")

# Define modules based on consensus TO matrix
cons_dissTOM = 1-consensusTOM
cons_geneTree = flashClust(as.dist(cons_dissTOM), method = "average")

softPower = 8
ds = 3
cutHeight = 0.99999
minModuleSize = 10
cons_dynamicMods = cutreeDynamic(dendro = cons_geneTree, distM = cons_dissTOM, method = "hybrid",
                                 deepSplit = ds, pamRespectsDendro = T, pamStage = T,
                                 minClusterSize = minModuleSize, cutHeight = cutHeight, verbose = 0)
cons_dynamicColors = labels2colors(cons_dynamicMods)

MEDissThres = 0.15
cons_merge = mergeCloseModules(datExpr, cons_dynamicColors, cutHeight = MEDissThres, verbose = 0)
cons_moduleColors = cons_merge$colors
cons_moduleColors = matchLabels(cons_moduleColors, moduleColors, pThreshold = 5e-2) # Match consensus module colors with original module colors

cons_MEList = moduleEigengenes(datExpr, colors = cons_moduleColors, softPower = softPower)
cons_MEs = cons_MEList$eigengenes

save(cons_MEs, consensusTOM, cons_moduleColors, cons_geneTree, file = "ConsensusNetwork.RData")

# Calculate module membership
modNames=substring(names(cons_MEs),3)
geneModuleBicor=bicorAndPvalue(datExpr, cons_MEs, use = "p")
geneModuleMembership = as.data.frame(geneModuleBicor$bicor)
MMPvalue = as.data.frame(geneModuleBicor$p)
names(geneModuleMembership) = paste("MM", modNames, sep = "")
names(MMPvalue) = paste("p.MM", modNames, sep = "")

geneInfo = data.frame(miRID = colnames(datExpr), moduleColor = cons_moduleColors, geneModuleMembership, MMPvalue)
write.csv(geneInfo, file = "./ConsensusNetwork_ModuleMembership.csv")

参考文献:

  • Plasma extracellular vesicle long RNA profiling identifies a diagnostic signature for the detection of pancreatic ductal adenocarcinoma.
  • Extracellular Vesicles Long RNA Sequencing Reveals Abundant mRNA, circRNA, and lncRNA in Human Blood as Potential Biomarkers for Cancer Diagnosis.
  • Genome-wide, integrative analysis implicates microRNA dysregulation in autism spectrum disorder

你可能感兴趣的:(第二课题原始参考代码)