一些变量筛选方法——6、代码

之前有小伙伴说希望公开之前变量筛选文章(一些变量筛选方法——5、真实数据与总结)的代码,这里时隔好多个月,将之前的代码整理出来了。

当时由于时间有限,只有几天的时间将论文和代码赶出来,所以写的不是很好,全程for循环,还请见谅!


Code

Simulation

# --------------------- simulation  --------------------- #
library(glmnet)
library(ISLR)
library(leaps)
library(BeSS)
library(energy)
library(ncvreg)

gen_y <- function(x1, x2, x3, x4) {
  c_vec <- c(2, 0.5, 3, 2)
  a <- 4 * log(n) / sqrt(n)
  U <- rbinom(n = 4, size = 1, prob = 0.4)
  Z <- rnorm(4)
  beta <- (-1) ^ U * (a + abs(Z))
  b <- runif(3, -1, 1)  
  eps <- rnorm(n) * 0.01
  y1 <- c_vec[1] * beta[1] * x1 + c_vec[2] * beta[2] * x2 + 
    c_vec[3] * beta[3] * ifelse(x3 < 0, 1, 0) + 
    c_vec[4] * beta[4] * x4 + eps
  y2 <- c_vec[1] * beta[1] * x1 * x2 + 
    c_vec[2] * beta[2] * ifelse(x3 < 0, 1, 0) + 
    c_vec[4] * beta[4] * x4 + eps
  y3 <- c_vec[1] * beta[1] * x1 + c_vec[2] * beta[2] * x2 + 
    c_vec[3] * beta[3] * ifelse(x3 < 0, 1, 0) + 
    exp(c_vec[4] * abs(x4)) + eps
  y4 <- b[1] * abs(x1) + b[2] * x2 / (2 - x2) + b[3] * exp(x3) / 3 + eps
  y5 <- b[1] * sin(x1) + b[2] * tanh(x2) + b[3] * x1 / x2 + eps
  list(y1, y2, y3, y4, y5)
}

gen_x <- function(p) {
  X <- matrix(rnorm(n * p), nrow = n, ncol = p)
  X
}

# k = 1, 2, 3, 4, 5, 为所选择的公式
gen_dat <- function(p, k) {
  X <- gen_x(p)
  dat <- cbind(X, gen_y(X[, 1], X[, 2], X[, 3], X[, 4])[[k]])
  colnames(dat) <- c(paste0('x', 1:p), 'y')
  dat
}

# ------------------ 1

# 后面将选用最优子集选择,向前逐步回归,向后逐步回归,LASSO,SCAD,PDAS
n <- 200
# p_vec <- c(20, 30, 50, 100, 200, 500)
p_vec <- c(10, 20, 30)
p_ture_vec <- c(4, 4, 4, 3, 2)

m.t.leaps = sd.t.leaps = m.TP.leaps = sd.TP.leaps = m.FP.leaps = sd.FP.leaps = matrix(nrow = 5, ncol = length(p_vec))
m.t.forward = sd.t.forward = m.TP.forward = sd.TP.forward = m.FP.forward = sd.FP.forward = matrix(nrow = 5, ncol = length(p_vec))
m.t.backward = sd.t.backward = m.TP.backward = sd.TP.backward = m.FP.backward = sd.FP.backward = matrix(nrow = 5, ncol = length(p_vec))
m.t.glmnet = sd.t.glmnet = m.TP.glmnet = sd.TP.glmnet = m.FP.glmnet = sd.FP.glmnet = matrix(nrow = 5, ncol = length(p_vec))
m.t.ncvreg = sd.t.ncvreg = m.TP.ncvreg = sd.TP.ncvreg = m.FP.ncvreg = sd.FP.ncvreg = matrix(nrow = 5, ncol = length(p_vec))
m.t.spdas = sd.t.spdas = m.TP.spdas = sd.TP.spdas = m.FP.spdas = sd.FP.spdas = matrix(nrow = 5, ncol = length(p_vec))

for(m in 1:length(p_vec)) {
  p <- p_vec[m]
  for(k in 1:5) {
    t.leaps = TP.leaps = FP.leaps = vector()
    t.forward = TP.forward = FP.forward = vector()
    t.backward = TP.backward = FP.backward = vector()
    t.glmnet = TP.glmnet = FP.glmnet = vector()
    t.ncvreg = TP.ncvreg = FP.ncvreg = vector()
    t.spdas = TP.spdas = FP.spdas = vector()
    
    for(i in 1:100) {
      set.seed(2018 + i)
      dat_temp <- gen_dat(p, k)
      dat <- list()
      dat$x <- dat_temp[, 1:p]
      dat$y <- dat_temp[, p + 1]
      
      p_ture <- p_ture_vec[k]
      # Best subset selection with "leaps" package
      t.leaps[i] <- system.time(fit.leaps <- regsubsets(dat$x ,dat$y,  really.big = TRUE, nvmax = p, all.best = TRUE))[3]
      
      mark <- which.min(log(fit.leaps$bound[-1] / n) * n + 2 * (1:p)) # location of minimum of AIC
      beta.leaps <- rep(0, p)
      names(beta.leaps) <- colnames(dat$x)
      beta.leaps[names(coef(fit.leaps, mark))[-1]] <- coef(fit.leaps, mark)[-1]
      
      df.leaps = sum(beta.leaps != 0)
      TP.leaps[i] = length(intersect(which(beta.leaps != 0), 1:p_ture))
      FP.leaps[i] = df.leaps - TP.leaps[i]
      
      # Forward stepwise with "leaps" package
      t.forward[i] <- system.time(fit.forward <- regsubsets(dat$x ,dat$y,  really.big = TRUE, nvmax = p, all.best = TRUE, method = 'forward'))[3]
      
      mark <- which.min(log(fit.forward$bound[-1] / n) * n + 2 * (1:p)) # location of minimum of AIC
      beta.forward <- rep(0, p)
      names(beta.forward) <- colnames(dat$x)
      beta.forward[names(coef(fit.forward, mark))[-1]] <- coef(fit.forward, mark)[-1]
      
      df.forward = sum(beta.forward != 0)
      TP.forward[i] = length(intersect(which(beta.forward != 0), 1:p_ture))
      FP.forward[i] = df.forward - TP.forward[i]
      
      
      # Backward stepwise with "leaps" package
      t.backward[i] <- system.time(fit.backward <- regsubsets(dat$x ,dat$y,  really.big = TRUE, nvmax = p, all.best = TRUE, method = 'backward'))[3]
      
      mark <- which.min(log(fit.backward$bound[-1] / n) * n + 2 * (1:p)) # location of minimum of AIC
      beta.backward <- rep(0, p)
      names(beta.backward) <- colnames(dat$x)
      beta.backward[names(coef(fit.backward, mark))[-1]] <- coef(fit.backward, mark)[-1]
      
      df.backward = sum(beta.backward != 0)
      TP.backward[i] = length(intersect(which(beta.backward != 0), 1:p_ture))
      FP.backward[i] = df.backward - TP.backward[i]
      
      
      
      
      # fit a glmnet with method CV
      t.glmnet[i] <- system.time(fit.cv.glmnet <- cv.glmnet(dat$x, dat$y))[3]
      fit.glmnet <- glmnet(dat$x, dat$y, lambda = fit.cv.glmnet$lambda.1se)
      beta.glmnet <- fit.glmnet$beta
      beta0.glmnet <- fit.glmnet$a0
      df.glmnet <- sum(beta.glmnet != 0)
      TP.glmnet[i] <- length(intersect(which(beta.glmnet != 0), 1:p_ture))
      FP.glmnet[i] <- df.glmnet - TP.glmnet[i]
      
      
      # SCAD with "ncvreg" package
      t.ncvreg[i] <- system.time(fit.cv.ncvreg <- cv.ncvreg(dat$x, dat$y, penalty = 'SCAD'))[3]
      fit.ncvreg <- ncvreg(dat$x, dat$y, lambda = fit.cv.ncvreg$lambda.min, penalty = 'SCAD')
      beta.ncvreg <- fit.ncvreg$beta
      beta0.ncvreg <- fit.ncvreg$a0
      df.ncvreg <- sum(beta.ncvreg != 0)
      TP.ncvreg[i] <- length(intersect(which(beta.ncvreg != 0), 1:p_ture))
      FP.ncvreg[i] <- df.ncvreg - TP.ncvreg[i]
      
      
      # fit a BeSS model with method = "sequential"
      t.spdas[i] <- system.time(fit.spdas <- bess(dat$x, dat$y,
                                               method = "sequential", epsilon = 0))[3]
      
      beta.spdas <- coef(fit.spdas, sparse = F, type = "AIC")
      df.spdas <- sum(beta.spdas[-1] != 0)
      TP.spdas[i] <- length(intersect(which(beta.spdas[-1] != 0), 1:p_ture))
      FP.spdas[i] <- df.spdas - TP.spdas[i]
      
      print(paste(i, k ,m))
    }
    m.t.leaps[k, m] <- mean(t.leaps)
    sd.t.leaps[k, m] <- sd(t.leaps)
    m.TP.leaps[k, m] <- mean(TP.leaps)
    sd.TP.leaps[k, m] <- sd(TP.leaps)
    m.FP.leaps[k, m] <- mean(FP.leaps)
    sd.FP.leaps[k, m] <- sd(FP.leaps)
    
    m.t.forward[k, m] <- mean(t.forward)
    sd.t.forward[k, m] <- sd(t.forward)
    m.TP.forward[k, m] <- mean(TP.forward)
    sd.TP.forward[k, m] <- sd(TP.forward)
    m.FP.forward[k, m] <- mean(FP.forward)
    sd.FP.forward[k, m] <- sd(FP.forward)
    
    m.t.backward[k, m] <- mean(t.backward)
    sd.t.backward[k, m] <- sd(t.backward)
    m.TP.backward[k, m] <- mean(TP.backward)
    sd.TP.backward[k, m] <- sd(TP.backward)
    m.FP.backward[k, m] <- mean(FP.backward)
    sd.FP.backward[k, m] <- sd(FP.backward)
    
    m.t.glmnet[k, m] <- mean(t.glmnet)
    sd.t.glmnet[k, m] <- sd(t.glmnet)
    m.TP.glmnet[k, m] <- mean(TP.glmnet)
    sd.TP.glmnet[k, m] <- sd(TP.glmnet)
    m.FP.glmnet[k, m] <- mean(FP.glmnet)
    sd.FP.glmnet[k, m] <- sd(FP.glmnet)
    
    m.t.ncvreg[k, m] <- mean(t.ncvreg)
    sd.t.ncvreg[k, m] <- sd(t.ncvreg)
    m.TP.ncvreg[k, m] <- mean(TP.ncvreg)
    sd.TP.ncvreg[k, m] <- sd(TP.ncvreg)
    m.FP.ncvreg[k, m] <- mean(FP.ncvreg)
    sd.FP.ncvreg[k, m] <- sd(FP.ncvreg)
    
    
    m.t.spdas[k, m] <- mean(t.spdas)
    sd.t.spdas[k, m] <- sd(t.spdas)
    m.TP.spdas[k, m] <- mean(TP.spdas)
    sd.TP.spdas[k, m] <- sd(TP.spdas)
    m.FP.spdas[k, m] <- mean(FP.spdas)
    sd.FP.spdas[k, m] <- sd(FP.spdas)
  }
}

# save(m.t.leaps, sd.t.leaps, m.TP.leaps, sd.TP.leaps, m.FP.leaps, sd.FP.leaps,
#      m.t.forward, sd.t.forward, m.TP.forward, sd.TP.forward, m.FP.forward, sd.FP.forward,
#      m.t.backward, sd.t.backward, m.TP.backward, sd.TP.backward, m.FP.backward, sd.FP.backward,
#      m.t.glmnet, sd.t.glmnet, m.TP.glmnet, sd.TP.glmnet, m.FP.glmnet, sd.FP.glmnet,
#      m.t.ncvreg, sd.t.ncvreg, m.TP.ncvreg, sd.TP.ncvreg, m.FP.ncvreg, sd.FP.ncvreg,
#      m.t.spdas, sd.t.spdas, m.TP.spdas, sd.TP.spdas, m.FP.spdas, sd.FP.spdas,
#      file = 'D:\\Kanny\\simulation1.rda')

t.leaps = TP.leaps = FP.leaps = matrix(nrow = 5, ncol = 3)
t.forward = TP.forward = FP.forward = matrix(nrow = 5, ncol = 3)
t.backward = TP.backward = FP.backward = matrix(nrow = 5, ncol = 3)
t.glmnet = TP.glmnet = FP.glmnet = matrix(nrow = 5, ncol = 3)
t.ncvreg = TP.ncvreg = FP.ncvreg = matrix(nrow = 5, ncol = 3)
t.spdas = TP.spdas = FP.spdas = matrix(nrow = 5, ncol = 3)

options(scipen = 200)
for(i in 1:5) {
  for(j in 1:3) {
    t.leaps[i, j] <- paste(round(m.t.leaps[i, j], 4))
    t.forward[i, j] <- paste(round(m.t.forward[i, j], 4))
    t.backward[i, j] <- paste(round(m.t.backward[i, j], 4))
    t.glmnet[i, j] <- paste(round(m.t.glmnet[i, j], 4))
    t.ncvreg[i, j] <- paste(round(m.t.ncvreg[i, j], 4))
    t.spdas[i, j] <- paste(round(m.t.spdas[i, j], 4))
    
    TP.leaps[i, j] <- paste(round(m.TP.leaps[i, j], 2), '±', round(sd.TP.leaps[i, j], 2))
    TP.forward[i, j] <- paste(round(m.TP.forward[i, j], 2), '±', round(sd.TP.forward[i, j], 2))
    TP.backward[i, j] <- paste(round(m.TP.backward[i, j], 2), '±', round(sd.TP.backward[i, j], 2))
    TP.glmnet[i, j] <- paste(round(m.TP.glmnet[i, j], 2), '±', round(sd.TP.glmnet[i, j], 2))
    TP.ncvreg[i, j] <- paste(round(m.TP.ncvreg[i, j], 2), '±', round(sd.TP.ncvreg[i, j], 2))
    TP.spdas[i, j] <- paste(round(m.TP.spdas[i, j], 2), '±', round(sd.TP.spdas[i, j], 2))
    
    FP.leaps[i, j] <- paste(round(m.FP.leaps[i, j], 2), '±', round(sd.FP.leaps[i, j], 2))
    FP.forward[i, j] <- paste(round(m.FP.forward[i, j], 2), '±', round(sd.FP.forward[i, j], 2))
    FP.backward[i, j] <- paste(round(m.FP.backward[i, j], 2), '±', round(sd.FP.backward[i, j], 2))
    FP.glmnet[i, j] <- paste(round(m.FP.glmnet[i, j], 2), '±', round(sd.FP.glmnet[i, j], 2))
    FP.ncvreg[i, j] <- paste(round(m.FP.ncvreg[i, j], 2), '±', round(sd.FP.ncvreg[i, j], 2))
    FP.spdas[i, j] <- paste(round(m.FP.spdas[i, j], 2), '±', round(sd.FP.spdas[i, j], 2))
  }
}

library(xtable)
#### Time
# p = 10
l <- 1
table_out <- cbind(t.leaps[, l], t.forward[, l], t.backward[, l], t.glmnet[, l], t.ncvreg[, l], t.spdas[, l])
table_out <- t(table_out)
colnames(table_out) <- paste0('公式', 1:5)
row.names(table_out) <- c('Best SS', 'Forward', 'Backward', 'LASSO', 'SCAD', 'PDAS')
xtable(table_out, caption = '方法耗时,p = 10')

# p = 20
l <- 2
table_out <- cbind(t.leaps[, l], t.forward[, l], t.backward[, l], t.glmnet[, l], t.ncvreg[, l], t.spdas[, l])
table_out <- t(table_out)
colnames(table_out) <- paste0('公式', 1:5)
row.names(table_out) <- c('Best SS', 'Forward', 'Backward', 'LASSO', 'SCAD', 'PDAS')
xtable(table_out, caption = '方法耗时,p = 20')

# p = 30
l <- 3
table_out <- cbind(t.leaps[, l], t.forward[, l], t.backward[, l], t.glmnet[, l], t.ncvreg[, l], t.spdas[, l])
table_out <- t(table_out)
colnames(table_out) <- paste0('公式', 1:5)
row.names(table_out) <- c('Best SS', 'Forward', 'Backward', 'LASSO', 'SCAD', 'PDAS')
xtable(table_out, caption = '方法耗时,p = 30')



#### True Positive
# p = 10
l <- 1
table_out <- cbind(TP.leaps[, l], TP.forward[, l], TP.backward[, l], TP.glmnet[, l], TP.ncvreg[, l], TP.spdas[, l])
table_out <- t(table_out)
colnames(table_out) <- paste0('公式', 1:5)
row.names(table_out) <- c('Best SS', 'Forward', 'Backward', 'LASSO', 'SCAD', 'PDAS')
xtable(table_out, caption = 'True Positive,p = 10')

# p = 20
l <- 2
table_out <- cbind(TP.leaps[, l], TP.forward[, l], TP.backward[, l], TP.glmnet[, l], TP.ncvreg[, l], TP.spdas[, l])
table_out <- t(table_out)
colnames(table_out) <- paste0('公式', 1:5)
row.names(table_out) <- c('Best SS', 'Forward', 'Backward', 'LASSO', 'SCAD', 'PDAS')
xtable(table_out, caption = 'True Positive,p = 20')

# p = 30
l <- 3
table_out <- cbind(TP.leaps[, l], TP.forward[, l], TP.backward[, l], TP.glmnet[, l], TP.ncvreg[, l], TP.spdas[, l])
table_out <- t(table_out)
colnames(table_out) <- paste0('公式', 1:5)
row.names(table_out) <- c('Best SS', 'Forward', 'Backward', 'LASSO', 'SCAD', 'PDAS')
xtable(table_out, caption = 'True Positive,p = 30')



#### False Positive
# p = 10
l <- 1
table_out <- cbind(FP.leaps[, l], FP.forward[, l], FP.backward[, l], FP.glmnet[, l], FP.ncvreg[, l], FP.spdas[, l])
table_out <- t(table_out)
colnames(table_out) <- paste0('公式', 1:5)
row.names(table_out) <- c('Best SS', 'Forward', 'Backward', 'LASSO', 'SCAD', 'PDAS')
xtable(table_out, caption = 'False Positive,p = 10')

# p = 20
l <- 2
table_out <- cbind(FP.leaps[, l], FP.forward[, l], FP.backward[, l], FP.glmnet[, l], FP.ncvreg[, l], FP.spdas[, l])
table_out <- t(table_out)
colnames(table_out) <- paste0('公式', 1:5)
row.names(table_out) <- c('Best SS', 'Forward', 'Backward', 'LASSO', 'SCAD', 'PDAS')
xtable(table_out, caption = 'False Positive,p = 20')

# p = 30
l <- 3
table_out <- cbind(FP.leaps[, l], FP.forward[, l], FP.backward[, l], FP.glmnet[, l], FP.ncvreg[, l], FP.spdas[, l])
table_out <- t(table_out)
colnames(table_out) <- paste0('公式', 1:5)
row.names(table_out) <- c('Best SS', 'Forward', 'Backward', 'LASSO', 'SCAD', 'PDAS')
xtable(table_out, caption = 'False Positive,p = 30')


# ------------------ 2

n <- 200
# p_vec <- c(20, 30, 50, 100, 200, 500)
p_vec <- c(100, 200)
p_ture_vec <- c(4, 4, 4, 3, 2)

m.t.cor = sd.t.cor = m.cor_out = sd.cor_out = matrix(nrow = 5, ncol = length(p_vec))
m.t.dcor = sd.t.dcor = m.dcor_out = sd.dcor_out = matrix(nrow = 5, ncol = length(p_vec))
m.t.rf = sd.t.rf = m.rf_out = sd.rf_out = matrix(nrow = 5, ncol = length(p_vec))

for(m in 1:length(p_vec)) {
  p <- p_vec[m]
  for(k in 1:5) {
    t.cor = cor_out = vector()
    t.dcor = dcor_out = vector()
    t.rf = rf_out = vector()
    
    for(i in 1:100) {
      set.seed(2018 + i)
      dat_temp <- gen_dat(p, k)
      dat <- list()
      dat$x <- dat_temp[, 1:p]
      dat$y <- dat_temp[, p + 1]
      
      p_ture <- p_ture_vec[k]
      
      t.cor[i] <- system.time(cor_result <- sapply(1:p, function(j) cor(dat$x[, j], dat$y)))
      cor_out[i] <- max(order(cor_result, decreasing = T)[1:p_ture])
      
      t.dcor[i] <- system.time(dcor_result <- sapply(1:p, function(j) dcor(dat$x[, j], dat$y)))
      dcor_out[i] <- max(order(dcor_result, decreasing = T)[1:p_ture])
      
      t.rf[i] <- system.time(rf_result <- ranger(y ~ ., data = as.data.frame(dat_temp), importance = "impurity"))
      rf_out[i] <- max(order(rf_result$variable.importance, decreasing = T)[1:p_ture])
      print(paste(i, k ,m))
    }
    
    m.t.cor[k, m] <- mean(t.cor)
    sd.t.cor[k, m] <- sd(t.cor)
    m.cor_out[k, m] <- mean(cor_out)
    sd.cor_out[k, m] <- sd(cor_out)
    
    m.t.dcor[k, m] <- mean(t.dcor)
    sd.t.dcor[k, m] <- sd(t.dcor)
    m.dcor_out[k, m] <- mean(dcor_out)
    sd.dcor_out[k, m] <- sd(dcor_out)
    
    m.t.rf[k, m] <- mean(t.rf)
    sd.t.rf[k, m] <- sd(t.rf)
    m.rf_out[k, m] <- mean(rf_out)
    sd.rf_out[k, m] <- sd(rf_out)
  }
}

t.cor = cor_out = matrix(nrow = 5, ncol = 2)
t.dcor = dcor_out = matrix(nrow = 5, ncol = 2)
t.rf = rf_out = matrix(nrow = 5, ncol = 2)

options(scipen = 200)
for(i in 1:5) {
  for(j in 1:2) {
    t.cor[i, j] <- paste(round(m.t.cor[i, j], 4))
    t.dcor[i, j] <- paste(round(m.t.dcor[i, j], 4))
    t.rf[i, j] <- paste(round(m.t.rf[i, j], 4))
    
    cor_out[i, j] <- paste(round(m.cor_out[i, j], 2), '±', round(sd.cor_out[i, j], 2))
    dcor_out[i, j] <- paste(round(m.dcor_out[i, j], 2), '±', round(sd.dcor_out[i, j], 2))
    rf_out[i, j] <- paste(round(m.rf_out[i, j], 2), '±', round(sd.rf_out[i, j], 2))
  }
}

# save(t.cor, t.dcor, t.rf, cor_out, dcor_out, rf_out,
#      file = 'D:\\Kanny\\simulation2.rda')

## 为了转化到latex里面可输出的表格,所以使用了下面的代码
library(xtable)
#### Time
# p = 100
l <- 1
table_out <- cbind(t.cor[, l], t.dcor[, l], t.rf[, l])
table_out <- t(table_out)
colnames(table_out) <- paste0('公式', 1:5)
row.names(table_out) <- c('Cor-SIS', 'DCor-SIS', 'RF')
xtable(table_out, caption = 'Time,p = 100')

# p = 200
l <- 2
table_out <- cbind(t.cor[, l], t.dcor[, l], t.rf[, l])
table_out <- t(table_out)
colnames(table_out) <- paste0('公式', 1:5)
row.names(table_out) <- c('Cor-SIS', 'DCor-SIS', 'RF')
xtable(table_out, caption = 'Time,p = 200')

#### 个数
# p = 100
l <- 1
table_out <- cbind(cor_out[, l], dcor_out[, l], rf_out[, l])
table_out <- t(table_out)
colnames(table_out) <- paste0('公式', 1:5)
row.names(table_out) <- c('Cor-SIS', 'DCor-SIS', 'RF')
xtable(t(table_out), caption = '个数,p = 100')

# p = 200
l <- 2
table_out <- cbind(cor_out[, l], dcor_out[, l], rf_out[, l])
table_out <- t(table_out)
colnames(table_out) <- paste0('公式', 1:5)
row.names(table_out) <- c('Cor-SIS', 'DCor-SIS', 'RF')
xtable(t(table_out), caption = '个数,p = 200')

Real Data

# --------------------- example ---------------------
### 1

library(ISLR)
library(leaps) 
library(glmnet)

names(Hitters) 
regfit.full <- regsubsets(Salary ~ ., Hitters)
summary(regfit.full) 

regfit.fwd <- regsubsets(Salary ~ ., data = Hitters, method = "forward")
summary(regfit.fwd)

regfit.bwd <- regsubsets(Salary~ ., data = Hitters, method = "backward")
summary(regfit.bwd)

dat <- na.omit(Hitters)
x <- model.matrix(Salary ~ ., dat)[, -1]
y <- dat$Salary


fit.cv.glmnet <- cv.glmnet(x, y)
plot(fit.cv.glmnet)
fit.glmnet <- glmnet(x, y, lambda = fit.cv.glmnet$lambda.1se)
fit.glmnet <- glmnet(x, y)
plot(fit.glmnet, label = T)

beta.glmnet <- fit.glmnet$beta




### 2


##### example2

library(data.table)
dat_ex2 <- fread('D:\\Kanny\\FPS-5.csv', data.table = F)
y <- dat_ex2$feature3208

# 转为哑变量
y.matrix <- model.matrix(~ y, data.frame(1:3600, y))

library(snowfall)

sfInit(parallel = TRUE, cpus = 4)
sfExport('dat_ex2', 'y.matrix')
sfLibrary(energy)
result <- sfSapply(1:(ncol(dat_ex2) - 1), function(i) {
  dcor(y.matrix, dat_ex2[, i])
  print(i)
})

sfStop()

# save(result, file = 'D:\\Kanny\\ex2_result1.rda')
load('D:\\Kanny\\ex2_result1.rda')

n <- nrow(dat_ex2)
rest_num <- round(n / log(n))
or_result <- order(result, decreasing = T)
ind_rest <- which(or_result <= rest_num)

dat_new_x <- dat_ex2[, ind_rest]

fit.glmnet1 <- glmnet(x = as.matrix(dat_new_x), y = y, family = 'multinomial', type.multinomial = 'grouped')
plot(fit.glmnet1)

# require(doMC)
# registerDoMC(cores = 4)
fit.cv.glmnet <- cv.glmnet(x = as.matrix(dat_new_x), y = y, family = 'multinomial', grouped = T, nfolds = 5)
plot(fit.cv.glmnet)

fit.glmnet <- glmnet(x = as.matrix(dat_new_x), y = y, family = 'multinomial', type.multinomial = 'grouped', lambda = fit.cv.glmnet$lambda.1se)
beta.glmnet <- fit.glmnet$beta
beta0.glmnet <- fit.glmnet$a0

sum(beta.glmnet$saglik != 0)


你可能感兴趣的:(R,统计学习)