R包glmnet是通过惩罚最大似然法拟合广义线性和类似模型的软件包。
#install.packages("glmnet", repos = "https://cran.us.r-project.org")
library(glmnet)
ls("package:glmnet")
# x = matrix(rnorm(100 * 20), 100, 20)*10
# y = rnorm(100)
### 1. 加载包和数据
data(QuickStartExample)
x <- QuickStartExample$x
y <- QuickStartExample$y
### 2. 拟合模型
fit = glmnet(x,y,alpha = 1,family = "gaussian",standardize=TRUE) # lasso回归
##glmnet主要参数说明:
# alpha: elasticnet mixing parameter,0≤α≤ 1,
# alpha=1(默认值) is the lasso penalty, and alpha=0 the ridge penalty.
# nlambda: 默认100
# lambda: 用户提供的lambda序列。默认是让程序根据nlambda和lambda计算的lambda序列
# standardize:数据是否要标准化处理,默认为TRUE
# intercept,线性模型的截距,默认为TRUE
# family:模型中使用的误差分布和链接函数,取值范围:"gaussian", "binomial",
# "poisson", "multinomial","cox", "mgaussian",默认为gaussian。
#fit = glmnet(x,y,alpha = 0,family = "gaussian",standardize=TRUE) # Ridge回归
#fit = glmnet(x,y,alpha = 0.3,family = "gaussian",standardize=TRUE) # elasticnet
# # 自定义lambdas
# lambdas <- 10^seq(2, -2, by = -.1) # 100和0.01之间
# fit = glmnet(x,y,alpha = 1,family = "gaussian",
# lambda=lambdas,standardize=TRUE)
class(fit) # [1] "elnet" "glmnet"
print(fit)
# glmnet返回一系列模型供用户选择(不同lambda下训练好的模型)
## 输出结果解读:
# Df:the number of nonzero coefficients
# %dev:the percent deviance explained
# Lambda: the corresponding value of λ,可能小于指定的lambda数,由于算法停止规则
# help(glmnet.control) # internal glmnet parameters
### 3.查看模型参数
any(fit$lambda == 0.5)# 是否有此lambda值的模型(训练好的)
# 所有lambda值下的模型参数
coef(fit)
# 指定lambda值下的模型参数
coef(fit,s = 0.01)
coef(fit,s = c(10,1,0.1))
### 4. 预测
# 不同lambda下模型的预测结果。注:一个lambda对应一个训练好的模型
predict(fit, newx = x[1:5,],type = "response")
# type:“link”, “response”, “coefficients”, “nonzero”其中的一个
# 指定参数lambda,模型的预测结果
predict(fit, newx = x[1:5,], type = "response", s = 0.00351)
predict(fit, newx = x[1:5,], type = "response", s = 0.5)
### 5. 模型拟合过程作图
plot(fit,label = TRUE)
plot(fit, xvar = "lambda", label = TRUE)
## 参数 xvar: “norm” for the ℓ1-norm of the coefficients (default),
# “lambda” for the log-lambda value and “dev” for %deviance explained
## 参数label: 是否显示变量名
### 6. 交叉验证确定模型的最佳lambda(超参数)
# glmnet返回一系列模型供用户选择
cvfit <- cv.glmnet(x, y) # 默认10倍交叉验证(nfolds = 10)。
# 返回cv.glmnet对象,包含交叉验证的所有fits要素统计。
# lambda.min:is the value of λ that gives minimum mean cross-validated error
# lambda.1se: is the value of λ that gives the most regularized model such that
# the cross-validated error is within one standard error of the minimum
plot(cvfit)
cvfit$lambda.min # 交叉验证平均误差最小时模型的lambda
cvfit$lambda.1se
# 得到交叉验证平均误差最小的lambda时模型的系数(coefficients)
coef(cvfit, s = "lambda.min")
coef(cvfit) # 默认s = "lambda.1se"
coef(cvfit, s = "lambda.1se")
predict(cvfit, newx = x[1:5,], s = "lambda.min")
predict(cvfit, newx = x[1:5,])
### 7. Cox 回归
# Cox回归
data(CoxExample)
fit = glmnet(CoxExample$x,CoxExample$y,
alpha = 1,family = "cox")
## alpha = 1,用于变量选择,去除系数为零的变量
cvfit <- cv.glmnet(CoxExample$x,CoxExample$y,
alpha = 1,family = "cox")
# type.measure :deviance
# cvfit <- cv.glmnet(CoxExample$x,CoxExample$y,
# alpha = 1,family = "cox",
# type.measure= "C")
plot(cvfit)
# type.measure="C" is Harrel's concordance measure
cvfit$lambda.min
coef(cvfit,s = cvfit$lambda.min)
coef(cvfit,s = cvfit$lambda.1se)
# 预测生存时间
predict(cvfit, newx = CoxExample$x[1:10, ],
s = cvfit$lambda.min,
type = "response")
# type:“link”, “response”, “coefficients”, “nonzero”其中的一个
## 和survival包coxph比较
library(survival)
set.seed(1)
nobs <- 100; nvars <- 15
x <- matrix(rnorm(nobs * nvars), nrow = nobs)
# create response
ty <- rep(rexp(nobs / 5), each = 5)
tcens <- rbinom(n = nobs, prob = 0.3, size = 1)
y <- Surv(ty, tcens)
glmnet_fit <- glmnet(x, y, family = "cox", lambda = 0)
# lambda =0, 没有正则项,等于coxph_fit
coxph_fit <- coxph(y ~ x)
# glmnet函数,当lambda =0, 没有正则项,等于coxph_fit
plot(coef(glmnet_fit), coef(coxph_fit))
abline(0, 1)
###8. 其他回归实例
## 当y仅包含两种分类时可以使用二分类模型,
data(BinomialExample)
x <- BinomialExample$x
y <- BinomialExample$y
fit = glmnet(x,y,alpha = 1,family = "binomial")
plot(fit,label = TRUE)
#指定lambda在0.05和0.01时预测新样本的类别,type = "class"指定输出值为类别
predict(fit, newx = x[1:5,], type = "class", s = c(0.05, 0.01))
#交叉验证
#cvfit = cv.glmnet(x, y, family = "binomial", type.measure = "class")
cvfit = cv.glmnet(x, y, family = "binomial", type.measure = "auc")
plot(cvfit)
## 当y为多分类时
data(MultinomialExample)
x <- MultinomialExample$x
y <- MultinomialExample$y
#拟合模型
fit = glmnet(x, y, family = "multinomial", type.multinomial = "grouped")
predict(fit, newx = x[1:10,], type = "class")
#交叉验证
cvfit=cv.glmnet(x, y, family="multinomial", type.multinomial = "grouped",
parallel = TRUE)
#cvfit = cv.glmnet(x, y, family = "multinomial", type.measure = "class")
# Only deviance, class, mse, mae available as type.measure for Multinomial models;
#绘图展示
plot(fit, xvar = "lambda", label = TRUE, type.coef = "2norm")
plot(cvfit)
#预测模型
predict(cvfit, newx = x[1:10,], s = "lambda.min", type = "class")