Perspective 1: the shape of the solution space
L1 regularization is generally understood to produce a sparse weight matrix, i.e., a model with sparse parameters.
L2 regularization, by contrast, drives the model's parameters toward small absolute values.
Consider how the level sets (contours) of the two regularization terms relate to the contours of the original loss function.
In the two-dimensional case, the contours of the L1 penalty are diamonds, while the contours of the L2 penalty are circles.
Why does adding a regularization term amount to imposing a constraint on the solution space?
Briefly, regularized training can be understood as a constrained optimization problem: by the KKT conditions, minimizing the loss subject to a norm constraint on the weights can be converted into minimizing the loss plus a regularization term. Because the L1 constraint region is a diamond whose corners lie on the coordinate axes, the loss contours tend to first touch it at a corner, where some coordinates are exactly zero, which is why L1 yields sparse solutions; the smooth L2 ball has no such corners.
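A sketch of that equivalence, in notation introduced here rather than taken from the text: for a suitable correspondence between the constraint radius m and a multiplier λ ≥ 0, the constrained problem and the penalized problem share a solution, with λ playing the role of the Lagrange multiplier in the KKT conditions.

\min_{w} L(w) \quad \text{s.t.} \quad \|w\|_1 \le m
\qquad \Longleftrightarrow \qquad
\min_{w} \; L(w) + \lambda \|w\|_1

\text{KKT stationarity:} \quad 0 \in \nabla L(w^{*}) + \lambda \, \partial \|w^{*}\|_1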
Perspective 2: superposition of functions
In short, for the two objectives L(w)+Cw^2 and L(w)+C|w|, the extra terms they add to the gradient are 2Cw and C·sign(w) respectively. The L2 contribution 2Cw shrinks to zero together with w, whereas the L1 contribution keeps the constant magnitude C no matter how small w becomes; this constant pull can drive weights all the way to exactly zero, which is what makes the solution sparse.
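A small numerical sketch of this point (the coefficient and weight values here are arbitrary): compare the gradient contributed by an L2 penalty C*w^2 with that of an L1 penalty C*|w| as w shrinks.

import torch

C = 0.1
for w0 in [1.0, 0.1, 0.01, 0.001]:
    w = torch.tensor(w0, requires_grad=True)
    (C * w ** 2).backward()   # gradient of the L2 penalty: 2*C*w
    grad_l2 = w.grad.item()

    w = torch.tensor(w0, requires_grad=True)
    (C * w.abs()).backward()  # gradient of the L1 penalty: C*sign(w)
    grad_l1 = w.grad.item()

    print(f"w={w0:<6} dL2/dw={grad_l2:.4f}  dL1/dw={grad_l1:.4f}")

# The L2 gradient vanishes along with w, so it never pushes a weight exactly to zero,
# while the L1 gradient keeps the constant magnitude C and can zero small weights out.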
Perspective 3: L1 regularization corresponds to placing a Laplace prior on the weights, and L2 regularization to a Gaussian prior. The Laplace density has a sharp peak at zero, which under MAP estimation favors weights that are exactly zero, whereas the Gaussian density is smooth around its mode and merely shrinks the weights toward zero.
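A short derivation sketch (notation mine, not from the text): under maximum a posteriori (MAP) estimation the negative log-prior becomes the penalty term, so a Laplace prior yields an L1 penalty and a Gaussian prior yields an L2 penalty.

\hat{w}_{\mathrm{MAP}} = \arg\max_{w} \; p(D \mid w)\, p(w) = \arg\min_{w} \; \big[ -\log p(D \mid w) - \log p(w) \big]

\text{Laplace prior: } p(w) \propto e^{-|w|/b} \;\Rightarrow\; -\log p(w) = \tfrac{1}{b}|w| + \text{const} \quad (\text{L1})

\text{Gaussian prior: } p(w) \propto e^{-w^{2}/(2\sigma^{2})} \;\Rightarrow\; -\log p(w) = \tfrac{1}{2\sigma^{2}} w^{2} + \text{const} \quad (\text{L2})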
If only L2 regularization is needed, it can also be implemented through the optimizer's weight_decay argument.
The weight_decay argument makes the parameters decay during training, which for plain SGD is equivalent to L2 regularization.
before L2 regularization:
gradient descent: w = w - lr * dloss_dw
after L2 regularization:
gradient descent: w = w - lr * (dloss_dw+beta*w) = (1-lr*beta)*w - lr*dloss_dw
so (1-lr*beta) is the weight decay ratio.
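A minimal sanity check of this equivalence for plain SGD (the layer, data, and coefficients below are made up for illustration): one copy of a layer is updated with weight_decay=beta, the other with an explicit 0.5*beta*||w||^2 term added to the loss; after one step the parameters should match.

import copy
import torch
from torch import nn

torch.manual_seed(0)
net_a = nn.Linear(3, 1)
net_b = copy.deepcopy(net_a)
x, y = torch.randn(8, 3), torch.randn(8, 1)
lr, beta = 0.1, 0.01

# (a) let the optimizer apply the decay
opt_a = torch.optim.SGD(net_a.parameters(), lr=lr, weight_decay=beta)
opt_a.zero_grad()
(net_a(x) - y).pow(2).mean().backward()
opt_a.step()

# (b) add the L2 penalty to the loss by hand
opt_b = torch.optim.SGD(net_b.parameters(), lr=lr, weight_decay=0.0)
opt_b.zero_grad()
penalty = 0.5 * beta * sum((p ** 2).sum() for p in net_b.parameters())
((net_b(x) - y).pow(2).mean() + penalty).backward()
opt_b.step()

for p_a, p_b in zip(net_a.parameters(), net_b.parameters()):
    print(torch.allclose(p_a, p_b))  # expected: True, True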
PyTorch optimizers support what are called per-parameter options: a specific learning rate, weight decay, etc. can be set for each parameter group, which allows for more fine-grained control.
# apply weight decay to the weights only; leave the bias terms unregularized
weight_params = [param for name, param in model.named_parameters() if "bias" not in name]
bias_params = [param for name, param in model.named_parameters() if "bias" in name]
optimizer = torch.optim.SGD([{'params': weight_params, 'weight_decay': 1e-5},
                             {'params': bias_params, 'weight_decay': 0}],
                            lr=1e-2, momentum=0.9)
The code below requires:
pip install torchkeras==3.2.3
pytorch==1.12.1
import numpy as np
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, alpha=0.75):
        super().__init__()
        self.gamma = gamma  # focusing parameter: larger gamma down-weights easy examples more
        self.alpha = alpha  # class-balance weight for the positive class

    def forward(self, y_pred, y_true):
        # y_pred is expected to contain probabilities in (0, 1)
        bce = torch.nn.BCELoss(reduction="none")(y_pred, y_true)
        p_t = (y_true * y_pred) + ((1 - y_true) * (1 - y_pred))  # probability of the true class
        alpha_factor = y_true * self.alpha + (1 - y_true) * (1 - self.alpha)
        modulating_factor = torch.pow(1.0 - p_t, self.gamma)  # (1 - p_t)^gamma
        loss = torch.mean(alpha_factor * modulating_factor * bce)
        return loss
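A quick illustration (the prediction values below are made up): focal loss scales each example's BCE by (1 - p_t)^gamma, so easy, confidently classified examples are down-weighted far more strongly than hard ones, and the hard samples end up dominating the total loss.

focal = FocalLoss(gamma=2.0, alpha=0.75)
bce = nn.BCELoss()
y_true = torch.tensor([[1.0]])
y_easy = torch.tensor([[0.95]])  # confident, correct prediction
y_hard = torch.tensor([[0.55]])  # barely correct prediction
print("easy:", focal(y_easy, y_true).item(), "vs bce", bce(y_easy, y_true).item())
print("hard:", focal(y_hard, y_true).item(), "vs bce", bce(y_hard, y_true).item())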
import torch
# L2 regularization
def L2Loss(model, alpha):
    l2_loss = torch.tensor(0.0, requires_grad=True)
    for name, param in model.named_parameters():
        if 'bias' not in name:  # bias terms are usually not regularized
            l2_loss = l2_loss + (0.5 * alpha * torch.sum(torch.pow(param, 2)))
    return l2_loss
# L1 regularization
def L1Loss(model, beta):
    l1_loss = torch.tensor(0.0, requires_grad=True)
    for name, param in model.named_parameters():
        if 'bias' not in name:
            l1_loss = l1_loss + beta * torch.sum(torch.abs(param))
    return l1_loss
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader,TensorDataset
import torchkeras
# %matplotlib inline
# %config InlineBackend.figure_format = 'svg'
# number of positive and negative samples
n_positive,n_negative = 1000,6000
# generate positive samples on a small annulus
r_p = 5.0 + torch.normal(0.0,1.0,size = [n_positive,1])
theta_p = 2*np.pi*torch.rand([n_positive,1])
Xp = torch.cat([r_p*torch.cos(theta_p),r_p*torch.sin(theta_p)],axis = 1)
Yp = torch.ones_like(r_p)
# generate negative samples on a larger annulus
r_n = 8.0 + torch.normal(0.0,1.0,size = [n_negative,1])
theta_n = 2*np.pi*torch.rand([n_negative,1])
Xn = torch.cat([r_n*torch.cos(theta_n),r_n*torch.sin(theta_n)],axis = 1)
Yn = torch.zeros_like(r_n)
# combine the samples
X = torch.cat([Xp,Xn],axis = 0)
Y = torch.cat([Yp,Yn],axis = 0)
# visualize
plt.figure(figsize = (6,6))
plt.scatter(Xp[:,0],Xp[:,1],c = "r")
plt.scatter(Xn[:,0],Xn[:,1],c = "g")
plt.legend(["positive","negative"]);
ds = TensorDataset(X,Y)
ds_train,ds_val = torch.utils.data.random_split(ds,[int(len(ds)*0.7),len(ds)-int(len(ds)*0.7)])
dl_train = DataLoader(ds_train,batch_size = 100,shuffle=True,num_workers=2)
dl_val = DataLoader(ds_val,batch_size = 100,num_workers=2)
features,labels = next(iter(dl_train))
print('features={} labels={}'.format(features, labels))
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(2, 4)
        self.fc2 = nn.Linear(4, 8)
        self.fc3 = nn.Linear(8, 1)  # outputs a logit; the sigmoid is applied inside the loss

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        y = self.fc3(x)
        return y
net = Net()
from torchkeras import summary
summary(net, features)
# L2 regularization
def L2Loss(model, alpha):
    l2_loss = torch.tensor(0.0, requires_grad=True)
    for name, param in model.named_parameters():
        if 'bias' not in name:  # bias terms are usually not regularized
            l2_loss = l2_loss + (0.5 * alpha * torch.sum(torch.pow(param, 2)))
    return l2_loss
# L1 regularization
def L1Loss(model, beta):
    l1_loss = torch.tensor(0.0, requires_grad=True)
    for name, param in model.named_parameters():
        if 'bias' not in name:
            l1_loss = l1_loss + beta * torch.sum(torch.abs(param))
    return l1_loss
from torchkeras import KerasModel
from torchkeras.metrics import AUCROC
net = Net()
# Add the L2 and L1 penalties to the FocalLoss; the sum serves as the training objective.
def focal_loss_with_regularization(y_pred, y_true):
    y_probs = torch.sigmoid(y_pred)
    focal = FocalLoss()(y_probs, y_true)
    l2_loss = L2Loss(net, 0.001)  # note: set the regularization coefficients here
    l1_loss = L1Loss(net, 0.001)
    total_loss = focal + l2_loss + l1_loss
    return total_loss
optimizer = torch.optim.Adam(net.parameters(),lr = 0.002)
model = KerasModel(net=net,
loss_fn = focal_loss_with_regularization ,
metrics_dict = {"auc":AUCROC()},
optimizer= optimizer )
dfhistory = model.fit(train_data=dl_train,
val_data=dl_val,
epochs=20,
ckpt_path='checkpoint.pt',
patience=3,
monitor='val_auc',
mode='max')
# visualize the results
fig, (ax1,ax2) = plt.subplots(nrows=1,ncols=2,figsize = (12,5))
ax1.scatter(Xp[:,0],Xp[:,1], c="r")
ax1.scatter(Xn[:,0],Xn[:,1],c = "g")
ax1.legend(["positive","negative"]);
ax1.set_title("y_true");
with torch.no_grad():
    y_prob = torch.sigmoid(net(X))
Xp_pred = X[torch.squeeze(y_prob >= 0.5)]
Xn_pred = X[torch.squeeze(y_prob < 0.5)]
ax2.scatter(Xp_pred[:,0],Xp_pred[:,1],c = "r")
ax2.scatter(Xn_pred[:,0],Xn_pred[:,1],c = "g")
ax2.legend(["positive","negative"]);
ax2.set_title("y_pred");
fig.show()