基于周志华老师的《机器学习》、上一篇学习笔记以及网络的其他资料,对线性模型的这一部分内容进行一个总结。上接:机器学习:线性模型学习总结(2)。
学习时间:2022.04.19~2022.04.20
和用Sk-Learn时一样,这里也用了一个专门处理表格数据的函数,不过主要还是沿用之前的流程:
# NOTE(review): this is a function body whose `def` line is not visible here;
# df_x / df_y are presumably its feature and label arguments - confirm.
# Data preprocessing: run the features through mango_processing (defined
# elsewhere) and cast both features and labels to float
df_x = mango_processing(df_x).astype(float)
df_y = df_y.astype(float)
# Split into training and test sets (20% held out, fixed random_state)
tr_x, te_x, tr_y, te_y = train_test_split(df_x, df_y, test_size=0.2, random_state=42)
# Convert all four splits to tensors
train_tensor_x, test_tensor_x, train_tensor_y, test_tensor_y = map(torch.tensor, (np.array(tr_x), np.array(te_x), np.array(tr_y), np.array(te_y)))
# Cast the labels to long or float, depending on the loss function in use:
# train_tensor_y = train_tensor_y.squeeze(-1).long()
# test_tensor_y = test_tensor_y.squeeze(-1).long()
train_tensor_y = train_tensor_y.squeeze(-1).float()
test_tensor_y = test_tensor_y.squeeze(-1).float()
# Return the training and test tensors (features first, then labels)
return train_tensor_x, test_tensor_x, train_tensor_y, test_tensor_y
因为全连接层不加激活函数,就只相当于加权求和,所以就可以当做线性回归:
class LinearModel(nn.Module):
    """Linear regression expressed as a single fully connected layer.

    Without an activation function the layer is only a weighted sum of its
    inputs plus a bias, which is exactly a linear regression model.

    Args:
        in_features: Number of input features per sample. Defaults to 14,
            the value that used to be hard-coded.
    """

    def __init__(self, in_features: int = 14):
        super().__init__()
        # The attribute keeps its original spelling (`liner`) so that the
        # parameter names in existing state_dicts remain valid.
        self.liner = nn.Linear(in_features, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return one prediction per sample, without the trailing size-1 dim."""
        # Cast to float32 so the input matches the dtype of the layer weights.
        x = x.to(torch.float32)
        x = self.liner(x)
        # nn.MSELoss wants predictions and targets of the same shape, so the
        # trailing dimension of size 1 is squeezed away.
        return x.squeeze(-1)
在最后加一个Sigmoid函数实现逻辑回归分类:
class LinearModel(nn.Module):
    """Logistic-regression style classifier: one linear layer plus a sigmoid.

    Args:
        in_features: Number of input features per sample. Defaults to 24,
            the value that used to be hard-coded.
        num_classes: Number of output units. Defaults to 2, the value that
            used to be hard-coded.
    """

    def __init__(self, in_features: int = 24, num_classes: int = 2):
        super().__init__()
        # The attribute keeps its original spelling (`liner`) so that the
        # parameter names in existing state_dicts remain valid.
        self.liner = nn.Linear(in_features, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return per-class scores squashed element-wise into (0, 1)."""
        # Cast to float32 so the input matches the dtype of the layer weights.
        x = x.to(torch.float32)
        x = self.liner(x)
        # NOTE(review): the training script later in this file feeds these
        # outputs to nn.CrossEntropyLoss, which expects raw logits - confirm
        # that applying the sigmoid first is intended.
        return torch.sigmoid(x)
这里学习了TorchMetrics这个包,可以直接调用它来评价回归结果。老样子,还是先写了一个函数:
# NOTE(review): y_pred and y_true are presumably parameters of the enclosing
# function, whose `def` line is not visible here.
# Each metric object is called once on (y_pred, y_true) and then read back
# with .compute().
# Compute the mean squared error (MSE)
mean_squared_error = torchmetrics.MeanSquaredError()
mean_squared_error(y_pred, y_true)
mse = mean_squared_error.compute()
print('MSE:', mse, end='; ')
# Compute the mean absolute error (MAE)
mean_absolute_error = torchmetrics.MeanAbsoluteError()
mean_absolute_error(y_pred, y_true)
mae = mean_absolute_error.compute()
print('MAE:', mae, end='; ')
# Compute the mean absolute percentage error (MAPE)
mean_absolute_percentage_error = torchmetrics.MeanAbsolutePercentageError()
mean_absolute_percentage_error(y_pred, y_true)
mape = mean_absolute_percentage_error.compute()
print('MAPE:', mape, end='; ')
# Compute the explained variance (EV)
explained_variance = torchmetrics.ExplainedVariance()
explained_variance(y_pred, y_true)
ev = explained_variance.compute()
print('EV:', ev, end='; ')
# Compute the R2 score (coefficient of determination)
r2_score = torchmetrics.R2Score()
r2_score(y_pred, y_true)
r2 = r2_score.compute()
print('R2-Score:', r2, end='.')
同上:
# NOTE(review): y_pred, y_true and calss_num are presumably parameters of the
# enclosing function, whose `def` line is not visible here. `calss_num` looks
# like a misspelling of `class_num`; it can only be renamed together with that
# signature.
# Compute the accuracy
accuracy = torchmetrics.Accuracy()
accuracy(y_pred, y_true)
acc = accuracy.compute()
print('Accuracy:', acc, end='; ')
# Compute the precision
precision = torchmetrics.Precision(average='macro', num_classes=calss_num) # must be set to the number of predicted classes
precision(y_pred, y_true)
pre = precision.compute()
print('Precision:', pre, end='; ')
# Compute the recall
recall = torchmetrics.Recall(average='macro', num_classes=calss_num) # must be set to the number of predicted classes
recall(y_pred, y_true)
rec = recall.compute()
print('Recall:', rec, end='; ')
# Compute the F1-score
f1_score = torchmetrics.F1Score(num_classes=calss_num)
f1_score(y_pred, y_true)
f1 = f1_score.compute()
print('F1-Score:', f1, end='; ')
# Compute the AUROC
auroc = torchmetrics.AUROC(average='macro', num_classes=calss_num)
auroc(y_pred, y_true)
auc = auroc.compute()
print('AUROC:', auc, end='.')
# Reset the AUROC metric's accumulated state
auroc.reset()
数据来源:New York City Taxi Fare Prediction | Kaggle。
# Load the training data and derive the time features
df = pd.read_csv('train.csv')
# Parse the pickup timestamp and drop its timezone information
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime']).dt.tz_localize(None)
# Use the vectorised .dt accessors instead of a per-row strftime lambda;
# the resulting integer columns are the same, without a Python call per row.
pickup_dt = df['pickup_datetime'].dt
df['hour'] = pickup_dt.hour.astype(int)
df['minute'] = pickup_dt.minute.astype(int)
df['second'] = pickup_dt.second.astype(int)
# Calendar date encoded as the integer YYYYMMDD (what '%Y%m%d' produced)
df['date'] = (pickup_dt.year * 10000 + pickup_dt.month * 100 + pickup_dt.day).astype(int)
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that only added a stray "None" line).
df.info()
# Separate the prediction target from the feature columns
target = df.fare_amount
data = df.drop(['fare_amount', 'key', 'pickup_datetime'], axis=1)
# Split into training/test sets and convert to tensors
tr_tx, te_tx, tr_ty, te_ty = data_to_tensor(data, target)
# ---------------------------------------定义网络---------------------------------------
class LinearModel(nn.Module):
    """Linear regression expressed as a single fully connected layer.

    Args:
        in_features: Number of input features per sample. Defaults to 14,
            the value that used to be hard-coded.
    """

    def __init__(self, in_features: int = 14):
        super().__init__()
        # The attribute keeps its original spelling (`liner`) so that the
        # parameter names in existing state_dicts remain valid.
        self.liner = nn.Linear(in_features, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return one prediction per sample, without the trailing size-1 dim."""
        # Cast to float32 so the input matches the dtype of the layer weights.
        x = x.to(torch.float32)
        x = self.liner(x)
        # nn.MSELoss wants predictions and targets of the same shape, so the
        # trailing dimension of size 1 is squeezed away.
        return x.squeeze(-1)
# ------------- Training setup (reusable apart from the hyperparameters) -------------
# Fix the random seed so that results are reproducible
seed = 42
torch.manual_seed(seed)
# Instantiate the model
model = LinearModel()
# Pick the device (GPU when available, otherwise CPU) and move the model to it
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
# Wrap the training tensors in a DataLoader
ds = TensorDataset(tr_tx, tr_ty)
dl = DataLoader(ds, batch_size=128, shuffle=True)
# Learning rate
lr = 1e-5
# Number of training epochs
epoch = 100
# Print the evaluation metrics every `show_step` epochs
show_step = 10
# Optimizer
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=0.01)
# Loss function
criterion = nn.MSELoss()
# ------------- Training loop (reusable) -------------
# A separate loop variable is used so the `epoch` setting is not overwritten.
for ep in range(epoch + 1):
    for x, y in dl:
        # The model lives on `device`, so each batch has to be moved there
        # too; otherwise the forward pass fails when a GPU is in use.
        x, y = x.to(device), y.to(device)
        pred = model(x)  # forward pass
        loss = criterion(pred, y)  # compute the loss
        optimizer.zero_grad()  # clear the accumulated gradients
        loss.backward()  # backward pass
        optimizer.step()  # update the parameters
    if ep % show_step == 0:  # controls how often the metrics are printed
        with torch.no_grad():
            print('epoch: ', ep)
            # Predict on the model's device, then bring the results back to
            # the CPU, where the label tensors are.
            tr_pred = model(tr_tx.to(device)).cpu()  # training-set predictions
            te_pred = model(te_tx.to(device)).cpu()  # test-set predictions
            all_regress_evaluation(tr_pred, tr_ty, te_pred, te_ty)
# ------------- Predict on the validation set and export (mostly reusable) -------------
df_v = pd.read_csv('test.csv')  # load the validation data
# Parse the pickup timestamp and drop its timezone information
df_v['pickup_datetime'] = pd.to_datetime(df_v['pickup_datetime']).dt.tz_localize(None)
# Same time features as for the training data, built with the vectorised .dt
# accessors instead of a per-row strftime lambda.
pickup_dt_v = df_v['pickup_datetime'].dt
df_v['hour'] = pickup_dt_v.hour.astype(int)
df_v['minute'] = pickup_dt_v.minute.astype(int)
df_v['second'] = pickup_dt_v.second.astype(int)
# Calendar date encoded as the integer YYYYMMDD (what '%Y%m%d' produced)
df_v['date'] = (pickup_dt_v.year * 10000 + pickup_dt_v.month * 100 + pickup_dt_v.day).astype(int)
va_x = df_v.drop(['key', 'pickup_datetime'], axis=1)  # drop columns (there is no target column here)
va_x = mango_processing(va_x).astype(float)  # data preprocessing
va_tx = torch.tensor(np.array(va_x))  # convert to a tensor
# Inference only: no autograd graph is needed. The input goes to the model's
# device and the result comes back to the CPU so it can be turned into NumPy.
with torch.no_grad():
    va_pred = model(va_tx.to(device)).cpu()
va_id = df_v['key']  # identifier column
va_out = pd.DataFrame({'key': va_id, 'fare_amount': va_pred.numpy()})  # build the output DataFrame
va_out['fare_amount'] = va_out['fare_amount'].apply(lambda x: round(x, 2))  # keep two decimals
va_out.to_csv('Valid Prediction.csv', index=False)  # write the CSV without the index column
数据来源:Spaceship Titanic | Kaggle。
# Load the training data, then separate the label from the feature columns
df = pd.read_csv('train.csv')
target = df['Transported']
data = df.drop(columns=['PassengerId', 'Transported', 'Name', 'Cabin'])
# Split into training/test sets and convert everything to tensors
tr_tx, te_tx, tr_ty, te_ty = data_to_tensor(data, target)
# ------------------------------------------定义网络------------------------------------------
class LinearModel(nn.Module):
    """Logistic-regression style classifier: one linear layer plus a sigmoid.

    Args:
        in_features: Number of input features per sample. Defaults to 24,
            the value that used to be hard-coded.
        num_classes: Number of output units. Defaults to 2, the value that
            used to be hard-coded.
    """

    def __init__(self, in_features: int = 24, num_classes: int = 2):
        super().__init__()
        # The attribute keeps its original spelling (`liner`) so that the
        # parameter names in existing state_dicts remain valid.
        self.liner = nn.Linear(in_features, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return per-class scores squashed element-wise into (0, 1)."""
        # Cast to float32 so the input matches the dtype of the layer weights.
        x = x.to(torch.float32)
        x = self.liner(x)
        # NOTE(review): this script trains with nn.CrossEntropyLoss, which
        # expects raw logits - confirm that applying the sigmoid first is
        # intended.
        return torch.sigmoid(x)
# ------------- Training setup (fully reusable apart from the hyperparameters) -------------
# Fix the random seed so that results are reproducible
seed = 42
torch.manual_seed(seed)
# Instantiate the model
model = LinearModel()
# Pick the device (GPU when available, otherwise CPU) and move the model to it
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
# nn.CrossEntropyLoss takes class indices as int64 targets. Casting here keeps
# the script working whichever label dtype data_to_tensor produced; .long()
# changes nothing when the labels are already int64.
tr_labels = tr_ty.long()
te_labels = te_ty.long()
# Wrap the training tensors in a DataLoader
ds = TensorDataset(tr_tx, tr_labels)
dl = DataLoader(ds, batch_size=256, shuffle=True)
# Learning rate
lr = 1e-3
# Number of training epochs
epoch = 70
# Print the evaluation metrics every `show_step` epochs
show_step = 10
# Optimizer
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=0.01)
# Loss function
criterion = nn.CrossEntropyLoss()
# ------------- Training loop (fully reusable) -------------
# A separate loop variable is used so the `epoch` setting is not overwritten.
for ep in range(epoch + 1):
    for x, y in dl:
        # The model lives on `device`, so each batch has to be moved there
        # too; otherwise the forward pass fails when a GPU is in use.
        x, y = x.to(device), y.to(device)
        pred = model(x)  # forward pass
        loss = criterion(pred, y)  # compute the loss
        optimizer.zero_grad()  # clear the accumulated gradients
        loss.backward()  # backward pass
        optimizer.step()  # update the parameters
    if ep % show_step == 0:  # controls how often the metrics are printed
        with torch.no_grad():
            print('epoch: ', ep)
            # Predict on the model's device, then bring the results back to
            # the CPU, where the label tensors are.
            tr_pred = model(tr_tx.to(device)).cpu()  # training-set predictions
            te_pred = model(te_tx.to(device)).cpu()  # test-set predictions
            all_classify_evaluation(tr_pred, tr_labels, te_pred, te_labels, 2)
# Plot the curve from the final training-set predictions; no gradients are
# needed for plotting, so the forward pass runs under no_grad.
with torch.no_grad():
    tr_pred = model(tr_tx.to(device)).cpu()
torch_plot_curve(tr_pred, tr_labels)
# ------------- Predict on the validation set and export (mostly reusable) -------------
df_v = pd.read_csv('test.csv')  # load the validation data
va_x = df_v.drop(['PassengerId', 'Name', 'Cabin'], axis=1)  # drop columns (there is no target column here)
va_x = mango_processing(va_x).astype(float)  # data preprocessing
va_tx = torch.tensor(np.array(va_x))  # convert to a tensor
# Inference only: no autograd graph is needed. The input goes to the model's
# device and the result comes back to the CPU.
with torch.no_grad():
    va_pred = model(va_tx.to(device)).cpu()
# Classification: take the index of the highest-scoring class for every row,
# as a NumPy array so pandas can use it directly as a column.
va_y = va_pred.argmax(dim=1).numpy()
va_id = df_v['PassengerId']  # identifier column
va_out = pd.DataFrame({'PassengerId': va_id, 'Transported': va_y})  # build the output DataFrame
va_out['Transported'] = va_out['Transported'].astype(bool)  # turn the class index back into a boolean label
va_out.to_csv('Valid Prediction.csv', index=False)  # write the CSV without the index column