ML2020spring - hw2
该作业kaggle地址:ML2020spring - hw2 Classification - Binary Income Prediction
一个关于数据的二项分类问题:由一系列给出的特征数据,判断此人的年收入是否超过 50K(五万)美元。
数据处理部分沿用了numpy的数据处理方式,神经网络的实现部分用了pytorch
数据预处理部分:
import os
import csv
import torch
from torch import nn
import numpy as np
import matplotlib.pyplot as plt
# Paths to the kaggle hw2 data files; output_fpath is a template for results.
X_train_fpath = './lhy_DL_Hw/hw2_data/X_train'
Y_train_fpath = './lhy_DL_Hw/hw2_data/Y_train'
X_test_fpath = './lhy_DL_Hw/hw2_data/X_test'
output_fpath = './lhy_DL_Hw/output_{}.csv'

# Each file starts with a header row (skipped with next) and an id column
# (dropped by slicing); the remaining fields are parsed as floats.
with open(X_train_fpath) as f:
    next(f)
    X_train = np.array([row.strip('\n').split(',')[1:] for row in f], dtype=float)

with open(Y_train_fpath) as f:
    next(f)
    Y_train = np.array([row.strip('\n').split(',')[1] for row in f], dtype=float)

with open(X_test_fpath) as f:
    next(f)
    X_test = np.array([row.strip('\n').split(',')[1:] for row in f], dtype=float)
def _train_dev_split(X, Y, dev_ratio = 0.25):
# This function spilts data into training set and development set.
train_size = int(len(X) * (1 - dev_ratio))
return X[:train_size], Y[:train_size], X[train_size:], Y[train_size:]
def _normalize(X, train=True, specified_column=None, X_mean=None, X_std=None):
if specified_column == None:
specified_column = np.arange(X.shape[1])
if train:
X_mean = np.mean(X[:, specified_column], 0).reshape(1, -1)
X_std = np.std(X[:, specified_column], 0).reshape(1, -1)
X[:, specified_column] = (X[:, specified_column] - X_mean) / (X_std + 1e-8)
return X, X_mean, X_std
# Hold out the last 10% of the training data as a development set.
dev_ratio = 0.1
X_train, Y_train, X_dev, Y_dev = _train_dev_split(X_train, Y_train, dev_ratio=dev_ratio)

# BUG FIX: dev/test must be normalized with train=False.  The original
# omitted it, so _normalize (train defaults to True) recomputed mean/std
# from X_dev / X_test and silently ignored the supplied X_mean / X_std,
# i.e. each split was normalized with its own statistics.
X_train, X_mean, X_std = _normalize(X_train)
X_dev, _, _ = _normalize(X_dev, train=False, X_mean=X_mean, X_std=X_std)
X_test, _, _ = _normalize(X_test, train=False, X_mean=X_mean, X_std=X_std)
构造数据集,并搭建神经网络:
# Wrap the numpy arrays as torch tensors / datasets / loaders.
X_train = torch.from_numpy(X_train)
Y_train = torch.from_numpy(Y_train)
X_dev = torch.from_numpy(X_dev)
Y_dev = torch.from_numpy(Y_dev)
X_test = torch.from_numpy(X_test)

train_dataset = torch.utils.data.TensorDataset(X_train, Y_train)
dev_dataset = torch.utils.data.TensorDataset(X_dev, Y_dev)
test_dataset = torch.utils.data.TensorDataset(X_test)

# Only the training loader needs shuffling.
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=256)
dev_loader = torch.utils.data.DataLoader(dev_dataset, shuffle=False, batch_size=256)
# BUG FIX: the test loader must keep the original row order so that row i
# of the submission matches id i; the original used shuffle=True.
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=256)
print(X_train.shape, len(X_train[0]), Y_train.shape)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

input_size = 510       # number of feature columns in X_train
output_size = 2        # binary classification -> 2 logits
learning_rate = 0.0001
epoches = 15
PATH = ".\\hw2_classification_save\\classification_best_parameter.pkl"

# BUG FIX: the original network ended with a ReLU after the output layer.
# ReLU clamps negative logits to zero, which cripples CrossEntropyLoss
# (it expects raw, unbounded logits).  The final activation is removed;
# the layer shapes -- and hence any saved state_dict -- are unchanged.
model = torch.nn.Sequential(
    torch.nn.Linear(input_size, 512),
    torch.nn.ReLU(),
    torch.nn.Linear(512, 72),
    torch.nn.Dropout(0.5),
    torch.nn.ReLU(),
    torch.nn.Linear(72, output_size),
)
model.to(device)
# model=torch.nn.Linear(input_size,output_size)

# CrossEntropyLoss applies log-softmax internally, so the model outputs logits.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
开始训练:
# Per-epoch metric histories for the plots below.
training_loss_list = []
dev_loss_list = []
acc_list = []
dev_acc_list = []
max_acc = 0

if os.path.exists(PATH):
    # A checkpoint already exists: load it instead of retraining.
    m = model.load_state_dict(torch.load(PATH))
    print(m)
else:
    print("Training Start")
    for epoch in range(epoches):
        running_loss = 0
        correct_total = 0
        labels_total = 0
        # BUG FIX: explicitly switch to train mode each epoch so Dropout is
        # active during training even after the eval pass below.
        model.train()
        for i, data in enumerate(train_loader):
            inputs, labels = data
            # BUG FIX: the original wrapped `inputs` in torch.tensor(...),
            # which copies the tensor and raises a UserWarning; .float()
            # is the supported cast.  It also called .to(device) twice.
            inputs = inputs.float().to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels.long())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            _, predict = torch.max(outputs, 1)
            correct_total += (predict == labels).sum().item()
            labels_total += len(labels)
        acc = correct_total / labels_total
        acc_list.append(acc)
        training_loss_list.append(running_loss / labels_total)
        if epoch % 1 == 0:
            print("epoch", epoch, "loss={:.5}".format(running_loss / labels_total), "acc={:.5}".format(acc))

        # Evaluate on the dev split after every epoch (train while validating).
        dev_loss = 0
        dev_correct_total = 0
        dev_labels_total = 0
        # BUG FIX: switch to eval mode so Dropout is disabled during
        # validation; the original kept it active, biasing dev metrics.
        model.eval()
        with torch.no_grad():
            for data in dev_loader:
                dev_inputs, dev_labels = data
                dev_inputs = dev_inputs.to(device)
                dev_labels = dev_labels.to(device)
                dev_outputs = model(dev_inputs.float())
                loss = criterion(dev_outputs, dev_labels.long())
                dev_loss += loss.item()
                _, dev_predict = torch.max(dev_outputs, 1)
                dev_correct_total += (dev_predict == dev_labels).sum().item()
                dev_labels_total += len(dev_labels)
        dev_acc = dev_correct_total / dev_labels_total
        dev_loss_list.append(dev_loss / dev_labels_total)
        dev_acc_list.append(dev_acc)
        print("[dev_loss]={:.5}".format(dev_loss / dev_labels_total), "[dev_acc]={:.5}".format(dev_acc))

        # Keep (save) the parameters that achieve the best dev accuracy so far.
        if dev_acc > max_acc:
            max_acc = dev_acc
            torch.save(model.state_dict(), PATH)
            print("model saved,max_acc=", max_acc)

    # Loss curves (training vs dev) over epochs.
    plt.plot(np.arange(epoches), training_loss_list)
    plt.plot(np.arange(epoches), dev_loss_list)
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.show()
    # Accuracy curves (training vs dev) over epochs.
    plt.plot(np.arange(epoches), acc_list)
    plt.plot(np.arange(epoches), dev_acc_list)
    plt.xlabel("Epoch")
    plt.ylabel("Acc")
    plt.show()
    print("Finshed Training")
#精确度的图像
输入测试集并生成要提交的csv文件:
# Generate the submission file from the test set.
X_test = X_test.to(device)
# BUG FIX: run inference in eval mode (Dropout off) and without gradient
# tracking; the original predicted with Dropout still active, making the
# submitted labels partly random.
model.eval()
with torch.no_grad():
    test_predict = model(X_test.float())
# Predicted class = index of the larger logit.
_, test_predict = torch.max(test_predict, 1)
print("len=", len(test_predict), test_predict)

with open('classification_submit.csv', mode='w', newline='') as submit_file:
    csv_writer = csv.writer(submit_file)
    header = ['id', 'label']
    csv_writer.writerow(header)
    for i in range(len(test_predict)):
        row = [str(i), test_predict[i].item()]
        csv_writer.writerow(row)
        # print(row)
自此,该作业全部完成。
在kaggle上与别人比较了最后的得分,并多次修改了网络模型结构,然而该模型的表现仍旧不理想,在参考了别人的作业后发现,在完成了训练集训练和验证集的验证后,还可以把验证集作为训练集,通过增加数据集的形式优化模型参数。
代码如下:
import os
import csv
import torch
from torch import nn
import numpy as np
# Same data files as the first run.
X_train_fpath = './lhy_DL_Hw/hw2_data/X_train'
Y_train_fpath = './lhy_DL_Hw/hw2_data/Y_train'
X_test_fpath = './lhy_DL_Hw/hw2_data/X_test'
output_fpath = './lhy_DL_Hw/output_{}.csv'

# Skip the header row; drop the leading id column; parse floats.
with open(X_train_fpath) as f:
    next(f)
    X_train = np.array([row.strip('\n').split(',')[1:] for row in f], dtype=float)

with open(Y_train_fpath) as f:
    next(f)
    Y_train = np.array([row.strip('\n').split(',')[1] for row in f], dtype=float)

with open(X_test_fpath) as f:
    next(f)
    X_test = np.array([row.strip('\n').split(',')[1:] for row in f], dtype=float)
def _normalize(X, train=True, specified_column=None, X_mean=None, X_std=None):
if specified_column == None:
specified_column = np.arange(X.shape[1])
if train:
X_mean = np.mean(X[:, specified_column], 0).reshape(1, -1)
X_std = np.std(X[:, specified_column], 0).reshape(1, -1)
X[:, specified_column] = (X[:, specified_column] - X_mean) / (X_std + 1e-8)
return X, X_mean, X_std
# Normalize with training-set statistics.  BUG FIX: pass train=False for the
# test set -- the original omitted it, so _normalize recomputed mean/std from
# X_test and silently ignored the supplied X_mean / X_std.
X_train, X_mean, X_std = _normalize(X_train)
X_test, _, _ = _normalize(X_test, train=False, X_mean=X_mean, X_std=X_std)

# No train/dev split here: the full training data is used, relying on the
# hyper-parameters already tuned in the previous run.
X_train = torch.from_numpy(X_train)
Y_train = torch.from_numpy(Y_train)
print(X_train.shape)
X_test = torch.from_numpy(X_test)

train_dataset = torch.utils.data.TensorDataset(X_train, Y_train)
test_dataset = torch.utils.data.TensorDataset(X_test)
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=512)
# BUG FIX: never shuffle the test loader -- submission row i must match id i.
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=512)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

input_size = 510       # number of feature columns in X_train
output_size = 2        # binary classification -> 2 logits
learning_rate = 0.0001
epoches = 15
PATH = ".\\hw2_classification_save\\classification_parameter_update.pkl"

# BUG FIX: the original network ended with a ReLU after the output layer.
# ReLU clamps negative logits to zero, which cripples CrossEntropyLoss
# (it expects raw, unbounded logits).  The final activation is removed;
# the layer shapes -- and hence any saved state_dict -- are unchanged.
model = torch.nn.Sequential(
    torch.nn.Linear(input_size, 512),
    torch.nn.ReLU(),
    torch.nn.Linear(512, 72),
    torch.nn.Dropout(0.5),
    torch.nn.ReLU(),
    torch.nn.Linear(72, output_size),
)
model.to(device)

# CrossEntropyLoss applies log-softmax internally, so the model outputs logits.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Per-epoch metric histories.
training_loss_list = []
acc_list = []

if os.path.exists(PATH):
    # A checkpoint already exists: load it instead of retraining.
    model.load_state_dict(torch.load(PATH))
else:
    for epoch in range(epoches):
        running_loss = 0
        correct_total = 0
        labels_total = 0
        model.train()  # Dropout active during training
        for i, data in enumerate(train_loader):
            inputs, labels = data
            # BUG FIX: cast with .float() instead of torch.tensor(inputs, ...)
            # which copies the tensor and raises a UserWarning.  The original
            # also moved `inputs` to the device twice.
            inputs = inputs.float().to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels.long())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            _, predict = torch.max(outputs, 1)
            correct_total += (predict == labels).sum().item()
            labels_total += len(labels)
        acc = correct_total / labels_total
        acc_list.append(acc)
        training_loss_list.append(running_loss / labels_total)
        if epoch % 1 == 0:
            print("epoch", epoch, "loss={:.5}".format(running_loss / labels_total), "acc={:.5}".format(acc))
    # No dev set in this run, so simply save the final parameters.
    torch.save(model.state_dict(), PATH)
# Generate the submission file from the test set.
X_test = X_test.to(device)
# BUG FIX: run inference in eval mode (Dropout off) and without gradient
# tracking; the original predicted with Dropout still active.
model.eval()
with torch.no_grad():
    test_predict = model(X_test.float())
# Predicted class = index of the larger logit.
_, test_predict = torch.max(test_predict, 1)
print("len=", len(test_predict), test_predict)

with open('classification_submit_update.csv', mode='w', newline='') as submit_file:
    csv_writer = csv.writer(submit_file)
    header = ['id', 'label']
    csv_writer.writerow(header)
    for i in range(len(test_predict)):
        row = [str(i), test_predict[i].item()]
        csv_writer.writerow(row)
        # print(row)
二项分类模型实现!