2020-07-17 暑期学习日更计划 (李宏毅2020-hw2 pytorch实现)

ML2020spring - hw2

该作业kaggle地址:ML2020spring - hw2 Classification - Binary Income Prediction
一个二项分类问题:由一系列给定的特征数据,判断此人的年收入是否超过 50,000 美元。
数据处理部分沿用了 numpy 的数据处理方式,神经网络的实现部分使用了 pytorch。

数据预处理部分:

import os
import csv
import torch
from torch import nn
import numpy as np
import matplotlib.pyplot as plt

# Paths to the Kaggle hw2 data files and the output template.
X_train_fpath = './lhy_DL_Hw/hw2_data/X_train'
Y_train_fpath = './lhy_DL_Hw/hw2_data/Y_train'
X_test_fpath = './lhy_DL_Hw/hw2_data/X_test'
output_fpath = './lhy_DL_Hw/output_{}.csv'


def _load_features(path):
    """Read a CSV, skip the header row, drop the id column, return a float matrix."""
    with open(path) as fh:
        next(fh)  # skip header
        return np.array([row.strip('\n').split(',')[1:] for row in fh], dtype=float)


def _load_labels(path):
    """Read a CSV, skip the header row, return column 1 as a float vector."""
    with open(path) as fh:
        next(fh)  # skip header
        return np.array([row.strip('\n').split(',')[1] for row in fh], dtype=float)


X_train = _load_features(X_train_fpath)
Y_train = _load_labels(Y_train_fpath)
X_test = _load_features(X_test_fpath)

def _train_dev_split(X, Y, dev_ratio = 0.25):
    # This function spilts data into training set and development set.
    train_size = int(len(X) * (1 - dev_ratio))
    return X[:train_size], Y[:train_size], X[train_size:], Y[train_size:]


def _normalize(X, train=True, specified_column=None, X_mean=None, X_std=None):

    if specified_column == None:
        specified_column = np.arange(X.shape[1])
    if train:
        X_mean = np.mean(X[:, specified_column], 0).reshape(1, -1)
        X_std = np.std(X[:, specified_column], 0).reshape(1, -1)

    X[:, specified_column] = (X[:, specified_column] - X_mean) / (X_std + 1e-8)

    return X, X_mean, X_std

# Hold out the last 10% of the training rows as a development set, then
# compute normalization statistics on the training split only.
dev_ratio = 0.1
X_train, Y_train, X_dev, Y_dev = _train_dev_split(X_train, Y_train, dev_ratio=dev_ratio)
X_train, X_mean, X_std = _normalize(X_train)

# train=False is required here: without it _normalize recomputes the
# statistics from dev/test and silently ignores the training-set
# mean/std passed in, so the splits would not share the same scaling.
X_dev, _, _ = _normalize(X_dev, train=False, X_mean=X_mean, X_std=X_std)
X_test, _, _ = _normalize(X_test, train=False, X_mean=X_mean, X_std=X_std)

构造数据集,并搭建神经网络:

# Convert the NumPy splits to tensors (float64 here; cast to float32 at
# use time) and wrap them in TensorDatasets / DataLoaders.
X_train = torch.from_numpy(X_train)
Y_train = torch.from_numpy(Y_train)
X_dev = torch.from_numpy(X_dev)
Y_dev = torch.from_numpy(Y_dev)

X_test = torch.from_numpy(X_test)

train_dataset = torch.utils.data.TensorDataset(X_train, Y_train)
dev_dataset = torch.utils.data.TensorDataset(X_dev, Y_dev)
test_dataset = torch.utils.data.TensorDataset(X_test)

# Only the training loader should shuffle.  Shuffling the test loader
# would scramble the row order of the submission file; shuffling the
# dev loader is pointless for evaluation.
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=256)
dev_loader = torch.utils.data.DataLoader(dev_dataset, shuffle=False, batch_size=256)
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=256)

print(X_train.shape, len(X_train[0]), Y_train.shape)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
input_size = 510        # number of feature columns in X_train
output_size = 2         # binary classification -> two logits
learning_rate = 0.0001
epoches = 15
PATH = ".\\hw2_classification_save\\classification_best_parameter.pkl"

# No activation after the last Linear layer: nn.CrossEntropyLoss expects
# raw logits (it applies log-softmax internally), and a final ReLU would
# zero out every negative logit and cripple learning.  Removing the ReLU
# does not change the state_dict keys, so existing checkpoints still load.
model = torch.nn.Sequential(
    torch.nn.Linear(input_size, 512),
    torch.nn.ReLU(),
    torch.nn.Linear(512, 72),
    torch.nn.Dropout(0.5),
    torch.nn.ReLU(),
    torch.nn.Linear(72, output_size),
)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

开始训练:

training_loss_list = []
dev_loss_list = []
acc_list = []
dev_acc_list = []
max_acc = 0

if os.path.exists(PATH):
    # A trained checkpoint already exists -- just load it.
    m = model.load_state_dict(torch.load(PATH))
    print(m)
else:
    print("Training Start")
    for epoch in range(epoches):
        running_loss = 0
        correct_total = 0
        labels_total = 0
        model.train()  # re-enable Dropout after the eval() below
        for i, data in enumerate(train_loader):
            inputs, labels = data
            # Cast to float32 once.  The old torch.tensor(inputs, ...)
            # copied an existing tensor (and warns); .float() is enough.
            inputs = inputs.float().to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels.long())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

            _, predict = torch.max(outputs, 1)
            correct_total += (predict == labels).sum().item()
            labels_total += len(labels)

        acc = correct_total / labels_total
        acc_list.append(acc)
        training_loss_list.append(running_loss / labels_total)
        if epoch % 1 == 0:
            print("epoch", epoch, "loss={:.5}".format(running_loss / labels_total), "acc={:.5}".format(acc))
            dev_loss = 0
            dev_acc = 0
            dev_correct_total = 0
            dev_labels_total = 0
            # Validate while training.  eval() disables Dropout so the
            # dev metrics are deterministic (the original left the model
            # in train mode here, adding Dropout noise to validation).
            model.eval()
            with torch.no_grad():
                for data in dev_loader:
                    dev_inputs, dev_labels = data
                    dev_inputs = dev_inputs.float().to(device)
                    dev_labels = dev_labels.to(device)

                    dev_outputs = model(dev_inputs)
                    loss = criterion(dev_outputs, dev_labels.long())
                    dev_loss += loss.item()
                    _, dev_predict = torch.max(dev_outputs, 1)
                    dev_correct_total += (dev_predict == dev_labels).sum().item()
                    dev_labels_total += len(dev_labels)

                dev_acc = dev_correct_total / dev_labels_total
                dev_loss_list.append(dev_loss / dev_labels_total)
                dev_acc_list.append(dev_acc)
                print("[dev_loss]={:.5}".format(dev_loss / dev_labels_total), "[dev_acc]={:.5}".format(dev_acc))

            # Save the checkpoint with the best *development* accuracy.
            if dev_acc > max_acc:
                max_acc = dev_acc
                torch.save(model.state_dict(), PATH)
                print("model saved,max_acc=", max_acc)

    # Training / development loss curves.
    plt.plot(np.arange(epoches), training_loss_list)
    plt.plot(np.arange(epoches), dev_loss_list)
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.show()
    # Training / development accuracy curves.
    plt.plot(np.arange(epoches), acc_list)
    plt.plot(np.arange(epoches), dev_acc_list)
    plt.xlabel("Epoch")
    plt.ylabel("Acc")
    plt.show()
    print("Finished Training")

输入测试集并生成要提交的 csv 文件:

# Inference: eval() disables Dropout (otherwise predictions would be
# nondeterministic) and no_grad() skips autograd bookkeeping.
model.eval()
X_test = X_test.to(device)
with torch.no_grad():
    test_predict = model(X_test.float())
_, test_predict = torch.max(test_predict, 1)
print("len=", len(test_predict), test_predict)

# Write the Kaggle submission: one (id, predicted label) row per sample.
with open('classification_submit.csv', mode='w', newline='') as submit_file:
    csv_writer = csv.writer(submit_file)
    header = ['id', 'label']
    csv_writer.writerow(header)
    for i in range(len(test_predict)):
        row = [str(i), test_predict[i].item()]
        csv_writer.writerow(row)
        # print(row)


自此,该作业全部完成。
在 kaggle 上与别人比较了最后的得分,并多次修改了网络模型结构,然而该模型的表现仍旧不理想。在参考了别人的作业后发现:在完成训练集训练和验证集验证、确定好超参数之后,还可以把验证集并入训练集,通过扩充训练数据的方式进一步优化模型参数。

代码如下:

import os
import csv
import torch
from torch import nn
import numpy as np

# Paths to the Kaggle hw2 data files and the output template.
X_train_fpath = './lhy_DL_Hw/hw2_data/X_train'
Y_train_fpath = './lhy_DL_Hw/hw2_data/Y_train'
X_test_fpath = './lhy_DL_Hw/hw2_data/X_test'
output_fpath = './lhy_DL_Hw/output_{}.csv'


def _load_features(path):
    """Read a CSV, skip the header row, drop the id column, return a float matrix."""
    with open(path) as fh:
        next(fh)  # skip header
        return np.array([row.strip('\n').split(',')[1:] for row in fh], dtype=float)


def _load_labels(path):
    """Read a CSV, skip the header row, return column 1 as a float vector."""
    with open(path) as fh:
        next(fh)  # skip header
        return np.array([row.strip('\n').split(',')[1] for row in fh], dtype=float)


X_train = _load_features(X_train_fpath)
Y_train = _load_labels(Y_train_fpath)
X_test = _load_features(X_test_fpath)

def _normalize(X, train=True, specified_column=None, X_mean=None, X_std=None):

    if specified_column == None:
        specified_column = np.arange(X.shape[1])
    if train:
        X_mean = np.mean(X[:, specified_column], 0).reshape(1, -1)
        X_std = np.std(X[:, specified_column], 0).reshape(1, -1)

    X[:, specified_column] = (X[:, specified_column] - X_mean) / (X_std + 1e-8)

    return X, X_mean, X_std

# Normalize the full training set and reuse its statistics for the test set.
X_train, X_mean, X_std = _normalize(X_train)

# train=False is required here: without it _normalize recomputes the
# statistics from the test set itself and silently ignores the
# training-set mean/std passed in.
X_test, _, _ = _normalize(X_test, train=False, X_mean=X_mean, X_std=X_std)

# No train/dev split this time: the hyper-parameters were already tuned
# on the previous run, so the full training set (including the former
# dev portion) is used for training.
X_train = torch.from_numpy(X_train)
Y_train = torch.from_numpy(Y_train)

print(X_train.shape)

X_test = torch.from_numpy(X_test)

train_dataset = torch.utils.data.TensorDataset(X_train, Y_train)
test_dataset = torch.utils.data.TensorDataset(X_test)

train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=512)
# shuffle=False: shuffling the test loader would scramble the row order
# of the submission file.
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=512)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
input_size = 510        # number of feature columns in X_train
output_size = 2         # binary classification -> two logits
learning_rate = 0.0001
epoches = 15
PATH = ".\\hw2_classification_save\\classification_parameter_update.pkl"

# No activation after the last Linear layer: nn.CrossEntropyLoss expects
# raw logits (it applies log-softmax internally), and a final ReLU would
# zero out every negative logit and cripple learning.  Removing the ReLU
# does not change the state_dict keys, so existing checkpoints still load.
model = torch.nn.Sequential(
    torch.nn.Linear(input_size, 512),
    torch.nn.ReLU(),
    torch.nn.Linear(512, 72),
    torch.nn.Dropout(0.5),
    torch.nn.ReLU(),
    torch.nn.Linear(72, output_size),
)

model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

training_loss_list = []
acc_list = []

if os.path.exists(PATH):
    # A trained checkpoint already exists -- just load it.
    model.load_state_dict(torch.load(PATH))
else:
    model.train()  # make sure Dropout is active during training
    for epoch in range(epoches):
        running_loss = 0
        correct_total = 0
        labels_total = 0
        for i, data in enumerate(train_loader):
            inputs, labels = data
            # Cast to float32 once.  The old torch.tensor(inputs, ...)
            # copied an existing tensor (and warns); .float() is enough.
            inputs = inputs.float().to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels.long())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

            _, predict = torch.max(outputs, 1)
            correct_total += (predict == labels).sum().item()
            labels_total += len(labels)

        acc = correct_total / labels_total
        acc_list.append(acc)
        training_loss_list.append(running_loss / labels_total)
        if epoch % 1 == 0:
            print("epoch", epoch, "loss={:.5}".format(running_loss / labels_total), "acc={:.5}".format(acc))
    # Without a dev set there is no "best" epoch to pick, so the final
    # parameters are saved once after the last epoch.
    torch.save(model.state_dict(), PATH)


# Inference: eval() disables Dropout (otherwise predictions would be
# nondeterministic) and no_grad() skips autograd bookkeeping.
model.eval()
X_test = X_test.to(device)
with torch.no_grad():
    test_predict = model(X_test.float())
_, test_predict = torch.max(test_predict, 1)
print("len=", len(test_predict), test_predict)

# Write the Kaggle submission: one (id, predicted label) row per sample.
with open('classification_submit_update.csv', mode='w', newline='') as submit_file:
    csv_writer = csv.writer(submit_file)
    header = ['id', 'label']
    csv_writer.writerow(header)
    for i in range(len(test_predict)):
        row = [str(i), test_predict[i].item()]
        csv_writer.writerow(row)

二项分类模型实现!

你可能感兴趣的:(2020-07-17 暑期学习日更计划 (李宏毅2020-hw2 pytorch实现))