P9 homework for "PyTorch Deep Learning Practice": Kaggle Otto multi-class classification

The task is not difficult: it only requires small changes to the handwritten-digit classifier from the course. The focus here is on practicing PyTorch rather than on the score, so the result is not particularly good; the loss on the test set is around 16 (more on this after the code).
The PyTorch implementation is below:

import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np  
from tqdm import tqdm

# Use the GPU if one is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Convert string class labels to integer ids so they can be fed to the cross-entropy loss
def label2id(labels):
    ids = []
    for label in labels:
        # "Class_3" -> 2 (zero-based class index)
        ids.append(int(label.split("_")[-1]) - 1)
    return ids
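# For example, the targets in train.csv are the strings "Class_1" ... "Class_9",
# so label2id(['Class_2', 'Class_9']) returns [1, 8].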

# prepare dataset
# Training dataset
class Train_OttoDataset(Dataset):
    def __init__(self, filepath):
        # Load the raw csv
        data = pd.read_csv(filepath)
        labels = data['target']
        labels = label2id(labels)  # convert string labels to integer ids
        self.len = data.shape[0]
        train_num = int(self.len * 1)
        # Use all rows as the training split (note: the dev set below reuses the last 20% of them)
        self.train_data = data[:train_num]
        self.train_label = labels[:train_num]
        # Columns 1:-1 are the 93 feature columns (drop 'id' and 'target')
        self.x_data = torch.from_numpy(np.array(self.train_data)[:,1:-1].astype(np.float32))
        self.y_data = torch.from_numpy(np.array(self.train_label).astype(np.float32))
        self.train_len = self.train_data.shape[0]
    
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return self.train_len

train_dataset = Train_OttoDataset('train.csv')
train_loader = DataLoader(dataset = train_dataset, batch_size = 32, shuffle = True, num_workers = 2)
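# Shape check (a quick sketch): each full batch from train_loader yields a
# (32, 93) float feature tensor and a (32,) label tensor, e.g.
#   x, y = next(iter(train_loader)); print(x.shape, y.shape)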

# Validation (dev) dataset
class Dev_OttoDataset(Dataset):
    def __init__(self, filepath):
        # Load the raw csv
        data = pd.read_csv(filepath)
        labels = data['target']
        labels = label2id(labels)  # convert string labels to integer ids
        self.len = data.shape[0]
        train_num = int(self.len * 0.8)
        # Take the last 20% of the rows as the dev split
        self.dev_data = data[train_num:]
        self.dev_label = labels[train_num:]
        self.x_data = torch.from_numpy(np.array(self.dev_data)[:,1:-1].astype(np.float32))
        self.y_data = torch.from_numpy(np.array(self.dev_label).astype(np.float32))
        self.dev_len = self.dev_data.shape[0]
    
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return self.dev_len

dev_dataset = Dev_OttoDataset('train.csv')
dev_loader = DataLoader(dataset = dev_dataset, batch_size = 8, shuffle = False, num_workers = 2)

# design model using class
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.linear1 = torch.nn.Linear(93, 64)
        self.bn = torch.nn.BatchNorm1d(num_features=64)
        self.linear2 = torch.nn.Linear(64, 32)
        self.dropout = torch.nn.Dropout(p=0.1)
        self.linear3 = torch.nn.Linear(32, 16)
        self.linear4 = torch.nn.Linear(16, 9)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        # first layer
        x = self.relu(self.bn(self.linear1(x)))
        # second layer
        x = self.relu(self.dropout(self.linear2(x)))
        x = self.relu(self.linear3(x))
        return self.linear4(x)  # no activation on the last layer; CrossEntropyLoss applies log-softmax internally

model = Net()
model.to(device)
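# Sanity check (a quick sketch): a random batch of 4 samples with 93 features
# should produce logits of shape (4, 9):
#   print(model(torch.randn(4, 93).to(device)).shape)  # torch.Size([4, 9])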

# construct loss and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01, momentum = 0.5)
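# Note: CrossEntropyLoss applies log-softmax internally and expects raw logits
# of shape (N, 9) together with int64 class indices of shape (N,), which is why
# labels.long() is used in train() below, e.g.
#   criterion(torch.randn(2, 9), torch.tensor([0, 8]))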

def train(epoch):
    model.train()  # enable dropout and batch-norm updates
    running_loss = 0.0
    for batch_idx, data in enumerate(tqdm(train_loader), 0):
        inputs, labels = data[0].to(device), data[1].to(device)  # move the batch to the GPU/CPU
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels.long())
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if batch_idx % 300 == 299:  # print the average loss of the last 300 batches
            print('\n [%d, %5d] loss: %.3f' % (epoch+1, batch_idx+1, running_loss/300))
            running_loss = 0.0

# Evaluate on the dev split
def dev():
    model.eval()  # use running batch-norm statistics and disable dropout
    with torch.no_grad():
        correct = 0.0
        total = 0.0
        dev_mean_loss = 0.0
        for batch_idx, data in enumerate(tqdm(dev_loader), 0):
            inputs, labels = data[0].to(device), data[1].to(device)
            outputs = model(inputs)
            dev_loss = criterion(outputs, labels.long())
            dev_mean_loss += dev_loss.item()
            total += labels.size(0)
            _, predicted = torch.max(outputs.data, dim = 1)
            correct += (predicted == labels).sum().item()
        print('dev loss:', dev_mean_loss/len(dev_loader), 'Accuracy on dev set:', correct/total)

# Predict on the test set and save the results as a submission file
def predict_save():
    test_data = pd.read_csv('test.csv')
    test_input = torch.from_numpy(np.array(test_data)[:,1:].astype(np.float32)).to(device)
    model.eval()  # use running batch-norm statistics and disable dropout
    with torch.no_grad():
        test_out = model(test_input)
        _, predicted = torch.max(test_out, dim = 1)  # dim=1: take the max over each row (per-sample class scores)
        # One-hot encode the predicted class ids; reindex so all 9 classes appear
        # even if some class is never predicted, and write 0/1 integers
        test_out = pd.get_dummies(predicted.cpu().numpy()).reindex(columns=list(range(9)), fill_value=0).astype(int)
        print(test_out)

        labels = ['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6', 'Class_7', 'Class_8', 'Class_9']
        # Set the column names
        test_out.columns = labels
        # Insert the id column from test.csv
        test_out.insert(0, 'id', test_data['id'])
        output = pd.DataFrame(test_out)
        output.to_csv('my_predict.csv', index = False)
    
if __name__ == '__main__':
    for epoch in range(100):
        train(epoch)
        dev()
    predict_save()
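
A note on the test-set loss: the Otto competition on Kaggle is scored with multi-class log loss, which penalizes confident wrong answers very heavily, and that is probably a big part of why submitting the hard 0/1 predictions above gives a loss as high as 16. Below is a minimal sketch that writes softmax probabilities instead, reusing the trained model, device, pd and np from the script above (the function name predict_save_proba and the output file name are placeholders of my own):

import torch.nn.functional as F

def predict_save_proba():
    # Sketch: save per-class probabilities instead of one-hot predictions
    test_data = pd.read_csv('test.csv')
    test_input = torch.from_numpy(np.array(test_data)[:, 1:].astype(np.float32)).to(device)
    model.eval()  # use running batch-norm statistics and disable dropout
    with torch.no_grad():
        probs = F.softmax(model(test_input), dim=1).cpu().numpy()
    labels = ['Class_%d' % (i + 1) for i in range(9)]
    output = pd.DataFrame(probs, columns=labels)
    output.insert(0, 'id', test_data['id'])
    output.to_csv('my_predict_proba.csv', index=False)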
