pytorch深度学习实践_p8_pytorch实现泰坦尼克号生存预测

泰坦尼克号生存预测

题目原址:https://www.kaggle.com/c/titanic
建议先阅读一下原题说明,数据文件也可以在这个网址下载。

数据准备

因为 Kaggle 提供的 test.csv 文件不包含真实标签(Survived 列),无法直接用来评估模型,所以只能从 train.csv 中划分出一部分数据,用来判断预测结果的好坏。这里我将 train.csv 按原始行顺序划分为训练集(前 80%)和验证集(后 20%),比例可以自行设定。

定义训练数据集

class Train_Titanic(Dataset):
    """Training split of a Titanic-style CSV file.

    The first ``train_ratio`` fraction of the rows (in file order) is kept.
    Features are ``Pclass``, ``Sex``, ``SibSp``, ``Parch`` and ``Fare``, with
    ``Sex`` one-hot encoded by ``pd.get_dummies``; the label is ``Survived``.

    Args:
        filepath: Path of the CSV file to read.
        train_ratio: Fraction of rows, counted from the top of the file,
            assigned to the training split. Defaults to 0.8.
    """

    def __init__(self, filepath, train_ratio=0.8):
        data = pd.read_csv(filepath, encoding='big5')
        features = ["Pclass", "Sex", "SibSp", "Parch", "Fare"]
        self.data_len = data.shape[0]  # total number of rows in the file
        split = int(self.data_len * train_ratio)
        # One-hot encode the whole file before slicing: encoding only the
        # slice would drop a dummy column whenever one category is absent
        # from that slice, silently changing the feature width.
        encoded = pd.get_dummies(data[features])
        self.train_data = data[:split]
        self.x_data = torch.from_numpy(np.array(encoded[:split]).astype(np.float32))
        self.y_data = torch.from_numpy(np.array(self.train_data["Survived"]).astype(np.float32))
        self.train_len = self.train_data.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of rows in the training split."""
        return self.train_len

定义验证数据集

class Dev_Titanic(Dataset):
    """Validation (dev) split of a Titanic-style CSV file.

    Every row after the first ``train_ratio`` fraction of the file (in file
    order) is kept. Features are ``Pclass``, ``Sex``, ``SibSp``, ``Parch``
    and ``Fare``, with ``Sex`` one-hot encoded by ``pd.get_dummies``; the
    label is ``Survived``.

    Args:
        filepath: Path of the CSV file to read.
        train_ratio: Fraction of rows, counted from the top of the file,
            that is skipped because it belongs to the training split.
            Defaults to 0.8, leaving the last 20% for validation.
    """

    def __init__(self, filepath, train_ratio=0.8):
        data = pd.read_csv(filepath, encoding='big5')
        features = ["Pclass", "Sex", "SibSp", "Parch", "Fare"]
        self.data_len = data.shape[0]  # total number of rows in the file
        split = int(self.data_len * train_ratio)
        # One-hot encode the whole file before slicing: encoding only the
        # slice would drop a dummy column whenever one category is absent
        # from that slice, silently changing the feature width.
        encoded = pd.get_dummies(data[features])
        self.dev_data = data[split:]
        self.x_data = torch.from_numpy(np.array(encoded[split:]).astype(np.float32))
        self.y_data = torch.from_numpy(np.array(self.dev_data["Survived"]).astype(np.float32))
        self.dev_len = self.dev_data.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of rows in the validation split."""
        return self.dev_len

导入train.csv文件

# Both splits are built from the same train.csv file; each dataset class
# keeps only its own portion of the rows.
train_dataset = Train_Titanic("p8_作业_数据/train.csv")
dev_dataset = Dev_Titanic("p8_作业_数据/train.csv")

# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=8,  # number of worker processes used for loading; tune to your CPU
)
# Validation batches keep the file order so results are easy to compare.
dev_loader = DataLoader(
    dev_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=8,
)

定义神经网络

class Model(torch.nn.Module):
    """Small fully connected binary classifier ending in a sigmoid.

    Layer widths are ``in_features -> 6 -> 6 -> 3 -> 2 -> 1`` with a ReLU
    after every layer except the last, which is followed by a sigmoid.

    Args:
        in_features: Width of the input vectors. Defaults to 6: five columns
            are selected, and one-hot encoding splits ``Sex`` into two
            columns (female / male).
    """

    def __init__(self, in_features=6):
        super().__init__()
        self.linear1 = torch.nn.Linear(in_features, 6)
        self.linear2 = torch.nn.Linear(6, 6)
        self.linear3 = torch.nn.Linear(6, 3)
        self.linear4 = torch.nn.Linear(3, 2)
        self.linear5 = torch.nn.Linear(2, 1)

    def forward(self, x):
        """Return one sigmoid output per input row (last dimension has size 1)."""
        x = torch.relu(self.linear1(x))
        x = torch.relu(self.linear2(x))
        x = torch.relu(self.linear3(x))
        x = torch.relu(self.linear4(x))
        # The final sigmoid keeps the output inside (0, 1), as BCELoss requires.
        return torch.sigmoid(self.linear5(x))

构建loss和optimizer

    # Binary cross-entropy averaged over the batch. reduction='mean' is the
    # current spelling of the deprecated size_average=True argument.
    criterion = torch.nn.BCELoss(reduction='mean')
    # Adam over all model parameters with learning rate 0.01.
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

训练

 # One pass over train_loader; both running loss sums are reset first.
 train_mean_loss = 0
        dev_mean_loss = 0
        for i, data in enumerate(train_loader, 0):
            input, label = data

            y_pred = model(input)
            y_pred = y_pred.squeeze(-1)     # drop the trailing dimension of the model output
            train_loss = criterion(y_pred, label)
            train_mean_loss += train_loss.item()
            optimizer.zero_grad()	# remember to clear the gradients before backpropagating
            train_loss.backward()
            optimizer.step()

验证

# Evaluate on the dev split; no gradients are needed here.
with torch.no_grad():
    correct = 0
    total = 0
    for data in dev_loader:
        inputs, labels = data
        outputs = model(inputs).squeeze(-1)
        dev_loss = criterion(outputs, labels)
        dev_mean_loss += dev_loss.item()
        total += labels.size(0)
        # Threshold with a tensor operation rather than calling np.round on
        # a torch tensor: an output >= 0.5 counts as "survived".
        predicted = (outputs >= 0.5).float()
        correct += (predicted == labels).sum().item()
# Per-epoch report: dev accuracy, then mean train loss and mean dev loss per batch.
print('Accuracy on test set: %d %%' % (100 * correct / total))
print(epoch,train_mean_loss/len(train_loader))
print(epoch,dev_mean_loss/len(dev_loader))

测试

# Predict on test.csv and save the result as a CSV file.
test_data = pd.read_csv("p8_作业_数据/test.csv")
features = ["Pclass", "Sex", "SibSp", "Parch", "Fare"]
test_features = test_data[features].copy()
# A missing numeric cell would turn into a NaN prediction, so fill any gaps
# with the column median before encoding.
numeric_cols = ["Pclass", "SibSp", "Parch", "Fare"]
test_features[numeric_cols] = test_features[numeric_cols].fillna(test_features[numeric_cols].median())
test = torch.from_numpy(np.array(pd.get_dummies(test_features)).astype(np.float32))

with torch.no_grad():
    y_pred = model(test).squeeze(-1)
    # An output >= 0.5 counts as survived (1), anything else as died (0).
    # Converting to a flat integer array gives the CSV plain 0/1 values.
    y = (y_pred >= 0.5).int().numpy()
# Save the predictions as a CSV file.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': y})
output.to_csv('my_predict.csv', index=False)

完整代码

import torch
from torch.utils.data import Dataset,DataLoader
import numpy as np
import pandas as pd
import torch.nn.functional as F

class Train_Titanic(Dataset):
    """Training split of a Titanic-style CSV file.

    The first ``train_ratio`` fraction of the rows (in file order) is kept.
    Features are ``Pclass``, ``Sex``, ``SibSp``, ``Parch`` and ``Fare``, with
    ``Sex`` one-hot encoded by ``pd.get_dummies``; the label is ``Survived``.

    Args:
        filepath: Path of the CSV file to read.
        train_ratio: Fraction of rows, counted from the top of the file,
            assigned to the training split. Defaults to 0.8.
    """

    def __init__(self, filepath, train_ratio=0.8):
        data = pd.read_csv(filepath, encoding='big5')
        features = ["Pclass", "Sex", "SibSp", "Parch", "Fare"]
        self.data_len = data.shape[0]  # total number of rows in the file
        split = int(self.data_len * train_ratio)
        # One-hot encode the whole file before slicing: encoding only the
        # slice would drop a dummy column whenever one category is absent
        # from that slice, silently changing the feature width.
        encoded = pd.get_dummies(data[features])
        self.train_data = data[:split]
        self.x_data = torch.from_numpy(np.array(encoded[:split]).astype(np.float32))
        self.y_data = torch.from_numpy(np.array(self.train_data["Survived"]).astype(np.float32))
        self.train_len = self.train_data.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of rows in the training split."""
        return self.train_len


class Dev_Titanic(Dataset):
    """Validation (dev) split of a Titanic-style CSV file.

    Every row after the first ``train_ratio`` fraction of the file (in file
    order) is kept. Features are ``Pclass``, ``Sex``, ``SibSp``, ``Parch``
    and ``Fare``, with ``Sex`` one-hot encoded by ``pd.get_dummies``; the
    label is ``Survived``.

    Args:
        filepath: Path of the CSV file to read.
        train_ratio: Fraction of rows, counted from the top of the file,
            that is skipped because it belongs to the training split.
            Defaults to 0.8, leaving the last 20% for validation.
    """

    def __init__(self, filepath, train_ratio=0.8):
        data = pd.read_csv(filepath, encoding='big5')
        features = ["Pclass", "Sex", "SibSp", "Parch", "Fare"]
        self.data_len = data.shape[0]  # total number of rows in the file
        split = int(self.data_len * train_ratio)
        # One-hot encode the whole file before slicing: encoding only the
        # slice would drop a dummy column whenever one category is absent
        # from that slice, silently changing the feature width.
        encoded = pd.get_dummies(data[features])
        self.dev_data = data[split:]
        self.x_data = torch.from_numpy(np.array(encoded[split:]).astype(np.float32))
        self.y_data = torch.from_numpy(np.array(self.dev_data["Survived"]).astype(np.float32))
        self.dev_len = self.dev_data.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of rows in the validation split."""
        return self.dev_len


# Both splits are built from the same train.csv file; each dataset class
# keeps only its own portion of the rows.
train_dataset = Train_Titanic("p8_作业_数据/train.csv")
dev_dataset = Dev_Titanic("p8_作业_数据/train.csv")

# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=8,  # number of worker processes used for loading; tune to your CPU
)
# Validation batches keep the file order so results are easy to compare.
dev_loader = DataLoader(
    dev_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=8,
)

class Model(torch.nn.Module):
    """Small fully connected binary classifier ending in a sigmoid.

    Layer widths are ``in_features -> 6 -> 6 -> 3 -> 2 -> 1`` with a ReLU
    after every layer except the last, which is followed by a sigmoid.

    Args:
        in_features: Width of the input vectors. Defaults to 6: five columns
            are selected, and one-hot encoding splits ``Sex`` into two
            columns (female / male).
    """

    def __init__(self, in_features=6):
        super().__init__()
        self.linear1 = torch.nn.Linear(in_features, 6)
        self.linear2 = torch.nn.Linear(6, 6)
        self.linear3 = torch.nn.Linear(6, 3)
        self.linear4 = torch.nn.Linear(3, 2)
        self.linear5 = torch.nn.Linear(2, 1)

    def forward(self, x):
        """Return one sigmoid output per input row (last dimension has size 1)."""
        x = torch.relu(self.linear1(x))
        x = torch.relu(self.linear2(x))
        x = torch.relu(self.linear3(x))
        x = torch.relu(self.linear4(x))
        # The final sigmoid keeps the output inside (0, 1), as BCELoss requires.
        return torch.sigmoid(self.linear5(x))


if __name__ == '__main__':  # guard so multi-worker data loading does not raise errors on Windows
    model = Model()
    # Binary cross-entropy averaged over the batch. reduction='mean' is the
    # current spelling of the deprecated size_average=True argument.
    criterion = torch.nn.BCELoss(reduction='mean')
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    # Author's note: accuracy is mostly stable after about 20 epochs; 100
    # epochs may score slightly higher on Kaggle but takes much longer.
    for epoch in range(100):
        # Training: one pass over the training split.
        train_mean_loss = 0
        dev_mean_loss = 0
        for features_batch, label in train_loader:
            # Drop the trailing dimension of the model output before the loss.
            y_pred = model(features_batch).squeeze(-1)
            train_loss = criterion(y_pred, label)
            train_mean_loss += train_loss.item()
            optimizer.zero_grad()  # clear gradients before backpropagating
            train_loss.backward()
            optimizer.step()

        # Validation: no gradients are needed here.
        with torch.no_grad():
            correct = 0
            total = 0
            for inputs, labels in dev_loader:
                outputs = model(inputs).squeeze(-1)
                dev_loss = criterion(outputs, labels)
                dev_mean_loss += dev_loss.item()
                total += labels.size(0)
                # Threshold with a tensor operation rather than calling
                # np.round on a torch tensor: >= 0.5 counts as "survived".
                predicted = (outputs >= 0.5).float()
                correct += (predicted == labels).sum().item()
        # Per-epoch report: dev accuracy, then mean train loss and mean dev
        # loss per batch.
        print('Accuracy on test set: %d %%' % (100 * correct / total))
        print(epoch,train_mean_loss/len(train_loader))
        print(epoch,dev_mean_loss/len(dev_loader))

    # Predict on test.csv and save the result as a CSV file.
    test_data = pd.read_csv("p8_作业_数据/test.csv")
    features = ["Pclass", "Sex", "SibSp", "Parch", "Fare"]
    test_features = test_data[features].copy()
    # A missing numeric cell would turn into a NaN prediction, so fill any
    # gaps with the column median before encoding.
    numeric_cols = ["Pclass", "SibSp", "Parch", "Fare"]
    test_features[numeric_cols] = test_features[numeric_cols].fillna(test_features[numeric_cols].median())
    test = torch.from_numpy(np.array(pd.get_dummies(test_features)).astype(np.float32))

    with torch.no_grad():
        y_pred = model(test).squeeze(-1)
        # An output >= 0.5 counts as survived (1), anything else as died (0).
        # Converting to a flat integer array gives the CSV plain 0/1 values.
        y = (y_pred >= 0.5).int().numpy()
    # Save the predictions as a CSV file.
    output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': y})
    output.to_csv('my_predict.csv', index=False)


最终kaggle分数在0.78左右

你可能感兴趣的:(pytorch深度学习实践,pytorch,深度学习,神经网络)