Problem page: https://www.kaggle.com/c/titanic
It is worth reading the problem statement first; the data files are also available at that link.
Because the test.csv that Kaggle provides has no labels, we have to carve a validation set out of train.csv ourselves in order to judge how good the predictions are. Here train.csv is split into a training set (80%) and a validation set (20%); you can set the ratio however you like.
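The classes below simply take the first 80% of the rows as the training set and the last 20% as the validation set. If you would rather shuffle before splitting, torch.utils.data.random_split does the same job; the sketch below is only an illustration and uses random stand-in tensors (891 rows, 6 features) instead of the real Titanic data.

import torch
from torch.utils.data import TensorDataset, random_split

# stand-in tensors so the sketch is self-contained; replace with the real features/labels
full_dataset = TensorDataset(torch.randn(891, 6), torch.randint(0, 2, (891,)).float())
n_train = int(len(full_dataset) * 0.8)
train_set, dev_set = random_split(
    full_dataset,
    [n_train, len(full_dataset) - n_train],
    generator=torch.Generator().manual_seed(42),  # fixed seed -> reproducible split
)
print(len(train_set), len(dev_set))  # 712 179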
class Train_Titanic(Dataset):
    def __init__(self, filepath):
        data = pd.read_csv(filepath, encoding='big5')
        features = ["Pclass", "Sex", "SibSp", "Parch", "Fare"]
        self.data_len = data.shape[0]                       # total number of rows in train.csv
        self.train_data = data[:int(self.data_len * 0.8)]   # training set: the first 80% of rows
        # one-hot encode the categorical feature and convert everything to float32 tensors
        self.x_data = torch.from_numpy(np.array(pd.get_dummies(self.train_data[features])).astype(np.float32))
        self.y_data = torch.from_numpy(np.array(self.train_data["Survived"]).astype(np.float32))
        self.train_len = self.train_data.shape[0]

    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return self.train_len
class Dev_Titanic(Dataset):
    def __init__(self, filepath):
        data = pd.read_csv(filepath, encoding='big5')
        features = ["Pclass", "Sex", "SibSp", "Parch", "Fare"]
        self.data_len = data.shape[0]                       # shape[0] is the total number of rows in train.csv
        self.dev_data = data[int(self.data_len * 0.8):]     # validation set: the last 20% of rows
        self.x_data = torch.from_numpy(np.array(pd.get_dummies(self.dev_data[features])).astype(np.float32))
        self.y_data = torch.from_numpy(np.array(self.dev_data["Survived"]).astype(np.float32))
        self.dev_len = self.dev_data.shape[0]

    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return self.dev_len
train_dataset = Train_Titanic("p8_作业_数据/train.csv")
dev_dataset = Dev_Titanic("p8_作业_数据/train.csv")
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=32,
                          shuffle=True,    # shuffle the training set so each epoch sees the data in a random order
                          num_workers=8)   # number of worker processes for data loading; choose according to your CPU
dev_loader = DataLoader(dataset=dev_dataset,
                        batch_size=32,
                        shuffle=False,     # keep the validation set in order so results are easy to inspect
                        num_workers=8)
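A quick sanity check (not part of the original code) that the split covers every row of train.csv and that the loaders yield the expected shapes; it assumes the datasets and loaders defined above (on Windows, run it under the if __name__ == '__main__': guard because num_workers > 0):

print(len(train_dataset), len(dev_dataset))   # 712 179 -> together the 891 rows of train.csv
xb, yb = next(iter(train_loader))
print(xb.shape, yb.shape)                     # torch.Size([32, 6]) torch.Size([32])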
class Model(torch.nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        # 5 features are selected, but one-hot encoding splits Sex into two columns, so the input is 6-dimensional
        self.linear1 = torch.nn.Linear(6, 6)
        self.linear2 = torch.nn.Linear(6, 6)
        self.linear3 = torch.nn.Linear(6, 3)
        self.linear4 = torch.nn.Linear(3, 2)
        self.linear5 = torch.nn.Linear(2, 1)

    def forward(self, x):
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = F.relu(self.linear3(x))
        x = F.relu(self.linear4(x))
        x = torch.sigmoid(self.linear5(x))  # sigmoid on the last layer keeps the output in (0, 1)
        return x
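To see where the 6 input dimensions come from (this check is not in the original post): pd.get_dummies only expands the object-typed Sex column into Sex_female and Sex_male and leaves the four numeric columns untouched, so the 5 selected features become 6 columns. A minimal sketch on a hand-made two-row frame:

import pandas as pd

sample = pd.DataFrame({
    "Pclass": [3, 1],
    "Sex": ["male", "female"],
    "SibSp": [1, 1],
    "Parch": [0, 0],
    "Fare": [7.25, 71.28],
})
print(pd.get_dummies(sample).columns.tolist())
# ['Pclass', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Sex_male']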
model = Model()
criterion = torch.nn.BCELoss(reduction='mean')             # binary cross-entropy loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# the code below is the body of the epoch loop (see the full code at the end)
train_mean_loss = 0
dev_mean_loss = 0
# training
for i, data in enumerate(train_loader, 0):
    input, label = data
    y_pred = model(input)
    y_pred = y_pred.squeeze(-1)          # (N, 1) -> (N,)
    train_loss = criterion(y_pred, label)
    train_mean_loss += train_loss.item()
    optimizer.zero_grad()                # remember to zero the gradients
    train_loss.backward()
    optimizer.step()
# validation
with torch.no_grad():
    correct = 0
    total = 0
    for data in dev_loader:
        inputs, labels = data
        outputs = model(inputs).squeeze(-1)
        dev_loss = criterion(outputs, labels)
        dev_mean_loss += dev_loss.item()
        total += labels.size(0)
        correct += (outputs.round() == labels).sum().item()
    print('Accuracy on dev set: %d %%' % (100 * correct / total))
print(epoch, train_mean_loss / len(train_loader))
print(epoch, dev_mean_loss / len(dev_loader))
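A common variant, different from what the code above does: drop the sigmoid from the last layer and train with torch.nn.BCEWithLogitsLoss, which applies the sigmoid inside the loss and is more numerically stable. The sketch below uses hand-made logits just to show the mechanics; in the model above it would mean forward() returns self.linear5(x) directly.

import torch

criterion = torch.nn.BCEWithLogitsLoss()   # sigmoid is folded into the loss
logits = torch.tensor([0.3, -1.2, 2.0])    # stand-in for raw model outputs (no sigmoid)
labels = torch.tensor([1.0, 0.0, 1.0])
loss = criterion(logits, labels)
probs = torch.sigmoid(logits)              # probabilities are only needed for accuracy / the submission
print(loss.item(), probs.round())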
# run the model on test.csv and save the predictions as a csv file
test_data = pd.read_csv("p8_作业_数据/test.csv")
features = ["Pclass", "Sex", "SibSp", "Parch", "Fare"]
test = torch.from_numpy(np.array(pd.get_dummies(test_data[features])).astype(np.float32))
with torch.no_grad():
    y_pred = model(test)
    y = []
    for i in y_pred:
        y.append(int(i.round().item()))  # round at 0.5: y >= 0.5 counts as survived, otherwise not
# save the predictions as a csv file
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': y})
output.to_csv('my_predict.csv', index=False)
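One caveat about test.csv that the code above ignores: its Fare column contains missing values (you can confirm with isna().sum()), and a NaN feature makes the corresponding prediction NaN. A hedged sketch that fills the gap with the column median before building the tensor:

import numpy as np
import pandas as pd
import torch

test_data = pd.read_csv("p8_作业_数据/test.csv")
features = ["Pclass", "Sex", "SibSp", "Parch", "Fare"]
print(test_data[features].isna().sum())                          # check which feature columns have gaps

filled = test_data[features].copy()
filled["Fare"] = filled["Fare"].fillna(filled["Fare"].median())  # simple median imputation
test = torch.from_numpy(np.array(pd.get_dummies(filled)).astype(np.float32))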
Putting everything together, the full code:

import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import torch.nn.functional as F
class Train_Titanic(Dataset):
    def __init__(self, filepath):
        data = pd.read_csv(filepath, encoding='big5')
        features = ["Pclass", "Sex", "SibSp", "Parch", "Fare"]
        self.data_len = data.shape[0]
        self.train_data = data[:int(self.data_len * 0.8)]   # training set: the first 80% of rows
        self.x_data = torch.from_numpy(np.array(pd.get_dummies(self.train_data[features])).astype(np.float32))
        self.y_data = torch.from_numpy(np.array(self.train_data["Survived"]).astype(np.float32))
        self.train_len = self.train_data.shape[0]

    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return self.train_len
class Dev_Titanic(Dataset):
    def __init__(self, filepath):
        data = pd.read_csv(filepath, encoding='big5')
        features = ["Pclass", "Sex", "SibSp", "Parch", "Fare"]
        self.data_len = data.shape[0]
        self.dev_data = data[int(self.data_len * 0.8):]      # validation set: the last 20% of rows
        self.x_data = torch.from_numpy(np.array(pd.get_dummies(self.dev_data[features])).astype(np.float32))
        self.y_data = torch.from_numpy(np.array(self.dev_data["Survived"]).astype(np.float32))
        self.dev_len = self.dev_data.shape[0]

    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return self.dev_len
train_dataset = Train_Titanic("p8_作业_数据/train.csv")
dev_dataset = Dev_Titanic("p8_作业_数据/train.csv")
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=32,
                          shuffle=True,
                          num_workers=8)
dev_loader = DataLoader(dataset=dev_dataset,
                        batch_size=32,
                        shuffle=False,
                        num_workers=8)
class Model(torch.nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        # 5 features are selected, but one-hot encoding splits Sex into two columns, so the input is 6-dimensional
        self.linear1 = torch.nn.Linear(6, 6)
        self.linear2 = torch.nn.Linear(6, 6)
        self.linear3 = torch.nn.Linear(6, 3)
        self.linear4 = torch.nn.Linear(3, 2)
        self.linear5 = torch.nn.Linear(2, 1)

    def forward(self, x):
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = F.relu(self.linear3(x))
        x = F.relu(self.linear4(x))
        x = torch.sigmoid(self.linear5(x))
        return x
if __name__ == '__main__':  # guard needed on Windows because of the DataLoader worker processes
    model = Model()
    criterion = torch.nn.BCELoss(reduction='mean')             # binary cross-entropy loss
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    for epoch in range(100):  # accuracy is basically stable by around epoch 20; 100 epochs may score a bit higher on Kaggle but takes much longer
        # training
        train_mean_loss = 0
        dev_mean_loss = 0
        for i, data in enumerate(train_loader, 0):
            input, label = data
            y_pred = model(input)
            y_pred = y_pred.squeeze(-1)          # (N, 1) -> (N,)
            train_loss = criterion(y_pred, label)
            train_mean_loss += train_loss.item()
            optimizer.zero_grad()                # remember to zero the gradients
            train_loss.backward()
            optimizer.step()
        # validation
        with torch.no_grad():
            correct = 0
            total = 0
            for data in dev_loader:
                inputs, labels = data
                outputs = model(inputs).squeeze(-1)
                dev_loss = criterion(outputs, labels)
                dev_mean_loss += dev_loss.item()
                total += labels.size(0)
                correct += (outputs.round() == labels).sum().item()
            print('Accuracy on dev set: %d %%' % (100 * correct / total))
        print(epoch, train_mean_loss / len(train_loader))
        print(epoch, dev_mean_loss / len(dev_loader))
    # run the model on test.csv and save the predictions as a csv file
    test_data = pd.read_csv("p8_作业_数据/test.csv")
    features = ["Pclass", "Sex", "SibSp", "Parch", "Fare"]
    test = torch.from_numpy(np.array(pd.get_dummies(test_data[features])).astype(np.float32))
    with torch.no_grad():
        y_pred = model(test)
        y = []
        for i in y_pred:
            y.append(int(i.round().item()))  # round at 0.5: y >= 0.5 counts as survived, otherwise not
    # save the predictions as a csv file
    output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': y})
    output.to_csv('my_predict.csv', index=False)
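Finally, a small optional check (not in the original post) that my_predict.csv looks like a valid submission: one row per test passenger, the two columns PassengerId and Survived, and only 0/1 values:

import pandas as pd

sub = pd.read_csv('my_predict.csv')
print(sub.shape)                        # expected (418, 2) -- one prediction per test passenger
print(sub.columns.tolist())             # ['PassengerId', 'Survived']
print(sub['Survived'].isin([0, 1]).all())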