# Re-implementation in PyTorch, which has recently become popular.
import torch
import pandas as pd
import numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
# Configuration
# Columns selected from the raw CSV, in the order preprocess() requests them.
OLD_INDEX = [
    'Pclass', 'Sex', 'Age', 'UknAge', 'SibSp', 'Parch', 'Fare', 'Embarked',
    'Survived',
]
# Fixed column layout after one-hot encoding: three numeric columns, then one
# group of indicator columns per categorical field, with 'Survived' last.
NEW_INDEX = (
    ['Age', 'UknAge', 'Fare']
    + ['Pclass_{}'.format(level) for level in (0, 1, 2)]
    + ['Sex_{}'.format(level) for level in (0, 1)]
    + ['SibSp_{}'.format(level) for level in (0, 1, 2, 3, 4, 5, 8)]
    + ['Parch_{}'.format(level) for level in (0, 1, 2, 3, 4, 5, 6, 9)]
    + ['Embarked_{}'.format(level) for level in (0, 1, 2)]
    + ['Survived']
)
# Integer codes substituted for the categorical strings before encoding.
MAP_Sex = {'male': 0, 'female': 1}
MAP_Embarked = {'C': 0, 'Q': 1, 'S': 2}
# Two-class one-hot rows (not referenced by the rest of the visible code).
ONE_HOT = [[1, 0], [0, 1]]
# Number of feature columns: everything in NEW_INDEX except 'Survived'.
FEATURES = len(NEW_INDEX) - 1
# Prefix prepended to every CSV file name that is read or written.
PATH = ""
# Data preprocessing
def preprocess(data, flag):
    """Turn raw passenger rows into normalized float32 tensors.

    The rows are restricted to the OLD_INDEX columns, missing values are
    filled, the categorical columns are one-hot encoded into the fixed
    NEW_INDEX layout, every column is divided by its own maximum, and the
    last column ('Survived') is split off as the label.

    Args:
        data: Anything ``pd.DataFrame(data, columns=OLD_INDEX)`` accepts,
            e.g. the frame returned by ``pd.read_csv``. Columns named in
            OLD_INDEX but absent from *data* are created empty and filled.
        flag: ``"train"`` to get features and labels, ``"test"`` to get
            features only.

    Returns:
        For ``"train"``: ``(x_data, y_data)``, float32 tensors holding the
        first ``len(NEW_INDEX) - 1`` columns and the last column (kept 2-D).
        For ``"test"``: ``x_data`` alone.

    Raises:
        ValueError: If *flag* is neither ``"train"`` nor ``"test"``.
    """
    if flag not in ("train", "test"):
        raise ValueError(
            "flag must be 'train' or 'test', got {!r}".format(flag)
        )

    # Data cleaning
    frame = pd.DataFrame(data, columns=OLD_INDEX)
    frame['UknAge'] = frame['UknAge'].fillna(0)
    frame['Survived'] = frame['Survived'].fillna(0)
    # Record which ages were missing before replacing them with 0.
    frame.loc[frame['Age'].isnull(), 'UknAge'] = 1
    frame['Age'] = frame['Age'].fillna(0)
    # Missing fares / ports get fixed placeholder values.
    frame['Fare'] = frame['Fare'].fillna(14.4)
    frame['Embarked'] = frame['Embarked'].fillna('C')

    # One-hot encoding. Pclass is shifted to start at 0 and the string
    # categories are mapped to integers so that the generated column names
    # match the ones listed in NEW_INDEX.
    frame['Pclass'] -= 1
    frame['Sex'] = frame['Sex'].map(MAP_Sex)
    frame['Embarked'] = frame['Embarked'].map(MAP_Embarked)
    # Request float indicators explicitly: pandas >= 2.0 returns bool columns
    # by default, and dividing a bool column by its (bool) maximum in the
    # normalization step below raises NotImplementedError.
    frame = pd.get_dummies(
        frame,
        columns=['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked'],
        dtype=np.float64,
    )
    # Force the fixed layout: levels absent from this data become all-zero
    # columns, levels not listed in NEW_INDEX are dropped.
    frame = pd.DataFrame(frame, columns=NEW_INDEX)
    frame = frame.fillna(0).astype(np.float64)

    # Normalization: scale each column by its own maximum. The maximum is
    # taken from the rows passed in, so separate calls are scaled separately.
    for col in NEW_INDEX:
        maximum = frame[col].max()
        if maximum > 0:
            frame[col] /= maximum

    # To tensors
    values = np.array(frame, dtype=np.float32)
    x_data = torch.from_numpy(values[:, :-1])
    if flag == "test":
        return x_data
    y_data = torch.from_numpy(values[:, [-1]])
    return x_data, y_data
# Prepare the data
class TitanicDataSets(Dataset):
    """Hold-out split of ``train.csv`` exposed as a torch ``Dataset``.

    Both splits are read from the same file and preprocessed together
    (so normalization is computed over the whole file before splitting):
    ``flag="train"`` keeps the first 800 rows, ``flag="test"`` keeps rows
    800 up to (not including) 892.
    """

    # Row range kept for each supported split.
    _SPLITS = {"train": slice(None, 800), "test": slice(800, 892)}

    def __init__(self, filepath, flag):
        """Load ``filepath + "train.csv"`` and keep the rows of split *flag*.

        Args:
            filepath: Prefix prepended to ``"train.csv"``.
            flag: ``"train"`` or ``"test"``.

        Raises:
            ValueError: If *flag* names no known split. Checked before any
                file is read, so a typo cannot yield a half-built dataset.
        """
        if flag not in self._SPLITS:
            raise ValueError(
                "flag must be 'train' or 'test', got {!r}".format(flag)
            )
        rows = self._SPLITS[flag]
        # Labels are needed for both splits, hence flag="train" here.
        features, labels = preprocess(
            pd.read_csv(filepath + "train.csv"), flag="train"
        )
        self.x_data = features[rows]
        self.y_data = labels[rows]
        self.len = self.x_data.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at *index*."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of rows in this split."""
        return self.len
# Both datasets are built by TitanicDataSets from PATH; only the training
# loader shuffles its batches.
train_dataset = TitanicDataSets(PATH, "train")
test_dataset = TitanicDataSets(PATH, "test")
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0)
# Build the model
class TitanicModel(torch.nn.Module):
    """Fully connected classifier that outputs two class scores per row.

    Layer widths are ``in_features -> 13 -> 6 -> 4 -> 2``. ReLU follows each
    of the first three layers; the last layer's output is returned as-is.
    """

    def __init__(self, in_features=26):
        """Create the four linear layers.

        Args:
            in_features: Width of each input row. Defaults to 26, the width
                that was previously hard-coded.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(in_features, 13)
        self.linear2 = torch.nn.Linear(13, 6)
        self.linear3 = torch.nn.Linear(6, 4)
        self.linear4 = torch.nn.Linear(4, 2)

    def forward(self, x):
        """Return the two raw output scores for each row of *x*."""
        for hidden in (self.linear1, self.linear2, self.linear3):
            x = F.relu(hidden(x))
        return self.linear4(x)
titanic_model = TitanicModel()
# Loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(
    titanic_model.parameters(),
    lr=0.01,
    momentum=0.56,
)
# Training loop
def train(epochs):
    """Run *epochs* full passes over ``train_loader``.

    Works on the module-level ``titanic_model``, ``criterion`` and
    ``optimizer``: the model's parameters are updated in place and nothing
    is returned.

    Args:
        epochs: Number of passes over ``train_loader``.
    """
    for _ in range(epochs):
        for inputs, labels in train_loader:
            # Forward pass
            y_predict = titanic_model(inputs)
            # Take the first entry of every label row and convert it to
            # int64 in one tensor operation instead of a per-sample
            # tolist() round trip; the resulting 1-D target is the same.
            target = labels[:, 0].long()
            loss = criterion(y_predict, target)
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            # Parameter update
            optimizer.step()
def test():
    """Measure the accuracy of ``titanic_model`` on ``test_loader``.

    Gradients are disabled while predicting. The predicted class for each
    row is the index of its larger output score. The result is printed as
    ``acc:<value>%`` and returned.

    Returns:
        Accuracy as a percentage (0-100), or 0.0 when ``test_loader`` yields
        no samples (instead of dividing by zero).
    """
    total = 0
    correct = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            # First entry of every label row, taken as one tensor slice
            # rather than a per-sample tolist() round trip.
            target = labels[:, 0]
            outputs = titanic_model(inputs)
            _, predicted = torch.max(outputs, dim=1)
            total += target.size(0)
            correct += (predicted == target).sum().item()
    acc = 100 * correct / total if total else 0.0
    print("acc:{}%".format(acc))
    return acc
# Train in 20 rounds of 10 epochs each, recording the accuracy returned by
# test() after every round.
ACC = []
EPOCH = []
for i in range(20):
    train(epochs=10)
    acc = test()
    EPOCH.append(i)
    ACC.append(acc)
# Plot accuracy per round. The y-axis used to be labelled 'loss', but the
# values plotted are the accuracy percentages collected above.
plt.plot(EPOCH, ACC, ls="-.", lw=2, c="c", label="plot figure")
plt.xlabel('num of train')
plt.ylabel('accuracy (%)')
plt.grid()  # show grid lines
plt.show()
# Write the submission file
test_frame = pd.read_csv(PATH + "test.csv")
out_data = preprocess(test_frame, flag="test")
with torch.no_grad():
    outputs = titanic_model(out_data)
    _, predicted = torch.max(outputs, dim=1)
# Prefer the ids stored in the file so rows and ids cannot get out of step.
# If the column is absent, fall back to consecutive ids starting at 892 (the
# previously hard-coded start), sized to the number of predictions.
if 'PassengerId' in test_frame.columns:
    passenger_ids = test_frame['PassengerId'].tolist()
else:
    passenger_ids = list(range(892, 892 + len(predicted)))
submission = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': predicted.tolist()})
submission.to_csv(PATH + "submission.csv", index=False)