June 1, 2022
A simple implementation of the classic Kaggle Titanic case, written to get familiar with the framework (PyTorch).
First, preprocess the data so that it is suitable as input to the neural network.
import pandas as pd
import torch
# Read the CSV files first
train_data = pd.read_csv('Titanic/train.csv')
test_data = pd.read_csv('Titanic/test.csv')
# Determine the inputs and outputs: concatenate the train and test features
all_features = pd.concat((train_data.iloc[:, 2:-1], test_data.iloc[:, 1:-1]))
# Find the numeric features and standardize them (zero mean, unit variance)
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std())
)
# Fill the remaining missing numeric values with 0 (the mean after standardization)
all_features[numeric_features] = all_features[numeric_features].fillna(0)
# Automatically convert everything to a numeric representation (one-hot encoding)
all_features = pd.get_dummies(all_features, dummy_na=True)
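One thing worth checking at this point is how wide the one-hot encoded feature matrix becomes, since Name, Ticket and Cabin are almost unique per passenger; a quick sketch:
# (Sketch) Inspect the preprocessed features
print(all_features.shape)               # the one-hot columns make this matrix quite wide
print(all_features.isna().sum().sum())  # should be 0 after fillna / dummy_na=True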
Put the data into tensors.
# Convert the data to tensors
n_train = train_data.shape[0]
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float32)
train_labels = torch.tensor(train_data.Survived.values.reshape(-1, 1), dtype=torch.float32)
in_features = train_features.shape[1]
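Depending on the pandas version, get_dummies may return bool columns, and the resulting mixed-dtype .values array can fail to convert with torch.tensor; in that case casting the whole frame to float first is a simple workaround (a sketch, not needed on older pandas):
# (Sketch) Workaround if torch.tensor() complains about an object-dtype array
all_features = all_features.astype('float32')
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float32)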
Build the neural network. This is just something thrown together; the result is not necessarily good.
# Build the neural network
def get_net():
    net = torch.nn.Sequential(
        torch.nn.Linear(in_features, 250),
        torch.nn.ReLU(),
        torch.nn.Linear(250, 1)
    )
    return net
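To check that the network is wired up as intended before training, it can help to push a dummy batch through it and count its parameters; a minimal sketch:
# (Sketch) Sanity-check the network with a dummy batch
net = get_net()
dummy = torch.zeros(4, in_features)              # a fake batch of 4 samples
print(net(dummy).shape)                          # expected: torch.Size([4, 1])
print(sum(p.numel() for p in net.parameters()))  # total number of trainable parameters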
Write a simple function that converts the predicted survival probability into a 0/1 result:
def get_result(y):
    result = []
    for i in y:
        if i >= 0.5:
            result.append(1)
        else:
            result.append(0)
    return result
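For reference, the same thresholding can be done without a Python loop; this sketch is equivalent to get_result for an (n, 1) prediction tensor:
# (Sketch) Vectorized equivalent of get_result
def get_result_vec(y):
    return (y >= 0.5).int().reshape(-1).tolist()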
Write a function that computes the prediction accuracy:
def get_acc(right, y):
    num_right = 0
    for i, j in zip(right, y):
        if i == j:
            num_right += 1
    return num_right / len(right)
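A quick toy check with made-up values (not from the Titanic data): three of four predictions match, so the function should return 0.75.
# (Sketch) Toy check of get_acc with hypothetical labels and predictions
print(get_acc([1, 0, 1, 1], [1, 0, 0, 1]))   # 0.75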
Run the network:
# Hyperparameters
EPOCH = 100
lr = 0.05
weight_decay = 0.1
# Build and train the network
net = get_net()
loss = torch.nn.MSELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay)
for epoch in range(EPOCH):
    optimizer.zero_grad()
    pred_label = net(train_features)
    l = loss(pred_label, train_labels)
    l.backward()
    optimizer.step()
# Report loss and accuracy on the training set
result = get_result(pred_label)
acc = get_acc(train_labels, result)
print('Loss:', l.item())
print('Predictions:', result)
print('Actual survival:', train_labels.reshape(-1).tolist())
print('Training accuracy:', acc)
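test_features is built above but never used; below is a hedged sketch of scoring the test set and writing a Kaggle submission file, assuming the standard PassengerId / Survived column names (the output file name is an arbitrary choice):
# (Sketch) Predict on the test set and write a submission file
with torch.no_grad():
    test_pred = net(test_features)
submission = pd.DataFrame({
    'PassengerId': test_data.PassengerId,   # standard Kaggle Titanic column
    'Survived': get_result(test_pred)
})
submission.to_csv('submission.csv', index=False)   # arbitrary file name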
Running results:
The full program is as follows:
'''Titanic survival prediction (a classic Kaggle case)
June 1, 2022, Beijing Jiaotong University, 韬会'''
import pandas as pd
import torch

train_data = pd.read_csv('Titanic/train.csv')
test_data = pd.read_csv('Titanic/test.csv')
# Hyperparameters
EPOCH = 100
lr = 0.05
weight_decay = 0.1
# Determine the inputs and outputs: concatenate the train and test features
all_features = pd.concat((train_data.iloc[:, 2:-1], test_data.iloc[:, 1:-1]))
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std())
)
all_features[numeric_features] = all_features[numeric_features].fillna(0)
all_features = pd.get_dummies(all_features, dummy_na=True)
# Convert the data to tensors
n_train = train_data.shape[0]
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float32)
train_labels = torch.tensor(train_data.Survived.values.reshape(-1, 1), dtype=torch.float32)
in_features = train_features.shape[1]

# Build the neural network
def get_net():
    net = torch.nn.Sequential(
        torch.nn.Linear(in_features, 250),
        torch.nn.ReLU(),
        torch.nn.Linear(250, 1)
    )
    return net

# Convert predicted survival probabilities into 0/1 results
def get_result(y):
    result = []
    for i in y:
        if i >= 0.5:
            result.append(1)
        else:
            result.append(0)
    return result

# Compute prediction accuracy
def get_acc(right, y):
    num_right = 0
    for i, j in zip(right, y):
        if i == j:
            num_right += 1
    return num_right / len(right)

# Build and train the network
net = get_net()
loss = torch.nn.MSELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay)
for epoch in range(EPOCH):
    optimizer.zero_grad()
    pred_label = net(train_features)
    l = loss(pred_label, train_labels)
    l.backward()
    optimizer.step()
# Report loss and accuracy on the training set
result = get_result(pred_label)
acc = get_acc(train_labels, result)
print('Loss:', l.item())
print('Predictions:', result)
print('Actual survival:', train_labels.reshape(-1).tolist())
print('Training accuracy:', acc)