因为自己当前在学习机器学习,而且是第一次练习kaggle上的习题,所以花费了不少的精力做了详细的笔记,每一步都有详细的结果和图像展示。但笔记是用Jupyter Notebook写的,所以没法写在博客上。目前已经上传到GitHub上,可以直接查看,欢迎给出意见。
import copy
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
TRAIN_PATH = "./titanic/train.csv"
TEST_PATH = "./titanic/test.csv"
SAMPLE_SUBMISSION_PATH = "./titanic/gender_submission.csv"
SUBMISSION_PATH = "submission.csv"
import os
from matplotlib import pyplot as plt
%matplotlib inline
ID = 'PassengerId'
TARGET = 'Survived'
# 读取训练集文件,并查看相关信息
train_data = pd.read_csv(TRAIN_PATH)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
train_data.info() # 其中Age只有714个数据,缺少了891-714个数据;Cabin仅有204个,Embarked缺少2个数据
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
# 其中PassengerId、Name、Ticket与是否生存无关,所以删除掉
train_data = train_data.drop(['PassengerId', 'Name', 'Ticket'], axis=1)
Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Cabin | Embarked | |
0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | NaN | S |
1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C85 | C |
2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | NaN | S |
train_data.info() # 其中Age只有714个数据,缺少了891-714个数据;Cabin仅有204个,Embarked缺少2个数据
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Survived 891 non-null int64
1 Pclass 891 non-null int64
2 Sex 891 non-null object
3 Age 714 non-null float64
4 SibSp 891 non-null int64
5 Parch 891 non-null int64
6 Fare 891 non-null float64
7 Cabin 204 non-null object
8 Embarked 889 non-null object
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB
# How many passengers in the training set survived?
total_survived_num = train_data['Survived'].sum()
# Derive the non-survivor count from the actual number of rows instead of a
# hard-coded 891, so the figure stays correct if the input file changes.
total_no_survived_num = len(train_data) - total_survived_num
plt.figure(figsize=(12, 6))
# Bar chart: x=1 is "survived", x=0 is "did not survive"
plt.subplot(1, 2, 1)
plt.bar([1, 0], [total_survived_num, total_no_survived_num], width=0.5)
plt.xticks(ticks=[0, 1])
plt.title('Survival Count')
# Pie chart of the same two counts
plt.subplot(1, 2, 2)
plt.pie([total_survived_num, total_no_survived_num],
        labels=['Survived', 'No Survived'], autopct="%.1f%%")
plt.title('Survival Rate')
下面,分别分析 Pclass、Sex、Age、SibSp、Parch、Fare、Cabin 和 Embarked 等与“生还”的关系
x = train_data[['Pclass', 'Survived']].groupby(['Pclass']).count()
Survived | |
Pclass | |
1 | 216 |
2 | 184 |
3 | 491 |
plt.figure(figsize=(12, 5))
# 绘制柱状图
plt.subplot(1, 2, 1)
plt.bar([1, 2, 3], x['Survived'], width=0.5)
plt.title('Pclass Person Count')
# 绘制饼状图
plt.subplot(1, 2, 2)
plt.pie(x['Survived'], labels=[1, 2, 3], autopct='%.1f%%')
plt.title('Pclass Person Rate')
海难发生前,一等舱、二等舱、三等舱的乘客分别为216、184、491人,分别占总人数的 24.2%, 20.7%, 55.1%
x = train_data[train_data['Survived'] == 1]
x = x[['Pclass', 'Survived']].groupby('Pclass').count()
Survived | |
Pclass | |
1 | 136 |
2 | 87 |
3 | 119 |
plt.figure(figsize=(12, 5))
# Bar chart of survivors per cabin class. At this point `x` holds the
# per-Pclass count of rows with Survived == 1, so the titles say "Survived"
# to keep this figure distinguishable from the all-passenger figure.
plt.subplot(1, 2, 1)
# Use the group index for the bar positions so counts cannot be paired with
# the wrong class if a class is missing from the data.
plt.bar(x.index, x['Survived'], width=0.5)
plt.title('Pclass Survived Count')
# Pie chart: each class's share of all survivors
plt.subplot(1, 2, 2)
plt.pie(x['Survived'], labels=x.index, autopct='%.1f%%')
plt.title('Pclass Survived Rate')
海难发生后,一等舱、二等舱、三等舱的生还乘客分别为136、87、119人,分别占生还总人数的 39.8%, 25.4%, 34.8%
一等舱生还率为 63%,二等舱为 47%,三等舱为 24%。可见客舱等级越高,生还率越高。
x = train_data[['Sex', 'Survived']].groupby('Sex').count()
Survived | |
Sex | |
female | 314 |
male | 577 |
x = train_data[train_data['Survived'] == 1]
x = x[['Sex', 'Survived']].groupby('Sex').count()
Survived | |
Sex | |
female | 233 |
male | 109 |
# Survival rate per sex, computed from the data instead of hand-typed counts.
# The mean of the 0/1 `Survived` column within a group is that group's
# survival rate.
sex_survived_rate = train_data.groupby('Sex')['Survived'].mean()
male_survived_rate = sex_survived_rate['male']
female_survived_rate = sex_survived_rate['female']
print("男生存活率:%.1f%%,女生存活率:%.1f%%" %(male_survived_rate*100, female_survived_rate*100))
# 查看缺少的年龄数
nan_age_count = train_data['Age'].isnull().sum()
# 求年龄平均值
avg_age = train_data['Age'].mean()
# 填充缺失的年龄
train_data['Age'] = train_data['Age'].fillna(avg_age)
count 891.000000
mean 29.699118
std 13.002015
min 0.420000
25% 22.000000
50% 29.699118
75% 35.000000
max 80.000000
Name: Age, dtype: float64
plt.figure(figsize=(12, 5))
# Left panel: histogram of the Age column (70 bins)
plt.subplot(1, 2, 1)
train_data['Age'].hist(bins = 70)
# Right panel: box plot of the Age column (outliers hidden via showfliers=False)
plt.subplot(1, 2, 2)
train_data.boxplot(column='Age', showfliers=False)
children_df = train_data[train_data['Age'] <= 12]
juvenile_df = train_data[(train_data['Age'] > 12) & (train_data['Age'] < 18)]
adults_df = train_data[(train_data['Age'] >= 18) & (train_data['Age'] < 65)]
agedness_df = train_data[train_data['Age'] >= 65]
# 儿童数量
children_count = children_df['Survived'].count()
# 少年数量
juvenile_count = juvenile_df['Survived'].count()
# 成年人数量
adults_count = adults_df['Survived'].count()
# 老年人数量
agedness_count = agedness_df['Survived'].count()
children_count, juvenile_count, adults_count, agedness_count
(69, 44, 767, 11)
# 儿童中存活的数量
children_survived_count = children_df['Survived'].sum()
# 少年中存活的数量
juvenile_survived_count = juvenile_df['Survived'].sum()
# 成年人中存活的数量
adults_survived_count = adults_df['Survived'].sum()
# 老年人中存活的数量
agedness_survived_count = agedness_df['Survived'].sum()
children_survived_count, juvenile_survived_count, adults_survived_count, agedness_survived_count
(40, 21, 280, 1)
# Survival rate of each age group, derived from the group counts and survivor
# counts computed above rather than from numbers copied out of the printed
# output, so the rates cannot drift from the data.
children_survived_rate = children_survived_count / children_count
juvenile_survived_rate = juvenile_survived_count / juvenile_count
adults_survived_rate = adults_survived_count / adults_count
agedness_survived_rate = agedness_survived_count / agedness_count
print("儿童存活率:%.1f%%,少年存活率:%.1f%%" %(children_survived_rate*100, juvenile_survived_rate*100))
print("成年人存活率:%.1f%%,老年人存活率:%.1f%%" %(adults_survived_rate*100, agedness_survived_rate*100))
sibsp_df = train_data[train_data['SibSp'] != 0]
no_sibsp_df = train_data[train_data['SibSp'] == 0]
# 有兄弟姐妹的乘客数
sibsp_count = sibsp_df['Survived'].count()
# 没有兄弟姐妹的乘客数
no_sibsp_count = no_sibsp_df['Survived'].count()
sibsp_count, no_sibsp_count
(283, 608)
# 有兄弟姐妹的乘客生还数
sibsp_survived_count = sibsp_df['Survived'].sum()
# 没有兄弟姐妹的乘客生还数
no_sibsp_survived_count = no_sibsp_df['Survived'].sum()
sibsp_survived_count, no_sibsp_survived_count
(132, 210)
# Survival rate with / without siblings or spouses aboard, derived from the
# counts computed above instead of hard-coded numbers.
sibsp_survived_rate = sibsp_survived_count / sibsp_count
no_sibsp_survived_rate = no_sibsp_survived_count / no_sibsp_count
print("有兄弟姐妹的存活率:%.1f%%,没有兄弟姐妹的存活率:%.1f%%" %(sibsp_survived_rate*100, no_sibsp_survived_rate*100))
parch_df = train_data[train_data['Parch'] != 0]
no_parch_df = train_data[train_data['Parch'] == 0]
# 有父母子女的乘客数
parch_count = parch_df['Survived'].count()
# 没有父母子女的乘客数
no_parch_count = no_parch_df['Survived'].count()
parch_count, no_parch_count
(213, 678)
# 有父母子女的乘客生还数
parch_survived_count = parch_df['Survived'].sum()
# 没有父母子女的乘客生还数
no_parch_survived_count = no_parch_df['Survived'].sum()
parch_survived_count, no_parch_survived_count
(109, 233)
# Survival rate with / without parents or children aboard, derived from the
# counts computed above instead of hard-coded numbers.
parch_survived_rate = parch_survived_count / parch_count
no_parch_survived_rate = no_parch_survived_count / no_parch_count
print("有父母子女的存活率:%.1f%%,没有父母子女的存活率:%.1f%%" %(parch_survived_rate*100, no_parch_survived_rate*100))
count 891.000000
mean 32.204208
std 49.693429
min 0.000000
25% 7.910400
50% 14.454200
75% 31.000000
max 512.329200
Name: Fare, dtype: float64
plt.figure(figsize=(12, 5))
# 绘制票价分布图
plt.subplot(1, 2, 1)
train_data['Fare'].hist(bins = 20)
# 绘制盒图
plt.subplot(1, 2, 2)
train_data.boxplot(column='Fare', showfliers=False)
plt.figure(figsize=(12, 5))
# 绘制存活乘客的票价分布图
plt.subplot(1, 2, 1)
train_data[train_data['Survived'] == 1]['Fare'].hist(bins = 20)
# 绘制存活乘客的票价盒图
plt.subplot(1, 2, 2)
train_data[train_data['Survived'] == 1].boxplot(column='Fare', showfliers=False)
train_data = train_data.drop('Cabin', axis=1)
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Survived 891 non-null int64
1 Pclass 891 non-null int64
2 Sex 891 non-null object
3 Age 891 non-null float64
4 SibSp 891 non-null int64
5 Parch 891 non-null int64
6 Fare 891 non-null float64
7 Embarked 889 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB
# 统计各个(港口)出现的次数
S 644
C 168
Q 77
Name: Embarked, dtype: int64
# S出现的次数最多,所以用S来填充两个缺失的数
train_data['Embarked'] = train_data['Embarked'].fillna('S')
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Survived 891 non-null int64
1 Pclass 891 non-null int64
2 Sex 891 non-null object
3 Age 891 non-null float64
4 SibSp 891 non-null int64
5 Parch 891 non-null int64
6 Fare 891 non-null float64
7 Embarked 891 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB
S 646
C 168
Q 77
Name: Embarked, dtype: int64
train_data[train_data['Survived'] == 1]['Embarked'].value_counts()
S 219
C 93
Q 30
Name: Embarked, dtype: int64
# Survival rate per embarkation port, computed from the data instead of
# hand-typed counts. The mean of the 0/1 `Survived` column within each port
# is that port's survival rate.
embarked_survived_rate = train_data.groupby('Embarked')['Survived'].mean()
S_survived_rate = embarked_survived_rate['S']
C_survived_rate = embarked_survived_rate['C']
Q_survived_rate = embarked_survived_rate['Q']
print("S港口存活率:%.1f%%,C港口存活率:%.1f%%,Q港口存活率:%.1f%%" %(S_survived_rate*100, C_survived_rate*100,Q_survived_rate*100))
海难发生前,一等舱有 216 人,二等舱 184 人,三等舱 491 人,分别占总人数的 24%, 21%, 55%。
海难发生后,一等舱、二等舱、三等舱的生还人数分别为136、87、119人,分别占生还总人数的 40%, 25%, 35%。
一等舱生还率为 63%,二等舱为 47%,三等舱为 24%。可见客舱等级越高,生还率越高。
海难发生前,男性共577人,女性314人,男女比例为 65% 和 35%。
海难发生后,男性变为109人,女性变为233人,男女比例变为 32% 和 68%。
样本的891人中,平均年龄约为30岁, 标准差15岁,最小年龄为0.42,最大年龄80。按照儿童(0-12)、少年(12-18)、成人(18-65)、老年人(65及以上)划分为四类。
四类人的生还率分别为58%,48%,37% 和9%。可见年龄越大,生还率越低。“尊老爱幼”的原则在本次事故中没有很好体现。
train_data = pd.read_csv(TRAIN_PATH) # 891条
test_data = pd.read_csv(TEST_PATH) # 418条
# 将训练数据和测试数据先进行纵向堆叠,方便统一进行数据处理
full = pd.concat([train_data, test_data], axis=0, ignore_index=True)
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 1309 non-null int64
1 Survived 891 non-null float64
2 Pclass 1309 non-null int64
3 Name 1309 non-null object
4 Sex 1309 non-null object
5 Age 1046 non-null float64
6 SibSp 1309 non-null int64
7 Parch 1309 non-null int64
8 Ticket 1309 non-null object
9 Fare 1308 non-null float64
10 Cabin 295 non-null object
11 Embarked 1307 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB
# 删除不需要的数据
full = full.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Survived 891 non-null float64
1 Pclass 1309 non-null int64
2 Sex 1309 non-null object
3 Age 1046 non-null float64
4 SibSp 1309 non-null int64
5 Parch 1309 non-null int64
6 Fare 1308 non-null float64
7 Embarked 1307 non-null object
dtypes: float64(3), int64(3), object(2)
memory usage: 81.9+ KB
# 填充age中的缺失值
full['Age'] = full['Age'].fillna(full['Age'].mean())
# 填充fare中的缺失值
full['Fare'] = full['Fare'].fillna(full['Fare'].mean())
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Survived 891 non-null float64
1 Pclass 1309 non-null int64
2 Sex 1309 non-null object
3 Age 1309 non-null float64
4 SibSp 1309 non-null int64
5 Parch 1309 non-null int64
6 Fare 1309 non-null float64
7 Embarked 1307 non-null object
dtypes: float64(3), int64(3), object(2)
memory usage: 81.9+ KB
# 使用众数填充embarked中的数据
S 914
C 270
Q 123
Name: Embarked, dtype: int64
full['Embarked'] = full['Embarked'].fillna('S')
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Survived 891 non-null float64
1 Pclass 1309 non-null int64
2 Sex 1309 non-null object
3 Age 1309 non-null float64
4 SibSp 1309 non-null int64
5 Parch 1309 non-null int64
6 Fare 1309 non-null float64
7 Embarked 1309 non-null object
dtypes: float64(3), int64(3), object(2)
memory usage: 81.9+ KB
0 male
1 female
2 female
3 female
4 male
Name: Sex, dtype: object
# 将性别的值映射成数值
sex_2_dict = {"male": 0, "female":1}
full['Sex'] = full['Sex'].map(sex_2_dict)
0 0
1 1
2 1
3 1
4 0
Name: Sex, dtype: int64
0 3
1 1
2 3
3 1
4 3
Name: Pclass, dtype: int64
# One-hot encode the cabin class into Pclass_1 / Pclass_2 / Pclass_3 columns.
# The dtype is pinned to uint8 because newer pandas versions return boolean
# dummy columns by default; mixing bool with numeric columns would make a
# later `to_numpy()` produce an object array that torch cannot consume.
pClassDf = pd.get_dummies(full['Pclass'], prefix='Pclass', dtype='uint8')
Pclass_1 | Pclass_2 | Pclass_3 | |
0 | 0 | 0 | 1 |
1 | 1 | 0 | 0 |
2 | 0 | 0 | 1 |
3 | 1 | 0 | 0 |
4 | 0 | 0 | 1 |
# 将one-hot编码产生的虚拟变量添加到泰坦尼克号数据集full中
full = pd.concat([full, pClassDf], axis=1)
# 因为已经将类别转换为one-hot编码形式,并且添加到了full数据集中,所以删除原有的Pclass列
full = full.drop('Pclass', axis=1)
Survived | Sex | Age | SibSp | Parch | Fare | Embarked | Pclass_1 | Pclass_2 | Pclass_3 | |
0 | 0.0 | 0 | 22.0 | 1 | 0 | 7.2500 | S | 0 | 0 | 1 |
1 | 1.0 | 1 | 38.0 | 1 | 0 | 71.2833 | C | 1 | 0 | 0 |
2 | 1.0 | 1 | 26.0 | 0 | 0 | 7.9250 | S | 0 | 0 | 1 |
3 | 1.0 | 1 | 35.0 | 1 | 0 | 53.1000 | S | 1 | 0 | 0 |
4 | 0.0 | 0 | 35.0 | 0 | 0 | 8.0500 | S | 0 | 0 | 1 |
0 S
1 C
2 S
3 S
4 S
Name: Embarked, dtype: object
# One-hot encode the embarkation port into Embarked_C / Embarked_Q /
# Embarked_S columns. The dtype is pinned to uint8 because newer pandas
# versions return boolean dummy columns by default; mixing bool with numeric
# columns would make a later `to_numpy()` produce an object array that torch
# cannot consume.
embarkedDf = pd.get_dummies(full['Embarked'], prefix='Embarked', dtype='uint8')
Embarked_C | Embarked_Q | Embarked_S | |
0 | 0 | 0 | 1 |
1 | 1 | 0 | 0 |
2 | 0 | 0 | 1 |
3 | 0 | 0 | 1 |
4 | 0 | 0 | 1 |
# 将one-hot编码产生的虚拟变量添加到泰坦尼克号数据集full中
full = pd.concat([full, embarkedDf], axis=1)
# 因为已经将类别转换为one-hot编码形式,并且添加到了full数据集中,所以删除原有的Embarked列
full = full.drop('Embarked', axis=1)
Survived | Sex | Age | SibSp | Parch | Fare | Pclass_1 | Pclass_2 | Pclass_3 | Embarked_C | Embarked_Q | Embarked_S | |
0 | 0.0 | 0 | 22.0 | 1 | 0 | 7.2500 | 0 | 0 | 1 | 0 | 0 | 1 |
1 | 1.0 | 1 | 38.0 | 1 | 0 | 71.2833 | 1 | 0 | 0 | 1 | 0 | 0 |
2 | 1.0 | 1 | 26.0 | 0 | 0 | 7.9250 | 0 | 0 | 1 | 0 | 0 | 1 |
3 | 1.0 | 1 | 35.0 | 1 | 0 | 53.1000 | 1 | 0 | 0 | 0 | 0 | 1 |
4 | 0.0 | 0 | 35.0 | 0 | 0 | 8.0500 | 0 | 0 | 1 | 0 | 0 | 1 |
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Survived 891 non-null float64
1 Sex 1309 non-null int64
2 Age 1309 non-null float64
3 SibSp 1309 non-null int64
4 Parch 1309 non-null int64
5 Fare 1309 non-null float64
6 Pclass_1 1309 non-null uint8
7 Pclass_2 1309 non-null uint8
8 Pclass_3 1309 non-null uint8
9 Embarked_C 1309 non-null uint8
10 Embarked_Q 1309 non-null uint8
11 Embarked_S 1309 non-null uint8
dtypes: float64(3), int64(3), uint8(6)
memory usage: 69.2 KB
# Split the stacked frame back into training labels, training features and
# test features, and convert them to numpy arrays.
# The training rows come first in `full`, so the boundary is the length of
# the training frame rather than a hard-coded 891.
n_train = len(train_data)
train_labels = full.iloc[:n_train, 0].to_numpy()
# Request an explicit numeric dtype: if any feature column is boolean, a
# plain `to_numpy()` on the mixed frame yields an object array, which
# `torch.from_numpy` rejects.
train_inputs = full.iloc[:n_train, 1:].to_numpy(dtype=np.float64)
test_inputs = full.iloc[n_train:, 1:].to_numpy(dtype=np.float64)
(891, 11),类型:
(418, 11),类型:
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
import copy
# 将数据转换为tensor类型
train_labels = torch.from_numpy(train_labels).to(torch.long)
train_inputs = torch.from_numpy(train_inputs).to(torch.float32)
test_inputs = torch.from_numpy(test_inputs).to(torch.float32)
torch.Size([891, 11]),类型:
torch.Size([418, 11]),类型:
# Wrap features and labels in a dataset so DataLoader can produce batches
dataset = TensorDataset(train_inputs, train_labels)
dataset_len = len(dataset)
# Split roughly 7:3 into training and validation subsets. The sizes are
# derived from the dataset length (624 / 267 for 891 samples) so that
# random_split does not fail when the number of samples changes.
train_len = round(dataset_len * 0.7)
valid_len = dataset_len - train_len
train_dataset, valid_dataset = random_split(dataset, lengths=[train_len, valid_len])
# Build the model
class Net(torch.nn.Module):
    """Fully connected classifier: input_size -> 22 -> 11 -> output_size.

    Args:
        input_size: number of input features per sample.
        output_size: number of output scores (classes) per sample.
    """

    def __init__(self, input_size, output_size):
        # nn.Module must be initialised before sub-modules are assigned;
        # without this call the `self.net = ...` assignment raises.
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        # NOTE(review): there is no activation between the Linear layers, so
        # the stack is equivalent to a single affine map. It is left unchanged
        # here so the state_dict keys (net.0 / net.1 / net.2) stay compatible
        # with checkpoints that were already saved.
        self.net = torch.nn.Sequential(
            torch.nn.Linear(self.input_size, 22),
            torch.nn.Linear(22, 11),
            torch.nn.Linear(11, self.output_size),
        )

    def forward(self, x):
        """Return the raw class scores (logits) for a batch `x`."""
        return self.net(x)
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter('logs')
# Define the batch size
batch_size = 64
# The training loader reshuffles every epoch; the validation loader uses a
# batch twice as large and keeps the default (unshuffled) order.
train_dataset_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataset_loader = DataLoader(valid_dataset, batch_size=batch_size*2)
# Define the model (11 inputs, 2 outputs)
net = Net(11, 2)
# Define the optimizer
optimizer = torch.optim.Adam(net.parameters(), lr=1e-2, weight_decay=1e-2)
# Define the loss function; cross-entropy is used because this is a classification problem
loss_fn = torch.nn.CrossEntropyLoss()
# Define the number of training epochs
epochs = 60
for i in range(epochs):
    loss = 0
    train_correct = 0
    # Train on the training set and count correct predictions as we go
    net.train()
    for inputs, labels in train_dataset_loader:
        # Forward pass
        outputs = net(inputs)
        # Index of the largest score in each row is the predicted class
        _, idx = outputs.max(dim=1)
        train_correct += (idx == labels).sum()
        # Compute the loss
        loss = loss_fn(outputs, labels)
        # Back-propagate and update the parameters. Gradients are cleared
        # first because PyTorch accumulates them across backward() calls.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Keep a checkpoint of every epoch whose training accuracy exceeds 80%
    train_accuracy = train_correct / len(train_dataset)
    if train_accuracy > 0.8:
        state = {
            'accuracy': train_accuracy * 100,
            'state_dict': net.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        torch.save(state, f"model_epoch{i+1}_{train_accuracy}.pth")
    writer.add_scalar("训练集准确率", train_accuracy, i+1)
    # The reported loss is that of the last training batch of the epoch
    print("Epoch [%d/%d] Loss=%.4f" %(i+1, epochs, loss.item()), end='')
    print(",训练集准确率:%.4f%%" %(train_accuracy * 100), end='')
    # Evaluate accuracy on the validation set; no gradients are needed here
    valid_correct = 0
    net.eval()
    with torch.no_grad():
        for inputs, labels in valid_dataset_loader:
            outputs = net(inputs)
            _, idx = outputs.max(dim=1)
            valid_correct += (idx == labels).sum()
    valid_accuracy = valid_correct / len(valid_dataset)
    writer.add_scalar("验证集准确率", valid_accuracy, i+1)
    print(",验证集准确率:%.4f%%" %(valid_accuracy*100))
# Create the model
net = Net(11, 2)
# Load the saved checkpoint and copy its weights into the model; without
# load_state_dict the freshly created network would predict with random
# weights.
state = torch.load("model_epoch28_0.8092948794364929.pth")
net.load_state_dict(state['state_dict'])
net.eval()
# Forward pass on the test features; gradients are not needed for inference
with torch.no_grad():
    outputs = net(test_inputs)
输出数据的大小:torch.Size([418, 2])
# For each output row take the index of the largest value; that index is the
# predicted Survived class
_, predict = outputs.max(dim=1)
# Convert to a numpy array
predict = predict.numpy()
print(f"预测结果的大小:{predict.shape}, 预测数据:\n{predict}")
预测结果的大小:(418,), 预测数据:
# 查看读取的测试文件内容
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 418 non-null int64
1 Pclass 418 non-null int64
2 Name 418 non-null object
3 Sex 418 non-null object
4 Age 332 non-null float64
5 SibSp 418 non-null int64
6 Parch 418 non-null int64
7 Ticket 418 non-null object
8 Fare 417 non-null float64
9 Cabin 91 non-null object
10 Embarked 418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
# 从测试csv文件中取出PassengerId,并转换为numpy数据类型
passenger_id = test_data['PassengerId'].to_numpy()
print(f"id数据的大小:{passenger_id.shape}, 数据:\n{passenger_id}")
id数据的大小:(418,), 数据:
# Pair each PassengerId with its prediction in a two-column DataFrame and
# write it to the submission CSV without the index column
result = pd.DataFrame({'PassengerId': passenger_id, 'Survived': predict})
result.to_csv(SUBMISSION_PATH, index=None)