1、泰坦尼克号上的旅客生存概率预测(使用TensorFlow的高级框架Keras)
# -*- coding: utf-8 -*-
"""
Created on Wed May 13 08:41:57 2020
@author: DELL
泰坦尼克号上的旅客生存概率预测
"""
# 1. Download the passenger data set
import os
# Data source: http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls
data_file_path = "data/titanic3.xls"
if os.path.isfile(data_file_path):
    # Fixed typo in the message: 'eists' -> 'exists'
    print(data_file_path, 'data file already exists.')
# 2. Use pandas for data handling
import numpy
import pandas as pd
# Read the data file into a DataFrame
df_data = pd.read_excel(data_file_path)
# Show a statistical summary of the data
df_data.describe()
"""
pclass survived ... fare body
count 1309.000000 1309.000000 ... 1308.000000 121.000000
mean 2.294882 0.381971 ... 33.295479 160.809917
std 0.837836 0.486055 ... 51.758668 97.696922
min 1.000000 0.000000 ... 0.000000 1.000000
25% 2.000000 0.000000 ... 7.895800 72.000000
50% 3.000000 0.000000 ... 14.454200 155.000000
75% 3.000000 1.000000 ... 31.275000 256.000000
max 3.000000 1.000000 ... 512.329200 328.000000
[8 rows x 7 columns]
"""
# 3. Select the columns of interest
selected_cols=['survived','name','pclass','sex','age','sibsp','parch','fare','embarked']
selected_df_data=df_data[selected_cols]
# Find the columns that contain missing values
selected_df_data.isnull().any()
"""
survived False
name False
pclass False
sex False
age True
sibsp False
parch False
fare True
embarked True
dtype: bool
"""
# 4、数据预处理
# # 填充null值
# # 为缺失age记录填充值 设置为平均值
# age_mean_value= selected_df_data['age'].mean()
# selected_df_data['age'] = selected_df_data['age'].fillna(age_mean_value)
# # 为缺失fare记录填充值
# fare_mean_value=selected_df_data['fare'].mean()
# selected_df_data['fare'] = selected_df_data['fare'].fillna(fare_mean_value)
# # 为缺失embarked记录填充值
# selected_df_data['embarked']=selected_df_data['embarked'].fillna('S')
# # 转换编码
# # 性别sex由字符串转换为数字编码
# selected_df_data['sex'] = selected_df_data['sex'].map({'female':0,'male':1}).astype(int)
# # 港口embarked由字母表示转换为数字编码
# selected_df_data['embarked'] = selected_df_data['embarked'].map({'C':0,'Q':1,'S':2}).astype(int)
# # 显示前3行数据
# selected_df_data[:3]
# """
# survived name pclass ... parch fare embarked
# 0 1 Allen, Miss. Elisabeth Walton 1 ... 0 211.3375 2
# 1 1 Allison, Master. Hudson Trevor 1 ... 2 151.5500 2
# 2 0 Allison, Miss. Helen Loraine 1 ... 2 151.5500 2
# [3 rows x 9 columns]
# """
# # 删除name字段
# selected_df_data = selected_df_data.drop(['name'],axis=1) # axis=1表示删除列
# selected_df_data[:3]
# """
# survived pclass sex age sibsp parch fare embarked
# 0 1 1 0 29.0000 0 0 211.3375 2
# 1 1 1 1 0.9167 1 2 151.5500 2
# 2 0 1 0 2.0000 1 2 151.5500 2
# """
# # 分离特征值和标签值
# # 转换为ndarray数组
# ndarray_data = selected_df_data.values
# # 后7列是特征值
# features = ndarray_data[:,1:]
# # 第0列是标签值
# label = ndarray_data[:,0]
# # 特征值标准化
# from sklearn import preprocessing
# minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,1))
# norm_features = minmax_scale.fit_transform(features)
# norm_features[:3]
# """
# array([[0. , 0. , 0.36116884, 0. , 0. , 0.41250333, 1. ],
# [0. , 1. , 0.00939458, 0.125 , 0.22222222, 0.2958059 , 1. ],
# [0. , 0. , 0.0229641 , 0.125 , 0.22222222, 0.2958059 , 1. ]])
# """
# 定义数据预处理函数
from sklearn import preprocessing
def prepare_data(df_data):
    """Preprocess the Titanic passenger DataFrame into model-ready arrays.

    Steps: drop the 'name' column, fill missing values, encode the
    categorical fields as integers, then min-max scale every feature
    column to the [0, 1] range.

    Args:
        df_data: DataFrame with columns ['survived', 'name', 'pclass',
            'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked'].

    Returns:
        (norm_features, label): features scaled to [0, 1], shape (n, 7),
        and the 'survived' column as a 1-D array of length n.
    """
    df = df_data.drop(['name'], axis=1)  # names carry no numeric signal
    # Fill missing ages and fares with the column mean.
    df['age'] = df['age'].fillna(df['age'].mean())
    df['fare'] = df['fare'].fillna(df['fare'].mean())
    # Encode sex as 0 (female) / 1 (male).
    df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)
    # Missing embarkation port defaults to 'S', then encode C/Q/S as 0/1/2.
    df['embarked'] = df['embarked'].fillna('S')
    df['embarked'] = df['embarked'].map({'C': 0, 'Q': 1, 'S': 2}).astype(int)
    ndarray_data = df.values          # all-numeric -> float64 ndarray
    features = ndarray_data[:, 1:]    # columns 1..7 are the features
    label = ndarray_data[:, 0]        # column 0 is 'survived'
    # Min-max scale each feature column to [0, 1]. Same math as
    # sklearn's MinMaxScaler(feature_range=(0, 1)), including its
    # "constant column -> divide by 1" rule, without the dependency.
    # NOTE(review): the scaling is re-fit on whatever frame is passed in,
    # so train and later prediction calls use slightly different scales
    # -- TODO confirm this is intended.
    col_min = features.min(axis=0)
    col_range = features.max(axis=0) - col_min
    col_range[col_range == 0] = 1  # avoid division by zero
    norm_features = (features - col_min) / col_range
    return norm_features, label
# 5. Data preparation (shuffle the rows, then split into train/test sets)
shuffled_df_data = selected_df_data.sample(frac=1)
x_data,y_data = prepare_data(shuffled_df_data) # preprocessed, shuffled data set
# Split into training (80%) and test (20%) sets
train_size = int(len(x_data)*0.8)
x_train = x_data[:train_size]
y_train = y_data[:train_size]
x_test = x_data[train_size:]
y_test = y_data[train_size:]
# 6. Build the model
import tensorflow as tf
# Build a Keras Sequential model
model = tf.keras.models.Sequential()
# The input has 7 feature columns; input_shape=(7,) would work as well.
# Input: 7 units. Hidden layer 1: 64 units. Hidden layer 2: 32 units. Output: 1 unit.
model.add(tf.keras.layers.Dense(units=64,
                                input_dim=7,
                                use_bias=True,
                                kernel_initializer='uniform',
                                bias_initializer='zeros',
                                activation='relu'))
model.add(tf.keras.layers.Dense(units=32,
                                activation='sigmoid'))
model.add(tf.keras.layers.Dense(units=1,
                                activation='sigmoid'))
model.summary()
"""
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense (Dense) (None, 64) 512
_________________________________________________________________
dense_1 (Dense) (None, 32) 2080
_________________________________________________________________
dense_2 (Dense) (None, 1) 33
=================================================================
Total params: 2,625
Trainable params: 2,625
Non-trainable params: 0
_________________________________________________________________
"""
# 7. Model configuration and training
# Optimizer, loss function, metric
model.compile(optimizer=tf.keras.optimizers.Adam(0.003),
              loss='binary_crossentropy',
              metrics=['accuracy'])
# Train the model
train_history = model.fit(x=x_train,
                          y=y_train,
                          validation_split=0.2, # fraction of training data held out for validation
                          epochs=100,
                          batch_size=40,
                          verbose=2) # progress mode (0: silent, 1: progress bar, 2: one line per epoch)
# Output:
# Visualize the training process:
train_history.history  # per-epoch values of every tracked metric
train_history.history.keys()
# dict_keys(['loss', 'acc', 'val_loss', 'val_acc'])
# stored as a dict: metric name -> list of per-epoch values
# 8. Visualize the training history
import matplotlib.pyplot as plt

def visu_train_history(train_history, train_metric, validation_metric):
    """Plot a training metric against its validation counterpart per epoch."""
    plt.plot(train_history.history[train_metric])
    plt.plot(train_history.history[validation_metric])
    plt.title('Train History')
    plt.ylabel(train_metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

# TF2/Keras records the metric under 'accuracy'; older versions used 'acc'.
# Pick whichever key is present to avoid a KeyError on newer TensorFlow.
_acc_key = 'accuracy' if 'accuracy' in train_history.history else 'acc'
visu_train_history(train_history, _acc_key, 'val_' + _acc_key)
visu_train_history(train_history, 'loss', 'val_loss')
# Output:
# Model evaluation and prediction
# 9. Model evaluation
evaluate_result = model.evaluate(x=x_test,y=y_test)
# 262/262 [==============================] - 0s 46us/sample - loss: 0.4582 - acc: 0.7901
evaluate_result
# [0.45818260471329436, 0.7900763]
model.metrics_names # labels for the values returned by evaluate()
# ['loss', 'acc']
# 10. Use the model for prediction
# Add new passenger records
selected_cols
"""
['survived',
'name',
'pclass',
'sex',
'age',
'sibsp',
'parch',
'fare',
'embarked']
"""
Jack_info = [0,'Jack',3,'male',23,1,0,5.0000,'S']
Rose_info = [1,'Rose',1,'female',20,1,0,100.0000,'S']
# Build a DataFrame for the new passengers
new_passenger_pd=pd.DataFrame([Jack_info,Rose_info],columns=selected_cols)
# Append the new passengers to the original DataFrame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported, behaviorally-equivalent replacement.
all_passenger_pd=pd.concat([selected_df_data,new_passenger_pd])
all_passenger_pd[-3:]
"""
survived name pclass ... parch fare embarked
1308 0 Zimmerman, Mr. Leo 3 ... 0 7.875 S
0 0 Jack 3 ... 0 5.000 S
1 1 Rose 1 ... 0 100.000 S
[3 rows x 9 columns]
"""
# Data preparation
# NOTE(review): prepare_data re-fits the min-max scaling on this frame
# (training data + 2 new rows), so feature scaling differs slightly from
# the scaling used during training -- TODO confirm this is intended.
x_features, y_label=prepare_data(all_passenger_pd)
# Compute each passenger's survival probability with the model
surv_probability=model.predict(x_features)
# Insert the survival probability as the last column
all_passenger_pd.insert(len(all_passenger_pd.columns),'surv_probability',surv_probability)
all_passenger_pd[-5:]
"""
survived name ... embarked surv_probability
1306 0 Zakarian, Mr. Mapriededer ... C 0.240120
1307 0 Zakarian, Mr. Ortin ... C 0.231930
1308 0 Zimmerman, Mr. Leo ... S 0.100012
0 0 Jack ... S 0.099771
1 1 Rose ... S 0.974901
[5 rows x 10 columns]
"""
2、泰坦尼克号上的旅客生存概率预测(使用TensorFlow的高级框架Keras)含模型的回调
# -*- coding: utf-8 -*-
"""
Created on Wed May 13 08:41:57 2020
@author: DELL
泰坦尼克号上的旅客生存概率预测
"""
# 1. Download the passenger data set
import os
# Data source: http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls
data_file_path = "data/titanic3.xls"
if os.path.isfile(data_file_path):
    # Fixed typo in the message: 'eists' -> 'exists'
    print(data_file_path, 'data file already exists.')
# 2. Use pandas for data handling
import numpy
import pandas as pd
# Read the data file into a DataFrame
df_data = pd.read_excel(data_file_path)
# Show a statistical summary of the data
df_data.describe()
"""
pclass survived ... fare body
count 1309.000000 1309.000000 ... 1308.000000 121.000000
mean 2.294882 0.381971 ... 33.295479 160.809917
std 0.837836 0.486055 ... 51.758668 97.696922
min 1.000000 0.000000 ... 0.000000 1.000000
25% 2.000000 0.000000 ... 7.895800 72.000000
50% 3.000000 0.000000 ... 14.454200 155.000000
75% 3.000000 1.000000 ... 31.275000 256.000000
max 3.000000 1.000000 ... 512.329200 328.000000
[8 rows x 7 columns]
"""
# 3. Select the columns of interest
selected_cols=['survived','name','pclass','sex','age','sibsp','parch','fare','embarked']
selected_df_data=df_data[selected_cols]
# Find the columns that contain missing values
selected_df_data.isnull().any()
"""
survived False
name False
pclass False
sex False
age True
sibsp False
parch False
fare True
embarked True
dtype: bool
"""
# 4、数据预处理
# # 填充null值
# # 为缺失age记录填充值 设置为平均值
# age_mean_value= selected_df_data['age'].mean()
# selected_df_data['age'] = selected_df_data['age'].fillna(age_mean_value)
# # 为缺失fare记录填充值
# fare_mean_value=selected_df_data['fare'].mean()
# selected_df_data['fare'] = selected_df_data['fare'].fillna(fare_mean_value)
# # 为缺失embarked记录填充值
# selected_df_data['embarked']=selected_df_data['embarked'].fillna('S')
# # 转换编码
# # 性别sex由字符串转换为数字编码
# selected_df_data['sex'] = selected_df_data['sex'].map({'female':0,'male':1}).astype(int)
# # 港口embarked由字母表示转换为数字编码
# selected_df_data['embarked'] = selected_df_data['embarked'].map({'C':0,'Q':1,'S':2}).astype(int)
# # 显示前3行数据
# selected_df_data[:3]
# """
# survived name pclass ... parch fare embarked
# 0 1 Allen, Miss. Elisabeth Walton 1 ... 0 211.3375 2
# 1 1 Allison, Master. Hudson Trevor 1 ... 2 151.5500 2
# 2 0 Allison, Miss. Helen Loraine 1 ... 2 151.5500 2
# [3 rows x 9 columns]
# """
# # 删除name字段
# selected_df_data = selected_df_data.drop(['name'],axis=1) # axis=1表示删除列
# selected_df_data[:3]
# """
# survived pclass sex age sibsp parch fare embarked
# 0 1 1 0 29.0000 0 0 211.3375 2
# 1 1 1 1 0.9167 1 2 151.5500 2
# 2 0 1 0 2.0000 1 2 151.5500 2
# """
# # 分离特征值和标签值
# # 转换为ndarray数组
# ndarray_data = selected_df_data.values
# # 后7列是特征值
# features = ndarray_data[:,1:]
# # 第0列是标签值
# label = ndarray_data[:,0]
# # 特征值标准化
# from sklearn import preprocessing
# minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,1))
# norm_features = minmax_scale.fit_transform(features)
# norm_features[:3]
# """
# array([[0. , 0. , 0.36116884, 0. , 0. , 0.41250333, 1. ],
# [0. , 1. , 0.00939458, 0.125 , 0.22222222, 0.2958059 , 1. ],
# [0. , 0. , 0.0229641 , 0.125 , 0.22222222, 0.2958059 , 1. ]])
# """
# 定义数据预处理函数
from sklearn import preprocessing
def prepare_data(df_data):
    """Preprocess the Titanic passenger DataFrame into model-ready arrays.

    Steps: drop the 'name' column, fill missing values, encode the
    categorical fields as integers, then min-max scale every feature
    column to the [0, 1] range.

    Args:
        df_data: DataFrame with columns ['survived', 'name', 'pclass',
            'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked'].

    Returns:
        (norm_features, label): features scaled to [0, 1], shape (n, 7),
        and the 'survived' column as a 1-D array of length n.
    """
    df = df_data.drop(['name'], axis=1)  # names carry no numeric signal
    # Fill missing ages and fares with the column mean.
    df['age'] = df['age'].fillna(df['age'].mean())
    df['fare'] = df['fare'].fillna(df['fare'].mean())
    # Encode sex as 0 (female) / 1 (male).
    df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)
    # Missing embarkation port defaults to 'S', then encode C/Q/S as 0/1/2.
    df['embarked'] = df['embarked'].fillna('S')
    df['embarked'] = df['embarked'].map({'C': 0, 'Q': 1, 'S': 2}).astype(int)
    ndarray_data = df.values          # all-numeric -> float64 ndarray
    features = ndarray_data[:, 1:]    # columns 1..7 are the features
    label = ndarray_data[:, 0]        # column 0 is 'survived'
    # Min-max scale each feature column to [0, 1]. Same math as
    # sklearn's MinMaxScaler(feature_range=(0, 1)), including its
    # "constant column -> divide by 1" rule, without the dependency.
    # NOTE(review): the scaling is re-fit on whatever frame is passed in,
    # so train and later prediction calls use slightly different scales
    # -- TODO confirm this is intended.
    col_min = features.min(axis=0)
    col_range = features.max(axis=0) - col_min
    col_range[col_range == 0] = 1  # avoid division by zero
    norm_features = (features - col_min) / col_range
    return norm_features, label
# 5. Data preparation (shuffle the rows, then split into train/test sets)
shuffled_df_data = selected_df_data.sample(frac=1)
x_data,y_data = prepare_data(shuffled_df_data) # preprocessed, shuffled data set
# Split into training (80%) and test (20%) sets
train_size = int(len(x_data)*0.8)
x_train = x_data[:train_size]
y_train = y_data[:train_size]
x_test = x_data[train_size:]
y_test = y_data[train_size:]
# 6. Build the model
import tensorflow as tf
# Build a Keras Sequential model
model = tf.keras.models.Sequential()
# The input has 7 feature columns; input_shape=(7,) would work as well.
# Input: 7 units. Hidden layer 1: 64 units. Hidden layer 2: 32 units. Output: 1 unit.
model.add(tf.keras.layers.Dense(units=64,
                                input_dim=7,
                                use_bias=True,
                                kernel_initializer='uniform',
                                bias_initializer='zeros',
                                activation='relu'))
model.add(tf.keras.layers.Dense(units=32,
                                activation='sigmoid'))
model.add(tf.keras.layers.Dense(units=1,
                                activation='sigmoid'))
model.summary()
"""
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense (Dense) (None, 64) 512
_________________________________________________________________
dense_1 (Dense) (None, 32) 2080
_________________________________________________________________
dense_2 (Dense) (None, 1) 33
=================================================================
Total params: 2,625
Trainable params: 2,625
Non-trainable params: 0
_________________________________________________________________
"""
# 7. Model configuration and training
# Optimizer, loss function, metric
model.compile(optimizer=tf.keras.optimizers.Adam(0.003),
              loss='binary_crossentropy',
              metrics=['accuracy'])
# Training callbacks. Other built-ins include
# tf.keras.callbacks.LearningRateScheduler and tf.keras.callbacks.EarlyStopping.
# Build the paths with os.path.join instead of hard-coded backslash
# strings ('.\model4\logs'), which are fragile: a directory starting
# with 't' or 'n' would silently become an escape sequence.
logdir = os.path.join('model4', 'logs')
checkpoint_path = os.path.join('model4', 'checkpoint',
                               'Titanic.{epoch:02d}-{val_loss:.2f}.ckpt')
callbacks = [tf.keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=2),
             tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                save_weights_only=True,
                                                verbose=1,
                                                # NOTE(review): 'period' is deprecated;
                                                # newer TF uses save_freq -- confirm TF version.
                                                period=5)]  # save every 5 epochs
# Train the model
train_history = model.fit(x=x_train,
                          y=y_train,
                          validation_split=0.2, # fraction of training data held out for validation
                          epochs=100,
                          batch_size=40,
                          callbacks=callbacks,
                          verbose=2) # progress mode (0: silent, 1: progress bar, 2: one line per epoch)
train_history.history  # per-epoch values of every tracked metric
train_history.history.keys()
# dict_keys(['loss', 'acc', 'val_loss', 'val_acc'])
# stored as a dict: metric name -> list of per-epoch values
# 8. Visualize the training history
import matplotlib.pyplot as plt

def visu_train_history(train_history, train_metric, validation_metric):
    """Plot a training metric against its validation counterpart per epoch."""
    plt.plot(train_history.history[train_metric])
    plt.plot(train_history.history[validation_metric])
    plt.title('Train History')
    plt.ylabel(train_metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

# TF2/Keras records the metric under 'accuracy'; older versions used 'acc'.
# Pick whichever key is present to avoid a KeyError on newer TensorFlow.
_acc_key = 'accuracy' if 'accuracy' in train_history.history else 'acc'
visu_train_history(train_history, _acc_key, 'val_' + _acc_key)
visu_train_history(train_history, 'loss', 'val_loss')
# 9. Model evaluation
evaluate_result = model.evaluate(x=x_test,y=y_test)
# 262/262 [==============================] - 0s 46us/sample - loss: 0.4582 - acc: 0.7901
evaluate_result
# [0.45818260471329436, 0.7900763]
model.metrics_names # labels for the values returned by evaluate()
# ['loss', 'acc']
# 10. Use the model for prediction
# Add new passenger records
selected_cols
"""
['survived',
'name',
'pclass',
'sex',
'age',
'sibsp',
'parch',
'fare',
'embarked']
"""
Jack_info = [0,'Jack',3,'male',23,1,0,5.0000,'S']
Rose_info = [1,'Rose',1,'female',20,1,0,100.0000,'S']
# Build a DataFrame for the new passengers
new_passenger_pd=pd.DataFrame([Jack_info,Rose_info],columns=selected_cols)
# Append the new passengers to the original DataFrame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported, behaviorally-equivalent replacement.
all_passenger_pd=pd.concat([selected_df_data,new_passenger_pd])
all_passenger_pd[-3:]
"""
survived name pclass ... parch fare embarked
1308 0 Zimmerman, Mr. Leo 3 ... 0 7.875 S
0 0 Jack 3 ... 0 5.000 S
1 1 Rose 1 ... 0 100.000 S
[3 rows x 9 columns]
"""
# Data preparation
# NOTE(review): prepare_data re-fits the min-max scaling on this frame
# (training data + 2 new rows), so feature scaling differs slightly from
# the scaling used during training -- TODO confirm this is intended.
x_features, y_label=prepare_data(all_passenger_pd)
# Compute each passenger's survival probability with the model
surv_probability=model.predict(x_features)
# Insert the survival probability as the last column
all_passenger_pd.insert(len(all_passenger_pd.columns),'surv_probability',surv_probability)
all_passenger_pd[-5:]
"""
survived name ... embarked surv_probability
1306 0 Zakarian, Mr. Mapriededer ... C 0.240120
1307 0 Zakarian, Mr. Ortin ... C 0.231930
1308 0 Zimmerman, Mr. Leo ... S 0.100012
0 0 Jack ... S 0.099771
1 1 Rose ... S 0.974901
[5 rows x 10 columns]
"""
3、模型恢复
# -*- coding: utf-8 -*-
"""
Created on Wed May 13 15:24:57 2020
@author: DELL
从CheckPoint文件中恢复模型
"""
# 1. Download the passenger data set
import os
# Data source: http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls
data_file_path = "data/titanic3.xls"
if os.path.isfile(data_file_path):
    # Fixed typo in the message: 'eists' -> 'exists'
    print(data_file_path, 'data file already exists.')
# 2. Use pandas for data handling
import numpy
import pandas as pd
# Read the data file into a DataFrame
df_data = pd.read_excel(data_file_path)
# 3. Select the columns of interest
selected_cols=['survived','name','pclass','sex','age','sibsp','parch','fare','embarked']
selected_df_data=df_data[selected_cols]
# 4、数据预处理
# 定义数据预处理函数
from sklearn import preprocessing
def prepare_data(df_data):
    """Preprocess the Titanic passenger DataFrame into model-ready arrays.

    Steps: drop the 'name' column, fill missing values, encode the
    categorical fields as integers, then min-max scale every feature
    column to the [0, 1] range.

    Args:
        df_data: DataFrame with columns ['survived', 'name', 'pclass',
            'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked'].

    Returns:
        (norm_features, label): features scaled to [0, 1], shape (n, 7),
        and the 'survived' column as a 1-D array of length n.
    """
    df = df_data.drop(['name'], axis=1)  # names carry no numeric signal
    # Fill missing ages and fares with the column mean.
    df['age'] = df['age'].fillna(df['age'].mean())
    df['fare'] = df['fare'].fillna(df['fare'].mean())
    # Encode sex as 0 (female) / 1 (male).
    df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)
    # Missing embarkation port defaults to 'S', then encode C/Q/S as 0/1/2.
    df['embarked'] = df['embarked'].fillna('S')
    df['embarked'] = df['embarked'].map({'C': 0, 'Q': 1, 'S': 2}).astype(int)
    ndarray_data = df.values          # all-numeric -> float64 ndarray
    features = ndarray_data[:, 1:]    # columns 1..7 are the features
    label = ndarray_data[:, 0]        # column 0 is 'survived'
    # Min-max scale each feature column to [0, 1]. Same math as
    # sklearn's MinMaxScaler(feature_range=(0, 1)), including its
    # "constant column -> divide by 1" rule, without the dependency.
    # NOTE(review): the scaling is re-fit on whatever frame is passed in,
    # so the restored model sees scales fit on this run's data
    # -- TODO confirm this matches the training-time scaling.
    col_min = features.min(axis=0)
    col_range = features.max(axis=0) - col_min
    col_range[col_range == 0] = 1  # avoid division by zero
    norm_features = (features - col_min) / col_range
    return norm_features, label
# 5. Data preparation (shuffle the rows, then split into train/test sets)
shuffled_df_data = selected_df_data.sample(frac=1)
x_data,y_data = prepare_data(shuffled_df_data) # preprocessed, shuffled data set
# Split into training (80%) and test (20%) sets
train_size = int(len(x_data)*0.8)
x_train = x_data[:train_size]
y_train = y_data[:train_size]
x_test = x_data[train_size:]
y_test = y_data[train_size:]
# 6. Build the model (must match the architecture that produced the checkpoint)
import tensorflow as tf
# Build a Keras Sequential model
model = tf.keras.models.Sequential()
# The input has 7 feature columns; input_shape=(7,) would work as well.
# Input: 7 units. Hidden layer 1: 64 units. Hidden layer 2: 32 units. Output: 1 unit.
model.add(tf.keras.layers.Dense(units=64,
                                input_dim=7,
                                use_bias=True,
                                kernel_initializer='uniform',
                                bias_initializer='zeros',
                                activation='relu'))
model.add(tf.keras.layers.Dense(units=32,
                                activation='sigmoid'))
model.add(tf.keras.layers.Dense(units=1,
                                activation='sigmoid'))
model.summary()
"""
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense (Dense) (None, 64) 512
_________________________________________________________________
dense_1 (Dense) (None, 32) 2080
_________________________________________________________________
dense_2 (Dense) (None, 1) 33
=================================================================
Total params: 2,625
Trainable params: 2,625
Non-trainable params: 0
_________________________________________________________________
"""
# 7. Model configuration
# Optimizer, loss function, metric
model.compile(optimizer=tf.keras.optimizers.Adam(0.003),
              loss='binary_crossentropy',
              metrics=['accuracy'])
# With the architecture defined, restore the weights from the latest
# checkpoint. Raw strings keep the Windows backslashes from being read
# as escape sequences ('\p', '\T', ... are invalid escapes and a
# SyntaxWarning/error on modern Python).
logdir = r'E:\python_learning\model4\logs'
checkpoint_path = r'E:\python_learning\model4\checkpoint\Titanic.{epoch:02d}-{val_loss:.2f}.ckpt'
checkpoint_dir = os.path.dirname(checkpoint_path)
latest = tf.train.latest_checkpoint(checkpoint_dir)  # newest checkpoint file, or None
latest
# e.g. '.\\model4\\checkpoint\\Titanic.100-0.45.ckpt'
model.load_weights(latest)
#
# Evaluate the restored model
loss,acc=model.evaluate(x_test,y_test)
print("Restored model,accuracy:{:5.2f}%".format(100*acc))
"""
262/262 [==============================] - 0s 187us/sample - loss: 0.4141 - acc: 0.8282
Restored model,accuracy:82.82%
"""
以上。参考Mooc课程吴明晖老师的《深度学习应用开发-TensorFlow实践》。