Python项目实战-Tensorflow2.0实现泰坦尼克生存预测

目录

一、数据集下载地址

二、探索性数据分析(EDA)

三、特征工程

四、构建Dataset与Model

fit和自定义estimator使用

预定义estimator的使用


一、数据集下载地址

# https://storage.googleapis.com/tf-datasets/titanic/train.csv
# https://storage.googleapis.com/tf-datasets/titanic/eval.csv

二、探索性数据分析(EDA)

# Third-party imports for the tutorial.
# NOTE: `%matplotlib inline` is an IPython/Jupyter magic, not valid plain
# Python — this cell only runs inside a notebook kernel.
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Print interpreter and library versions for reproducibility.
print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)
# Load the Titanic train/eval splits from CSV.
train_file = './Titanic/train.csv'
eval_file = './Titanic/eval.csv'
train_df = pd.read_csv(train_file)
eval_df = pd.read_csv(eval_file)
print(train_df.head())
print(eval_df.head())

# Split the label column off from the feature columns (pop mutates the
# frames in place, leaving only features behind).
y_train = train_df.pop('survived')
y_eval = eval_df.pop('survived')
print(y_train.head())
print(y_eval.head())

# Descriptive statistics and basic shape checks.
train_df.describe()
eval_df.describe()
print(train_df.shape, eval_df.shape)

# Quick visual EDA: age histogram over 20 bins, horizontal bar charts of
# category counts, and mean survival rate grouped by sex
# (concat re-attaches the label column-wise for the groupby).
train_df['age'].hist(bins=20)
train_df['sex'].value_counts().plot(kind='barh')
train_df['class'].value_counts().plot(kind='barh')
pd.concat([train_df, y_train], axis=1).groupby('sex').survived.mean().plot(kind='barh')

三、特征工程

# Feature engineering: categorical features become one-hot indicator
# columns, numeric features pass through as float32.
categorical_columns = ['sex','n_siblings_spouses','parch','class',
                      'deck','embark_town','alone']
numeric_columns = ['age','fare']
feature_columns = []
for categorical_column in categorical_columns:
    # The vocabulary is every distinct value observed in the training data.
    vocab = train_df[categorical_column].unique()
    print(categorical_column, vocab)
    # indicator_column wraps the vocabulary lookup so it yields a one-hot
    # dense vector usable by DenseFeatures.
    feature_columns.append(
        tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                categorical_column, vocab)))
# FIX: the original loop reused the name `categorical_column` for numeric
# columns, which was misleading — renamed to `numeric_column`.
for numeric_column in numeric_columns:
    feature_columns.append(
        tf.feature_column.numeric_column(numeric_column, dtype=tf.float32))
# Cross feature, e.g. age x sex: [(1,male),(2,male),...,(4,female)].
# A cross must be wrapped (indicator_column) before use; whether it helps
# depends on the model. hash_bucket_size bounds the crossed space
# (hash % 100) to avoid a huge sparse one-hot / memory blow-up.
feature_columns.append(
    tf.feature_column.indicator_column(
        tf.feature_column.crossed_column(['age','sex'],
                                         hash_bucket_size=100)))
# feature_columns acts as a transformation spec (like a lookup dict)
# consumed later by DenseFeatures / the estimators.
print(feature_columns)

四、构建Dataset与Model

fit和自定义estimator使用

# 构建dataset
def make_dataset(data_df, label_df, epochs=10, shuffle=True,
                 batch_size=32):
    """Build a tf.data.Dataset of (feature-dict, label) pairs.

    The DataFrame is converted column-wise into a dict of tensors,
    optionally shuffled with a 10k-element buffer, then repeated
    `epochs` times and batched.
    """
    ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
        ds = ds.shuffle(10000)
    return ds.repeat(epochs).batch(batch_size)

# keras.layers.DenseFeature链接dataset和feature_columns
# feature_columns其实本质是变换的规则,dataset是数据集
# 下面是实验代码
# Demo: keras.layers.DenseFeatures bridges the dataset and the
# feature_columns spec — the columns are the transformation rules, the
# dataset supplies the raw feature dict.
# FIX: the original used `train_dataset` before it was ever created
# (it was only defined further down); build it here first.
train_dataset = make_dataset(train_df, y_train, epochs=100)

for x, y in train_dataset.take(1):
    # Apply one column spec at a time to inspect its output.
    age_column = feature_columns[7]      # numeric 'age' column (7 categorical columns precede it)
    gender_column = feature_columns[0]   # one-hot 'sex' column
    print(keras.layers.DenseFeatures(age_column)(x).numpy())
    print(keras.layers.DenseFeatures(gender_column)(x).numpy())

for x, y in train_dataset.take(1):
    # Apply the full spec: the concatenated dense representation.
    print(keras.layers.DenseFeatures(feature_columns)(x).numpy())


# 构建keras模型
# Keras model: DenseFeatures as the first layer turns the raw feature
# dict into one dense vector, followed by two hidden layers and a
# 2-way softmax head (died / survived).
model = keras.models.Sequential(
    [
        keras.layers.DenseFeatures(feature_columns),
        keras.layers.Dense(100, activation='relu'),
        keras.layers.Dense(100, activation='relu'),
        keras.layers.Dense(2, activation='softmax'),
    ]
)
# Labels are integer class ids, hence sparse_categorical_crossentropy.
# FIX: `lr` is a deprecated alias in TF2 Keras optimizers — use
# `learning_rate` instead.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=keras.optimizers.SGD(learning_rate=0.01),
              metrics=['accuracy'])
# Two ways to train: 1. model.fit  2. model -> estimator -> train.
train_dataset = make_dataset(train_df, y_train, epochs=100)
eval_dataset = make_dataset(eval_df, y_eval, epochs=1, shuffle=False)
# steps_per_epoch comes from num_samples / batch_size; per the original
# author's note, 20 is slightly too large for this dataset.
history = model.fit(train_dataset,
                    validation_data=eval_dataset,
                    steps_per_epoch=20,
                    validation_steps=8,
                    epochs=100)


# Wrap the compiled Keras model as a tf.estimator.Estimator.
estimator = keras.estimator.model_to_estimator(keras_model=model)
# input_fn takes no arguments and must return either
#   a. a (features, labels) tuple, or
#   b. a Dataset yielding (feature, label) pairs.
# NOTE(review): the original author hit "InternalError: GPU sync failed"
# here; restarting the kernel reportedly resolves it.
estimator.train(
    input_fn=lambda: make_dataset(train_df, y_train, epochs=100))

预定义estimator的使用

# Pre-made estimator #1: BaselineClassifier — ignores the features and
# predicts from the label distribution alone (essentially random guessing),
# so it serves as a floor to compare other models against.
output_dir = 'baseline_model_cross'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
# tf.compat.v1 is used here because of API differences across TF versions.
baseline_estimator = tf.compat.v1.estimator.BaselineClassifier(
    model_dir=output_dir, n_classes=2)
baseline_estimator.train(
    input_fn=lambda: make_dataset(train_df, y_train, epochs=100))
baseline_estimator.evaluate(
    input_fn=lambda: make_dataset(eval_df, y_eval, epochs=1,
                                  shuffle=False, batch_size=20))


# Pre-made estimator #2: LinearClassifier over the feature columns.
linear_output_dir = 'linear_model_cross'
# FIX: the original condition was inverted (`if os.path.exists(...)`),
# so it never created a missing dir and called mkdir on an existing one
# (raising FileExistsError on the second run).
if not os.path.exists(linear_output_dir):
    os.mkdir(linear_output_dir)
linear_estimator = tf.compat.v1.estimator.LinearClassifier(
    model_dir=linear_output_dir,
    n_classes=2,
    feature_columns=feature_columns)
linear_estimator.train(input_fn=lambda: make_dataset(
    train_df, y_train, epochs=100))
# Inspect training curves with: tensorboard --logdir linear_model_cross
# (or `tensorboard --logdir ./` to compare several model dirs at once).
linear_estimator.evaluate(input_fn=lambda: make_dataset(
    eval_df, y_eval, batch_size=20, epochs=1, shuffle=False))

# Pre-made estimator #3: DNNClassifier with two 128-unit ReLU layers.
dnn_output_dir = 'dnn_model_cross'
if not os.path.exists(dnn_output_dir):
    os.mkdir(dnn_output_dir)
dnn_estimator = tf.compat.v1.estimator.DNNClassifier(
    model_dir=dnn_output_dir,
    n_classes=2,
    feature_columns=feature_columns,
    hidden_units=[128, 128],  # two hidden layers
    activation_fn=tf.nn.relu,
    optimizer='Adam',
)
dnn_estimator.train(input_fn=lambda: make_dataset(
    train_df, y_train, epochs=100))
dnn_estimator.evaluate(input_fn=lambda: make_dataset(
    eval_df, y_eval, batch_size=20, shuffle=False, epochs=1))

 

你可能感兴趣的:(项目实战)