1. Dataset download links
2. Exploratory Data Analysis (EDA)
3. Feature engineering
4. Building the Dataset and the Model
Using fit and a custom estimator
Using pre-defined estimators
# https://storage.googleapis.com/tf-datasets/titanic/train.csv
# https://storage.googleapis.com/tf-datasets/titanic/eval.csv
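The two CSVs can be fetched directly with tf.keras.utils.get_file; a minimal sketch (the cache_dir/cache_subdir choice, which drops the files into the ./Titanic/ folder used below, is an assumption):

import tensorflow as tf

# download the Titanic train/eval splits into ./Titanic/
# (keras falls back to a temp cache dir if '.' is not writable)
train_path = tf.keras.utils.get_file(
    'train.csv',
    'https://storage.googleapis.com/tf-datasets/titanic/train.csv',
    cache_dir='.', cache_subdir='Titanic')
eval_path = tf.keras.utils.get_file(
    'eval.csv',
    'https://storage.googleapis.com/tf-datasets/titanic/eval.csv',
    cache_dir='.', cache_subdir='Titanic')
print(train_path, eval_path)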
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras
print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)
# Read the CSV files
train_file = './Titanic/train.csv'
eval_file = './Titanic/eval.csv'
train_df = pd.read_csv(train_file)
eval_df = pd.read_csv(eval_file)
print(train_df.head())
print(eval_df.head())
# Split the label off from the features
y_train = train_df.pop('survived')
y_eval = eval_df.pop('survived')
print(y_train.head())
print(y_eval.head())
train_df.describe() # descriptive statistics
eval_df.describe()
print(train_df.shape,eval_df.shape)
train_df.age.hist(bins=20) # split ages into 20 bins
train_df.sex.value_counts().plot(kind='barh') # counts as a horizontal bar chart; use kind='bar' for a vertical one
train_df['class'].value_counts().plot(kind='barh')
pd.concat([train_df, y_train], axis=1).groupby('sex').survived.mean().plot(kind='barh') # join features and label column-wise, then survival rate by sex
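For comparison, the same survival-rate grouping can be done by passenger class (an extra EDA view, not part of the original):

pd.concat([train_df, y_train], axis=1).groupby('class').survived.mean().plot(kind='barh')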
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class',
                       'deck', 'embark_town', 'alone']
numeric_columns = ['age', 'fare']
feature_columns = []
for categorical_column in categorical_columns:
    vocab = train_df[categorical_column].unique()  # all possible values of this column
    print(categorical_column, vocab)
    feature_columns.append(
        tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                categorical_column, vocab))
    )  # append the feature column: the categorical feature becomes a one-hot encoding
for numeric_column in numeric_columns:
    feature_columns.append(
        tf.feature_column.numeric_column(numeric_column, dtype=tf.float32)
    )
# Add a cross feature: age (1,2,3,4) x gender (male, female)
# age_x_gender: [(1, male), (2, male), (3, male) ... (4, female)]
# A crossed column must be wrapped (here in an indicator_column) before it can be used; adding crosses does not always help, it depends on the model
# hash_bucket_size: hash the cross into a fixed number of buckets (e.g. hash(...) % 100) so the huge sparse cross product does not overflow memory
feature_columns.append(
    tf.feature_column.indicator_column(
        tf.feature_column.crossed_column(['age', 'sex'], hash_bucket_size=100)))
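Conceptually, the crossed column maps every (age, sex) combination into one of hash_bucket_size buckets instead of keeping the full cross product; a simplified illustration of the idea (TensorFlow uses its own fingerprint hash internally, so bucket_for here is only a hypothetical stand-in):

def bucket_for(age, sex, hash_bucket_size=100):
    # combine the two raw values and map the pair onto a fixed number of buckets
    # (Python's built-in hash is salted per process, so this is illustrative only)
    return hash('{}_x_{}'.format(age, sex)) % hash_bucket_size

print(bucket_for(28.0, 'male'), bucket_for(28.0, 'female'))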
# feature_columns is essentially a list of transformation rules that the input layer looks up later
print(feature_columns)
# Build the dataset
def make_dataset(data_df, label_df, epochs=10, shuffle=True,
                 batch_size=32):
    # convert the DataFrame into a tf.data.Dataset of tensors
    dataset = tf.data.Dataset.from_tensor_slices(
        (dict(data_df), label_df))
    if shuffle:
        dataset = dataset.shuffle(10000)
    dataset = dataset.repeat(epochs).batch(batch_size)
    return dataset
# keras.layers.DenseFeatures connects the dataset with the feature_columns
# the feature_columns are just transformation rules; the dataset holds the actual data
# The cells below are demo code
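A small dataset to feed the demo loops below; batch_size=5 is an arbitrary choice for inspection:

train_dataset = make_dataset(train_df, y_train, batch_size=5)
for x, y in train_dataset.take(1):
    print(x, y)  # x is a dict of feature tensors, y is the label batch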
for x, y in train_dataset.take(1):
    age_column = feature_columns[7]
    gender_column = feature_columns[0]
    print(keras.layers.DenseFeatures(age_column)(x).numpy())
    print(keras.layers.DenseFeatures(gender_column)(x).numpy())
for x, y in train_dataset.take(1):
    print(keras.layers.DenseFeatures(feature_columns)(x).numpy())
# Build the keras model
model = keras.models.Sequential(
    [
        # the first layer turns the raw features into dense inputs
        keras.layers.DenseFeatures(feature_columns),
        # hidden layers as usual
        keras.layers.Dense(100, activation='relu'),
        keras.layers.Dense(100, activation='relu'),
        # output layer
        keras.layers.Dense(2, activation='softmax'),
    ]
)
# Compile the model
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=keras.optimizers.SGD(learning_rate=0.01),
              metrics=['accuracy'])
# 1. model.fit
# 2. model -> estimator -> train (custom estimator)
# Build the train/eval datasets
train_dataset = make_dataset(train_df,y_train,epochs=100)
eval_dataset = make_dataset(eval_df,y_eval,epochs=1,shuffle=False)
# steps_per_epoch comes from number of samples / batch_size; 20 is on the large side here, so the data can run short
history = model.fit(train_dataset,
                    validation_data=eval_dataset,
                    steps_per_epoch=20,
                    validation_steps=8,
                    epochs=100)
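A quick sketch for plotting the curves stored in history (the keys follow the loss/metrics set in compile above, e.g. 'loss', 'accuracy', 'val_loss', 'val_accuracy' in TF 2.x):

def plot_learning_curves(history):
    # history.history maps each metric name to its per-epoch values
    pd.DataFrame(history.history).plot(figsize=(8, 5))
    plt.grid(True)
    plt.show()

plot_learning_curves(history)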
estimator = keras.estimator.model_to_estimator(keras_model=model) # convert the keras model into an estimator
# input_fn takes no arguments
# and must return either a. a (features, labels) tuple or b. a dataset yielding (feature, label) pairs
# This step can throw odd errors,
# e.g. InternalError: GPU sync failed -- restarting the kernel fixes it
estimator.train(input_fn=lambda: make_dataset(train_df, y_train, epochs=100))
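If training succeeds, the converted estimator can be scored the same way; a sketch reusing make_dataset for a single pass over the eval set:

estimator.evaluate(input_fn=lambda: make_dataset(
    eval_df, y_eval, epochs=1, shuffle=False))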
# Using pre-defined estimators
output_dir = 'baseline_model_cross'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
# The API path depends on the TensorFlow version; with TF 2.x this lives under tf.compat.v1.estimator
baseline_estimator = tf.compat.v1.estimator.BaselineClassifier(
    model_dir=output_dir, n_classes=2)
baseline_estimator.train(input_fn=lambda: make_dataset(
    train_df, y_train, epochs=100))
# The baseline estimator ignores the features and just predicts from the label distribution of the training set, i.e. little better than guessing
baseline_estimator.evaluate(input_fn=lambda:make_dataset(
eval_df,y_eval,epochs=1,shuffle=False,batch_size=20))
linear_output_dir = 'linear_model_cross'
if not os.path.exists(linear_output_dir):
    os.mkdir(linear_output_dir)
linear_estimator = tf.compat.v1.estimator.LinearClassifier(
    model_dir=linear_output_dir,
    n_classes=2, feature_columns=feature_columns)
linear_estimator.train(input_fn=lambda:make_dataset(
train_df,y_train,epochs=100))
# tensorboard --logdir linear_model_cross
# for multiple models: tensorboard --logdir ./
# the training curves can then be viewed in TensorBoard
linear_estimator.evaluate(input_fn=lambda:make_dataset(
eval_df,y_eval,batch_size=20,epochs=1,shuffle=False
))
dnn_output_dir ='dnn_model_cross'
if not os.path.exists(dnn_output_dir):
    os.mkdir(dnn_output_dir)
dnn_estimator = tf.compat.v1.estimator.DNNClassifier(
    model_dir=dnn_output_dir,
    n_classes=2,
    feature_columns=feature_columns,
    hidden_units=[128, 128],  # two hidden layers
    activation_fn=tf.nn.relu,
    optimizer='Adam'
)
dnn_estimator.train(input_fn=lambda: make_dataset(
train_df,y_train,epochs=100))
dnn_estimator.evaluate(input_fn=lambda:make_dataset(
eval_df,y_eval,batch_size=20,shuffle=False,epochs=1))
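For completeness, a sketch of pulling per-passenger predictions out of the trained DNN estimator (predict returns a generator of dicts containing, among others, 'class_ids' and 'probabilities'):

predictions = dnn_estimator.predict(input_fn=lambda: make_dataset(
    eval_df, y_eval, epochs=1, shuffle=False, batch_size=20))
# show the first few predicted classes and their probabilities
for pred in list(predictions)[:5]:
    print(pred['class_ids'], pred['probabilities'])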