import pandas as pd

train_file = "./titanic/train.csv"
train_df = pd.read_csv(train_file)  # read the CSV into a DataFrame
y_train = train_df.pop("survived")  # pop the label column
print(train_df.head(5))             # print the first 5 rows
train_df.describe()                 # summary statistics for each column
train_df.age.hist(bins=20)          # histogram; train_df.age is a Series
train_df.sex.value_counts()         # value counts; the result is still a Series
train_df['sex'].unique()            # distinct values in the column
tf.feature_column.xxx_column
Feature columns are what an estimator is built from. They fall into two kinds, categorical_column and numeric_column; creating one takes the column name plus extra information (e.g. dtype or a vocabulary). A categorical column must additionally be wrapped in indicator_column (which one-hot encodes it) before a dense model can consume it.
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck', 'embark_town', 'alone']
numeric_columns = ['age', 'fare']
feature_columns = []
for categorical_column in categorical_columns:
    vocab = train_df[categorical_column].unique()  # vocabulary: array of distinct values
    feature_columns.append(
        tf.feature_column.indicator_column(  # categorical columns must be wrapped in indicator_column
            tf.feature_column.categorical_column_with_vocabulary_list(  # categorical column
                categorical_column,  # column name (str)
                vocab                # vocabulary list
            )
        )
    )
for numeric_column in numeric_columns:  # numeric columns
    feature_columns.append(
        tf.feature_column.numeric_column(numeric_column, dtype=tf.float32)
    )
# each batch x is a dict: {'sex': tf.Tensor([1 0 0 1 1]), 'age': tf.Tensor([40. 0.83 48. 49. 36.]), ...}
age_column = feature_columns[7]     # numeric 'age' column (indices 0-6 are the categorical columns)
gender_column = feature_columns[0]  # indicator column for 'sex'
for x, y in train_dataset.take(1):  # train_dataset = make_dataset(...), see the sketch below
    print(keras.layers.DenseFeatures(age_column)(x).numpy())
    print(keras.layers.DenseFeatures(gender_column)(x).numpy())
The DenseFeatures layer takes one or more feature_columns as input; calling it on a batch of examples (its __call__ method) returns the corresponding dense tensor. DenseFeatures also turns categorical (indicator) columns into their one-hot form. It is used as the input layer of a model.
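The make_dataset input_fn used below and the Keras model converted in the next step are not defined in these notes; a minimal sketch consistent with how they are called (batch_size/shuffle defaults and the layer sizes are assumptions):

def make_dataset(data_df, label_df, epochs=10, shuffle=True, batch_size=32):
    # build (feature-dict, label) batches from the DataFrames
    dataset = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
        dataset = dataset.shuffle(10000)
    return dataset.repeat(epochs).batch(batch_size)

train_dataset = make_dataset(train_df, y_train, batch_size=5)  # used in the loop above

model = keras.models.Sequential([
    keras.layers.DenseFeatures(feature_columns),  # input layer: feature dict -> dense tensor
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(2, activation='softmax'),
])
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])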
estimator = keras.estimator.model_to_estimator(model)  # wrap the compiled Keras model as an Estimator
estimator.train(input_fn=lambda: make_dataset(train_df, y_train, epochs=100))
dnn_estimator = tf.estimator.DNNClassifier(
    model_dir=dnn_output_dir,
    n_classes=2,
    feature_columns=feature_columns,
    hidden_units=[128, 128],
    activation_fn=tf.nn.relu,
    optimizer='Adam'
)
dnn_estimator.train(input_fn=lambda: make_dataset(train_df, y_train, epochs=100))
dnn_estimator.evaluate(input_fn=lambda: make_dataset(eval_df, y_eval, epochs=1, shuffle=False))
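After training and evaluation, per-example predictions can be pulled from the estimator as well; a hedged sketch (canned classifiers yield dicts with 'logits', 'probabilities' and 'class_ids'):

predictions = list(dnn_estimator.predict(
    input_fn=lambda: make_dataset(eval_df, y_eval, epochs=1, shuffle=False)))
survive_probs = [pred['probabilities'][1] for pred in predictions]  # P(survived = 1)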
This classifier ignores feature values and will learn to predict the average value of each label. For single-label problems, this will predict the probability distribution of the classes as seen in the labels. For multi-label problems, this will predict the fraction of examples that are positive for each class.
In short: the baseline ignores all feature columns and learns only the average value/distribution of each label, which makes it a useful sanity-check lower bound.
baseline_estimator = tf.compat.v1.estimator.BaselineClassifier(
    model_dir=output_dir,
    n_classes=2
)
baseline_estimator.train(input_fn=lambda: make_dataset(train_df, y_train, epochs=100))
baseline_estimator.evaluate(input_fn=lambda: make_dataset(eval_df, y_eval, epochs=1, shuffle=False, batch_size=20))
Train a linear model to classify instances into one of multiple possible classes. When the number of possible classes is 2, this is binary classification.
linear_estimator = tf.estimator.LinearClassifier(
    model_dir=linear_output_dir,
    n_classes=2,
    feature_columns=feature_columns
)
linear_estimator.train(input_fn=lambda: make_dataset(train_df, y_train, epochs=100))
linear_estimator.evaluate(input_fn=lambda: make_dataset(eval_df, y_eval, epochs=1, shuffle=False))
tf.compat.v1.disable_eager_execution()  # disable eager execution (build a static graph)
new_layer = x  # start from the input placeholder (fed via feed_dict below)
for hidden_unit in hidden_units:
    new_layer = tf.compat.v1.layers.dense(new_layer, hidden_unit, activation=tf.nn.relu)
# last hidden output * W --> logits --softmax--> prob
# 1. logits --softmax--> prob
# 2. labels --> one_hot
# 3. calculate cross entropy
logits = tf.compat.v1.layers.dense(new_layer, class_num)
loss = tf.compat.v1.losses.sparse_softmax_cross_entropy(labels=y, logits=logits)
correct_prediction = tf.equal(tf.argmax(logits, 1), y)  # compare predicted class index with the label
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float64))
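The optimizer step and the variable initializer used in the session below are not shown in these notes; a minimal sketch, assuming Adam (any tf.compat.v1.train optimizer would do):

# assumed, not from the notes: training op and variable initialization
train_op = tf.compat.v1.train.AdamOptimizer(1e-3).minimize(loss)
init = tf.compat.v1.global_variables_initializer()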
with tf.compat.v1.Session() as sess:
    sess.run(init)
    for epoch in range(epochs):
        for step in range(train_steps_per_epoch):
            batch_data = x_train_scaled[step * batch_size : (step + 1) * batch_size]
            batch_label = y_train[step * batch_size : (step + 1) * batch_size]
            _, loss_eval, accuracy_eval = sess.run([train_op, loss, accuracy], feed_dict={
                x: batch_data,
                y: batch_label
            })
            print("\r[Train] epoch: %d, step: %d, loss: %3.5f, accuracy: %2.2f"
                  % (epoch, step, loss_eval, accuracy_eval), end="")
        valid_accuracy = eval_with_sess(sess, x, y, accuracy,
                                        x_valid_scaled, y_valid, batch_size)
        print("\n[Valid] accuracy: %2.2f" % valid_accuracy)
logits = tf.compat.v1.layers.dense(new_layer, class_num)
loss = tf.compat.v1.losses.sparse_softmax_cross_entropy(labels=y, logits=logits)
Measures the probability error in discrete classification tasks in which the classes are mutually exclusive.
WARNING: This op expects unscaled logits, since it performs a softmax on logits internally for efficiency. Do not call this op with the output of softmax, as it will produce incorrect results.
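To make that warning concrete: the loss equals the negative log of the softmax probability assigned to the true class, so the op must see raw logits. A quick numpy check (values are illustrative):

import numpy as np

logits = np.array([[2.0, 0.5, -1.0]])  # unscaled scores for one example
label = 0                              # index of the true class

probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)  # softmax
manual_loss = -np.log(probs[0, label])  # sparse softmax cross entropy by hand
print(manual_loss)  # ~0.241; feeding probs instead of logits would give a wrong loss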
tf.compat.v1.disable_eager_execution()  # disable eager first: everything below builds a static graph
dataset = tf.compat.v1.data.Dataset.from_tensor_slices((images, labels))  # images/labels: in-memory numpy arrays
x, y = dataset.make_one_shot_iterator().get_next()  # one-shot iterators need no explicit initialization
with tf.compat.v1.Session() as sess:
    x_val, y_val = sess.run([x, y])
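A one-shot iterator cannot be reset; the usual pattern is to keep running get_next until the dataset raises OutOfRangeError:

with tf.compat.v1.Session() as sess:
    try:
        while True:
            x_val, y_val = sess.run([x, y])  # consume one element per run call
    except tf.errors.OutOfRangeError:
        pass  # dataset exhausted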
images_placeholder = tf.compat.v1.placeholder(tf.float32, [None, 28*28])
labels_placeholder = tf.compat.v1.placeholder(tf.float32, [None, ])
dataset = tf.compat.v1.data.Dataset.from_tensor_slices((images_placeholder, labels_placeholder))
dataset_iter = dataset.make_initializable_iterator()  # data is bound at initialization time
x, y = dataset_iter.get_next()
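Unlike the one-shot iterator, the initializable iterator must be initialized explicitly, which is when the placeholders are fed; a sketch, assuming train_images/train_labels are numpy arrays (hypothetical names):

with tf.compat.v1.Session() as sess:
    # placeholders are fed once, at iterator initialization (not per get_next call)
    sess.run(dataset_iter.initializer, feed_dict={
        images_placeholder: train_images,  # hypothetical numpy arrays
        labels_placeholder: train_labels,
    })
    x_val, y_val = sess.run([x, y])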