TF Learn入门 —— 稍复杂使用举例

使用 TensorFlow 的 TF.Learn API 解决二分类问题:根据普查中的个人信息(特征),包括年龄、性别、教育程度和职业,预测该人的年收入是否超过 5 万美元(目标标签)。我们将训练一个 logistic regression(逻辑回归)模型,其输出值在 0 和 1 之间,表示该人年收入超过 5 万美元的概率。

读取普查数据

下载数据

import tempfile
import urllib
# Training split: reserve a temporary file, then download the CSV to its path.
train_file = tempfile.NamedTemporaryFile()
urllib.urlretrieve(
    'http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data',
    train_file.name,
)

# Test split: same procedure with the adult.test file.
test_file = tempfile.NamedTemporaryFile()
urllib.urlretrieve(
    'http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.test',
    test_file.name,
)

读入数据

import pandas as pd
# Column names assigned to the CSV fields, in file order. They are passed to
# read_csv as `names`, so these are the keys used to index the DataFrames.
# 'hours_per_week' must be spelled exactly as in CONTINUOUS_COLUMNS, otherwise
# looking that column up by name raises a KeyError.
COLUMNS = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'gender',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'income_bracket',
]

# skipinitialspace=True drops the spaces that follow each delimiter.
df_train = pd.read_csv(train_file, names=COLUMNS, skipinitialspace=True)
# skiprows=1 ignores the first line of the test file.
df_test = pd.read_csv(test_file, names=COLUMNS, skipinitialspace=True, skiprows=1)

构建标签列

# Name of the derived binary target column.
LABEL_COLUMN = 'label'

# Label is 1 when the income bracket string contains '>50K', otherwise 0.
# The substring test is case-sensitive and the dataset spells the bracket with
# an uppercase K ('>50K' / '<=50K'); a lowercase pattern would never match and
# every row would be labelled 0.
df_train[LABEL_COLUMN] = (
    df_train['income_bracket'].apply(lambda x: '>50K' in x)
).astype(int)
df_test[LABEL_COLUMN] = (
    df_test['income_bracket'].apply(lambda x: '>50K' in x)
).astype(int)

检查数据:区分类别特征列和连续特征列

# Columns holding string categories.
CATEGORICAL_COLUMNS = [
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'native_country',
]

# Columns holding numeric values.
CONTINUOUS_COLUMNS = [
    'age',
    'education_num',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
]

将数据转化为Tensors

import tensorflow as tf
def input_fn(df):
    """Convert a DataFrame into TensorFlow feature tensors and a label tensor.

    Args:
        df: pandas DataFrame containing every column listed in
            CONTINUOUS_COLUMNS and CATEGORICAL_COLUMNS, plus LABEL_COLUMN.

    Returns:
        A ``(feature_cols, label)`` tuple. ``feature_cols`` maps each column
        name to a ``tf.constant`` (continuous columns) or a
        ``tf.SparseTensor`` (categorical columns); ``label`` is a
        ``tf.constant`` built from the label column's values.
    """
    continuous_cols = {
        k: tf.constant(df[k].values) for k in CONTINUOUS_COLUMNS
    }
    # Each categorical column becomes a sparse tensor with one entry per row
    # at position [row, 0], i.e. a [row_count, 1] tensor.
    # The arguments are positional because the third parameter is named
    # `shape` in early TensorFlow releases and `dense_shape` from 1.0 on.
    categorical_cols = {
        k: tf.SparseTensor(
            [[i, 0] for i in range(df[k].size)],
            df[k].values,
            [df[k].size, 1],
        )
        for k in CATEGORICAL_COLUMNS
    }
    # Merge via copy + update: concatenating .items() only works on Python 2.
    feature_cols = dict(continuous_cols)
    feature_cols.update(categorical_cols)
    label = tf.constant(df[LABEL_COLUMN].values)
    return feature_cols, label
def train_input_fn():
    """Return the (features, label) tensors built from the training DataFrame."""
    return input_fn(df=df_train)
def eval_input_fn():
    """Return the (features, label) tensors built from the test DataFrame."""
    return input_fn(df=df_test)

根据模型选择和处理特征

基本类别特征

# Categorical column whose possible values are known up front.
gender = tf.contrib.layers.sparse_column_with_keys(
    column_name='gender', keys=['Female', 'Male'])

# Categorical column whose possible values are not enumerated: each value is
# hashed into one of `hash_bucket_size` buckets.
education = tf.contrib.layers.sparse_column_with_hash_bucket(
    'education', hash_bucket_size=1000)

# Remaining categorical columns referenced by the crossed columns and the
# model's feature_columns list further down; without them those statements
# raise NameError. Bucket sizes are a tunable choice.
occupation = tf.contrib.layers.sparse_column_with_hash_bucket(
    'occupation', hash_bucket_size=1000)
native_country = tf.contrib.layers.sparse_column_with_hash_bucket(
    'native_country', hash_bucket_size=1000)
workclass = tf.contrib.layers.sparse_column_with_hash_bucket(
    'workclass', hash_bucket_size=100)
marital_status = tf.contrib.layers.sparse_column_with_hash_bucket(
    'marital_status', hash_bucket_size=100)
race = tf.contrib.layers.sparse_column_with_hash_bucket(
    'race', hash_bucket_size=100)

基本连续特征

# Numeric feature column for the 'age' field.
age = tf.contrib.layers.real_valued_column(column_name='age')

将连续特征分桶(离散化)

# Split age into ranges delimited by the boundaries below. The list must be
# strictly ascending; the boundary between 35 and 45 is 40 (a stray `4` here
# breaks the ordering).
age_buckets = tf.contrib.layers.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

构建组合特征

# Cross of education and occupation, hashed into 1e4 buckets.
education_x_occupation = tf.contrib.layers.crossed_column(
    [education, occupation],
    hash_bucket_size=int(1e4),
)

# Three-way cross that also includes the bucketized age, hashed into 1e6 buckets.
age_buckets_x_education_x_occupation = tf.contrib.layers.crossed_column(
    [age_buckets, education, occupation],
    hash_bucket_size=int(1e6),
)

定义模型

# Fresh temporary directory passed to the classifier as its model_dir.
model_dir = tempfile.mkdtemp()

# Linear classifier over the base, bucketized and crossed feature columns.
m = tf.contrib.learn.LinearClassifier(
    feature_columns=[
        gender,
        native_country,
        education,
        occupation,
        workclass,
        marital_status,
        race,
        age_buckets,
        education_x_occupation,
        age_buckets_x_education_x_occupation,
    ],
    model_dir=model_dir,
)

训练和评估模型

# Train for 200 steps, then run a single evaluation step on the test input.
m.fit(input_fn=train_input_fn, steps=200)
results = m.evaluate(input_fn=eval_input_fn, steps=1)

# Print each metric, sorted by name. The parenthesised single-argument form
# produces the same output under Python 2's print statement and Python 3's
# print function.
for key in sorted(results):
    print('%s: %s' % (key, results[key]))

预防过度拟合

# Same linear classifier, now trained with an FTRL optimizer that applies both
# L1 and L2 regularization.
m = tf.contrib.learn.LinearClassifier(
    feature_columns=[
        gender,
        native_country,
        education,
        occupation,
        workclass,
        marital_status,
        race,
        age_buckets,
        education_x_occupation,
        age_buckets_x_education_x_occupation,
    ],
    optimizer=tf.train.FtrlOptimizer(
        learning_rate=0.1,
        l1_regularization_strength=1.0,
        l2_regularization_strength=1.0,
    ),
    model_dir=model_dir,
)






你可能感兴趣的:(TensorFlow)