在使用很多模型的时候,都需要对输入的数据进行必要的特征工程处理,最典型的就是 one-hot 处理,还有 hash 分桶等处理。为了方便处理这些特征,tensorflow 提供了一系列的特征工程方法。
import tensorflow as tf
from tensorflow.python.estimator.inputs import numpy_io
import numpy as np
import collections
from tensorflow.python.framework import errors
from tensorflow.python.platform import test
from tensorflow.python.training import coordinator
from tensorflow import feature_column
from tensorflow.python.feature_column.feature_column import _LazyBuilder
numeric_column(
key,
shape=(1,),
default_value=None,
dtype=tf.float32,
normalizer_fn=None
)
接下来对numeric_column测试的demo如下:
def test_numeric():
    """Demonstrate ``numeric_column`` together with a ``normalizer_fn``.

    The same column is evaluated twice and each result is printed: first
    through the private ``_LazyBuilder`` / ``_get_dense_tensor`` path, then
    through ``feature_column.input_layer``.
    """
    samples = {'price': [[1.], [2.], [3.], [4.]]}  # 4 sample rows

    # The normalizer adds 2 to every raw value of the feature.
    shifted_price = feature_column.numeric_column(
        'price', normalizer_fn=lambda value: value + 2)

    # Path 1: ask the column directly for its dense tensor.
    dense_via_builder = shifted_price._get_dense_tensor(_LazyBuilder(samples))
    with tf.Session() as sess:
        print(sess.run([dense_via_builder]))

    # Path 2: go through input_layer.
    dense_via_input_layer = feature_column.input_layer(samples, [shifted_price])
    banner = 'use input_layer' + '_' * 40
    with tf.Session() as sess:
        print(banner)
        print(sess.run([dense_via_input_layer]))
test_numeric()
[array([[ 3.],
[ 4.],
[ 5.],
[ 6.]], dtype=float32)]
use input_layer________________________________________
[array([[ 3.],
[ 4.],
[ 5.],
[ 6.]], dtype=float32)]
从上面的结果可以看出,transform_fn 将所有的数值都加了2。这里分别使用 _LazyBuilder
和 input_layer 进行了测试,效果是一样的。
bucketized_column(
source_column,
boundaries
)
def test_bucketized_column():
    """Bucketize a numeric ``price`` feature and print the dense result.

    The numeric column is wrapped in a ``bucketized_column`` and evaluated
    through ``feature_column.input_layer``.
    """
    samples = {'price': [[5.], [15.], [25.], [35.]]}  # 4 sample rows
    boundaries = [0, 10, 20, 30, 40]

    bucketized = feature_column.bucketized_column(
        feature_column.numeric_column('price'), boundaries)
    dense = feature_column.input_layer(samples, [bucketized])

    with tf.Session() as sess:
        print(sess.run([dense]))
test_bucketized_column()
[array([[ 0., 1., 0., 0., 0., 0.],
[ 0., 0., 1., 0., 0., 0.],
[ 0., 0., 0., 1., 0., 0.],
[ 0., 0., 0., 0., 1., 0.]], dtype=float32)]
我们看到分桶之后,会直接转换成one-hot形式的。注意5个边界值会产生6个桶(包括小于0和大于等于40这两个区间),所以每行输出有6列。
categorical_column_with_vocabulary_list(
key,
vocabulary_list,
dtype=None,
default_value=-1,
num_oov_buckets=0
)
与前面numeric 不同的是,这里返回的是稀疏tensor.
def test_categorical_column_with_vocabulary_list():
    """Demonstrate ``categorical_column_with_vocabulary_list``.

    Prints the sparse id tensor produced by the column, then wraps the
    column in ``indicator_column`` and prints the dense tensor returned by
    ``feature_column.input_layer``.
    """
    samples = {'color': [['R', 'R'], ['G', 'R'], ['B', 'G'], ['A', 'A']]}  # 4 sample rows

    vocab_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1
    )

    # Sparse view: only the id tensor is printed here.
    sparse_pair = vocab_column._get_sparse_tensors(_LazyBuilder(samples))
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        print(sess.run([sparse_pair.id_tensor]))

    # Convert the sparse column to dense; with several values per row the
    # result is multi-hot rather than strictly one-hot.
    indicator = feature_column.indicator_column(vocab_column)
    dense = feature_column.input_layer(samples, [indicator])
    banner = 'use input_layer' + '_' * 40
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        print(banner)
        print(sess.run([dense]))
test_categorical_column_with_vocabulary_list()
[SparseTensorValue(indices=array([[0, 0],
[0, 1],
[1, 0],
[1, 1],
[2, 0],
[2, 1],
[3, 0],
[3, 1]]), values=array([ 0, 0, 1, 0, 2, 1, -1, -1]), dense_shape=array([4, 2]))]
use input_layer________________________________________
[array([[ 2., 0., 0.],
[ 1., 1., 0.],
[ 0., 1., 1.],
[ 0., 0., 0.]], dtype=float32)]
对于categorical_column_with_vocabulary_list来说,返回的是sparse tensor。注意返回值中 id_tensor 这个是有效的,另外一个是None。对于线性模型来说,是可以直接使用sparse tensor的;然而对于深度模型来说,需要将sparse转换成dense,所以也就有了indicator_column 这个函数的出现。indicator_column的作用就是将category产生的sparse tensor转换成dense tensor。
注意:
* input_layer: 只接受dense tensor
* tables_initializer: 在使用sparse特征(需要查表)的时候使用,如果不进行初始化,会出现 Table not initialized. [Node: hash_table_Lookup = LookupTableFindV2 这样的异常
categorical_column_with_hash_bucket(
key,
hash_bucket_size,
dtype=tf.string
)
当category的数量很多,也就无法使用指定category的方法来处理了,那么,可以使用这种哈希分桶的方式来进行处理。比如,切词之后的句子,每一个词可以使用这种方式来处理. 使用 categorical_column_with_vocabulary_file 也是一种不错的选择,比如将词频高的拿出来。毕竟对于hash_bucket来说,对于bucket_size的选取是个问题。
def test_categorical_column_with_hash_bucket(hash_bucket_size=7):
    """Demonstrate ``categorical_column_with_hash_bucket``.

    Prints the sparse id tensor produced by hashing the ``color`` feature,
    then wraps the column in ``indicator_column`` and prints the dense
    tensor returned by ``feature_column.input_layer``.

    :param hash_bucket_size: number of hash buckets passed to the column.
        Defaults to 7, the value this demo originally hard-coded; pass
        another value (e.g. 3) to compare how the inputs are distributed
        over the buckets.
    :return: None, the results are printed.
    """
    color_data = {'color': [['R'], ['G'], ['B'], ['A']]}  # 4 sample rows
    builder = _LazyBuilder(color_data)

    color_column = feature_column.categorical_column_with_hash_bucket(
        'color', hash_bucket_size)
    color_column_tensor = color_column._get_sparse_tensors(builder)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))

    # Convert the sparse column to its dense indicator form.
    color_column_identy = feature_column.indicator_column(color_column)
    color_dense_tensor = feature_column.input_layer(color_data, [color_column_identy])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([color_dense_tensor]))
test_categorical_column_with_hash_bucket()
[SparseTensorValue(indices=array([[0, 0],
[1, 0],
[2, 0],
[3, 0]]), values=array([5, 2, 6, 3]), dense_shape=array([4, 1]))]
use input_layer________________________________________
[array([[ 0., 0., 0., 0., 0., 1., 0.],
[ 0., 0., 1., 0., 0., 0., 0.],
[ 0., 0., 0., 0., 0., 0., 1.],
[ 0., 0., 0., 1., 0., 0., 0.]], dtype=float32)]
从上面看,这种hash分桶的方法,hash_bucket_size 的选择是很重要的。如果选择3,R 和 B 会被分到同一个桶里面,G 和 A 也会被分到同一个桶里面;当 hash_bucket_size=7 时(即上面代码所用的值),R G B A 就都分到了不同的桶中。所以值越大,越不容易发生冲突,也越容易精确地分桶。
[SparseTensorValue(indices=array([[0, 0],
[1, 0],
[2, 0],
[3, 0]]), values=array([5, 2, 6, 3]), dense_shape=array([4, 1]))]
use input_layer________________________________________
[array([[ 0., 0., 0., 0., 0., 1., 0.],
[ 0., 0., 1., 0., 0., 0., 0.],
[ 0., 0., 0., 0., 0., 0., 1.],
[ 0., 0., 0., 1., 0., 0., 0.]], dtype=float32)]
categorical_column_with_identity(
key,
num_buckets,
default_value=None
)
这是对连续的数字类的处理函数。比如 id 一共有10000个,那么可以使用这种方式。但是如果多数没有被使用,那么还不如使用 categorical_column_with_hash_bucket 进行重新处理。
embedding_column(
categorical_column,
dimension,
combiner='mean',
initializer=None,
ckpt_to_load_from=None,
tensor_name_in_ckpt=None,
max_norm=None,
trainable=True
)
将sparse tensor转换成dense tensor。在DNN的输入中需要使用dense tensor。embedding如果要共用,需要的是name一样。
def test_embedding():
    """Demonstrate ``embedding_column`` on a vocabulary-list column.

    The ``color`` feature is mapped through a 3-entry vocabulary, embedded
    into 8 dimensions and evaluated with ``feature_column.input_layer``;
    the resulting dense tensor is printed.
    """
    samples = {'color': [['R'], ['G'], ['B'], ['A']]}  # 4 sample rows

    vocab_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1
    )
    embedded = feature_column.embedding_column(vocab_column, 8)
    dense = feature_column.input_layer(samples, [embedded])

    banner = 'embeding' + '_' * 40
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        print(banner)
        print(sess.run([dense]))
test_embedding()
embeding________________________________________
[array([[-0.38754427, 0.00133941, 0.22987399, 0.10634357, 0.60504574,
0.09730898, -0.26186299, 0.37433708],
[-0.11320268, 0.0495495 , 0.45014769, 0.18113135, 0.07382802,
-0.18399857, -0.42906326, -0.4881283 ],
[ 0.45096871, -0.22977889, -0.28710714, -0.10303244, -0.34233567,
0.06112694, 0.11003948, 0.08152663],
[ 0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. ]], dtype=float32)]
每一个都会转换成8个维度的数据,并且默认使用截断的高斯(正态)分布来进行初始化。因为 A 没有在categorical_column的vocabulary_list中出现(id为-1),所以对应的输出全为0。
weighted_categorical_column(
categorical_column,
weight_feature_key,
dtype=tf.float32
)
为categorical_column赋值权重。默认的categorical_column中,所有的权重都是一样的,但是有些时候,对于同样一组category_column不同的category的权重不同。例如,如果使用tag来表示文本,那么tag的权重就不同。
def test_weighted_categorical_column():
    """Demonstrate ``weighted_categorical_column``.

    A vocabulary-list column over ``color`` is paired with the ``weight``
    feature; the sparse id tensor and the sparse weight tensor returned by
    the weighted column are printed one after the other.
    """
    samples = {
        'color': [['R'], ['G'], ['B'], ['A']],
        'weight': [[1.0], [2.0], [4.0], [8.0]],
    }  # 4 sample rows

    vocab_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1
    )
    weighted_column = feature_column.weighted_categorical_column(vocab_column, 'weight')

    # The weighted column yields a pair: (ids, weights).
    ids, weights = weighted_column._get_sparse_tensors(_LazyBuilder(samples))

    separator = '-' * 40
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        print('weighted categorical' + separator)
        print(sess.run([ids]))
        print(separator)
        print(sess.run([weights]))
test_weighted_categorical_column()
weighted categorical----------------------------------------
[SparseTensorValue(indices=array([[0, 0],
[1, 0],
[2, 0],
[3, 0]]), values=array([ 0, 1, 2, -1]), dense_shape=array([4, 1]))]
----------------------------------------
[SparseTensorValue(indices=array([[0, 0],
[1, 0],
[2, 0],
[3, 0]]), values=array([ 1., 2., 4., 8.], dtype=float32), dense_shape=array([4, 1]))]
可以看到,weight 这个tensor也是存在的。对于前面其他categorical_column来说是不存在weight的。
linear_model(
features,
feature_columns,
units=1,
sparse_combiner='sum',
weight_collections=None,
trainable=True
)
对所有特征进行线性加权操作.
def get_linear_model_bias():
    """Return the ``bias_weights`` variable from the ``linear_model`` scope.

    The scope is opened with ``reuse=True``, so the existing variable is
    looked up rather than a new one being created.
    """
    with tf.variable_scope('linear_model', reuse=True):
        bias = tf.get_variable('bias_weights')
    return bias
def get_linear_model_column_var(column):
    """Return the first global variable under ``linear_model/<column.name>``.

    :param column: a feature column; its ``name`` selects the variable scope.
    :return: the first entry of the GLOBAL_VARIABLES collection filtered by
        that scope.
    """
    scope = 'linear_model/' + column.name
    matches = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope)
    return matches[0]
def test_linear_model():
    """
    Exercise ``feature_column.linear_model`` with hand-assigned weights.

    A numeric ``price`` column and a vocabulary-list ``color`` column are fed
    to the linear model; the bias and both column weights are then
    overwritten with fixed values and the prediction is printed.
    :return: None
    """
    features = {
        'price': [[1.0], [5.0], [10.0]],
        'color': [['R'], ['G'], ['B']],
    }

    price_col = feature_column.numeric_column('price')
    color_col = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'])
    prediction = feature_column.linear_model(features, [price_col, color_col])

    # Fetch the variables created by linear_model so they can be overwritten.
    bias = get_linear_model_bias()
    price_weights = get_linear_model_column_var(price_col)
    color_weights = get_linear_model_column_var(color_col)

    with tf.Session() as sess:
        for init_op in (tf.global_variables_initializer(),
                        tf.local_variables_initializer(),
                        tf.tables_initializer()):
            sess.run(init_op)

        # Fixed weights for the bias, the price column and each color.
        for assign_op in (bias.assign([7.0]),
                          price_weights.assign([[10.0]]),
                          color_weights.assign([[2.0], [2.0], [2.0]])):
            sess.run(assign_op)

        result = sess.run([prediction])
        print(result)
test_linear_model()
[array([[ 19.],
[ 59.],
[ 109.]], dtype=float32)]
crossed_column 用于组合特征,这仅仅适用于sparse特征,产生的依然是sparse特征。
def test_crossed_column():
    """
    Exercise ``feature_column.crossed_column``.

    Two vocabulary-list columns (``price`` and ``color``) are crossed into
    16 hash buckets, wrapped in ``indicator_column`` and evaluated through
    ``feature_column.input_layer``; the dense result is printed.
    :return: None
    """
    features = {
        'price': [['A', 'A'], ['B', 'D'], ['C', 'A']],
        'color': [['R', 'R'], ['G', 'G'], ['B', 'B']],
    }

    price_col = feature_column.categorical_column_with_vocabulary_list(
        'price', ['A', 'B', 'C', 'D'])
    color_col = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'])

    crossed = feature_column.crossed_column([price_col, color_col], 16)
    # input_layer is fed the indicator (dense) form of the crossed column.
    crossed_indicator = feature_column.indicator_column(crossed)
    dense = feature_column.input_layer(features, [crossed_indicator])

    banner = 'use input_layer' + '_' * 40
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        print(banner)
        print(sess.run([dense]))
test_crossed_column()
use input_layer________________________________________
[array([[ 0., 0., 0., 0., 0., 0., 0., 0., 4., 0., 0., 0., 0.,
0., 0., 0.],
[ 0., 0., 0., 0., 2., 0., 0., 0., 0., 2., 0., 0., 0.,
0., 0., 0.],
[ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 0.,
0., 2., 0.]], dtype=float32)]