TensorFlow Series: Running feature_column Feature Engineering Locally

Keywords:

  • tf.contrib.data.parallel_interleave
  • tf.data.TFRecordDataset
  • input_layer
  • make_initializable_iterator()
  • train.MonitoredTrainingSession
  • test_op.initializer
  • parse_example
  • data.Dataset.list_files
  • data.experimental.make_batched_features_dataset
  • data.Dataset.from_tensor_slices
  • compat.v1.data.make_one_shot_iterator
  • feature_column.shared_embeddings
  • shared_embedding_columns_v2

Part 1: Feature engineering with feature_column.input_layer

---------TensorFlow1.x---------

1. Using the parse_example approach

1.1 Using it inside a custom tfrecord parsing function

# Note: this uses the batched tf.parse_example, not tf.parse_single_example,
# which is why the dataset must be batched before map() (see 1.2 below).
def parse_exmp(serial_exmp):
    oriExample = tf.parse_example(serial_exmp, features={'user_data': tf.FixedLenFeature([43], tf.float32)})
    oriAllData = oriExample.get("user_data")
    feaDics = dict()
    retainLabel = oriAllData[:, 0:1]           # column 0 is the label
    feaDics["sta_fea1"] = oriAllData[:, 1:18]  # 17 dense statistical features
    feaDics["click_level"] = tf.cast(oriAllData[:, 18:19], dtype=tf.int64)
    rs = tf.feature_column.input_layer(features=feaDics, feature_columns=get_feature_columns_new())
    return rs
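
get_feature_columns_new() is not shown at this point; the full version appears near the end of the post. For this snippet alone, a minimal TF1.x stand-in could look like the following sketch (the click_level bucket count is an assumption):

# Minimal stand-in sketch for get_feature_columns_new(); shapes follow the
# slices above: 17 floats for sta_fea1, one integer id for click_level.
def get_feature_columns_new():
    sta_fea1 = tf.feature_column.numeric_column(key="sta_fea1", shape=(17,), dtype=tf.float32)
    click_level = tf.feature_column.categorical_column_with_identity(key="click_level", num_buckets=10)  # bucket count assumed
    click_level_emb = tf.feature_column.embedding_column(click_level, 5)
    return [sta_fea1, click_level_emb]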

1.2 Reading the tfrecord dataset and applying the feature processing

# train_files is a list of tfrecord file paths
# Note: batch() and prefetch() are required here; tf.parse_example expects a batch of serialized examples
input_files = tf.data.Dataset.list_files(train_files)
dataset = input_files.apply(
    tf.contrib.data.parallel_interleave(tf.data.TFRecordDataset, cycle_length=reader_num_threads)
).batch(10).prefetch(5)
dataset = dataset.map(parse_exmp, num_parallel_calls=8)

1.3 Outputting the processed results

test_op = dataset.make_initializable_iterator()
one_element = test_op.get_next()
with tf.train.MonitoredTrainingSession() as sess:
    sess.run(test_op.initializer)  # the iterator must be initialized before use
    print(sess.run(one_element))
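
To run this end to end locally you need tfrecord files whose "user_data" feature holds 43 floats. A minimal sketch for generating a toy file (the file name and row count are arbitrary choices, not from the original setup):

import numpy as np

# Write 100 toy examples, each with 43 random floats under the "user_data" key.
with tf.python_io.TFRecordWriter("toy.tfrecord") as writer:
    for _ in range(100):
        example = tf.train.Example(features=tf.train.Features(feature={
            "user_data": tf.train.Feature(
                float_list=tf.train.FloatList(value=np.random.rand(43).tolist()))
        }))
        writer.write(example.SerializeToString())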

2. Using make_batched_features_dataset to parse the tfrecord data automatically

2.1 Using it inside a custom tfrecord parsing function

def parse_exmp_batched(serial_exmp):
    # make_batched_features_dataset already parses the tfrecords,
    # so serial_exmp is a dict of tensors rather than serialized strings.
    oriAllData = serial_exmp.get("user_data")
    feaDics = dict()
    retainLabel = oriAllData[:, 0:1]
    feaDics["click_level"] = tf.cast(oriAllData[:, 18:19], dtype=tf.int64)
    feaDics["valid_flg"] = tf.cast(oriAllData[:, 19:20], dtype=tf.int64)
    feaDics["actDay_fea1"] = oriAllData[:, 20:33]
    rs1 = tf.feature_column.input_layer(features=feaDics, feature_columns=get_feature_columns_new()[0])
    rs2 = tf.feature_column.input_layer(features=feaDics, feature_columns=get_feature_columns_new()[1])
    rs3 = tf.concat([rs1, rs2], -1)
    print(rs1)
    print(rs2)
    # return feaDics, {"label": tf.to_float(retainLabel)}
    return rs1, rs2, rs3
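
A note on ordering: input_layer concatenates its feature columns sorted by column name, so the column layout inside rs1 and rs2 is deterministic regardless of the list order returned by get_feature_columns_new().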

2.2 Reading tfrecord data from files

def train_input_fn():
    return tf.data.experimental.make_batched_features_dataset(
        file_pattern=train_files,
        batch_size=10,
        features=feature_schema,
        label_key=None,
        ...)

dataTest = train_input_fn()
dataset = dataTest.map(parse_exmp_batched, num_parallel_calls=8)
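
The elided arguments above are the usual knobs of make_batched_features_dataset; a fuller sketch for reference (the reader/num_epochs/shuffle values here are assumptions for a local test, not the original settings):

def train_input_fn():
    return tf.data.experimental.make_batched_features_dataset(
        file_pattern=train_files,
        batch_size=10,
        features=feature_schema,
        label_key=None,                  # set to a key in `features` to get (features, label) pairs
        reader=tf.data.TFRecordDataset,
        num_epochs=1,                    # assumption: single pass while debugging
        shuffle=False)                   # assumption: keep order deterministic while debugging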

2.3 Outputting the processed results

Same as 1.3 above.

---------TensorFlow2.x---------

3. Using the parse_example approach

3.1 Using it inside a custom tfrecord parsing function

def parse_exmp(serial_exmp):
    oriExample = tf.io.parse_example(serial_exmp, features=feature_spec)
    oriAllData = oriExample.get("user_data")
    feaDics = dict()
    retainLabel = oriAllData[:, 0:1]
    feaDics["sta_fea1"] = oriAllData[:, 1:18]
    feaDics["click_level"] = tf.cast(oriAllData[:, 18:19], dtype=tf.int64)
    # input_layer is a TF1 API, reached in TF2 via tf.compat.v1
    # (tf.keras.layers.DenseFeatures is the native TF2 alternative)
    rs = tf.compat.v1.feature_column.input_layer(features=feaDics, feature_columns=get_feature_columns_new())
    print(rs)
    return rs
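
feature_spec is not defined in this snippet; given the 43-float layout used throughout the post, it can simply match the feature_schema defined in Part 2:

feature_spec = {"user_data": tf.io.FixedLenFeature(shape=(43,), dtype=tf.float32)}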

3.2 Reading the tfrecord dataset and applying the feature processing

dataset = tf.data.Dataset.from_tensor_slices(train_files).interleave(
    map_func=lambda x: tf.data.TFRecordDataset(x),
    cycle_length=batch_size,
    block_length=1,
    num_parallel_calls=8).batch(10).prefetch(5)
dataset = dataset.map(parse_exmp, num_parallel_calls=8)

3.3 Outputting the processed results

test_op = tf.compat.v1.data.make_one_shot_iterator(dataset)
one_element = test_op.get_next()
print(one_element)
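
Since TF2 runs eagerly by default, the compat iterator is only needed for graph-mode code; a plain Python loop over the dataset works just as well:

# Eager alternative: iterate the dataset directly.
for one_element in dataset.take(1):
    print(one_element)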

4. Using make_batched_features_dataset to parse the tfrecord data automatically

4.1 Using it inside a custom tfrecord parsing function

Same as TensorFlow 1.x (section 2.1).

4.2 Reading tfrecord data from files

Same as TensorFlow 1.x (section 2.2).

4.3 Outputting the processed results

Same as 1.3 (in TF2's eager mode, the iteration shown in 3.3 works as well).

Part 2: Feature engineering with a custom input layer

---------TensorFlow1.x---------

1. Using make_batched_features_dataset to parse the tfrecord data automatically

1.1 Creating the custom input layer

Write a custom input layer by subclassing a Keras layer; the myInputLayer skeleton is shown in section 3 below.

1.2 Using the custom input layer

def parse_exmp_batched(serial_exmp):
    oriAllData = serial_exmp.get("user_data")
    feaDics = dict()
    retainLabel = oriAllData[:, 0:1]
    feaDics["sta_fea1"] = oriAllData[:, 1:18]
    input_layers = myInputLayer(feature_columns_1=get_feature_columns_new()[0],
                                feature_columns_2=get_feature_columns_new()[1],
                                name="inputlayer")
    outputRs = input_layers(feaDics)
    print(outputRs)
    return outputRs

1.3 Reading tfrecord data from files

def train_input_fn():
    return tf.data.experimental.make_batched_features_dataset(
        file_pattern=train_files,
        batch_size=10,
        features=feature_schema,
        label_key=None,
        ...)

dataTest = train_input_fn()
dataset = dataTest.map(parse_exmp_batched, num_parallel_calls=8)

1.4 Outputting the processed results

test_op = dataset.make_initializable_iterator()
one_element = test_op.get_next()
with tf.train.MonitoredTrainingSession() as sess:
    sess.run(test_op.initializer)
    print(sess.run(one_element))

2. Using the parse_example approach (omitted to avoid repetition; it mirrors the earlier examples)

---------TensorFlow2.x---------

3. Using make_batched_features_dataset to parse the tfrecord data automatically

from datetime import datetime,timedelta
import random
import tensorflow as tf
from tensorflow.python.feature_column import feature_column_v2 as fc_v2

class myInputLayer(tf.keras.layers.Layer):
    def __init__(self,feature_columns_1,
                 feature_columns_2,
                 trainable=True,
                 name=None,
                 **kwargs):
        ...

    def build(self, input_shape):
        ...

    def call(self, inputs, **kwargs):
        ...
        return ...
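
The elided bodies are not shown in the original. A minimal sketch that wraps each column group in a tf.keras.layers.DenseFeatures layer and concatenates the two outputs might look like the following; note that shared-embedding columns are not supported by DenseFeatures, which is exactly why a real implementation for those would fall back to the lower-level fc_v2 machinery imported above:

class myInputLayer(tf.keras.layers.Layer):
    # Minimal sketch, assuming both column groups are DenseFeatures-compatible.
    def __init__(self, feature_columns_1, feature_columns_2,
                 trainable=True, name=None, **kwargs):
        super(myInputLayer, self).__init__(trainable=trainable, name=name, **kwargs)
        self.dense_1 = tf.keras.layers.DenseFeatures(feature_columns_1)
        self.dense_2 = tf.keras.layers.DenseFeatures(feature_columns_2)

    def call(self, inputs, **kwargs):
        # Each DenseFeatures layer turns the feature dict into a dense tensor;
        # the two groups are concatenated along the feature axis.
        return tf.concat([self.dense_1(inputs), self.dense_2(inputs)], axis=-1)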

feature_schema = {
    "user_data": tf.io.FixedLenFeature(shape=(43,),dtype=tf.float32),
    # "label": tf.io.FixedLenFeature(shape=(1,),dtype=tf.float32)
}

def parse_exmp_batched(serial_exmp):
    oriAllData = serial_exmp.get("user_data")
    feaDics = dict()
    retainLabel = oriAllData[:, 0:1]
    feaDics["sta_fea1"] = oriAllData[:, 1:18]
    feaDics["click_level"] = tf.cast(oriAllData[:, 18:19], dtype=tf.int64)
    input_layers = myInputLayer(feature_columns_1=get_feature_columns_new()[0],
                                feature_columns_2=get_feature_columns_new()[1],
                                name="inputlayer")
    outputRs = input_layers(feaDics)
    return outputRs

train_files = [...]

def train_input_fn():
    return tf.data.experimental.make_batched_features_dataset(
        file_pattern=train_files,
        batch_size=10,
        features=feature_schema,
        label_key=None,
        ...)

dataTest = train_input_fn()
dataset = dataTest.map(parse_exmp_batched, num_parallel_calls=8)
test_op = tf.compat.v1.data.make_one_shot_iterator(dataset)
one_element = test_op.get_next()
print(one_element)

4. Using the parse_example approach

from datetime import datetime, timedelta
from typing import Dict, List
import random
from tensorflow.python.feature_column import feature_column_v2 as fc_v2
import tensorflow as tf
print(tf.executing_eagerly())  # True by default in TF2

class myInputLayer(tf.keras.layers.Layer):
    def __init__(self,feature_columns_1,
                 feature_columns_2,
                 trainable=True,
                 name=None,
                 **kwargs):
        ...

    def build(self, input_shape):
        ...
        super(myInputLayer, self).build(None)

    def call(self, inputs, **kwargs):
        ...
        return ...

feature_schema = {
    "user_data": tf.io.FixedLenFeature(shape=(43,),dtype=tf.float32),
}

def get_feature_schema(excludeFea: List[str]) -> Dict:
    # Drop any schema entries whose keys are in the exclusion list.
    valid_fea_schema = {key: value for key, value in feature_schema.items() if key not in excludeFea}
    return valid_fea_schema

feature_schemas = get_feature_schema([])

def get_feature_columns(args):
    # One flat 43-float column covering the whole "user_data" vector.
    fea_1 = tf.feature_column.numeric_column(key="user_data", shape=(43,), dtype=tf.float32)
    return [fea_1]
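
This column list exists mainly so that make_parse_example_spec can rebuild a parsing spec from it; for the single column above the result is equivalent to the feature_schema dict:

spec = tf.feature_column.make_parse_example_spec(get_feature_columns(None))
# {'user_data': FixedLenFeature(shape=(43,), dtype=tf.float32, default_value=None)}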

def get_feature_columns_new():
    ...
    act_first_fea1 = tf.feature_column.categorical_column_with_identity(key="act_first_fea1", num_buckets=2)
    act_last_fea1 = tf.feature_column.categorical_column_with_identity(key="act_last_fea1", num_buckets=2)
    click_level_emb = tf.feature_column.embedding_column(click_level, 5)
    valid_flg_emb = tf.feature_column.embedding_column(valid_flg, 3)
    # shared_embeddings lets both categorical columns share one embedding table
    act_first_fea1_emb, act_last_fea1_emb = tf.feature_column.shared_embeddings(
        [act_first_fea1, act_last_fea1], 2, 'mean', initializer=None, trainable=True)

    return [sta_fea1, actDay_fea1, click_level_emb, valid_flg_emb], [
            act_first_fea1_emb, act_first_fea2_emb, act_first_fea3_emb, act_first_fea4_emb, act_first_fea5_emb,
            act_last_fea1_emb, act_last_fea2_emb, act_last_fea3_emb, act_last_fea4_emb, act_last_fea5_emb]
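
The leading `...` elides the remaining column definitions. Judging from the slices used in the parsing functions, they would look roughly like this sketch (the identity bucket counts are assumptions):

# Hypothetical expansion of the elided definitions above.
sta_fea1 = tf.feature_column.numeric_column(key="sta_fea1", shape=(17,), dtype=tf.float32)
actDay_fea1 = tf.feature_column.numeric_column(key="actDay_fea1", shape=(13,), dtype=tf.float32)
click_level = tf.feature_column.categorical_column_with_identity(key="click_level", num_buckets=10)  # bucket count assumed
valid_flg = tf.feature_column.categorical_column_with_identity(key="valid_flg", num_buckets=2)       # bucket count assumed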

feature_column = get_feature_columns("")

def parse_exmp(serial_exmp):
    # Rebuild the parsing spec from the flat 43-float column.
    feature_spec = tf.feature_column.make_parse_example_spec(feature_column)
    oriExample = tf.io.parse_example(serial_exmp, features=feature_spec)
    print(oriExample)
    oriAllData = oriExample.get("user_data")
    feaDics = dict()
    retainLabel = oriAllData[:, 0:1]
    feaDics["sta_fea1"] = oriAllData[:, 1:18]
    feaDics["click_level"] = tf.cast(oriAllData[:, 18:19], dtype=tf.int64)
    inputnet = myInputLayer(get_feature_columns_new()[0], get_feature_columns_new()[1], name="inputlayer")
    rs = inputnet(feaDics)
    return rs

train_files = [...]
batch_size = 10
dataset = tf.data.Dataset.from_tensor_slices(train_files).interleave(
    map_func=lambda x: tf.data.TFRecordDataset(x),
    cycle_length=batch_size,
    block_length=1,
    num_parallel_calls=8).batch(10).prefetch(5)

dataset = dataset.map(parse_exmp, num_parallel_calls=8)
test_op = tf.compat.v1.data.make_one_shot_iterator(dataset)
one_element = test_op.get_next()
print(one_element)

 
