人工智能AI:Keras PyTorch MXNet TensorFlow PaddlePaddle 深度学习实战(不定时更新)
3.2.1.1 定义正负样本
3.2.1.2 获取初始训练集
android,23,4,32,43,32,2,54,1502378738,2,4,6,33,421,22,43,12,0,0,0
ios,47,4,16,28,32,43,56,1502408488,22,33,12,2,1,23,4,47,0,0,1
android,22,4,16,7,0,0,7,1502362845,2,34,62,2,32,221,32,4,0,0,0
android,23,21,122,223,33,23,42,1502367552,77,11,2,3,87,1,20,6,2,0,0
android,76,432,876,23,2,23,56,1502430914,2,32,1,23,54,66,33,212,199,0,1
我们需要从neo4j当中获取训练样本,构造样本过程分为正样本和负样本两部分,正样本: 用户产生交互行为的双画像,负样本: 推荐曝光后没有产生交互行为的双画像,通过cypher语句取出特征。
1、读取用户、帖子特征进行组合cypher语句
from neo4j.v1 import GraphDatabase
import numpy as np
import pandas as pd
# Connection settings for the neo4j graph holding user/post profiles.
# A plain dict literal is used; the original wrapped it in a redundant
# dict({...}) call.
NEO4J_CONFIG = {
    "uri": "bolt://192.168.19.137:7687",
    "auth": ("neo4j", "itcast"),
    # NOTE(review): encryption is explicitly disabled — confirm the bolt
    # endpoint is only reachable from a trusted network.
    "encrypted": False,
}
# Module-level driver shared by all sampling functions below.
_driver = GraphDatabase.driver(**NEO4J_CONFIG)
# Users and posts linked by an interaction relationship
# (share/comment/like) form the positive class, target label 1.
def get_positive_sample():
    """Fetch up to 200 interacted user/post feature pairs labelled '1'."""
    cypher = "match(a:SuperfansUser)-[r:share|comment|like]-(b:SuperfansPost) return [a.like_posts_num, a.forward_posts_num, a.comment_posts_num,a.publish_posts_num,a.follow_stars_list,b.hot_score,b.commented_num,b.forwarded_num,b.liked_num,b.related_stars_list,b.publish_time] limit 200"
    return get_train_data(cypher, '1')
# Users and posts linked only by a negative relationship
# (report/unlike) form the negative class, target label 0.
def get_negative_sample():
    """Fetch up to 200 non-interacted user/post feature pairs labelled '0'."""
    cypher = "match(a:SuperfansUser)-[r:report|unlike]-(b:SuperfansPost) return [a.like_posts_num, a.forward_posts_num, a.comment_posts_num,a.publish_posts_num,a.follow_stars_list,b.hot_score,b.commented_num,b.forwarded_num,b.liked_num,b.related_stars_list,b.publish_time] limit 200"
    return get_train_data(cypher, '0')
2、获取结果之后,进行数据集的格式处理和构造:get_train_data(cypher, '1')
# [92, 47, 4, 1618, ['218960', '187579', '210958', '219148', '3116'], 549, 5, 2, 533, ['1'], 1516180431]
def _extended_length(b, index):
"""
:param b: 传入样本
:param index: 传入位置
:return:
"""
print(b, index)
for i in index:
if len(b[i])<5:
k = [0]*5
for i, value in enumerate(b[i]):
k[i] = value
b.extend(k)
print(b)
i = 0
while i < len(index):
b.pop(index[i] - i)
i += 1
return b
def get_train_data(cypher, label):
    """Run *cypher* against neo4j and return labelled, flattened rows.

    Positive samples are user/post pairs with interaction behaviour;
    negative samples are exposed pairs without interaction.  Each
    record's list features are flattened to fixed-width columns and the
    string *label* is appended as the last element of every row.
    """
    with _driver.session() as session:
        result = session.run(cypher)
        # Each record wraps the returned feature list in a 1-tuple.
        samples = [record[0] for record in result]
    # Positions 4 and 9 hold the follow/related star-id lists.
    list_feature_positions = [4, 9]
    # Step 1 of feature processing: flatten list features, tag on label.
    train_data = [
        _extended_length(sample, list_feature_positions) + [str(label)]
        for sample in samples
    ]
    print(train_data)
    return train_data
最后保存到本地当前目录train_data.csv文件
if __name__ == "__main__":
    # Pull both sample classes from neo4j and persist the combined set
    # as a header-less CSV next to this script.
    p_train_data = get_positive_sample()
    n_train_data = get_negative_sample()
    print(len(p_train_data))
    print(len(n_train_data))
    combined = p_train_data + n_train_data
    pd.DataFrame(combined).to_csv("./train_data.csv", header=False, index=False)
分析:
目的:构建泛娱乐WDL模型输入、特征的处理,从而进行后续模型训练
3.2.3.1 模型输入函数
1、指定读取CSV文件API,返回dataset
# Input pipeline for tf.estimator: reads CSV text lines from
# `filenames`, skips `skip_header_lines`, and maps each line through
# _decode_csv into a feature dict.  The shuffle/repeat/batch steps and
# the return statement of this function appear further below in the
# file (the tutorial splits the body across sections).
def input_fn(filenames,
num_epochs=None,
shuffle=True,
skip_header_lines=0,
batch_size=200):
# Raw text lines -> parsed {column: tensor} feature dicts.
dataset = tf.data.TextLineDataset(filenames).skip(skip_header_lines).map(
_decode_csv)
2、实现_decode_csv解析文件内容特征以及目标值函数
所有CSV列,以及解析式的默认格式。
# All columns of train_data.csv, in file order; 'islike' is the label.
# NOTE(review): the first record_default below is a string ('') although
# 'like_posts_num' is used as a numeric column later — presumably this
# slot once held a 'device_system' column; verify against the CSV.
CSV_COLUMNS = [
'like_posts_num', 'forward_posts_num', 'comment_posts_num', 'publish_posts_num', 'hot_score',
'commented_num', 'forwarded_num', 'liked_num', 'publish_time', 'follow_star_1',
'follow_star_2', 'follow_star_3', 'follow_star_4', 'follow_star_5', 'related_star_1',
'related_star_2', 'related_star_3', 'related_star_4', 'related_star_5', 'islike'
]
# Per-column parse defaults for tf.decode_csv: [0] -> int, [''] -> string.
CSV_COLUMN_DEFAULTS = [[''], [0], [0], [0], [0],
[0], [0], [0], [0], [0],
[0], [0], [0], [0], [0],
[0], [0], [0], [0], ['']]
# Target column name and its string vocabulary ('1' = interacted,
# matching the labels produced by get_train_data).
LABEL_COLUMN = 'islike'
LABELS = ['1', '0']
使用tf.decode_csv解析
def _decode_csv(line):
    """Parse one CSV line tensor into a {column_name: tensor} dict."""
    # ['123','321'] ---> [['123'], ['321']]: decode_csv expects each
    # line wrapped in its own innermost dimension.
    expanded = tf.expand_dims(line, -1)
    # Field types follow the declared per-column defaults.
    parsed = tf.decode_csv(expanded, record_defaults=CSV_COLUMN_DEFAULTS)
    features = dict(zip(CSV_COLUMNS, parsed))
    # Drop columns the model never consumes.
    for unused in UNUSED_COLUMNS:
        features.pop(unused)
    return features
其中会对特征进行过滤,指定无用的特征列 UNUSED_COLUMNS,输入的特征以及目标标签都会需要
# Columns parsed from the CSV that feed neither the model inputs nor
# the label.  NOTE(review): this expression reads INPUT_COLUMNS, which
# is defined further down in this file — it must be evaluated after
# INPUT_COLUMNS when the snippets are assembled into a module.
UNUSED_COLUMNS = set(CSV_COLUMNS) - {col.name for col in INPUT_COLUMNS} - {LABEL_COLUMN}
这里UNUSED_COLUMNS为CSV_COLUMNS中既不在INPUT_COLUMNS、也不是标签列的特征(按本文代码为'hot_score');标签列'islike'通过LABEL_COLUMN单独排除,不在UNUSED_COLUMNS之内。
# Vocabulary of star ids (0..499) shared by all categorical star
# columns below.  The original built this with an identity
# map(lambda x: x, ...), which is just list(range(500)).
STAR_ID_LIST = list(range(500))
# Feature columns fed to the estimator: 8 dense numeric features plus
# 10 categorical star-id slots (5 followed stars, 5 related stars),
# each using the STAR_ID_LIST integer vocabulary.
# NOTE(review): 'hot_score' is parsed from the CSV but not listed here,
# so it lands in UNUSED_COLUMNS — confirm that is intentional.
INPUT_COLUMNS = [
tf.feature_column.numeric_column('like_posts_num'),
tf.feature_column.numeric_column('forward_posts_num'),
tf.feature_column.numeric_column('comment_posts_num'),
tf.feature_column.numeric_column('publish_posts_num'),
tf.feature_column.numeric_column('commented_num'),
tf.feature_column.numeric_column('forwarded_num'),
tf.feature_column.numeric_column('liked_num'),
# NOTE(review): publish_time is fed as a raw epoch timestamp — consider
# confirming whether bucketising was intended.
tf.feature_column.numeric_column('publish_time'),
tf.feature_column.categorical_column_with_vocabulary_list(
'follow_star_1', STAR_ID_LIST),
tf.feature_column.categorical_column_with_vocabulary_list(
'follow_star_2', STAR_ID_LIST),
tf.feature_column.categorical_column_with_vocabulary_list(
'follow_star_3', STAR_ID_LIST),
tf.feature_column.categorical_column_with_vocabulary_list(
'follow_star_4', STAR_ID_LIST),
tf.feature_column.categorical_column_with_vocabulary_list(
'follow_star_5', STAR_ID_LIST),
tf.feature_column.categorical_column_with_vocabulary_list(
'related_star_1', STAR_ID_LIST),
tf.feature_column.categorical_column_with_vocabulary_list(
'related_star_2', STAR_ID_LIST),
tf.feature_column.categorical_column_with_vocabulary_list(
'related_star_3', STAR_ID_LIST),
tf.feature_column.categorical_column_with_vocabulary_list(
'related_star_4', STAR_ID_LIST),
tf.feature_column.categorical_column_with_vocabulary_list(
'related_star_5', STAR_ID_LIST)
]
3、dataset进行指定epoch以及Batch大小,打乱顺序,并指定目标值,将字符串变成目标0,1
使用dataset的batch,repeat相关方法进行处理,
# (continuation of input_fn, split across tutorial sections)
# Shuffle within a 10-batch buffer, repeat for num_epochs, batch, and
# split the label column off the feature dict for the estimator.
if shuffle:
dataset = dataset.shuffle(buffer_size=batch_size * 10)
iterator = dataset.repeat(num_epochs).batch(
batch_size).make_one_shot_iterator()
features = iterator.get_next()
# Return (features, labels) — labels converted from '1'/'0' strings to
# integer ids by parse_label_column.
return features, parse_label_column(features.pop(LABEL_COLUMN))
最后将目标值进行处理
def parse_label_column(label_string_tensor):
    """Map string labels to integer ids via the LABELS vocabulary table."""
    label_vocab = tf.constant(LABELS)
    table = tf.contrib.lookup.index_table_from_tensor(label_vocab)
    return table.lookup(label_string_tensor)
3.2.3.2 tf.feature_column特征处理
# Unpack INPUT_COLUMNS into individual names for the wide/deep lists.
# The original unpacking also listed 'hot_score', but INPUT_COLUMNS
# holds exactly 18 feature columns (no hot_score), so the 19-name
# unpacking raised ValueError; the name list must match element-wise.
(like_posts_num, forward_posts_num, comment_posts_num, publish_posts_num,
 commented_num, forwarded_num, liked_num, publish_time, follow_star_1,
 follow_star_2, follow_star_3, follow_star_4, follow_star_5, related_star_1,
 related_star_2, related_star_3, related_star_4, related_star_5) = INPUT_COLUMNS
wide侧特征列指定
# Wide-side (linear) columns: raw categorical star ids plus
# hand-crafted follow_star x related_star crosses for memorisation.
# NOTE: the original list also referenced `device_system`, which is not
# defined anywhere in this file (it is neither in CSV_COLUMNS nor in
# the INPUT_COLUMNS unpacking) and raised NameError — removed.
wide_columns = [
    tf.feature_column.crossed_column([follow_star_1, related_star_1],
                                     hash_bucket_size=int(1e3)),
    tf.feature_column.crossed_column([follow_star_1, related_star_2],
                                     hash_bucket_size=int(1e3)),
    tf.feature_column.crossed_column([follow_star_1, related_star_3],
                                     hash_bucket_size=int(1e3)),
    tf.feature_column.crossed_column([follow_star_1, related_star_4],
                                     hash_bucket_size=int(1e3)),
    tf.feature_column.crossed_column([follow_star_1, related_star_5],
                                     hash_bucket_size=int(1e3)),
    tf.feature_column.crossed_column([follow_star_2, related_star_1],
                                     hash_bucket_size=int(1e3)),
    tf.feature_column.crossed_column([follow_star_2, related_star_2],
                                     hash_bucket_size=int(1e3)),
    tf.feature_column.crossed_column([follow_star_2, related_star_3],
                                     hash_bucket_size=int(1e3)),
    tf.feature_column.crossed_column([follow_star_2, related_star_4],
                                     hash_bucket_size=int(1e3)),
    tf.feature_column.crossed_column([follow_star_2, related_star_5],
                                     hash_bucket_size=int(1e3)),
    # Three-way crosses use a larger hash space (1e4).
    tf.feature_column.crossed_column([follow_star_1, related_star_1, follow_star_2],
                                     hash_bucket_size=int(1e4)),
    tf.feature_column.crossed_column([follow_star_2, related_star_1, related_star_2],
                                     hash_bucket_size=int(1e4)),
    tf.feature_column.crossed_column([follow_star_3, related_star_1, related_star_2],
                                     hash_bucket_size=int(1e4)),
    tf.feature_column.crossed_column([follow_star_1, related_star_2, related_star_1],
                                     hash_bucket_size=int(1e4)),
    follow_star_1,
    follow_star_2,
    follow_star_3,
    follow_star_4,
    follow_star_5,
    related_star_1,
    related_star_2,
    related_star_3,
    related_star_4,
    related_star_5
]
deep侧特征指定
# Deep-side columns cover both sparse and dense features.  Because the
# hidden layers treat input magnitudes as meaningful, categorical
# features must be one-hot (indicator) encoded before feeding them in;
# dense numeric columns pass through unchanged.
deep_columns = [
tf.feature_column.indicator_column(follow_star_1),
tf.feature_column.indicator_column(follow_star_2),
tf.feature_column.indicator_column(follow_star_3),
tf.feature_column.indicator_column(follow_star_4),
tf.feature_column.indicator_column(follow_star_5),
tf.feature_column.indicator_column(related_star_1),
tf.feature_column.indicator_column(related_star_2),
tf.feature_column.indicator_column(related_star_3),
tf.feature_column.indicator_column(related_star_4),
tf.feature_column.indicator_column(related_star_5),
like_posts_num,
forward_posts_num,
comment_posts_num,
publish_posts_num,
commented_num,
forwarded_num,
liked_num,
publish_time
]
3.2.3.3 Wide&Deep模型构建
def build_estimator(config, embedding_size=8, hidden_units=None):
    """Build the Wide&Deep (DNNLinearCombinedClassifier) estimator.

    Fixes the original, which accepted ``hidden_units`` but silently
    ignored it.

    :param config: tf.estimator.RunConfig for the estimator
    :param embedding_size: width of the first hidden layer fed by the
        deep-side columns (used only when ``hidden_units`` is None)
    :param hidden_units: optional explicit list of hidden layer sizes;
        defaults to [embedding_size, 100, 70, 50, 25]
    :return: a configured tf.estimator.DNNLinearCombinedClassifier
    """
    dnn_hidden_units = hidden_units or [embedding_size] + [100, 70, 50, 25]
    return tf.estimator.DNNLinearCombinedClassifier(
        config=config,
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=dnn_hidden_units)