1 Download the Dataset
https://www.kaggle.com/sherinclaudia/movielens
2 Import Libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import tensorflow as tf
from sklearn.metrics import mean_squared_error, roc_auc_score
import pickle
from sklearn.base import BaseEstimator, TransformerMixin
from time import time
print(os.listdir("../input"))
ROOT = '../input/movielens/'
3 Preprocess the Dataset
def load_data(root, savefile):
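"""Load users.dat / movies.dat / ratings.dat, re-encode every categorical value
(user fields, MovieID, title words and genres) into one shared index space of size
feature_num, pad Title and Genres to fixed lengths, pickle the encoding metadata to
`savefile`, and return the encoded users, movies and ratings DataFrames."""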
users_title = ['UserID', 'Gender', 'Age', 'JobID', 'Zip-code']
users = pd.read_csv(root+'users.dat', sep='::', header=None, names=users_title, engine = 'python')
users.drop(columns=['Zip-code'], inplace=True)
users_ori = users.copy()
feature_num = 0
feature_dict = {}
for col in users.columns:
unique_val = users[col].unique()
feature_dict[col] = dict(zip(unique_val,range(feature_num, feature_num+len(unique_val))))
feature_num += len(unique_val)
users[col] = users[col].map(feature_dict[col])
movies_title = ['MovieID', 'Title', 'Genres']
movies = pd.read_csv(root+'movies.dat', sep='::', header=None, names=movies_title, engine = 'python')
movies_ori = movies.copy()
unique_val = movies['MovieID'].unique()
feature_dict['MovieID'] = dict(zip(unique_val,range(feature_num, feature_num+len(unique_val))))
feature_num += len(unique_val)
movies['MovieID'] = movies['MovieID'].map(feature_dict['MovieID'])
title_set = set()
genres_set = set()
for row in movies.itertuples():
title_set.update(row.Title.split(' ')[:-1])
genres_set.update(row.Genres.split('|'))
title_set.add('')
genres_set.add('')
title2int = {x:(i+feature_num) for i,x in enumerate(title_set)}
feature_num += len(title_set)
title2map = {x:[title2int[r] for r in x.split(' ')[:-1]] for x in set(movies['Title'])}
title_max_len = movies['Title'].str.split(' ').apply(len).max()
for key in title2map.keys():
l = len(title2map[key])
title2map[key] = title2map[key] + [title2int['']]*(title_max_len-l)
movies['Title'] = movies['Title'].map(title2map)
genres2int = {x:(i+feature_num) for i,x in enumerate(genres_set)}
feature_num += len(genres_set)
genres2map = {x:[genres2int[r] for r in x.split('|')] for x in set(movies['Genres'])}
genres_max_len = movies['Genres'].str.split('|').apply(len).max()
for key in genres2map.keys():
l = len(genres2map[key])
genres2map[key] = genres2map[key] + [genres2int['']]*(genres_max_len-l)
movies['Genres'] = movies['Genres'].map(genres2map)
feature_dict['Title'] = title2map
feature_dict['Genres'] = genres2map
ratings_title = ['UserID','MovieID', 'Rating', 'timestamps']
ratings = pd.read_csv(root+'ratings.dat', sep='::', header=None, names=ratings_title, engine = 'python')
ratings.drop(columns=['timestamps'], inplace=True)
ratings['UserID'] = ratings['UserID'].map(feature_dict['UserID'])
ratings['MovieID'] = ratings['MovieID'].map(feature_dict['MovieID'])
pickle.dump((feature_num, feature_dict, title_max_len, genres_max_len, users_ori, movies_ori)
, open(savefile, 'wb'))
return users, movies, ratings
users, movies, ratings = load_data(ROOT, 'preprocess.p')
data = pd.merge(pd.merge(ratings, users), movies)
feature_num, feature_dict, title_max_len, genres_max_len, users_ori, movies_ori \
= pickle.load(open('preprocess.p', mode='rb'))
user_columns = users.columns
movie_columns = ['MovieID', 'Genres']
text_columns = ['Title']
user_data = data[user_columns].values
movie_data = []
text_data = []
for row in data.itertuples():
movie_data.append([row.MovieID]+row.Genres)
text_data.append(row.Title)
label = data['Rating'].values
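To make the encoding above concrete, here is a minimal, self-contained sketch of the same technique on a toy DataFrame (the values and column subset are made up; only the method mirrors the pipeline above): every categorical value is mapped into one shared index space, and a multi-valued field such as Genres is padded to a fixed length with the index of the empty string.
import pandas as pd

# toy data; only the encoding technique matches the real pipeline above
toy = pd.DataFrame({'Gender': ['F', 'M', 'M'],
                    'Genres': ['Comedy|Romance', 'Drama', 'Comedy']})
feature_num, feature_dict = 0, {}
unique_val = toy['Gender'].unique()
feature_dict['Gender'] = dict(zip(unique_val, range(feature_num, feature_num + len(unique_val))))
feature_num += len(unique_val)
toy['Gender'] = toy['Gender'].map(feature_dict['Gender'])
genres_set = {g for row in toy['Genres'] for g in row.split('|')} | {''}
genres2int = {g: i + feature_num for i, g in enumerate(genres_set)}
genres_max_len = toy['Genres'].str.split('|').apply(len).max()
toy['Genres'] = toy['Genres'].map(
    lambda x: [genres2int[g] for g in x.split('|')]
              + [genres2int['']] * (genres_max_len - len(x.split('|'))))
print(toy)  # Gender becomes one index; Genres becomes a fixed-length list of indices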
4 Build the Model
class RecomCNN(BaseEstimator, TransformerMixin):
def __init__(self, feature_size,
user_size,
movie_size,
text_size=15,
filter_size=8,
embedding_size=8,
user_deep_layers=[200,200],
movie_deep_layers=[96,200],
user_dropout_keep_layers=[0.8,0.8,0.8],
movie_dropout_keep_layers=[0.8,0.8,0.8],
windows=[2,3,4,5],
deep_layers_activation=tf.nn.relu,
epoch=10,
batch_size=256,
learning_rate=0.001,
optimizer_type="adam",
verbose=1,
random_seed=2016,
loss_type="logloss",
eval_metric=roc_auc_score,
l2_reg=0.0,
isrestore=False,
save_path=''):
assert loss_type in ["logloss", "mse"], \
"loss_type can be either 'logloss' for classification task or 'mse' for regression task"
self.feature_size = feature_size
self.user_size = user_size
self.movie_size = movie_size
self.text_size = text_size
self.filter_size = filter_size
self.embedding_size = embedding_size
self.user_deep_layers = user_deep_layers
self.user_dropout_keep_layers = user_dropout_keep_layers
self.movie_deep_layers = movie_deep_layers
self.movie_dropout_keep_layers = movie_dropout_keep_layers
self.windows = windows
self.deep_layers_activation = deep_layers_activation
self.l2_reg = l2_reg
self.epoch = epoch
self.batch_size = batch_size
self.learning_rate = learning_rate
self.optimizer_type = optimizer_type
self.verbose = verbose
self.random_seed = random_seed
self.loss_type = loss_type
self.eval_metric = eval_metric
self.isrestore = isrestore
self.save_path = save_path
self._init_graph()
def _init_graph(self):
self.graph = tf.Graph()
with self.graph.as_default():
tf.set_random_seed(self.random_seed)
self.user_index = tf.placeholder(tf.int32, shape=[None, None],
name="user_index") # None * F
self.movie_index = tf.placeholder(tf.int32, shape=[None, None],
name="movie_index") # None * F
self.text_index = tf.placeholder(tf.int32, shape=[None, None],
name="text_index") # None * F
self.label = tf.placeholder(tf.float32, shape=[None, 1], name="label") # None * 1
self.user_dropout_keep = tf.placeholder(tf.float32, shape=[None], name="user_dropout_keep_layer")
self.movie_dropout_keep = tf.placeholder(tf.float32, shape=[None], name="movie_dropout_keep_layer")
self.weights = self._initialize_weights()
# model
# ---------- user part ----------
self.user_embeddings = tf.nn.embedding_lookup(self.weights["feature_embeddings"],
self.user_index) # None * U * K
self.user_deep = tf.reshape(self.user_embeddings, (-1, self.user_size * self.embedding_size))
self.user_deep = tf.nn.dropout(self.user_deep, self.user_dropout_keep[0])
for i in range(0, len(self.user_deep_layers)):
self.user_deep = tf.add(
tf.matmul(self.user_deep, self.weights["user_layer_%d" %i]), self.weights["user_bias_%d"%i]) # None * layer[i] * 1
self.user_deep = self.deep_layers_activation(self.user_deep)
self.user_deep = tf.nn.dropout(self.user_deep, self.user_dropout_keep[1+i]) # dropout at each Deep layer
# ---------- movie part ----------
# movie genres
self.movie_embeddings = tf.nn.embedding_lookup(self.weights["feature_embeddings"],
self.movie_index) # None * M * K
self.movie_deep = tf.reshape(self.movie_embeddings, (-1, self.movie_size * self.embedding_size))
self.movie_deep = tf.nn.dropout(self.movie_deep, self.movie_dropout_keep[0])
for i in range(0, len(self.movie_deep_layers)-1):
self.movie_deep = tf.add(
tf.matmul(self.movie_deep, self.weights["movie_layer_%d" %i]), self.weights["movie_bias_%d"%i]) # None * layer[i] * 1
self.movie_deep = self.deep_layers_activation(self.movie_deep)
self.movie_deep = tf.nn.dropout(self.movie_deep, self.movie_dropout_keep[1+i])
# movie text
self.text_embeddings = tf.nn.embedding_lookup(
self.weights["feature_embeddings"], self.text_index) # None * T * K
self.text_deep = tf.expand_dims(self.text_embeddings, 3) # None * T * K * 1
text_cnn_output = []
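# TextCNN over the title: each filter spans windows[i] words and the full embedding
# width, so the VALID convolution yields a (text_size - windows[i] + 1) x 1 map per
# filter; max-pooling over that whole height keeps one value per filter, and
# concatenating all window sizes gives filter_size * len(windows) title features.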
for i in range(len(self.windows)):
text_cnn = tf.nn.conv2d(
self.text_deep, self.weights['text_filter_%d' % i], strides=[1,1,self.embedding_size,1],
padding="VALID")
ksize = self.text_size - self.windows[i] + 1
text_cnn_output.append(
tf.reshape(
tf.nn.max_pool2d(text_cnn, [ksize,1], strides=[1,ksize,1,1], padding="VALID"),
(-1, self.filter_size)))
self.text_deep = tf.concat(text_cnn_output, axis=1) # None * (filter_size*len(windows))
self.movie_deep = tf.concat([self.text_deep, self.movie_deep], axis=1)
i = len(self.movie_deep_layers) - 1
self.movie_deep = tf.add(
tf.matmul(self.movie_deep, self.weights["movie_layer_%d" %i]), self.weights["movie_bias_%d"%i]) # None * layer[i] * 1
self.movie_deep = self.deep_layers_activation(self.movie_deep)
self.movie_deep = tf.nn.dropout(self.movie_deep, self.movie_dropout_keep[1+i])
# ---------- combine the two towers ----------
self.out = tf.reduce_sum(tf.multiply(self.user_deep, self.movie_deep), axis=1, keep_dims=True)
# loss
if self.loss_type == "logloss":
self.out = tf.nn.sigmoid(self.out)
self.loss = tf.losses.log_loss(self.label, self.out)
elif self.loss_type == "mse":
self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
# l2 regularization on weights
if self.l2_reg > 0:
for key in self.weights.keys():
if key == 'feature_embeddings':
continue
self.loss += tf.contrib.layers.l2_regularizer(
self.l2_reg)(self.weights[key])
# optimizer
if self.optimizer_type == "adam":
self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,
epsilon=1e-8).minimize(self.loss)
elif self.optimizer_type == "adagrad":
self.optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate,
initial_accumulator_value=1e-8).minimize(self.loss)
elif self.optimizer_type == "gd":
self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
elif self.optimizer_type == "momentum":
self.optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95).minimize(
self.loss)
elif self.optimizer_type == "ftrl":
self.optimizer = tf.train.FtrlOptimizer(learning_rate=self.learning_rate).minimize(
self.loss)
# init
self.saver = tf.train.Saver()
init = tf.global_variables_initializer()
self.sess = self._init_session()
if self.isrestore:
self.saver.restore(self.sess, self.save_path)
else:
self.sess.run(init)
# number of params
total_parameters = 0
for variable in self.weights.values():
shape = variable.get_shape()
variable_parameters = 1
for dim in shape:
variable_parameters *= dim.value
total_parameters += variable_parameters
if self.verbose > 0:
print("#params: %d" % total_parameters)
def _init_session(self):
config = tf.ConfigProto(device_count={"gpu": 0})
config.gpu_options.allow_growth = True
return tf.Session(config=config)
def _initialize_weights(self):
weights = dict()
# embeddings
weights["feature_embeddings"] = tf.Variable(
tf.random_normal([self.feature_size, self.embedding_size], 0.0, 0.01),
name="feature_embeddings") # feature_size * K
# user deep layers
num_layer = len(self.user_deep_layers)
input_size = self.user_size * self.embedding_size
glorot = np.sqrt(2.0 / (input_size + self.user_deep_layers[0]))
weights["user_layer_0"] = tf.Variable(
np.random.normal(loc=0, scale=glorot, size=(input_size, self.user_deep_layers[0])), dtype=np.float32)
weights["user_bias_0"] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(1, self.user_deep_layers[0])),
dtype=np.float32) # 1 * layers[0]
for i in range(1, num_layer):
glorot = np.sqrt(2.0 / (self.user_deep_layers[i-1] + self.user_deep_layers[i]))
weights["user_layer_%d" % i] = tf.Variable(
np.random.normal(loc=0, scale=glorot, size=(self.user_deep_layers[i-1], self.user_deep_layers[i])),
dtype=np.float32) # layers[i-1] * layers[i]
weights["user_bias_%d" % i] = tf.Variable(
np.random.normal(loc=0, scale=glorot, size=(1, self.user_deep_layers[i])),
dtype=np.float32) # 1 * layer[i]
# movie deep layers
num_layer = len(self.movie_deep_layers)
input_size = self.movie_size * self.embedding_size
glorot = np.sqrt(2.0 / (input_size + self.movie_deep_layers[0]))
weights["movie_layer_0"] = tf.Variable(
np.random.normal(loc=0, scale=glorot, size=(input_size, self.movie_deep_layers[0])), dtype=np.float32)
weights["movie_bias_0"] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(1, self.movie_deep_layers[0])),
dtype=np.float32) # 1 * layers[0]
for i in range(1, num_layer):
text_concat_size = 0
if i == num_layer-1:
text_concat_size = self.filter_size * len(self.windows)
glorot = np.sqrt(2.0 / (self.movie_deep_layers[i-1] + self.movie_deep_layers[i] + text_concat_size))
weights["movie_layer_%d" % i] = tf.Variable(
np.random.normal(loc=0, scale=glorot,
size=(self.movie_deep_layers[i-1]+text_concat_size, self.movie_deep_layers[i])),
dtype=np.float32) # layers[i-1] * layers[i]
weights["movie_bias_%d" % i] = tf.Variable(
np.random.normal(loc=0, scale=glorot, size=(1, self.movie_deep_layers[i])),
dtype=np.float32) # 1 * layer[i]
# text cnn layers
for i in range(len(self.windows)):
weights['text_filter_%d' % i] = tf.Variable(
tf.random_normal([self.windows[i], self.embedding_size, 1, self.filter_size], 0.0, 0.01))
return weights
def get_batch(self, Xu, Xm, Xt, y, batch_size, index):
start = index * batch_size
end = (index+1) * batch_size
end = end if end < len(y) else len(y)
return Xu[start:end], Xm[start:end], Xt[start:end], [[y_] for y_ in y[start:end]]
# shuffle the four arrays in unison so that rows stay aligned
def shuffle_in_unison_scary(self, a, b, c, d):
rng_state = np.random.get_state()
np.random.shuffle(a)
np.random.set_state(rng_state)
np.random.shuffle(b)
np.random.set_state(rng_state)
np.random.shuffle(c)
np.random.set_state(rng_state)
np.random.shuffle(d)
def fit_on_batch(self, Xu, Xm, Xt, y):
feed_dict = {self.user_index: Xu,
self.movie_index: Xm,
self.text_index: Xt,
self.label: y,
self.user_dropout_keep: self.user_dropout_keep_layers,
self.movie_dropout_keep: self.movie_dropout_keep_layers,}
opt = self.sess.run(self.optimizer, feed_dict=feed_dict)
def fit(self, Xu_train, Xm_train, Xt_train, y_train,
Xu_valid=None, Xm_valid=None, Xt_valid=None, y_valid=None, epoches=10, paras_save=False):
"""
self.user_index
self.movie_index
self.text_index
user_dropout_keep_layers
movie_dropout_keep_layers
"""
self.epoch = epoches
has_valid = y_valid is not None
for epoch in range(self.epoch):
t1 = time()
self.shuffle_in_unison_scary(Xu_train, Xm_train, Xt_train, y_train)
total_batch = int(np.ceil(len(y_train) / self.batch_size))
for i in range(total_batch):
Xu_batch, Xm_batch, Xt_batch, y_batch = self.get_batch(
Xu_train, Xm_train, Xt_train, y_train, self.batch_size, i)
self.fit_on_batch(Xu_batch, Xm_batch, Xt_batch, y_batch)
# evaluate training and validation datasets
if has_valid:
valid_result = self.evaluate(Xu_valid, Xm_valid, Xt_valid, y_valid)
# self.valid_result.append(valid_result)
if self.verbose > 0 and epoch % self.verbose == 0:
train_result = self.evaluate(Xu_train, Xm_train, Xt_train, y_train)
# self.train_result.append(train_result)
if has_valid:
print("[%d] train-result=%.4f, valid-result=%.4f [%.1f s]"
% (epoch + 1, train_result, valid_result, time() - t1))
else:
print("[%d] train-result=%.4f [%.1f s]"
% (epoch + 1, train_result, time() - t1))
if paras_save:
self.saver.save(self.sess, self.save_path)
def predict(self, Xu_train, Xm_train, Xt_train):
"""
:param Xi: list of list of feature indices of each sample in the dataset
:param Xv: list of list of feature values of each sample in the dataset
:return: predicted probability of each sample
"""
# dummy y
dummy_y = [1] * len(Xu_train)
total_batch = int(np.ceil(len(Xu_train) / self.batch_size))
y_pred = None
for i in range(total_batch):
Xu_batch, Xm_batch, Xt_batch, y_batch = self.get_batch(
Xu_train, Xm_train, Xt_train, dummy_y, self.batch_size, i)
feed_dict = {self.user_index: Xu_batch,
self.movie_index: Xm_batch,
self.text_index: Xt_batch,
self.user_dropout_keep: [1.0]*len(self.user_dropout_keep_layers),
self.movie_dropout_keep: [1.0]*len(self.movie_dropout_keep_layers),}
batch_out = self.sess.run(self.out, feed_dict=feed_dict)
if i == 0:
y_pred = batch_out.flatten()
else:
y_pred = np.concatenate((y_pred, batch_out.flatten()))
return y_pred
def evaluate(self, Xu, Xm, Xt, y):
"""
:param Xi: list of list of feature indices of each sample in the dataset
:param Xv: list of list of feature values of each sample in the dataset
:param y: label of each sample in the dataset
:return: metric of the evaluation
"""
y_pred = self.predict(Xu, Xm, Xt)
return self.eval_metric(y, y_pred)
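Each tower ends in a 200-dimensional vector, and the predicted rating is simply their dot product (the reduce_sum of the element-wise product in the combine step). A tiny NumPy illustration of that scoring step, with made-up 4-dimensional vectors standing in for the tower outputs:
import numpy as np

user_vec = np.array([0.2, -0.1, 0.7, 0.3])   # stand-in for a row of user_deep
movie_vec = np.array([0.5, 0.4, -0.2, 0.9])  # stand-in for a row of movie_deep
score = np.sum(user_vec * movie_vec)         # same as tf.reduce_sum(tf.multiply(...), axis=1)
print(score)                                 # one scalar rating estimate per (user, movie) pair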
5 Train the Model
model = RecomCNN(feature_size=feature_num,  # size of the shared index space built in load_data
user_size=len(user_data[0]),
movie_size=len(movie_data[0]),
text_size=len(text_data[0]),
filter_size=8,
embedding_size=32,
user_deep_layers=[200,200],
movie_deep_layers=[96,200],
user_dropout_keep_layers=[1,1,0.8],
movie_dropout_keep_layers=[1,1,0.8],
windows=[2,3,4,5],
deep_layers_activation=tf.nn.relu,
epoch=10,
batch_size=256,
learning_rate=0.001,
optimizer_type="adam",
verbose=1,
random_seed=2019,
loss_type="mse",
eval_metric=mean_squared_error,
l2_reg=0.0,
isrestore=False,
save_path='movie.ckpt')
model.fit(user_data, movie_data, text_data, label, epoches=10, paras_save=True)
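The call above trains on the full dataset, so fit() never prints a valid-result. As an alternative, here is a sketch of training with a held-out validation set; it assumes scikit-learn's train_test_split, which the original code does not use.
from sklearn.model_selection import train_test_split

# hypothetical 90/10 hold-out split; the model itself is unchanged
(Xu_tr, Xu_va, Xm_tr, Xm_va,
 Xt_tr, Xt_va, y_tr, y_va) = train_test_split(
    user_data, movie_data, text_data, label, test_size=0.1, random_state=2019)
model.fit(Xu_tr, Xm_tr, Xt_tr, y_tr,
          Xu_valid=Xu_va, Xm_valid=Xm_va, Xt_valid=Xt_va, y_valid=y_va,
          epoches=10, paras_save=True)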
6 Build User and Movie Feature Vectors
def add_feature_vector(users, movies, model, users_ori, movies_ori):
movie_data = [[movies['MovieID'][i]]+row for i,row in enumerate(movies['Genres'])]
text_data = [row for row in movies['Title']]
feed_dict = {model.user_index: users.values,
model.movie_index: movie_data,
model.text_index: text_data,
model.user_dropout_keep: [1.0]*3,
model.movie_dropout_keep: [1.0]*3,
}
user_matrix = model.user_deep.eval(feed_dict, session=model.sess)
movie_matrix = model.movie_deep.eval(feed_dict, session=model.sess)
users_ori['feat_vect'] = user_matrix.tolist()
movies_ori['feat_vect'] = movie_matrix.tolist()
return users_ori, movies_ori
users_with_feat_vector, movies_with_feat_vector = add_feature_vector(users, movies, model, users_ori, movies_ori)
users_with_feat_vector.set_index('UserID', inplace=True)
movies_with_feat_vector.set_index('MovieID', inplace=True)
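After this step, each row of the two DataFrames carries a 200-dimensional vector in its feat_vect column, indexed by the original IDs. For example (the IDs here are only illustrative):
print(users_with_feat_vector.loc[1, 'feat_vect'][:5])      # first 5 of the 200 values for UserID 1
print(movies_with_feat_vector.loc[1401, 'feat_vect'][:5])  # first 5 of the 200 values for MovieID 1401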
7 Recommendations
# Recommend movies similar to the given movie
def recommend_same_type_movie(movie_id, movies, top_k=20, recom_size=5):
'''
params: 'top_k' is the number of best-matching movies to consider; 'recom_size' is how many of them are randomly sampled and shown.
'''
movie_vec = np.asarray([movies.loc[movie_id, 'feat_vect']])
movie_matrix = np.asarray([row for row in movies['feat_vect']])
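# L2-normalize the query vector and the movie matrix so that the matrix product below is cosine similarity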
movie_vec = movie_vec / np.sqrt(np.sum(np.square(movie_vec)))
movie_matrix = movie_matrix / np.sqrt(np.sum(np.square(movie_matrix), axis=1, keepdims=True))
similar = pd.Series((movie_matrix@movie_vec.T).flatten(), index=movies.index)
prob = similar.sort_values(ascending=False)[1:1+top_k]
recom_index = np.random.choice(prob.index, size=recom_size, replace=False, p=(prob/sum(prob)).values)
print('Current movie: {}'.format(movie_id), movies.loc[movie_id, 'Title'], movies.loc[movie_id, 'Genres'])
print('Similar movies:')
for i in recom_index:
print(i, movies.loc[i, 'Title'], movies.loc[i, 'Genres'],
'similarity: {}'.format(round(similar[i], 2)))
# Recommend movies the user is likely to enjoy
def recommend_your_favorite_movie(user_id, users, movies, top_k=20, recom_size=5):
user_vec = np.asarray([users.loc[user_id, 'feat_vect']])
movie_matrix = np.asarray([row for row in movies['feat_vect']])
ratings = pd.Series((movie_matrix@user_vec.T).flatten(), index = movies.index)
prob = ratings.sort_values(ascending=False)[:top_k]
recom_index = np.random.choice(prob.index, size=recom_size, replace=False, p=(prob/sum(prob)).values)
print('Recommended for you:')
for i in recom_index:
print(i, movies.loc[i, 'Title'], movies.loc[i, 'Genres'],
'predicted preference: {}'.format(round(ratings[i], 2)))
# Recommend users who like the given movie, plus other movies those users enjoy
def recommend_other_favorite_movie(movie_id, users, movies, top_k=20, recom_size=5):
movie_vec = np.asarray([movies.loc[movie_id, 'feat_vect']])
user_matrix = np.asarray([row for row in users['feat_vect']])
movie_matrix = np.asarray([row for row in movies['feat_vect']])
users_ratings = pd.Series((user_matrix@movie_vec.T).flatten(), index=users.index)
user_top_ratings = users_ratings.sort_values(ascending=False)[:top_k]
user_recom_id = np.random.choice(user_top_ratings.index, size=recom_size, replace=False,
p=(user_top_ratings/sum(user_top_ratings)).values)
user_top_matrix = np.asarray([row for row in users.loc[user_top_ratings.index, 'feat_vect']])
movie_top_index = pd.Series(np.argpartition(movie_matrix@user_top_matrix.T, -10, axis=0)[-10:].flatten()).value_counts()  # each selected user's 10 highest-scoring movies
movie_recom_index = np.random.choice(movie_top_index.index, size=recom_size, replace=False,
p=(movie_top_index/sum(movie_top_index)).values)
print('The movie you watched:', movie_id, movies.loc[movie_id].Title, movies.loc[movie_id].Genres)
print('Users who like this movie:')
for i in user_recom_id:
print(i, users.loc[i, 'Gender'], users.loc[i, 'Age'],)
print('People who like this movie also like:')
for i in movie_recom_index:
print(movies.index[i], movies.iloc[i].Title, movies.iloc[i].Genres,)
recommend_same_type_movie(1401, movies_with_feat_vector)
recommend_your_favorite_movie(1041, users_with_feat_vector, movies_with_feat_vector)
recommend_other_favorite_movie(1401, users_with_feat_vector, movies_with_feat_vector)
8 Results
The movie you watched: 1401 Ghosts of Mississippi (1996) Drama
Users who like this movie:
2867 M 45
3902 M 25
2155 F 1
215 M 35
283 M 25
People who like this movie also like:
1323 Amityville 3-D (1983) Horror
1383 Adrenalin: Fear the Rush (1996) Action|Sci-Fi
3574 Carnosaur 3: Primal Species (1996) Horror|Sci-Fi
810 Kazaam (1996) Children's|Comedy|Fantasy
1495 Turbo: A Power Rangers Movie (1997) Action|Adventure|Children's
9 Reference
使用MovieLens数据集训练的电影推荐系统 (A movie recommendation system trained on the MovieLens dataset)