1、搜集数据集
https://grouplens.org/datasets/movielens/
2、准备数据
import pandas as pd
import numpy as np
import tensorflow as tf
# Load the MovieLens "ml-latest-small" tables. Both files are read relative to
# the working directory so the script does not depend on one machine's
# absolute path layout.
ratings_df = pd.read_csv('ml-latest-small/ratings.csv')
movies_df = pd.read_csv('ml-latest-small/movies.csv')

# Store each movie's row position in a column so ratings can refer to movies
# by row number instead of by movieId.
movies_df['movieRow'] = movies_df.index

# Keep only the movie columns used later and save the processed table.
movies_df = movies_df[['movieRow', 'movieId', 'title']]
movies_df.to_csv('moviesProcessed.csv', index=False, header=True, encoding='utf-8')

# Replace movieId in the ratings with the movie's row number: join on movieId,
# then keep only userId, movieRow and rating.
ratings_df = pd.merge(ratings_df, movies_df, on='movieId')
ratings_df = ratings_df[['userId', 'movieRow', 'rating']]
ratings_df.to_csv('ratingsProcessed.csv', index=False, header=True, encoding='utf-8')

# Preview the processed ratings (userId, movieRow, rating).
print(ratings_df.head())
# Build the rating matrix `rating` (one row per movie, one column per user)
# and the matching `record` matrix that marks which cells hold a rating.
# Both sizes are "largest index + 1" so the ids can be used directly as
# array indices.
userNo = ratings_df['userId'].max() + 1
movieNo = ratings_df['movieRow'].max() + 1
print(userNo)  # number of user columns (largest userId + 1)
print(movieNo)  # number of movie rows (largest movieRow + 1)

rating = np.zeros((movieNo, userNo))
# Fill every rating in one vectorised assignment instead of a Python-level
# iterrows() loop, which is very slow on large rating tables.
rating[
    ratings_df['movieRow'].values.astype(int),
    ratings_df['userId'].values.astype(int),
] = ratings_df['rating'].values

# Boolean mask of rated cells, then the same mask as integers:
# 0 = the user has not rated the movie, 1 = the user has rated it.
record = rating > 0
print(record)
record = np.array(record, dtype=int)
print(record)
3、构建模型
def normalizeRatings(rating, record):
    """Mean-normalise the ratings of each movie.

    Args:
        rating: 2-D array with one row per movie and one column per user.
        record: array of the same shape; a non-zero entry marks a cell that
            holds an actual rating.

    Returns:
        A tuple ``(rating_norm, rating_mean)``. ``rating_mean`` has shape
        ``(m, 1)`` and holds each movie's mean over its rated cells.
        ``rating_norm`` has the shape of ``rating`` and holds
        ``rating - mean`` in rated cells and 0 elsewhere. A movie without
        any rating gets mean 0 and an all-zero normalised row.
    """
    m, n = rating.shape  # m: number of movies, n: number of users
    rating_mean = np.zeros((m, 1))
    rating_norm = np.zeros((m, n))
    for i in range(m):
        idx = record[i, :] != 0
        if not idx.any():
            # No ratings for this movie: keep the zeros instead of taking
            # the mean of an empty slice, which would produce NaN.
            continue
        rating_mean[i] = np.mean(rating[i, idx])
        # Subtract the mean from the actual ratings (not from the zero
        # initial value) so rated cells hold "rating - mean".
        rating_norm[i, idx] = rating[i, idx] - rating_mean[i]
    return rating_norm, rating_mean
# Normalise the ratings, replacing any NaN in either result with 0
# (np.nan_to_num) before the values are used further.
rating_norm, rating_mean = (
    np.nan_to_num(part) for part in normalizeRatings(rating, record)
)
print(rating_norm)
print(rating_mean)
num_features = 10  # assume there are 10 latent movie "genres"

# Initialise the movie-feature matrix X and the user-preference matrix Theta
# from a normal distribution with standard deviation 0.35. Both use
# tf.random.normal so the two initialisers go through the same API.
X_parameters = tf.Variable(tf.random.normal([movieNo, num_features], stddev=0.35))
Theta_parameters = tf.Variable(tf.random.normal([userNo, num_features], stddev=0.35))

# Loss:
#   1/2 * sum(((X @ Theta^T - rating_norm) * record)^2)
#   + 1/2 * (sum(X^2) + sum(Theta^2))
# Multiplying by `record` restricts the squared error to rated cells; the
# second term is the regulariser on both parameter matrices.
prediction_error = (
    tf.matmul(X_parameters, Theta_parameters, transpose_b=True) - rating_norm
) * record
loss = 0.5 * tf.reduce_sum(prediction_error ** 2) \
    + 0.5 * (tf.reduce_sum(X_parameters ** 2) + tf.reduce_sum(Theta_parameters ** 2))
4、优化目标
# Minimise the loss with the Adam optimizer at a learning rate of 1e-4.
optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
train = optimizer.minimize(loss)
5、模型训练
# Record the loss as a scalar summary so its value per step can be plotted.
tf.summary.scalar('loss', loss)
summaryMerged = tf.summary.merge_all()
filename = './movie_tensorboard'
writer = tf.summary.FileWriter(filename)

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

for i in range(5000):  # number of training steps
    # Run one optimisation step and fetch the merged summary for it.
    _, movie_summary = sess.run([train, summaryMerged])
    writer.add_summary(movie_summary, i)  # loss value at step i

# Flush and close the event file so every logged step is written to disk.
# The session is left open on purpose: the trained variables are read from
# it after training.
writer.close()
6、查看训练结果
打开cmd,切换到保存数据的路径当中,
cd D:\Users\Administrator\PycharmProjects\myproject\movie_tensorboard
执行命令:tensorboard --logdir=./
在浏览器地址栏输入:127.0.0.1:6006,就可以在tensorboard中看到代价值随着迭代次数增加的变化情况。
7、模型评估
# Fetch the trained X and Theta values from the session.
Current_X_parameters, Current_Theta_parameters = sess.run([X_parameters, Theta_parameters])
# Predicted ratings: add the per-movie mean back onto the normalised scores.
predicts = np.dot(Current_X_parameters, Current_Theta_parameters.T) + rating_mean
# Root of the summed squared error over rated cells only. Multiplying by
# `record` leaves out unrated cells, whose 0 in `rating` is a placeholder and
# not a real score; the training loss applies the same mask.
errors = np.sqrt(np.sum(((predicts - rating) * record) ** 2))
print(errors)
8、构建完整的电影推荐系统
# Ask which user to recommend for; the number entered is used directly as the
# column index into `predicts`.
user_id = input('您要向哪位用户进行电影推荐?请输入用户编号:')
user_col = int(user_id)
# Movie row indices ordered by predicted rating, highest first.
sortedResult = predicts[:, user_col].argsort()[::-1]
print('为用户推荐的评分最高的20部电影是:'.center(80, '='))
# Print the 20 best-scoring movies; '%.2f' shows the score with two decimals.
for i in sortedResult[:20]:
    print('评分:%.2f,电影名:%s' % (predicts[i, user_col], movies_df.iloc[i]['title']))