程序功能:基于MovieLens 100k数据集,根据一位新被试对已观看电影给出的评分,预测其未观看电影的评分,并据此向该被试推荐电影。
代码:
# 2018-05-05
# 根据给出的数据,对新来的数据进行电影的预测评级
# Code by Han Sun, Southeast Univ. Nanjing
# 载入必要的库
import pandas as pd
import numpy as np
from scipy import optimize
from scipy.sparse import hstack
# Subroutine 1: mean-normalise the ratings
def normalizeRatings(Y, R):
    """Subtract each row's mean observed rating from its observed entries.

    Parameters
    ----------
    Y : array_like, shape (n_rows, n_cols)
        Rating matrix. Entries whose indicator in ``R`` is not 1 are ignored.
    R : array_like, shape (n_rows, n_cols)
        Indicator matrix; ``R[i, j] == 1`` marks ``Y[i, j]`` as observed.

    Returns
    -------
    Ynorm : numpy.ndarray, shape (n_rows, n_cols)
        ``Y[i, j] - Ymean[i]`` at observed positions, 0 everywhere else.
    Ymean : numpy.ndarray, shape (n_rows, 1)
        Mean of the observed entries of each row. A row without any observed
        entry gets mean 0 instead of triggering a division by zero.
    """
    Y = np.asarray(Y, dtype=float)
    rated = np.asarray(R) == 1

    # Per-row count and sum of the observed ratings only.
    counts = rated.sum(axis=1, keepdims=True)
    totals = np.where(rated, Y, 0.0).sum(axis=1, keepdims=True)

    # Divide only where at least one rating exists; other rows keep mean 0.
    Ymean = np.zeros((Y.shape[0], 1))
    np.divide(totals, counts, out=Ymean, where=counts > 0)

    Ynorm = np.where(rated, Y - Ymean, 0.0)
    return Ynorm, Ymean
# Subroutine 2: cost function
def CostFunc(params, *args):
    """Regularised squared-error cost of the collaborative-filtering model.

    Parameters
    ----------
    params : array_like
        Flat parameter vector: the ``nm * nf`` entries of ``X`` (row-major)
        followed by the ``nu * nf`` entries of ``Theta``. A column vector is
        accepted as well and is flattened first.
    *args : tuple
        ``(Y, R, nu, nm, nf, lb)`` where ``Y`` is the (nm, nu) rating matrix,
        ``R`` the (nm, nu) indicator matrix (1 = rated), ``nu`` the number of
        users, ``nm`` the number of movies, ``nf`` the number of latent
        features and ``lb`` the regularisation strength.

    Returns
    -------
    float
        ``0.5 * sum((X.Theta^T - Y)^2 over rated entries)
        + lb/2 * sum(X^2) + lb/2 * sum(Theta^2)``.
    """
    Y, R, nu, nm, nf, lb = args

    # Unpack the flat vector back into the two factor matrices.
    params = np.asarray(params, dtype=float).ravel()
    X = params[:nm * nf].reshape((nm, nf))
    Theta = params[nm * nf:].reshape((nu, nf))

    # Prediction error restricted to the entries that were actually rated.
    rated = np.asarray(R) == 1
    err = (X.dot(Theta.T) - np.asarray(Y, dtype=float))[rated]

    # The cost has three parts: data fit, X penalty and Theta penalty.
    cost_fit = 0.5 * np.sum(err ** 2)
    cost_X = 0.5 * lb * np.sum(X ** 2)
    cost_Theta = 0.5 * lb * np.sum(Theta ** 2)

    # Return a plain scalar, which is what the optimiser expects.
    return float(cost_fit + cost_X + cost_Theta)
# Subroutine 3: gradient of the cost function
def CostFuncGrad(params, *args):
    """Gradient of the regularised collaborative-filtering cost.

    Parameters
    ----------
    params : array_like
        Flat parameter vector: the ``nm * nf`` entries of ``X`` (row-major)
        followed by the ``nu * nf`` entries of ``Theta``. A column vector is
        accepted as well and is flattened first.
    *args : tuple
        ``(Y, R, nu, nm, nf, lb)`` where ``Y`` is the (nm, nu) rating matrix,
        ``R`` the (nm, nu) indicator matrix (1 = rated), ``nu`` the number of
        users, ``nm`` the number of movies, ``nf`` the number of latent
        features and ``lb`` the regularisation strength.

    Returns
    -------
    numpy.ndarray, shape (nm * nf + nu * nf,)
        Gradient with respect to ``X`` followed by the gradient with respect
        to ``Theta``, laid out exactly like ``params``.
    """
    Y, R, nu, nm, nf, lb = args

    # Unpack the flat vector back into the two factor matrices.
    params = np.asarray(params, dtype=float).ravel()
    X = params[:nm * nf].reshape((nm, nf))
    Theta = params[nm * nf:].reshape((nu, nf))

    # Zero the error at unrated positions so they contribute nothing to the
    # matrix products below; this replaces the per-row / per-column loops.
    rated = np.asarray(R) == 1
    err = np.where(rated, X.dot(Theta.T) - np.asarray(Y, dtype=float), 0.0)

    X_grad = err.dot(Theta) + lb * X
    Theta_grad = err.T.dot(X) + lb * Theta

    return np.concatenate((X_grad.ravel(), Theta_grad.ravel()))
# Main program
# ========= Step 1: load the data ============
s0 = "MOOC_用Python玩转数据".center(40,'=')
print(s0)
s00 = "感谢老师的认真严谨,收获很多".center(40,'=')
print(s00)
s1 = "读入数据中".center(40,'=')
print(s1)
print('.'*10)

# Directory that holds the MovieLens 100k files (u.user, u.item, u.data).
# Kept in one place so the location only has to be changed once.
DATA_DIR = 'C:/Users/Rainbow Sun/Desktop/Mooc_Data Science/Project1/ml-100k/'

# Load the user table (pipe-separated).
u_names = ['user_id', 'age', 'gender', 'occupation', 'zip code']
User_Info = pd.read_table(DATA_DIR + 'u.user', sep='|',
                          names=u_names, engine='python')

# Load the movie table (pipe-separated): 5 descriptive fields followed by
# 19 genre flags. 'Animation' and "Children's" are separate columns; the
# previously missing comma fused them and shifted every later genre label.
m_names = [
    'movie/item id', 'movie title', 'release date', 'video release date',
    'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation',
    "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
    'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
    'Sci-Fi', 'Thriller', 'War', 'Western',
]
# NOTE(review): u.item is presumably ISO-8859-1 encoded (accented movie
# titles), so the encoding is stated explicitly - confirm against the
# dataset copy in use.
Movie_Info = pd.read_table(DATA_DIR + 'u.item', sep='|',
                           names=m_names, engine='python',
                           encoding='latin-1')

# Load the rating table (tab-separated).
r_names = ['user id', 'item id', 'rating', 'timestamp']
Rating_Info = pd.read_table(DATA_DIR + 'u.data', sep='\t',
                            names=r_names, engine='python')
# ========= Step 2: build the movie-by-user rating matrices and a custom test column ============
s2 = "已观看电影评级".center(40,'=')
print(s2)
# NOTE: the custom ratings below can be chosen freely.
# Indicator matrices hold 1 where a rating exists and 0 where it does not.
num_users = len(User_Info)
num_movies = len(Movie_Info)
Rating_Init = np.zeros((num_movies, num_users))       # rows are movies, columns are users
Rating_Bool_Init = np.zeros((num_movies, num_users))  # indicator matrix for Rating_Init
Rating_Mine = np.zeros((num_movies, 1))               # ratings of the new (custom) user
Rating_Bool_Mine = np.zeros((num_movies, 1))          # indicator column for Rating_Mine

# Movies the new user has watched (0-based row indices) and their ratings.
# Alternative example input:
# Movies_Mine = [609,452,1582,1517,1515,1463,123,448,1539,872,1605,1103,772,1643,1373,99,746,174,333,1438]
# R_Mine = [5,1,5,3,1,1,3,5,5,1,5,5,2,4,1,2,5,4,5,3]
Movies_Mine = [0, 97, 6, 11, 53, 63, 65, 68, 182, 225, 354]
R_Mine = [4, 2, 3, 5, 4, 5, 3, 5, 4, 5, 5]
Movies_Watched = len(Movies_Mine)  # number of movies watched
if len(R_Mine) != Movies_Watched:
    raise ValueError("Movies_Mine and R_Mine must have the same length")
Rating_Mine[Movies_Mine, 0] = R_Mine
Rating_Bool_Mine[Movies_Mine, 0] = 1

# Keep only the columns needed to fill the rating matrix.
# A stored rating goes into Rating_Init; missing ratings stay 0.
Ratings_df = Rating_Info[['user id', 'item id', 'rating']].copy()
Ratings_array = Ratings_df.values  # raw values of the DataFrame
# The ids in the file are 1-based, the matrix indices 0-based.
user_index = Ratings_array[:, 0] - 1
item_index = Ratings_array[:, 1] - 1
# One vectorised assignment instead of a Python loop over every rating.
Rating_Init[item_index, user_index] = Ratings_array[:, 2]
Rating_Bool_Init[item_index, user_index] = 1

# Put the custom user in column 0, in front of the dataset users.
Rating_Merge = np.hstack((Rating_Mine, Rating_Init))
Rating_Bool_Merge = np.hstack((Rating_Bool_Mine, Rating_Bool_Init))
# Mean-normalise the merged rating matrix.
(Rating_Norm, Rating_Mean) = normalizeRatings(Rating_Merge, Rating_Bool_Merge)

# Print the movies the custom user has already rated.
Movies_df = Movie_Info[['movie/item id', 'movie title']].copy()
Movies_array = Movies_df.values
for item in Movies_Mine:
    print("观看过的电影为{},评级为{:.4f}".format(Movies_array[item,1],Rating_Merge[item,0]))

# Problem dimensions used by the optimiser: users, movies, latent features.
num_users_merge = Rating_Merge.shape[1]
num_movies_merge = Rating_Merge.shape[0]
num_features_merge = 10
# ========= Step 3: initialise the factor matrices and optimise ============
s3 = "推荐系统训练".center(40,'=')
print(s3)

# Random starting point for the movie features (X) and user weights (Theta).
X = np.random.randn(num_movies_merge, num_features_merge)
Theta = np.random.randn(num_users_merge, num_features_merge)

# Stack both matrices into a single column vector for the optimiser.
X_Res = X.reshape((-1, 1))
Theta_Res = Theta.reshape((-1, 1))
Init_Param = np.concatenate((X_Res, Theta_Res), axis=0)

# Extra arguments forwarded unchanged to the cost and gradient functions.
lambda1 = 10
args = (
    Rating_Norm,
    Rating_Bool_Merge,
    num_users_merge,
    num_movies_merge,
    num_features_merge,
    lambda1,
)

# Run conjugate-gradient minimisation to obtain the fitted parameters.
theta = optimize.fmin_cg(CostFunc, x0=Init_Param, fprime=CostFuncGrad,
                         args=args, maxiter=100)

# Split the flat result back into the two factor matrices.
split_at = num_movies_merge * num_features_merge
X = theta[:split_at].reshape((num_movies_merge, num_features_merge))
Theta = theta[split_at:].reshape((num_users_merge, num_features_merge))
# ========= Step 4: recommendations =========================
s4 = "推荐系统结果".center(40,'=')
print(s4)
# Predicted (mean-normalised) rating of every movie for every user.
p = X.dot(Theta.T)
# Column 0 is the custom user; add the per-movie mean back.
# Plain ndarrays are used throughout (np.mat no longer exists in NumPy 2.0).
my_prediction = p[:, [0]] + Rating_Mean
# Sort the predictions in descending order.
# NOTE: movies the custom user has already rated are not filtered out.
ix = np.argsort(-my_prediction, axis=0)
r = -np.sort(-my_prediction, axis=0)
s = "推荐系统训练中"
print('推荐系统学习完成')
# Print the title and predicted rating of the top-ranked movies (at most 10).
for ind3 in range(min(10, r.shape[0])):
    print("推荐的电影排名{},电影名称{},评级{:.4f}".format((ind3+1),Movies_array[ix[ind3, 0],1],\
    r[ind3,0]))
# End of script
s5 = "项目代码完成_Version1".center(40,'=')
print(s5)
s6 = "Coding by Sun, 2018-05-12".center(40,'=')
print(s6)
s7 = "谢谢老师评阅".center(40,'=')
print(s7)