推荐引擎的评估指标采用均方根误差(RMSE):先计算预测值与真实值之差的平方的平均值,再取其平方根。如果评分范围为1星到5星,而RMSE的结果为1,则表示我们的预测值与用户给出的真实评分平均相差约1颗星。
# 测试过程 (test procedure)
def getDataFrameAndBookId():
    """Load book ratings from MySQL and build the user-by-book rating matrix.

    Reads at most 10000 ``(bookid, userid, score)`` rows from the ``comment``
    table of the ``douban`` database. Each distinct user becomes a row and
    each distinct book a column, both in order of first appearance. Cells
    without a rating (or with a NULL score) hold 0; when the same
    (user, book) pair occurs more than once, the last row read wins.

    Returns:
        tuple: ``(ratings, bookid, userid)`` where ``ratings`` is a NumPy
        matrix, ``bookid`` maps column index -> book id and ``userid`` maps
        row index -> user id.
    """
    import numpy as np  # local import so the block does not rely on a star import

    # NOTE(review): credentials are hard-coded (root with an empty password);
    # consider moving them to configuration.
    # Keyword arguments are used because recent PyMySQL releases no longer
    # accept positional connection parameters.
    db = pymysql.connect(host="localhost", user="root", password="",
                         database="douban")
    try:
        cursor = db.cursor()
        cursor.execute("select bookid,userid,score from comment limit 10000")
        record = cursor.fetchall()
    finally:
        # Close the connection even when the query fails.
        db.close()

    # First pass: give every user / book a dense index in order of appearance.
    row_of_user = {}
    col_of_book = {}
    for book, user, _score in record:
        row_of_user.setdefault(user, len(row_of_user))
        col_of_book.setdefault(book, len(col_of_book))

    # Second pass: fill a pre-sized array instead of growing a DataFrame one
    # cell at a time (which re-allocates on every new row or column).
    ratings = np.zeros((len(row_of_user), len(col_of_book)))
    for book, user, score in record:
        ratings[row_of_user[user], col_of_book[book]] = (
            0 if score is None else score
        )

    bookid = {col: book for book, col in col_of_book.items()}
    userid = {row: user for user, row in row_of_user.items()}
    return np.asmatrix(ratings), bookid, userid
def ecludSim(inA, inB):
    """Return a similarity in (0, 1] derived from the Euclidean distance.

    Identical vectors give 1.0; the value shrinks towards 0 as the distance
    between ``inA`` and ``inB`` grows.
    """
    distance = la.norm(inA - inB)
    return 1.0 / (1.0 + distance)
def pearsSim(inA, inB):
    """Return the Pearson correlation of two rating vectors rescaled to [0, 1].

    Args:
        inA: ratings for the first item (presumably a column vector, as
            passed by ``standEst`` -- confirm for other callers).
        inB: ratings for the second item, same length as ``inA``.

    Returns:
        float: 1.0 when fewer than 3 ratings are supplied, 0.0 when either
        vector has zero variance (the correlation is undefined there),
        otherwise ``0.5 + 0.5 * r`` with ``r`` the Pearson coefficient.
    """
    if len(inA) < 3:
        # Too few points for a meaningful correlation.
        return 1.0
    # ``var`` already reduces to a scalar, so no extra ``sum`` is needed; the
    # previous ``sum(var(...))`` only worked when NumPy's ``sum`` shadowed the
    # builtin.
    if var(inA) == 0 or var(inB) == 0:
        return 0.0
    # corrcoef returns a 2x2 matrix; [0][1] is the cross-correlation entry.
    return 0.5 + 0.5 * corrcoef(inA, inB, rowvar=0)[0][1]
def cosSim(inA, inB):
    """Return the cosine similarity of two column vectors rescaled to [0, 1].

    Args:
        inA: first vector; ``inA.T * inB`` must yield a 1x1 result, i.e. the
            inputs are expected to be NumPy matrix column vectors.
        inB: second vector, same shape as ``inA``.

    Returns:
        float: ``0.5 + 0.5 * cos(angle)``; 0.0 when either vector has zero
        norm, because the cosine is undefined there and would otherwise
        propagate NaN into the caller's running totals.
    """
    denom = la.norm(inA) * la.norm(inB)
    if denom == 0:
        return 0.0
    # Index the 1x1 product explicitly: calling float() on an array with
    # ndim > 0 is deprecated in recent NumPy releases.
    num = float((inA.T * inB)[0, 0])
    return 0.5 + 0.5 * (num / denom)
def standEst(dataMat, user, simMeas, item):
    """Predict ``user``'s rating of ``item`` as a similarity-weighted average.

    For every item ``j`` the user has rated (non-zero entry), the similarity
    between ``item`` and ``j`` is computed over the users who rated both, and
    the user's rating of ``j`` is weighted by that similarity.

    Args:
        dataMat: rating matrix (rows are users, columns are items); must
            support ``.A`` on column slices, i.e. a NumPy matrix.
        user: row index of the user.
        simMeas: callable ``(colA, colB) -> similarity``.
        item: column index of the item to predict.

    Returns:
        The predicted rating, or 0 when the accumulated similarity is zero.
    """
    n = shape(dataMat)[1]
    # Users who rated ``item``; this does not depend on j, so compute it once.
    ratedItem = dataMat[:, item].A > 0
    simTotal = 0.0
    ratSimTotal = 0.0
    for j in range(n):
        userRating = dataMat[user, j]
        if userRating == 0:
            # Only items the user has actually rated contribute.
            continue
        # Row indices of users who rated both ``item`` and ``j``.
        overLap = nonzero(logical_and(ratedItem, dataMat[:, j].A > 0))[0]
        if len(overLap) == 0:
            similarity = 0
        else:
            similarity = simMeas(dataMat[overLap, item], dataMat[overLap, j])
        simTotal += similarity  # accumulate the weights
        ratSimTotal += similarity * userRating  # accumulate weighted ratings
    if simTotal == 0:
        return 0
    return ratSimTotal / simTotal
def getSum(dataMat, user, item, simMeas=ecludSim, estMethod=standEst):
    """Return the squared error of predicting one known rating.

    The known rating ``dataMat[user, item]`` is temporarily set to 0, the
    rating is re-estimated with ``estMethod``, and the original value is put
    back afterwards.

    Args:
        dataMat: rating matrix; modified in place during the call and
            restored before returning.
        user: row index of the user.
        item: column index of the item.
        simMeas: similarity function handed to the estimator.
        estMethod: estimator called as ``estMethod(dataMat, user, simMeas, item)``.

    Returns:
        ``(actual - predicted) ** 2``.
    """
    ori = dataMat[user, item]  # the real rating
    dataMat[user, item] = 0  # hide it from the estimator
    try:
        # Honour the caller-supplied estimator instead of always using standEst.
        sim = estMethod(dataMat, user, simMeas, item)
    finally:
        # Restore the rating even if the estimator raises.
        dataMat[user, item] = ori
    return (ori - sim) ** 2
def test(n_samples=1000):
    """Estimate the RMSE of the rating predictor for three similarity measures.

    Draws ``n_samples`` random users (with replacement) and, for each of
    their existing ratings, hides the rating, predicts it with each
    similarity measure and accumulates the squared error.

    Args:
        n_samples: number of random user draws (default 1000).

    Returns:
        tuple: ``(rmse_pearson, rmse_euclidean, rmse_cosine)``; all 0.0 when
        no rating was evaluated.
    """
    # NOTE(review): getDataFrameFaster is not defined in the visible part of
    # the file; it is assumed to return (matrix, book dict, user dict).
    # The unused database connection that used to be opened here (and never
    # closed) has been removed.
    myMat, _bookDict, _userDict = getDataFrameFaster()
    m = shape(myMat)[0]
    if m == 0:
        return 0.0, 0.0, 0.0

    sumPears = 0.0
    sumEclud = 0.0
    sumCos = 0.0
    cnt = 0
    for _ in range(n_samples):
        # random.random() is in [0, 1) for both the stdlib and numpy.random,
        # so the index is always within [0, m - 1]; randint(0, m) can return
        # m with the stdlib module and index past the last row.
        i = int(random.random() * m)
        for j in nonzero(myMat[i, :] > 0)[1]:
            sumPears += getSum(myMat, i, j, simMeas=pearsSim)
            sumEclud += getSum(myMat, i, j, simMeas=ecludSim)
            sumCos += getSum(myMat, i, j, simMeas=cosSim)
            cnt += 1
    if cnt == 0:
        return 0.0, 0.0, 0.0
    # RMSE = sqrt(mean of squared errors); the square root is taken after
    # dividing by the number of predictions, not before.
    return ((sumPears / cnt) ** 0.5,
            (sumEclud / cnt) ** 0.5,
            (sumCos / cnt) ** 0.5)
可以看到,三种相似度计算方式的结果差异极小,且均不到0.1,说明我们的推荐系统的预测评分与用户真实评分之间的误差在0.1以内,可见该推荐系统能够比较精准地预测用户的喜好。相对而言,采用余弦相似度或欧氏距离时的均方根误差更小。