本文的实现参考了 Yehuda Koren 于 2008 年发表的论文《Factorization Meets the Neighborhood: a Multifaceted Collaborative Filtering Model》。
代码如下:
'''
Version:1.0
Created on 2014-02-25
@Author:Dior
'''
import random
import math
try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle  # Python 3: cPickle was merged into pickle


class SVD():
    """Biased matrix-factorization rating predictor trained with SGD.

    A rating is predicted as ``av + bu[user] + bi[item] + pu[user] . qi[item]``
    and clipped to the range [1, 5] (see ``predictScore``).

    Every data file is read as tab-separated text whose first three columns
    are ``user``, ``item`` and ``rating``. ``train`` and ``test`` turn an id
    into a list index with ``int(id) - 1`` while the parameter lists are sized
    by the number of distinct ids in ``allfile``, so the ids are assumed to
    be the contiguous integers ``1..N``.
    """

    def __init__(self, allfile, trainfile, testfile, factorNum=10):
        """Store the file paths, count users/items and initialise the model.

        allfile   -- path of the file holding all ratings (used for counting)
        trainfile -- path of the training-set file
        testfile  -- path of the test-set file
        factorNum -- number of latent factors per user/item vector
        """
        # all data file
        self.allfile = allfile
        # training set file
        self.trainfile = trainfile
        # testing set file
        self.testfile = testfile
        # number of latent factors
        self.factorNum = factorNum
        # number of distinct users / items found in allfile
        self.userNum = self.getUserNum()
        self.itemNum = self.getItemNum()
        # SGD learning rate
        self.learningRate = 0.01
        # the regularization lambda
        self.regularization = 0.05
        # initialize the model and parameters
        self.initModel()

    def _countDistinct(self, column):
        """Return how many distinct values appear in `column` of allfile."""
        seen = set()
        # 'with' guarantees the handle is closed even if a line is malformed.
        with open(self.allfile) as fi:
            for line in fi:
                seen.add(line.split('\t')[column].strip())
        return len(seen)

    def _parseRating(self, line):
        """Split one data line into (user index, item index, rating).

        The ids in the file are 1-based; the returned indices are 0-based.
        """
        content = line.split('\t')
        user = int(content[0].strip()) - 1
        item = int(content[1].strip()) - 1
        rating = float(content[2].strip())
        return user, item, rating

    def getUserNum(self):
        """Return the number of distinct users (column 0) in allfile."""
        return self._countDistinct(0)

    def getItemNum(self):
        """Return the number of distinct items (column 1) in allfile."""
        return self._countDistinct(1)

    def initModel(self):
        """Initialise the global average, the biases and the factor vectors.

        Biases start at 0.0; every factor starts as a random value in
        [0, 0.1 / sqrt(factorNum)).
        """
        self.av = self.average(self.trainfile)
        self.bu = [0.0 for _ in range(self.userNum)]
        self.bi = [0.0 for _ in range(self.itemNum)]
        scale = math.sqrt(self.factorNum)
        self.pu = [[(0.1 * random.random() / scale) for _ in range(self.factorNum)]
                   for _ in range(self.userNum)]
        self.qi = [[0.1 * random.random() / scale for _ in range(self.factorNum)]
                   for _ in range(self.itemNum)]
        print("Initialize end.The user number is:%d,item number is:%d,the average score is:%f" % (self.userNum, self.itemNum, self.av))

    def train(self, iterTimes=100):
        """Run up to `iterTimes` SGD passes over the training file.

        After every pass the RMSE on the test file is printed; training stops
        early as soon as that RMSE is higher than the one of the previous pass.
        """
        print("Beginning to train the model......")
        preRmse = 10000.0
        for step in range(iterTimes):
            with open(self.trainfile, 'r') as fi:
                for line in fi:
                    user, item, rating = self._parseRating(line)
                    # calculate the predicted score
                    pscore = self.predictScore(self.av, self.bu[user], self.bi[item],
                                               self.pu[user], self.qi[item])
                    # the delta between the real score and the predicted score
                    eui = rating - pscore
                    # update the user and item rating biases
                    self.bu[user] += self.learningRate * (eui - self.regularization * self.bu[user])
                    self.bi[item] += self.learningRate * (eui - self.regularization * self.bi[item])
                    userVec = self.pu[user]
                    itemVec = self.qi[item]
                    for k in range(self.factorNum):
                        # Keep the old user factor so the item update below
                        # uses the value from before this step.
                        temp = userVec[k]
                        # The user factor moves along the factor of the rated
                        # ITEM (the original indexed qi by the user here).
                        userVec[k] += self.learningRate * (eui * itemVec[k] - self.regularization * userVec[k])
                        itemVec[k] += self.learningRate * (temp * eui - self.regularization * itemVec[k])
            # calculate the current rmse
            curRmse = self.test(self.av, self.bu, self.bi, self.pu, self.qi)
            print("Iteration %d times,RMSE is : %f" % (step + 1, curRmse))
            if curRmse > preRmse:
                break
            preRmse = curRmse
        print("Iteration finished!")

    def test(self, av, bu, bi, pu, qi):
        """Return the RMSE of the given parameters on the test file.

        av     -- global average rating
        bu, bi -- user / item bias lists
        pu, qi -- user / item factor-vector lists
        Raises ZeroDivisionError if the test file has no lines.
        """
        squaredError = 0.0
        cnt = 0
        with open(self.testfile) as fi:
            for line in fi:
                cnt += 1
                user, item, score = self._parseRating(line)
                pscore = self.predictScore(av, bu[user], bi[item], pu[user], qi[item])
                squaredError += math.pow(score - pscore, 2)
        return math.sqrt(squaredError / cnt)

    def average(self, filename):
        """Return the mean of the rating column (column 2) of `filename`.

        Raises ZeroDivisionError if the file has no lines.
        """
        total = 0.0
        cnt = 0
        with open(filename) as fi:
            for line in fi:
                cnt += 1
                total += float(line.split('\t')[2].strip())
        return total / cnt

    def innerProduct(self, v1, v2):
        """Return the inner product of the two vectors `v1` and `v2`."""
        return sum((a * b for a, b in zip(v1, v2)), 0.0)

    def predictScore(self, av, bu, bi, pu, qi):
        """Return av + bu + bi + pu.qi clipped to the range [1, 5].

        Here bu/bi are the scalar biases and pu/qi the factor vectors of one
        user and one item.
        """
        pscore = av + bu + bi + self.innerProduct(pu, qi)
        if pscore < 1:
            pscore = 1
        if pscore > 5:
            pscore = 5
        return pscore


if __name__ == '__main__':
    s = SVD("data\\u.data", "data\\ua.base", "data\\ua.test")
    #print s.userNum,s.itemNum
    #print s.average("data\\ua.base")
    s.train()
Initialize end.The user number is:943,item number is:1682,the average score is:3.523827
Beginning to train the model......
Iteration 1 times,RMSE is : 1.002799
Iteration 2 times,RMSE is : 0.982096
Iteration 3 times,RMSE is : 0.972882
Iteration 4 times,RMSE is : 0.967720
Iteration 5 times,RMSE is : 0.964554
Iteration 6 times,RMSE is : 0.962498
Iteration 7 times,RMSE is : 0.961116
Iteration 8 times,RMSE is : 0.960166
Iteration 9 times,RMSE is : 0.959482
Iteration 10 times,RMSE is : 0.958933
Iteration 11 times,RMSE is : 0.958416
Iteration 12 times,RMSE is : 0.957814
Iteration 13 times,RMSE is : 0.956986
Iteration 14 times,RMSE is : 0.955798
Iteration 15 times,RMSE is : 0.954165
Iteration 16 times,RMSE is : 0.952135
Iteration 17 times,RMSE is : 0.949907
Iteration 18 times,RMSE is : 0.947718
Iteration 19 times,RMSE is : 0.945695
Iteration 20 times,RMSE is : 0.943901
Iteration 21 times,RMSE is : 0.942296
Iteration 22 times,RMSE is : 0.940793
Iteration 23 times,RMSE is : 0.939336
Iteration 24 times,RMSE is : 0.937880
Iteration 25 times,RMSE is : 0.936398
Iteration 26 times,RMSE is : 0.934883
Iteration 27 times,RMSE is : 0.933353
Iteration 28 times,RMSE is : 0.931833
Iteration 29 times,RMSE is : 0.930368
Iteration 30 times,RMSE is : 0.928991
Iteration 31 times,RMSE is : 0.927724
Iteration 32 times,RMSE is : 0.926570
Iteration 33 times,RMSE is : 0.925547
Iteration 34 times,RMSE is : 0.924663
Iteration 35 times,RMSE is : 0.923920
Iteration 36 times,RMSE is : 0.923318
Iteration 37 times,RMSE is : 0.922853
Iteration 38 times,RMSE is : 0.922525
Iteration 39 times,RMSE is : 0.922330
Iteration 40 times,RMSE is : 0.922255
Iteration 41 times,RMSE is : 0.922297
Iteration finished!
训练在第41次迭代时提前停止,因为测试集上的RMSE由0.922255回升到0.922297。最终的RMSE约为0.922,可见效果一般,后面会尝试改进。