本文的实现参考了 Yehuda Koren 2008 年的论文《Factorization Meets the Neighborhood: a Multifaceted Collaborative Filtering Model》。
代码如下:
'''
Version:1.0
Created on 2014-02-25
@Author:Dior
'''
import random
import math
import cPickle as pickle
class SVD(object):
    """Rating predictor built from a global mean, biases and latent factors.

    A rating is predicted as ``av + bu[user] + bi[item] + pu[user] . qi[item]``
    and clamped to the range [1, 5].  The parameters are fitted with
    stochastic gradient descent over the training file.

    All data files are tab-separated text with one rating per line in the
    order ``user  item  rating``.  User and item ids are converted with
    ``int(id) - 1`` to index the parameter tables.

    NOTE(review): the tables are sized by the number of *distinct* ids found
    in ``allfile``, so ids are assumed to be the contiguous integers 1..N --
    confirm against the data set in use.
    """

    def __init__(self, allfile, trainfile, testfile, factorNum=10):
        """Store the configuration and initialize the model.

        Args:
            allfile: path of the file holding every rating; it is only used
                to count the distinct users and items.
            trainfile: path of the training set.
            testfile: path of the test set used to compute the RMSE.
            factorNum: number of latent factors per user and per item.
        """
        self.allfile = allfile
        self.trainfile = trainfile
        self.testfile = testfile
        self.factorNum = factorNum
        self.userNum = self.getUserNum()
        self.itemNum = self.getItemNum()
        # SGD step size
        self.learningRate = 0.01
        # L2 regularization weight (lambda)
        self.regularization = 0.05
        self.initModel()

    def _iterFields(self, filename):
        """Yield the stripped tab-separated fields of each non-blank line."""
        # The with-block closes the file even when the consumer stops early
        # or a line fails to parse.
        with open(filename, 'r') as handle:
            for line in handle:
                if line.strip():
                    yield [field.strip() for field in line.split('\t')]

    def getUserNum(self):
        """Return the number of distinct user ids (first column) in allfile."""
        return len(set(fields[0] for fields in self._iterFields(self.allfile)))

    def getItemNum(self):
        """Return the number of distinct item ids (second column) in allfile."""
        return len(set(fields[1] for fields in self._iterFields(self.allfile)))

    def initModel(self):
        """Compute the training mean and create the parameter tables.

        Biases start at 0.0; every latent factor starts at a random value in
        [0, 0.1 / sqrt(factorNum)).
        """
        self.av = self.average(self.trainfile)
        self.bu = [0.0 for _ in range(self.userNum)]
        self.bi = [0.0 for _ in range(self.itemNum)]
        scale = math.sqrt(self.factorNum)
        self.pu = [[0.1 * random.random() / scale for _ in range(self.factorNum)]
                   for _ in range(self.userNum)]
        self.qi = [[0.1 * random.random() / scale for _ in range(self.factorNum)]
                   for _ in range(self.itemNum)]
        print("Initialize end.The user number is:%d,item number is:%d,the average score is:%f" % (self.userNum, self.itemNum, self.av))

    def train(self, iterTimes=100):
        """Fit the parameters with SGD over the training file.

        After each pass the RMSE on the test file is printed; training stops
        as soon as that RMSE is larger than the one of the previous pass.

        Args:
            iterTimes: maximum number of passes over the training file.
        """
        print("Beginning to train the model......")
        preRmse = 10000.0
        for epoch in range(iterTimes):
            for fields in self._iterFields(self.trainfile):
                user = int(fields[0]) - 1
                item = int(fields[1]) - 1
                rating = float(fields[2])
                pscore = self.predictScore(self.av, self.bu[user], self.bi[item],
                                           self.pu[user], self.qi[item])
                # prediction error for this (user, item) pair
                eui = rating - pscore
                # update the user and item rating biases
                self.bu[user] += self.learningRate * (eui - self.regularization * self.bu[user])
                self.bi[item] += self.learningRate * (eui - self.regularization * self.bi[item])
                userFactors = self.pu[user]
                itemFactors = self.qi[item]
                for k in range(self.factorNum):
                    # keep the old user factor: the item update must use it
                    oldUserFactor = userFactors[k]
                    # The gradient w.r.t. pu[user][k] is driven by the
                    # factor of the rated *item*, i.e. qi[item][k]
                    # (indexing qi by the user index is incorrect).
                    userFactors[k] += self.learningRate * (
                        eui * itemFactors[k] - self.regularization * userFactors[k])
                    itemFactors[k] += self.learningRate * (
                        oldUserFactor * eui - self.regularization * itemFactors[k])
            # RMSE on the test file after this pass
            curRmse = self.test(self.av, self.bu, self.bi, self.pu, self.qi)
            print("Iteration %d times,RMSE is : %f" % (epoch + 1, curRmse))
            if curRmse > preRmse:
                break
            preRmse = curRmse
        print("Iteration finished!")

    def test(self, av, bu, bi, pu, qi):
        """Return the RMSE of the given parameters on the test file.

        Args:
            av: global average rating.
            bu: list of user biases.
            bi: list of item biases.
            pu: list of user factor vectors.
            qi: list of item factor vectors.

        Raises:
            ZeroDivisionError: if the test file contains no ratings.
        """
        squaredError = 0.0
        cnt = 0
        for fields in self._iterFields(self.testfile):
            cnt += 1
            user = int(fields[0]) - 1
            item = int(fields[1]) - 1
            score = float(fields[2])
            pscore = self.predictScore(av, bu[user], bi[item], pu[user], qi[item])
            squaredError += math.pow(score - pscore, 2)
        return math.sqrt(squaredError / cnt)

    def average(self, filename):
        """Return the mean of the rating column (third field) of filename.

        Raises:
            ZeroDivisionError: if the file contains no ratings.
        """
        total = 0.0
        cnt = 0
        for fields in self._iterFields(filename):
            cnt += 1
            total += float(fields[2])
        return total / cnt

    def innerProduct(self, v1, v2):
        """Return the inner product of v1 and v2 over the indices of v1."""
        result = 0.0
        for i, value in enumerate(v1):
            result += value * v2[i]
        return result

    def predictScore(self, av, bu, bi, pu, qi):
        """Return av + bu + bi + pu . qi, clamped to the range [1, 5].

        Args:
            av: global average rating.
            bu: bias of the user.
            bi: bias of the item.
            pu: factor vector of the user.
            qi: factor vector of the item.
        """
        pscore = av + bu + bi + self.innerProduct(pu, qi)
        if pscore < 1:
            return 1
        if pscore > 5:
            return 5
        return pscore
if __name__ == '__main__':
    # Script entry point: train the model on the three rating files found
    # in the "data" directory.
    import os

    # os.path.join produces the platform's separator, so the same script
    # finds the files on Windows ("data\\u.data") and elsewhere.
    dataDir = "data"
    s = SVD(os.path.join(dataDir, "u.data"),
            os.path.join(dataDir, "ua.base"),
            os.path.join(dataDir, "ua.test"))
    s.train()
Initialize end.The user number is:943,item number is:1682,the average score is:3.523827
Beginning to train the model......
Iteration 1 times,RMSE is : 1.002799
Iteration 2 times,RMSE is : 0.982096
Iteration 3 times,RMSE is : 0.972882
Iteration 4 times,RMSE is : 0.967720
Iteration 5 times,RMSE is : 0.964554
Iteration 6 times,RMSE is : 0.962498
Iteration 7 times,RMSE is : 0.961116
Iteration 8 times,RMSE is : 0.960166
Iteration 9 times,RMSE is : 0.959482
Iteration 10 times,RMSE is : 0.958933
Iteration 11 times,RMSE is : 0.958416
Iteration 12 times,RMSE is : 0.957814
Iteration 13 times,RMSE is : 0.956986
Iteration 14 times,RMSE is : 0.955798
Iteration 15 times,RMSE is : 0.954165
Iteration 16 times,RMSE is : 0.952135
Iteration 17 times,RMSE is : 0.949907
Iteration 18 times,RMSE is : 0.947718
Iteration 19 times,RMSE is : 0.945695
Iteration 20 times,RMSE is : 0.943901
Iteration 21 times,RMSE is : 0.942296
Iteration 22 times,RMSE is : 0.940793
Iteration 23 times,RMSE is : 0.939336
Iteration 24 times,RMSE is : 0.937880
Iteration 25 times,RMSE is : 0.936398
Iteration 26 times,RMSE is : 0.934883
Iteration 27 times,RMSE is : 0.933353
Iteration 28 times,RMSE is : 0.931833
Iteration 29 times,RMSE is : 0.930368
Iteration 30 times,RMSE is : 0.928991
Iteration 31 times,RMSE is : 0.927724
Iteration 32 times,RMSE is : 0.926570
Iteration 33 times,RMSE is : 0.925547
Iteration 34 times,RMSE is : 0.924663
Iteration 35 times,RMSE is : 0.923920
Iteration 36 times,RMSE is : 0.923318
Iteration 37 times,RMSE is : 0.922853
Iteration 38 times,RMSE is : 0.922525
Iteration 39 times,RMSE is : 0.922330
Iteration 40 times,RMSE is : 0.922255
Iteration 41 times,RMSE is : 0.922297
Iteration finished!
在测试集上最终的 RMSE 约为 0.922(第 40 轮达到最低值 0.922255,第 41 轮 RMSE 开始回升,训练随即提前停止),可见效果一般。后面会尝试改进。