'''
Created on 2014-03-05

@Author:Dior

Simple tag-based recommender: loads (user, item, tag) records from a
tab-separated file, splits them into train/test sets, builds co-occurrence
count tables from the training set and reports precision/recall of the
top-N recommendations.
'''
import random
import math
import operator


class SimpleTagBased():
    """Tag-based top-N recommender evaluated by precision and recall.

    Constructing an instance runs the whole pipeline: load the data file,
    split it 20% test / 80% train, build the statistics tables and print
    the evaluation table.

    Attributes populated along the way:
        records:    {user: {item: [tag, ...]}} for the whole data file.
        train/test: same shape as ``records``, one per split.
        user_tags:  {user: {tag: count}}  from the training set.
        tag_items:  {tag: {item: count}}  from the training set.
        user_items: {user: {item: count}} from the training set.
        tag_users:  {tag: {user: count}}  from the training set.
        item_users: {item: {user: count}} from the training set.
    """

    def __init__(self, filename):
        """Run load -> split -> statistics -> evaluation on *filename*."""
        self.filename = filename
        self.loadData()
        self.randomlySplitData(0.2)
        self.initStat()
        self.testRecommend()

    def loadData(self):
        """Read ``self.filename`` into ``self.records``.

        The first line is skipped as a header.  Every other non-blank line
        must hold four tab-separated fields (user id, item id, tag id,
        timestamp); the three ids are converted to int and shifted by -1.
        The timestamp is parsed off the line but not stored.

        Raises:
            IOError/OSError: if the file cannot be opened.
            ValueError: if a data line does not have exactly four fields
                or an id is not an integer.
        """
        print("##################load data begin#######################")
        self.records = {}
        record_count = 0
        # ``with`` guarantees the handle is closed even when a row is malformed.
        with open(self.filename) as fi:
            for line_num, line in enumerate(fi, 1):
                if line_num == 1:
                    continue  # header row
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                uid, iid, tag, _timestamp = line.split('\t')
                user = int(uid) - 1
                item = int(iid) - 1
                self.records.setdefault(user, {}).setdefault(item, []).append(int(tag) - 1)
                record_count += 1
        # Count only parsed data rows (the header line is not a record).
        print("Load data success.The total records is %d." % (record_count))
        # NOTE: the value printed here is the number of distinct users.
        print("The total records number is %d." % (len(self.records)))
        print("##################load data end#######################\n")

    def randomlySplitData(self, ratio, seed=100):
        """Split ``self.records`` into ``self.train`` and ``self.test``.

        Each (user, item) pair is sent, together with all of its tags, to
        the test set with probability *ratio* and to the training set
        otherwise.  The generator is seeded with *seed* first.
        """
        print("################beginning to split data#####################")
        random.seed(seed)
        self.train = dict()
        self.test = dict()
        for u in self.records:
            for i in self.records[u]:
                target = self.test if random.random() < ratio else self.train
                # Copy the tag list so the split never aliases self.records.
                target.setdefault(u, {})[i] = list(self.records[u][i])
        print("Split data complete.")
        print("The length of train set is %d,the length of test set is %d." % (len(self.train), len(self.test)))
        print("##################split data end#######################\n")

    def initStat(self):
        """Build the count tables used by ``recommend`` from ``self.train``.

        Every (user, item, tag) occurrence adds 1 to the matching cell of
        ``user_tags``, ``tag_items``, ``user_items``, ``tag_users`` and
        ``item_users``.
        """
        print("##################initstat begin#######################")
        self.user_tags = dict()
        self.tag_items = dict()
        self.user_items = dict()
        # tag_users / item_users are read by recommend() for normalisation.
        self.tag_users = dict()
        self.item_users = dict()
        for u, items in self.train.items():
            for i, tags in items.items():
                for tag in tags:
                    self._addValueToMat(self.user_tags, u, tag, 1)
                    self._addValueToMat(self.tag_items, tag, i, 1)
                    self._addValueToMat(self.user_items, u, i, 1)
                    self._addValueToMat(self.tag_users, tag, u, 1)
                    self._addValueToMat(self.item_users, i, u, 1)
        print("Initialize state complete.")
        print("The length of the user_tags is %d,the length of the tag_items is %d,the length of the user_items is %d" % (len(self.user_tags), len(self.tag_items), len(self.user_items)))
        print("##################initstat end#######################\n")

    def _addValueToMat(self, mat, index, item, value=1):
        """Add *value* to ``mat[index][item]``, creating missing cells.

        *mat* is a dict of dicts and is modified in place.
        """
        row = mat.setdefault(index, {})
        if item in row:
            row[item] += value
        else:
            row[item] = value

    def precisionAndRecall(self, N):
        """Return ``(precision, recall)`` of top-*N* recommendations.

        Only test users that also appear in the training set are scored.
        A hit is a recommended item that the user has in the test set.
        Precision divides the hits by ``N`` per scored user; recall divides
        them by the number of test items of the scored users.  A ratio
        whose denominator is zero (no scorable user, or ``N == 0``) is
        reported as 0.0 instead of raising ZeroDivisionError.
        """
        hit = 0
        h_recall = 0
        h_precision = 0
        for user, items in self.test.items():
            if user not in self.train:
                continue
            for item, _score in self.recommend(user, N):
                if item in items:
                    hit += 1
            h_recall += len(items)
            h_precision += N
        precision = hit / (h_precision * 1.0) if h_precision else 0.0
        recall = hit / (h_recall * 1.0) if h_recall else 0.0
        return precision, recall

    def recommend(self, user, N):
        """Return up to *N* ``(item, score)`` pairs for *user*, best first.

        For every tag the user applied and every item carrying that tag,
        the score grows by::

            user_tags[user][tag] / log(1 + len(tag_users[tag]))
              * tag_items[tag][item] / log(1 + len(item_users[item]))

        Items the user already tagged in the training set are excluded.

        Raises:
            KeyError: if *user* is absent from the training statistics.
        """
        recommend_items = dict()
        tagged_items = self.user_items[user]
        for tag, wut in self.user_tags[user].items():
            # Damp tags used by many distinct users.
            wut = wut * 1.0 / math.log(1 + len(self.tag_users[tag]))
            for item, wti in self.tag_items[tag].items():
                if item in tagged_items:
                    continue  # skip before doing any normalisation work
                # Damp items tagged by many distinct users.
                wti = wti * 1.0 / math.log(1 + len(self.item_users[item]))
                recommend_items[item] = recommend_items.get(item, 0.0) + wut * wti
        ranked = sorted(recommend_items.items(), key=operator.itemgetter(1), reverse=True)
        return ranked[0:N]

    def testRecommend(self):
        """Print recall and precision (in percent) for several list sizes."""
        print("##################testRecommend begin#######################")
        print("%3s%20s%20s" % ('K', "recall", 'precision'))
        for n in [5, 10, 20, 40, 60, 80, 160]:
            precision, recall = self.precisionAndRecall(n)
            print("%3d%19.3f%%%19.3f%%" % (n, recall * 100, precision * 100))
        print("##################testRecommend end#######################\n")


if __name__ == '__main__':
    stb = SimpleTagBased("E:\\RecommenderSystem\\datasets\\hetrec2011-delicious-2k\\user_taggedbookmarks-timestamps.dat")