# SimpleTag_TFIDF++

'''
Created on 2014-03-05
@Author:Dior
'''
import random
import math
import operator

class SimpleTagBased():
    
    #The constructor function
    def __init__(self,filename):
        self.filename=filename
        #self.N=N
        self.loadData()
        self.randomlySplitData(0.2)
        self.initStat()
        self.testRecommend()
        
    #read in the data file
    def loadData(self):
        print "##################load data begin#######################"
        filename=self.filename
        self.records={}
        fi=open(filename)
        lineNum=0
        for line in fi:
            lineNum+=1
            if lineNum==1:
                continue
            uid,iid,tag,timestamp=line.split('\t')
            uid=int(uid)-1
            iid=int(iid)-1
            tag=int(tag)-1
            self.records.setdefault(uid,{})
            self.records[uid].setdefault(iid,[])
            self.records[uid][iid].append(tag)
        fi.close()
        print "Load data success.The total records is %d." % (lineNum)
        print "The total records number is %d." % (len(self.records))
        print "##################load data end#######################\n"
    
    #Randomly split the data into training set and testing set    
    def randomlySplitData(self,ratio,seed=100):
        print "################beginning to split data#####################"
        random.seed(seed)
        self.train=dict()
        self.test=dict()
        for u in self.records.keys():
            for i in self.records[u].keys():
                if random.random()<ratio:
                    self.test.setdefault(u,{})
                    self.test[u].setdefault(i,[])
                    for t in self.records[u][i]:
                        self.test[u][i].append(t)
                else:
                    self.train.setdefault(u,{})
                    self.train[u].setdefault(i,[])
                    for t in self.records[u][i]:
                        self.train[u][i].append(t)        
        print "Split data complete."
        print "The length of train set is %d,the length of test set is %d." % (len(self.train),len(self.test))
        print "##################split data end#######################\n"
    
    #Initialize the user_tags,tag_items and user_items dictionary    
    def initStat(self):
        print "##################initstat begin#######################"
        records=self.train
        self.user_tags=dict()
        self.tag_items=dict()
        self.user_items=dict()
        #TODO:
        self.tag_users=dict()
        #TODO
        self.item_users=dict()
        for u,items in records.items():
            for i,tags in items.items():
                for tag in tags:
                    #print tag
                    self._addValueToMat(self.user_tags,u,tag,1)
                    self._addValueToMat(self.tag_items,tag,i,1)
                    self._addValueToMat(self.user_items,u,i,1)
                    #TODO
                    self._addValueToMat(self.tag_users,tag,u,1)
                    #TODO
                    self._addValueToMat(self.item_users,i,u,1)
        print "Initialize state complete."
        print "The length of the user_tags is %d,the length of the tag_items is %d,the length of the user_items is %d" % (len(self.user_tags),len(self.tag_items),len(self.user_items))
        print "##################initstat end#######################\n"
    
    #The private function which is used to add value to matrix    
    def _addValueToMat(self,mat,index,item,value=1):
        #the private function which is used to add value to matrix
        if index not in mat:
            mat.setdefault(index,{})
            mat[index].setdefault(item,value)
        else:
            if item not in mat[index]:
                mat[index][item]=value
            else:
                mat[index][item]+=value
    
    #The precision and recall
    def precisionAndRecall(self,N):
        #print "##################precisionAndRecall begin#######################"
        #print "Beginning calculating......"
        hit=0
        h_recall=0
        h_precision=0
        for user,items in self.test.items():
            if user not in self.train:
                continue
            rank=self.recommend(user,N)
            for item,rui in rank:
                if item in items:
                    hit+=1
            #print "The items in test set is:"
            #print items
            #print "The items in recommended set is:"
            #print rank
            h_recall+=len(items)
            h_precision+=N
        #print hit        
        #print "Calculating end....."
        #print "##################precisionAndRecall end#######################"
        return (hit/(h_precision*1.0)),(hit/(h_recall*1.0))
    
    #The recommend function
    def recommend(self,user,N):
        recommend_items=dict()
        #N=self.N
        tagged_items=self.user_items[user]     
        for tag,wut in self.user_tags[user].items():
            wut=wut*1.0/math.log(1+len(self.tag_users[tag]))
            for item,wti in self.tag_items[tag].items():
                wti=wti*1.0/math.log(1+len(self.item_users[item]))
                if item in tagged_items:
                    continue
                if item not in recommend_items:
                    recommend_items[item]=wut*wti
                else:
                    recommend_items[item]+=wut*wti
        return sorted(recommend_items.items(),key=operator.itemgetter(1),reverse=True)[0:N]
    
    #Test recommend function
    def testRecommend(self):
        print "##################testRecommend begin#######################"
        #precision,recall=self.precisionAndRecall()
        print "%3s%20s%20s" % ('K',"recall",'precision')
        for n in [5,10,20,40,60,80,160]:
            precision,recall=self.precisionAndRecall(n)
            print "%3d%19.3f%%%19.3f%%" % (n,recall * 100,precision * 100)
        #print "The precision is %f,the recall is %f" % (precision,recall)
        print "##################testRecommend end#######################\n"
        
        
if __name__=='__main__':
    import sys

    # Dataset location: the first command-line argument when one is given,
    # otherwise the original hard-coded default path.
    default_path="E:\\RecommenderSystem\\datasets\\hetrec2011-delicious-2k\\user_taggedbookmarks-timestamps.dat"
    data_path=sys.argv[1] if len(sys.argv)>1 else default_path
    # Constructing the object runs the full load/split/evaluate pipeline.
    stb=SimpleTagBased(data_path)
    # stb.user_items and stb.item_users remain available for inspection here.

# You may also be interested in: SimpleTag_TFIDF++