推荐经典算法实现之PMF(python+MovieLen)

# -*- Encoding:UTF-8 -*-
'''
@author: Jason.F
@data: 2019.07.18
@function: Implementing PMF
           Dataset: Movielen Dataset(ml-1m) 
           Evaluating: hitradio,ndcg
           https://papers.nips.cc/paper/3208-probabilistic-matrix-factorization.pdf
           Matlab: http://www.utstat.toronto.edu/~rsalakhu/BPMF.html 
@reference: https://github.com/adamzjw/Probabilistic-matrix-factorization-in-Python
'''
import numpy as np
import pandas as pd
from numpy.random import RandomState
import copy
import heapq
import math
from numpy import linalg as LA
import random
#define class PMF
class PMF:
    def __init__(self, num_feat=8, epsilon=1, _lambda=0.1, momentum=0.8, maxepoch=20, num_batches=10, batch_size=1000):
        self.num_feat = num_feat
        self.epsilon = epsilon
        self._lambda = _lambda
        self.momentum = momentum
        self.maxepoch = maxepoch
        self.num_batches = num_batches
        self.batch_size = batch_size
        
        self.w_C = None
        self.w_I = None

        self.err_train = []
        self.err_val = []
        
    def fit(self, train_vec, val_vec):   
        # mean subtraction
        self.mean_inv = np.mean(train_vec[:,2])
        
        pairs_tr = train_vec.shape[0]
        pairs_va = val_vec.shape[0]
        
        # 1-p-i, 2-m-c
        num_inv = int(max(np.amax(train_vec[:,0]), np.amax(val_vec[:,0]))) + 1
        num_com = int(max(np.amax(train_vec[:,1]), np.amax(val_vec[:,1]))) + 1

        incremental = False
        if ((not incremental) or (self.w_C is None)):
            # initialize
            self.epoch = 0
            self.w_C = 0.1 * np.random.randn(num_com, self.num_feat)
            self.w_I = 0.1 * np.random.randn(num_inv, self.num_feat)
            
            self.w_C_inc = np.zeros((num_com, self.num_feat))
            self.w_I_inc = np.zeros((num_inv, self.num_feat))
        
        
        while self.epoch < self.maxepoch:
            self.epoch += 1

            # Shuffle training truples
            shuffled_order = np.arange(train_vec.shape[0])
            np.random.shuffle(shuffled_order)

            # Batch update
            for batch in range(self.num_batches):
                # print "epoch %d batch %d" % (self.epoch, batch+1)

                batch_idx = np.mod(np.arange(self.batch_size * batch,
                                             self.batch_size * (batch+1)),
                                   shuffled_order.shape[0])

                batch_invID = np.array(train_vec[shuffled_order[batch_idx], 0], dtype='int32')
                batch_comID = np.array(train_vec[shuffled_order[batch_idx], 1], dtype='int32')

                # Compute Objective Function
                pred_out = np.sum(np.multiply(self.w_I[batch_invID,:], 
                                                self.w_C[batch_comID,:]),
                                axis=1) # mean_inv subtracted

                rawErr = pred_out - train_vec[shuffled_order[batch_idx], 2] + self.mean_inv

                # Compute gradients
                Ix_C = 2 * np.multiply(rawErr[:, np.newaxis], self.w_I[batch_invID,:]) \
                        + self._lambda * self.w_C[batch_comID,:]
                Ix_I = 2 * np.multiply(rawErr[:, np.newaxis], self.w_C[batch_comID,:]) \
                        + self._lambda * self.w_I[batch_invID,:]
            
                dw_C = np.zeros((num_com, self.num_feat))
                dw_I = np.zeros((num_inv, self.num_feat))

                # loop to aggreate the gradients of the same element
                for i in range(self.batch_size):
                    dw_C[batch_comID[i],:] += Ix_C[i,:]
                    dw_I[batch_invID[i],:] += Ix_I[i,:]


                # Update with momentum
                self.w_C_inc = self.momentum * self.w_C_inc + self.epsilon * dw_C / self.batch_size
                self.w_I_inc = self.momentum * self.w_I_inc + self.epsilon * dw_I / self.batch_size


                self.w_C = self.w_C - self.w_C_inc
                self.w_I = self.w_I - self.w_I_inc

                # Compute Objective Function after
                if batch == self.num_batches - 1:
                    pred_out = np.sum(np.multiply(self.w_I[np.array(train_vec[:,0], dtype='int32'),:],
                                                    self.w_C[np.array(train_vec[:,1], dtype='int32'),:]),
                                        axis=1) # mean_inv subtracted
                    rawErr = pred_out - train_vec[:, 2] + self.mean_inv
                    obj = LA.norm(rawErr) ** 2 \
                            + 0.5*self._lambda*(LA.norm(self.w_I) ** 2 + LA.norm(self.w_C) ** 2)

                    self.err_train.append(np.sqrt(obj/pairs_tr))

                # Compute validation error
                if batch == self.num_batches - 1:
                    pred_out = np.sum(np.multiply(self.w_I[np.array(val_vec[:,0], dtype='int32'),:],
                                                    self.w_C[np.array(val_vec[:,1], dtype='int32'),:]),
                                        axis=1) # mean_inv subtracted
                    rawErr = pred_out - val_vec[:, 2] + self.mean_inv
                    self.err_val.append(LA.norm(rawErr)/np.sqrt(pairs_va))

                # Print info
                if batch == self.num_batches - 1:
                    print ('Training RMSE: %f, Val RMSE %f' % (self.err_train[-1], self.err_val[-1]))
    
    def predict(self, invID, comID): 
        return np.dot(self.w_C[comID,:], self.w_I[invID,:]) + self.mean_inv
               
    def evaluate(self, test_vec, k=10):
        def getHitRatio(ranklist, gtItem):
            for item in ranklist:
                if item == gtItem:
                    return 1
            return 0

        def getNDCG(ranklist, gtItem):
            for i in range(len(ranklist)):
                item = ranklist[i]
                if item == gtItem:
                    return math.log(2) / math.log(i+2)
            return 0
        
        testset = pd.DataFrame(test_vec, columns=['u','i'])
        hits = []
        ndcgs = []
        list_csr = list(set(np.array(testset['u']).tolist()))
        for u in list_csr:
            csrset = np.array(testset[testset['u']==u]).tolist()
            scorelist = []
            positem = csrset[0][1]#first item is positive
            for _, i in csrset:
                scorelist.append([i, self.predict(u,i)])
            #get topk 
            map_item_score = {}
            for item, rate in scorelist: #turn dict
                map_item_score[item] = rate
            ranklist = heapq.nlargest(10, map_item_score, key=map_item_score.get)#default Topn=10
            hr = getHitRatio(ranklist, positem)
            hits.append(hr)
            ndcg = getNDCG(ranklist, positem)
            ndcgs.append(ndcg)
        hitratio,ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
        return hitratio,ndcg
        
#loading dataset
def getTrainset(filePath):
    trainset = []
    maxu = 0 
    maxi = 0 
    maxr = 0.0
    with open(filePath, 'r') as fd:
        line = fd.readline()
        while line != None and line != "":
            arr = line.split("\t")
            u, i, rating = int(arr[0]), int(arr[1]), float(arr[2])
            trainset.append([int(arr[0]), int(arr[1]), float(arr[2])])
            if rating > maxr: maxr = rating
            if u > maxu: maxu = u
            if i > maxi: maxi = i
            line = fd.readline()
        return trainset, maxr, maxu, maxi

def getTestset(filePath):
    testset = []
    with open(filePath, 'r') as fd:
        line = fd.readline()
        while line != None and line != '':
            arr = line.split('\t')
            u = eval(arr[0])[0]
            testset.append([u, eval(arr[0])[1]])#one postive item
            for i in arr[1:]:
                testset.append([u, int(i)]) #99 negative items
            line = fd.readline()
    return testset    

def getTrainDict(data):
        dataDict = {}
        for i in data:
            dataDict[(i[0], i[1])] = i[2]
        return dataDict
    
def getNegTrain(data, maxi, negNum=4):
        datadict = getTrainDict(data)
        trainneg = []
        for i in data:
            trainneg.append([i[0],i[1],i[2]])
            for t in range(negNum):
                j = np.random.randint(maxi)
                while (i[0], j) in datadict:
                    j = np.random.randint(maxi)
                trainneg.append([i[0], j, 0.0])
        return trainneg
    
if __name__ == '__main__':
    trainset, maxr, maxu, maxi = getTrainset("/data/fjsdata/ctKngBase/ml/ml-1m.train.rating")
    trainneg = getNegTrain(trainset, maxi)
    testset = getTestset("/data/fjsdata/ctKngBase/ml/ml-1m.test.negative")
    print('Dataset Statistics: Interaction = %d, User = %d, Item = %d, Sparsity = %.4f' % (len(trainset), maxu+1, maxi+1, len(trainset)/(maxu*maxr)))

    print ("%3s%20s%20s" % ('K','HR@10', 'NDCG@10'))
    for K in [8, 16, 32, 64]:
        pmf = PMF(num_feat=K)
        valtest = random.sample(trainset,int(0.2*len(trainset)))
        pmf.fit(np.array(trainneg), np.array(valtest))
        hit, ndcg = pmf.evaluate(testset)
        print ("%3d%20.6f%20.6f" % (K, hit, ndcg))
'''
nohup python -u PMF-ml.py > pmf-ml.log  &
 K               HR@10             NDCG@10
8             0.101325            0.045585
16            0.099338            0.046128
32            0.094371            0.042417
64            0.105795            0.047519
'''

矩阵分解模型的概率角度解释。

你可能感兴趣的:(Algorithm)