[Recommender Systems] SVD++

Explicit actions such as rating an item are relatively rare, while implicit behaviors such as clicks, follows, favorites, and dwell time are abundant. Can these be taken into account as part of our prediction rule?

SVD++ builds on RSVD by additionally modeling a user's implicit feedback on items. Implicit behaviors such as clicks, favorites, and dwell time are themselves a form of interaction and reflect the user's implicit preferences, so they can be folded into the model. To incorporate this implicit interest, the prediction rule is extended to
$$\hat r_{ui}=U_{u.}V_{i.}^T+\bar U_{u.}V_{i.}^T+b_u+b_i+\mu$$
where
$$\bar U_{u.}=\frac{1}{\sqrt{|\widetilde I_u|}}\sum_{i' \in \widetilde I_u}W_{i'.}$$

Here $\widetilde I_u$ denotes the set of items that user $u$ has interacted with, and $W_{i.}$ is a vector of the same dimension as $U_{u.}$ and $V_{i.}$.
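To make the prediction rule concrete, here is a minimal NumPy sketch. All names, shapes, and values are illustrative placeholders, not part of the implementation later in this post:

```python
import numpy as np

# Toy dimensions: n users, m items, k latent factors (illustrative only).
n, m, k = 4, 6, 3
rng = np.random.default_rng(0)
U = rng.normal(scale=0.01, size=(n, k))   # explicit user factors U_u.
V = rng.normal(scale=0.01, size=(m, k))   # item factors V_i.
W = rng.normal(scale=0.01, size=(m, k))   # implicit item factors W_i'.
b_u = np.zeros(n)                         # user biases
b_i = np.zeros(m)                         # item biases
mu = 3.5                                  # global mean rating

def predict(u, i, I_u):
    """SVD++ prediction r_hat_ui; I_u is the list of items user u has interacted with."""
    if len(I_u) > 0:
        U_bar = W[I_u].sum(axis=0) / np.sqrt(len(I_u))  # (1/sqrt(|I_u|)) * sum of W_i'.
    else:
        U_bar = np.zeros(k)
    return mu + b_u[u] + b_i[i] + U[u] @ V[i] + U_bar @ V[i]

print(predict(0, 2, I_u=[1, 3, 5]))
```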
To learn the implicit-preference weights $W$ while avoiding overfitting, a regularization term is added for each variable, and the objective function becomes

$$\min_\theta \sum_{u=1}^n\sum_{i=1}^m y_{ui}\left[\frac{1}{2}(r_{ui}-\hat r_{ui})^2+\frac{\alpha_u}{2}\|U_{u.}\|^2+\frac{\alpha_v}{2}\|V_{i.}\|^2+\frac{\beta_u}{2}\|b_u\|^2+\frac{\beta_v}{2}\|b_i\|^2+\frac{\alpha_w}{2}\|W_{i.}\|^2\right]$$

where $y_{ui}$ indicates whether rating $r_{ui}$ is observed.
Let $e_{ui}=r_{ui}-\hat r_{ui}$. Taking partial derivatives with respect to the variables $\theta=(U_{u.},V_{i.},b_u,b_i,W_{i'.})$ gives
$$\begin{aligned}
&\nabla_\mu=-e_{ui}\\
&\nabla b_u=-e_{ui}+\beta_u b_u \\
&\nabla b_i=-e_{ui}+\beta_v b_i \\
&\nabla U_{u.}=-e_{ui}V_{i.}+\alpha_u U_{u.} \\
&\nabla V_{i.}=-e_{ui}\left(U_{u.}+\frac{1}{\sqrt{|\widetilde I_u|}}\sum_{i' \in \widetilde I_u}W_{i'.}\right)+\alpha_v V_{i.}\\
&\nabla W_{i'.}=-e_{ui}\frac{1}{\sqrt{|\widetilde I_u|}}V_{i.}+\alpha_w W_{i'.},\quad i' \in \widetilde I_u
\end{aligned}$$
Running SGD with these gradients until the parameters converge solves the model. SVD++ performs about the same as RSVD on small datasets, but outperforms RSVD on large datasets.
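As a sketch of how these gradients translate into SGD updates, the snippet below performs one stochastic step on a single observed rating. It continues the illustrative variables (`U`, `V`, `W`, `b_u`, `b_i`, `mu`, `predict`) from the sketch above; the learning rate and regularization weights are placeholder values, not tuned settings:

```python
# One SGD step on a single observed rating (u, i, r_ui), reusing the toy
# variables defined in the previous sketch.
lr = 0.01
alpha_u = alpha_v = alpha_w = beta_u = beta_v = 0.01   # placeholder regularization weights

def sgd_step(u, i, r_ui, I_u):
    global mu
    e_ui = r_ui - predict(u, i, I_u)                    # prediction error e_ui
    sqrt_Iu = np.sqrt(len(I_u)) if I_u else 1.0
    U_bar = W[I_u].sum(axis=0) / sqrt_Iu if I_u else np.zeros(k)

    # Move every parameter against its gradient from the derivation above.
    mu += lr * e_ui
    b_u[u] += lr * (e_ui - beta_u * b_u[u])
    b_i[i] += lr * (e_ui - beta_v * b_i[i])
    U_old, V_old = U[u].copy(), V[i].copy()             # keep old values for the coupled updates
    U[u] += lr * (e_ui * V_old - alpha_u * U[u])
    V[i] += lr * (e_ui * (U_old + U_bar) - alpha_v * V[i])
    for j in I_u:                                       # update W_j'. only for items the user interacted with
        W[j] += lr * (e_ui * V_old / sqrt_Iu - alpha_w * W[j])

sgd_step(0, 2, r_ui=4.0, I_u=[1, 3, 5])
```

The full implementation below follows the same update pattern, with the implicit item set read from a pandas pivot table.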

Implementation code:

```python

import random
import math
import pandas as pd
import numpy as np

class SVDplusplus():
    def __init__(self, allfile, trainfile, testfile, latentFactorNum=20,alpha_u=0.01,alpha_v=0.01,alpha_w=0.01,beta_u=0.01,beta_v=0.01,learning_rate=0.01):
        data_fields = ['user_id', 'item_id', 'rating', 'timestamp']
        # all data file
        allData = pd.read_table(allfile, names=data_fields)
        user_list=sorted(set(allData['user_id'].values))
        item_list=sorted(set(allData['item_id'].values))
        # randomly hold out 90570 ratings for training (same size as ua.base in ml-100k); the rest is the test set
        ua_base=allData.sample(n=90570,replace=False)
        self.test_df=allData.drop(ua_base.index,axis=0)
        # half of the training ratings are treated as implicit feedback (only their (user_id, item_id) pairs are used)
        self.ua_base_implcit=ua_base.sample(frac=0.5,replace=False)
        self.ua_base_explict=ua_base.drop(self.ua_base_implcit.index,axis=0)
        self.implict=self.ua_base_implcit.pivot(index='user_id', columns='item_id', values='rating')
        print(self.test_df.shape)
        print(self.ua_base_explict.shape)
        print(self.ua_base_implcit.shape)

        data_df = pd.DataFrame(index=user_list, columns=item_list)
        rating_matrix=self.ua_base_explict.pivot(index='user_id', columns='item_id', values='rating')
        data_df.update(rating_matrix)
        self.rating_matrix=data_df
        # training set file
        #self.train_df = pd.read_table(trainfile, names=data_fields)
        # testing set file
        #self.test_df=pd.read_table(testfile, names=data_fields)
        # get factor number
        self.latentFactorNum = latentFactorNum
        # get user number
        self.userNum = len(set(allData['user_id'].values))
        # get item number
        self.itemNum = len(set(allData['item_id'].values))
        # learning rate
        self.learningRate = learning_rate
        # the regularization lambda
        self.alpha_u=alpha_u
        self.alpha_v=alpha_v
        self.alpha_w=alpha_w
        self.beta_u=beta_u
        self.beta_v=beta_v
        # initialize the model and parameters
        self.initModel()

    # initialize all parameters
    def initModel(self):
        self.mu = self.ua_base_explict['rating'].mean()
        self.bu=(self.rating_matrix-self.mu).sum(axis=1)/self.rating_matrix.count(axis=1)
        self.bu=self.bu.values  # convert Series to numpy array
        self.bu[np.isnan(self.bu)]=0  # fill missing values (users with no explicit ratings) with 0
        print(self.bu.shape)
        self.bi = (self.rating_matrix - self.mu).sum() / self.rating_matrix.count()
        self.bi = self.bi.values  # convert Series to numpy array
        self.bi[np.isnan(self.bi)]=0  # fill missing values (items with no explicit ratings) with 0

        # r = (np.random.random(1)[0]-0.05)*0.01
        # np.mat((np.random.rand(self.userNum, self.latentFactorNum)-0.05)*0.01)
        self.U = np.mat((np.random.rand(self.userNum, self.latentFactorNum)-0.05)*0.01)
        self.V = np.mat((np.random.rand(self.itemNum, self.latentFactorNum)-0.05)*0.01)
        self.W = np.mat((np.random.rand(self.itemNum, self.latentFactorNum)-0.05)*0.01)
        # self.bu = [0.0 for i in range(self.userNum)]
        # self.bi = [0.0 for i in range(self.itemNum)]
        # temp = math.sqrt(self.latentFactorNum)
        # self.U = [[(0.1 * random.random() / temp) for i in range(self.latentFactorNum)] for j in range(self.userNum)]
        # self.V = [[0.1 * random.random() / temp for i in range(self.latentFactorNum)] for j in range(self.itemNum)]

        print("Initialize end.The user number is:%d,item number is:%d" % (self.userNum, self.itemNum))

    def train(self, iterTimes=100):
        print("Beginning to train the model......")
        preRmse = 10000.0
        temp_count = 0
        for iter in range(iterTimes):
            count=0
            for index in self.ua_base_explict.index:
                user = int(self.ua_base_explict.loc[index]['user_id'])-1
                item = int(self.ua_base_explict.loc[index]['item_id'])-1
                rating = float(self.ua_base_explict.loc[index]['rating'])
                pscore = self.predictScore(self.mu, self.bu[user], self.bi[item], self.U[user], self.V[item],self.W[item],user+1)
                eui = rating - pscore
                # update mu, bu and bi (global mean, user rating bias and item rating bias)
                self.mu += self.learningRate * eui
                self.bu[user] += self.learningRate * (eui - self.beta_u * self.bu[user])
                self.bi[item] += self.learningRate * (eui - self.beta_v * self.bi[item])

                temp_Uuser = self.U[user].copy()  # keep the old U_u. for the V update below
                temp_Vitem = self.V[item].copy()  # keep the old V_i. for the W update below

                # implicit term: U_bar = (1/sqrt(|I_u|)) * sum of the W rows for items the user interacted with
                if user+1 in self.implict.index:
                    temp = self.implict.loc[user+1][self.implict.loc[user+1].isnull() == False]
                    U_bar = self.W[temp.index-1].sum(axis=0) / math.sqrt(temp.count())
                else:
                    U_bar = np.zeros(self.latentFactorNum)
                # update the latent factors following the gradients derived above
                self.U[user] += self.learningRate * (eui * self.V[item] - self.alpha_u * self.U[user])
                self.V[item] += self.learningRate * ((temp_Uuser + U_bar) * eui - self.alpha_v * self.V[item])
                # only W[item] is updated here, as an approximation of updating every W_i'. with i' in I_u
                if user+1 in self.implict.index:
                    self.W[item] += self.learningRate * (eui * temp_Vitem / math.sqrt(self.implict.loc[user+1].count()) - self.alpha_w * self.W[item])
                else:
                    self.W[item] += self.learningRate * (eui * temp_Vitem - self.alpha_w * self.W[item])
                # for k in range(self.latentFactorNum):
                #     temp = self.U[user][k]
                #     # update U,V
                #     self.U[user][k] += self.learningRate * (eui * self.V[user][k] - self.alpha_u * self.U[user][k])
                #     self.V[item][k] += self.learningRate * (temp * eui - self.alpha_v * self.V[item][k])
                #
                count += 1
                if count  % 5000 == 0 :
                    print("第%s轮进度:%s/%s" %(iter+1,count,len(self.ua_base_explict.index)))
                    # calculate the current rmse
            self.learningRate = self.learningRate * 0.9  # decay the learning rate
            curRmse = self.test()
            print("Iteration %d, RMSE: %f" % (iter + 1, curRmse))
            if curRmse > preRmse:
                break
            else:
                preRmse = curRmse
        print("Iteration finished!")

    # test on the test set and calculate the RMSE
    def test(self):
        cnt = self.test_df.shape[0]
        rmse = 0.0

        # buT=bu.reshape(bu.shape[0],1)
        # predict_rate_matrix = mu + np.tile(buT,(1,self.itemNum))+ np.tile(bi,(self.userNum,1)) +  self.U * self.V.T
        cur = 0
        for i in self.test_df.index:
            cur +=1
            if cur % 1000 == 0:
                print("测试进度:%s/%s" %(cur,len(self.test_df.index)))
            user = int(self.test_df.loc[i]['user_id']) - 1
            item = int(self.test_df.loc[i]['item_id']) - 1
            score = float(self.test_df.loc[i]['rating'])
            pscore = self.predictScore(self.mu,self.bu[user], self.bi[item], self.U[user], self.V[item],self.W[item],user+1)
            # pscore = predict_rate_matrix[user,item]
            rmse += math.pow(score - pscore, 2)
            #print(score,pscore,rmse)
        RMSE=math.sqrt(rmse / cnt)
        return RMSE


    # calculate the inner product of two vectors
    def innerProduct(self, v1, v2):
        result = 0.0
        for i in range(len(v1)):
            result += v1[i] * v2[i]
        return result

    def predictScore(self, mu, bu, bi, U, V, W ,user_id):
        #pscore = mu + bu + bi + self.innerProduct(U, V)
        if user_id in self.implict.index:
            temp = self.implict.loc[user_id][self.implict.loc[user_id].isnull() == False]
            U_bar = self.W[temp.index-1].sum(axis=0) / math.sqrt(temp.count())  # (1/sqrt(|I_u|)) * sum of W_i'.
        else:
            U_bar = np.zeros(self.latentFactorNum)
        pscore = mu + bu + bi + np.multiply(U,V).sum() +np.multiply(U_bar,V).sum()
        if np.isnan(pscore):
            # debugging aid: report each component when the prediction is NaN
            print("Warning: NaN prediction encountered")
            print(mu, bu, bi, np.multiply(U, V).sum(), np.multiply(U_bar, V).sum(), U_bar)
        # clip predictions to the valid rating range [1, 5]
        if pscore < 1:
            pscore = 1
        if pscore > 5:
            pscore = 5
        return pscore


if __name__ == '__main__':
    s = SVDplusplus("../datasets/ml-100k/u.data", "../datasets/ml-100k/u1.base", "../datasets/ml-100k/u1.test")
    s.train()
```
