在推荐系统中有两种协同过滤的方式。
原理在上面已经简单介绍。
假设我们已经有了一个评分矩阵 $R_{m,n}$,$m$ 个用户对 $n$ 个物品的评分全在这个矩阵里,当然这是一个高度稀疏的矩阵,我们用 $r_{u,i}$ 表示用户 $u$ 对物品 $i$ 的评分。LFM认为 $R_{m,n}=P_{m,F}\cdot Q_{F,n}$,即 $R$ 是两个矩阵的乘积(所以LFM又被称为矩阵分解法,MF,matrix factorization model),$F$ 是隐因子的个数;$P$ 的每一行代表一个用户对各个隐因子的喜好程度;$Q$ 的每一列代表一个物品在各个隐因子上的概率分布。
# _*_coding:utf-8 _*_
__author__ = "ricky"
import random
import math
class LFM(object):
    """Latent factor model (matrix factorization) trained with SGD.

    Each user and each item gets a vector of ``F`` latent weights; the
    predicted rating is the dot product of the two vectors.  ``Q`` is keyed
    by item, so it holds the transpose of the item matrix in R = P * Q.
    """

    def __init__(self, rating_data, F, alpha=0.1, lmbd=0.1, max_iter=500):
        """Store the hyper-parameters and randomly initialise P and Q.

        :param rating_data: re-iterable sequence of ``(user, [(item, rate), ...])``
        :param F: number of latent factors
        :param alpha: initial learning rate
        :param lmbd: L2 regularisation weight
        :param max_iter: number of passes over ``rating_data`` in ``train``
        """
        self.F = F
        self.P = dict()  # user -> list of F latent weights
        self.Q = dict()  # item -> list of F latent weights
        self.alpha = alpha
        self.lmbd = lmbd
        self.max_iter = max_iter
        self.rating_data = rating_data
        # Random initialisation of P and Q.
        for user, rates in self.rating_data:
            self.P[user] = self._random_factors()
            for item, _ in rates:
                if item not in self.Q:
                    self.Q[item] = self._random_factors()

    def _random_factors(self):
        """Return F random weights drawn from [0, 1) and divided by sqrt(F)."""
        scale = math.sqrt(self.F)
        return [random.random() / scale for _ in range(self.F)]

    def train(self):
        """Fit P and Q with stochastic gradient descent.

        The learning rate starts at ``self.alpha`` and is multiplied by 0.9
        after every pass over the data.  The decay is applied to a local
        copy so that ``self.alpha`` keeps the configured value and calling
        ``train`` again restarts from the same learning rate.

        :return: None
        """
        learning_rate = self.alpha
        for _ in range(self.max_iter):
            for user, rates in self.rating_data:
                p_u = self.P[user]
                for item, rui in rates:
                    q_i = self.Q[item]
                    err_ui = rui - self.predict(user, item)
                    for f in range(self.F):
                        # Read both weights before writing either one, so the
                        # Q gradient uses the same P value the error was
                        # computed with (simultaneous update).
                        p_uf, q_if = p_u[f], q_i[f]
                        p_u[f] = p_uf + learning_rate * (
                            err_ui * q_if - self.lmbd * p_uf)
                        q_i[f] = q_if + learning_rate * (
                            err_ui * p_uf - self.lmbd * q_if)
            learning_rate *= 0.9  # shrink the step size after each pass

    def predict(self, user, item):
        """Return the predicted rating of ``item`` by ``user``.

        :param user: a user key present in ``self.P``
        :param item: an item key present in ``self.Q``
        :return: dot product of the user's and the item's latent vectors
        :raises KeyError: if the user or the item was not in the rating data
        """
        return sum(p * q for p, q in zip(self.P[user], self.Q[item]))
if __name__ == '__main__':
    # Toy data set: users A, B, C and items a, b, c, d.
    rating_data = [
        ('A', [('a', 1.0), ('b', 1.0)]),
        ('B', [('b', 1.0), ('c', 1.0)]),
        ('C', [('c', 1.0), ('d', 1.0)]),
    ]
    lfm = LFM(rating_data, 2)
    lfm.train()
    # Print the predicted preference of user A for every item.
    for item in ('a', 'b', 'c', 'd'):
        print(item, lfm.predict('A', item))
# _*_ coding:utf-8 _*_
__author__ = "Ricky"
import random
import math
class BiasLFM(object):
    """Latent factor model with a global mean and per-user / per-item biases.

    The predicted rating is ``mu + bu[user] + bi[item] + P[user] . Q[item]``.
    ``Q`` is keyed by item, so it holds the transpose of the item matrix.
    """

    def __init__(self, rating_data, F, alpha=0.1, lmbd=0.1, max_iter=500):
        """Store the hyper-parameters and initialise P, Q, the biases and mu.

        :param rating_data: re-iterable sequence of ``(user, [(item, rate), ...])``
        :param F: number of latent factors
        :param alpha: initial learning rate
        :param lmbd: L2 regularisation weight
        :param max_iter: number of passes over ``rating_data`` in ``train``
        """
        self.F = F
        self.P = dict()   # user -> list of F latent weights
        self.Q = dict()   # item -> list of F latent weights
        self.bu = dict()  # user -> bias
        self.bi = dict()  # item -> bias
        self.alpha = alpha
        self.lmbd = lmbd
        self.max_iter = max_iter
        self.rating_data = rating_data
        self.mu = 0.0     # mean of all observed ratings
        # Random initialisation of P and Q; biases start at zero.
        total = 0.0
        cnt = 0
        for user, rates in self.rating_data:
            self.P[user] = self._random_factors()
            self.bu[user] = 0.0
            cnt += len(rates)
            for item, rate in rates:
                total += rate
                if item not in self.Q:
                    self.Q[item] = self._random_factors()
                    self.bi[item] = 0.0
        # With no ratings at all the mean is left at 0.0 instead of dividing
        # by zero.
        if cnt:
            self.mu = total / cnt

    def _random_factors(self):
        """Return F random weights drawn from [0, 1) and divided by sqrt(F)."""
        scale = math.sqrt(self.F)
        return [random.random() / scale for _ in range(self.F)]

    def train(self):
        """Fit the biases, P and Q with stochastic gradient descent.

        The learning rate starts at ``self.alpha`` and is multiplied by 0.9
        after every pass over the data.  The decay is applied to a local
        copy so that ``self.alpha`` keeps the configured value and calling
        ``train`` again restarts from the same learning rate.

        :return: None
        """
        learning_rate = self.alpha
        for _ in range(self.max_iter):
            for user, rates in self.rating_data:
                p_u = self.P[user]
                for item, rui in rates:
                    q_i = self.Q[item]
                    err_ui = rui - self.predict(user, item)
                    # Bias updates.
                    self.bu[user] += learning_rate * (
                        err_ui - self.lmbd * self.bu[user])
                    self.bi[item] += learning_rate * (
                        err_ui - self.lmbd * self.bi[item])
                    # Latent factor updates.
                    for f in range(self.F):
                        # Read both weights before writing either one, so the
                        # Q gradient uses the same P value the error was
                        # computed with (simultaneous update).
                        p_uf, q_if = p_u[f], q_i[f]
                        p_u[f] = p_uf + learning_rate * (
                            err_ui * q_if - self.lmbd * p_uf)
                        q_i[f] = q_if + learning_rate * (
                            err_ui * p_uf - self.lmbd * q_if)
            learning_rate *= 0.9  # shrink the step size after each pass

    def predict(self, user, item):
        """Return the predicted rating of ``item`` by ``user``.

        :param user: a user key present in ``self.P`` and ``self.bu``
        :param item: an item key present in ``self.Q`` and ``self.bi``
        :return: latent dot product plus user bias, item bias and global mean
        :raises KeyError: if the user or the item was not in the rating data
        """
        interaction = sum(p * q for p, q in zip(self.P[user], self.Q[item]))
        return interaction + self.bu[user] + self.bi[item] + self.mu
if __name__ == '__main__':
    # Toy data set: users A, B, C and items a, b, c, d.
    rating_data = [
        ('A', [('a', 1.0), ('b', 1.0)]),
        ('B', [('b', 1.0), ('c', 1.0)]),
        ('C', [('c', 1.0), ('d', 1.0)]),
    ]
    lfm = BiasLFM(rating_data, 2)
    lfm.train()
    # Print the predicted preference of user A for every item.
    for item in ('a', 'b', 'c', 'd'):
        print(item, lfm.predict('A', item))
# coding:utf-8
__author__ = "ricky"
import random
import math
class SVDPP(object):
    """SVD++ style latent factor model trained with SGD.

    On top of the global mean, the biases and the user/item latent vectors,
    every item carries an implicit-feedback vector ``Y[item]``.  The sum of
    the ``Y`` vectors of the items a user rated, divided by the square root
    of their count, is added to the user's latent vector before the dot
    product with the item vector.  ``Q`` is keyed by item.
    """

    def __init__(self, rating_data, F, alpha=0.1, lmbd=0.1, max_iter=500):
        """Store the hyper-parameters and initialise P, Q, Y, the biases and mu.

        :param rating_data: re-iterable sequence of ``(user, [(item, rate), ...])``
        :param F: number of latent factors
        :param alpha: initial learning rate
        :param lmbd: L2 regularisation weight
        :param max_iter: number of passes over ``rating_data`` in ``train``
        """
        self.F = F
        self.P = dict()   # user -> list of F latent weights
        self.Q = dict()   # item -> list of F latent weights
        self.Y = dict()   # item -> list of F implicit-feedback weights
        self.bu = dict()  # user -> bias
        self.bi = dict()  # item -> bias
        self.alpha = alpha
        self.lmbd = lmbd
        self.max_iter = max_iter
        self.rating_data = rating_data
        self.mu = 0.0     # mean of all observed ratings
        # Random initialisation of P, Q and Y; biases start at zero.
        total = 0.0
        cnt = 0
        for user, rates in self.rating_data:
            self.P[user] = self._random_factors()
            self.bu[user] = 0.0
            cnt += len(rates)
            for item, rate in rates:
                total += rate
                if item not in self.Q:
                    self.Q[item] = self._random_factors()
                if item not in self.Y:
                    self.Y[item] = self._random_factors()
                self.bi[item] = 0.0
        # With no ratings at all the mean is left at 0.0 instead of dividing
        # by zero.
        if cnt:
            self.mu = total / cnt

    def _random_factors(self):
        """Return F random weights drawn from [0, 1) and divided by sqrt(F)."""
        scale = math.sqrt(self.F)
        return [random.random() / scale for _ in range(self.F)]

    def _implicit_factors(self, rated_items):
        """Return sum(Y[item]) / sqrt(len(rated_items)), or zeros if empty."""
        summed = [0.0] * self.F
        if not rated_items:
            return summed
        for rated, _ in rated_items:
            y_j = self.Y[rated]
            for f in range(self.F):
                summed[f] += y_j[f]
        norm = math.sqrt(len(rated_items))
        return [value / norm for value in summed]

    def _score(self, user, item, implicit):
        """Return the prediction for (user, item) given a ready implicit vector."""
        p_u = self.P[user]
        q_i = self.Q[item]
        interaction = sum(
            (p_u[f] + implicit[f]) * q_i[f] for f in range(self.F))
        return interaction + self.bu[user] + self.bi[item] + self.mu

    def train(self):
        """Fit the biases, P, Q and Y with stochastic gradient descent.

        The learning rate starts at ``self.alpha`` and is multiplied by 0.9
        after every pass over the data.  The decay is applied to a local
        copy so that ``self.alpha`` keeps the configured value and calling
        ``train`` again restarts from the same learning rate.  Users with an
        empty rating list are skipped.

        :return: None
        """
        learning_rate = self.alpha
        for _ in range(self.max_iter):
            for user, rates in self.rating_data:
                if not rates:
                    continue  # nothing to learn from; avoids dividing by sqrt(0)
                norm = math.sqrt(len(rates))
                # Y is only updated after the user's ratings were processed,
                # so the implicit vector is constant here: compute it once.
                implicit = self._implicit_factors(rates)
                p_u = self.P[user]
                y_grad = [0.0] * self.F  # accumulated err * Q[item] per factor
                for item, rui in rates:
                    q_i = self.Q[item]
                    err_ui = rui - self._score(user, item, implicit)
                    self.bu[user] += learning_rate * (
                        err_ui - self.lmbd * self.bu[user])
                    self.bi[item] += learning_rate * (
                        err_ui - self.lmbd * self.bi[item])
                    for f in range(self.F):
                        # Read both weights before writing either one, so the
                        # Q gradient uses the same P value the error was
                        # computed with (simultaneous update).
                        p_uf, q_if = p_u[f], q_i[f]
                        y_grad[f] += q_if * err_ui
                        p_u[f] = p_uf + learning_rate * (
                            err_ui * q_if - self.lmbd * p_uf)
                        q_i[f] = q_if + learning_rate * (
                            err_ui * (p_uf + implicit[f]) - self.lmbd * q_if)
                for item, _ in rates:
                    y_i = self.Y[item]
                    for f in range(self.F):
                        y_i[f] += learning_rate * (
                            y_grad[f] / norm - self.lmbd * y_i[f])
            learning_rate *= 0.9  # shrink the step size after each pass

    def predict(self, user, item, ratedItems):
        """Return the predicted rating of ``item`` by ``user``.

        :param user: a user key present in ``self.P`` and ``self.bu``
        :param item: an item key present in ``self.Q`` and ``self.bi``
        :param ratedItems: list of ``(item, rate)`` pairs the user has rated;
            only the items are used.  An empty list contributes no
            implicit-feedback term.
        :return: predicted rating
        :raises KeyError: if the user, the item or a rated item is unknown
        """
        return self._score(user, item, self._implicit_factors(ratedItems))
if __name__ == '__main__':
    # Toy data set: users A, B, C and items a, b, c, d.
    rate_A = [('a', 1.0), ('b', 1.0)]
    rate_B = [('b', 1.0), ('c', 1.0)]
    rate_C = [('c', 1.0), ('d', 1.0)]
    rating_data = [('A', rate_A), ('B', rate_B), ('C', rate_C)]
    lfm = SVDPP(rating_data, 2)
    lfm.train()
    # Print the predicted preference of user A for every item.
    for item in ('a', 'b', 'c', 'd'):
        print(item, lfm.predict('A', item, rate_A))
【显性反馈和隐性反馈】
// Explicit-feedback model
val model1 = ALS.train(ratings, rank, numIterations, lambda)
// Implicit-feedback model
val model2 = ALS.trainImplicit(ratings, rank, numIterations, lambda, alpha)
ratings:评分矩阵
rank:隐因子的个数,一般设置为100左右
numIterations:迭代次数,调参得到
lambda:正则项
alpha:置信参数,$c_{ui}=1+\alpha d_{ui}$
【使用Spark的ALS模型构建推荐模型】
model.recommendProducts(userID, N)
model.predict(user, item)
或者model.predict(RDD[(Int, Int)])。
model.userFeatures
【ALS算法推导】
$R_{m\times n}\approx X_{m\times k}Y_{k\times n}$
$(k\times 1)\,[(1\times k)\cdot(k\times 1)]=(k\times 1)(1\times k)(k\times 1)$