Contents
1 How LFM (Latent Factor Model) Works
2 Characteristics of LFM
3 LFM-Based Movie Recommendation
Core idea of the latent factor model: represent each user and each item by an F-dimensional latent vector; their inner product gives the user's predicted rating of, or degree of interest in, the item.
Intuitively, a user does not like an item directly: the user likes F latent categories, and each item belongs to those categories to varying degrees. The model learns each user's affinity for the categories and each item's association with them.
The user–item interaction matrix (label 1 = liked, 0 = not liked):

|        | item1 | item2 | ... | item n |
|--------|-------|-------|-----|--------|
| user1  | 1     | 0     | ... | 1      |
| user2  | 0     | ...   | ... | ...    |
| ...    | ...   | ...   | ... | ...    |
| user n | 1     | 1     | ... | 1      |
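Equivalently, and perhaps worth making explicit: LFM factorizes this user–item matrix into two low-rank factors, so each entry is approximated by an inner product (standard matrix-factorization notation, with m users and n items):

$$R_{m \times n} \;\approx\; P_{m \times F}\, Q_{F \times n}, \qquad r_{ui} \;\approx\; p_u^{\top} q_i$$

where row $p_u^{\top}$ of $P$ holds user $u$'s affinity for the F latent categories, and column $q_i$ of $Q$ holds item $i$'s membership in them.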
The interest (or predicted rating) of user u in item i is:

$$\hat r_{ui} = \operatorname{preference}(u, i) = p_u^{\top} q_i = \sum_{f=1}^{F} p_{u,f}\, q_{i,f}$$

Loss function:

$$C = \sum_{(u,i) \in D} \left(r_{ui} - \hat r_{ui}\right)^2 + \alpha \lVert p_u \rVert^2 + \alpha \lVert q_i \rVert^2$$

where the last two terms are the penalty (regularization) terms. Minimizing the loss by gradient descent, the partial derivative is:

$$\frac{\partial C}{\partial p_{u,f}} = -2\left(r_{ui} - \hat r_{ui}\right) q_{i,f} + 2\alpha\, p_{u,f}$$

which, absorbing the constant 2 into the learning rate $\beta$, gives the update rule:

$$p_{u,f} \leftarrow p_{u,f} + \beta\left[\left(r_{ui} - \hat r_{ui}\right) q_{i,f} - \alpha\, p_{u,f}\right]$$

The update for $q_{i,f}$ follows by symmetry:

$$q_{i,f} \leftarrow q_{i,f} + \beta\left[\left(r_{ui} - \hat r_{ui}\right) p_{u,f} - \alpha\, q_{i,f}\right]$$
Compared with CF:
1 LFM has a more solid theoretical basis: it is a learning method that fits an optimal model by minimizing a well-defined objective, whereas CF is a statistics-based (counting) method.
2 Offline space complexity: O(F·(m+n)) for LFM, versus O(m·m) or O(n·n) for CF's similarity matrix.
3 Offline time complexity: LFM needs multiple training iterations, so it is slightly higher than CF, but there is no large difference.
4 Online serving: LFM must compute the product of user_vector[userid] with every item_vector and then sort to return the top N. With many items this reaches O(m·F·n) across all users, so LFM is ill-suited to systems with a huge item catalog; moreover, generating a recommendation list is too slow to do on the fly, so all users' recommendation lists must be precomputed offline and stored (see the sketch after this list).
5 Weaker interpretability: the latent dimensions have no explicit meaning, so LFM cannot give recommendation explanations.
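A minimal sketch of that offline precomputation, assuming the learned vectors are stored as dicts of numpy arrays keyed by id (as produced by the training code below); the per-user scoring is vectorized into one matrix–vector product:

import numpy as np

def precompute_recom_lists(user_vec, item_vec, num_recom=10):
    """Hypothetical helper: precompute every user's top-N list offline."""
    item_ids = list(item_vec.keys())
    # Stack item vectors into an (n_items, F) matrix and L2-normalize rows,
    # so a dot product with a normalized user vector is cosine similarity.
    item_mat = np.array([item_vec[i] for i in item_ids])
    item_mat /= np.linalg.norm(item_mat, axis=1, keepdims=True)
    recom_lists = {}
    for userid, uvec in user_vec.items():
        scores = item_mat.dot(uvec / np.linalg.norm(uvec))  # cosine score against every item
        top = np.argsort(-scores)[:num_recom]               # indices of the N best items
        recom_lists[userid] = [(item_ids[t], round(float(scores[t]), 3)) for t in top]
    return recom_lists  # store this (e.g. in a key-value cache) for online serving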
Dataset: MovieLens (Small: 100,000 ratings and 3,600 tag applications applied to 9,000 movies by 600 users).
Parameters: latent-vector dimension F, regularization coefficient alpha, learning rate beta, number of iterations step.
Recommendation pipeline:
get movie info (itemid, title, genre) -> get each movie's average score (itemid, average_score) -> extract positive/negative samples and split train/test sets (userid, itemid, label) -> train the LFM model (user vector, item vector) -> recommend for the target user (recom_list)
def get_item_info(input_file):
    """
    Get movie info, movieid: title, genre
    Args:
        input_file: item info file
    Return:
        a dict: key itemid, value: [title, genre]
    """
    import os
    if not os.path.exists(input_file):
        return {}
    fp = open(input_file, 'r', encoding='UTF-8')  # avoid gbk decoding errors, e.g.
    # UnicodeDecodeError: 'gbk' codec can't decode byte 0x93 in position 7565: illegal multibyte sequence
    item_info = {}
    linenum = 0
    for line in fp:
        if linenum == 0:  # skip the csv header line
            linenum += 1
            continue
        item = line.strip().split(',')
        if len(item) < 3:
            continue
        elif len(item) == 3:
            itemid, title, genre = item[0], item[1], item[2]
        else:  # the title itself may contain commas, e.g. "American President, The (1995)"
            itemid = item[0]
            genre = item[-1]
            title = ",".join(item[1:-1])
        item_info[itemid] = [title, genre]
    fp.close()
    return item_info
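A quick usage check (the path and sample entry assume the ml-latest-small layout, where movies.csv has a movieId,title,genres header):

item_info = get_item_info("./ml-latest-small/movies.csv")
print(len(item_info))      # number of movies parsed
print(item_info.get("1"))  # e.g. ['Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy']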
def get_ave_score(input_file):
    """
    Get each movie's average score, itemid: average score
    Args:
        input_file: user rating file
    Return:
        a dict: key itemid, value: ave_score
    """
    import os
    if not os.path.exists(input_file):
        return {}
    fp = open(input_file)
    linenum = 0
    record_dict = {}
    score_dict = {}
    for line in fp:
        if linenum == 0:  # skip the csv header line
            linenum += 1
            continue
        item = line.strip().split(',')
        if len(item) < 4:
            continue
        userid, itemid, rating = item[0], item[1], item[2]
        if itemid not in record_dict:
            record_dict[itemid] = [0, 0]
        record_dict[itemid][0] += 1              # count of ratings for this movie
        record_dict[itemid][1] += float(rating)  # running sum of its ratings
    fp.close()
    for itemid in record_dict:
        score_dict[itemid] = round(record_dict[itemid][1] / record_dict[itemid][0], 3)
    return score_dict
def get_train_data(input_file, ratio=0.75):
    """
    Build the train and test sets: userid, itemid, label
    Keeps each user's positive and negative samples balanced.
    (Open question: should the threshold be 2.5 or the movie's average score?)
    Args:
        input_file: user item rating file
        ratio: fraction of records assigned to the train set
    Return:
        trainset: a list of (userid, itemid, label)
        testset: a list of (userid, itemid, label)
    """
    import random
    import os
    if not os.path.exists(input_file):
        return [], []
    score_dict = get_ave_score(input_file)  # average score of every movie
    train_data = []
    test_data = []
    neg_dict = {}
    pos_dict = {}
    test_pos_dict = {}
    test_neg_dict = {}
    fp = open(input_file)
    linenum = 0
    score_thr = 2.5  # threshold separating positive from negative samples
    for line in fp:
        if linenum == 0:  # skip the csv header line
            linenum += 1
            continue
        item = line.strip().split(",")
        if len(item) < 4:
            continue
        userid, itemid, rating = item[0], item[1], float(item[2])
        if random.random() < ratio:  # train set
            if userid not in pos_dict:
                pos_dict[userid] = []
            if userid not in neg_dict:
                neg_dict[userid] = []
            if rating >= score_thr:
                pos_dict[userid].append((itemid, 1))
            else:
                # for ratings below the threshold, record the item's average
                # score (0 if unknown) so the best-rated negatives sort first
                score = score_dict.get(itemid, 0)
                neg_dict[userid].append((itemid, score))
        else:  # test set
            if userid not in test_pos_dict:
                test_pos_dict[userid] = []
            if userid not in test_neg_dict:
                test_neg_dict[userid] = []
            if rating >= score_thr:
                test_pos_dict[userid].append((itemid, 1))
            else:
                # same treatment as above for the test negatives
                score = score_dict.get(itemid, 0)
                test_neg_dict[userid].append((itemid, score))
    fp.close()
    # build the train set (balanced positives and negatives)
    for userid in pos_dict:
        # a user contributes only if both positives and negatives exist;
        # both are capped at the smaller of the two counts
        data_num = min(len(pos_dict[userid]), len(neg_dict.get(userid, [])))
        if data_num == 0:
            continue
        train_data += [(userid, zuhe[0], zuhe[1]) for zuhe in pos_dict[userid]][:data_num]
        # keep only the highest-scored negatives
        sorted_neg = sorted(neg_dict[userid], key=lambda element: element[1], reverse=True)[:data_num]
        train_data += [(userid, zuhe[0], 0) for zuhe in sorted_neg]
    # build the test set the same way
    for userid in test_pos_dict:
        data_num = min(len(test_pos_dict[userid]), len(test_neg_dict.get(userid, [])))
        if data_num == 0:
            continue
        test_data += [(userid, zuhe[0], zuhe[1]) for zuhe in test_pos_dict[userid]][:data_num]
        test_sorted_neg = sorted(test_neg_dict[userid], key=lambda element: element[1], reverse=True)[:data_num]
        test_data += [(userid, zuhe[0], 0) for zuhe in test_sorted_neg]
    return train_data, test_data
def lfm_train(train_data, F, alpha, beta, step):
    """
    Train the LFM model
    Args:
        train_data: list of (userid, itemid, label)
        F: dimension of the latent vectors
        alpha: regularization coefficient
        beta: learning rate
        step: number of iterations
    Return:
        dict: key userid, value: ndarray (user latent vector)
        dict: key itemid, value: ndarray (item latent vector)
    """
    user_vec = {}
    item_vec = {}
    for s in range(step):
        for data in train_data:
            userid, itemid, label = data[0], data[1], data[2]
            if userid not in user_vec:
                user_vec[userid] = init_model(F)
            if itemid not in item_vec:
                item_vec[itemid] = init_model(F)
            deta = label - model_predict(user_vec[userid], item_vec[itemid])  # prediction error
            for index in range(F):
                user_vec[userid][index] += beta * (deta * item_vec[itemid][index] - alpha * user_vec[userid][index])
                item_vec[itemid][index] += beta * (deta * user_vec[userid][index] - alpha * item_vec[itemid][index])
        beta = beta * 0.9  # learning-rate decay after each pass
    return user_vec, item_vec
def init_model(vec_len):
    """
    Initialize a latent vector
    Args:
        vec_len: length of the vector (F)
    Return:
        a ndarray
    """
    import numpy as np
    return np.random.randn(vec_len)  # random vector drawn from a standard normal
def model_predict(user_vector, item_vector):
    """
    Cosine similarity of two vectors
    Args:
        user vector, item vector
    Return:
        a num
    # np.linalg.norm(vector) computes the vector's L2 norm
    """
    import numpy as np
    # cosine similarity; note the SGD update in lfm_train is derived for a plain
    # dot-product prediction, so with this normalized score it is an approximation
    return np.dot(user_vector, item_vector) / (np.linalg.norm(user_vector) * np.linalg.norm(item_vector))
def give_recom_result(user_vec, item_vec, userid):
    """
    Get the recommended movie list for the target user
    Args:
        user_vec, item_vec: lfm training result
        userid: target user
    Return:
        a list: [(itemid, score)]
    """
    import operator
    import numpy as np
    if userid not in user_vec:
        return []
    record = {}      # cosine score of the user against every item
    recom_list = []  # the num_recom recommended movies to return
    user_vector = user_vec[userid]
    num_recom = 10
    for itemid in item_vec:
        item_vector = item_vec[itemid]
        res = np.dot(user_vector, item_vector) / (np.linalg.norm(user_vector) * np.linalg.norm(item_vector))
        record[itemid] = res
    for zuhe in sorted(record.items(), key=operator.itemgetter(1), reverse=True)[:num_recom]:
        itemid = zuhe[0]
        score = round(zuhe[1], 3)
        recom_list.append((itemid, score))
    return recom_list
def ana_recom_result(train_data, recom_list, userid, item_info_file):
    """
    Analyze the recommendation: show what the user liked next to what was recommended
    Args:
        train_data: training data
        recom_list: recom list for the target user
        userid: target user
        item_info_file: item info file path
    Print:
        item info of the userid's label==1 items
        item info of the userid's recom list
    """
    item_info = get_item_info(item_info_file)
    print("user liked:")
    for data in train_data:
        useridd, itemid, label = data
        if useridd == userid and label == 1:
            print(item_info[itemid])
    print("recom result:")
    for zuhe in recom_list:
        print(item_info[zuhe[0]])
Display the recommendation results:
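The demo below calls give_recom_ana, which is not defined above; a minimal sketch, assuming it simply chains the pipeline steps described earlier:

def give_recom_ana(input_file, F, alpha, beta, step, userid, item_info_file):
    """Hypothetical wrapper: train LFM end to end and print the analysis."""
    train_data, test_data = get_train_data(input_file)
    user_vec, item_vec = lfm_train(train_data, F, alpha, beta, step)
    recom_list = give_recom_result(user_vec, item_vec, userid)
    ana_recom_result(train_data, recom_list, userid, item_info_file)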
input_file="./ml-latest-small/ratings.csv"
item_info_file="./ml-latest-small/movies.csv"
F,alpha,beta,step=10,0.001,0.1,10
userid="24"
give_recom_ana(input_file,F,alpha,beta,step,userid,item_info_file)
user liked:
['Grumpier Old Men (1995)', 'Comedy|Romance']
['Heat (1995)', 'Action|Crime|Thriller']
['Seven (a.k.a. Se7en) (1995)', 'Mystery|Thriller']
['"Usual Suspects, The (1995)"', 'Crime|Mystery|Thriller']
['From Dusk Till Dawn (1996)', 'Action|Comedy|Horror|Thriller']
recom result:
['Waking Ned Devine (a.k.a. Waking Ned) (1998)', 'Comedy']
['"Misérables, Les (1995)"', 'Drama|War']
['Fanny and Alexander (Fanny och Alexander) (1982)', 'Drama|Fantasy|Mystery']
['"Lord of the Rings: The Return of the King, The (2003)"', 'Action|Adventure|Drama|Fantasy']
['"Inkwell, The (1994)"', 'Comedy|Drama']
["Ocean's Twelve (2004)", 'Action|Comedy|Crime|Thriller']
['Maze Runner: Scorch Trials (2015)', 'Action|Thriller']
['City Hunter (Sing si lip yan) (1993)', 'Action|Comedy|Romance']
['What If (2013)', 'Comedy|Drama|Romance']
['Balls of Fury (2007)', 'Comedy']
Evaluating the algorithm:
A rough look at the effect of the latent dimension F.
# evaluate
def evaluate(input_file, F=30):
    """
    Measure performance (precision / recall / coverage).
    Note: this re-splits the train and test sets!
    Args:
        input_file: user rating file
        F: dimension of the latent vectors
    """
    train_data, test_data = get_train_data(input_file)
    test_dict = {}
    all_movies = set()
    for userid, itemid, label in test_data:
        all_movies.add(itemid)  # every movie seen in the test set, for coverage
        if label == 1:
            if userid not in test_dict:
                test_dict[userid] = {}
            test_dict[userid][itemid] = label  # user -> liked movies, for fast membership tests
    num_recom = 10
    hit = 0
    rec_count = 0
    test_count = 0
    all_rec_movies = set()
    movie_count = len(all_movies)
    user_vec, item_vec = lfm_train(train_data, F, 0.01, 0.01, 50)
    for userid in test_dict.keys():
        test_movies = test_dict.get(userid, {})
        rec_movies = give_recom_result(user_vec, item_vec, userid)
        for itemid, score in rec_movies:
            if itemid in test_movies:
                hit += 1
            all_rec_movies.add(itemid)
        rec_count += num_recom
        test_count += len(test_movies)  # total liked test movies, for recall
    precision = hit / (1.0 * rec_count)
    recall = hit / (1.0 * test_count)
    coverage = len(all_rec_movies) / (1.0 * movie_count)
    print('precision=%.4f\trecall=%.4f\tcoverage=%.4f' % (precision, recall, coverage))
    return [precision, recall, coverage]
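With F exposed as a parameter of evaluate (as above), a rough sweep is straightforward; a sketch, with illustrative values:

for F in (10, 20, 30, 50):
    print('F=%d' % F)
    evaluate("./ml-latest-small/ratings.csv", F=F)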
In real-world applications, LFM training becomes too slow when the number of users or items is very large:
distributed computation is often used to speed up training;
alternatively, CF can first narrow the search to a candidate set, after which LFM ranks those candidates and outputs the recommendations (a sketch follows below).
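A minimal sketch of that second strategy, where cf_candidates is a hypothetical stand-in for whatever the CF stage produces as candidate itemids:

import numpy as np

def rerank_with_lfm(user_vec, item_vec, userid, cf_candidates, num_recom=10):
    """Hypothetical two-stage ranking: CF proposes candidates, LFM orders them."""
    if userid not in user_vec:
        return []
    uvec = user_vec[userid]
    scored = []
    for itemid in cf_candidates:    # score only the CF candidate set,
        if itemid not in item_vec:  # not the whole catalog
            continue
        ivec = item_vec[itemid]
        score = np.dot(uvec, ivec) / (np.linalg.norm(uvec) * np.linalg.norm(ivec))
        scored.append((itemid, round(float(score), 3)))
    return sorted(scored, key=lambda x: x[1], reverse=True)[:num_recom]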