推荐算法实例代码:
1. 数据处理过程:主要涉及数据的读取,对应文件 data_process.py
import pandas as pd
import os
import csv
def get_item_info(input_file):
    """
    Load item metadata from a CSV file.

    The first row is treated as a header and skipped. For every remaining
    non-empty row, the first column is used as the item ID and the rest of
    the columns are stored as that item's info.

    input_file: path of the item CSV file
    return:
        dict: {itemID: [item_info]}; an empty dict when the file does not exist
    """
    if not os.path.exists(input_file):
        return {}
    item_info = {}
    # newline="" lets the csv module handle line endings (and newlines
    # embedded in quoted fields) itself.
    with open(input_file, "r", encoding="utf-8", newline="") as file:
        rows = csv.reader(file)
        next(rows, None)  # skip the header row
        for row in rows:
            if not row:  # csv.reader yields [] for blank lines
                continue
            item_info[row[0]] = row[1:]
    return item_info
def get_average_score(input_file):
    """
    Compute the average rating of every item.

    The file is read with pandas and must contain the columns "movieId" and
    "rating". Ratings are grouped by "movieId", averaged and rounded to three
    decimals.

    input_file: path of the rating file (ratings.csv)
    return:
        dict {itemID (as str): average_score}; an empty dict when the file
        does not exist
    """
    if not os.path.exists(input_file):
        return {}
    ratings_data = pd.read_csv(input_file)
    # Mean rating per item, indexed by movieId.
    mean_by_item = ratings_data[["movieId", "rating"]].groupby("movieId").agg("mean")
    item_ids = mean_by_item.index.values.astype("str")  # item IDs as strings
    mean_scores = mean_by_item["rating"].values.round(3)  # keep three decimals
    return dict(zip(item_ids, mean_scores))
def get_train_data(input_file):
    """
    Build the LFM training samples from a rating file.

    Each data row is read as (userID, itemID, rating, ...); the first row is
    treated as a header. A rating strictly greater than 4.0 is a positive
    sample (label 1). Every other rating is a negative candidate, stored with
    the item's average score from get_average_score. For each user the number
    of positives and negatives is balanced to the smaller of the two counts:
    the first positives in file order are kept, and the negatives with the
    highest average item score are kept (label 0).

    input_file: path of the user/item rating file
    return:
        list [(userID, itemID, label), (userID, itemID, label)]; an empty
        list when the file does not exist
    """
    if not os.path.exists(input_file):
        return []
    score_dict = get_average_score(input_file)  # average score per item
    threshold = 4.0  # ratings above this value are positive samples
    pos_dict, neg_dict = {}, {}  # per-user positive / negative samples
    with open(input_file, "r", encoding="utf-8", newline="") as file:
        rows = csv.reader(file)
        next(rows, None)  # skip the header row
        for row in rows:
            if len(row) < 3:  # blank or truncated line: nothing to parse
                continue
            userID, itemID, rating = row[0], row[1], float(row[2])
            user_pos = pos_dict.setdefault(userID, [])
            user_neg = neg_dict.setdefault(userID, [])
            if rating > threshold:
                user_pos.append((itemID, 1))
            else:
                # Negative candidates carry the item's average score so they
                # can be ranked when balancing below.
                user_neg.append((itemID, score_dict.get(itemID, 0)))

    # Balance positive and negative samples per user.
    train_data = []
    for userID, positives in pos_dict.items():
        negatives = neg_dict.get(userID, [])
        data_num = min(len(positives), len(negatives))
        if data_num == 0:
            continue
        train_data += [(userID, itemID, label) for itemID, label in positives[:data_num]]
        # Keep the data_num negatives whose items have the highest average score.
        top_negatives = sorted(negatives, key=lambda x: x[1], reverse=True)[:data_num]
        train_data += [(userID, itemID, 0) for itemID, _ in top_negatives]
    return train_data
if __name__ == '__main__':
    # Smoke run: build the training set from the rating table and show the
    # first ten samples.
    ratings_path = "./data/ratings.csv"
    samples = get_train_data(ratings_path)
    print(samples[:10])
2. LFM 的训练与预测:需要导入上面的文件 data_process.py,处理流程大致如下:
import numpy as np
# import sys
# sys.path.append()
from data_process import * # 导入数据处理脚本,注意位置
import operator
from tqdm import *
def init_vector(train_data, vector_len):
    """
    Randomly initialise the user and item vectors.

    Every distinct userID / itemID found in train_data gets exactly one
    vector drawn with np.random.randn, created the first time the ID is seen.

    train_data: iterable of (userID, itemID, label) samples
    vector_len: length of each vector
    return:
        (dict {userID: user_vector}, dict {itemID: item_vector})
    """
    init_user_vec = {}
    init_item_vec = {}
    for userID, itemID, _ in train_data:
        # Only draw a vector for IDs not seen yet; redrawing on every
        # occurrence would discard the earlier draws.
        if userID not in init_user_vec:
            init_user_vec[userID] = np.random.randn(vector_len)
        if itemID not in init_item_vec:
            init_item_vec[itemID] = np.random.randn(vector_len)
    return init_user_vec, init_item_vec
def lfm_train(train_data, F, alpha, learning_rate, step):
    """
    Train the user and item vectors with per-sample gradient updates.

    For every sample the prediction is the cosine similarity of the user and
    item vectors, and the error is label minus that prediction. Both vectors
    are then moved by learning_rate * (error * other_vector - alpha * own_vector).
    The learning rate is multiplied by 0.95 after every pass over the data.

    NOTE(review): the update rule is the one for an un-normalised dot-product
    model, while the prediction is norm-normalised; this is kept as in the
    original code.

    train_data: list of (userID, itemID, label) samples
    F: number of latent features (vector length)
    alpha: regularisation coefficient
    learning_rate: initial learning rate
    step: number of passes over train_data
    return:
        dict {userID: user_vector}
        dict {itemID: item_vector}
    """
    user_vec, item_vec = init_vector(train_data, F)  # random initial vectors
    for _ in tqdm(range(step), desc="训练进度: "):
        for userID, itemID, label in train_data:
            user_vector, item_vector = user_vec[userID], item_vec[itemID]
            # Predicted preference: normalised dot product of the two vectors.
            vector_dot = np.dot(user_vector, item_vector) / (
                np.linalg.norm(user_vector) * np.linalg.norm(item_vector)
            )
            loss = label - vector_dot
            # Compute both steps from the pre-update vectors; the in-place
            # "+=" below mutates the arrays stored in user_vec / item_vec, so
            # updating one first would leak into the other's step.
            user_step = learning_rate * (loss * item_vector - alpha * user_vector)
            item_step = learning_rate * (loss * user_vector - alpha * item_vector)
            user_vector += user_step
            item_vector += item_step
        learning_rate = learning_rate * 0.95  # learning-rate decay per pass
    return user_vec, item_vec
def top_n_item(user_vec, item_vec, userid, top_n=10):
    """
    Return the items a user is predicted to like most.

    The score of an item is the dot product of the user and item vectors
    divided by the product of their norms, rounded to three decimals.

    user_vec: dict {userID: user_vector}
    item_vec: dict {itemID: item_vector}
    userid: ID of the user to recommend for
    top_n: maximum number of items to return (default 10)
    return:
        a list [(item, score), (item1, score1), ...] sorted by descending
        score; an empty list when userid is not in user_vec
    """
    if userid not in user_vec:
        return []
    user_vector = user_vec[userid]
    user_norm = np.linalg.norm(user_vector)  # loop-invariant, computed once
    # Preference score of this user for every item.
    item_score = {
        itemID: np.dot(user_vector, item_vector) / (user_norm * np.linalg.norm(item_vector))
        for itemID, item_vector in item_vec.items()
    }
    ranked = sorted(item_score.items(), key=operator.itemgetter(1), reverse=True)
    return [(itemID, round(score, 3)) for itemID, score in ranked[:top_n]]
def print_top_result(train_data, userid, top_n, item_file="./data/movies.csv"):
    """
    Print the items a user liked in the training data and the recommended items.

    :param train_data: list of (userID, itemID, label) samples; samples of
        `userid` with label 1 are printed as the user's clicked items
    :param userid: user id
    :param top_n: recommended items as a list of (itemID, score) pairs
    :param item_file: CSV file the item info is loaded from via get_item_info
    :return: None
    """
    item_info = get_item_info(item_file)  # item metadata, keyed by itemID
    print("----------------------------User clicked item----------------------------")
    for userID_tem, itemID, label in train_data:
        # Only the positive samples of the requested user are shown.
        if userID_tem == userid and label == 1:
            # .get: an item missing from the metadata prints an empty info
            # list instead of raising KeyError.
            print(f"[User clicked items] itemID : {itemID} ; item_info : {item_info.get(itemID, [])} ")
    print("----------------------------Recommendation result of LFM algorithm----------------------------")
    for itemID, score in top_n:
        print(f"[Recommended items] itemID : {itemID} ; item_info : {item_info.get(itemID, [])} ; 喜爱程度: {score}")
if __name__ == '__main__':
    # End-to-end demo: build samples, train the vectors, then show the
    # recommendations for one example user.
    train_data = get_train_data("./data/ratings.csv")
    user_vec, item_vec = lfm_train(train_data, F=32, alpha=0.01, learning_rate=0.1, step=2)
    target_user = '10'  # example user whose recommendations are printed
    recommendations = top_n_item(user_vec, item_vec, target_user)
    print_top_result(train_data, target_user, recommendations)
数据如下:
https://download.csdn.net/download/L_goodboy/86511455
网盘链接:
链接:https://pan.baidu.com/s/1kNXonLMxYaql6Hy15ktc7Q
提取码:tn2n