本文将基于用户的协同过滤(User-Based Collaborative Filtering,
UBCF)与交替最小二乘(Alternating Least Squares, ALS)结合起来,设计一个混合推荐系统。这种系统可以利用
ALS 的优点(如处理稀疏数据的能力)来改进基于用户的协同过滤。
基于用户的协同过滤根据用户之间的相似度来进行推荐:系统先寻找与目标用户兴趣相似的其他用户,然后把这些相似用户喜欢的物品推荐给目标用户。
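基于用户的协同过滤的基本步骤如下:
1. 计算用户之间的相似度。因为不同用户之间的评分范围差异比较大,这里使用余弦相似度来计算;
2. 选出与目标用户相似度不低于阈值的邻居用户;
3. 对目标用户尚未评分的物品,把邻居的评分按相似度加权求和,得到预测得分;
4. 按预测得分从高到低排序,取前 N 个物品作为推荐结果。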
import numpy as np
from scipy.spatial.distance import cosine
from collections import defaultdict
def calculate_similarity(ratings_matrix):
    """Return the user-by-user cosine similarity matrix.

    Args:
        ratings_matrix: 2-D array-like of shape (num_users, num_items); each
            row is one user's rating vector.

    Returns:
        A float ``numpy.ndarray`` of shape (num_users, num_users) whose entry
        ``[i][j]`` is the cosine similarity between users ``i`` and ``j``.
        The diagonal is left at 0 (a user is not its own neighbour), and any
        pair involving an all-zero rating vector gets similarity 0 instead
        of NaN, because the cosine is undefined for a zero vector.
    """
    ratings = np.asarray(ratings_matrix, dtype=float)
    norms = np.linalg.norm(ratings, axis=1)

    # cos(u, v) = u.v / (|u| |v|), computed for every user pair at once.
    dot_products = ratings @ ratings.T
    norm_products = np.outer(norms, norms)
    similarity_matrix = np.divide(
        dot_products,
        norm_products,
        out=np.zeros_like(dot_products),
        where=norm_products > 0,  # zero-norm rows keep the 0 from `out`
    )

    # Guard against floating-point drift slightly outside [-1, 1].
    np.clip(similarity_matrix, -1.0, 1.0, out=similarity_matrix)
    np.fill_diagonal(similarity_matrix, 0.0)
    return similarity_matrix
def get_top_n_recommendations(user_id, ratings_matrix, similarity_matrix, n=10, threshold=0.0):
    """Recommend unrated items to a user from similar users' ratings.

    Each candidate item is scored by the similarity-weighted sum of the
    ratings given to it by the target user's neighbours.

    Args:
        user_id: Row index of the target user.
        ratings_matrix: 2-D array-like (num_users, num_items); 0 means the
            user has not rated the item.
        similarity_matrix: 2-D array-like (num_users, num_users) of
            user-user similarities; only row ``user_id`` is read.
        n: Maximum number of recommendations to return.
        threshold: Minimum similarity a user needs to count as a neighbour.
            The original code referenced an undefined ``threshold`` name and
            raised ``NameError``; it is now an explicit keyword.

    Returns:
        A list of at most ``n`` ``(item_id, score)`` tuples sorted by score,
        highest first. Only items the target user has not rated and at
        least one neighbour has rated (> 0) are included.
    """
    ratings = np.asarray(ratings_matrix, dtype=float)
    similarities = np.asarray(similarity_matrix, dtype=float)[user_id]

    # Neighbours: every user except the target whose similarity reaches the threshold.
    neighbor_mask = similarities >= threshold
    neighbor_mask[user_id] = False

    neighbor_ratings = ratings[neighbor_mask]
    rated_by_neighbor = neighbor_ratings > 0

    # Similarity-weighted sum of neighbour ratings; non-positive ratings contribute nothing.
    scores = similarities[neighbor_mask] @ np.where(rated_by_neighbor, neighbor_ratings, 0.0)

    # Candidates: not yet rated by the target, rated by at least one neighbour.
    candidates = np.flatnonzero((ratings[user_id] == 0) & rated_by_neighbor.any(axis=0))
    ranked = sorted(candidates, key=lambda item_id: scores[item_id], reverse=True)[:n]
    return [(int(item_id), float(scores[item_id])) for item_id in ranked]
# Sample data: one row per user, one column per item; 0 marks an unrated item.
ratings_matrix = np.array(
    [
        [5, 3, 0, 1],
        [4, 0, 0, 1],
        [1, 1, 0, 5],
        [1, 0, 0, 4],
        [0, 1, 5, 4],
    ]
)
similarity_matrix = calculate_similarity(ratings_matrix)

# Fetch and print the top-2 recommendations for user 0.
top_n_recommendations = get_top_n_recommendations(
    0, ratings_matrix, similarity_matrix, n=2
)
print("Top 2 Recommendations for User 0:")
for item_id, prediction in top_n_recommendations:
    print(f"Item {item_id}: {prediction:.2f}")
基于用户的协同过滤是一种经典的推荐算法:它通过寻找具有相似兴趣的用户来推荐物品,关注的是用户之间的相似性而非物品之间的相似性。然而,在大规模数据集上,计算用户相似度矩阵需要大量计算资源,这种方法可能因此变得不太实用。
鉴于上述缺点,现代推荐系统通常采用其他技术或组合多种技术来克服这些问题:
参照:《推荐系统中 ALS 验证》
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
def calculate_similarity(ratings_matrix):
    """Return the pairwise cosine similarity between users.

    Thin wrapper that passes ``ratings_matrix`` straight to
    ``cosine_similarity`` and returns its result unchanged.
    """
    return cosine_similarity(ratings_matrix)
def _als_half_step(ratings, observed, fixed_factors, solved_factors, reg_param, identity):
    """Re-solve every row of ``solved_factors`` in place with ``fixed_factors`` held constant."""
    for row, (row_ratings, row_mask) in enumerate(zip(ratings, observed)):
        count = int(row_mask.sum())
        if count == 0:
            # Nothing observed for this user/item: keep its current factors.
            continue
        # Boolean-mask indexing selects rows; indexing with a tuple would be
        # interpreted by NumPy as a multi-dimensional index instead.
        basis = fixed_factors[row_mask]
        gram = basis.T @ basis + reg_param * count * identity
        solved_factors[row] = np.linalg.solve(gram, basis.T @ row_ratings[row_mask])


def als_train(ratings_matrix, latent_features=10, learning_rate=0.001, reg_param=0.02,
              iterations=100, random_state=None):
    """Factorise a ratings matrix with Alternating Least Squares.

    Only strictly positive entries are treated as observed ratings. Each
    iteration solves a regularised least-squares problem for every user with
    the item factors fixed, then for every item with the user factors fixed.
    The regulariser for a row is ``reg_param`` times its number of observed
    ratings.

    Args:
        ratings_matrix: 2-D ratings (num_users, num_items). Either a dense
            array-like or any object exposing ``toarray()`` (e.g. a SciPy
            sparse matrix).
        latent_features: Number of latent factors per user/item.
        learning_rate: Unused; ALS has closed-form updates. Kept so existing
            callers that pass it keep working.
        reg_param: Regularisation strength.
        iterations: Number of full user+item sweeps.
        random_state: Optional seed for reproducible initialisation. When
            ``None`` the global ``np.random`` state is used, as before.

    Returns:
        ``(user_features, item_features)`` with shapes
        (num_users, latent_features) and (num_items, latent_features).
        Users or items without any observed rating keep their random
        initial factors.

    Raises:
        numpy.linalg.LinAlgError: If a normal-equation system is singular,
            which can happen when ``reg_param`` is 0.
    """
    if hasattr(ratings_matrix, "toarray"):
        ratings_matrix = ratings_matrix.toarray()
    ratings = np.asarray(ratings_matrix, dtype=float)
    num_users, num_items = ratings.shape

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    user_features = rng.normal(scale=1.0 / latent_features, size=(num_users, latent_features))
    item_features = rng.normal(scale=1.0 / latent_features, size=(num_items, latent_features))

    observed = ratings > 0
    identity = np.eye(latent_features)

    for _ in range(iterations):
        # Users first (items fixed), then items (users fixed).
        _als_half_step(ratings, observed, item_features, user_features, reg_param, identity)
        _als_half_step(ratings.T, observed.T, user_features, item_features, reg_param, identity)

    return user_features, item_features
def get_top_n_recommendations(user_id, ratings_matrix, user_features, item_features, n=10,
                              similarity_matrix=None, als_weight=0.7):
    """Recommend unrated items by blending ALS and user-based predictions.

    The ALS prediction for an item is the dot product of the user's and the
    item's latent factors. The user-based prediction is the
    similarity-weighted average of the ratings other users gave the item, so
    both parts are on the rating scale before they are blended.

    Args:
        user_id: Row index of the target user.
        ratings_matrix: 2-D ratings (num_users, num_items); 0 means unrated.
            Either a dense array-like or any object exposing ``toarray()``
            (e.g. a SciPy sparse matrix).
        user_features: Array (num_users, latent_features) of user factors.
        item_features: Array (num_items, latent_features) of item factors.
        n: Maximum number of recommendations to return.
        similarity_matrix: Optional (num_users, num_users) user similarities;
            only row ``user_id`` is read. When omitted, cosine similarities
            between the target user and all other users are computed from
            ``ratings_matrix`` instead of relying on a module-level variable.
        als_weight: Weight of the ALS prediction in the blend; the user-based
            prediction gets ``1 - als_weight``.

    Returns:
        A list of at most ``n`` ``(item_id, score)`` tuples sorted by blended
        score, highest first. Only items the target user has not rated and
        at least one other user has rated (> 0) are included.
    """
    if hasattr(ratings_matrix, "toarray"):
        ratings_matrix = ratings_matrix.toarray()
    ratings = np.asarray(ratings_matrix, dtype=float)
    target_ratings = ratings[user_id]

    if similarity_matrix is None:
        # Cosine similarity between the target user and every user.
        norms = np.linalg.norm(ratings, axis=1)
        dot_products = ratings @ target_ratings
        norm_products = norms * norms[user_id]
        similarities = np.divide(
            dot_products,
            norm_products,
            out=np.zeros_like(dot_products),
            where=norm_products > 0,
        )
    else:
        # Copy so the caller's matrix is not modified below.
        similarities = np.array(similarity_matrix[user_id], dtype=float).ravel()
    similarities[user_id] = 0.0  # the target user is not its own neighbour

    # User-based part: similarity-weighted average over users who rated the item.
    rated = ratings > 0
    weighted_sum = similarities @ np.where(rated, ratings, 0.0)
    weight_total = np.abs(similarities) @ rated.astype(float)
    cf_predictions = np.divide(
        weighted_sum,
        weight_total,
        out=np.zeros_like(weighted_sum),
        where=weight_total > 0,
    )

    # ALS part: predicted rating for every item.
    als_predictions = np.asarray(user_features)[user_id] @ np.asarray(item_features).T

    combined = als_weight * als_predictions + (1.0 - als_weight) * cf_predictions

    # An item the target has not rated can only have been rated by someone else.
    candidates = np.flatnonzero((target_ratings == 0) & rated.any(axis=0))
    ranked = sorted(candidates, key=lambda item_id: combined[item_id], reverse=True)[:n]
    return [(int(item_id), float(combined[item_id])) for item_id in ranked]
# Sample data: one row per user, one column per item; 0 marks an unrated item.
ratings_data = np.array(
    [
        [5, 3, 0, 1],
        [4, 0, 0, 1],
        [1, 1, 0, 5],
        [1, 0, 0, 4],
        [0, 1, 5, 4],
    ]
)

# Store the ratings in sparse (CSR) form.
ratings_matrix = csr_matrix(ratings_data)

# Train the ALS model.
user_features, item_features = als_train(ratings_matrix)

# User-user similarities are computed on the dense view of the ratings.
similarity_matrix = calculate_similarity(ratings_matrix.toarray())

# Fetch and print the top-2 recommendations for user 0.
top_n_recommendations = get_top_n_recommendations(
    0, ratings_matrix, user_features, item_features, n=2
)
print("Top 2 Recommendations for User 0:")
for item_id, prediction in top_n_recommendations:
    print(f"Item {item_id}: {prediction:.2f}")