协同过滤算法常用于商品推荐或者类似的场合,根据用户之间或商品之间的相似性进行精准推荐
UserCF:
def simmialr(user_movie):
    """Compute user-user cosine similarity from co-rated movie counts.

    Args:
        user_movie: mapping ``user -> {movie: rating}``.

    Returns:
        A pair ``(W, D)``. ``W[u][v]`` is the number of movies rated by both
        ``u`` and ``v`` divided by ``sqrt(len(user_movie[u]) * len(user_movie[v]))``
        for every pair of distinct users sharing at least one movie. ``D`` is
        the inverted table ``movie -> {user: rating}``.
    """
    D, N = {}, {}
    # Movie -> users inverted table. Ratings are kept so the table can be
    # indexed as D[movie][user] when scoring recommendations.
    for user, items in user_movie.items():
        N[user] = len(items)
        for movie, score in items.items():
            D.setdefault(movie, {})[user] = score
    # User-to-user co-occurrence: C[u][v] = movies rated by both u and v.
    C = {}
    for raters in D.values():
        for u in raters:
            row = C.setdefault(u, {})
            for v in raters:
                if u == v:
                    continue
                row[v] = row.get(v, 0) + 1
    # Normalise the counts into similarities.
    W = {}
    for u, related_users in C.items():
        W[u] = {
            v: cuv / math.sqrt(N[u] * N[v])
            for v, cuv in related_users.items()
        }
    return W, D
def recommend2(user, movie_user, W, K, N):
    """Recommend up to N movies to ``user`` from the K most similar users.

    Args:
        user: target user id.
        movie_user: inverted table ``movie -> {user: rating}``.
        W: user similarity matrix, indexed as ``W[u][v]``.
        K: number of most-similar users to consult.
        N: maximum number of movies to return.

    Returns:
        dict ``movie -> score`` in descending score order, where score is the
        sum of ``rating * similarity`` over the consulted users.
    """
    rank = {}
    # A user missing from W has no neighbours, so nothing is recommended.
    neighbours = sorted(W.get(user, {}).items(), key=lambda x: x[1], reverse=True)[0:K]
    for u, wu in neighbours:
        for movie, users in movie_user.items():
            # Keep movies the neighbour rated and the target user did not.
            if u in users and user not in users:
                rank[movie] = rank.get(movie, 0) + users[u] * wu
    return dict(sorted(rank.items(), key=lambda x: x[1], reverse=True)[0:N])
ItemCF:
def cosineItemSimilarity(user_movie):
    """Compute item-item similarity from co-rating counts.

    Args:
        user_movie: mapping ``user -> {movie: rating}``, e.g.
            ``{'film6': 4, 'film11': 2, 'film8': 2, 'film14': 2}`` per user.

    Returns:
        ``W`` where ``W[i][j] = C[i][j] / sqrt(N[i] * N[j])``; ``C[i][j]`` is
        the number of users who rated both movies and ``N[i]`` the number of
        users who rated movie ``i``.
    """
    C = {}
    N = {}
    for items in user_movie.values():
        for i in items:
            N[i] = N.get(i, 0) + 1  # number of users who rated movie i
            row = C.setdefault(i, {})
            for j in items:
                if i == j:
                    continue
                row[j] = row.get(j, 0) + 1  # co-rating count for (i, j)
    # The result must be written to the same dict that is created here.
    W = {}
    for i, related_items in C.items():
        W[i] = {
            j: cij / math.sqrt(N[i] * N[j])
            for j, cij in related_items.items()
        }
    return W
def recommend(user, user_movie, W, K, N):
    """Recommend up to N unseen movies to ``user`` using item similarity.

    Args:
        user: target user id; must be a key of ``user_movie``.
        user_movie: mapping ``user -> {movie: rating}``.
        W: item similarity matrix, indexed as ``W[i][j]``.
        K: number of most-similar items consulted per rated movie.
        N: maximum number of movies to return.

    Returns:
        dict ``movie -> score`` in descending score order.
    """
    rank = {}
    action_item = user_movie[user]
    for item, score in action_item.items():
        # Only the top-K most similar items are used, to limit computation.
        for j, Wj in sorted(W[item].items(), key=lambda x: x[1], reverse=True)[0:K]:
            if j in action_item:  # the user has already seen this movie
                continue
            # Unseen movie: add the user's rating times the item similarity.
            rank[j] = rank.get(j, 0) + score * Wj
    return dict(sorted(rank.items(), key=lambda x: x[1], reverse=True)[0:N])
虚拟数据:
10个用户,对14部电影的评分
ex:计算相似度的三种方法:
在预测时使用加权的方法
ex:对user0 ->film2的预测:
取N个最相近的用户
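$$p = \frac{w_{N_1} \cdot r_{N_1,\text{film2}} + w_{N_2} \cdot r_{N_2,\text{film2}}}{w_{N_1} + w_{N_2}}$$
其中 $w_{N_i}$ 为用户 $N_i$ 与user0的相似度,$r_{N_i,\text{film2}}$ 为用户 $N_i$ 对film2的评分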
利用上述表格,分别用不同的方法对user0做推荐
UserCF,用user-item倒排表余弦相似度计算相似度
UserCF直接在表格用皮尔逊相关系数计算相似度
使用MovieLens作为实验数据
UserCF,用item-user计算相似度
可以看出genres大多数相同,效果应该还不错
ItemCF,使用user-item倒排表使用余弦相似度
耗时较长,应该是item数量远远大于user数量,item相似度矩阵非常大
UserCF直接在表格用皮尔逊相关系数计算相似度
UserCF和ItemCF代码
# -*- coding: utf-8 -*-
# datetime:2020/6/9 11:48
import csv
import math
import time
from itertools import islice
def readData(file_user_movie='./ml-latest-small/ratings.csv',
             file_movie_info='./ml-latest-small/movies.csv'):
    """Load ratings and movie descriptions from two CSV files.

    Both files are read as UTF-8 CSV with one header row, which is skipped.
    Rows with fewer than three fields are ignored.

    Args:
        file_user_movie: path of the ratings file; the first three columns
            of each row are used as user id, movie id and rating.
        file_movie_info: path of the movie file; the first three columns of
            each row are used as movie id, title and genres.

    Returns:
        A pair ``(user_movie, movies)``: ``user_movie[user][movie]`` is the
        rating as ``float`` and ``movies[movie]`` is the string
        ``'<title> genres:<genres>'``.
    """
    user_movie = {}
    # csv.reader handles quoted titles that contain commas and strips the
    # line terminator; the with-block closes the file handle.
    with open(file_user_movie, encoding='utf-8', newline='') as ratings_file:
        reader = csv.reader(ratings_file)
        next(reader, None)  # skip the header row
        for row in reader:
            if len(row) < 3:
                continue
            user, item, score = row[0:3]
            user_movie.setdefault(user, {})[item] = float(score)
    movies = {}
    with open(file_movie_info, encoding='utf-8', newline='') as movies_file:
        reader = csv.reader(movies_file)
        next(reader, None)  # skip the header row
        for row in reader:
            if len(row) < 3:
                continue
            movieId, movieTitle, genres = row[0:3]
            movies[movieId] = movieTitle + ' ' + 'genres:{}'.format(genres)
    return user_movie, movies
# ItemCF
# 使用余弦计算相似度
def cosineItemSimilarity(user_movie):
    """Build the item-item similarity matrix from co-rating counts.

    Args:
        user_movie: mapping ``user -> {movie: rating}``.

    Returns:
        dict ``W`` with ``W[i][j] = shared / sqrt(n_i * n_j)``, where
        ``shared`` is the number of users who rated both movies and ``n_i``
        is the number of users who rated movie ``i``.
    """
    rater_count = {}  # movie id -> number of users who rated it
    co_count = {}     # movie id -> {other movie id -> users who rated both}
    for rated in user_movie.values():
        for first in rated:
            rater_count[first] = rater_count.get(first, 0) + 1
            row = co_count.setdefault(first, {})
            for second in rated:
                if second != first:
                    row[second] = row.get(second, 0) + 1

    similarity = {}
    for first, row in co_count.items():
        similarity[first] = {
            second: shared / (math.sqrt(rater_count[first] * rater_count[second]))
            for second, shared in row.items()
        }
    return similarity
def recommend(user, user_movie, W, K, N):
    """Score unseen movies for ``user`` from the movies they already rated.

    For each rated movie, its K most similar movies are looked up in ``W``;
    each unseen one accumulates ``rating * similarity``.

    Args:
        user: target user id; must be a key of ``user_movie``.
        user_movie: mapping ``user -> {movie: rating}``.
        W: item similarity matrix, indexed as ``W[i][j]``.
        K: number of most-similar items consulted per rated movie.
        N: maximum number of movies to return.

    Returns:
        dict ``movie -> score`` holding the N best scores, highest first.
    """
    seen = user_movie[user]
    scores = {}
    for watched, rating in seen.items():
        by_similarity = sorted(W[watched].items(), key=lambda pair: pair[1], reverse=True)
        for candidate, sim in by_similarity[:K]:
            if candidate not in seen:
                scores[candidate] = scores.get(candidate, 0) + rating * sim
    best = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return dict(best[:N])
# userCF
def simmialr(user_movie):
    """Compute user-user similarity through a movie -> users inverted table.

    Args:
        user_movie: mapping ``user -> {movie: rating}``.

    Returns:
        A pair ``(W, D)``: ``W[u][v]`` is the number of movies both users
        rated divided by ``sqrt`` of the product of their rated-movie counts;
        ``D`` is the inverted table ``movie -> {user: rating}``.
    """
    rated_count = {}  # user -> number of movies rated
    movie_users = {}  # movie -> {user: rating}
    for uid, ratings in user_movie.items():
        rated_count[uid] = len(ratings)
        for film, score in ratings.items():
            movie_users.setdefault(film, {})[uid] = score

    # overlap[a][b] counts the movies rated by both a and b.
    overlap = {}
    for raters in movie_users.values():
        for a in raters:
            row = overlap.setdefault(a, {})
            for b in raters:
                if a != b:
                    row[b] = row.get(b, 0) + 1

    similarity = {}
    for a, row in overlap.items():
        similarity[a] = {
            b: shared / math.sqrt((rated_count[a] * rated_count[b]))
            for b, shared in row.items()
        }
    return similarity, movie_users
def recommend2(user, movie_user, W, K, N):
    """Recommend movies to ``user`` from the ratings of similar users.

    Args:
        user: target user id; must be a key of ``W``.
        movie_user: inverted table ``movie -> {user: rating}``.
        W: user similarity matrix, indexed as ``W[u][v]``.
        K: number of most-similar users to consult.
        N: maximum number of movies to return.

    Returns:
        dict ``movie -> score`` holding the N best scores, highest first;
        each score sums ``rating * similarity`` over the consulted users.
    """
    ordered = sorted(W[user].items(), key=lambda pair: pair[1], reverse=True)
    scores = {}
    for neighbour, weight in ordered[:K]:
        for movie, raters in movie_user.items():
            # Skip movies the neighbour did not rate or the target already rated.
            if neighbour not in raters or user in raters:
                continue
            scores[movie] = scores.get(movie, 0) + raters[neighbour] * weight
    best = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return dict(best[:N])
#
if __name__ == '__main__':
    # Load the ratings and the movie descriptions.
    user_movie, movies = readData()
    pre = time.time()
    # UserCF alternative, kept for reference:
    # UW, movie_user = simmialr(user_movie)
    # result = recommend2('1', movie_user, UW, 10, 10)
    W = cosineItemSimilarity(user_movie)
    result = recommend('1', user_movie, W, 10, 10)
    print('耗时{}s'.format(time.time() - pre))
    # One description line, one score line, then two blank lines per movie.
    for film, rating in result.items():
        print(movies[film], rating, '\n', sep='\n')
UserCF,表格使用余弦计算相似度
def simmialr(user_moives):
    """Compute user-user cosine similarity directly on the rating vectors.

    The dot product runs over the movies both users rated; each norm runs
    over all of that user's ratings.

    Args:
        user_moives: mapping ``user -> {movie: rating}``.

    Returns:
        dict ``W`` with an entry ``W[u][v]`` for every ordered pair of
        distinct users. The value is 0 when the users share no movie or when
        either rating vector has zero norm.
    """
    # Each user's norm is the same for every pair, so compute it once.
    norms = {
        user: math.sqrt(sum(score ** 2 for score in items.values()))
        for user, items in user_moives.items()
    }
    W = {}
    for user1, items1 in user_moives.items():
        row = W.setdefault(user1, {})
        for user2, items2 in user_moives.items():
            if user1 == user2:
                continue
            common = items1.keys() & items2.keys()  # movies rated by both
            denom = norms[user1] * norms[user2]
            if not common or not denom:
                # Nothing in common, or an all-zero vector: no similarity.
                row[user2] = 0
                continue
            dot = sum(items1[item] * items2[item] for item in common)
            row[user2] = dot / denom
    return W
def recommend(uuser, user_movie, W, K, N):
    """Predict ratings for unseen movies as a similarity-weighted average.

    For each movie rated by at least one of the K most similar users and not
    by ``uuser``, the prediction is
    ``sum(rating * similarity) / sum(similarity)`` over those users.

    Args:
        uuser: target user id; must be a key of ``user_movie`` and ``W``.
        user_movie: mapping ``user -> {movie: rating}``.
        W: user similarity matrix, indexed as ``W[u][v]``.
        K: number of most-similar users to consult.
        N: maximum number of movies to return.

    Returns:
        dict ``movie -> predicted rating`` holding the N best predictions,
        highest first. Movies whose similarity sum is zero are left out
        because the weighted average is undefined for them.
    """
    rank, sumN = {}, {}
    action_item = user_movie[uuser]
    neighbours = sorted(W[uuser].items(), key=lambda x: x[1], reverse=True)[0:K]
    for user, Wj in neighbours:
        for item, score in user_movie[user].items():
            if item in action_item:
                continue
            rank[item] = rank.get(item, 0) + score * Wj
            sumN[item] = sumN.get(item, 0) + Wj
    # rank and sumN share the same keys, so divide by direct lookup.
    predicted = {
        item: total / sumN[item]
        for item, total in rank.items()
        if sumN[item]
    }
    return dict(sorted(predicted.items(), key=lambda x: x[1], reverse=True)[0:N])
UserCF直接在表格用皮尔逊相关系数计算相似度(代码)
def simmialr(user_movie):
    """Compute user-user similarity with the Pearson correlation coefficient.

    For each ordered pair of distinct users, two rating vectors are built
    over the union of the movies either user rated. A movie one user did not
    rate is filled with that user's mean rating. The vectors are then passed
    to ``corrcoef``.

    Args:
        user_movie: mapping ``user -> {movie: rating}``.

    Returns:
        dict ``W`` with ``W[u][v]`` set to the value returned by
        ``corrcoef`` for every ordered pair of distinct users.
    """
    # A user's mean does not depend on the other user, so compute it once.
    # A user with no ratings gets 0.0 instead of a division by zero.
    means = {
        user: (sum(items.values()) / len(items)) if items else 0.0
        for user, items in user_movie.items()
    }
    W = {}
    for user1, items1 in user_movie.items():
        row = W.setdefault(user1, {})
        ave1 = means[user1]
        for user2, items2 in user_movie.items():
            if user1 == user2:
                continue
            ave2 = means[user2]
            u1, u2 = [], []
            for i in items1.keys() | items2.keys():
                # Fall back to the user's mean for movies they did not rate.
                u1.append(items1.get(i, ave1))
                u2.append(items2.get(i, ave2))
            row[user2] = corrcoef(u1, u2)
    return W
def multipl(a, b):
    """Return the sum of element-wise products of ``a`` and ``b`` as a float.

    Every index of ``a`` is used to read ``b``, so ``b`` must be at least as
    long as ``a``.
    """
    total = 0.0
    for idx, left in enumerate(a):
        total += left * b[idx]
    return total
def corrcoef(x, y):
    """Return the Pearson correlation coefficient of ``x`` and ``y``.

    The sums are taken over mean-centred values. This keeps both variance
    terms non-negative, so the square root cannot fail the way the one-pass
    ``sum(x^2) - sum(x)^2 / n`` form can after rounding.

    Args:
        x: sequence of numbers.
        y: sequence of numbers; expected to have the same length as ``x``.

    Returns:
        The coefficient clamped to ``[-1.0, 1.0]``, or ``0.0`` when the input
        is empty or either sequence has zero variance.
    """
    n = len(x)
    if n == 0:
        return 0.0
    mean_x = sum(x) / n
    mean_y = sum(y) / n
    dx = [value - mean_x for value in x]
    dy = [value - mean_y for value in y]
    num = sum(a * b for a, b in zip(dx, dy))
    den = math.sqrt(sum(a * a for a in dx) * sum(b * b for b in dy))
    if not den:
        # Zero variance: the correlation is undefined, report none.
        return 0.0
    # Rounding can push the ratio marginally outside the valid range.
    return max(-1.0, min(1.0, num / den))
def recommend(uuser, user_movie, W, K, N):
    """Recommend movies to ``uuser`` from the K most similar users' ratings.

    Args:
        uuser: target user id; must be a key of ``user_movie`` and ``W``.
        user_movie: mapping ``user -> {movie: rating}``.
        W: user similarity matrix, indexed as ``W[u][v]``.
        K: number of most-similar users to consult.
        N: maximum number of movies to return.

    Returns:
        dict ``movie -> score`` holding the N best scores, highest first;
        each score sums ``rating * similarity`` over the consulted users.
    """
    already_rated = user_movie[uuser].keys()
    ranked_neighbours = sorted(W[uuser].items(), key=lambda pair: pair[1], reverse=True)
    scores = {}
    for neighbour, weight in ranked_neighbours[:K]:
        for item, rating in user_movie[neighbour].items():
            if item not in already_rated:
                scores[item] = scores.get(item, 0) + rating * weight
    top = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return dict(top[:N])