本文使用Netflix用户电影评分数据集,实现一个简单的基于用户的协同过滤推荐系统,其中采用皮尔逊相关系数衡量两个用户之间的相似度。
数据集地址
使用到的数据文件:
由于数据量过大,这里仅选择原数据集中的1000个用户及其评分数据进行推荐算法的简单实现,否则在单机上难以运行(仅1000个用户数据的处理时间已经达到了数十分钟)。
首先选择1000个用户:
def __selectSomeUsers(self):
    """Randomly pick up to 1000 user IDs from the raw rating files.

    Every file in ``self.file_path`` is scanned; lines ending with ``:`` are
    movie headers and are skipped, every other non-blank line is expected to
    be ``userID,rate,date``.

    Returns:
        A list of sampled user-ID strings, or an empty list when both cached
        split files (``data/train.json`` and ``data/test.json``) already
        exist, since the cached split is used instead.
    """
    print("随机选择1000个用户")
    if os.path.exists("data/train.json") and os.path.exists("data/test.json"):
        return list()

    users = set()
    for file in os.listdir(self.file_path):
        one_path = "{}/{}".format(self.file_path, file)
        print("{}".format(one_path))
        with open(one_path, "r") as fp:
            # Iterate lazily: readlines() would hold a whole data file in memory.
            for line in fp:
                line = line.strip()
                # Skip blank lines and "movieID:" header lines.
                if not line or line.endswith(":"):
                    continue
                userID, _, _ = line.split(",")
                users.add(userID)

    # Sort first so the sample does not depend on set iteration order, and use
    # a dedicated RNG seeded with self.seed so the selection is reproducible.
    pool = sorted(users)
    rng = random.Random(self.seed)
    # Never ask for more users than exist (random.sample would raise).
    some_users = rng.sample(pool, min(1000, len(pool)))
    print(some_users)
    return some_users
然后加载评分数据信息并分割训练集和测试集:
# Load the rating data and split it into training and test sets.
def _load_and_split_data(self):
    """Build (or reload) the train/test rating dictionaries.

    If ``data/train.json`` and ``data/test.json`` both exist they are loaded
    as-is. Otherwise every file in ``self.file_path`` is parsed, ratings of
    users in ``self.some_users`` are kept, and each kept rating goes to the
    test set with probability 1/50 and to the training set otherwise. The
    result is then written to the two JSON files.

    Returns:
        A ``(train, test)`` tuple; each maps userID -> {movieID: rating}.
    """
    train = dict()
    test = dict()
    if os.path.exists("data/train.json") and os.path.exists("data/test.json"):
        print("从文件中加载数据集")
        # Context managers make sure the cache files are closed after reading.
        with open("data/train.json") as fp:
            train = json.load(fp)
        with open("data/test.json") as fp:
            test = json.load(fp)
        print("数据加载完成")
    else:
        # Set membership is O(1); the list would be scanned for every rating line.
        selected = set(self.some_users)
        random.seed(self.seed)
        for file in os.listdir(self.file_path):
            one_path = "{}/{}".format(self.file_path, file)
            print("{}".format(one_path))
            with open(one_path, "r") as fp:
                # The first line of a file is the header of its first movie.
                movieID = fp.readline().split(":")[0]
                print("movie ID:" + movieID)
                # Iterate lazily instead of loading the whole file at once.
                for line in fp:
                    line = line.strip()
                    if not line:
                        continue
                    if line.endswith(":"):
                        # A new "movieID:" header starts the next movie's ratings.
                        movieID = line.split(":")[0]
                        print("movie ID:" + movieID)
                        continue
                    userID, rate, _ = line.split(",")
                    if userID in selected:
                        if random.randint(1, 50) == 1:
                            test.setdefault(userID, {})[movieID] = int(rate)
                        else:
                            train.setdefault(userID, {})[movieID] = int(rate)
        print("加载数据到 data/train.json data/test/json")
        # Create the cache directory so the dump cannot fail after the long parse.
        os.makedirs("data", exist_ok=True)
        with open("data/train.json", "w") as fp:
            json.dump(train, fp)
        with open("data/test.json", "w") as fp:
            json.dump(test, fp)
        print("数据加载完成")
    return train, test
这里采用皮尔逊相关系数衡量相似度。为了只遍历一次数据即可完成计算,使用与定义式代数等价的展开形式(其中 n 为两个用户共同评分的电影数,x_i、y_i 分别为两人对第 i 部共同电影的评分):
$$
r' = \frac{\sum_{i=1}^{n} x_i y_i - \frac{\sum_{i=1}^{n} x_i \sum_{i=1}^{n} y_i}{n}}{\sqrt{\sum_{i=1}^{n} x_i^2 - \frac{\left(\sum_{i=1}^{n} x_i\right)^2}{n}}\;\sqrt{\sum_{i=1}^{n} y_i^2 - \frac{\left(\sum_{i=1}^{n} y_i\right)^2}{n}}}
$$
def pearson(self, rating1, rating2):
    """Return the Pearson correlation of two users over their common items.

    Args:
        rating1: Mapping of item -> numeric rating for the first user.
        rating2: Mapping of item -> numeric rating for the second user.

    Returns:
        The correlation in [-1, 1], or 0 when the users share no item or
        when either user's common ratings have zero variance.
    """
    sum_xy = sum_x = sum_y = sum_x2 = sum_y2 = 0
    num = 0
    # Accumulate the sums of the one-pass formula over shared items only.
    for key, x in rating1.items():
        if key not in rating2:
            continue
        y = rating2[key]
        num += 1
        sum_xy += x * y
        sum_x += x
        sum_y += y
        sum_x2 += x * x
        sum_y2 += y * y
    if num == 0:
        return 0
    # Rounding can push a (mathematically non-negative) spread slightly below
    # zero for float ratings, which would make math.sqrt raise; clamp it.
    spread_x = max(sum_x2 - sum_x * sum_x / num, 0.0)
    spread_y = max(sum_y2 - sum_y * sum_y / num, 0.0)
    denominator = math.sqrt(spread_x) * math.sqrt(spread_y)
    if denominator == 0:
        return 0
    r = (sum_xy - (sum_x * sum_y) / num) / denominator
    # Keep the result inside the mathematical range despite rounding error.
    return max(-1.0, min(1.0, r))
推荐过程采用KNN的思路:先计算目标用户与其他所有用户的相似度并按相似度从高到低排序,取最相似的k个近邻;再以相似度为权重累加这些近邻对各电影的评分,最后选择加权得分最高的n部电影推荐给当前用户。
def recommend(self, userID):
    """Rank candidate movies for ``userID`` using its k most similar users.

    Similarity to every other training user is computed with
    ``self.pearson``; the ``self.k`` most similar users then vote for their
    movies, each vote weighted by similarity times rating.

    Args:
        userID: Key of the target user in ``self.train``.

    Returns:
        A list of ``(movieID, score)`` pairs sorted by descending score. The
        list is empty when the user has no training ratings. Movies the user
        has already rated in the training data are not included.
    """
    own_ratings = self.train.get(userID)
    if not own_ratings:
        # Without any known rating there is nothing to compare against.
        return []

    neighborUser = {
        user: self.pearson(own_ratings, ratings)
        for user, ratings in self.train.items()
        if user != userID
    }
    newNU = sorted(neighborUser.items(), key=lambda k: k[1], reverse=True)

    movies = dict()
    for sim_user, sim in newNU[:self.k]:
        for movieID, rate in self.train[sim_user].items():
            # Recommending a movie the user already rated is useless.
            if movieID in own_ratings:
                continue
            movies[movieID] = movies.get(movieID, 0) + sim * rate
    return sorted(movies.items(), key=lambda k: k[1], reverse=True)
import os
import json
import random
import math
class FisrtRec:
    """User-based collaborative-filtering recommender for Netflix-style ratings.

    The (misspelt) class name is kept unchanged for backward compatibility.

    Constructor arguments:
        file_path: Directory that contains the rating data files.
        seed: Random seed used for user sampling and the train/test split.
        k: Number of nearest neighbours used for a recommendation.
        n_items: Number of movies recommended per user during evaluation.
    """

    def __init__(self, file_path, seed, k, n_items):
        self.file_path = file_path
        self.seed = seed
        self.k = k
        self.n_items = n_items
        self.some_users = self.__selectSomeUsers()
        self.train, self.test = self._load_and_split_data()

    def __selectSomeUsers(self):
        """Randomly pick up to 1000 user IDs from the raw rating files.

        Lines ending with ``:`` are movie headers and are skipped; every
        other non-blank line is expected to be ``userID,rate,date``.

        Returns:
            A list of sampled user-ID strings, or an empty list when both
            cached split files already exist (the cache is used instead).
        """
        print("随机选择1000个用户")
        if os.path.exists("data/train.json") and os.path.exists("data/test.json"):
            return list()

        users = set()
        for file in os.listdir(self.file_path):
            one_path = "{}/{}".format(self.file_path, file)
            print("{}".format(one_path))
            with open(one_path, "r") as fp:
                # Iterate lazily: readlines() would hold a whole file in memory.
                for line in fp:
                    line = line.strip()
                    # Skip blank lines and "movieID:" header lines.
                    if not line or line.endswith(":"):
                        continue
                    userID, _, _ = line.split(",")
                    users.add(userID)

        # Sort first so the sample does not depend on set iteration order, and
        # use a dedicated seeded RNG so the selection is reproducible.
        pool = sorted(users)
        rng = random.Random(self.seed)
        # Never ask for more users than exist (random.sample would raise).
        some_users = rng.sample(pool, min(1000, len(pool)))
        print(some_users)
        return some_users

    # Load the rating data and split it into training and test sets.
    def _load_and_split_data(self):
        """Build (or reload) the train/test rating dictionaries.

        If ``data/train.json`` and ``data/test.json`` both exist they are
        loaded as-is. Otherwise the raw files are parsed, ratings of users in
        ``self.some_users`` are kept, and each kept rating goes to the test
        set with probability 1/50 and to the training set otherwise; the
        result is then written to the two JSON files.

        Returns:
            A ``(train, test)`` tuple; each maps userID -> {movieID: rating}.
        """
        train = dict()
        test = dict()
        if os.path.exists("data/train.json") and os.path.exists("data/test.json"):
            print("从文件中加载数据集")
            # Context managers make sure the cache files get closed.
            with open("data/train.json") as fp:
                train = json.load(fp)
            with open("data/test.json") as fp:
                test = json.load(fp)
            print("数据加载完成")
        else:
            # Set membership is O(1); the list would be scanned per rating line.
            selected = set(self.some_users)
            random.seed(self.seed)
            for file in os.listdir(self.file_path):
                one_path = "{}/{}".format(self.file_path, file)
                print("{}".format(one_path))
                with open(one_path, "r") as fp:
                    # The first line of a file is its first movie header.
                    movieID = fp.readline().split(":")[0]
                    print("movie ID:" + movieID)
                    for line in fp:
                        line = line.strip()
                        if not line:
                            continue
                        if line.endswith(":"):
                            # A new header starts the next movie's ratings.
                            movieID = line.split(":")[0]
                            print("movie ID:" + movieID)
                            continue
                        userID, rate, _ = line.split(",")
                        if userID in selected:
                            if random.randint(1, 50) == 1:
                                test.setdefault(userID, {})[movieID] = int(rate)
                            else:
                                train.setdefault(userID, {})[movieID] = int(rate)
            print("加载数据到 data/train.json data/test/json")
            # Create the cache directory so the dump cannot fail after parsing.
            os.makedirs("data", exist_ok=True)
            with open("data/train.json", "w") as fp:
                json.dump(train, fp)
            with open("data/test.json", "w") as fp:
                json.dump(test, fp)
            print("数据加载完成")
        return train, test

    def pearson(self, rating1, rating2):
        """Return the Pearson correlation of two users over their common items.

        Args:
            rating1: Mapping of item -> numeric rating for the first user.
            rating2: Mapping of item -> numeric rating for the second user.

        Returns:
            The correlation in [-1, 1], or 0 when the users share no item or
            when either user's common ratings have zero variance.
        """
        sum_xy = sum_x = sum_y = sum_x2 = sum_y2 = 0
        num = 0
        # Accumulate the sums of the one-pass formula over shared items only.
        for key, x in rating1.items():
            if key not in rating2:
                continue
            y = rating2[key]
            num += 1
            sum_xy += x * y
            sum_x += x
            sum_y += y
            sum_x2 += x * x
            sum_y2 += y * y
        if num == 0:
            return 0
        # Rounding can push a (mathematically non-negative) spread slightly
        # below zero for float ratings, which would make math.sqrt raise.
        spread_x = max(sum_x2 - sum_x * sum_x / num, 0.0)
        spread_y = max(sum_y2 - sum_y * sum_y / num, 0.0)
        denominator = math.sqrt(spread_x) * math.sqrt(spread_y)
        if denominator == 0:
            return 0
        r = (sum_xy - (sum_x * sum_y) / num) / denominator
        # Keep the result inside the mathematical range despite rounding.
        return max(-1.0, min(1.0, r))

    def recommend(self, userID):
        """Rank candidate movies for ``userID`` using its k most similar users.

        Args:
            userID: Key of the target user in ``self.train``.

        Returns:
            A list of ``(movieID, score)`` pairs sorted by descending score,
            where score is the sum of similarity * rating over the ``self.k``
            most similar users. The list is empty when the user has no
            training ratings. Movies the user already rated in the training
            data are not included.
        """
        own_ratings = self.train.get(userID)
        if not own_ratings:
            # Without any known rating there is nothing to compare against.
            return []

        neighborUser = {
            user: self.pearson(own_ratings, ratings)
            for user, ratings in self.train.items()
            if user != userID
        }
        newNU = sorted(neighborUser.items(), key=lambda k: k[1], reverse=True)

        movies = dict()
        for sim_user, sim in newNU[:self.k]:
            for movieID, rate in self.train[sim_user].items():
                # Recommending a movie the user already rated is useless.
                if movieID in own_ratings:
                    continue
                movies[movieID] = movies.get(movieID, 0) + sim * rate
        return sorted(movies.items(), key=lambda k: k[1], reverse=True)

    def evaluate(self, num=100):
        """Estimate precision on a fixed random sample of test users.

        For each sampled user the top ``self.n_items`` recommendations are
        compared with that user's test-set movies.

        Args:
            num: Maximum number of test users to sample.

        Returns:
            The mean of hits / ``self.n_items`` over the sampled users, or
            0.0 when there is nothing to evaluate.
        """
        print("评估准确率")
        # random.sample needs a sequence; a dict keys view raises on 3.11+.
        candidates = list(self.test)
        sample_size = max(0, min(num, len(candidates)))
        if sample_size == 0 or self.n_items <= 0:
            return 0.0

        # A private RNG keeps the fixed evaluation seed from reseeding the
        # module-level generator as a side effect.
        rng = random.Random(10)
        precisions = list()
        for userID in rng.sample(candidates, sample_size):
            result = self.recommend(userID)[:self.n_items]
            held_out = self.test[userID]
            hit = sum(1 for item, _ in result if item in held_out)
            precisions.append(hit / self.n_items)
        return sum(precisions) / len(precisions)
if __name__ == "__main__":
    # Script entry point: build the recommender from the local dataset
    # directory and report its precision on the held-out ratings.
    file_path = "C:\\Users\\Mr.Throne\\Desktop\\推荐系统\\archive\\data"
    seed, k, n_items = 30, 10, 5
    f_rec = FisrtRec(file_path, seed, k, n_items)
    # Example of a single-user query: f_rec.recommend("968796")
    precision = f_rec.evaluate()
    print("算法推荐准确率:{}".format(precision))