推荐系统学习(一)--电影推荐系统搭建

1. 推荐系统的架构

本文采用的数据集来源于Netflix用户电影评分数据,实现一个简单的基于用户的协同过滤推荐系统,其中采用皮尔逊系数衡量两个用户之间的相似度。
数据集地址
使用到的数据文件:
推荐系统学习(一)--电影推荐系统搭建_第1张图片

2. 数据的预处理

由于数据量过大,这里仅选择原数据集中的1000个用户及其评分数据进行推荐算法的简单实现,否则在单机上难以运行(仅1000个用户数据的处理时间已经达到了数十分钟)。
首先选择1000个用户:

    def __selectSomeUsers(self):
        print("随机选择1000个用户")
        if os.path.exists("data/train.json") and os.path.exists("data/test.json"):
            return list()
        else:
            users = set()
            for file in os.listdir(self.file_path):
                one_path = "{}/{}".format(self.file_path, file)
                print("{}".format(one_path))
                with open(one_path, "r") as fp:
                    for line in fp.readlines():
                        if line.strip().endswith(":"):
                            continue
                        userID,_,_ = line.split(",")
                        users.add(userID)
            some_users = random.sample(list(users),1000)
            print(some_users)
            return some_users

然后加载评分数据信息并分割训练集和测试集:

 # 加载并拆分数据
  def _load_and_split_data(self):
      train = dict()
      test = dict()
      if os.path.exists("data/train.json") and os.path.exists("data/test.json"):
          print("从文件中加载数据集")
          train = json.load(open("data/train.json"))
          test = json.load(open("data/test.json"))
          print("数据加载完成")
      else:
          i=0
          random.seed(self.seed)
          for file in os.listdir(self.file_path):
              one_path = "{}/{}".format(self.file_path, file)
              print("{}".format(one_path))
              with open(one_path, "r") as fp:
                  movieID = fp.readline().split(":")[0]
                  print("movie ID:"+movieID)
                  for line in fp.readlines():
                      if line.strip().endswith(":"):
                          movieID = line.split(":")[0]
                          print("movie ID:"+movieID)
                          continue
                      userID, rate, _ = line.split(",")
                      if(userID in self.some_users):
                          if random.randint(1,50) == 1:
                              test.setdefault(userID,{})[movieID] = int(rate)
                          else:
                              train.setdefault(userID, {})[movieID] = int(rate)
          print("加载数据到 data/train.json data/test/json")
          json.dump(train,open("data/train.json","w"))
          json.dump(test,open("data/test.json","w"))
          print("数据加载完成")
      return train,test

3. 计算用户相似度

这里采用皮尔逊系数进行计算,采用其近似计算如下:
$$r'=\frac{\sum_{i=1}^{n}x_iy_i-\frac{\sum_{i=1}^{n}x_i\sum_{i=1}^{n}y_i}{n}}{\sqrt{\sum_{i=1}^{n}x_i^2-\frac{\left(\sum_{i=1}^{n}x_i\right)^2}{n}}\,\sqrt{\sum_{i=1}^{n}y_i^2-\frac{\left(\sum_{i=1}^{n}y_i\right)^2}{n}}}$$

    def pearson(self, rating1, rating2):
        """Return the Pearson correlation of two users over their shared movies.

        Args:
            rating1: mapping ``movieID -> rating`` of the first user.
            rating2: mapping ``movieID -> rating`` of the second user.

        Returns:
            The correlation computed with the single-pass sum formula, or ``0``
            when the users share no movie or when either user's shared ratings
            have zero variance.
        """
        shared = [item for item in rating1 if item in rating2]
        count = len(shared)
        if not count:
            return 0

        products = 0
        total_x = total_y = 0
        squares_x = squares_y = 0
        for item in shared:
            a, b = rating1[item], rating2[item]
            products += a * b
            total_x += a
            total_y += b
            squares_x += math.pow(a, 2)
            squares_y += math.pow(b, 2)

        spread_x = math.sqrt(squares_x - math.pow(total_x, 2) / count)
        spread_y = math.sqrt(squares_y - math.pow(total_y, 2) / count)
        denominator = spread_x * spread_y
        if denominator == 0:
            return 0
        return (products - (total_x * total_y) / count) / denominator

4. 基于协同过滤进行推荐

即先计算目标用户与训练集中其他每个用户的皮尔逊相似度,按相似度从高到低排序后取前k个近邻(KNN);再把这些近邻评过分的电影按“相似度×评分”累加得到加权得分,最后选择得分最高的n部电影推荐给当前用户。

    def recommend(self,userID):
        neighborUser = dict()
        for user in self.train.keys():
            if userID != user:
                distance = self.pearson(self.train[userID], self.train[user])
                neighborUser[user] = distance
        newNU = sorted(neighborUser.items(), key= lambda k:k[1],reverse=True)
        movies = dict()
        for (sim_user,sim) in newNU[:self.k]:
            for movieID in self.train[sim_user].keys():
                movies.setdefault(movieID,0)
                movies[movieID] += sim * self.train[sim_user][movieID]
        newMovies = sorted(movies.items(),key=lambda k:k[1],reverse=True)
        return newMovies

代码附录:

import os
import json
import random
import math

class FisrtRec:
    """User-based collaborative-filtering movie recommender.

    Users are compared with the Pearson correlation of their shared ratings;
    recommendations come from the ratings of the ``k`` most similar users.

    Args:
        file_path: directory containing the raw rating files.
        seed: seed for the random number generator.
        k: number of nearest neighbours to use.
        n_items: number of movies to recommend.
    """

    def __init__(self, file_path, seed, k, n_items):
        self.file_path = file_path
        self.seed = seed
        self.k = k
        self.n_items = n_items
        self.some_users = self.__selectSomeUsers()
        self.train,self.test = self._load_and_split_data()

    def __selectSomeUsers(self):
        """Pick the random subset of users the experiment is restricted to.

        Lines ending in ``:`` are movie headers and are skipped, blank lines
        are ignored, and every other line must be ``userID,rating,date``.

        Returns:
            A list of at most 1000 distinct user IDs (strings); an empty list
            when both cached split files already exist.

        Raises:
            ValueError: if a non-blank rating line does not have three
                comma-separated fields.
        """
        print("随机选择1000个用户")
        if os.path.exists("data/train.json") and os.path.exists("data/test.json"):
            return []

        users = set()
        for file in os.listdir(self.file_path):
            one_path = "{}/{}".format(self.file_path, file)
            print("{}".format(one_path))
            with open(one_path, "r") as fp:
                # Stream line by line; the rating files are large.
                for line in fp:
                    line = line.strip()
                    if not line or line.endswith(":"):
                        continue
                    userID, _, _ = line.split(",")
                    users.add(userID)

        # Sorted input plus a generator seeded with self.seed makes the draw
        # reproducible; the size cap avoids ValueError on small datasets.
        rng = random.Random(self.seed)
        some_users = rng.sample(sorted(users), min(1000, len(users)))
        print(some_users)
        return some_users

    def _load_and_split_data(self):
        """Return the ``(train, test)`` rating dictionaries, building them if needed.

        Both dictionaries map ``userID -> {movieID: rating}``. Cached
        ``data/train.json`` / ``data/test.json`` are loaded when both exist.
        Otherwise the raw files are parsed, ratings of users in
        ``self.some_users`` are kept, each kept rating goes to the test set
        with probability 1/50, and both splits are written to ``data``.

        Returns:
            Tuple ``(train, test)``.
        """
        train = dict()
        test = dict()
        if os.path.exists("data/train.json") and os.path.exists("data/test.json"):
            print("从文件中加载数据集")
            with open("data/train.json") as fp:
                train = json.load(fp)
            with open("data/test.json") as fp:
                test = json.load(fp)
            print("数据加载完成")
            return train, test

        random.seed(self.seed)
        # A set gives O(1) membership tests; the check runs once per rating line.
        selected = set(self.some_users)
        # Sorted so the seeded split does not depend on directory listing order.
        for file in sorted(os.listdir(self.file_path)):
            one_path = "{}/{}".format(self.file_path, file)
            print("{}".format(one_path))
            with open(one_path, "r") as fp:
                movieID = fp.readline().split(":")[0]
                print("movie ID:"+movieID)
                for line in fp:
                    line = line.strip()
                    if not line:
                        continue
                    if line.endswith(":"):
                        movieID = line.split(":")[0]
                        print("movie ID:"+movieID)
                        continue
                    userID, rate, _ = line.split(",")
                    if userID in selected:
                        if random.randint(1,50) == 1:
                            test.setdefault(userID,{})[movieID] = int(rate)
                        else:
                            train.setdefault(userID, {})[movieID] = int(rate)
        print("加载数据到 data/train.json data/test.json")
        # The cache directory may not exist on a first run.
        os.makedirs("data", exist_ok=True)
        with open("data/train.json", "w") as fp:
            json.dump(train, fp)
        with open("data/test.json", "w") as fp:
            json.dump(test, fp)
        print("数据加载完成")
        return train,test

    def pearson(self, rating1, rating2):
        """Return the Pearson correlation of two users over their shared movies.

        Args:
            rating1: mapping ``movieID -> rating`` of the first user.
            rating2: mapping ``movieID -> rating`` of the second user.

        Returns:
            The correlation computed with the single-pass sum formula, or ``0``
            when the users share no movie or when either user's shared ratings
            have zero variance.
        """
        shared = [item for item in rating1 if item in rating2]
        count = len(shared)
        if not count:
            return 0

        products = 0
        total_x = total_y = 0
        squares_x = squares_y = 0
        for item in shared:
            a, b = rating1[item], rating2[item]
            products += a * b
            total_x += a
            total_y += b
            squares_x += math.pow(a, 2)
            squares_y += math.pow(b, 2)

        spread_x = math.sqrt(squares_x - math.pow(total_x, 2) / count)
        spread_y = math.sqrt(squares_y - math.pow(total_y, 2) / count)
        denominator = spread_x * spread_y
        if denominator == 0:
            return 0
        return (products - (total_x * total_y) / count) / denominator

    def recommend(self,userID):
        """Rank candidate movies for ``userID`` using its most similar users.

        For the ``self.k`` users with the highest Pearson similarity, each
        movie they rated adds ``similarity * rating`` to that movie's score.

        Args:
            userID: ID of the user to recommend for. A user with no training
                ratings is allowed; all similarities are then 0.

        Returns:
            A list of ``(movieID, score)`` tuples sorted by descending score,
            without movies ``userID`` already rated in the training data.
        """
        # .get() avoids a KeyError for users that only appear in the test split.
        own_ratings = self.train.get(userID, {})

        neighborUser = dict()
        for user in self.train.keys():
            if userID != user:
                neighborUser[user] = self.pearson(own_ratings, self.train[user])
        newNU = sorted(neighborUser.items(), key=lambda k: k[1], reverse=True)

        movies = dict()
        for (sim_user, sim) in newNU[:self.k]:
            for movieID, rate in self.train[sim_user].items():
                # Recommending something the user already rated is useless.
                if movieID in own_ratings:
                    continue
                movies[movieID] = movies.get(movieID, 0) + sim * rate
        newMovies = sorted(movies.items(), key=lambda k: k[1], reverse=True)
        return newMovies

    def evaluate(self,num=100):
        """Estimate precision of the top ``n_items`` on a sample of test users.

        Args:
            num: maximum number of test users to sample.

        Returns:
            The mean over sampled users of ``hits / n_items``, where a hit is a
            recommended movie present in that user's test ratings. ``0.0`` is
            returned when there is nobody to evaluate.
        """
        print("评估准确率")
        # random.sample needs a sequence (dict views raise TypeError on
        # Python 3.11+); sorting also makes the seeded draw reproducible.
        candidates = sorted(self.test)
        # Private generator: evaluation no longer reseeds the global RNG.
        rng = random.Random(10)
        precisions = list()
        for userID in rng.sample(candidates, max(0, min(num, len(candidates)))):
            held_out = self.test[userID]
            result = self.recommend(userID)[:self.n_items]
            hit = sum(1 for item, _ in result if item in held_out)
            precisions.append(hit/self.n_items)
        if not precisions:
            return 0.0
        return sum(precisions) / len(precisions)


if __name__ == "__main__":
    import sys

    # The dataset directory may be given as the first command-line argument;
    # the location used by the original script remains the default.
    default_path = "C:\\Users\\Mr.Throne\\Desktop\\推荐系统\\archive\\data"
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_path
    seed = 30      # seed handed to the recommender
    k = 10         # number of nearest neighbours
    n_items = 5    # number of movies recommended per user
    f_rec = FisrtRec(file_path,seed,k,n_items)
    # For one user's ranked list instead of the evaluation, call
    # f_rec.recommend("968796").
    print("算法推荐准确率:{}".format(f_rec.evaluate()))

你可能感兴趣的:(推荐系统,推荐算法,算法,机器学习)