Item-Based Collaborative Filtering Recommendation Algorithms

【参考文献】:Sarwar B M . Item-based collaborative filtering recommendation algorithms[C]// International Conference on World Wide Web. ACM, 2001.
背景:推荐领域必读文献之一,经典之作,本博客主要记录了该文章的主要思想和相关实现代码,欢迎观摩!

前提或假设

  1. 用户对项目的评分值,能够在某种程度上反映用户对该项目的偏好。
  2. 用户过去的偏好很可能反映其未来的兴趣偏好。

数据集

我们选用MovieLens 100K Dataset,=> 100,000 ratings from 1000 users on 1700 movies.
下载地址:movielens数据集

算法理论
算法框架:如图,输入是user-item的评分矩阵,该矩阵非常稀疏。算法的任务是预测特定用户对特定项目的评分,填补矩阵中空白单元格,接着根据预测评分从高到低为特定用户进行top-N推荐

算法预测:算法认为某用户喜欢某项目,在很大程度上也会对和该项目较相似的项目产生兴趣。所以预测分两步进行:计算项目之间的相似性和根据相似性进行预测评分。
文章提供了三个相似性计算公式:
Cosine-based Similarity
$$ sim(i,j)= cos(\vec{i},\vec{j})= \frac{\vec{i}\cdot \vec{j}}{\left \| \vec{i} \right \|_{2}*\left \| \vec{j} \right \|_{2}} $$
Correlation-based Similarity
$$ sim(i,j)= \frac{\sum _{u\in U}(R_{u,i}-\bar{R}_{i})(R_{u,j}-\bar{R}_{j})}{\sqrt{\sum _{u\in U}(R_{u,i}-\bar{R}_{i})^{2}}\sqrt{\sum _{u\in U}(R_{u,j}-\bar{R}_{j})^{2}}} $$
Adjusted Cosine Similarity
$$ sim(i,j)= \frac{\sum _{u\in U}(R_{u,i}-\bar{R}_{u})(R_{u,j}-\bar{R}_{u})}{\sqrt{\sum _{u\in U}(R_{u,i}-\bar{R}_{u})^{2}}\sqrt{\sum _{u\in U}(R_{u,j}-\bar{R}_{u})^{2}}} $$
但是所有的相似性计算都必须在共同评分项上进行,即只使用同时评价过项目 i 和项目 j 的用户的历史评分。

算法选取和该项目最相似的前N个项目作为预测基础,预测公式如下:
$$ P_{u,i}=\frac{\sum _{\text{all similar items},N}(S_{i,N}*R_{u,N})}{\sum _{\text{all similar items},N}(\left | S_{i,N} \right |)} $$
算法最后一步,根据预测评分值从高到低进行推荐

实验度量
文章采用MAE进行误差度量,公式如下:
$$ MAE = \frac{\sum_{i=1}^{N}\left | p_{i}-q_{i} \right |}{N} $$

Python 代码

# !usr/bin/python
# -*- coding=utf-8 -*-
import math
import operator
# Load the data
def _read_ratings(path):
    """Read a tab-separated ratings file into (userId, itemId, rating) tuples.

    The first three tab-separated fields of each line are used; ids are kept
    as strings and the rating is converted to float. Blank lines are skipped.
    """
    records = []
    # The context manager closes the file even if a line fails to parse.
    with open(path, 'r') as handle:
        for line in handle:
            stripped = line.strip()
            if not stripped:
                # A trailing empty line would otherwise raise IndexError.
                continue
            fields = stripped.split('\t')
            records.append((fields[0], fields[1], float(fields[2])))
    return records


def loadData():
    """Load the training and test rating files and index the training data.

    Reads './dataset/u1.base' (training set) and './dataset/u1.test'
    (test set), both relative to the current working directory.

    trainSet format (testSet uses the same layout):
    {
      userid: {
          itemid1: rating,
          itemid2: rating
      }
    }
    movieUser format (every user who rated a given movie):
    {
      itemid: {
          userid1: rating,
          userid2: rating
      }
    }

    Returns:
        A two-element list ``[trainSet, movieUser]``. The test set is parsed
        as well, but it is not part of the return value.
    """
    trainSet = {}
    testSet = {}
    movieUser = {}

    TrainFile = './dataset/u1.base'  # training set
    TestFile = './dataset/u1.test'  # test set

    # Read the training set. setdefault keeps the first rating seen for a
    # duplicated (user, item) pair.
    for userId, itemId, rating in _read_ratings(TrainFile):
        trainSet.setdefault(userId, {}).setdefault(itemId, rating)
        movieUser.setdefault(itemId, {}).setdefault(userId, rating)

    # Read the test set. It is parsed (so a missing or malformed file still
    # fails here) but deliberately not returned, to keep the return shape.
    for userId1, itemId1, rating1 in _read_ratings(TestFile):
        testSet.setdefault(userId1, {}).setdefault(itemId1, rating1)

    return [trainSet, movieUser]


# Build the map of users shared by a pair of movies
def i_j_users(i_id,j_id,movieUser):
    """Return the users who rated both item ``i_id`` and item ``j_id``.

    Result format:
    {
      (i_id, j_id): {userid1: None, userid2: None, ...}
    }
    An item that is missing from ``movieUser`` is treated as having no
    raters, so the inner dict is empty in that case.
    """
    raters_i = movieUser.get(i_id, {})
    raters_j = movieUser.get(j_id, {})

    # Keep the iteration order of item i's raters.
    shared = {}
    for uid in raters_i:
        if uid in raters_j:
            shared[uid] = None

    return {(i_id, j_id): shared}


# Compute one user's average rating
def getAverageRating(trainSet,userid):
    """Return the mean of every rating stored for ``userid`` in ``trainSet``.

    Raises KeyError if the user is absent and ZeroDivisionError if the user
    has no ratings.
    """
    ratings = trainSet[userid]
    total = sum(ratings.values()) * 1.0
    return total / len(ratings)

# Compute the similarity between two items
def getItemSim(i_j_users,i_id,j_id,trainSet):
    """Return the adjusted cosine similarity between items ``i_id`` and ``j_id``.

    Only users who rated both items are considered, and each rating is
    centred on that user's own average rating before the cosine is taken.

    Args:
        i_j_users: mapping ``{(i_id, j_id): {userid: None, ...}}`` holding
            the users who rated both items.
        i_id: id of the first item.
        j_id: id of the second item.
        trainSet: mapping ``{userid: {itemid: rating}}``.

    Returns:
        0.0 when the two items share no rater (there is no evidence of
        similarity), 1 when either centred rating vector has zero norm,
        otherwise the adjusted cosine value.
    """
    ij_users = i_j_users[(i_id,j_id)]
    if not ij_users:
        # Return immediately: falling through to the zero-denominator check
        # below would score items with no common raters as similarity 1 and
        # rank them as the best neighbours.
        return 0.0

    sumtop = 0   # numerator
    sumbot1 = 0  # squared norm of item i's centred ratings
    sumbot2 = 0  # squared norm of item j's centred ratings
    for user in ij_users:
        avr_user = getAverageRating(trainSet,user)
        left = trainSet[user][i_id] - avr_user
        right = trainSet[user][j_id] - avr_user
        sumtop += left*right
        sumbot1 += left*left
        sumbot2 += right*right

    if sumbot1 == 0 or sumbot2 == 0:
        # NOTE(review): the cosine is undefined here; 1 is the value the
        # original code chose for this case and is kept unchanged.
        return 1
    return sumtop*1.0 / (math.sqrt(sumbot1)*math.sqrt(sumbot2))

# Compute the similarity between item i and every other item, then rank them
# Returned mapping format:
# {
#     j_id1: s1,
#     j_id2: s2
# }
def i_allitem_sort(i_id,movieUser,trainSet,N):
    """Return the N items most similar to ``i_id`` as ``{item_id: similarity}``.

    Every key of ``movieUser`` other than ``i_id`` is scored with
    ``getItemSim``; the scores are sorted in descending order and the first
    N entries are kept, in that order.
    """
    scores = {}
    for other in movieUser:
        if other == i_id:
            continue
        pair_users = i_j_users(i_id, other, movieUser)
        scores[other] = getItemSim(pair_users, i_id, other, trainSet)

    ranked = sorted(scores.items(), key=lambda entry: entry[1], reverse=True)
    return dict(ranked[0:N])

# Predict a rating
def prediction(userid,itemid,moviUser,trainSet,N):
    """Predict the rating ``userid`` would give ``itemid``.

    The prediction is the similarity-weighted average of the user's ratings
    over the N items most similar to ``itemid``:

        sum(sim * rating) / sum(|sim|)

    Neighbours the user has not rated contribute to neither sum.

    Args:
        userid: id of the target user.
        itemid: id of the target item.
        moviUser: mapping ``{itemid: {userid: rating}}``.
        trainSet: mapping ``{userid: {itemid: rating}}``.
        N: number of most-similar items to use as neighbours.

    Returns:
        The predicted rating, or 0 when it cannot be computed (the user is
        not in ``trainSet``, rated none of the neighbours, or the
        similarity weights sum to zero).
    """
    # Use the argument that was passed in; reading a module-level
    # `movieUser` instead raises NameError when this is called from
    # anywhere but the script itself.
    nsets = i_allitem_sort(itemid,moviUser,trainSet,N)

    # A user with no training ratings has no rated neighbours.
    user_ratings = trainSet.get(userid, {})

    sumtop = 0
    sumbot = 0
    for j, sim in nsets.items():
        if j not in user_ratings:
            continue
        sumtop += sim * user_ratings[j]
        sumbot += abs(sim)

    # Guard against a zero denominator.
    if sumbot == 0:
        return 0
    return sumtop * 1.0 / sumbot

def saveFile(moviUser,trainSet,N):
    """Predict every test rating and write the results to the predict file.

    Reads "../Collaborative Filtering/dataset/u1.test" and writes one line
    per test rating to "../Collaborative Filtering/predict" in the form
    ``uid<TAB>item<TAB>rating<TAB>predicted``.

    Args:
        moviUser: mapping ``{itemid: {userid: rating}}``.
        trainSet: mapping ``{userid: {itemid: rating}}``.
        N: neighbourhood size passed on to ``prediction``.
    """
    # Both files are opened up front, so an unwritable output path fails
    # before any prediction work; the context manager closes both handles
    # even if a prediction raises.
    with open("../Collaborative Filtering/dataset/u1.test") as f, \
            open("../Collaborative Filtering/predict",'w') as fw:
        for line in f:
            if not line.strip():
                # Skip blank lines instead of failing on the missing fields.
                continue
            arr = line.split('\t')
            uid = arr[0].strip()
            item = arr[1].strip()
            rating = float(arr[2].strip())
            predictScore = prediction(uid,item,moviUser,trainSet,N)
            # Write each row as it is produced rather than growing one
            # large string by repeated concatenation.
            fw.write(uid + "\t" + item + "\t" + str(rating) + "\t" + str(predictScore) + "\n")
    
# Compute the accuracy of the predictions
def getMAE():
    """Print and return the mean absolute error of the predict file.

    Reads "../Collaborative Filtering/predict", whose lines have the form
    ``uid<TAB>item<TAB>rating<TAB>predicted``. Rows whose predicted score is
    0 are excluded from both the error sum and the row count.

    Returns:
        The MAE over the counted rows, or NaN when no row has a non-zero
        prediction (including an empty file).
    """
    total_error = 0.0
    counttest = 0  # number of test rows that carry a prediction
    with open("../Collaborative Filtering/predict") as f:
        for line in f:
            if not line.strip():
                continue
            arr = line.split('\t')
            rating = float(arr[2].strip())
            predictScore = float(arr[3].strip())
            if predictScore == 0:
                continue
            total_error += abs(predictScore - rating)
            counttest += 1

    # Avoid ZeroDivisionError when nothing could be scored.
    mae = total_error / counttest if counttest else float('nan')
    print(mae)
    return mae


if __name__ == '__main__':

    # Number of most-similar items used as neighbours for each prediction.
    N = 30
    trainSet, movieUser = loadData()
    saveFile(movieUser, trainSet, N)
    # getMAE()

你可能感兴趣的:(python,机器学习,协同过滤,推荐算法)