Item-based collaborative filtering in Python, tested on the MovieLens dataset

# The distance-based approach earlier relied mainly on user-based collaborative filtering,
# which has two major problems:
# 1. Scalability: the computation grows with the number of users, so with millions of
#    users it becomes a bottleneck.
# 2. Sparsity: an online store may carry millions of books while each user has rated only
#    a tiny fraction of them, so a nearest neighbor may simply not exist.
# Item-based filtering addresses both. It uses adjusted cosine similarity, which subtracts
# each user's mean rating before comparing items:
#   sim(i, j) = sum_u (R_ui - avg_u) * (R_uj - avg_u)
#               / ( sqrt(sum_u (R_ui - avg_u)^2) * sqrt(sum_u (R_uj - avg_u)^2) )
# where the sums run over users u who rated both item i and item j.

# Item-based recommendation turns user ratings into weights via normalization.

# The code below still has rough spots that need improvement.
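
# As a quick illustration of the adjusted-cosine formula above (a minimal sketch, not
# part of the original code; the toy ratings table below is made up):
import pandas as pd

toy = pd.DataFrame({
    'userId':  [1, 1, 2, 2, 3, 3],
    'movieId': [10, 20, 10, 20, 10, 20],
    'rating':  [5.0, 3.0, 4.0, 4.0, 2.0, 1.0],
})
user_mean = toy.groupby('userId')['rating'].transform('mean')
toy['centered'] = toy['rating'] - user_mean               # R_ui - avg_u
mat = toy.pivot(index='userId', columns='movieId', values='centered')
numerator = (mat[10] * mat[20]).sum()
denominator = (mat[10].pow(2).sum() * mat[20].pow(2).sum()) ** 0.5
print(numerator / denominator)                            # -1.0: perfectly anti-correlated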

import os
import pandas as pd
import numpy as np

# a raw string and os.path.join avoid broken escape sequences like '\r' in '\ratings.csv'
path = r'E:\data\ml-latest-small'
# 9724 movies, ids 1-170875
movies = pd.read_csv(os.path.join(path, 'movies.csv'))
# 100835 rating records
ratings = pd.read_csv(os.path.join(path, 'ratings.csv'))
# links = pd.read_csv(os.path.join(path, 'links.csv'))
# tags = pd.read_csv(os.path.join(path, 'tags.csv'))
data = pd.merge(ratings, movies, on='movieId')
'''
data.nunique()
userId         610
movieId       9724
rating          10
timestamp    85043
title         9719
genres         951
'''


# Building a full 9k x 9k table is slow -- is it even necessary? (question from 2018-12-26)
# For now, build the similarity matrix for the first 100 movies only.
# Step 1: compute item-item similarity.
def compute_similarity(movies=movies):
    movie_ids = movies['movieId'].values[:100]
    item_similarity = pd.DataFrame(index=movie_ids, columns=movie_ids)
    averages = data.groupby('userId')['rating'].mean()  # each user's mean rating
    for mid1 in item_similarity.index:
        movie1 = ratings.loc[ratings['movieId'] == mid1]
        print('============ start a new row ==============')
        for mid2 in item_similarity.columns:
            movie2 = ratings.loc[ratings['movieId'] == mid2]
            numerator = 0.0
            denom1 = 0.0
            denom2 = 0.0
            has_common_user = False
            for uid in movie1['userId'].values:
                if uid in movie2['userId'].values:
                    has_common_user = True
                    rank1 = movie1.loc[movie1['userId'] == uid, 'rating'].values[0]
                    rank2 = movie2.loc[movie2['userId'] == uid, 'rating'].values[0]
                    # keep the mean as a float; truncating it with int() distorts the centering
                    user_avg = averages[uid]
                    numerator += (rank1 - user_avg) * (rank2 - user_avg)
                    denom1 += (rank1 - user_avg) ** 2
                    denom2 += (rank2 - user_avg) ** 2

            # guard against division by zero when every co-rater gave exactly their mean rating
            if has_common_user and denom1 > 0 and denom2 > 0:
                item_similarity.at[mid1, mid2] = numerator / ((denom1 * denom2) ** 0.5)

    # pairs with no common rater stay NaN; predict() has to skip them
    item_similarity.to_csv('./itemsimilarity.csv', header=False, index=False)
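

# The triple loop above is very slow even for 100 movies. A vectorized alternative
# (a sketch under our own assumptions, not the original code; the helper name
# compute_similarity_fast is made up) builds a user-item matrix and computes all
# pairwise adjusted cosines at once. One simplification to note: the norms here run
# over all raters of each item rather than only co-raters.
def compute_similarity_fast(ratings, n_items=100):
    mat = ratings.pivot_table(index='userId', columns='movieId', values='rating')
    centered = mat.sub(mat.mean(axis=1), axis=0)       # R_ui - avg_u, NaN where unrated
    centered = centered.iloc[:, :n_items].fillna(0.0)  # unrated -> zero contribution
    dots = centered.T @ centered                       # all pairwise numerators at once
    norms = np.sqrt(np.diag(dots.values))
    sim = dots.values / np.outer(norms, norms)         # may divide by 0 for all-mean items
    return pd.DataFrame(sim, index=centered.columns, columns=centered.columns)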


# Normalize a rating from [minvalue, maxvalue] to [-1, 1].
def normalize(rating, maxvalue=5, minvalue=1):
    return (2 * (rating - minvalue) - (maxvalue - minvalue)) / (maxvalue - minvalue)


# Map a normalized rating in [-1, 1] back to [minvalue, maxvalue].
def denormalize(rating, maxvalue=5, minvalue=1):
    return 0.5 * (maxvalue - minvalue) * (rating + 1) + minvalue
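

# Quick sanity check (not in the original code) that the two mappings invert each other:
assert normalize(1) == -1.0 and normalize(3) == 0.0 and normalize(5) == 1.0
assert denormalize(normalize(4.5)) == 4.5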


# Predict user `uid`'s rating for movie `mid` as a similarity-weighted average of the
# user's normalized ratings:
#   p(u, i) = sum_j sim(i, j) * NR(u, j) / sum_j |sim(i, j)|
def predict(uid, mid):
    itemsimu = pd.read_csv('itemsimilarity.csv', header=None)
    itemsimu.index = movies['movieId'].values[:100]
    itemsimu.columns = movies['movieId'].values[:100]
    numerator = 0.0
    denominator = 0.0
    # index the user's ratings by movieId so they can be looked up by movie directly
    # (the old code indexed by DataFrame row position, which silently picked wrong rows)
    user_ratings = ratings.loc[ratings['userId'] == uid].set_index('movieId')['rating'].map(normalize)

    for i in user_ratings.index:
        # only movies with a precomputed similarity (the first 100, ids 1-112) can
        # contribute; NaN entries (no common rater) must be skipped or they poison the sum
        if i in itemsimu.index and not np.isnan(itemsimu.at[i, mid]):
            numerator += user_ratings[i] * itemsimu.at[i, mid]
            denominator += abs(itemsimu.at[i, mid])
    print(numerator)
    print(denominator)
    # note: `denominator != np.nan` is always True (NaN compares unequal to everything),
    # so the old check never fired; test for a positive sum instead
    if denominator > 0:
        return str(denormalize(numerator / denominator))
    else:
        return 'the 1-112 similarity matrix cannot make a prediction for this user'
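

# Worked example of the weighted sum above (illustrative numbers only): suppose the user
# rated two neighbors of `mid` with similarities 0.8 and 0.4, giving them 5 and 2 stars
# (normalized to 1.0 and -0.5). Then
#   p = (0.8 * 1.0 + 0.4 * (-0.5)) / (0.8 + 0.4) = 0.5
# and denormalize(0.5) yields a predicted rating of 4.0.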


if __name__ == '__main__':
    # compute_similarity()  # run once to build itemsimilarity.csv
    # itemsimilarity = pd.read_csv('DataMining/CollaborativeFiltering/itemsimilarity.csv')
    # NaN entries come from movie pairs with no common rater; predict() skips them
    print(predict(1, 63))
