2021CCF BDCI图书推荐系统竞赛baseline

2021CCF BDCI图书推荐系统竞赛baseline——itemCF

    • 1. 导包
    • 2. 读取数据
    • 3. 切分数据集
    • 4. 计算item相似度
    • 5. 生成推荐list
    • 6. 生成提交文件

比赛地址

这是用最基本的基于物品的协同过滤(ItemCF)算法实现的图书推荐 baseline。

1. 导包

import random
import numpy as np
import pandas as pd

import math
from operator import itemgetter
import logging 
# Configure the root logger: INFO level, each record prefixed with a timestamp
# and the level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s: %(message)s')

2. 读取数据

# Root directory of the competition files; the CSVs are read from its
# `dataset/` sub-directory.
path = '/Users/Desktop/比赛/图书推荐系统'
train = pd.read_csv(path + '/dataset/train_dataset.csv')
test = pd.read_csv(path + '/dataset/test_dataset.csv')
sub = pd.read_csv(path + '/dataset/submission.csv')

logging.info("打印完毕")

# Work on a copy of the training frame in which every row carries a constant
# rating of 1; `train` itself is left untouched.
data = train.assign(rating=1)
data.head(5)

# Exploratory only (the result is displayed in a notebook and then discarded):
# the dense user x item table is overwhelmingly empty, which is why the rest of
# the script stores interactions in nested dictionaries instead.
data.pivot(index='user_id', columns='item_id', values='rating')

2021CCF BDCI图书推荐系统竞赛baseline_第1张图片

3. 切分数据集

trainSet, testSet = {}, {}
trainSet_len, testSet_len = 0, 0
pivot = 0.75    # probability that a single interaction goes to the training split

# Both splits are nested dictionaries shaped {user: {item_id: rating}}.
# Each row is routed independently with one random draw, so the split is
# per-interaction (not per-user) and differs between runs (no seed is set here).
for row in data.itertuples():
    user, item, rating = row.user_id, row.item_id, row.rating
    if random.random() >= pivot:
        testSet.setdefault(user, {})[item] = rating
        testSet_len += 1
    else:
        trainSet.setdefault(user, {})[item] = rating
        trainSet_len += 1

4. 计算item相似度

# Popularity of an item = number of training users who interacted with it.
# It is the per-item norm used in the denominator of the similarity below.
item_popular = {}
for user, items in trainSet.items():   # items: {item_id: rating}
    for item in items:
        item_popular[item] = item_popular.get(item, 0) + 1

item_count = len(item_popular)
# Fixed: the original printed the undefined name `movie_count` (NameError).
print('Total movie number = %d' % item_count)

# Build the item-item co-occurrence counts:
# item_sim_matrix[m1][m2] = number of users who interacted with both m1 and m2.
# This is the numerator of the cosine similarity.
print('Build user co-rated items matrix ...')
item_sim_matrix = {}
for user, items in trainSet.items():
    for m1 in items:           # every ordered pair of distinct items of this user
        for m2 in items:
            if m1 == m2:
                continue
            neighbours = item_sim_matrix.setdefault(m1, {})
            neighbours[m2] = neighbours.get(m2, 0) + 1

# Turn the raw co-occurrence counts into similarities in place:
# sim(m1, m2) = co_count / sqrt(popularity(m1) * popularity(m2)).
# Every item that appears in the matrix was counted at least once in
# item_popular above, so the denominator is never zero and no guard is needed.
for m1, related_items in item_sim_matrix.items():
    for m2, count in related_items.items():
        related_items[m2] = count / math.sqrt(item_popular[m1] * item_popular[m2])

5. 生成推荐list

user_lst = test['user_id'].tolist()

# For every item a user has interacted with, look at its k most similar items
# and finally recommend the n best-scoring candidates to that user.
k = 208
n = 10
result = []

# The k nearest neighbours of an item do not depend on the user, so sort each
# item's similarity row once and reuse it instead of re-sorting per user.
top_k_cache = {}

for user in user_lst:
    rank = {}
    # Items the target user interacted with in the training split.
    # Fixed: a user whose rows all fell into the random test split is absent
    # from trainSet; the original `trainSet[user]` raised KeyError. Such a user
    # simply gets no recommendations here.
    watched_items = trainSet.get(user, {})

    # Fixed: the original iterated over the undefined name `watched_movies`.
    for item, rating in watched_items.items():
        if item not in top_k_cache:
            # Fixed: an item that never co-occurred with another item has no row
            # in item_sim_matrix; treat it as having no neighbours.
            top_k_cache[item] = sorted(
                item_sim_matrix.get(item, {}).items(),
                key=itemgetter(1),
                reverse=True,
            )[:k]

        for related_item, w in top_k_cache[item]:
            # Never recommend something the user has already interacted with.
            if related_item in watched_items:
                continue

            # Preference score = sum over the user's items of
            # (similarity to the candidate) * (user's rating of that item).
            rank[related_item] = rank.get(related_item, 0) + w * float(rating)

    # Keep the n highest-scoring (item_id, score) pairs for this user.
    rec_items = sorted(rank.items(), key=itemgetter(1), reverse=True)[:n]
    result.extend(rec_items)

6. 生成提交文件

# `result` holds (item_id, score) pairs; only the item ids go into the file.
r = [rec[0] for rec in result]

# Column assignment requires len(r) == len(sub).
sub['item_id'] = r
sub
# NOTE(review): the DataFrame index is written as an extra first column --
# confirm that the submission format expects it.
sub.to_csv(path + '/result/ItemCF.csv')

线上得分:0.02109538784

你可能感兴趣的:(数据科学比赛,#,推荐项目,python,机器学习,深度学习,人工智能)