推荐系统----第二章(基于邻域的算法-基于用户的协同过滤)

基于用户的协同过滤算法

推荐系统----第二章(基于领域的算法-基于用户的协同过滤)_第1张图片

推荐系统----第二章(基于领域的算法-基于用户的协同过滤)_第2张图片

推荐系统----第二章(基于领域的算法-基于用户的协同过滤)_第3张图片

但是在计算相似度的过程中,算法的时间复杂度往往很高,当遇到很大的数据量时会非常耗时。事实上,很多用户相互之间并没有对同样的物品产生过行为。为此,可以构建一个物品到用户的倒排表,对于每个物品都保存对该物品产生过行为的用户列表。可以用以下代码实现:

# Build item co-occurrence counts and compute item-to-item similarity.
def itemCF(user_dict):
    """Compute cosine-style item-to-item similarity from user records.

    Args:
        user_dict: mapping of user -> iterable of records, where the first
            element of each record (``record[0]``) is the item (movie) id.

    Returns:
        A nested mapping ``W`` where ``W[i][j]`` is
        ``C[i][j] / sqrt(N[i] * N[j])``; ``N[i]`` is the number of users
        with a record for item ``i`` and ``C[i][j]`` the number of users
        with records for both ``i`` and ``j``.  Items that never co-occur
        with another item have no entry in ``W``.
    """
    # Imported locally because the surrounding file does not import these
    # names at module level.
    from collections import defaultdict
    import math

    N = defaultdict(int)                       # item -> number of users
    C = defaultdict(lambda: defaultdict(int))  # item -> item -> co-occurrences

    for records in user_dict.values():
        # Deduplicate item ids per user so a user with several records for
        # the same item is counted once and an item never pairs with itself.
        item_ids = set(record[0] for record in records)
        for a in item_ids:
            N[a] += 1
            for b in item_ids:
                if a != b:
                    C[a][b] += 1

    W = defaultdict(defaultdict)
    for i, related_items in C.items():
        for j, cij in related_items.items():
            W[i][j] = cij / math.sqrt(N[i] * N[j])
    return W

完整的计算用户相似度的算法可参考以下代码:

# coding=utf-8

from math import sqrt
import pandas as pd
import tqdm

# Each input line is comma-separated; per the original author's note the
# columns are [user, source, item, lable].  Column 0 is used as the user key,
# column 2 as the item key and column 1 as the numeric rating.
users = {}            # user -> {item: rating + 100}
users_list = list()   # users in order of first appearance
path = 'C:...'
# `with` guarantees the file handle is closed even if parsing fails.
with open(path+"...") as data_file:
    for line in data_file:
        if not line.strip():
            # Skip blank lines (e.g. a trailing newline) instead of crashing
            # on the missing columns.
            continue
        fields = line.strip().split(",")
        user = fields[0]
        # `users` and `users_list` are always extended together, so one O(1)
        # dict lookup replaces the O(n) scan of `users_list`.
        if user not in users:
            users[user] = {}
            users_list.append(user)
        users[user][fields[2]] = float(fields[1]) + 100

class recommender:
    """User-based collaborative-filtering recommender.

    Args:
        data: dataset as a dict ``{user: {item: rating}}``; any other type
            leaves the recommender with an empty dataset.
        k: number of nearest neighbours used to build recommendations.
        metric: name of the similarity measure.  Pearson correlation is the
            only one implemented, so it is used whatever value is given.
        n: maximum number of recommendations returned by ``recommend``.
    """

    def __init__(self, data, k=3, metric='pearson', n=12):
        self.k = k
        self.n = n
        self.username2id = {}
        self.userid2name = {}
        self.productid2name = {}

        self.metric = metric
        # Always bind a similarity function so later calls cannot fail with
        # AttributeError; Pearson is the only metric available.
        self.fn = self.pearson
        # Always define `data`; dict subclasses are accepted too.
        self.data = data if isinstance(data, dict) else {}

    def convertProductID2name(self, id):
        """Return the display name for product `id`, or `id` if unknown."""
        return self.productid2name.get(id, id)

    def pearson(self, rating1, rating2):
        """Return the Pearson correlation of two ``{item: rating}`` dicts.

        Only items present in both dicts are used.  Returns 0 when there
        are no common items or when either side has zero variance.
        """
        # Float accumulators keep the divisions below true divisions even
        # for integer ratings on Python 2.
        sum_xy = sum_x = sum_y = sum_x2 = sum_y2 = 0.0
        n = 0
        for key in rating1:
            if key in rating2:
                n += 1
                x = rating1[key]
                y = rating2[key]
                sum_xy += x * y
                sum_x += x
                sum_y += y
                sum_x2 += x * x
                sum_y2 += y * y
        if n == 0:
            return 0

        # Rounding can make these differences slightly negative (e.g. for
        # constant ratings); clamp so sqrt() never raises a domain error.
        spread_x = max(sum_x2 - sum_x * sum_x / n, 0.0)
        spread_y = max(sum_y2 - sum_y * sum_y / n, 0.0)
        denominator = sqrt(spread_x) * sqrt(spread_y)
        if denominator == 0:
            return 0
        return (sum_xy - (sum_x * sum_y) / n) / denominator

    def computeNearestNeighbor(self, username):
        """Return ``[(other_user, similarity), ...]`` sorted most-similar first.

        Raises:
            KeyError: if `username` is not in the dataset.
        """
        target = self.data[username]
        distances = [(other, self.fn(target, ratings))
                     for other, ratings in self.data.items()
                     if other != username]
        distances.sort(key=lambda pair: pair[1], reverse=True)
        return distances

    def recommend(self, user):
        """Recommend items `user` has not rated, based on the k nearest users.

        Returns:
            A tuple ``(recommendations, nearest)``: `recommendations` is a
            list of at most ``self.n`` ``(item, score)`` pairs sorted by
            descending score, and `nearest` is the full neighbour list
            from ``computeNearestNeighbor``.

        Raises:
            KeyError: if `user` is not in the dataset.
        """
        recommendations = {}
        nearest = self.computeNearestNeighbor(user)
        userRatings = self.data[user]

        # The dataset may hold fewer than k other users; slicing avoids the
        # IndexError that indexing 0..k-1 would raise.
        neighbours = nearest[:self.k]
        totalDistance = sum((similarity for _name, similarity in neighbours), 0.0)
        if totalDistance == 0.0:
            totalDistance = 1.0

        for name, similarity in neighbours:
            # Each neighbour's share of the total similarity.
            weight = similarity / totalDistance
            for artist, rating in self.data[name].items():
                if artist in userRatings:
                    continue
                if artist not in recommendations:
                    recommendations[artist] = rating * weight
                else:
                    recommendations[artist] = recommendations[artist] + rating * weight

        ranked = [(self.convertProductID2name(item_id), score)
                  for item_id, score in recommendations.items()]
        ranked.sort(key=lambda pair: pair[1], reverse=True)

        # Honour the configured `n` (default 12) instead of a literal 12.
        return ranked[:self.n], nearest


def adjustrecommend(users_list):
    """Build a table of recommendations and nearest users for each user.

    Args:
        users_list: sequence of user keys present in the module-level
            `users` dataset.

    Returns:
        A pandas DataFrame with one row per user and the columns
        ``top1``..``top12`` (recommended item ids), ``nearuser1``..
        ``nearuser10`` (most similar users) and ``box_id`` (the user).
        Missing entries are filled with None.
    """
    top_count = 12
    near_count = 10
    top_columns = ['top%d' % i for i in range(1, top_count + 1)]
    near_columns = ['nearuser%d' % i for i in range(1, near_count + 1)]

    print(users_list)
    r = recommender(users)

    # Rows are collected and the frame built once: DataFrame.append copies
    # the whole frame on every call and no longer exists in pandas 2.x.
    rows = []
    for user in users_list:
        k, nearuser = r.recommend(user)

        bookid_list = [item for item, _score in k[:top_count]]
        print(bookid_list)
        bookid_list += [None] * (top_count - len(bookid_list))

        # Extract the names BEFORE padding; padding the (name, similarity)
        # tuples with None and then indexing [0] raises TypeError whenever
        # there are fewer than 10 neighbours.
        near_names = [name for name, _similarity in nearuser[:near_count]]
        near_names += [None] * (near_count - len(near_names))

        row = {'box_id': user}
        row.update(zip(top_columns, bookid_list))
        row.update(zip(near_columns, near_names))
        rows.append(row)

    return pd.DataFrame(rows, columns=top_columns + near_columns + ['box_id'])
    # return bookid_list, nearuser[:15]  # bookid_list推荐书籍的id,nearuser[:15]最近邻的15个用户

def run_fun():
    """Compute recommendations for every loaded user and save them as CSV.

    Prints the first 10 rows of the result before writing the file.
    """
    userCF_list = adjustrecommend(users_list)
    print(userCF_list.head(10))
    # The output path literal was unterminated (mismatched quotes), which is
    # a SyntaxError; it is closed with a matching single quote here.
    # NOTE(review): mode='wb+' is kept from the original; confirm it suits
    # the pandas / Python version in use.
    userCF_list.to_csv('C:...',
                       index=False, mode='wb+')

if __name__ == '__main__':
    import sys
    # reload()/setdefaultencoding() exist only on Python 2; guard them so
    # the script does not fail with NameError on Python 3.
    if sys.version_info[0] < 3:
        reload(sys)
        sys.setdefaultencoding('utf8')
    run_fun()

你可能感兴趣的:(推荐系统)