【推荐系统】UserCF与ItemCF

文章目录

  • 1.推荐系统
  • 2.UserCF
    • 2.1模型介绍
    • 2.2优缺点
    • 2.3实现代码
  • 3.ItemCF
    • 3.1模型介绍
    • 3.2优缺点
    • 3.3实现代码

1.推荐系统

  在信息严重过载的网络世界中,推荐系统承担着不可或缺的任务。它为不同的用户筛选出其最可能感兴趣的信息,极大地方便了用户寻找到想要的信息,达到一种个性化推荐的效果。

2.UserCF

2.1模型介绍

  UserCF的核心思想是:给用户推荐和他兴趣相似的其他用户喜欢的物品。
  为了根据用户喜好进行推荐,我们首先要找到兴趣爱好和目标用户兴趣相似的用户集合。UserCF就是利用用户行为的相似计算兴趣的相似度。比如说,用户A和用户B都购买了X,Y,Z三样物品。则两个人的兴趣爱好就特别相似。于是我们需要定义相似度计算方法,常见的有皮尔逊相关系数(Pearson correlation coefficient,PCC),余弦相似度。
  我们对每个用户计算与目标用户u的相似度,对这些相似度进行排序,得到与用户u兴趣相似的top-k个用户集合。接下来,我们给目标用户u推荐这top-k个用户喜欢的物品,并且是用户u没有见过的物品。我们需要衡量用户u对某物品的感兴趣程度,给他推荐他最可能感兴趣的物品。

2.2优缺点

  1. UserCF推荐结果着重于和用户兴趣相似的小群体的热点,推荐更加社会化,反映了用户所在兴趣群体中物品的热门程度
  2. UserCF需要计算用户-用户的相似性矩阵,方便我们选出top-K的最相似的用户,并进行推荐,适用于用户个数远少于物品个数的场景。若用户量大,则相似性矩阵的计算会耗费大量内存和时间。
  3. UserCF很难对推荐结果做出解释,解释性差。
  4. UserCF对物品冷启动不敏感。对于新加入的物品,一旦有用户对其产生行为,UserCF就可以将其推荐给相似的人群。
  5. UserCF对用户冷启动非常敏感,因为对于新加入的用户,对物品的交互也较少,很少被划分在某个用户的相似人群中,也就无法对其进行推荐。

2.3实现代码

import pandas as pd
import numpy as np

# 构建共同的评分向量
# Build the pair of rating vectors restricted to the users' co-rated items.
def build_xy(user_id1, user_id2):
    """Return the two users' rating Series over the items both have rated."""
    rated_by_both = df.loc[user_id1].notnull() & df.loc[user_id2].notnull()
    vec1 = df.loc[user_id1, rated_by_both]
    vec2 = df.loc[user_id2, rated_by_both]
    return vec1, vec2

# 欧几里德距离
# Euclidean distance between two users over their co-rated items.
def euclidean(user_id1, user_id2):
    """Return the Euclidean distance between the two users' co-rated
    rating vectors (0.0 when they share no rated items)."""
    x, y = build_xy(user_id1, user_id2)
    # No division occurs here, so the original try/except ZeroDivisionError
    # was dead code; sum() over an empty overlap is 0, giving distance 0.0.
    return sum((x - y) ** 2) ** 0.5

# 余弦相似度
# Cosine similarity between two users over their co-rated items.
def cosine(user_id1, user_id2):
    """Return the cosine similarity of the two users' co-rated rating
    vectors, or 0 when either vector has zero norm (e.g. no overlap)."""
    x, y = build_xy(user_id1, user_id2)
    denominator = (sum(x * x) * sum(y * y)) ** 0.5
    # Dividing by a numpy/pandas zero yields nan/inf with a warning, it does
    # NOT raise ZeroDivisionError, so the original except branch never fired
    # and nan could leak out.  Test the denominator explicitly instead.
    if denominator == 0:
        return 0
    return sum(x * y) / denominator

# 皮尔逊相关系数
# Pearson correlation coefficient (PCC) between two users over co-rated items.
def pearson(user_id1, user_id2):
    """Return the PCC of the two users' co-rated rating vectors, or 0 when
    either side has zero variance (or there is no overlap at all)."""
    x, y = build_xy(user_id1, user_id2)
    mean1, mean2 = x.mean(), y.mean()
    denominator = (sum((x - mean1) ** 2) * sum((y - mean2) ** 2)) ** 0.5
    # numpy division by zero yields nan rather than raising
    # ZeroDivisionError, so the original except branch was unreachable;
    # guard the zero-variance / empty-overlap case explicitly.
    if denominator == 0:
        return 0
    return sum((x - mean1) * (y - mean2)) / denominator

# Dispatch table: metric name -> similarity/distance function.
# NOTE(review): computeNearestNeighbor also accepts 'manhattan', which has no
# entry here and would raise KeyError -- confirm whether it was dropped.
metric_funcs = {
    'euclidean': euclidean,
    'pearson': pearson,
    'cosine': cosine
}


# 计算最近的邻居
# Find the k nearest neighbours of user_id under the given metric.
def computeNearestNeighbor(user_id, metric='pearson', k=50):
    """
    metric: metric name -- one of 'euclidean', 'pearson', 'cosine'
    k:      number of neighbours to return
    Returns a pd.Series indexed by neighbour id, valued by distance/similarity.
    Raises ValueError for an unsupported metric (the original silently
    returned None, and 'manhattan' crashed with a KeyError because it has
    no entry in metric_funcs).
    """
    if metric not in metric_funcs:
        raise ValueError('unsupported metric: %s' % metric)
    scores = df.drop(user_id).index.to_series().apply(
        metric_funcs[metric], args=(user_id,))
    # Distances: smaller is closer; similarities: larger is closer.
    if metric == 'euclidean':
        return scores.nsmallest(k)
    return scores.nlargest(k)

# 向给定用户推荐(返回:pd.Series)
# Predict ratings for user_id's test-set items and write them into df in place.
def recommend(user_id, K):
    """
    Fill df with predicted ratings for every item user_id has in the test
    pivot (final_test_df), using the K most similar users (Pearson).
    Prediction = user mean + similarity-weighted mean-centred ratings of the
    neighbours who rated the item; falls back to the user's mean rating when
    no neighbour rated it or the similarity weights cancel out.
    """
    nearest_user_id = computeNearestNeighbor(user_id, metric='pearson', k=K).index
    print('最近邻用户id:', nearest_user_id)
    try:
        # Items that need a predicted score for this user.
        unrated_items_id = final_test_df.loc[user_id][final_test_df.loc[user_id].notnull()].index
    except Exception:
        # User absent from the test pivot: nothing to predict.
        unrated_items_id = []
    for j in unrated_items_id:
        if j not in df.columns.values:
            # Item never seen in the training data; cannot predict it.
            continue
        numerator = 0.
        denominator = 0.
        for near_id in nearest_user_id:
            rating = df.loc[near_id, j]
            if not np.isnan(rating):
                s_wu = pearson(near_id, user_id)
                numerator += s_wu * (rating - average_rating_by_user[near_id])
                denominator += s_wu
        if denominator == 0:
            # Covers both "no neighbour rated j" and "similarity weights sum
            # to zero" -- the original divided by zero in the latter case and
            # wrote inf/nan into df.
            df.at[user_id, j] = average_rating_by_user[user_id]
        else:
            df.at[user_id, j] = average_rating_by_user[user_id] + numerator / denominator


# 从top-k中预测user_id用户对应的第j个商品的评分
# Predict user_id's rating for item j from the K most similar users.
def predict(user_id, j, K):
    """
    Return the predicted rating of user user_id for item j: the user's mean
    rating plus the similarity-weighted mean-centred ratings of the K
    nearest Pearson neighbours who rated j.  Falls back to the user's mean
    when j is unknown, no neighbour rated it, or the weights cancel out.
    """
    if j not in df.columns:
        # Item absent from training data -- the original raised KeyError here.
        return average_rating_by_user[user_id]
    nearest_user_id = computeNearestNeighbor(user_id, metric='pearson', k=K).index
    print('最近邻用户id:', nearest_user_id)
    numerator = 0.
    denominator = 0.
    for near_id in nearest_user_id:
        rating = df.loc[near_id, j]
        if not np.isnan(rating):
            s_wu = pearson(near_id, user_id)
            numerator += s_wu * (rating - average_rating_by_user[near_id])
            denominator += s_wu
    if denominator == 0:
        # No neighbour rated j, or the similarity weights summed to zero
        # (the original divided by zero here and returned inf/nan).
        return average_rating_by_user[user_id]
    return average_rating_by_user[user_id] + numerator / denominator



if __name__ == '__main__':
    data_fields = ['user_id', 'item_id', 'rating', 'timestamp']

    # names=... because the MovieLens files carry no header row; the default
    # sep='\t' matches their tab-separated format.
    train_df = pd.read_table('../datasets/ml-100k/u1.base', names=data_fields)
    test_df = pd.read_table('../datasets/ml-100k/u1.test', names=data_fields)

    # These names are read as module-level globals by build_xy, recommend and
    # predict.  (The original `global` statements here were no-ops: `global`
    # has no effect at module scope.)
    df = train_df.pivot(index='user_id', columns='item_id', values='rating')
    final_test_df = test_df.pivot(index='user_id', columns='item_id', values='rating')

    # Overall mean rating (kept for reference/debug printing).
    global_average_rating = df.sum().sum() / df.count().sum()
    # Per-user mean rating (row-wise) -- the bias term of the predictor.
    average_rating_by_user = df.sum(axis=1) / df.count(axis=1)
    # Per-item mean rating (column-wise).
    average_rating_by_item = df.sum() / df.count()

    # Pre-fill df with predictions for every user's test items.
    for i in df.index:
        recommend(i, 50)

    test_df['predict'] = ""
    for i in range(test_df.shape[0]):
        user_id = test_df.iloc[i]['user_id']
        item_id = test_df.iloc[i]['item_id']
        # The original first computed predict(..., 10) here and then
        # unconditionally overwrote the result below -- pure wasted work.
        if item_id not in df.columns:
            # Item unseen in training: fall back to the user's mean rating
            # (the original raised KeyError on this lookup).
            test_df.at[i, 'predict'] = average_rating_by_user[user_id]
        elif not np.isnan(df.loc[user_id, item_id]):
            # recommend() may already have filled this cell; reuse it.
            test_df.at[i, 'predict'] = df.loc[user_id, item_id]
        else:
            test_df.at[i, 'predict'] = predict(user_id, item_id, 50)

    err = (test_df['rating'] - test_df['predict']).abs()
    MAE = err.sum() / test_df.shape[0]
    # abs() before squaring was redundant in the original RMSE.
    RMSE = np.sqrt((err ** 2).sum() / test_df.shape[0])
    print(MAE, RMSE)

3.ItemCF

3.1模型介绍

  ItemCF的核心思想是给用户推荐和其过去感兴趣的物品相似的物品
  为了得到物品之间的联系,我们需要计算物品间的相似度矩阵。假如用户A同时购买了物品X,Y,那么X和Y可能存在一定程度上的相似,基于这样的假设,我们通过相似性公式计算得到X和Y的相似性。
  构建完物品-物品相似性矩阵后,我们针对目标用户u进行推荐的方式就是,根据其评分过的电影,找到K部相似的电影,并推荐其N部没看过的,最可能感兴趣的电影。

3.2优缺点

  1. ItemCF着重于维护用户的历史兴趣,推荐更加个性化,可以用于长尾物品丰富的领域。
  2. ItemCF可以利用用户的历史行为给推荐结果提供推荐解释,更加合理。
  3. ItemCF需要计算物品-物品相似性矩阵,适用于物品个数远小于用户个数的场景。若物品量大,则相似性矩阵的计算会耗费大量内存和时间。
  4. ItemCF无法很好地应对物品冷启动,因为新加入的物品很少有用户与其产生行为,在相似性矩阵中得分很小。
  5. ItemCF对用户冷启动不敏感,只要新用户对某物品产生行为,ItemCF马上可以根据这个物品进行相似物品的推荐。

3.3实现代码

import pandas as pd
import numpy as np

# 构建共同的评分向量
# Build the pair of rating vectors restricted to users who rated both items.
def build_xy(item_id1, item_id2):
    """Return the two items' rating Series over the users who rated both."""
    rated_both = df.loc[item_id1].notnull() & df.loc[item_id2].notnull()
    return df.loc[item_id1, rated_both], df.loc[item_id2, rated_both]

# 欧几里德距离
# Euclidean distance between two items over users who rated both.
def euclidean(item_id1, item_id2):
    """Return the Euclidean distance between the two items' co-rated
    rating vectors (0.0 when no user rated both)."""
    x, y = build_xy(item_id1, item_id2)
    # No division occurs here, so the original try/except ZeroDivisionError
    # was dead code; sum() over an empty overlap is 0, giving distance 0.0.
    return sum((x - y) ** 2) ** 0.5

# 余弦相似度
# Cosine similarity between two items over users who rated both.
def cosine(item_id1, item_id2):
    """Return the cosine similarity of the two items' co-rated rating
    vectors, or 0 when either vector has zero norm (e.g. no overlap)."""
    x, y = build_xy(item_id1, item_id2)
    denominator = (sum(x * x) * sum(y * y)) ** 0.5
    # Dividing by a numpy/pandas zero yields nan/inf with a warning, it does
    # NOT raise ZeroDivisionError, so the original except branch never fired
    # and nan could leak out.  Test the denominator explicitly instead.
    if denominator == 0:
        return 0
    return sum(x * y) / denominator

# 皮尔逊相关系数
# Pearson correlation coefficient (PCC) between two items over shared raters.
def pearson(item_id1, item_id2):
    """Return the PCC of the two items' co-rated rating vectors, or 0 when
    either side has zero variance (or there is no overlap at all)."""
    x, y = build_xy(item_id1, item_id2)
    mean1, mean2 = x.mean(), y.mean()
    denominator = (sum((x - mean1) ** 2) * sum((y - mean2) ** 2)) ** 0.5
    # numpy division by zero yields nan rather than raising
    # ZeroDivisionError, so the original except branch was unreachable;
    # guard the zero-variance / empty-overlap case explicitly.
    if denominator == 0:
        return 0
    return sum((x - mean1) * (y - mean2)) / denominator

# Dispatch table: metric name -> similarity/distance function.
# NOTE(review): computeNearestNeighbor also accepts 'manhattan', which has no
# entry here and would raise KeyError -- confirm whether it was dropped.
metric_funcs = {
    'euclidean': euclidean,
    'pearson': pearson,
    'cosine': cosine
}


# 计算相似的物品
# Find the k items most similar to item_id under the given metric.
def computeNearestNeighbor(item_id, metric='pearson', k=50):
    """
    metric: metric name -- one of 'euclidean', 'pearson', 'cosine'
    k:      number of neighbours to return
    Returns a pd.Series indexed by neighbour id, valued by distance/similarity.
    Raises ValueError for an unsupported metric (the original silently
    returned None, and 'manhattan' crashed with a KeyError because it has
    no entry in metric_funcs).
    """
    if metric not in metric_funcs:
        raise ValueError('unsupported metric: %s' % metric)
    scores = df.drop(item_id).index.to_series().apply(
        metric_funcs[metric], args=(item_id,))
    # Distances: smaller is closer; similarities: larger is closer.
    if metric == 'euclidean':
        return scores.nsmallest(k)
    return scores.nlargest(k)

# 向给定用户推荐(返回:pd.Series)
# Predict ratings for a user's test-set items via item-based CF (in place).
def recommend(user_id,K):
    """Predict ratings for user_id's test-set items from item-item Pearson
    similarity and store them in df (an item x user matrix) in place."""
    # Movies the user has already rated (df[user_id] is that user's column).
    watched_movies=set(df[user_id].dropna().index)
    # Collect candidate neighbour items with their best similarity score.
    rank={}
    for item_id in watched_movies:
        nearest_item_id = computeNearestNeighbor(item_id, metric='pearson',k=K)
        print("与电影%s最相近的是%s" %(item_id,nearest_item_id.index))
        for neighbor_item_id in nearest_item_id.index:
            if neighbor_item_id in watched_movies:
                continue
            if neighbor_item_id not in rank:
                rank[neighbor_item_id]=nearest_item_id[neighbor_item_id]
            else:
                if nearest_item_id[neighbor_item_id] >  rank[neighbor_item_id]:
                    rank[neighbor_item_id] = nearest_item_id[neighbor_item_id]
        # NOTE(review): this break exits after the FIRST watched movie, whose
        # identity is arbitrary (set iteration order), so rank covers only one
        # item's neighbours and the result is non-deterministic.  The predict()
        # variant below has no break -- confirm whether this is a debug leftover.
        break
    # Sort candidates by similarity, best first.
    rank=sorted(rank.items(), key=lambda  x:x[1], reverse=True)
    print('最近邻物品id:', rank)
    #unrated_items_id=df.loc[user_id][df.loc[user_id].isnull()==True].index
    try:
        # Items this user needs a predicted score for (from the test pivot).
        unrated_items_id=final_test_df.loc[user_id][final_test_df.loc[user_id].isnull()==False].index
    except Exception:
        # User absent from the test pivot: nothing to predict.
        unrated_items_id=[]
    for unrated_id in unrated_items_id:
        fenzi=0.  # numerator: similarity-weighted ratings
        fenmu=0.  # denominator: sum of similarities
        enter_flag=False
        if unrated_id not in df.index:
            continue
        # NOTE(review): rank was built EXCLUDING watched movies, yet this loop
        # only uses neighbours the user HAS rated -- so enter_flag can never
        # become True and the mean-rating fallback is always taken.  Confirm
        # which filter is intended.
        for near_id,value in rank:
            if np.isnan(df.loc[near_id][user_id])==False:
                enter_flag=True
                s_kj = pearson(near_id, unrated_id)
                fenzi+=s_kj*df.loc[near_id][user_id]
                fenmu+=s_kj
        if enter_flag == False:
            # NOTE(review): df here is item x user, so df.at[user_id,unrated_id]
            # indexes with the axes swapped; presumably this should be
            # df.at[unrated_id,user_id] -- verify against the pivot in __main__.
            df.at[user_id,unrated_id]=average_rating_by_user[user_id]
        else:
            df.at[user_id,unrated_id]=fenzi/fenmu


# 从top-k中预测user_id用户对应的第j个商品的评分
# Predict user_id's rating for item j via item-based CF.
def predict(user_id, j, K):
    """
    Return the predicted rating of user user_id for item j: the
    similarity-weighted average (item-item Pearson) of the user's ratings
    of items neighbouring his watched movies.  Returns 0 when j is absent
    from training data; falls back to the user's mean rating when no
    usable neighbour exists or the weights cancel out.
    """
    if j not in df.index:
        # Item absent from the training data (df is the item x user matrix).
        return 0
    # Movies the user has rated.
    watched_movies = set(df[user_id].dropna().index)
    # Candidate neighbour items with their best similarity score.
    rank = {}
    for item_id in watched_movies:
        nearest_item_id = computeNearestNeighbor(item_id, metric='pearson', k=K)
        # BUG FIX: iterating a Series yields its VALUES, so the original loop
        # used similarity scores as item ids.  Iterate the index instead.
        for neighbor_item_id in nearest_item_id.index:
            score = nearest_item_id[neighbor_item_id]
            if neighbor_item_id not in rank or score > rank[neighbor_item_id]:
                rank[neighbor_item_id] = score
    # Sort by similarity, best first.  (The original called
    # operator.itemgetter without importing it, raising NameError; a lambda
    # needs no import.  It also computed an unused unrated_items_id here.)
    rank = sorted(rank.items(), key=lambda kv: kv[1], reverse=True)
    print('最近邻物品id:', rank)
    numerator = 0.
    denominator = 0.
    for near_id, _ in rank:
        rating = df.loc[near_id, user_id]
        if not np.isnan(rating):
            numerator += pearson(near_id, j) * rating
            denominator += pearson(near_id, j)
    if denominator == 0:
        # No usable neighbour, or similarity weights cancelled out (the
        # original divided by zero here).
        return average_rating_by_user[user_id]
    return numerator / denominator



if __name__ == '__main__':
    data_fields = ['user_id', 'item_id', 'rating', 'timestamp']

    # names=... because the MovieLens files carry no header row; the default
    # sep='\t' matches their tab-separated format.
    train_df = pd.read_table('../datasets/ml-100k/u1.base', names=data_fields)
    test_df = pd.read_table('../datasets/ml-100k/u1.test', names=data_fields)

    # user x item matrix, used only to compute the mean-rating statistics.
    # (The original `global` statements here were no-ops at module scope.)
    df = train_df.pivot(index='user_id', columns='item_id', values='rating')
    final_test_df = test_df.pivot(index='user_id', columns='item_id', values='rating')

    # Overall, per-user and per-item mean ratings.
    global_average_rating = df.sum().sum() / df.count().sum()
    average_rating_by_user = df.sum(axis=1) / df.count(axis=1)
    average_rating_by_item = df.sum() / df.count()

    # Re-pivot into the item x user matrix the ItemCF functions expect.
    df = train_df.pivot(index='item_id', columns='user_id', values='rating')

    # BUG FIX: recommend() takes a USER id; after re-pivoting, df.index holds
    # item ids, so iterate the columns (user ids), not the index.
    for u in df.columns:
        recommend(u, 50)

    test_df['predict'] = ""
    for i in range(test_df.shape[0]):
        user_id = test_df.iloc[i]['user_id']
        item_id = test_df.iloc[i]['item_id']
        # The original first computed predict(..., 10) and unconditionally
        # overwrote it (wasted work), and its known-rating check indexed
        # df.loc[user_id, item_id] with the axes of the item x user matrix
        # swapped.  Guard against unseen items and index as [item, user].
        if item_id in df.index and not np.isnan(df.loc[item_id, user_id]):
            test_df.at[i, 'predict'] = df.loc[item_id, user_id]
        else:
            test_df.at[i, 'predict'] = predict(user_id, item_id, 50)

    err = (test_df['rating'] - test_df['predict']).abs()
    MAE = err.sum() / test_df.shape[0]
    # abs() before squaring was redundant in the original RMSE.
    RMSE = np.sqrt((err ** 2).sum() / test_df.shape[0])
    print(MAE, RMSE)

你可能感兴趣的:(机器学习之旅,推荐系统,ItemCF,UserCF)