在信息严重过载的网络世界中,推荐系统承担着不可或缺的任务。它为不同的用户筛选出其最可能感兴趣的信息,极大地方便了用户寻找到想要的信息,达到一种个性化推荐的效果。
UserCF的核心思想是:给用户推荐和他兴趣相似的其他用户喜欢的物品。
为了根据用户喜好进行推荐,我们首先要找到与目标用户兴趣相似的用户集合。UserCF就是利用用户行为的相似性来计算兴趣的相似度。比如说,用户A和用户B都购买了X、Y、Z三样物品,则两个人的兴趣爱好就特别相似。于是我们需要定义相似度的计算方法,常见的有皮尔逊相关系数(Pearson correlation coefficient,PCC)和余弦相似度。
我们对每个用户计算与目标用户u的相似度,对这些相似度进行排序,得到与用户u兴趣相似的top-k个用户集合。接下来,我们给目标用户u推荐这top-k个用户喜欢的物品,并且是用户u没有见过的物品。我们需要衡量用户u对某物品的感兴趣程度,给他推荐他最可能感兴趣的物品。
import pandas as pd
import numpy as np
# Build the pair of rating vectors over co-rated items
def build_xy(user_id1, user_id2):
    """Return the two users' ratings restricted to items both have rated.

    Relies on the module-level DataFrame ``df`` (user x item rating matrix).
    """
    common = df.loc[user_id1].notnull() & df.loc[user_id2].notnull()
    x = df.loc[user_id1, common]
    y = df.loc[user_id2, common]
    return x, y
# Euclidean distance
def euclidean(user_id1, user_id2):
    """Euclidean distance between two users over their co-rated items.

    Smaller means more similar; an empty overlap yields 0.0.
    """
    x, y = build_xy(user_id1, user_id2)
    # Fix: the original wrapped this in try/except ZeroDivisionError, but the
    # expression contains no division, so that handler was dead code.
    return sum((x - y) ** 2) ** 0.5
# Cosine similarity
def cosine(user_id1, user_id2):
    """Cosine similarity between two users over their co-rated items."""
    x, y = build_xy(user_id1, user_id2)
    # Denominator: product of the two vector norms.
    norm_product = (sum(x * x) * sum(y * y)) ** 0.5
    try:
        return sum(x * y) / norm_product
    except ZeroDivisionError:
        # No co-rated items: both sums degenerate to plain zeros and the
        # division raises; treat that as "no similarity".
        return 0
# Pearson correlation coefficient
def pearson(user_id1, user_id2):
    """Pearson correlation between two users over their co-rated items."""
    x, y = build_xy(user_id1, user_id2)
    dx = x - x.mean()
    dy = y - y.mean()
    # Denominator: product of the centred vector norms.
    denom = (sum(dx ** 2) * sum(dy ** 2)) ** 0.5
    try:
        return sum(dx * dy) / denom
    except ZeroDivisionError:
        # Empty overlap degenerates to 0 / 0.0, which raises in pure Python.
        return 0
# Dispatch table mapping metric names to their implementations; used by
# computeNearestNeighbor to look up the similarity/distance function.
metric_funcs = {
    'euclidean': euclidean,
    'pearson': pearson,
    'cosine': cosine
}
# Compute the nearest neighbours of a user
def computeNearestNeighbor(user_id, metric='pearson', k=50):
    """Return the k users closest to ``user_id``.

    metric: metric name ('euclidean', 'pearson' or 'cosine')
    k: number of neighbours to return
    Returns a pd.Series whose index holds neighbour ids and whose values hold
    the distance/similarity to ``user_id``.
    """
    others = df.drop(user_id).index.to_series()
    if metric == 'euclidean':
        # Distances: smaller is closer.
        return others.apply(metric_funcs[metric], args=(user_id,)).nsmallest(k)
    elif metric in ('pearson', 'cosine'):
        # Similarities: larger is closer.
        return others.apply(metric_funcs[metric], args=(user_id,)).nlargest(k)
    # Fix: the original also accepted 'manhattan' although metric_funcs has no
    # such entry (KeyError), and silently returned None for unknown names.
    raise ValueError('unsupported metric: %r' % (metric,))
# Fill predicted ratings for the given user into df
def recommend(user_id, K):
    """Predict ratings for ``user_id``'s test items using top-K neighbours.

    For every item of ``user_id`` present in final_test_df, a mean-centred,
    similarity-weighted prediction is written back into ``df``.
    """
    # Top-K most similar users together with their pearson similarities.
    neighbors = computeNearestNeighbor(user_id, metric='pearson', k=K)
    print('最近邻用户id:', neighbors.index)
    try:
        # Item ids whose rating must be predicted for this user.
        unrated_items_id = final_test_df.loc[user_id][final_test_df.loc[user_id].notnull()].index
    except Exception:
        # User absent from the test set: nothing to predict.
        unrated_items_id = []
    for j in unrated_items_id:
        if j not in df.columns.values:
            # Item never seen in the training data; cannot predict it.
            continue
        fenzi = 0.
        fenmu = 0.
        for near_id, s_wu in neighbors.items():
            # Only neighbours that actually rated item j contribute.
            if not np.isnan(df.loc[near_id][j]):
                # Reuse the similarity computeNearestNeighbor already
                # computed instead of recomputing pearson() per neighbour.
                fenzi += s_wu * (df.loc[near_id][j] - average_rating_by_user[near_id])
                fenmu += s_wu
        if fenmu == 0:
            # Fix: the original divided by fenmu whenever ANY neighbour rated
            # the item, crashing with ZeroDivisionError when the similarities
            # summed to zero; fall back to the user's average instead (this
            # also covers the no-rater case the old enter_flag handled).
            df.at[user_id, j] = average_rating_by_user[user_id]
        else:
            df.at[user_id, j] = average_rating_by_user[user_id] + fenzi / fenmu
# Predict user ``user_id``'s rating of item j from the top-K neighbours
def predict(user_id, j, K):
    """Return the predicted rating of item ``j`` by user ``user_id``.

    Falls back to the user's average rating when no usable neighbour exists.
    """
    neighbors = computeNearestNeighbor(user_id, metric='pearson', k=K)
    print('最近邻用户id:', neighbors.index)
    if j not in df.columns:
        # Fix: the original raised KeyError for items missing from the
        # training matrix; return the user's average instead (mirrors the
        # membership guard used elsewhere in this file).
        return average_rating_by_user[user_id]
    fenzi = 0.
    fenmu = 0.
    for near_id, s_wu in neighbors.items():
        # Only neighbours that actually rated item j contribute; reuse the
        # similarity already computed by computeNearestNeighbor.
        if not np.isnan(df.loc[near_id][j]):
            fenzi += s_wu * (df.loc[near_id][j] - average_rating_by_user[near_id])
            fenmu += s_wu
    if fenmu == 0:
        # Fix: no neighbour rated j, or the similarities cancelled out —
        # the original crashed with ZeroDivisionError in the latter case.
        return average_rating_by_user[user_id]
    return average_rating_by_user[user_id] + fenzi / fenmu
if __name__ == '__main__':
    data_fields = ['user_id', 'item_id', 'rating', 'timestamp']
    # The files have no header row, so pass explicit column names
    # (names=None would make pd.read_table use the first data row);
    # the default sep='\t' matches the ml-100k format.
    train_df = pd.read_table('../datasets/ml-100k/u1.base', names=data_fields)
    test_df = pd.read_table('../datasets/ml-100k/u1.test', names=data_fields)
    # user x item rating matrices for training and test data.
    df = train_df.pivot(index='user_id', columns='item_id', values='rating')
    final_test_df = test_df.pivot(index='user_id', columns='item_id', values='rating')
    # Global / per-user / per-item average ratings (NaNs are skipped by
    # sum()/count(), so these are true observed-rating means).
    global_average_rating = df.sum().sum() / df.count().sum()
    average_rating_by_user = df.sum(axis=1) / df.count(axis=1)
    average_rating_by_item = df.sum() / df.count()
    # Pre-fill df with predictions for every user's test items.
    for i in df.index:
        recommend(i, 50)
    test_df['predict'] = np.nan
    for i in range(test_df.shape[0]):
        user_id = test_df.iloc[i]['user_id']
        item_id = test_df.iloc[i]['item_id']
        # Fix: the original first called predict(..., 10) and then always
        # overwrote the result — a wasted full neighbour search per test row —
        # and it indexed df without checking that the item column exists,
        # raising KeyError for items unseen in training.
        if item_id in df.columns and not np.isnan(df.loc[user_id, item_id]):
            test_df.at[i, 'predict'] = df.loc[user_id, item_id]
        else:
            test_df.at[i, 'predict'] = predict(user_id, item_id, 50)
    # Evaluate on the test split.
    MAE = (test_df['rating'] - test_df['predict']).abs().sum() / test_df.shape[0]
    RMSE = np.sqrt(((test_df['rating'] - test_df['predict']) ** 2).sum() / test_df.shape[0])
    print(MAE, RMSE)
ItemCF的核心思想是给用户推荐和其过去感兴趣的物品相似的物品。
为了得到物品之间的联系,我们需要计算物品间的相似度矩阵。假如用户A同时购买了物品X,Y,那么X和Y可能存在一定程度上的相似,基于这样的假设,我们通过相似性公式计算得到X和Y的相似性。
构建完物品-物品相似性矩阵后,我们针对目标用户u进行推荐的方式就是,根据其评分过的电影,找到K部相似的电影,并推荐其N部没看过的,最可能感兴趣的电影。
import pandas as pd
import numpy as np
# Build the pair of rating vectors over users who rated both items
def build_xy(item_id1, item_id2):
    """Return the two items' ratings restricted to common raters.

    Relies on the module-level DataFrame ``df`` (item x user rating matrix).
    """
    common = df.loc[item_id1].notnull() & df.loc[item_id2].notnull()
    x = df.loc[item_id1, common]
    y = df.loc[item_id2, common]
    return x, y
# Euclidean distance
def euclidean(item_id1, item_id2):
    """Euclidean distance between two items over their common raters.

    Smaller means more similar; an empty overlap yields 0.0.
    """
    x, y = build_xy(item_id1, item_id2)
    # Fix: the original wrapped this in try/except ZeroDivisionError, but the
    # expression contains no division, so that handler was dead code.
    return sum((x - y) ** 2) ** 0.5
# Cosine similarity
def cosine(item_id1, item_id2):
    """Cosine similarity between two items over their common raters."""
    x, y = build_xy(item_id1, item_id2)
    # Denominator: product of the two vector norms.
    norm_product = (sum(x * x) * sum(y * y)) ** 0.5
    try:
        return sum(x * y) / norm_product
    except ZeroDivisionError:
        # No common raters: both sums degenerate to plain zeros and the
        # division raises; treat that as "no similarity".
        return 0
# Pearson correlation coefficient
def pearson(item_id1, item_id2):
    """Pearson correlation between two items over their common raters."""
    x, y = build_xy(item_id1, item_id2)
    dx = x - x.mean()
    dy = y - y.mean()
    # Denominator: product of the centred vector norms.
    denom = (sum(dx ** 2) * sum(dy ** 2)) ** 0.5
    try:
        return sum(dx * dy) / denom
    except ZeroDivisionError:
        # Empty overlap degenerates to 0 / 0.0, which raises in pure Python.
        return 0
# Dispatch table mapping metric names to their implementations; used by
# computeNearestNeighbor to look up the similarity/distance function.
metric_funcs = {
    'euclidean': euclidean,
    'pearson': pearson,
    'cosine': cosine
}
# Compute the items most similar to a given item
def computeNearestNeighbor(item_id, metric='pearson', k=50):
    """Return the k items closest to ``item_id``.

    metric: metric name ('euclidean', 'pearson' or 'cosine')
    k: number of neighbours to return
    Returns a pd.Series whose index holds neighbour item ids and whose values
    hold the distance/similarity to ``item_id``.
    """
    others = df.drop(item_id).index.to_series()
    if metric == 'euclidean':
        # Distances: smaller is closer.
        return others.apply(metric_funcs[metric], args=(item_id,)).nsmallest(k)
    elif metric in ('pearson', 'cosine'):
        # Similarities: larger is closer.
        return others.apply(metric_funcs[metric], args=(item_id,)).nlargest(k)
    # Fix: the original also accepted 'manhattan' although metric_funcs has no
    # such entry (KeyError), and silently returned None for unknown names.
    raise ValueError('unsupported metric: %r' % (metric,))
# Fill predicted ratings for the given user into df (ItemCF variant)
def recommend(user_id, K):
    # Movies the user has already rated; df is the item x user matrix in this
    # script, so df[user_id] selects the user's column.
    watched_movies = set(df[user_id].dropna().index)
    # Collect unwatched candidate items similar to something the user watched,
    # keeping the best similarity seen for each candidate.
    rank = {}
    for item_id in watched_movies:
        nearest_item_id = computeNearestNeighbor(item_id, metric='pearson', k=K)
        print("与电影%s最相近的是%s" % (item_id, nearest_item_id.index))
        for neighbor_item_id in nearest_item_id.index:
            if neighbor_item_id in watched_movies:
                continue
            if neighbor_item_id not in rank:
                rank[neighbor_item_id] = nearest_item_id[neighbor_item_id]
            else:
                if nearest_item_id[neighbor_item_id] > rank[neighbor_item_id]:
                    rank[neighbor_item_id] = nearest_item_id[neighbor_item_id]
            # NOTE(review): this break stops after the first unwatched
            # neighbour of each watched movie; the pasted source's indentation
            # is ambiguous and predict() below has no such break — confirm
            # whether it is intentional.
            break
    # Candidates sorted by similarity, best first, as (item_id, sim) pairs.
    rank = sorted(rank.items(), key=lambda x: x[1], reverse=True)
    print('最近邻物品id:', rank)
    try:
        # Item ids whose rating must be predicted for this user.
        unrated_items_id = final_test_df.loc[user_id][final_test_df.loc[user_id].isnull() == False].index
    except Exception:
        # User absent from the test set: nothing to predict.
        unrated_items_id = []
    for unrated_id in unrated_items_id:
        fenzi = 0.
        fenmu = 0.
        enter_flag = False
        if unrated_id not in df.index:
            # Item never seen in the training data; cannot predict it.
            continue
        for near_id, value in rank:
            # NOTE(review): near_id comes from rank, which by construction
            # only holds items the user has NOT watched, so
            # df.loc[near_id][user_id] appears to always be NaN and
            # enter_flag may never become True — verify against the intended
            # ItemCF formula (which weights the user's OWN ratings of items
            # similar to unrated_id).
            if np.isnan(df.loc[near_id][user_id]) == False:
                enter_flag = True
                s_kj = pearson(near_id, unrated_id)
                fenzi += s_kj * df.loc[near_id][user_id]
                fenmu += s_kj
        if enter_flag == False:
            # No usable neighbour: fall back to the user's average rating.
            df.at[user_id, unrated_id] = average_rating_by_user[user_id]
        else:
            # NOTE(review): df is indexed item x user in this script, so
            # df.at[user_id, unrated_id] looks axis-swapped (compare the
            # df.loc[near_id][user_id] reads above) — confirm.
            df.at[user_id, unrated_id] = fenzi / fenmu
# Predict user ``user_id``'s rating of item j from top-K similar items
def predict(user_id, j, K):
    """Return the predicted rating of item ``j`` by user ``user_id``.

    Builds a candidate pool of items similar to the user's watched movies and
    combines their ratings weighted by item-item pearson similarity.
    """
    # Movies the user has rated; df is the item x user matrix here.
    watched_movies = set(df[user_id].dropna().index)
    # Best similarity seen for each candidate item.
    rank = {}
    for item_id in watched_movies:
        nearest = computeNearestNeighbor(item_id, metric='pearson', k=K)
        # Fix: the original iterated the Series itself, which yields the
        # similarity VALUES rather than neighbour item ids; iterate the index
        # instead (matching recommend()).
        for neighbor_item_id in nearest.index:
            sim = nearest[neighbor_item_id]
            if neighbor_item_id not in rank or sim > rank[neighbor_item_id]:
                rank[neighbor_item_id] = sim
    # Fix: the original sorted with itemgetter(1) but never imported
    # operator.itemgetter, raising NameError at runtime; also dropped the
    # unrated_items_id computation, whose result was never used.
    rank = sorted(rank.items(), key=lambda kv: kv[1], reverse=True)
    print('最近邻物品id:', rank)
    if j not in df.index:
        # Item absent from the training matrix: no basis for a prediction.
        return 0
    fenzi = 0.
    fenmu = 0.
    for near_id, value in rank:
        # Only candidate items the user has actually rated contribute.
        if not np.isnan(df.loc[near_id][user_id]):
            s_kj = pearson(near_id, j)
            fenzi += s_kj * df.loc[near_id][user_id]
            fenmu += s_kj
    if fenmu == 0:
        # Fix: guard the division — covers both "no candidate rated by the
        # user" (old enter_flag) and similarities summing to zero, which
        # previously raised ZeroDivisionError.
        return average_rating_by_user[user_id]
    return fenzi / fenmu
if __name__ == '__main__':
    data_fields = ['user_id', 'item_id', 'rating', 'timestamp']
    # The files have no header row, so pass explicit column names; the
    # default sep='\t' matches the ml-100k format.
    train_df = pd.read_table('../datasets/ml-100k/u1.base', names=data_fields)
    test_df = pd.read_table('../datasets/ml-100k/u1.test', names=data_fields)
    # user x item matrix, used only to derive the averages below.
    df = train_df.pivot(index='user_id', columns='item_id', values='rating')
    final_test_df = test_df.pivot(index='user_id', columns='item_id', values='rating')
    global_average_rating = df.sum().sum() / df.count().sum()
    average_rating_by_user = df.sum(axis=1) / df.count(axis=1)
    average_rating_by_item = df.sum() / df.count()
    # ItemCF works on the transposed (item x user) matrix from here on.
    df = train_df.pivot(index='item_id', columns='user_id', values='rating')
    # Fix: the original iterated df.index, i.e. ITEM ids, although recommend()
    # expects a user id (df[user_id] selects a user column) — item ids above
    # 943 raised KeyError; iterate the user columns instead.
    for i in df.columns:
        recommend(i, 50)
    test_df['predict'] = np.nan
    for i in range(test_df.shape[0]):
        user_id = test_df.iloc[i]['user_id']
        item_id = test_df.iloc[i]['item_id']
        # Fix: the original first called predict(..., 10) and then always
        # overwrote the result (wasted work), and its NaN check indexed
        # df.loc[user_id, item_id] although df is item x user — axes swapped
        # relative to the read on the very next line.
        if item_id in df.index and not np.isnan(df.loc[item_id, user_id]):
            test_df.at[i, 'predict'] = df.loc[item_id, user_id]
        else:
            test_df.at[i, 'predict'] = predict(user_id, item_id, 50)
    # Evaluate on the test split.
    MAE = (test_df['rating'] - test_df['predict']).abs().sum() / test_df.shape[0]
    RMSE = np.sqrt(((test_df['rating'] - test_df['predict']) ** 2).sum() / test_df.shape[0])
    print(MAE, RMSE)