计算相似度之后,u对物品j的兴趣如下:
用户活跃度对物品相似度的影响
考虑到并不是每个用户对推荐的贡献度是一样的,例如某书店店主买了当当上的80%的书存货,这80%的书都会产生关联,但是并不是根据用户的兴趣爱好关联的。我们要降低这个用户的贡献度,引入IUF(inverse user frequence),即用户活跃度对数的倒数。利用IUF修正物品相似度的计算:
物品相似度归一化
Karypis在研究中心发现如果将相似度矩阵按照最大值归一化会提高推荐的准确率。即
完整代码如下
import random
import sys
import math
import os
from operator import itemgetter
random.seed(0)
class ItemBasedCF(object):
def __init__(self):
self.trainset = {}
self.testset = {}
self.n_sim_movie = 20
self.n_rec_movie = 10
self.movie_sim_mat = {}
self.movie_popular = {}
self.movie_count = 0
print('Similar movie number = %d' % self.n_sim_movie, file = sys.stderr)
print('Recommendend movie number = %d' % self.n_rec_movie,file = sys.stderr)
@staticmethod
def loadfile(filename):
fp = open(filename, 'r')
for i, line in enumerate(fp):
yield line.strip('\r\n')
if i % 100000 == 0:
print ('load %s(%s)' %(filename,i), file = sys.stderr)
fp.close()
print('load %s succ' %filename, file = sys.stderr)
def generate_dataset(self, filename, pivot = 0.7):
trainset_len = 0
testset_len = 0
for line in self.loadfile(filename):
user, movie, rating , _= line.split('::')
if random.random() < pivot:
self.trainset.setdefault(user,{})
self.trainset[user][movie] = int(rating)
trainset_len += 1
else:
self.testset.setdefault(user,{})
self.testset[user][movie] = int(rating)
testset_len += 1
print('split succ , trainset is %d , testset is %d' %(trainset_len,testset_len) , file = sys.stderr)
def calc_movie_sim(self):
for user, movies in self.trainset.items():
for movie in movies:
if movie not in self.movie_popular:
self.movie_popular[movie] = 0
self.movie_popular[movie] += 1
print('count movies number and pipularity succ',file = sys.stderr)
self.movie_count = len(self.movie_popular)
print('total movie number = %d' %self.movie_count, file = sys.stderr)
itemsim_mat = self.movie_sim_mat
print('building co-rated users matrix', file = sys.stderr)
for user, movies in self.trainset.items():
for m1 in movies:
for m2 in movies:
if m1 == m2:
continue
itemsim_mat.setdefault(m1,{})
itemsim_mat[m1].setdefault(m2,0)
itemsim_mat[m1][m2] += 1
print('build co-rated users matrix succ', file = sys.stderr)
print('calculating movie similarity matrix', file = sys.stderr)
simfactor_count = 0
PRINT_STEP = 2000000
for m1, related_movies in itemsim_mat.items():
for m2, count in related_movies.items():
itemsim_mat[m1][m2] = count / math.sqrt(self.movie_popular[m1] * self.movie_popular[m2])
simfactor_count += 1
if simfactor_count % PRINT_STEP == 0:
print('calcu movie similarity factor(%d)' %simfactor_count, file = sys.stderr)
print('calcu similiarity succ', file = sys.stderr)
def recommend(self,user):
K = self.n_sim_movie
N = self.n_rec_movie
rank = {}
watched_movies = self.trainset[user]
for movie, rating in watched_movies.items():
for related_movie, similarity_factor in sorted(self.movie_sim_mat[movie].items(), key=itemgetter(1),
reverse=True)[0:K]:
if related_movie in watched_movies:
continue
rank.setdefault(related_movie, 0)
rank[related_movie] += similarity_factor * rating
return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]
def evaluate(self):
print('evaluation start', file = sys.stderr)
N = self.n_rec_movie
hit = 0
rec_count = 0
test_count = 0
all_rec_movies = set()
popular_sum = 0
for i, user in enumerate(self.trainset):
if i % 500 == 0:
print('recommend for %d users ' %i , file = sys.stderr)
test_movies = self.testset.get(user,{})
rec_movies = self.recommend(user)
for movie, _ in rec_movies:
if movie in test_movies:
hit += 1
all_rec_movies.add(movie)
popular_sum += math.log(1 + self.movie_popular[movie])
rec_count += N
test_count += len(test_movies)
precision = hit / (1.0 * rec_count)
recall = hit / (1.0 * test_count)
coverage = len(all_rec_movies) / (1.0 * self.movie_count)
popularity = popular_sum / (1.0 * rec_count)
print('precision is %.4f\t recall is %.4f \t coverage is %.4f \t popularity is %.4f'
%(precision,recall,coverage,popularity), file = sys.stderr)
if __name__ == '__main__':
ratingfile = os.path.join('ml-1m', 'ratings.dat')
itemcf = ItemBasedCF()
itemcf.generate_dataset(ratingfile)
itemcf.calc_movie_sim()
itemcf.evaluate()