from math import sqrt

# Sample data set: user name -> {artist name -> rating given by that user}.
users = {
    "Angelica": {
        "Blues Traveler": 3.5,
        "Broken Bells": 2.0,
        "Norah Jones": 4.5,
        "Phoenix": 5.0,
        "Slightly Stoopid": 1.5,
        "The Strokes": 2.5,
        "Vampire Weekend": 2.0,
    },
    "Bill": {
        "Blues Traveler": 2.0,
        "Broken Bells": 3.5,
        "Deadmau5": 4.0,
        "Phoenix": 2.0,
        "Slightly Stoopid": 3.5,
        "Vampire Weekend": 3.0,
    },
    "Chan": {
        "Blues Traveler": 5.0,
        "Broken Bells": 1.0,
        "Deadmau5": 1.0,
        "Norah Jones": 3.0,
        "Phoenix": 5,
        "Slightly Stoopid": 1.0,
    },
    "Dan": {
        "Blues Traveler": 3.0,
        "Broken Bells": 4.0,
        "Deadmau5": 4.5,
        "Phoenix": 3.0,
        "Slightly Stoopid": 4.5,
        "The Strokes": 4.0,
        "Vampire Weekend": 2.0,
    },
    "Hailey": {
        "Broken Bells": 4.0,
        "Deadmau5": 1.0,
        "Norah Jones": 4.0,
        "The Strokes": 4.0,
        "Vampire Weekend": 1.0,
    },
    "Jordyn": {
        "Broken Bells": 4.5,
        "Deadmau5": 4.0,
        "Norah Jones": 5.0,
        "Phoenix": 5.0,
        "Slightly Stoopid": 4.5,
        "The Strokes": 4.0,
        "Vampire Weekend": 4.0,
    },
    "Sam": {
        "Blues Traveler": 5.0,
        "Broken Bells": 2.0,
        "Norah Jones": 3.0,
        "Phoenix": 5.0,
        "Slightly Stoopid": 4.0,
        "The Strokes": 5.0,
    },
    "Veronica": {
        "Blues Traveler": 3.0,
        "Norah Jones": 5.0,
        "Phoenix": 4.0,
        "Slightly Stoopid": 2.5,
        "The Strokes": 3.0,
    },
}


def manhattan(rating1, rating2):
    """Return the Manhattan distance between two rating dicts.

    Both arguments map item names to numeric ratings, for example
    ``{'The Strokes': 3.0, 'Slightly Stoopid': 2.5}``.  Only items present
    in both dicts contribute to the distance; the result is 0 when the two
    dicts share no items.
    """
    return sum(
        abs(rating1[key] - rating2[key]) for key in rating1 if key in rating2
    )


def minkowski(rating1, rating2, r):
    """Return the Minkowski distance of order ``r`` between two rating dicts.

    Only items present in both dicts contribute.  ``r=1`` corresponds to the
    Manhattan distance and ``r=2`` to the Euclidean distance.  The result is
    0 when the two dicts share no items.
    """
    distance = sum(
        pow(abs(rating1[key] - rating2[key]), r)
        for key in rating1
        if key in rating2
    )
    return pow(distance, 1.0 / r)


def computeNearestNeighbor(username, users):
    """Return every other user as ``(distance, name)``, closest first.

    Distances are Euclidean (``minkowski`` with ``r=2``) and measured against
    ``users[username]``.  Equal distances are ordered by user name because the
    tuples are sorted as a whole.
    """
    # To rank by Manhattan distance instead, replace the minkowski(...) call
    # with manhattan(users[user], users[username]).
    distances = [
        (minkowski(users[user], users[username], 2), user)
        for user in users
        if user != username
    ]
    distances.sort()
    return distances


def pearson(rating1, rating2):
    """Return the Pearson correlation coefficient of two rating dicts.

    The coefficient is computed over the items rated in both dicts and lies
    in the range -1..1.  Returns 0 when the dicts share no items or when
    either side has no variance over the shared items (the coefficient is
    undefined in both cases).
    """
    sum_xy = 0
    sum_x = 0
    sum_y = 0
    sum_x2 = 0
    sum_y2 = 0
    n = 0
    for key in rating1:
        if key in rating2:
            n += 1
            x = rating1[key]
            y = rating2[key]
            sum_xy += x * y
            sum_x += x
            sum_y += y
            sum_x2 += pow(x, 2)
            sum_y2 += pow(y, 2)
    # No shared items: every term below would divide by zero.
    if n == 0:
        return 0
    # Floating-point rounding can push these differences slightly below zero,
    # which would make sqrt() raise; clamp them at 0.
    spread_x = max(sum_x2 - pow(sum_x, 2) / n, 0)
    spread_y = max(sum_y2 - pow(sum_y, 2) / n, 0)
    denominator = sqrt(spread_x) * sqrt(spread_y)
    if denominator == 0:
        return 0
    return (sum_xy - (sum_x * sum_y) / n) / denominator


def cos(rating1, rating2):
    """Return the cosine similarity of two rating dicts.

    Only items present in both dicts are used.  Returns 0 when the
    denominator is zero (no shared items, or all shared ratings on one side
    are zero).
    """
    sum_x2 = 0
    sum_y2 = 0
    sum_xy = 0
    for key in rating1:
        if key in rating2:
            x = rating1[key]
            y = rating2[key]
            sum_x2 += pow(x, 2)
            sum_y2 += pow(y, 2)
            sum_xy += x * y
    denominator = sqrt(sum_x2) * sqrt(sum_y2)
    if denominator == 0:
        return 0
    return sum_xy / denominator


def recommend(username, users):
    """Return recommendations for ``username`` from the nearest neighbour.

    The result is a list of ``(artist, rating)`` tuples: every artist the
    single closest user has rated and ``username`` has not, ordered by the
    neighbour's rating from highest to lowest.  Returns an empty list when
    there is no other user to compare against.
    """
    neighbors = computeNearestNeighbor(username, users)
    if not neighbors:
        return []
    nearest = neighbors[0][1]
    neighborRatings = users[nearest]
    userRatings = users[username]
    recommendations = [
        (artist, rating)
        for artist, rating in neighborRatings.items()
        if artist not in userRatings
    ]
    return sorted(recommendations, key=lambda pair: pair[1], reverse=True)


print(recommend('Hailey', users))
注意:
1、如果数据存在“分数膨胀”问题,就使用皮尔逊相关系数。
2、如果数据比较密集(几乎所有属性都有非零值),且属性值的大小本身很重要,那么就使用欧几里得距离或者曼哈顿距离。
3、如果数据是稀疏的,则使用余弦相似度。