邻近算法推荐系统

from math import sqrt

# Sample data set: user name -> {artist name -> numeric rating}.
# A user simply has no entry for an artist they have not rated.
users = {
    "Angelica": {
        "Blues Traveler": 3.5,
        "Broken Bells": 2.0,
        "Norah Jones": 4.5,
        "Phoenix": 5.0,
        "Slightly Stoopid": 1.5,
        "The Strokes": 2.5,
        "Vampire Weekend": 2.0,
    },
    "Bill": {
        "Blues Traveler": 2.0,
        "Broken Bells": 3.5,
        "Deadmau5": 4.0,
        "Phoenix": 2.0,
        "Slightly Stoopid": 3.5,
        "Vampire Weekend": 3.0,
    },
    "Chan": {
        "Blues Traveler": 5.0,
        "Broken Bells": 1.0,
        "Deadmau5": 1.0,
        "Norah Jones": 3.0,
        "Phoenix": 5,
        "Slightly Stoopid": 1.0,
    },
    "Dan": {
        "Blues Traveler": 3.0,
        "Broken Bells": 4.0,
        "Deadmau5": 4.5,
        "Phoenix": 3.0,
        "Slightly Stoopid": 4.5,
        "The Strokes": 4.0,
        "Vampire Weekend": 2.0,
    },
    "Hailey": {
        "Broken Bells": 4.0,
        "Deadmau5": 1.0,
        "Norah Jones": 4.0,
        "The Strokes": 4.0,
        "Vampire Weekend": 1.0,
    },
    "Jordyn": {
        "Broken Bells": 4.5,
        "Deadmau5": 4.0,
        "Norah Jones": 5.0,
        "Phoenix": 5.0,
        "Slightly Stoopid": 4.5,
        "The Strokes": 4.0,
        "Vampire Weekend": 4.0,
    },
    "Sam": {
        "Blues Traveler": 5.0,
        "Broken Bells": 2.0,
        "Norah Jones": 3.0,
        "Phoenix": 5.0,
        "Slightly Stoopid": 4.0,
        "The Strokes": 5.0,
    },
    "Veronica": {
        "Blues Traveler": 3.0,
        "Norah Jones": 5.0,
        "Phoenix": 4.0,
        "Slightly Stoopid": 2.5,
        "The Strokes": 3.0,
    },
}


# Manhattan distance
def manhattan(rating1, rating2):
    """Return the Manhattan distance between two rating dicts.

    Both arguments map an item name to a numeric rating, e.g.
    {'The Strokes': 3.0, 'Slightly Stoopid': 2.5}. Only items present in
    both dicts contribute to the distance; when nothing is shared the
    result is 0.
    """
    total = 0
    for item, score in rating1.items():
        if item not in rating2:
            continue
        total += abs(score - rating2[item])
    return total

# Minkowski distance
def minkowski(rating1, rating2, r):
    """Return the Minkowski distance of order r between two rating dicts.

    rating1 and rating2 map an item name to a numeric rating. Only items
    found in both dicts are compared. r = 1 gives the Manhattan distance
    and r = 2 the Euclidean distance.
    """
    gaps = [
        abs(rating1[item] - rating2[item])
        for item in rating1
        if item in rating2
    ]
    total = 0
    for gap in gaps:
        total += gap ** r
    return total ** (1.0 / r)

def computeNearestNeighbor(username, users):
    """Rank every other user by distance to `username`, closest first.

    The distance is minkowski(..., 2), i.e. Euclidean distance over the
    items both users have rated. Returns a list of (distance, user)
    tuples sorted ascending; equal distances are ordered by user name
    because the tuples are compared element by element.
    """
    ranked = [
        (minkowski(ratings, users[username], 2), other)
        for other, ratings in users.items()
        if other != username
    ]
    return sorted(ranked)

# Pearson correlation coefficient (range -1 to 1)
def pearson(rating1, rating2):
    """Return the Pearson correlation between two rating dicts.

    rating1 and rating2 map an item name to a numeric rating. Only items
    present in both dicts are used. The single-pass computational formula
    is applied, so the ratings are traversed once.

    Returns 0 when the two dicts share no items, or when either user's
    shared ratings have zero variance (the correlation is undefined in
    both cases).
    """
    sum_xy = 0
    sum_x = 0
    sum_y = 0
    sum_x2 = 0
    sum_y2 = 0
    n = 0
    for key in rating1:
        if key in rating2:
            n += 1
            x = rating1[key]
            y = rating2[key]
            sum_xy += x * y
            sum_x += x
            sum_y += y
            sum_x2 += x ** 2
            sum_y2 += y ** 2

    if n == 0:
        # No co-rated items: the formula below would divide by zero.
        return 0

    # Floating-point rounding can push a (mathematically non-negative)
    # variance term slightly below zero; clamp so sqrt never raises.
    spread_x = max(sum_x2 - sum_x ** 2 / n, 0)
    spread_y = max(sum_y2 - sum_y ** 2 / n, 0)
    denominator = sqrt(spread_x) * sqrt(spread_y)
    if denominator == 0:
        return 0
    return (sum_xy - (sum_x * sum_y) / n) / denominator


# Cosine similarity
def cos(rating1, rating2):
    """Return the cosine similarity between two rating dicts.

    rating1 and rating2 map an item name to a numeric rating. The dot
    product and both vector norms are accumulated over the items that
    appear in both dicts only. Returns 0 when the product of the norms
    is zero (including the case of no shared items).
    """
    dot = 0
    norm1_sq = 0
    norm2_sq = 0
    for item, a in rating1.items():
        if item not in rating2:
            continue
        b = rating2[item]
        norm1_sq += a ** 2
        norm2_sq += b ** 2
        dot += a * b

    magnitude = sqrt(norm1_sq) * sqrt(norm2_sq)
    if magnitude != 0:
        return dot / magnitude
    return 0


def recommend(username, users):
    """Recommend items to `username` based on the single nearest neighbour.

    The nearest neighbour is the first entry returned by
    computeNearestNeighbor. Every item that neighbour has rated and
    `username` has not is recommended.

    Returns a list of (item, neighbour_rating) tuples sorted by rating,
    highest first. Returns an empty list when there is no other user to
    compare against.
    """
    neighbors = computeNearestNeighbor(username, users)
    if not neighbors:
        # Nobody else in the data set: indexing [0] would raise IndexError.
        return []
    nearest = neighbors[0][1]

    userRatings = users[username]
    recommendations = [
        (artist, rating)
        for artist, rating in users[nearest].items()
        if artist not in userRatings
    ]
    # sorted() is stable, so equally rated items keep the neighbour's order.
    return sorted(recommendations, key=lambda artistTuple: artistTuple[1], reverse=True)

# Demo run at import/execution time: print the recommendations for Hailey.
print(recommend('Hailey',users))
# Alternative demo (disabled): recommendations for Chan.
#print (recommend('Chan',users))

注意:

1、如果数据存在“分数膨胀”问题(不同用户的评分标准不同,有人普遍打高分、有人普遍打低分),就使用皮尔逊相关系数。

2、如果数据比较密集(几乎所有属性都有非零值),且属性值的大小本身很重要,那么就使用欧几里得距离或者曼哈顿距离。

3、如果数据是稀疏的,则使用余弦相似度。



你可能感兴趣的:(邻近算法推荐系统)