2019独角兽企业重金招聘Python工程师标准>>>
一、协作型过滤(Collaborative Filtering)
二、寻找相近用户
数据集
# Sample data set: maps each critic's name to a dict of
# {movie title: rating}. A title missing from a critic's inner dict means
# that critic has no rating recorded for that movie.
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 3.5,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0,
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
    },
    'Toby': {
        'Snakes on a Plane': 4.5,
        'You, Me and Dupree': 1.0,
        'Superman Returns': 4.0,
    },
}
| 评论者 | Lady in the Water | Snakes on a Plane | Just My Luck | Superman Returns | You, Me and Dupree | The Night Listener |
| --- | --- | --- | --- | --- | --- | --- |
| Rose | 2.5 | 3.5 | 3.0 | 3.5 | 2.5 | 3.0 |
| Seymour | 3.0 | 3.5 | 1.5 | 5.0 | 3.5 | 3.0 |
| Phillips | 2.5 | 3.0 | - | 3.5 | - | 4.0 |
| Puig | - | 3.5 | 3.0 | 4.0 | 2.5 | 4.5 |
| LaSalle | 3.0 | 4.0 | 2.0 | 3.0 | 2.0 | 3.0 |
| Matthews | 3.0 | 4.0 | - | 5.0 | 3.5 | 3.0 |
| Toby | ? | 4.5 | ? | 4.0 | 1.0 | ? |
欧几里得距离
>> from math import sqrt
>>sqrt(pow(x1-x2,2) + pow(y1-y2,2))
>> 1 / (1 + sqrt(pow(x1-x2,2) + pow(y1-y2,2))) ==> 归一化 0~1
def sim_distance(prefs, person1, person2):
    """Return a Euclidean-distance-based similarity score for two people.

    Only items rated by both people are compared.

    Args:
        prefs: Mapping of person -> {item: numeric rating}.
        person1: Key of the first person in ``prefs``.
        person2: Key of the second person in ``prefs``.

    Returns:
        0 when the two people have no rated item in common; otherwise
        ``1 / (1 + d)`` where ``d`` is the Euclidean distance between their
        ratings of the shared items. Identical ratings give 1.0 and the
        score shrinks towards 0 as the distance grows.

    Raises:
        KeyError: If ``person1`` or ``person2`` is not a key of ``prefs``.
    """
    from math import sqrt

    ratings1 = prefs[person1]
    ratings2 = prefs[person2]

    # Items that both people have rated.
    shared = [item for item in ratings1 if item in ratings2]
    if not shared:
        return 0

    sum_of_squares = sum(
        pow(ratings1[item] - ratings2[item], 2) for item in shared
    )
    # Take the square root so the value really is the Euclidean distance;
    # adding 1 avoids division by zero and maps the result into (0, 1].
    return 1 / (1 + sqrt(sum_of_squares))
皮尔逊相关度
http://lobert.iteye.com/blog/2024999
def sim_pearson(prefs, p1, p2):
    """Return the Pearson correlation coefficient of two people's ratings.

    Only items rated by both people take part in the calculation.

    Args:
        prefs: Mapping of person -> {item: numeric rating}.
        p1: Key of the first person in ``prefs``.
        p2: Key of the second person in ``prefs``.

    Returns:
        A value between -1 and 1 (up to floating-point rounding). 0 is
        returned when the two people share no rated item, or when either
        person's shared ratings have no variance, since the correlation is
        undefined in that case.

    Raises:
        KeyError: If ``p1`` or ``p2`` is not a key of ``prefs``.
    """
    ratings1 = prefs[p1]
    ratings2 = prefs[p2]

    # Items that both people have rated.
    shared = [item for item in ratings1 if item in ratings2]
    n = len(shared)
    if n == 0:
        # No overlap means no evidence of similarity. Returning 1 here would
        # rank completely unrelated people as perfectly correlated.
        return 0

    sum1 = 0.0
    sum2 = 0.0
    sum1_sq = 0.0
    sum2_sq = 0.0
    product_sum = 0.0
    for item in shared:
        r1 = ratings1[item]
        r2 = ratings2[item]
        sum1 += r1
        sum2 += r2
        sum1_sq += pow(r1, 2)
        sum2_sq += pow(r2, 2)
        product_sum += r1 * r2

    num = product_sum - (sum1 * sum2 / n)
    variance_product = (
        (sum1_sq - pow(sum1, 2) / n) * (sum2_sq - pow(sum2, 2) / n)
    )
    # Rounding can push a zero variance slightly below zero; treat that like
    # zero instead of letting sqrt() raise ValueError.
    if variance_product <= 0:
        return 0
    return num / sqrt(variance_product)
推荐物品
为Toby推荐:
计算所有用户与Toby的相似度(sim_distance,sim_pearson)
def getRecommendations(prefs, person, similarity=sim_pearson):
    """Rank the items ``person`` has not rated by a predicted score.

    Each other person's ratings are weighted by that person's similarity
    to ``person``; an item's predicted score is the weighted rating total
    divided by the sum of the similarities that contributed to it.

    Args:
        prefs: Mapping of person -> {item: numeric rating}.
        person: Key in ``prefs`` of the person to recommend for.
        similarity: Callable invoked as ``similarity(prefs, person, other)``
            that returns a numeric similarity score.

    Returns:
        A list of ``(predicted_score, item)`` tuples, highest score first.
    """
    weighted_totals = {}
    similarity_sums = {}

    for other in prefs:
        # Never compare the person with themselves.
        if other == person:
            continue

        score = similarity(prefs, person, other)
        # People with zero or negative similarity contribute nothing.
        if score <= 0:
            continue

        for item in prefs[other]:
            # Skip items the person has already rated with a non-zero value.
            if item in prefs[person] and prefs[person][item] != 0:
                continue
            weighted_totals[item] = (
                weighted_totals.get(item, 0) + prefs[other][item] * score
            )
            similarity_sums[item] = similarity_sums.get(item, 0) + score

    # Normalise each total by its similarity sum, then order from the
    # highest predicted score to the lowest.
    ranked = sorted(
        (total / similarity_sums[item], item)
        for item, total in weighted_totals.items()
    )
    ranked.reverse()
    return ranked
| 评论者 | 相似度 | Night | sim * Night | Lady | sim * Lady | Luck | sim * Luck |
| --- | --- | --- | --- | --- | --- | --- | --- |
| Rose | 0.99 | 3.0 | 0.99 * 3.0 | 2.5 | 0.99 * 2.5 | 3.0 | 0.99 * 3.0 |
| Seymour | 0.38 | 3.0 | 0.38 * 3.0 | 3.0 | 0.38 * 3.0 | 1.5 | 0.38 * 1.5 |
| Puig | 0.89 | 4.5 | 0.89 * 4.5 | - | - | 3.0 | 0.89 * 3.0 |
| LaSalle | 0.92 | 3.0 | 0.92 * 3.0 | 3.0 | 0.92 * 3.0 | 2.0 | 0.92 * 2.0 |
| Matthews | 0.66 | 3.0 | 0.66 * 3.0 | 3.0 | 0.66 * 3.0 | - | - |
| 总计 | | | 12.89 | | 8.38 | | 8.07 |
| 相似度总计 | | | 0.99+0.38+0.89+0.92+0.66=3.84 | | 0.99+0.38+0.92+0.66=2.95 | | 0.99+0.38+0.89+0.92=3.18 |
| 总计/相似度总计 | | | 3.35 | | 2.83 | | 2.53 |
三、基于物品的过滤
基于用户的协作型过滤,要求我们使用来自每一位用户的全部评分来构建数据集。这种方法对于数量以千计的用户或物品规模或许没有问题,但是对于拥有上百万客户和商品的大型网站而言,将一个用户与其他所有用户进行比较,然后再对每位用户评过分的商品进行比较,其速度可能是无法忍受的。同样,一个商品销售量为数百万的网站,用户在偏好方面彼此间也许很少会有重叠,这可能会令用户相似性判断变得十分困难。
在拥有大量数据集的情况下,基于物品的协作型过滤能够更好的得出结论,而且允许我们将大量计算任务预先执行,从而