基于用户的最近邻算法(User-Based Neighbor Algorithms),是一种非概率性的协同过滤算法,也是推荐系统中最最古老,最著名的算法,我们称那些兴趣相似的用户为邻居,如果用户n相似于用户u,我们就说n是u的一个邻居。起初算法,对于未知目标的预测是根据该用户的相似用户的评分作出预测的
本文中以电影推荐为例:
假设有7个人像向你分别推荐了电影,你是其中的小明,你自己也看过一些电影,你自己看过的不想看,只想看没有看过的,且推荐度高,评分高的电影,示例数据如下:
{
"小明":{
"中国合伙人":5,
"太平轮":3,
"荒野猎人":4.5,
"老炮儿":5,
"我的少女时代":3,
"肖洛特烦恼":4.5,
"火星救援":5
},
"小红":{
"小时代4":4,
"荒野猎人":3,
"我的少女时代":5,
"肖洛特烦恼":5,
"火星救援":3,
"后会无期":3
},
"小阳":{
"小时代4":2,
"中国合伙人":5,
"我的少女时代":3,
"老炮儿":5,
"肖洛特烦恼":4.5,
"速度与激情7":5
},
"小四":{
"小时代4":5,
"中国合伙人":3,
"我的少女时代":4,
"匆匆那年":4,
"速度与激情7":3.5,
"火星救援":3.5,
"后会无期":4.5
},
"六爷":{
"小时代4":2,
"中国合伙人":4,
"荒野猎人":4.5,
"老炮儿":5,
"我的少女时代":2
},
"小李":{
"荒野猎人":5,
"盗梦空间":5,
"我的少女时代":3,
"速度与激情7":5,
"蚁人":4.5,
"老炮儿":4,
"后会无期":3.5
},
"隔壁老王":{
"荒野猎人":5,
"中国合伙人":4,
"我的少女时代":1,
"Phoenix":5,
"甄嬛传":4,
"The Strokes":5
},
"邻村小芳":{
"小时代4":4,
"我的少女时代":4.5,
"匆匆那年":4.5,
"甄嬛传":2.5,
"The Strokes":3
}
}
这里我们需要用到一个公式:皮尔逊公式
假设有两个变量X、Y,那么两变量间的皮尔逊相关系数可通过以下公式计算:
皮尔逊相关系数计算公式
皮尔逊相关系数计算公式
皮尔逊相关系数计算公式
皮尔逊相关系数计算公式
以上列出的四个公式等价,其中E是数学期望,cov表示协方差,N表示变量取值的个数
皮尔逊算法过于复杂,如果要有点理解的话,可以使用把维数降到二维,这样就跟余弦定理有点相识了,相识度就是两条直线的夹角,角度为0的时候,皮尔逊的值为1,就是最相识的,如果角度为180度,代表两个牛马不相干,皮尔逊的值为-1。
皮尔逊相关系数是比欧几里德距离更加复杂的可以判断人们兴趣的相似度的一种方法。该相关系数是判断两组数据与某一直线拟合程序的一种试题。它在数据不是很规范的时候,会倾向于给出更好的结果。
如图,Mick Lasalle为<
此处我以python代码为例向大家展示,文末附上java代码的实现
# 构造一份打分数据集,可以去movielens下载真实的数据做实验
users = {"小明": {"中国合伙人": 5.0, "太平轮": 3.0, "荒野猎人": 4.5, "老炮儿": 5.0, "我的少女时代": 3.0, "肖洛特烦恼": 4.5, "火星救援": 5.0},
"小红": {"小时代4": 4.0, "荒野猎人": 3.0, "我的少女时代": 5.0, "肖洛特烦恼": 5.0, "火星救援": 3.0, "后会无期": 3.0},
"小阳": {"小时代4": 2.0, "中国合伙人": 5.0, "我的少女时代": 3.0, "老炮儿": 5.0, "肖洛特烦恼": 4.5, "速度与激情7": 5.0},
"小四": {"小时代4": 5.0, "中国合伙人": 3.0, "我的少女时代": 4.0, "匆匆那年": 4.0, "速度与激情7": 3.5, "火星救援": 3.5, "后会无期": 4.5},
"六爷": {"小时代4": 2.0, "中国合伙人": 4.0, "荒野猎人": 4.5, "老炮儿": 5.0, "我的少女时代": 2.0},
"小李": {"荒野猎人": 5.0, "盗梦空间": 5.0, "我的少女时代": 3.0, "速度与激情7": 5.0, "蚁人": 4.5, "老炮儿": 4.0, "后会无期": 3.5},
"隔壁老王": {"荒野猎人": 5.0, "中国合伙人": 4.0, "我的少女时代": 1.0, "Phoenix": 5.0, "甄嬛传": 4.0, "The Strokes": 5.0},
"邻村小芳": {"小时代4": 4.0, "我的少女时代": 4.5, "匆匆那年": 4.5, "甄嬛传": 2.5, "The Strokes": 3.0}
}
# 定义几种距离计算函数
# 更高效的方式为把得分向量化之后使用scipy中定义的distance方法
from math import sqrt
def pearson_dis(rating1, rating2):
"""计算2个打分序列间的pearson距离. 输入的rating1和rating2都是打分dict
格式为{'小时代4': 1.0, '疯狂动物城': 5.0}"""
sum_xy = 0
sum_x = 0
sum_y = 0
sum_x2 = 0
sum_y2 = 0
n = 0
for key in rating1:
if key in rating2:
n += 1
x = rating1[key]
y = rating2[key]
sum_xy += x * y
sum_x += x
sum_y += y
sum_x2 += pow(x, 2)
sum_y2 += pow(y, 2)
# now compute denominator
denominator = sqrt(sum_x2 - pow(sum_x, 2) / n) * sqrt(sum_y2 - pow(sum_y, 2) / n)
if denominator == 0:
return 0
else:
return (sum_xy - (sum_x * sum_y) / n) / denominator
# 查找最近邻
def computeNearestNeighbor(username, users):
"""在给定username的情况下,计算其他用户和它的距离并排序"""
print("查找最近邻")
distances = []
for user in users:
if user != username:
# distance = manhattan_dis(users[user], users[username])
distance = pearson_dis(users[user], users[username])
distances.append((distance, user))
# 根据距离排序,距离越近,排得越靠前
distances.sort()
print("distances => ", distances)
return distances
# 推荐
def recommend(username, users):
print("输入=》", username)
"""对指定的user推荐电影"""
# 找到最近邻
nearest = computeNearestNeighbor(username, users)[0][1]
recommendations = []
# 找到最近邻看过,但是我们没看过的电影,计算推荐
neighborRatings = users[nearest]
print("nearest -> ", nearest)
print("neighborRatings -> ", neighborRatings)
userRatings = users[username]
print("userRatings -> ", userRatings)
for artist in neighborRatings:
if not artist in userRatings:
recommendations.append((artist, neighborRatings[artist]))
print("recommendations -> ", recommendations)
results = sorted(recommendations, key=lambda artistTuple: artistTuple[1], reverse=True)
for result in results:
print(result[0], result[1])
if __name__ == "__main__":
recommend('小明', users)
import java.util.ArrayList;
import java.util.List;
/**
* Created by ccwant on 2018-12-14.
*/
public class UserSet {
public List users = new ArrayList<>();
public UserSet() {
}
public User put(String username) {
return new User(username);
}
public User getUser(int position) {
return users.get(position);
}
public User getUser(String username) {
for (User user : users) {
if (user.username.equals(username)) {
return user;
}
}
return null;
}
public final class User {
public String username;
public List list = new ArrayList<>();
private User(String username) {
this.username = username;
}
public User set(String username, int score) {
this.list.add(new Set(username, score));
return this;
}
public void create() {
users.add(this);
}
public Set find(String username) {
for (Set set : list) {
if (set.username.equals(username)) {
return set;
}
}
return null;
}
@Override
public String toString() {
return "User{" +
"username='" + username + '\'' +
'}';
}
}
public final class Set implements Comparable {
public String username;
public int score;
public Set(String username, int score) {
this.username = username;
this.score = score;
}
@Override
public String toString() {
return "Set{" +
"username='" + username + '\'' +
", score=" + score +
'}';
}
@Override
public int compareTo(Set o) {
return score > o.score ? -1 : 1;
}
}
}
import java.util.*;
/**
* Created by ccwant on 2018-12-14.
*/
public class Recommend {
/**
* 在给定username的情况下,计算其他用户和它的距离并排序
* @param username
* @param set
* @return
*/
private Map computeNearestNeighbor(String username, UserSet set) {
Map distances = new TreeMap<>();
UserSet.User u1 = set.getUser(username);
for (int i = 0; i < set.users.size(); i++) {
UserSet.User u2 = set.getUser(i);
if (!u2.username.equals(username)) {
double distance = pearson_dis(u2.list, u1.list);
distances.put(distance, u2.username);
}
}
System.out.println("distance => " + distances);
return distances;
}
/**
* 计算2个打分序列间的pearson距离
*
* @param rating1
* @param rating2
* @return
*/
private double pearson_dis(List rating1, List rating2) {
int sum_xy = 0;
int sum_x = 0;
int sum_y = 0;
double sum_x2 = 0;
double sum_y2 = 0;
int n = 0;
for (int i = 0; i < rating1.size(); i++) {
UserSet.Set key1 = rating1.get(i);
for (int j = 0; j < rating2.size(); j++) {
UserSet.Set key2 = rating2.get(j);
if (key1.username.equals(key2.username)) {
n += 1;
int x = key1.score;
int y = key2.score;
sum_xy += x * y;
sum_x += x;
sum_y += y;
sum_x2 += Math.pow(x, 2);
sum_y2 += Math.pow(y, 2);
}
}
}
double denominator = Math.sqrt(sum_x2 - Math.pow(sum_x, 2) / n) * Math.sqrt(sum_y2 - Math.pow(sum_y, 2) / n);
if (denominator == 0) {
return 0;
} else {
return (sum_xy - (sum_x * sum_y) / n) / denominator;
}
}
public List recommend(String username, UserSet set) {
//找到最近邻
Map distances = computeNearestNeighbor(username, set);
String nearest = distances.values().iterator().next();
System.out.println("nearest -> " + nearest);
List recommendations = new ArrayList<>();
//找到最近邻看过,但是我们没看过的电影,计算推荐
UserSet.User neighborRatings = set.getUser(nearest);
System.out.println("neighborRatings -> " + neighborRatings.list);
UserSet.User userRatings = set.getUser(username);
System.out.println("userRatings -> " + userRatings.list);
for (UserSet.Set artist : neighborRatings.list) {
if (userRatings.find(artist.username) == null) {
recommendations.add(artist);
}
}
Collections.sort(recommendations);
System.out.println("recommendations -> " + recommendations.toString());
return recommendations;
}
}
import sys.Recommend;
import sys.UserSet;
import java.util.*;
/**
* Created by ccwant on 2018-12-14.
*/
public class Demo {
public static void main(String[] args) {
//输入用户总量
UserSet userSet = new UserSet();
userSet.put("小明")
.set("中国合伙人", 50)
.set("太平轮", 30)
.set("荒野猎人", 45)
.set("老炮儿", 50)
.set("我的少女时代", 30)
.set("肖洛特烦恼", 45)
.set("火星救援", 50)
.create();
userSet.put("小红")
.set("小时代4", 40)
.set("荒野猎人", 30)
.set("我的少女时代", 50)
.set("肖洛特烦恼", 50)
.set("火星救援", 30)
.set("后会无期", 30)
.create();
userSet.put("小阳")
.set("小时代4", 20)
.set("中国合伙人", 50)
.set("我的少女时代", 30)
.set("老炮儿", 50)
.set("肖洛特烦恼", 45)
.set("速度与激情7", 50)
.create();
userSet.put("小四")
.set("小时代4", 50)
.set("中国合伙人", 30)
.set("我的少女时代", 40)
.set("匆匆那年", 40)
.set("速度与激情7", 35)
.set("火星救援", 35)
.set("后会无期", 45)
.create();
userSet.put("六爷")
.set("小时代4", 20)
.set("中国合伙人", 40)
.set("荒野猎人", 45)
.set("老炮儿", 50)
.set("我的少女时代", 20)
.create();
userSet.put("小李")
.set("荒野猎人", 50)
.set("盗梦空间", 50)
.set("我的少女时代", 30)
.set("速度与激情7", 50)
.set("蚁人", 45)
.set("老炮儿", 40)
.set("后会无期", 35)
.create();
userSet.put("隔壁老王")
.set("荒野猎人", 50)
.set("中国合伙人", 40)
.set("我的少女时代", 10)
.set("Phoenix", 50)
.set("甄嬛传", 40)
.set("The Strokes", 50)
.create();
userSet.put("邻村小芳")
.set("小时代4", 40)
.set("我的少女时代", 45)
.set("匆匆那年", 45)
.set("甄嬛传", 25)
.set("The Strokes", 30)
.create();
Recommend recommend = new Recommend();
List recommendations = recommend.recommend("小明", userSet);
System.out.println("-----------------------");
for (UserSet.Set set : recommendations) {
System.out.println(set.username+" "+set.score);
}
}
}