算法核心的公式如下:
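1.余弦相似度(求邻居):$$\cos(a,b)=\frac{\sum_{i} r_{a,i}\,r_{b,i}}{\sqrt{\sum_{i} r_{a,i}^{2}}\;\sqrt{\sum_{i} r_{b,i}^{2}}}$$ 其中 $r_{u,i}$ 表示用户 $u$ 对物品 $i$ 的喜爱度(即下文的 hobby_num)。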
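2.预测公式(预测a可能会喜欢哪种物品),按下文代码的实现写出即为:$$\mathrm{predict}(a,i)=\frac{\sum_{n\in N}\cos(a,n)\,r_{n,i}}{\sqrt{\sum_{n\in N}\cos(a,n)}}$$ 其中 $N$ 为与用户 $a$ 最相似的前 3 个邻居,$i$ 为用户 $a$ 尚未产生行为(hobby_num 为 0)的物品。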
这里是参考帖子的公式,说实话看这个公式看的有点懵
另外一个比较好解释的帖子请看这里的公式和分析过程:https://www.cnblogs.com/dsgcBlogs/p/8619566.html
看完以上内容开始进入正题【基于用户相似协同过滤推荐】
我们的项目在多个接口(例如:获取物品详情、关注物品等)被调用时记录用户行为,每次行为以 1 为基本单位,并且可以为不同的行为设置不同的权重,例如:关注+5、收藏+3 等等。hobby_num 代表喜爱度。
从数据库查询出数据后,循环组装成以下结构的二维数组(为方便展示,已经转为 json 格式,php 程序内无须转为 json;一级键为用户 id):
{
"1": [
{
"user_id": "1",
"second_id": "1",
"ranking_name": "2018年最受欢迎涂抹式面膜",
"hobby_num": "0"
},
{
"user_id": "1",
"second_id": "2",
"ranking_name": "2018年最热销猫粮",
"hobby_num": "0"
},
{
"user_id": "1",
"second_id": "3",
"ranking_name": "2018年最热销的天然猫粮",
"hobby_num": "0"
},
{
"user_id": "1",
"second_id": "4",
"ranking_name": "2018年性价比高又耐养的宠物",
"hobby_num": "0"
},
{
"user_id": "1",
"second_id": "5",
"ranking_name": "2018年最长寿的宠物",
"hobby_num": "0"
},
{
"user_id": "1",
"second_id": "6",
"ranking_name": "2018年最火爆的理科专业",
"hobby_num": "0"
},
{
"user_id": "1",
"second_id": "7",
"ranking_name": "2018年美国医科学校",
"hobby_num": "0"
},
{
"user_id": "1",
"second_id": "8",
"ranking_name": "2018年最受女生喜欢的热门专业",
"hobby_num": "0"
},
{
"user_id": "1",
"second_id": "9",
"ranking_name": "2018年最热门的小语种",
"hobby_num": "0"
},
{
"user_id": "1",
"second_id": "10",
"ranking_name": "2018年最高人气的英语教育机构",
"hobby_num": "0"
}
]}
// User-based collaborative filtering for a single target user.
//
// Inputs (must be defined before this snippet runs):
//   $data    - array keyed by user id; each value is a list of item rows that
//              contain at least 'hobby_num' (see the sample structure above).
//              NOTE(review): rows are matched across users by list index, so
//              this assumes every user's list holds the same items in the
//              same order - confirm against the query that builds $data.
//   $user_id - id of the user to build recommendations for.
//
// Outputs:
//   $lingju    - up to 3 nearest neighbours: ['user_id', 'cos', 'hobby'],
//                where 'hobby' is keyed by the same item key as $not_hobby.
//   $not_hobby - the target user's rows with hobby_num == 0, each extended
//                with a 'predict' score and sorted by that score, descending.

// Initialise every accumulator so the snippet still works when the target
// user has no data or no neighbour qualifies (otherwise arsort(),
// array_slice() and array_multisort() would receive null).
$cos = [];
$lingju = [];
$not_hobby = [];
$volume = [];

// Rows of the target user; empty when the user has no recorded behaviour.
$target = isset($data[$user_id]) ? $data[$user_id] : [];

// Denominator, part 1: Euclidean norm of the target user's preference vector.
$fm = 0;
foreach ($target as $item) {
    if ($item['hobby_num'] != 0) {
        $fm += $item['hobby_num'] * $item['hobby_num'];
    }
}
$fm = sqrt($fm);

foreach ($data as $key => $item) {
    // Loose comparison on purpose: array keys are ints while $user_id may be
    // a numeric string.
    if ($key != $user_id) {
        // Numerator (dot product) and the other user's squared norm.
        $fz = 0;
        $fm2 = 0;
        foreach ($item as $i => $j) {
            // A row missing on the target side counts as "no preference".
            $mine = isset($target[$i]['hobby_num']) ? $target[$i]['hobby_num'] : 0;
            if ($mine != 0 && $j['hobby_num'] != 0) {
                $fz += $mine * $j['hobby_num'];
            }
            if ($j['hobby_num'] != 0) {
                $fm2 += $j['hobby_num'] * $j['hobby_num'];
            }
        }
        $fm2 = sqrt($fm2);
        // Only users sharing at least one rated item get a cosine value.
        if ($fz != 0 && $fm != 0 && $fm2 != 0) {
            $cos[$key] = $fz / $fm / $fm2;
        }
    } else {
        // Collect the items the target user has not interacted with yet.
        foreach ($item as $k => $j) {
            if ($j['hobby_num'] == 0) {
                $not_hobby[$k] = $j;
            }
        }
    }
}

// Sort similarities from high to low, preserving the user-id keys.
arsort($cos);
foreach ($cos as $key => $item) {
    $lingju[] = ['user_id' => $key, 'cos' => $item];
}
// Keep the 3 most similar users as neighbours.
$lingju = array_slice($lingju, 0, 3);

// Attach each neighbour's rows for the candidate items. The rows are stored
// under the SAME key $k that $not_hobby uses, because the prediction loop
// below looks them up by $k; appending with [] would renumber them from 0
// and make the lookup hit the wrong item.
foreach ($lingju as &$neighbour) {
    $neighbour['hobby'] = [];
    foreach ($not_hobby as $k => $j) {
        if (isset($data[$neighbour['user_id']][$k])) {
            $neighbour['hobby'][$k] = $data[$neighbour['user_id']][$k];
        }
    }
}
// Break the reference so later writes cannot silently alter $lingju.
unset($neighbour);

// Compute the predict score of every candidate item.
// NOTE(review): the denominator is sqrt(sum of cos); the usual weighted
// average divides by the plain sum of similarities. Kept as-is so scores
// stay comparable with the sample output - confirm against the formula.
foreach ($not_hobby as $k => &$j) {
    $predict_fz = 0;
    $predict_fm = 0;
    foreach ($lingju as $item) {
        if (isset($item['hobby'][$k]) && $item['hobby'][$k]['hobby_num'] != 0) {
            $predict_fz += $item['cos'] * $item['hobby'][$k]['hobby_num'];
        }
        $predict_fm += $item['cos'];
    }
    // Without neighbours the denominator is 0; score the item 0 instead of
    // dividing by zero.
    $j['predict'] = $predict_fm > 0 ? $predict_fz / sqrt($predict_fm) : 0.0;
}
unset($j);

// Build the sort column for array_multisort().
foreach ($not_hobby as $key => $row) {
    $volume[$key] = $row['predict'];
}
// Order the candidates by predict score, highest first.
if (count($volume) > 0) {
    array_multisort($volume, SORT_DESC, $not_hobby);
}
以下为计算得到的邻居结果,键为用户id(即下面的 user_id)。【理论上 cos 值越接近1,两个用户越相似。具体如何使用要看你们自己的机制:我们的项目划分了领域,测试时为每个领域设置了系统设定好的机器人。比如以下 user_id=1 是关注游戏领域的机器人,测试时点击游戏内容后,当前用户就与用户id=1的用户较为相似,cos值为0.42007393952061,可作为邻居】
[
{
"user_id": 1,
"cos": 0.42007393952061
},
{
"user_id": 12,
"cos": 0.22096682734453
},
{
"user_id": 5,
"cos": 0.099333258551861
},
{
"user_id": 8,
"cos": 0.050731001802594
},
{
"user_id": 17,
"cos": 0.022953657440244
}
]
[
{
"user_id": "389",
"second_id": "226",
"ranking_name": "100元以内最具性价比的出租屋神器",
"hobby_num": "0",
"predict": 9.2758507820205
},
{
"user_id": "389",
"second_id": "46",
"ranking_name": "创造101最高人气的成员",
"hobby_num": "0",
"predict": 6.3466347455929
},
{
"user_id": "389",
"second_id": "135",
"ranking_name": "曾经红极一时的最经典耽美影视作品",
"hobby_num": "0",
"predict": 2.8248421075833
},
{
"user_id": "389",
"second_id": "71",
"ranking_name": "不论何时回顾,都会感动落泪的动漫情节",
"hobby_num": "0",
"predict": 2.3112344516591
},
{
"user_id": "389",
"second_id": "18",
"ranking_name": "2018年最让人心动的动漫女主",
"hobby_num": "0",
"predict": 2.054430623697
},
{
"user_id": "389",
"second_id": "20",
"ranking_name": "日本恐怖漫画",
"hobby_num": "0",
"predict": 1.7976267957348
},
{
"user_id": "389",
"second_id": "27",
"ranking_name": "亚洲最火爆的动漫",
"hobby_num": "0",
"predict": 1.7976267957348
},
{
"user_id": "389",
"second_id": "73",
"ranking_name": "国产动画片排行榜",
"hobby_num": "0",
"predict": 1.7976267957348
},
{
"user_id": "389",
"second_id": "50",
"ranking_name": "2018最佳BL漫画",
"hobby_num": "0",
"predict": 1.5408229677727
},
{
"user_id": "389",
"second_id": "19",
"ranking_name": "2018年日本耽美漫画",
"hobby_num": "0",
"predict": 0.77041148388636
},
{
"user_id": "389",
"second_id": "33",
"ranking_name": "80后最爱看的动漫",
"hobby_num": "0",
"predict": 0.77041148388636
},
{
"user_id": "389",
"second_id": "74",
"ranking_name": "日本最催泪虐心动漫",
"hobby_num": "0",
"predict": 0.77041148388636
},
{
"user_id": "389",
"second_id": "221",
"ranking_name": "深圳福田区公寓-梅林&景田&车公庙&新洲",
"hobby_num": "0",
"predict": 0.57721698199284
},
{
"user_id": "389",
"second_id": "153",
"ranking_name": "饭可以不吃但不能不玩的最经典单机游戏",
"hobby_num": "0",
"predict": 0.46177358559427
},
{
"user_id": "389",
"second_id": "44",
"ranking_name": "中国区-2018年绝地求生最受欢迎的职业战队",
"hobby_num": "0",
"predict": 0.3463301891957
}
]
此处的做法是按照预测值从大到小排列并推荐。当然,predict值比较小的内容是否推荐,要看项目而定:一般来说,predict值小代表该内容和用户不是非常贴合,不一定是用户喜欢的内容。对于项目内"可能感兴趣的榜单"这类场景,只推荐排在前面的几个,一般来说就够了。
当然我们目前项目算法还在优化,存在以下问题
参考:https://blog.csdn.net/dingsongtao/article/details/72160096