php协同过滤推荐算法【附部分源码】

算法核心的公式如下:

1.余弦相似度(求邻居):

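$$\cos(a,b)=\frac{\sum_{i} r_{a,i}\,r_{b,i}}{\sqrt{\sum_{i} r_{a,i}^{2}}\;\sqrt{\sum_{i} r_{b,i}^{2}}}$$

其中 $r_{a,i}$ 表示用户a对物品i的喜爱度(即下文的hobby_num),$\cos(a,b)$ 越大表示用户a与用户b越相似。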

2.预测公式(预测a可能会喜欢哪种物品):

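$$\mathrm{predict}(a,i)=\frac{\sum_{u\in N(a)}\cos(a,u)\,r_{u,i}}{\sqrt{\sum_{u\in N(a)}\cos(a,u)}}$$

其中 $N(a)$ 为与用户a最相似的若干个邻居(下文代码取前3个),$r_{u,i}$ 为邻居u对物品i的喜爱度。(原公式图片缺失,此处按下文代码的实际计算方式给出。)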

这里是参考帖子的公式,说实话看这个公式看的有点懵

另外一个比较好解释的帖子请看这里的公式和分析过程:https://www.cnblogs.com/dsgcBlogs/p/8619566.html

看完以上内容开始进入正题【基于用户相似协同过滤推荐】

  • 获取所有用户对物品的关注数矩阵

我们在项目的多个接口(例如:获取物品详情、关注物品等)被调用时记录用户行为,每次行为计1个单位;也可以为不同的行为设置不同的权重,例如:关注+5、收藏+3等等。hobby_num代表喜爱度。

php协同过滤推荐算法【附部分源码】_第1张图片

从数据库查询出数据,构成二维数组,并循环生成以下数组(为方便展示,已经转为json格式,php程序内无须转为json;1级键为用户id索引):

{
    "1": [
        {
            "user_id": "1",
            "second_id": "1",
            "ranking_name": "2018年最受欢迎涂抹式面膜",
            "hobby_num": "0"
        },
        {
            "user_id": "1",
            "second_id": "2",
            "ranking_name": "2018年最热销猫粮",
            "hobby_num": "0"
        },
        {
            "user_id": "1",
            "second_id": "3",
            "ranking_name": "2018年最热销的天然猫粮",
            "hobby_num": "0"
        },
        {
            "user_id": "1",
            "second_id": "4",
            "ranking_name": "2018年性价比高又耐养的宠物",
            "hobby_num": "0"
        },
        {
            "user_id": "1",
            "second_id": "5",
            "ranking_name": "2018年最长寿的宠物",
            "hobby_num": "0"
        },
        {
            "user_id": "1",
            "second_id": "6",
            "ranking_name": "2018年最火爆的理科专业",
            "hobby_num": "0"
        },
        {
            "user_id": "1",
            "second_id": "7",
            "ranking_name": "2018年美国医科学校",
            "hobby_num": "0"
        },
        {
            "user_id": "1",
            "second_id": "8",
            "ranking_name": "2018年最受女生喜欢的热门专业",
            "hobby_num": "0"
        },
        {
            "user_id": "1",
            "second_id": "9",
            "ranking_name": "2018年最热门的小语种",
            "hobby_num": "0"
        },
        {
            "user_id": "1",
            "second_id": "10",
            "ranking_name": "2018年最高人气的英语教育机构",
            "hobby_num": "0"
        }
    ]
}
  • 计算余弦相似度cos并从大到小排序【不计算为0的值,0代表没有关注度,$not_hobby代表被推荐用户未观看过的内容列表,也就是hobby_num=0的内容】
        // User-based collaborative filtering for one target user.
        //
        // Inputs (defined outside this fragment):
        //   $data    - $data[<user id>][<item index>] = row containing 'hobby_num'
        //   $user_id - id of the user to build recommendations for
        // Outputs:
        //   $cos       - cosine similarity per other user id, sorted descending
        //   $lingju    - up to 3 nearest neighbours: user_id, cos, hobby
        //   $not_hobby - the target user's items with hobby_num == 0, each with
        //                a 'predict' score, sorted by that score descending
        // NOTE(review): assumes every user's item list uses the same item
        // indexes in the same order - confirm against the query that builds $data.

        // Initialise every working array so the sort/slice calls below never
        // receive an undefined variable when nothing matches.
        $cos = [];
        $lingju = [];
        $not_hobby = [];
        $volume = [];

        // Rows of the target user; empty when the user has no entry in $data.
        $target = isset($data[$user_id]) ? $data[$user_id] : [];

        // Norm of the target user's vector (zero entries add nothing).
        $fm = 0;
        foreach ($target as $item) {
            $fm += $item['hobby_num'] * $item['hobby_num'];
        }
        $fm = sqrt($fm);

        // Items the target user has not interacted with, keyed by item index.
        foreach ($target as $k => $j) {
            if ($j['hobby_num'] == 0) {
                $not_hobby[$k] = $j;
            }
        }

        // Cosine similarity between the target user and every other user.
        foreach ($data as $key => $item) {
            if ($key == $user_id) {
                continue;
            }

            $fz = 0;   // dot product with the target user's vector
            $fm2 = 0;  // squared norm of this user's vector
            foreach ($item as $i => $j) {
                // A missing row on the target side counts as "no interest".
                $mine = isset($target[$i]['hobby_num']) ? $target[$i]['hobby_num'] : 0;

                $fz += $mine * $j['hobby_num'];
                $fm2 += $j['hobby_num'] * $j['hobby_num'];
            }
            $fm2 = sqrt($fm2);

            // Users with no overlap (or an all-zero vector) are not neighbours.
            if ($fz != 0 && $fm != 0 && $fm2 != 0) {
                $cos[$key] = $fz / $fm / $fm2;
            }
        }

        // Most similar users first.
        arsort($cos);

        foreach ($cos as $key => $item) {
            $lingju[] = ['user_id' => $key, 'cos' => $item];
        }

        // Keep the 3 nearest neighbours.
        $lingju = array_slice($lingju, 0, 3);

        // Attach each neighbour's rows for the unseen items. They are keyed by
        // the item index $k because the prediction loop looks them up by $k;
        // appending with [] would renumber them from 0 and misalign the items.
        foreach ($lingju as &$i) {
            $i['hobby'] = [];
            foreach ($not_hobby as $k => $j) {
                if (isset($data[$i['user_id']][$k])) {
                    $i['hobby'][$k] = $data[$i['user_id']][$k];
                }
            }
        }
        unset($i); // drop the reference so later writes to $i cannot touch $lingju

        // The denominator depends only on the neighbours, so compute it once.
        $predict_fm = 0;
        foreach ($lingju as $item) {
            $predict_fm += $item['cos'];
        }
        // Stays 0 when there are no neighbours, which avoids a division by zero.
        $predict_scale = $predict_fm > 0 ? sqrt($predict_fm) : 0;

        // Predicted interest: similarity-weighted sum of the neighbours' values.
        foreach ($not_hobby as $k => &$j) {
            $predict_fz = 0;

            foreach ($lingju as $item) {
                if (isset($item['hobby'][$k])) {
                    $predict_fz += $item['cos'] * $item['hobby'][$k]['hobby_num'];
                }
            }

            $j['predict'] = $predict_scale > 0 ? $predict_fz / $predict_scale : 0;
        }
        unset($j); // drop the reference before $not_hobby is iterated and sorted

        // Sort column for array_multisort.
        foreach ($not_hobby as $key => $row) {
            $volume[$key] = $row['predict'];
        }

        // Highest prediction first; integer keys are re-indexed by the sort.
        if (!empty($not_hobby)) {
            array_multisort($volume, SORT_DESC, $not_hobby);
        }

键为用户id【理论上越接近1则两个用户越相似,此处看你们的机制,因为我们项目的有分领域,并设置测试每个领域有系统设定好的机器人,比如以下1为关注游戏领域的机器人,测试时点击游戏内容则与用户id=1的用户较为相似,cos值为0.42007393952061,可作为邻居】

[
    {
        "user_id": 1,
        "cos": 0.42007393952061
    },
    {
        "user_id": 12,
        "cos": 0.22096682734453
    },
    {
        "user_id": 5,
        "cos": 0.099333258551861
    },
    {
        "user_id": 8,
        "cos": 0.050731001802594
    },
    {
        "user_id": 17,
        "cos": 0.022953657440244
    }
]

以下为对用户(id=389)未关注的内容计算predict值,并按predict值从大到小排序后得到的推荐结果:

[
    {
        "user_id": "389",
        "second_id": "226",
        "ranking_name": "100元以内最具性价比的出租屋神器",
        "hobby_num": "0",
        "predict": 9.2758507820205
    },
    {
        "user_id": "389",
        "second_id": "46",
        "ranking_name": "创造101最高人气的成员",
        "hobby_num": "0",
        "predict": 6.3466347455929
    },
    {
        "user_id": "389",
        "second_id": "135",
        "ranking_name": "曾经红极一时的最经典耽美影视作品",
        "hobby_num": "0",
        "predict": 2.8248421075833
    },
    {
        "user_id": "389",
        "second_id": "71",
        "ranking_name": "不论何时回顾,都会感动落泪的动漫情节",
        "hobby_num": "0",
        "predict": 2.3112344516591
    },
    {
        "user_id": "389",
        "second_id": "18",
        "ranking_name": "2018年最让人心动的动漫女主",
        "hobby_num": "0",
        "predict": 2.054430623697
    },
    {
        "user_id": "389",
        "second_id": "20",
        "ranking_name": "日本恐怖漫画",
        "hobby_num": "0",
        "predict": 1.7976267957348
    },
    {
        "user_id": "389",
        "second_id": "27",
        "ranking_name": "亚洲最火爆的动漫",
        "hobby_num": "0",
        "predict": 1.7976267957348
    },
    {
        "user_id": "389",
        "second_id": "73",
        "ranking_name": "国产动画片排行榜",
        "hobby_num": "0",
        "predict": 1.7976267957348
    },
    {
        "user_id": "389",
        "second_id": "50",
        "ranking_name": "2018最佳BL漫画",
        "hobby_num": "0",
        "predict": 1.5408229677727
    },
    {
        "user_id": "389",
        "second_id": "19",
        "ranking_name": "2018年日本耽美漫画",
        "hobby_num": "0",
        "predict": 0.77041148388636
    },
    {
        "user_id": "389",
        "second_id": "33",
        "ranking_name": "80后最爱看的动漫",
        "hobby_num": "0",
        "predict": 0.77041148388636
    },
    {
        "user_id": "389",
        "second_id": "74",
        "ranking_name": "日本最催泪虐心动漫",
        "hobby_num": "0",
        "predict": 0.77041148388636
    },
    {
        "user_id": "389",
        "second_id": "221",
        "ranking_name": "深圳福田区公寓-梅林&景田&车公庙&新洲",
        "hobby_num": "0",
        "predict": 0.57721698199284
    },
    {
        "user_id": "389",
        "second_id": "153",
        "ranking_name": "饭可以不吃但不能不玩的最经典单机游戏",
        "hobby_num": "0",
        "predict": 0.46177358559427
    },
    {
        "user_id": "389",
        "second_id": "44",
        "ranking_name": "中国区-2018年绝地求生最受欢迎的职业战队",
        "hobby_num": "0",
        "predict": 0.3463301891957
    }
]

此处做法是按照从大到小的预测值排列并推荐。当然了predict值比较小的内容是否推荐,看项目而定,一般predict值小代表和用户不是非常的贴合,也可能不一定是用户喜欢的内容,但一般项目内可能感兴趣的榜单,就推荐几个,一般来说就够了

php协同过滤推荐算法【附部分源码】_第2张图片

当然我们目前项目算法还在优化,存在以下问题

  • 用户冷启动问题,无从推荐【解决办法:热度排名(如果是新闻系统还要+时间衰减+人工干预等)、比如阿里的推荐系统就从各种平台(淘宝、饿了么、支付宝)记录内容,可以解决冷启动】

参考:https://blog.csdn.net/dingsongtao/article/details/72160096

你可能感兴趣的:(php,算法)