基于python的用户画像系统设计与实现

实验要求

  • 编程语言:python
  • 爬取内容:爬取用户基础属性和UP主属性
  • 数据存储:mysql
  • 生成用户画像和UP主的画像
  • flask前端展示人员画像

爬虫

  • 爬取哔哩哔哩用户收藏夹内容和关注列表,代码如下:

def _escape_single_quoted(value):
    """Return *value* as text safe to embed in a single-quoted MySQL literal."""
    # Backslashes are doubled first so the quote doubling below cannot be
    # undone by a backslash that precedes a quote in the scraped text.
    return str(value).replace('\\', '\\\\').replace("'", "''")


def shoucangjia(cookie, mysql_handler, uid):
    """Crawl the favourites folders of a Bilibili user and store their items.

    For every folder id yielded by ``get_media_id(cookie, uid)`` the first
    page (``pn=1``, ``ps=20``) of the folder listing is requested.  For each
    returned item the title, intro and uploader id are inserted into the
    ``bilibili_shoucang`` table through ``mysql_handler.insert_data`` and the
    intro is appended to ``./data/收藏夹.txt``.

    Args:
        cookie: Value sent verbatim as the ``cookie`` request header.
        mysql_handler: Object exposing ``insert_data(sql)``.
        uid: Id of the user whose favourites are crawled.

    Returns:
        None.
    """
    # Imported locally, like ``requests``, so the function is self-contained.
    import time

    import requests

    media_ids = get_media_id(cookie, uid)
    for media_id in media_ids:
        url = f"https://api.bilibili.com/x/v3/fav/resource/list?media_id={media_id}&pn=1&ps=20&keyword=&order=mtime&type=0&tid=0&platform=web&jsonp=jsonp"

        payload = {}
        headers = {
            'authority': 'api.bilibili.com',
            'accept': 'application/json, text/plain, */*',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cookie': cookie,
            'origin': 'https://space.bilibili.com',
            'referer': f'https://space.bilibili.com/{uid}/favlist',
            'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-site',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'
        }

        # NOTE(review): no timeout is passed, so a stalled connection blocks here.
        response = requests.request("GET", url, headers=headers, data=payload)
        # Pause between folder requests.
        time.sleep(2)
        print(response.text)

        # ``data`` or ``medias`` can be null/absent (presumably for an empty or
        # inaccessible folder - confirm against the API); treat that as
        # "no items" instead of raising TypeError while iterating None.
        data = response.json().get('data') or {}
        for it in data.get('medias') or []:
            title = it['title']
            intro = it['intro']
            mid = it['upper']['mid']
            print('收藏夹中内容是')
            print('title: {}'.format(title))
            print('intro: {}'.format(intro))
            print('mid: {}'.format(mid))
            # The statement is assembled as text because ``insert_data`` is
            # only known to accept a finished SQL string, so scraped values
            # are escaped before they are placed between the quotes.
            # NOTE(review): prefer bound parameters if the handler supports them.
            insert_sql = "INSERT INTO bilibili_shoucang (title,intro,uid,mid) VALUES( '%s','%s','%s','%s');" % (
                _escape_single_quoted(title),
                _escape_single_quoted(intro),
                _escape_single_quoted(uid),
                _escape_single_quoted(mid),
            )
            print("insert_sql: {}".format(insert_sql))
            mysql_handler.insert_data(insert_sql)
            with open('./data/收藏夹.txt', 'a', encoding='utf8') as f:
                f.write(intro + '\n')
def _escape_double_quoted(value):
    """Return *value* as text safe to embed in a double-quoted MySQL literal."""
    # Backslashes are doubled first so the quote doubling below cannot be
    # undone by a backslash that precedes a quote in the scraped text.
    return str(value).replace('\\', '\\\\').replace('"', '""')


def guanzhu(cookie, mysql_handler, uid):
    """Crawl the first page of a Bilibili user's following list and store it.

    The endpoint is called with ``callback=__jp13``, so the body is the JSON
    payload wrapped in ``__jp13(...)``.  For every followed account the name,
    signature and id are inserted into the ``bilibili_guanzhu`` table through
    ``mysql_handler.insert_data`` and the signature is appended to
    ``./data/关注.txt``.  A body without the wrapper is ignored.

    Args:
        cookie: Value sent verbatim as the ``cookie`` request header.
        mysql_handler: Object exposing ``insert_data(sql)``.
        uid: Id of the user whose following list is crawled.

    Returns:
        None.
    """
    # Imported locally, like ``requests``, so the function is self-contained.
    import json
    import re
    import time

    import requests

    url = f"https://api.bilibili.com/x/relation/followings?vmid={uid}&pn=1&ps=20&order=desc&order_type=attention&jsonp=jsonp&callback=__jp13"

    payload = {}
    headers = {
        'authority': 'api.bilibili.com',
        'accept': '*/*',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': cookie,
        'referer': f'https://space.bilibili.com/{uid}/fans/follow',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'script',
        'sec-fetch-mode': 'no-cors',
        'sec-fetch-site': 'same-site',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'
    }

    # NOTE(review): no timeout is passed, so a stalled connection blocks here.
    response = requests.request("GET", url, headers=headers, data=payload)
    time.sleep(2)
    print(response.text)

    # Raw string avoids the invalid "\(" escape; the greedy, DOTALL match
    # takes everything up to the LAST ")" so a signature containing "}})"
    # can no longer truncate the payload.
    ob = re.search(r'__jp13\((.*)\)', response.text, re.S)
    if not ob:
        return

    json_da = json.loads(ob.group(1))
    # A response without a usable ``data.list`` is treated as an empty list
    # instead of raising on a missing key or a null value.
    followings = (json_da.get('data') or {}).get('list') or []
    for it in followings:
        uname = it['uname']
        sign = it['sign']
        mid = it['mid']
        print('关注人员有')
        print('uname:{}'.format(uname))
        print('sign:{}'.format(sign))
        print('mid:{}'.format(mid))
        # The statement is assembled as text because ``insert_data`` is only
        # known to accept a finished SQL string, so scraped values are
        # escaped before they are placed between the quotes.
        # NOTE(review): prefer bound parameters if the handler supports them.
        insert_sql = 'INSERT INTO bilibili_guanzhu(uname,sign,uid,mid) VALUES( "%s","%s","%s","%s");' % (
            _escape_double_quoted(uname),
            _escape_double_quoted(sign),
            _escape_double_quoted(uid),
            _escape_double_quoted(mid),
        )
        print("insert_sql: {}".format(insert_sql))
        mysql_handler.insert_data(insert_sql)
        with open('./data/关注.txt', 'a', encoding='utf8') as f:
            f.write(sign + '\n')


  • 我们通过分析视频标题得到用户的偏好
  • 我们再通过关注列表的用户首页视频数据得到用户偏好
  • 通过这些偏好来确定用户的爱好属性,最后通过这些属性来生成用户画像。此外,还需要爬取UP主的主页视频内容来分析UP主的画像,代码如下:
def get_guanzhushufansshu(cookie, mid):
    """Return the following count and follower count of account *mid*.

    Queries the relation ``stat`` endpoint and reads ``following`` and
    ``follower`` from the ``data`` object of the JSON response, printing
    each value as it is read.

    Args:
        cookie: Value sent verbatim as the ``cookie`` request header.
        mid: Id of the account to look up.

    Returns:
        A ``(following, follower)`` tuple taken from the response.
    """
    import requests

    api_url = f"https://api.bilibili.com/x/relation/stat?vmid={mid}&jsonp=jsonp"
    request_headers = {
        'authority': 'api.bilibili.com',
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': cookie,
        'origin': 'https://space.bilibili.com',
        'referer': f'https://space.bilibili.com/{mid}',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'
    }

    resp = requests.request("GET", api_url, headers=request_headers, data={})

    # Decode the body once and read both counters from the same object.
    stat = resp.json()['data']
    following_count = stat['following']
    print('关注数:{}'.format(following_count))
    follower_count = stat['follower']
    print('粉丝数:{}'.format(follower_count))
    return following_count, follower_count
def gethuozanshubofnagshuyuedushu(cookie, mid):
    """Return the like, video-view and article-view totals of uploader *mid*.

    Queries the space ``upstat`` endpoint and reads ``likes``,
    ``archive.view`` and ``article.view`` from the ``data`` object of the
    JSON response, printing each value as it is read.

    Args:
        cookie: Value sent verbatim as the ``cookie`` request header.
        mid: Id of the uploader to look up.

    Returns:
        A ``(likes, archive_views, article_views)`` tuple.
    """
    import requests

    url = f"https://api.bilibili.com/x/space/upstat?mid={mid}&jsonp=jsonp"

    payload = {}
    headers = {
        'authority': 'api.bilibili.com',
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': cookie,
        'origin': 'https://space.bilibili.com',
        'referer': f'https://space.bilibili.com/{mid}',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'
    }

    response = requests.request("GET", url, headers=headers, data=payload)

    # Decode the body once instead of re-parsing it for every field.
    data = response.json()['data']
    huozanshu = data['likes']
    print('获赞数:{}'.format(huozanshu))
    bofangshu = data['archive']['view']
    print('播放数:{}'.format(bofangshu))
    yuedushu = data['article']['view']
    # This value is the article read count; it was previously printed under
    # the "likes" label copied from the first print.
    print('阅读数:{}'.format(yuedushu))
    return huozanshu, bofangshu, yuedushu
def get_up_info(cookie, mid):
    """Return the titles and descriptions of an uploader's listed videos as text.

    Requests the first page (``pn=1``, ``ps=30``, ``order=pubdate``) of the
    video list of uploader *mid*, prints the raw response plus every
    description and title, and concatenates one ``"title,description"``
    fragment per video with no separator between consecutive videos.

    Args:
        cookie: Value sent verbatim as the ``cookie`` request header.
        mid: Id of the uploader whose videos are listed.

    Returns:
        The concatenated text; an empty string when the list is empty.
    """
    import requests

    api_url = f"https://api.bilibili.com/x/space/arc/search?mid={mid}&ps=30&tid=0&pn=1&keyword=&order=pubdate&jsonp=jsonp"
    request_headers = {
        'authority': 'api.bilibili.com',
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': cookie,
        'origin': 'https://space.bilibili.com',
        'referer': f'https://space.bilibili.com/{mid}/video',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'
    }

    resp = requests.request("GET", api_url, headers=request_headers, data={})
    print(resp.text)

    # Gather one fragment per video and join them at the end.
    fragments = []
    for video in resp.json()['data']['list']['vlist']:
        video_description = video['description']
        video_title = video['title']
        print('description: {}'.format(video_description))
        print('title: {}'.format(video_title))
        fragments.append('{},{}'.format(video_title, video_description))
    return ''.join(fragments)


  • 最后需要将用户画像与UP主的画像做相似度判断,画像如下:
    基于python的用户画像系统设计与实现_第1张图片
    基于python的用户画像系统设计与实现_第2张图片

  • 最后就是前后端和数据库的功能展示了,如下:
    基于python的用户画像系统设计与实现_第3张图片
    基于python的用户画像系统设计与实现_第4张图片
    基于python的用户画像系统设计与实现_第5张图片

最后

本文的目的只有一个,就是学习更多知识。如果有人利用本文技术进行非法商业活动并从中获利,由此带来的法律责任均由操作者自己承担,与本文及作者无关。对本文有任何疑问,可以加微信(+v)讨论:zgffzgffzgff。

你可能感兴趣的:(python,毕业设计,python,毕业设计)