Scraping Bilibili danmaku (bullet comment) data with a Python crawler
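
The script below walks the full pipeline: resolve the video's cid from its av number, download the raw danmaku XML from the comment server, resolve each comment's sender hash into a numeric user id through a third-party lookup service, fetch that user's profile, and finally store everything in MongoDB.

It reads its MongoDB settings from a local config module (muyushuixin.config). A minimal sketch of what that module presumably contains — the names are the ones the script actually uses, the values are placeholders to adjust for your environment:

# muyushuixin/config.py -- assumed contents, values are placeholders
MONGO_URI = 'localhost'      # MongoDB connection URI
MONGO_DB = 'bilibili'        # database name
MONGO_TABLE = 'danmaku'      # collection name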

import json
import re
import pymongo
import requests
from bs4 import BeautifulSoup
from muyushuixin.config import *

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
                         ' AppleWebKit/537.36 (KHTML, like Gecko)'
                         ' Chrome/60.0.3112.101 Safari/537.36',
           'Referer': 'http://space.bilibili.com/43085921'}


class Bilibili(object):
    # video page (scraped for the cid)
    main_url_pattern = 'https://www.bilibili.com/video/av{av}/'
    # raw danmaku XML for a given cid
    dm_url_pattern = 'http://comment.bilibili.com/{cid}.xml'
    # third-party service that reverses a sender hash to a user id
    hash_url_pattern = 'http://biliquery.typcn.com/api/user/hash/'
    space_url_pattern = 'https://space.bilibili.com/{av}'
    # endpoint that returns a member's profile for a given mid
    Infou = 'https://space.bilibili.com/ajax/member/GetInfo'

    def __init__(self, av):
        # video av number
        self.av = str(av)
        # cid used to fetch the danmaku data
        self.cid = None
        # parsed danmaku items
        self.items = None
        # resolved user names
        self.names = None
        # MongoDB database handle
        self.db = pymongo.MongoClient(MONGO_URI)[MONGO_DB]

    # Parse the cid (danmaku id) out of the video page
    def get_cid(self):
        url = self.main_url_pattern.format(av=self.av)
        res = self.get_response(url)
        self.cid = str(re.findall(r'"cid=(\d+)&aid=\d+&', res.text)[0])
        print(self.cid)

    # Fetch the danmaku XML for the cid and parse each <d> element
    def get_dm(self):
        dm_url = self.dm_url_pattern.format(cid=self.cid)
        print(dm_url)
        soup = self.get_soup(dm_url)
        ds = soup.select('d')
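        # Each <d> node looks like (sample values):
        #   <d p="23.826,1,25,16777215,1422201084,0,057075e9,757076900">comment text</d>
        # where p packs: appear time (s), mode, font size, color, send
        # timestamp, pool, sender uid hash, and a row id.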
        items = []
        for d in ds:
            item = {
                'p': d.get('p'),
                'content': d.text
            }
            items.append(item)
        # keep the parsed danmaku on the instance
        self.items = items
        # optionally persist to disk: self.write_dm_to_json()

    # Dump the danmaku items to a JSON file named after the av number
    def write_dm_to_json(self):
        with open(str(self.av)+'.json', 'w', encoding='utf8') as f:
            json.dump(self.items, f, ensure_ascii=False)

    # Resolve every sender hash into a numeric user id
    def hash_get_all_id(self):
        for item in self.items:
            self.hash_get_id(item)

    # Resolve a single sender hash via the biliquery lookup service
    def hash_get_id(self, item):
        # the 7th comma-separated field of p is the sender's hashed uid;
        # splitting is safer than the original regex ',0,(.*?),', which can
        # latch onto a zero-valued color field instead of the pool field
        hash_id = item['p'].split(',')[6]
        hash_url = self.hash_url_pattern + str(hash_id)
        res = self.get_response(hash_url).json()
        if res['error'] == 0:
            item['id'] = res['data'][0]['id']
        else:
            item['id'] = None
        print(item)

    # Fetch the user's name, signature and avatar from the resolved id
    def id_get_name(self):
        for item in self.items:
            if item['id']:
                data = {'mid': item['id']}
                res = requests.post(url=self.Infou, headers=headers, data=data)
                dic = res.json()
                print(dic)
                item['name'] = dic['data']['name']
                item['description'] = dic['data']['sign']
                item['face'] = dic['data']['face']
                print(item)
            else:
                item['name'], item['description'], item['face'] = None, None, None

    # Save all items to the MongoDB collection
    def save_to_mongo(self):
        for item in self.items:
            self.db[MONGO_TABLE].insert_one(item)

    # GET the url and return a Response decoded as utf8 (None on failure)
    @staticmethod
    def get_response(url, headers=headers):
        try:
            response = requests.get(url, headers=headers)
            response.encoding = 'utf8'
            # a Response is truthy only for non-error status codes
            if response:
                return response
        except Exception as e:
            print(e)

    # Return a BeautifulSoup document for the url (parsed with lxml)
    def get_soup(self, url, headers=headers):
        response = self.get_response(url, headers)
        if response:
            soup = BeautifulSoup(response.text, 'lxml')
            return soup


if __name__ == '__main__':
    one = Bilibili(14615200)
    one.get_cid()            # resolve the cid from the video page
    one.get_dm()             # download and parse the danmaku XML
    one.hash_get_all_id()    # sender hash -> user id
    one.id_get_name()        # user id -> profile (name, sign, face)
    one.save_to_mongo()      # persist everything to MongoDB
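
After save_to_mongo() runs, each stored document carries the raw p metadata plus the resolved profile fields. A minimal sketch of reading the results back, assuming the same config values as above:

import pymongo
from muyushuixin.config import MONGO_URI, MONGO_DB, MONGO_TABLE

db = pymongo.MongoClient(MONGO_URI)[MONGO_DB]
# print each comment together with its resolved sender name
for doc in db[MONGO_TABLE].find():
    print(doc.get('name'), ':', doc['content'])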
