哔哩哔哩视频信息爬虫(实时爬取)

结合 → 哔哩哔哩小助手程序

爬取思路

自定义模块构建及框架设计

 

哔哩哔哩视频信息爬虫(实时爬取)_第1张图片

文件目录

哔哩哔哩视频信息爬虫(实时爬取)_第2张图片

__init__.py:

#__init__


"""

浏览json数据
videoinfo = [
                    data['aid'],        # av号
                    data['view'],       # 播放量
                    data['like'],       # 点赞数
                    data['favorite'],   # 收藏数
                    data['share'],      # 转发数
                    data['reply'],      # 评论
                    data['danmaku'],    # 弹幕
                    data['coin'],       # 硬币数
                    data['title'],      # 标题
                    data['tname'],      # 分类
                    
                ]

"""




# Default HTTP request headers for the spider.
# The User-Agent is built from adjacent string literals: a backslash
# continuation *inside* a string literal would embed the next line's leading
# indentation into the header value.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
    ),
}

WebDownloader模块:(请求并加载网页模块)

#WebDownloader






import requests
from BilibiliSpider import headers





class WebDownloader:
    """Fetch a URL and decode the response body as JSON.

    Args:
        headers: HTTP headers sent with every request. Defaults to the
            ``headers`` mapping imported at module level.
        timeout: Timeout value passed straight to ``requests.get``.
    """

    def __init__(self,headers=headers,timeout=6):
        self.headers = headers
        self.timeout = timeout

    def getJsonWeb(self,url):
        """Request *url* and return its decoded JSON payload.

        Args:
            url: Address of the JSON endpoint to fetch.

        Returns:
            The decoded JSON object on success, or the string ``"error"``
            when the request fails, the server answers with an HTTP error
            status, or the body is not valid JSON.
        """
        try:
            r = requests.get(url,headers=self.headers,timeout=self.timeout)
            r.raise_for_status()
            r.encoding=r.apparent_encoding
            return r.json()
        except (requests.RequestException, ValueError):
            # Only network/HTTP/decoding failures map to the sentinel; a bare
            # ``except`` would also swallow KeyboardInterrupt and SystemExit.
            return "error"

    

JsonParse模块:(网页内容解析)

#JsonParse
#



import json
from BilibiliSpider import WebDownloader

import threading


class JsonParse:
    """Extract video fields from a decoded detail-API response.

    Every parser expects ``jsonPage`` to be a mapping shaped like
    ``{'data': {'View': {...}}}``. A payload with missing keys or of the
    wrong type is skipped silently (best-effort parsing).

    Args:
        total: Counter value stored on the instance; not used by the
            methods of this class.
        lock: Lock guarding writes to the shared result dict. The default
            lock is created once, when the class is defined, so instances
            that do not pass their own lock share it.
    """

    # Exceptions raised when a payload lacks a key or has the wrong type
    # (e.g. the payload is a plain string instead of a dict).
    _MALFORMED = (KeyError, TypeError, IndexError)

    def __init__(self,total=1,lock = threading.Lock()):
        self.lock = lock
        self.total = total

    def parseStat(self,dict_json,jsonPage):
        """Record ``aid -> [title, category]`` in *dict_json*.

        Args:
            dict_json: Mapping that receives the entry.
            jsonPage: Decoded API response.

        Returns:
            ``""`` when an entry was stored; ``None`` when the payload is
            malformed or its ``aid`` is ``None``.
        """
        try:
            view = jsonPage['data']['View']
            aid = view['aid']
            sort = view['tname']
            title = view['title']
            if aid is None:
                return None
            # Hold the lock around the write itself so that concurrent
            # callers updating the same dict are serialized.
            with self.lock:
                dict_json[aid] = [title, sort]
        except self._MALFORMED:
            return None
        return ""

    def parseJsonImage(self,jsonPage):
        """Return the value of the ``pic`` field, or ``None`` if it is unavailable.

        Args:
            jsonPage: Decoded API response.
        """
        try:
            return jsonPage['data']['View']['pic']
        except self._MALFORMED:
            return None

    def parseJsonList(self,dict_json,jsonPage):
        """Fill *dict_json* with the video's descriptive fields and counters.

        Used for the real-time lookup of a single video. All values are read
        before anything is written, so a malformed payload leaves
        *dict_json* untouched.

        Args:
            dict_json: Mapping that receives the labelled values.
            jsonPage: Decoded API response.

        Returns:
            None.
        """
        try:
            view = jsonPage['data']['View']
            stat = view['stat']
            fields = {
                '视频名称:': view['title'],
                'AV号:': view['aid'],
                '分类:': view['tname'],
                '视频简介:': view['desc'],
                '播放量:': stat['view'],
                '点赞:': stat['like'],
                '收藏:': stat['favorite'],
                '转发:': stat['share'],
                '评论:': stat['reply'],
                '弹幕:': stat['danmaku'],
                '硬币:': stat['coin'],
            }
            for label, value in fields.items():
                dict_json[label] = value
        except self._MALFORMED:
            return None
        return None

UrlFactory模块:(api-url工厂,获取对应标题的API链接)

#UrlFactory


"""
detail?  :https://api.bilibili.com/x/web-interface/view/detail?&aid=77515252
stat?    :https://api.bilibili.com/x/web-interface/archive/stat?aid=11111111

"""
#api_urlStat = 'https://api.bilibili.com/x/web-interface/archive/stat?aid='


import json


class UrlFactory:
    """Build detail-API URLs for videos listed in the local ``href.json`` file.

    Args:
        api_urlDetail: URL prefix to which a video's AV number is appended.
    """

    def __init__(self,api_urlDetail='https://api.bilibili.com/x/web-interface/view/detail?&aid='):
        self.api_urlDetail = api_urlDetail

    def getUrlJson(self,title):
        """Return the API URL of the first video whose title equals *title*.

        ``href.json`` is read from the current working directory and is
        expected to map an AV number to ``[title, category]``.

        Args:
            title: Exact video title to look up.

        Returns:
            ``api_urlDetail`` followed by the matching AV number, or ``None``
            when no entry has that title.

        Raises:
            OSError: If ``href.json`` cannot be opened.
            ValueError: If the file does not contain valid JSON.
        """
        with open('href.json', mode='r', encoding='utf-8') as fjson:
            data = json.load(fjson)

        # The file is closed before searching; the lookup only needs the
        # decoded mapping.
        for aid, info in data.items():
            if info[0] == title:
                return self.api_urlDetail + aid
        # No match: return None instead of failing on an unbound local.
        return None
        

主函数A

# Download AV numbers: probe the detail API over a range of ids and dump the
# collected index to href.json.
# NOTE(review): this is an indented fragment; the enclosing definition and the
# names `videoInfoSpider`, `futures` and `json` come from code not shown here
# (`futures` is presumably `concurrent.futures` -- confirm).
    total = 0
    # NOTE(review): nothing in this fragment writes to dict_json or hands it to
    # the spider; unless apiUrlCrawl fills it through some other route, an
    # empty dict is what gets written below -- verify.
    dict_json={}
    v = videoInfoSpider()
    print('开始爬取apiUrl...')
    for i in range(1,2019):
        
        # NOTE(review): `start` is reset to 10000 on every pass and `i` is not
        # used, so each pass builds the same 10000 URLs. Presumably the range
        # was meant to advance with `i` -- confirm.
        start = 10000
        urls = [
            "https://api.bilibili.com/x/web-interface/view/detail?&aid={}".format(j)
            for j in range(start,start+10000)
            ]
        # The result of executor.map is discarded; leaving the `with` block
        # waits for the submitted work to finish.
        with futures.ThreadPoolExecutor(64)as executor:
            executor.map(v.apiUrlCrawl,urls)
        # `total` counts completed passes, not crawled videos.
        print(total)
        total += 1
    # Opened in append mode: running this again adds a second JSON document to
    # the same file instead of replacing the first.
    with open('href.json','a')as fjson:
        data = json.dumps(dict_json,indent=4)
        fjson.write(data)
        
    print("爬取结束!")

首先运行主函数A,得到一个json文件(href.json),作为后续实时爬取时查找视频API链接的数据来源

哔哩哔哩视频信息爬虫(实时爬取)_第3张图片

紧接着UrlFactory模块的作用就来了,调用UrlFactory中的 getUrlJson()即可获得对应搜索标题的视频信息URL,根据URL请求网页,最后调用JsonParse模块即可得到相应的信息啦

 

你可能感兴趣的:(Python,菜狗的日记)