爬虫第二篇——爬取bilibiliUP主相册

本文是自学爬虫系列的第二篇,演示如何用 Python 爬取 bilibili UP 主的相册图片与投稿视频封面。

# -*- coding: UTF-8 -*-
from urllib import request
import json
import os

if __name__ == '__main__':
    # 为up主单独创建文件夹
    def createDir(path):
        """Create directory *path* (including parents) if it does not exist.

        Uses ``exist_ok=True`` instead of a separate ``os.path.exists``
        check, which avoids the race between checking and creating.
        """
        os.makedirs(path, exist_ok=True)
    # 存储图片
    def savePic(picSrc, savePath, picName):
        """Download *picSrc* to ``savePath + picName`` unless it already exists.

        Bug fix: the original passed ``print(...)`` as the third argument of
        ``urlretrieve`` — the print ran eagerly (before the download started,
        and even if it failed) and its ``None`` result became the reporthook.
        The success message now prints only after the download completes.
        """
        savePath = savePath + picName
        if not os.path.exists(savePath):
            request.urlretrieve(picSrc, savePath)
            print('存储成功 %s' % savePath)
        else:
            print('当前文件已存在:%s' % picName)
    # 爬取up主相册信息
    # Crawl one album type ('photo' or 'daily') of an uploader
    def copyUpPhotos(upId, photoType):
        """Download every picture of one album biz type for uploader *upId*.

        Reads the module-level ``savePath``; delegates directory creation to
        ``createDir`` and downloading to ``savePic``.

        Bug fix: the count URL used the global ``mid`` instead of the
        ``upId`` parameter, so the argument was silently ignored.  Also
        returns early when the type-specific count is zero — the original
        fell through and requested ``page_size=0``.
        """
        url = 'https://api.vc.bilibili.com/link_draw/v1/doc/upload_count?uid=%s' % (upId)
        res = request.urlopen(url)
        resJson = json.load(res)
        photoCount = 0
        if resJson['code'] == 0 and resJson['data']['all_count'] > 0:
            photoCount = resJson['data']['%s_count' % photoType]
            if photoCount > 0:
                createDir(savePath + photoType)
        if photoCount == 0:
            print('没有可爬取的相册信息')
            return
        # Fetch all documents of this biz type in one page
        url = ('https://api.vc.bilibili.com/link_draw/v1/doc/doc_list'
               '?uid=%s&page_num=0&page_size=%s&biz=%s' % (upId, photoCount, photoType))
        res = request.urlopen(url)
        resJson = json.load(res)
        num = 0
        for items in resJson['data']['items']:
            # File name pattern: <type>/<doc_id>_<index>.jpg
            prefix = '%s/%s_' % (photoType, str(items['doc_id']))
            for index, picItems in enumerate(items['pictures']):
                savePic(picItems['img_src'], savePath, '%s%s.jpg' % (prefix, index))
                num += 1
        print('爬取完毕,共计%s个照片' % num)
    # 爬取up主视频信息
    # Crawl the cover images of an uploader's submitted videos
    def copyUpVideos(upId, tid, videoType):
        """Download the cover of every video *upId* posted under category *tid*.

        Covers are saved as ``<videoType>/p<play>a<aid>f<favorites>.jpg``
        under the module-level ``savePath``.

        Bug fix: the original indexed ``tlist[str(tid)]`` directly, raising
        ``KeyError`` whenever the uploader has videos but none in this
        category; ``dict.get`` now handles that case gracefully.
        """
        # First request only probes the counts (pagesize=1, tid=0 = all)
        videoUrl = ('https://space.bilibili.com/ajax/member/getSubmitVideos'
                    '?mid=%s&pagesize=%s&tid=%s&page=1&keyword=&order=pubdate' % (upId, 1, 0))
        res = request.urlopen(videoUrl)
        resJson = json.load(res)
        videoCount = 0
        if resJson['status'] and resJson['data']['count'] > 0:
            createDir(savePath)
            tidInfo = resJson['data']['tlist'].get(str(tid))
            if tidInfo:
                createDir(savePath + videoType)
                videoCount = tidInfo['count']
        if videoCount == 0:
            print('没有可爬取的视频信息')
            return
        # Second request fetches all videos of this category in one page
        videoUrl = ('https://space.bilibili.com/ajax/member/getSubmitVideos'
                    '?mid=%s&pagesize=%s&tid=%s&page=1&keyword=&order=pubdate' % (upId, videoCount, tid))
        res = request.urlopen(videoUrl)
        resJson = json.load(res)
        for video in resJson['data']['vlist'][:videoCount]:
            picName = '%s/p%sa%sf%s.jpg' % (videoType, video['play'], video['aid'], video['favorites'])
            # Cover URLs come back protocol-relative ("//..."), so prepend http:
            savePic('http:%s' % video['pic'], savePath, picName)
        print('爬取完毕,共计%s个视频' % videoCount)

    # Basic configuration
    ################################################## UP uploader's numeric ID #######################################################
    mid = 123456    # Example value only -- replace with the mid of the uploader you want to crawl
    savePath = 'F:/bilibili/%s/' % mid      # Output root; the 'F:/bilibili/' prefix can be changed freely
    ################################## After finding the uploader on bilibili, copy the `mid` from the profile URL here ######################################
    pagesize,tid = 1,0 # Do not change -- would affect the crawl logic (NOTE(review): both names appear unused below; the functions take their own arguments)

    # Uploader's profile info (not handled yet)
    # https://api.bilibili.com/x/space/acc/info?mid=123456&jsonp=jsonp

    # Uploader's submitted videos
    # 1. Dance (tid=129) -- cover named from play count + avID + favorites
    copyUpVideos(mid,129,'dance')
    # 2. Life (tid=160) -- cover named from play count + avID + favorites
    copyUpVideos(mid,160,'life')

    # Uploader's albums
    # 1. Photography -- files named from album id + sequence index
    copyUpPhotos(mid,'photo')
    # 2. Daily -- files named from album id + sequence index
    copyUpPhotos(mid,'daily')

注:代码仍有许多可以优化的地方,此处仅作参考说明,希望对有意学习爬虫的读者有所帮助。

你可能感兴趣的:(爬虫,爬虫,bilibili,相册,up主)