# Scrape Bilibili short videos ("今日热门") with the requests library

import os
import random
import re
import time

import requests

# urllib3 1.x happened to re-export urlencode; that re-export was removed in
# urllib3 2.x. Keep the old import as a best-effort, then import the real
# stdlib function last so the correct binding always wins.
try:
    from urllib3.request import urlencode
except ImportError:
    pass
from urllib.parse import urlencode

# Helper: build the paged ranking-API URL

def get_page(next_offset):
    """Build the ranking-API URL for one page of trending short videos.

    Args:
        next_offset: paging cursor forwarded to the API's ``next_offset``
            query parameter.

    Returns:
        The full request URL with the query string percent-encoded.
    """
    params = {
        'page_size': 10,             # videos per page
        'next_offset': next_offset,  # paging cursor
        'tag': '今日热门',
        'platform': 'pc'
    }
    baseurl = 'https://api.vc.bilibili.com/board/v1/ranking/top?'
    # urlencode must come from urllib.parse — the urllib3 re-export the
    # original relied on was removed in urllib3 2.x. Dict insertion order is
    # preserved, so the query-string parameter order is stable.
    return baseurl + urlencode(params)

# Helper: replace characters that cannot appear in a filename
def validatetitle(title):
    """Replace characters that are illegal in (Windows) filenames with '_'.

    Args:
        title: raw video description used as the filename stem.

    Returns:
        ``title`` with ``\\ / : * ? " < > |``, ``@`` and embedded
        newlines/carriage returns each replaced by an underscore.
    """
    # Raw string so the backslash class member survives. The original
    # pattern missed '\' and '"', which are also invalid on Windows.
    rstr = r'[\\/:*?"<>|@\n\r]'
    return re.sub(rstr, '_', title)
# If the filename is too long, open() fails with an error like the one below;
# slicing the sanitized title down to a short prefix avoids it:
# Invalid argument: 'Mint .44手枪双爆头,力挽狂澜追平比分\n#彩虹六号:围攻# #2020邀请赛#.mp4'
# Offsets to request: 0 plus 11, 21, ..., 91 (kept as in the original script,
# even though with page_size=10 the offsets 11/21/... skip one item per page).
offset_list = [i * 10 + 1 for i in range(1, 10)]
offset_list.insert(0, 0)

# Folder that will hold every downloaded video; create it once, up front.
# NOTE(review): hard-coded to the author's machine — adjust before running.
video_dir = r'D:\Desktop\爬虫_anaconda\bilibili小视频\viedo'
os.makedirs(video_dir, exist_ok=True)

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.42 Safari/537.36'}

for offset in offset_list:
    print('=====================================')
    print('正在爬取offset=%d的异步数据' % offset)
    print('======================================')
    url = get_page(offset)
    response = requests.get(url, headers=headers)
    print(response.status_code)

    # Don't shadow the stdlib `json` name; guard against a missing/empty
    # payload so one bad page doesn't crash the whole run.
    payload = response.json()
    data = payload.get('data') or {}
    items = data.get('items') or []

    for index, item in enumerate(items, start=1):  # one entry per video
        print('爬取第%d个视频' % index)
        info = item['item']
        # Sanitize, then truncate so the filename never exceeds OS limits
        # (see the "Invalid argument" note above).
        video_name = validatetitle(info['description'])[:15] + r'......'
        print(video_name)
        video_url = info['video_playurl']

        # Download the video body and write it out. Use an absolute path:
        # the original os.chdir() only ran when the folder was freshly
        # created, so reruns silently wrote files into the current dir.
        resp = requests.get(video_url, headers=headers)
        file_path = os.path.join(video_dir, video_name + '.mp4')
        with open(file_path, 'wb') as f:
            f.write(resp.content)
        # Be polite to the server between downloads.
        time.sleep(random.randint(6, 8))
print('爬取完成')

 

# (Blog footer) You may also be interested in: 爬虫, python