import requests
import re#正则表达式模块
from urllib.request import urlretrieve #专门用来下载的方法
import os
#获取网站源代码
def down_loads():
    """Download the videos linked from the pearvideo.com ``category_5`` page.

    The listing page is fetched once, every video link found on it is
    visited, and each video's source URL is saved as
    ``lishiping/<title>.mp4`` relative to the current working directory.
    Only that single listing page is processed; nothing is returned.

    A video page on which the title or the source URL cannot be found is
    reported and skipped instead of aborting the whole run.

    Raises:
        requests.RequestException: if an HTTP request fails or times out.
    """
    listing_url = 'https://www.pearvideo.com/category_5'
    base_url = 'https://www.pearvideo.com/'
    save_dir = 'lishiping'
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363'
        ),
    }

    # NOTE(review): this pattern was empty (r'') in the reviewed copy, most
    # likely because the HTML tag text inside it was stripped. An empty
    # pattern matches at every position and can never yield a video link.
    # The pattern below is a reconstruction -- confirm it against the live
    # listing-page markup before relying on it.
    link_pattern = re.compile(r'<a href="(.*?)" class="vervideo-lilink actplay">')
    # Captures the direct media URL embedded in the video page's script.
    source_pattern = re.compile(r'sdUrl="",ldUrl="",srcUrl="(.*?)",vdoUrl=srcUrl')
    # NOTE(review): this pattern was an unterminated literal split over two
    # lines with its tag text missing. Reconstructed on the assumption that
    # the title lives in an <h1 class="video-tit"> element -- TODO confirm.
    title_pattern = re.compile(r'<h1 class="video-tit">(.*?)</h1>')
    # Characters that are not allowed in file names on common file systems.
    unsafe_chars = re.compile(r'[\\/:*?"<>|]')

    listing_html = requests.get(listing_url, headers=headers, timeout=30).text
    page_urls = [base_url + link for link in link_pattern.findall(listing_html)]

    for page_url in page_urls:
        page_html = requests.get(page_url, headers=headers, timeout=30).text
        sources = source_pattern.findall(page_html)
        titles = title_pattern.findall(page_html)
        if not sources or not titles:
            # Message reads: "skipped (video title or address not found)".
            print('跳过(未找到视频标题或地址):%s' % page_url)
            continue

        title = titles[0]
        print('正在下载视频:%s' % title)
        # Create the target folder lazily so that a run that finds no videos
        # does not leave an empty directory behind.
        os.makedirs(save_dir, exist_ok=True)
        # A raw page title may contain path separators or other characters
        # that would make the save fail, so replace them.
        safe_title = unsafe_chars.sub('_', title).strip()
        urlretrieve(sources[0], os.path.join(save_dir, '%s.mp4' % safe_title))
# Run the crawler immediately when this file is executed (or imported).
down_loads()
# TODO: for now only a single listing page of videos is crawled; pagination is not handled yet.