import requests
from bs4 import BeautifulSoup
import re
from multiprocessing import Pool
def get_video_data(dic):
    """Download a single video and persist it to disk.

    Worker function for the multiprocessing pool: fetches the binary
    video payload at ``dic['url']`` and writes it to a file named
    ``dic['name']``.

    Args:
        dic: mapping with keys 'name' (output filename, e.g. 'xx.mp4')
             and 'url' (direct video URL).

    Raises:
        requests.RequestException: on network failure / timeout / bad
            HTTP status.
    """
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
    }
    url = dic['url']
    print(dic['name'], '正在下载')
    # The original timeout=0.5 made any real video download fail; use a
    # (connect, read) timeout that tolerates large files, and stream the
    # body in chunks instead of holding the whole video in memory.
    with requests.get(url=url, headers=headers, timeout=(5, 60), stream=True) as resp:
        resp.raise_for_status()  # don't silently save an HTML error page as .mp4
        # 'wb': the payload is binary video data, never text
        with open(dic['name'], 'wb') as fp:
            for chunk in resp.iter_content(chunk_size=64 * 1024):
                fp.write(chunk)
    print(dic['name'], '下载成功')
if __name__ == '__main__':
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
    }
    # Scrape the category listing page for video entries.
    url = 'https://www.pearvideo.com/category_5'
    page_text = requests.get(url=url, headers=headers).text
    soup = BeautifulSoup(page_text, 'lxml')
    li_list = soup.select('.vervideo-bd')

    urls = []  # one {'name': ..., 'url': ...} dict per video to download
    for li in li_list:
        try:
            detail_url = 'https://www.pearvideo.com/' + li.a['href']
            # Take the title from THIS entry rather than indexing the
            # page-wide '.vervideo-title' list: the original counter
            # started at 2 for the first entry, mismatching titles and
            # risking IndexError near the end of the list.
            name = li.select_one('.vervideo-title').text + '.mp4'
            detail_page_text = requests.get(url=detail_url, headers=headers, timeout=5).text
            # The direct video URL is embedded in an inline script as:
            #   srcUrl="...",vdoUrl
            ex = 'srcUrl="(.*?)",vdoUrl'
            matches = re.findall(ex, detail_page_text)
            if not matches:
                # Page layout changed or anti-scraping kicked in; skip
                # instead of crashing on matches[0].
                continue
            urls.append({'name': name, 'url': matches[0]})
        except (requests.RequestException, AttributeError, KeyError, TypeError) as err:
            # Skip entries that fail to fetch or parse, but report why
            # (the original bare `except:` hid every error, including
            # genuine bugs).
            print('skip:', err)
            continue

    # Fan out the downloads across 4 worker processes; the blocking
    # network I/O is the slow part worth parallelizing.
    pool = Pool(4)
    pool.map(get_video_data, urls)
    pool.close()
    pool.join()
# Notes:
# 1. On Windows, multiprocessing requires the driver code to live under
#    the `if __name__ == '__main__':` guard; on macOS this is not
#    strictly required.
# 2. Download binary data with file mode 'wb', not 'w'.
# 3. When scraping many videos at scale, the site may enforce
#    certificate checks; large crawls need additional measures against
#    SSL-based anti-scraping.
# Thanks to the CSDN Academy crawler tutorial.