# Python crawler: download short videos from Pear Video (pearvideo.com) using a thread pool
import os
import random
from multiprocessing.dummy import Pool

import requests
from lxml import etree
def get_video(dic):
    """Download one video and store it in the ``./lishipin/`` directory.

    Args:
        dic: Mapping with the keys ``'url'`` (address of the video file)
            and ``'name'`` (display name; used for the progress messages
            and, after sanitising, as the output file name).

    Returns:
        None. Request failures are reported on stdout rather than raised,
        so one broken download does not make ``pool.map`` fail as a whole.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56'
    }
    out_dir = './lishipin/'

    # Name the file after the video instead of a random number in 0..99:
    # random names collide and silently overwrite earlier downloads.
    # Characters that file systems commonly reject are replaced by '_'.
    forbidden = '\\/:*?"<>|\r\n\t'
    safe_name = dic['name'].translate({ord(ch): '_' for ch in forbidden}).strip()
    if not safe_name:
        safe_name = 'video'
    if not safe_name.lower().endswith('.mp4'):
        safe_name += '.mp4'

    # Announce the download before the (blocking) request, not after it.
    print(dic['name']+'开始下载')
    try:
        # (connect timeout, read timeout) so a stalled server cannot hang
        # a worker thread forever.
        response = requests.get(url=dic['url'], headers=headers, timeout=(10, 120))
        # Do not save an HTTP error page as if it were a video.
        response.raise_for_status()
    except requests.RequestException as exc:
        print(dic['name'] + '下载失败: ' + str(exc))
        return

    # exist_ok=True keeps this safe when several threads get here at once.
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, safe_name)
    with open(path, 'wb') as fp:
        fp.write(response.content)
    print(dic['name']+'下载成功')
# Browser-like User-Agent sent with every request made while crawling.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56'
)
# Seconds to wait for the listing page and the per-video status endpoint.
_REQUEST_TIMEOUT = 10


def _real_video_url(fake_url, cont_id):
    """Replace the first '-'-separated field of the file name with ``cont-<id>``.

    Everything up to and including the last '/' of ``fake_url`` is kept, as
    is everything after the first '-' of the file name.
    """
    base, slash, filename = fake_url.rpartition('/')
    _, dash, rest = filename.partition('-')
    return base + slash + 'cont-' + str(cont_id) + dash + rest


def _resolve_video(li):
    """Build the ``{'url', 'name'}`` download entry for one listing ``<li>``.

    Raises IndexError/KeyError/TypeError/ValueError when the page or the
    JSON answer does not have the expected shape, and
    ``requests.RequestException`` on network problems.
    """
    video_name = li.xpath('./div/a/div[2]/text()')[0] + '.mp4'
    href = li.xpath("./div/a/@href")[0]
    # The id is the part after '_' in the last path component of the link.
    cont_id = href.split("/")[-1].split("_")[1]

    headers = {
        'User-Agent': _USER_AGENT,
        # Referer points at the video's own page; presumably the status
        # endpoint checks it -- TODO confirm.
        'Referer': 'https://www.pearvideo.com/video_' + cont_id,
    }
    params = {
        'contId': str(cont_id),
        'mrd': str(random.random()),
    }
    response = requests.get(
        url='https://www.pearvideo.com/videoStatus.jsp',
        headers=headers,
        params=params,
        timeout=_REQUEST_TIMEOUT,
    )
    fake_url = response.json()['videoInfo']['videos']['srcUrl']
    return {
        'url': _real_video_url(fake_url, cont_id),
        'name': video_name,
    }


def main():
    """Collect the videos listed on the category page and download them.

    The listing page is parsed for its ``<li>`` entries, each entry is
    resolved to a download URL through the ``videoStatus.jsp`` endpoint,
    and the resulting entries are handed to ``get_video`` on a pool of
    four worker threads. Entries that cannot be resolved are reported on
    stdout and skipped instead of aborting the whole run.

    Raises:
        requests.RequestException: if the listing page itself cannot be
            fetched (nothing useful can be done without it).
    """
    web_url = 'https://www.pearvideo.com/category_5'
    page = requests.get(
        url=web_url,
        headers={'User-Agent': _USER_AGENT},
        timeout=_REQUEST_TIMEOUT,
    )
    page.raise_for_status()
    tree = etree.HTML(page.text)
    li_list = tree.xpath('//*[@id="listvideoListUl"]/li')

    rea_urls = []
    for li in li_list:
        try:
            rea_urls.append(_resolve_video(li))
        except (IndexError, KeyError, TypeError, ValueError,
                requests.RequestException) as exc:
            # One malformed or unavailable entry must not stop the others.
            print('解析失败,已跳过: ' + repr(exc))

    pool = Pool(4)
    try:
        pool.map(get_video, rea_urls)
    finally:
        # Always release the worker threads, even if a download raised.
        pool.close()
        pool.join()
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()