尝试用 Python 抓取视频,并处理 title 中的非法字符,保存为 mp4。
已经处理了分页的问题,默认下载为 360p 格式。
#!/usr/bin/python
from bs4 import BeautifulSoup as bs
from requests.exceptions import ConnectTimeout,ConnectionError
import requests,time,sys,re,queue
import youtube_dl
# Pieces of the search URL. The crawl loop requests
# base + qstring + pagestring + <page number>.
base = "https://www.youku.com/results?search_query="
qstring = "cctv+空中剧院"
pagestring = "&page="

# Proxy address as host:port, shared by the HTTP session and the downloader.
proxystr = '127.0.0.1:49705'
# proxystr = ''  # alternative value kept from the original script

# Queue of [url, title] pairs waiting to be downloaded.
video_urls = queue.Queue()

# One HTTP session for all search requests; HTTPS traffic goes via the proxy.
sess = requests.Session()
sess.proxies = dict(https=proxystr)
# Crawl the search result pages one by one and queue every video link found.
# The loop ends when the "No more results" message appears or when a page
# contains no video links.
# NOTE(review): the CSS classes used below ('display-message',
# 'yt-uix-tile-link') are kept exactly as in the original script; verify that
# they still match the target site's markup.
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled request never returns

counter = 0  # page number; incremented before each request, so pages start at 1
while True:
    counter += 1
    try:
        response = sess.get(
            base + qstring + pagestring + str(counter),
            timeout=REQUEST_TIMEOUT,
        )
    except (ConnectTimeout, ConnectionError, requests.exceptions.Timeout):
        print("不能访问youku 检查是否已设置代理")
        sys.exit(1)  # non-zero status so the failure is visible to the caller

    soup = bs(response.text, 'html.parser')

    # Paged past the last page of results.
    no_more_results = soup.find_all('div', attrs={'class': 'display-message'})
    if no_more_results and no_more_results[0].text == "No more results":
        break

    vids = soup.find_all('a', attrs={'class': 'yt-uix-tile-link'})
    if not vids:
        # No video links on this page; treat it as the end of the results.
        break

    for v in vids:
        href = v.get('href')
        link_title = v.get('title')
        if not href or link_title is None:
            continue  # anchor lacks the attributes needed to queue it
        if len(href) > 20:
            continue  # original author's guess: long hrefs may be ads
        video_urls.put(['https://www.youku.com' + href, link_title])

    print("page:{} size:{}".format(counter, video_urls.qsize()))
    time.sleep(1)  # pause between page requests
# Download the queued videos with youtube-dl.
MAX_DOWNLOADS = 3  # test limit: stop after this many download attempts

# The options are the same for every video, so build them once.
ydl_opts = {
    'format': '[height=360]',  # 360p is good enough
    # Output filename template: video title plus the container extension.
    'outtmpl': '%(title)s.%(ext)s',
    'proxy': proxystr,
}

# Manual title clean-up from the original script, disabled there and kept
# here for reference only:
# pattern = re.compile(r"\||CCTV戏曲| |来自")
# file_name = re.sub(pattern, "", title).replace("/", "-")

counter = 0  # number of download attempts made so far
while counter < MAX_DOWNLOADS and not video_urls.empty():
    v_url, title = video_urls.get()
    print(v_url, title)
    try:
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            # download() takes a list of URLs.
            ydl.download([v_url])
    except (TimeoutError, ConnectTimeout, ConnectionError):
        print("不能访问youku 检查是否已设置代理")
        sys.exit(1)  # non-zero status so the failure is visible to the caller
    except youtube_dl.utils.DownloadError as err:
        # One failed video should not abort the remaining downloads.
        print("下载失败: {} ({})".format(v_url, err))
    else:
        print('下载完成')
    counter += 1