前面的抖音无水印视频爬虫(一)——踩坑,已经讲了相关的原理以及实现的过程
1、OS:Deepin 15.10.1桌面版本 64位
2、IDE:PyCharm Professional 2019
3、Python Modules(需要额外安装): requests, selenium,
aiohttp(由于大量的IO操作所以用这个还是能提高性能滴), browsermobproxy
1、打开douyiner的分享链接(selenium+browsermobproxy),获取第一个max_cursor=0的XHR
def url_transfer(url):
    """Open a Douyin share link behind a BrowserMob proxy and capture the
    first ``web/api`` XHR the page fires (the max_cursor=0 request).

    Side effects: sets the module-global ``SUBDIR`` to the douyiner's
    nickname and creates the download sub-directory if missing.

    Returns:
        (json_text, xhr_url, cookies) for the first matching XHR, or
        ``None`` if no ``web/api`` request was observed.
    """
    # BrowserMob proxy server binary
    server = Server("/home/alexhowe/workspace/browsermob-proxy-2.1.4/bin/browsermob-proxy")
    server.start()
    driver = None
    try:
        proxy = server.create_proxy()
        chrome_options = Options()
        chrome_options.add_argument('--proxy-server={0}'.format(proxy.proxy))
        # chrome_options.add_argument('--headless')  # run selenium without a visible window
        # chrome_options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(chrome_options=chrome_options)
        proxy.new_har("douyin", options={'captureHeaders': True, 'captureContent': True})
        driver.get(url)
        # Use this douyiner's nickname as the per-user download folder name
        nick = driver.find_element_by_class_name('nickname').text
        print(nick)
        global SUBDIR
        SUBDIR = nick
        if not osp.exists(osp.join(DOWNLODAD_URL, nick)):
            mkdir(osp.join(DOWNLODAD_URL, nick))
        # Walk the proxy's HAR entries and pick out the XHR we want
        result = proxy.har
        for entry in result['log']['entries']:
            _url = entry['request']['url']
            if "web/api" in _url:
                cookies = driver.get_cookies()
                return get_json(cookies, _url), _url, cookies
    finally:
        # BUGFIX: the original never shut these down, leaking one Chrome
        # process and one BrowserMob server per call.
        if driver is not None:
            driver.quit()
        server.stop()
2、循环获取所有的XHR
def extract_from_result(url):
    """Page through every XHR result for *url*, downloading each page's
    videos, until the API reports ``has_more`` is falsy.

    Relies on ``url_transfer`` for the first page and ``get_json`` +
    ``extract_videos`` for each subsequent ``max_cursor`` page.
    """
    res, url, cookies = url_transfer(url)
    rjson = json.loads(res)
    # BUGFIX: fetch the event loop once; the original called
    # asyncio.get_event_loop() again on every iteration of the while loop.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(extract_videos(res))
    while rjson['has_more']:
        max_cursor = rjson['max_cursor']
        # Rebuild the query string, swapping in the new max_cursor value.
        # BUGFIX: the original mutated the list while iterating it and used
        # surl.index(i), which can pick the wrong element on duplicates.
        parts = [
            "max_cursor={}".format(max_cursor) if "max_cursor" in p else p
            for p in str(url).split("&")
        ]
        next_url = "&".join(parts)
        print(next_url)
        text = get_json(cookies, next_url)
        loop.run_until_complete(extract_videos(text))
        rjson = json.loads(text)
3、对XHR的解析
async def extract_videos(text: str):
    """Parse one XHR JSON payload and download every video it lists.

    Args:
        text: raw JSON response body containing an ``aweme_list`` array.
    """
    aweme_list = json.loads(text)["aweme_list"]
    # BUGFIX: the original opened a brand-new ClientSession for every single
    # video inside the loop; one shared session serves the whole payload.
    async with aiohttp.ClientSession(headers=headers) as session:
        for aweme in aweme_list:
            video = Video()
            video.name = aweme['desc'].split("#")[0].strip()
            video.url = aweme['video']['download_addr']['url_list']
            video.id = aweme['aweme_id']
            # A desc starting with '#' leaves an empty name — use the id instead
            if aweme['desc'].startswith('#'):
                video.name = video.id
            await download_video(session, video)
4、下载视频(判断是否已经下载且是有数据的)
async def download_video(session, video: Video):
    """Download *video* unless a non-empty copy already exists on disk.

    Any failure is reported but swallowed so one bad video does not stop
    the rest of the batch.
    """
    try:
        target = osp.join(DOWNLODAD_URL, SUBDIR, video.name + ".mp4")
        # Fetch when the file is missing, or present but empty (short-circuit
        # keeps getsize() from running on a missing file).
        needs_fetch = not osp.exists(target) or osp.getsize(target) == 0
        if needs_fetch:
            await download(session, video, target)
        else:
            print("Video: {} is already exists".format(video.name))
    except Exception as e:
        print("Video: {} downloads failed".format(video.name), e)
5、下载视频(网络请求)
async def download(session, video, video_path):
    """GET ``video.url[0]`` through *session* and write the body to
    *video_path* when the server answers HTTP 200.
    """
    async with session.get(video.url[0]) as res:
        print("Downloading {} >>>>>>>>>>>>>>>>>".format(video.name), res.status)
        # BUGFIX: only read the body on success — the original downloaded the
        # entire payload before checking the status, wasting bandwidth on
        # every error response.
        if res.status == 200:
            content = await res.content.read()
            with open(video_path, "wb") as fw:
                fw.write(content)
完整的代码已经上传到Github啦:DouyinCrawler-TextRec
预告:视频抽帧与文字识别
有疑问或者指教欢迎评论留言,O(∩_∩)O谢谢