1、方案
Charles抓包、appium自动化、mitmproxy处理响应数据、MongoDB数据库
2、准备条件
2.1、手机需要root,并且将Charles、mitmproxy的证书导入手机的系统证书中,参考:https://www.jianshu.com/p/70208fbc8e43
不这样做,打开抖音APP会没网
2.2、手机配置代理,自行百度
3、附上代码
3.1、spider.py 爬虫代码
from pymongo import MongoClient
from mitmproxy import ctx
import json
# URL prefixes that response() compares captured request URLs against:
# the user-profile endpoint and the user's posted-videos endpoint.
USER_INFO_URL = 'https://api3-core-c-lf.amemv.com/aweme/v1/user/profile/other'
USER_VIDEO_URL = 'https://api3-core-c-lf.amemv.com/aweme/v1/aweme/post'
# One shared MongoDB client, created with no arguments (driver defaults),
# and the two collections of the 'douyin' database the parsers write to.
mongo_client = MongoClient()
user_info_collection = mongo_client['douyin']['user_info']
user_video_collection = mongo_client['douyin']['user_video']
def parse_user_info(content):
    """Parse a user-profile response body and upsert the user into MongoDB.

    The document is keyed on ``unique_id`` + ``short_id``; an existing
    document with the same key is updated in place, otherwise a new one is
    inserted.

    Args:
        content: JSON text of the profile response. It must contain a
            ``user`` object holding every field copied below.

    Raises:
        ValueError: If ``content`` is not valid JSON (from ``json.loads``).
        KeyError: If ``user`` or one of the copied fields is missing.
    """
    data = json.loads(content)
    print(data)
    user = data['user']
    item = {
        'unique_id': user['unique_id'],
        'short_id': user['short_id'],
        'signature': user['signature'],
        'nickname': user['nickname'],
        'total_favorited': user['total_favorited'],
        'following_count': user['following_count'],
        'follower_count': user['follower_count'],
        'aweme_count': user['aweme_count'],
        'dongtai_count': user['dongtai_count'],
        'favoriting_count': user['favoriting_count'],
        'avatar_list': user['avatar_300x300']['url_list'],
    }
    # Collection.update() is deprecated and no longer exists in PyMongo 4;
    # update_one(..., upsert=True) performs the same single-document upsert.
    user_info_collection.update_one(
        {'unique_id': item['unique_id'], 'short_id': item['short_id']},
        {'$set': item},
        upsert=True,
    )
    ctx.log.info(str(item))
def parse_user_video(content):
    """Parse a posted-videos response body and upsert each video into MongoDB.

    Every entry of ``aweme_list`` becomes one document keyed on
    ``unique_id`` + ``short_id`` + ``aweme_id``. Only the first play URL of
    each video is stored.

    Args:
        content: JSON text of the video-list response.

    Raises:
        ValueError: If ``content`` is not valid JSON (from ``json.loads``).
        KeyError: If an entry lacks one of the copied fields.
        IndexError: If an entry's play-address ``url_list`` is empty.
    """
    data = json.loads(content)
    # Treat a missing or null 'aweme_list' as an empty page instead of raising.
    for aweme in data.get('aweme_list') or []:
        author = aweme['author']
        item = {
            'desc': aweme['desc'],
            'aweme_id': aweme['aweme_id'],
            'create_time': aweme['create_time'],
            'unique_id': author['unique_id'],
            'short_id': author['short_id'],
            'nickname': author['nickname'],
            'video_url': aweme['video']['play_addr']['url_list'][0],
        }
        # Collection.update() is deprecated and no longer exists in PyMongo 4;
        # update_one(..., upsert=True) performs the same single-document upsert.
        user_video_collection.update_one(
            {
                'unique_id': item['unique_id'],
                'short_id': item['short_id'],
                'aweme_id': item['aweme_id'],
            },
            {'$set': item},
            upsert=True,
        )
        ctx.log.info(str(item))
def response(flow):
    """Route a captured response to the matching parser.

    Called with a flow object exposing ``flow.request.url`` and
    ``flow.response.text``. Responses whose URL starts with neither
    ``USER_INFO_URL`` nor ``USER_VIDEO_URL`` are ignored.

    Args:
        flow: The captured request/response pair.
    """
    url = flow.request.url
    # Read the response text only for URLs we care about, so unrelated
    # traffic passing through the proxy is not decoded needlessly.
    if url.startswith(USER_INFO_URL):
        parse_user_info(flow.response.text)
    elif url.startswith(USER_VIDEO_URL):
        parse_user_video(flow.response.text)
3.2、controller.py 自动化
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from appium import webdriver
import random
import time
# Values placed into the desired-capabilities dict built in
# DouYinController.__init__ (platform, device, app package and launch activity).
PLATFORM = 'Android'
DEVICE_NAME = 'Redmi_Note_4X'
APP_PACKAGE = 'com.ss.android.ugc.aweme'
APP_ACTIVITY = '.splash.SplashActivity'
# Address of the Appium server that webdriver.Remote connects to.
DRIVER_SERVER = 'http://localhost:4723/wd/hub'
# Timeout handed to WebDriverWait for every explicit wait.
TIMEOUT = 30
class DouYinController:
    """Drive the Douyin Android app through Appium.

    The controller only taps, types and swipes; it does not read any data
    itself. Tap coordinates and element IDs are hard-coded (presumably tuned
    for the device and app version configured above -- verify before reuse).
    """

    # IDs searched by parse_user_info() when the caller supplies none.
    DEFAULT_SHORT_IDS = ('922002900', 'XFeiXia')

    def __init__(self):
        """Open the Appium session and precompute the swipe coordinates."""
        # Driver configuration
        self.desired_caps = {
            'platformName': PLATFORM,
            'deviceName': DEVICE_NAME,
            'appPackage': APP_PACKAGE,
            'appActivity': APP_ACTIVITY
        }
        self.driver = webdriver.Remote(DRIVER_SERVER, self.desired_caps)
        self.wait = WebDriverWait(self.driver, TIMEOUT)
        # Ask the device for its window size once rather than three times.
        window_size = self.driver.get_window_size()
        width = window_size.get('width')
        height = window_size.get('height')
        # Vertical swipe along the horizontal centre, from 80% to 20% of the
        # screen height.
        self.flick_start_x = width / 2
        self.flick_start_y = height * 0.8
        self.flick_end_y = height * 0.2

    def prepare(self):
        """Walk through the first-launch dialogs.

        Waits 10 seconds, taps a fixed coordinate to accept the agreement,
        clicks ``android:id/button1`` on two consecutive permission dialogs,
        then swipes once.
        """
        print('开始进入准备阶段。。。')
        time.sleep(10)
        # Confirm the agreement
        print('开始点击好的。。。')
        self.driver.tap([(530, 1272)], 500)
        # Two permission prompts in a row; allow both
        for _ in range(2):
            self.wait.until(
                EC.presence_of_element_located((By.ID, 'android:id/button1'))
            ).click()
        time.sleep(3)
        # Swipe
        self.driver.swipe(
            self.flick_start_x, self.flick_start_y,
            self.flick_start_x, self.flick_end_y,
        )
        print('准备阶段已完成。。。')

    def parse_user_info(self, short_id_list=None):
        """Search each ID, open the first result and scroll its list to the end.

        Args:
            short_id_list: Iterable of IDs to type into the search box.
                Defaults to ``DEFAULT_SHORT_IDS`` (the originally hard-coded
                values) when omitted.
        """
        if short_id_list is None:
            short_id_list = self.DEFAULT_SHORT_IDS
        print('*' * 100)
        print('进入用户信息和视频爬取阶段。。。')
        time.sleep(3)
        # Tap search
        self.driver.tap([(984, 133)], 500)
        for short_id in short_id_list:
            # Search box
            search = self.wait.until(
                EC.presence_of_element_located((By.ID, 'com.ss.android.ugc.aweme:id/ai3'))
            )
            search.clear()
            search.set_text(short_id)
            # Skip IDs for which the page reports an empty result
            if '搜索结果为空' in self.driver.page_source:
                continue
            # First result row
            self.wait.until(
                EC.presence_of_element_located((By.ID, 'com.ss.android.ugc.aweme:id/gd6'))
            ).click()
            # Details
            self.wait.until(
                EC.presence_of_element_located((By.ID, 'com.ss.android.ugc.aweme:id/fwi'))
            ).click()
            # Keep swiping until the page shows the "no more content" marker
            while True:
                time.sleep(random.random())
                self.driver.swipe(
                    self.flick_start_x, self.flick_start_y,
                    self.flick_start_x, self.flick_end_y,
                )
                if '暂时没有更多了' in self.driver.page_source:
                    break
            # Go back
            self.driver.back()

    def parse_recommend(self):
        """Loop forever over the recommendation feed.

        Each round clicks element ``en0``, waits, taps a fixed coordinate and
        then swipes to the next page. This method never returns on its own;
        it ends only when an exception other than ``Exception`` subclasses
        (e.g. ``KeyboardInterrupt``) propagates.
        """
        while True:
            print('*' * 100)
            print('新的页面。。。。。')
            try:
                self.wait.until(
                    EC.presence_of_element_located((By.ID, 'com.ss.android.ugc.aweme:id/en0'))
                ).click()
                time.sleep(random.randint(7, 15))
                # Tap the fixed coordinate (79, 133); the original note
                # labels this step "tap search"
                self.driver.tap([(79, 133)], 500)
                time.sleep(random.randint(1, 3))
            except Exception as e:
                # Best effort: a failed lookup just moves on to the swipe
                print('定位失败', e)
            # Retry the swipe until it succeeds
            while True:
                try:
                    self.driver.swipe(
                        self.flick_start_x, self.flick_start_y,
                        self.flick_start_x, self.flick_end_y,
                    )
                except Exception as e:
                    print('滑动失败', e)
                else:
                    break
            print('处理结束。。。')

    def main(self):
        """Run the preparation phase and then the recommendation crawl.

        ``parse_user_info()`` is an alternative crawl that is not called
        here. The Appium session is always closed on the way out, including
        when the crawl is interrupted or fails.
        """
        try:
            self.prepare()
            self.parse_recommend()
        finally:
            # Release the Appium session so it is not left dangling on the server
            self.driver.quit()
# Script entry point: build the controller and start the crawl.
if __name__ == '__main__':
    controller = DouYinController()
    controller.main()
3.3、downloader.py 专门下载视频
from pymongo import MongoClient
import requests
import os
# Root directory under which DownloadFile.run() creates one sub-directory per
# user; relative to the current working directory.
VIDEO_DIR = './videos'
class DownloadFile:
    """Download every video recorded in the ``douyin.user_video`` collection."""

    def __init__(self):
        """Connect to MongoDB and bind the ``user_video`` collection."""
        self.mongo_client = MongoClient()
        self.user_video_collection = self.mongo_client['douyin']['user_video']

    def __del__(self):
        """Close the MongoDB client when the downloader is garbage-collected."""
        # __init__ may have failed before the attribute was assigned
        client = getattr(self, 'mongo_client', None)
        if client is not None:
            client.close()

    def download_file(self, file_path, download_url, timeout=30):
        """Stream ``download_url`` into ``file_path``.

        Args:
            file_path: Destination path; an existing file is overwritten.
            download_url: URL of the video to fetch.
            timeout: Timeout passed to ``requests.get`` so a stalled server
                cannot hang the run forever.

        Raises:
            requests.RequestException: On connection problems, timeouts, or
                a 4xx/5xx response status.
            OSError: If the destination file cannot be written.
        """
        print('正在下载:{}'.format(download_url))
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
        }
        # The with-block returns the streamed connection to the pool even
        # when writing fails part-way through.
        with requests.get(url=download_url, headers=headers, stream=True,
                          timeout=timeout) as r:
            # Check the status before opening the file so an error page is
            # never saved under an .mp4 name.
            r.raise_for_status()
            with open(file_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        print('下载完成:{}'.format(download_url))

    def run(self):
        """Download each stored video to ``VIDEO_DIR/<unique_id>_<short_id>/<aweme_id>.mp4``.

        A video whose download fails is reported and skipped so that one bad
        link does not abort the remaining downloads.
        """
        for user_video in self.user_video_collection.find():
            unique_id = user_video['unique_id']
            short_id = user_video['short_id']
            aweme_id = user_video['aweme_id']
            video_url = user_video['video_url']
            save_dir = '{}/{}_{}'.format(VIDEO_DIR, unique_id, short_id)
            # makedirs also creates VIDEO_DIR itself on the first run, which
            # a plain os.mkdir would fail on.
            os.makedirs(save_dir, exist_ok=True)
            file_path = "{}/{}.mp4".format(save_dir, aweme_id)
            try:
                self.download_file(file_path, video_url)
            except requests.RequestException as e:
                print('下载失败:{} ({})'.format(video_url, e))
# Script entry point: download every video currently stored in MongoDB.
if __name__ == '__main__':
    downloader = DownloadFile()
    downloader.run()
视频链接有失效时间,具体时长未知;数据保存到数据库后,应尽快启动下载视频程序。
代码仅供参考。