爬取抖音APP数据最新版

1、方案

Charles抓包、appium自动化、mitmproxy处理响应数据、MongoDB数据库

2、准备条件

2.1、手机需要root,并且将Charles、mitmproxy的证书导入手机的系统证书中,可参考:https://www.jianshu.com/p/70208fbc8e43

不这样做,打开抖音APP会没网

2.2、手机配置代理,自行百度

3、附上代码

3.1、spider.py 爬虫代码

from pymongo import MongoClient
from mitmproxy import ctx
import json

# URL prefixes of the two API endpoints whose responses are parsed;
# response() compares them with the request URL via str.startswith().
USER_INFO_URL = 'https://api3-core-c-lf.amemv.com/aweme/v1/user/profile/other'
USER_VIDEO_URL = 'https://api3-core-c-lf.amemv.com/aweme/v1/aweme/post'

# MongoClient() is created with no arguments, i.e. with PyMongo's default connection settings.
mongo_client = MongoClient()
# Collections of the 'douyin' database that the two parse_* functions upsert into.
user_info_collection = mongo_client['douyin']['user_info']
user_video_collection = mongo_client['douyin']['user_video']


def parse_user_info(content):
    """Parse a user-profile response body and upsert the profile into MongoDB.

    The stored document is keyed by ``unique_id`` + ``short_id``; an existing
    document with the same key is updated, otherwise a new one is inserted.

    :param content: JSON text of the response; must contain a ``user`` object.
    :raises json.JSONDecodeError: if ``content`` is not valid JSON.
    :raises KeyError: if one of the expected fields is missing.
    """
    data = json.loads(content)
    print(data)
    user = data['user']
    item = {
        'unique_id': user['unique_id'],
        'short_id': user['short_id'],
        'signature': user['signature'],
        'nickname': user['nickname'],
        'total_favorited': user['total_favorited'],
        'following_count': user['following_count'],
        'follower_count': user['follower_count'],
        'aweme_count': user['aweme_count'],
        'dongtai_count': user['dongtai_count'],
        'favoriting_count': user['favoriting_count'],
        'avatar_list': user['avatar_300x300']['url_list'],
    }
    # Collection.update() was deprecated in PyMongo 3 and removed in PyMongo 4;
    # update_one(..., upsert=True) is the equivalent single-document upsert.
    user_info_collection.update_one(
        {'unique_id': item['unique_id'], 'short_id': item['short_id']},
        {'$set': item},
        upsert=True,
    )
    ctx.log.info(str(item))


def parse_user_video(content):
    """Parse a user-posts response body and upsert every video into MongoDB.

    Each video is stored as its own document keyed by ``unique_id`` +
    ``short_id`` + ``aweme_id``.

    :param content: JSON text of the response; videos are read from ``aweme_list``.
    :raises json.JSONDecodeError: if ``content`` is not valid JSON.
    :raises KeyError: if a video entry lacks one of the expected fields.
    """
    data = json.loads(content)
    # A missing or null ``aweme_list`` is treated as "no videos" instead of crashing the hook.
    for aweme in data.get('aweme_list') or []:
        author = aweme['author']
        url_list = aweme['video']['play_addr']['url_list']
        if not url_list:
            # Nothing downloadable for this entry; skip it rather than raise IndexError.
            continue
        item = {
            'desc': aweme['desc'],
            'aweme_id': aweme['aweme_id'],
            'create_time': aweme['create_time'],
            'unique_id': author['unique_id'],
            'short_id': author['short_id'],
            'nickname': author['nickname'],
            'video_url': url_list[0],
        }
        # Collection.update() was deprecated in PyMongo 3 and removed in PyMongo 4;
        # update_one(..., upsert=True) is the equivalent single-document upsert.
        user_video_collection.update_one(
            {'unique_id': item['unique_id'], 'short_id': item['short_id'], 'aweme_id': item['aweme_id']},
            {'$set': item},
            upsert=True,
        )
        ctx.log.info(str(item))


def response(flow):
    """Dispatch a captured response to the matching parser.

    Written as a mitmproxy ``response`` event hook. The body is only decoded
    when the request URL starts with one of the two known endpoint prefixes,
    so unrelated traffic (images, video streams, ...) is never decoded.

    :param flow: flow object exposing ``request.url`` and ``response.text``.
    """
    url = flow.request.url
    if url.startswith(USER_INFO_URL):
        parse_user_info(flow.response.text)
    elif url.startswith(USER_VIDEO_URL):
        parse_user_video(flow.response.text)

3.2、controller.py 自动化

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from appium import webdriver
import random
import time

# Values passed as the 'platformName' / 'deviceName' / 'appPackage' / 'appActivity'
# capabilities when DouYinController opens its Appium session.
PLATFORM = 'Android'
DEVICE_NAME = 'Redmi_Note_4X'
APP_PACKAGE = 'com.ss.android.ugc.aweme'
APP_ACTIVITY = '.splash.SplashActivity'
# Address of the Appium server handed to webdriver.Remote().
DRIVER_SERVER = 'http://localhost:4723/wd/hub'
# Timeout handed to WebDriverWait for every explicit wait.
TIMEOUT = 30


class DouYinController:
    """Automate the Douyin Android app through an Appium session."""

    # Accounts searched by parse_user_info() when the caller supplies none.
    DEFAULT_SHORT_IDS = ('922002900', 'XFeiXia')

    def __init__(self):
        """
        Open the Appium session and pre-compute the swipe gesture coordinates.
        """
        # Driver capabilities
        self.desired_caps = {
            'platformName': PLATFORM,
            'deviceName': DEVICE_NAME,
            'appPackage': APP_PACKAGE,
            'appActivity': APP_ACTIVITY
        }
        self.driver = webdriver.Remote(DRIVER_SERVER, self.desired_caps)
        self.wait = WebDriverWait(self.driver, TIMEOUT)
        # Ask the device for its window size once instead of three times, and
        # keep whole-pixel coordinates (true division / float multiplication
        # would otherwise hand floats to the gesture API).
        window_size = self.driver.get_window_size()
        width = window_size['width']
        height = window_size['height']
        self.flick_start_x = width // 2
        self.flick_start_y = int(height * 0.8)
        self.flick_end_y = int(height * 0.2)

    def _swipe_up(self):
        """Swipe along the vertical centre line from 80% to 20% of the screen height."""
        self.driver.swipe(self.flick_start_x, self.flick_start_y, self.flick_start_x, self.flick_end_y)

    def prepare(self):
        """
        Dismiss the first-launch dialogs and perform one initial swipe.
        :return:
        """
        print('开始进入准备阶段。。。')
        time.sleep(10)
        # Accept the agreement. The tap position is hard-coded —
        # NOTE(review): presumably tuned for the DEVICE_NAME screen; confirm on other devices.
        print('开始点击好的。。。')
        self.driver.tap([(530, 1272)], 500)
        # Two consecutive permission dialogs: click "allow" on each.
        for _ in range(2):
            self.wait.until(EC.presence_of_element_located((By.ID, 'android:id/button1'))).click()
        time.sleep(3)
        # Swipe
        self._swipe_up()
        print('准备阶段已完成。。。')

    def parse_user_info(self, short_id_list=None):
        """
        Search each given Douyin ID, open the first result and scroll through its videos.

        :param short_id_list: iterable of Douyin IDs to search for; defaults to
            DEFAULT_SHORT_IDS when omitted.
        :return:
        """
        if short_id_list is None:
            short_id_list = self.DEFAULT_SHORT_IDS

        print('*' * 100)
        print('进入用户信息和视频爬取阶段。。。')
        time.sleep(3)
        # Tap the search entry (hard-coded position).
        self.driver.tap([(984, 133)], 500)
        for short_id in short_id_list:
            # Search box
            search = self.wait.until(EC.presence_of_element_located((By.ID, 'com.ss.android.ugc.aweme:id/ai3')))
            search.clear()
            search.set_text(short_id)

            # Skip IDs for which the page reports an empty search result.
            if '搜索结果为空' in self.driver.page_source:
                continue

            # First row of the results
            self.wait.until(EC.presence_of_element_located((By.ID, 'com.ss.android.ugc.aweme:id/gd6'))).click()
            # Details
            self.wait.until(EC.presence_of_element_located((By.ID, 'com.ss.android.ugc.aweme:id/fwi'))).click()

            # Keep swiping until the "no more content" marker shows up in the page source.
            while True:
                time.sleep(random.random())
                self._swipe_up()
                if '暂时没有更多了' in self.driver.page_source:
                    break
            # Go back
            self.driver.back()

    def parse_recommend(self):
        """
        Walk the recommended feed forever: open an element on the current page,
        wait, tap, then swipe to the next page. This method never returns.
        :return:
        """
        while True:
            print('*' * 100)
            print('新的页面。。。。。')
            try:
                self.wait.until(EC.presence_of_element_located((By.ID, 'com.ss.android.ugc.aweme:id/en0'))).click()
                time.sleep(random.randint(7, 15))
                # NOTE(review): the original comment labels this tap "click search",
                # the same label as the (984, 133) tap in parse_user_info — confirm
                # which control (79, 133) actually hits.
                self.driver.tap([(79, 133)], 500)
                time.sleep(random.randint(1, 3))
            except Exception as e:
                # Best effort: a page whose element cannot be located is skipped.
                print('定位失败', e)
            # Retry the swipe until it succeeds.
            while True:
                try:
                    self._swipe_up()
                except Exception as e:
                    print('滑动失败', e)
                    # Pause before retrying so a failing driver is not hammered in a busy loop.
                    time.sleep(1)
                else:
                    break
            print('处理结束。。。')

    def main(self):
        """Run the preparation phase, then crawl the recommended feed."""
        self.prepare()
        # self.parse_user_info()
        self.parse_recommend()


# Script entry point: build the controller (which opens the Appium session) and run it.
if __name__ == '__main__':
    DouYinController().main()

3.3、downloader.py 专门下载视频

from pymongo import MongoClient
import requests
import os

# Root directory under which DownloadFile.run() creates one sub-directory per user.
VIDEO_DIR = './videos'


class DownloadFile:
    """Download every video stored in the ``douyin.user_video`` collection into VIDEO_DIR."""

    # Seconds requests waits for the connection / for the next bytes before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.mongo_client = MongoClient()
        self.user_video_collection = self.mongo_client['douyin']['user_video']

    def __del__(self):
        # __init__ may have failed before the client was assigned.
        client = getattr(self, 'mongo_client', None)
        if client is not None:
            client.close()

    def download_file(self, file_path, download_url):
        """Stream ``download_url`` to ``file_path``.

        :param file_path: destination file; overwritten if it exists.
        :param download_url: URL of the video to fetch.
        :raises requests.RequestException: on connection errors, timeouts or a
            non-2xx status (raised before the file is opened, so an error page
            is never saved as a video).
        """
        print('正在下载:{}'.format(download_url))
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
        }
        # The context manager releases the streamed connection even on failure.
        with requests.get(url=download_url, headers=headers, stream=True,
                          timeout=self.REQUEST_TIMEOUT) as r:
            r.raise_for_status()
            with open(file_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        print('下载完成:{}'.format(download_url))

    def run(self):
        """Download all stored videos to ``VIDEO_DIR/<unique_id>_<short_id>/<aweme_id>.mp4``.

        A video whose download fails is reported and skipped so the remaining
        ones are still fetched.
        """
        for user_video in self.user_video_collection.find():
            unique_id = user_video['unique_id']
            short_id = user_video['short_id']
            aweme_id = user_video['aweme_id']
            video_url = user_video['video_url']

            save_dir = '{}/{}_{}'.format(VIDEO_DIR, unique_id, short_id)
            # makedirs also creates VIDEO_DIR itself; os.mkdir failed when it was missing.
            os.makedirs(save_dir, exist_ok=True)
            file_path = "{}/{}.mp4".format(save_dir, aweme_id)
            try:
                self.download_file(file_path, video_url)
            except requests.RequestException as e:
                print('下载失败:{} {}'.format(video_url, e))


# Script entry point: download every video recorded in MongoDB.
if __name__ == '__main__':
    DownloadFile().run()

视频链接有失效时间,具体时长未知;数据保存到数据库后,应尽快启动视频下载程序。

代码仅供参考。

你可能感兴趣的:(python)