python关于抖音app达人信息及视频的抓取

一.使用到的环境以及工具

python(环境,本贴用的是3.6)
appium(下载Appium-Desktop)下载地址: https://github.com/appium/appium-desktop/releases
夜神模拟器
mitmproxy

二.环境配置需要注意的地方

appium-server已经不再更新,取而代之的是appium-desktop,可以在GitHub上下载并安装。
需要安装的依赖环境比较多:包括JDK、Node.js、Android SDK。
安装Android SDK的时候,需要下载一个平台版本,并且将tools、platforms、platform-tools目录以及SDK安装的总路径添加到环境变量。
appium启动之后:
python关于抖音app达人信息及视频的抓取_第1张图片

三.直接上代码

1.app.py

# NOTE: this snippet is shown without its imports. It relies on `random`,
# `time`, `traceback` and on a `webdriver` module exposing `Remote`
# (presumably `from appium import webdriver` -- confirm for your client version).

# Appium server endpoint the script connects to.
APPIUM_SERVER_URL = 'http://localhost:4723/wd/hub'

# Resource id of the element that is clicked on the video page.
# NOTE(review): presumably the author's avatar, which opens the profile page;
# the id is specific to the app build that was used -- confirm on your device.
AVATAR_ID = "com.ss.android.ugc.aweme:id/a2s"

# XPath of the text view whose text is compared with "作品 0" below.
# NOTE(review): presumably the "works" counter on the profile page -- confirm.
# The leading tab is part of the original locator and is kept as-is.
WORKS_COUNT_XPATH = (
    "\t/hierarchy/android.widget.FrameLayout/android.widget.LinearLayout"
    "/android.widget.FrameLayout/android.widget.FrameLayout"
    "/android.widget.HorizontalScrollView/android.widget.LinearLayout"
    "/android.widget.LinearLayout/android.widget.FrameLayout"
    "/android.widget.LinearLayout[1]/android.widget.LinearLayout"
    "/android.widget.FrameLayout[2]/android.widget.LinearLayout"
    "/android.widget.RelativeLayout[1]/android.widget.TextView"
)


def _open_author_profile(driver):
    """Click the avatar element, retrying up to 10 times.

    After the third failed attempt the screen is additionally swiped from
    right to left before the click is retried; if that retry fails as well,
    the function gives up silently.
    """
    for attempt in range(10):
        try:
            driver.find_element_by_id(AVATAR_ID).click()
            time.sleep(random.uniform(0.8, 1.1))
            return
        except Exception:
            if attempt > 2:
                y = random.randint(300, 800)
                # Horizontal swipe from the right part of the screen to the left edge.
                driver.swipe(random.randint(300, 680), y, random.randint(70, 100), y,
                             random.randint(180, 200))
                try:
                    driver.find_element_by_id(AVATAR_ID).click()
                except Exception:
                    return
            time.sleep(1)
            print("等待视频页面加载")


def _wait_for_profile_data(driver):
    """Poll the counter text (at most 5 times) until it is no longer "作品 0"."""
    for _ in range(5):
        try:
            text = driver.find_element_by_xpath(WORKS_COUNT_XPATH).get_attribute("text")
            if text != "作品 0":
                print("退出循环")
                return
            time.sleep(1)
            print("还没返回内容")
        except Exception:
            print("baocuole")
            time.sleep(0.5)
            traceback.print_exc()
            try:
                # The counter was not found; try the avatar click once more.
                driver.find_element_by_id(AVATAR_ID).click()
                time.sleep(random.uniform(0.8, 1.1))
            except Exception:
                # Deliberately best-effort: the next poll decides what happens.
                pass


def _browse_feed(driver):
    """Endlessly visit the avatar target of each video, then move to the next video."""
    while True:
        _open_author_profile(driver)

        print("进入循环前")
        _wait_for_profile_data(driver)
        print("已经在循环外面")

        # Horizontal swipe from the left edge towards the right
        # (the original comment labelled this step "swipe left").
        y = random.randint(300, 800)
        driver.swipe(random.randint(70, 100), y, random.randint(600, 680), y,
                     random.randint(180, 200))
        time.sleep(random.uniform(0.5, 0.9))

        # Vertical swipe from the bottom of the screen towards the top
        # (the original comment labelled this step "swipe down").
        x = random.randint(300, 600)
        driver.swipe(x, random.randint(910, 1010), random.randint(x - 1, x + 1),
                     random.randint(170, 290), random.randint(180, 200))
        time.sleep(0.3)


def main():
    """Run the crawler forever, opening a fresh Appium session after every failure."""
    while True:
        # Capabilities for the emulator at 127.0.0.1:62001. The original post
        # also showed a variant for a physical device ("vivo X20") that only
        # set platformName, deviceName, appPackage and appActivity.
        desired_caps = {
            'platformName': 'Android',
            'deviceName': '127.0.0.1:62001',
            'platformVersion': '4.4.2',
            'appPackage': 'com.ss.android.ugc.aweme',
            'appActivity': '.main.MainActivity',
        }

        driver = webdriver.Remote(APPIUM_SERVER_URL, desired_caps)
        try:
            time.sleep(1)

            # Two-point tap at fixed coordinates.
            # NOTE(review): purpose not visible here (the commented-out code it
            # replaced clicked an element by XPath) -- confirm on the device.
            driver.tap(([314, 385], [365, 492]), 200)
            time.sleep(1)

            # Vertical swipe from the bottom of the screen towards the top.
            x = random.randint(300, 600)
            driver.swipe(x, random.randint(910, 1010), random.randint(x - 1, x + 1),
                         random.randint(170, 290), random.randint(180, 200))
            print("走到这里")
            time.sleep(3)

            # Swipe down to refresh.
            driver.swipe(562, 451, 562, 895, 200)
            time.sleep(1.5)

            try:
                _browse_feed(driver)
            except Exception:
                # Any automation error ends this session; the outer loop then
                # starts a new one. KeyboardInterrupt is no longer swallowed.
                traceback.print_exc()
        finally:
            # Release the Appium session before a new one is created, otherwise
            # every restart leaks a session on the server.
            try:
                driver.quit()
            except Exception:
                traceback.print_exc()


if __name__ == "__main__":
    main()

2.mitm.py

class AddHeader:
    """mitmproxy addon that intercepts user-profile requests and stores the profile.

    * ``response`` prints the body of one specific sojson.com page.
    * ``request`` answers any URL containing ``baidu`` with the youdao
      translation page, and for profile URLs (containing ``/?user_id=`` and
      ``aweme-eagle``) re-fetches the URL itself with ``requests``, extracts
      the user fields, hands them to ``BaoCun`` and short-circuits the flow
      with the fetched body.

    The names ``requests``, ``json``, ``http``, ``ctx``, ``headers``,
    ``v_headers``, ``proxies`` and ``BaoCun`` are expected to be defined at
    module level; they are not part of this snippet.
    """

    # Upper bound for re-fetching a profile URL. The original loop retried
    # forever, which blocks the proxy hook for as long as the upstream is down.
    MAX_FETCH_ATTEMPTS = 10
    # Per-request timeout in seconds; without one a single request can hang forever.
    FETCH_TIMEOUT = 10

    def response(self, flow):
        """Print the decoded response body of the sojson obfuscator page."""
        if "https://www.sojson.com/auth_v_1_0/js/js_obfuscator.shtml" in flow.request.url:
            print("555555555555555555555555555555")
            print(flow.response.content.decode())

    # The annotation is a string so that defining the class does not require
    # the name ``mitmproxy`` to be bound at import time.
    def request(self, flow: "mitmproxy.http.HTTPFlow"):
        """Intercept outgoing requests; see the class docstring for the rules."""
        url = flow.request.url

        if "baidu" in url:
            res = requests.get("http://fanyi.youdao.com/", headers=v_headers)
            flow.response = http.HTTPResponse.make(200, res.content, {"Content-Type": "text/html"})

        if "/?user_id=" in url and "aweme-eagle" in url:
            ctx.log.info("请求详情页地址:" + flow.request.url)

            response = self._fetch_profile(url)
            if response is None:
                # Give up and leave the flow untouched so it is forwarded normally.
                ctx.log.info("giving up on profile request: " + url)
                return

            item = self._build_item(json.loads(response.text)["user"])
            print(item)
            BaoCun(item)

            # Answer the client with the body that was fetched above.
            flow.response = http.HTTPResponse.make(200, response.content, {"Content-Type": "text/html"})

    def _fetch_profile(self, url):
        """Fetch *url* through the configured proxies.

        Returns the first response whose body does not contain the abuyun.com
        marker, or ``None`` after ``MAX_FETCH_ATTEMPTS`` failed attempts.
        NOTE(review): the marker presumably identifies the proxy provider's
        error page -- confirm.
        """
        for _ in range(self.MAX_FETCH_ATTEMPTS):
            try:
                response = requests.get(url, headers=headers, proxies=proxies,
                                        timeout=self.FETCH_TIMEOUT)
            except requests.RequestException:
                print("请求主页超时")
                continue
            if "https://www.abuyun.com/" not in response.text:
                return response
        return None

    @staticmethod
    def _build_item(user):
        """Map the ``user`` object of the profile response to the stored record.

        ``short_id`` falls back to ``unique_id`` when it is ``""`` or ``"0"``;
        ``gender`` is ``None`` when absent. Every other field is required and
        raises ``KeyError`` when missing.
        """
        short_id = user["short_id"]
        if short_id in ("", "0"):
            short_id = user["unique_id"]

        return {
            # Nickname
            "nickName": user["nickname"],
            # Account id
            "shortId": short_id,
            # Gender
            "gender": user.get("gender"),
            # Number of works
            "awemeCount": user["aweme_count"],
            # Birthday
            "birthday": user["birthday"],
            # City
            "city": user["city"],
            # Number of feed posts
            "dongtaiCount": user["dongtai_count"],
            # Number of liked items
            "favoritingCount": user["favoriting_count"],
            # Number of followers
            "followerCount": user["mplatform_followers_count"],
            # Number of followed accounts
            "followingCount": user["following_count"],
            # Number of likes received
            "totalFavorited": user["total_favorited"],
            # Bio with newlines removed. The key spelling ("singnature") is kept
            # because downstream storage may depend on it.
            "singnature": user["signature"].replace("\n", ""),
            # Bound phone
            "bindPhone": user["bind_phone"],
        }




# Addon instances that mitmproxy loads from this script (`-s mitm.py`).
addons = [
    AddHeader()
]

启动mitmproxy的时候指定文件 用来拦截请求拿到数据

mitmweb -s mitm.py

四.运行

1.启动夜神模拟器 appium以及mitmproxy
运行app.py

注意:在连接模拟器之前,需要先输入命令 nox_adb.exe connect 127.0.0.1:62001 才会连接上,不然会提示找不到安卓设备

数据:
python关于抖音app达人信息及视频的抓取_第2张图片
python关于抖音app达人信息及视频的抓取_第3张图片

你可能感兴趣的:(python爬虫)