appium + 真机 获取微信公众号 信息

2018.11.30

昨天简单获取了一下微信朋友圈,算是对appium 又回顾了一遍,今天爬一波微信公众号,改天研究一下微信app的加密

直接上代码:

import time
from lxml import etree
from appium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


PLATFROM = "Android"
DEVIE_NAME = "AVY9KA9641202915"
APP_PACKAGE = "com.tencent.mm"
APP_ACTIVITY = ".ui.LauncherUI"
DEIVER_SERVER = "http://localhost:4723/wd/hub"
TIMROUT = 10  # 单位秒

FLICK_START_X = 300
FLICK_START_Y = 300
FLICK_DISTANCE = 700


class WX(object):
    def __init__(self):
        """
        初始化操作
        """
        # 驱动配置操作
        self.desired_caps = {
            "platformName": PLATFROM,
            "deviceName": DEVIE_NAME,
            "appPackage": APP_PACKAGE,
            "appActivity": APP_ACTIVITY,
            "noReset": True,
            "chromeOptions": {
                "androidProcess": "com.tencent.mm:toolsmp"  
            },
            "chromedriverExecutable": "C:\\Program Files (x86)\\Appium\\resources\\app\\node_modules\\appium\\node_modules\\appium-chromedriver\\chromedriver\\win\\chromedriver.exe",
            "recreateChromeDriverSessions": True  # 如果需要切换到H5页面 这一句就很重要

        }

        self.driver = webdriver.Remote(DEIVER_SERVER, self.desired_caps)
        self.wait = WebDriverWait(self.driver, TIMROUT)

    def login(self):
        """
        登陆操作
        :return:
        """

        # 登陆操作
        login = self.wait.until(EC.presence_of_element_located((By.ID, "com.tencent.mm:id/d75")))
        login.click()
        time.sleep(3)

        # 手机号输入
        phone = self.wait.until(EC.presence_of_element_located((By.ID, "com.tencent.mm:id/hz")))

        phone.set_text(USERNAME)
        time.sleep(1)

        # 点击下一步
        next = self.wait.until(EC.presence_of_element_located((By.ID, "com.tencent.mm:id/alr")))
        next.click()
        time.sleep(3)
        # 输入密码
        password = self.wait.until(EC.presence_of_element_located((By.ID, "com.tencent.mm:id/hz")))
        password.set_text(PASSWORD)
        time.sleep(1)

        # 点击登陆
        submit = self.wait.until(EC.presence_of_element_located((By.ID, "com.tencent.mm:id/alr")))
        submit.click()

        # 解决提示 点击否
        submit = self.wait.until(EC.presence_of_element_located((By.ID, "com.tencent.mm:id/an2")))
        submit.click()
        time.sleep(10)

    def enter(self, name):

        # 切换到联系人

        tab = self.wait.until(
            EC.presence_of_all_elements_located((By.XPATH, '//*[@resource-id="com.tencent.mm:id/po"]')))[1]
        tab.click()

        # 点击公众号
        tab = self.wait.until(
            EC.presence_of_all_elements_located((By.XPATH, """//*[@resource-id="com.tencent.mm:id/a2o"]""")))[0]
        tab.click()

        for i in range(50):

            # 因为只能点击当前页面的标签,所以需要循环向下滑动,找到需要点击的标签,在进行点击

            # 滑动查找需要爬取的公众号,并点击进入这个公众号
            tab_list = self.wait.until(
                EC.presence_of_all_elements_located((By.XPATH, """//*[@resource-id="com.tencent.mm:id/a6e"]""")))

            for tab in tab_list:

                name1 = tab.get_attribute("text")
                if name == name1:
                    tab.click()
                    # 点击右上角进历史页面

                    tab = self.wait.until(EC.presence_of_element_located(
                        (By.XPATH, """//*[@resource-id="com.tencent.mm:id/j1"]""")))
                    tab.click()

                    self.driver.swipe(FLICK_START_X, FLICK_START_Y + FLICK_DISTANCE, FLICK_START_X, FLICK_START_Y, 1000)
                    time.sleep(1)
                    self.driver.swipe(FLICK_START_X, FLICK_START_Y + FLICK_DISTANCE, FLICK_START_X, FLICK_START_Y, 1000)
                    tab = self.wait.until(EC.presence_of_element_located(
                        (By.XPATH, """//*[@resource-id="com.tencent.mm:id/avt"]""")))
                    tab.click()
                    return
            self.driver.swipe(FLICK_START_X, FLICK_START_Y + FLICK_DISTANCE, FLICK_START_X, FLICK_START_Y, 1000)
            time.sleep(1)
            # 点击右上角进入公众号的介绍页面

    def crawl(self, name):

        time.sleep(3)
        # contexts = self.driver.contexts
        # print(contexts)


        # 这里判断到达底部的条件是显示 已无更多
        for aa in range(100):
            # 滑动
            for ii in range(10):
                self.driver.swipe(FLICK_START_X, FLICK_START_Y + FLICK_DISTANCE, FLICK_START_X, FLICK_START_Y, 200)
                time.sleep(1)
            # 滑动结束以后,需要一段时间让手机反应
            self.driver.switch_to.context("WEBVIEW_com.tencent.mm:toolsmp")  # 切换到H5页面
            try:
                # 结束条件
                yiwgd = self.wait.until(
                    EC.presence_of_element_located((By.XPATH, """//*[@class=\"tips js_no_more_msg\"]""")))
                yiwgd = yiwgd.text
                if yiwgd == "已无更多":
                    # 这里是将所有 请求过来的json数据 进行添加到html页面了 ,然后我保存下来了
                    html = self.driver.page_source
                    html = html.replace("&", "&")
                    with open(name + ".html", "w", encoding="utf-8") as f:
                        f.write(html)
                    break
            except:
                pass

            # 可能会产生 kill -9 chromedriver 的命令
            time.sleep(1)
            self.driver.switch_to.context("NATIVE_APP")  # 切换回去 native 页面

            # 这一种解析方式太慢了 ,直接将获得的原文保存下来,然后在进行解析
            # f_file = open("微信公众号文件2.csv", "w", encoding="utf-8")
            #
            # # 这里不知道怎么不能先取大的div,再取div 下面的时间 和内容等等,下面这样写也是对的
            #
            # div_list = self.wait.until(
            #     EC.presence_of_all_elements_located(
            #         (By.XPATH, """//*[@class=\"weui_msg_card_list\"]//div[@class=\"weui_media_bd js_media\"]/h4""")))
            # time_list = self.wait.until(EC.presence_of_all_elements_located(
            #     (By.XPATH, """//*[@class=\"weui_msg_card_list\"]//div[@class=\"weui_media_bd js_media\"]/p[2]""")))
            #
            # for num, div in enumerate(div_list):
            #     info = div.text  # 文章内容
            #     info = str(info).replace(",", ".")  # 替换可能出现的逗号,避免csv文件出错
            #     hrefs = div.get_attribute("hrefs")  # 获取文章临时链接
            #     out_time = time_list[num].text
            #     f_file.write(str(info) + "," + out_time + "," + str(hrefs) + "\n")
            #     print(out_time)
            #     print(info)
            #     print(hrefs)

    def analysis(self, name):
        with open(name + ".html", "r", encoding="utf-8") as f:
            html = f.read()
        html = etree.HTML(html)
        div_list = html.xpath("""//*[@class="weui_msg_card_list"]//div[@class="weui_media_bd js_media"]""")
        f_file = open(name + ".csv", "w", encoding="utf-8")
        f_file.write("标题,文章发布时间,文章开头54个字符,文章url链接")
        for div in div_list:
            try:
                # 标题
                title = div.xpath("./h4/text()")[0]
                title = title.strip()
                print(title)

                # 文章发布时间
                out_time = div.xpath("""./p[@class="weui_media_extra_info"]/text()""")[0]
                out_time = out_time.replace("年", "-").replace("月", "-").replace("日", "")
                print(out_time)

                # 文章开头 54 chars
                info = div.xpath("""./p[@class="weui_media_desc"]/text()""")[0]
                print(info)

                # url链接
                href = div.xpath("""./h4/@hrefs""")[0]
                print(href)
                f_file.write(title + "," + out_time + "," + info + "," + href + "\n")
            except:
                print("文章违规被删除")



if __name__ == '__main__':
    """
    1.需要事先将所有需要爬取的公众号进行关注
    2.在进行爬取的时候 ,不要开chrome://inspect/#devices 这个页面,好像是占用chromedriver,造成程序异常
    3.这里对于爬取的过程,其实可以使用 requests 库来进行请求完成 ,只是现在对于每一个加密参数的生成不熟,
    后期进行修改,可以不使用手机完成内容的获取
    
    """
    wx = WX()

    name_list = ["头号人工智能", "人工智能"]
    for name in name_list:
        wx.enter(name)
        wx.crawl(name)
        wx.analysis(name)

 

你可能感兴趣的:(爬虫,appium)