2018.11.30
Yesterday I did a quick scrape of WeChat Moments, which doubled as an Appium refresher. Today I'm crawling WeChat official accounts; I'll dig into the WeChat app's encryption another day.
Straight to the code:
import time

from lxml import etree
from appium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

PLATFORM = "Android"
DEVICE_NAME = "AVY9KA9641202915"
APP_PACKAGE = "com.tencent.mm"
APP_ACTIVITY = ".ui.LauncherUI"
DRIVER_SERVER = "http://localhost:4723/wd/hub"
TIMEOUT = 10  # seconds
USERNAME = ""  # your WeChat phone number -- referenced by login() but never defined in the original post
PASSWORD = ""  # your WeChat password -- referenced by login() but never defined in the original post
FLICK_START_X = 300
FLICK_START_Y = 300
FLICK_DISTANCE = 700
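# Swipe gesture used throughout: drag from (FLICK_START_X, FLICK_START_Y + FLICK_DISTANCE)
# up to (FLICK_START_X, FLICK_START_Y), i.e. a 700-pixel upward flick that scrolls the list down.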
class WX(object):
    def __init__(self):
        """
        Initialization.
        """
        # Driver configuration
        self.desired_caps = {
            "platformName": PLATFORM,
            "deviceName": DEVICE_NAME,
            "appPackage": APP_PACKAGE,
            "appActivity": APP_ACTIVITY,
            "noReset": True,
            "chromeOptions": {
                "androidProcess": "com.tencent.mm:toolsmp"
            },
            "chromedriverExecutable": "C:\\Program Files (x86)\\Appium\\resources\\app\\node_modules\\appium\\node_modules\\appium-chromedriver\\chromedriver\\win\\chromedriver.exe",
            "recreateChromeDriverSessions": True  # essential if you need to switch into the H5 (webview) pages
        }
        self.driver = webdriver.Remote(DRIVER_SERVER, self.desired_caps)
        self.wait = WebDriverWait(self.driver, TIMEOUT)
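        # Hedged tip (not in the original post): once a webview has loaded, you can
        # confirm that chromedriver attached to the right process by listing the
        # available contexts, typically ['NATIVE_APP', 'WEBVIEW_com.tencent.mm:toolsmp']:
        # print(self.driver.contexts)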
    def login(self):
        """
        Log in.
        :return:
        """
        # Tap the login button
        login = self.wait.until(EC.presence_of_element_located((By.ID, "com.tencent.mm:id/d75")))
        login.click()
        time.sleep(3)
        # Enter the phone number
        phone = self.wait.until(EC.presence_of_element_located((By.ID, "com.tencent.mm:id/hz")))
        phone.set_text(USERNAME)
        time.sleep(1)
        # Tap "Next"
        next_btn = self.wait.until(EC.presence_of_element_located((By.ID, "com.tencent.mm:id/alr")))
        next_btn.click()
        time.sleep(3)
        # Enter the password
        password = self.wait.until(EC.presence_of_element_located((By.ID, "com.tencent.mm:id/hz")))
        password.set_text(PASSWORD)
        time.sleep(1)
        # Tap "Log in"
        submit = self.wait.until(EC.presence_of_element_located((By.ID, "com.tencent.mm:id/alr")))
        submit.click()
        # Dismiss the prompt by tapping "No"
        submit = self.wait.until(EC.presence_of_element_located((By.ID, "com.tencent.mm:id/an2")))
        submit.click()
        time.sleep(10)
    def enter(self, name):
        # Switch to the Contacts tab
        tab = self.wait.until(
            EC.presence_of_all_elements_located((By.XPATH, '//*[@resource-id="com.tencent.mm:id/po"]')))[1]
        tab.click()
        # Tap "Official Accounts"
        tab = self.wait.until(
            EC.presence_of_all_elements_located((By.XPATH, '//*[@resource-id="com.tencent.mm:id/a2o"]')))[0]
        tab.click()
        for i in range(50):
            # Only labels on the current screen can be tapped, so keep swiping down
            # until the target account appears, then tap it to enter the account.
            tab_list = self.wait.until(
                EC.presence_of_all_elements_located((By.XPATH, '//*[@resource-id="com.tencent.mm:id/a6e"]')))
            for tab in tab_list:
                name1 = tab.get_attribute("text")
                if name == name1:
                    tab.click()
                    # Tap the top-right corner to open the account's profile (intro) page
                    tab = self.wait.until(EC.presence_of_element_located(
                        (By.XPATH, '//*[@resource-id="com.tencent.mm:id/j1"]')))
                    tab.click()
                    self.driver.swipe(FLICK_START_X, FLICK_START_Y + FLICK_DISTANCE, FLICK_START_X, FLICK_START_Y, 1000)
                    time.sleep(1)
                    self.driver.swipe(FLICK_START_X, FLICK_START_Y + FLICK_DISTANCE, FLICK_START_X, FLICK_START_Y, 1000)
                    # Open the article history page
                    tab = self.wait.until(EC.presence_of_element_located(
                        (By.XPATH, '//*[@resource-id="com.tencent.mm:id/avt"]')))
                    tab.click()
                    return
            self.driver.swipe(FLICK_START_X, FLICK_START_Y + FLICK_DISTANCE, FLICK_START_X, FLICK_START_Y, 1000)
            time.sleep(1)
    def crawl(self, name):
        time.sleep(3)
        # contexts = self.driver.contexts
        # print(contexts)
        # The stop condition for reaching the bottom is the "已无更多" ("no more") marker.
        for aa in range(100):
            # Swipe down repeatedly
            for ii in range(10):
                self.driver.swipe(FLICK_START_X, FLICK_START_Y + FLICK_DISTANCE, FLICK_START_X, FLICK_START_Y, 200)
                time.sleep(1)
            # After swiping, give the phone a moment to catch up
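            # Hedged addition (not in the original post): the webview context is not
            # always registered right away, so polling driver.contexts before switching
            # avoids a missing-context error on slower devices.
            for _ in range(10):
                if "WEBVIEW_com.tencent.mm:toolsmp" in self.driver.contexts:
                    break
                time.sleep(1)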
            self.driver.switch_to.context("WEBVIEW_com.tencent.mm:toolsmp")  # switch into the H5 page
            try:
                # Stop condition
                yiwgd = self.wait.until(
                    EC.presence_of_element_located((By.XPATH, '//*[@class="tips js_no_more_msg"]')))
                yiwgd = yiwgd.text
                if yiwgd == "已无更多":
                    # By now every JSON response has been appended into the HTML page,
                    # so just save the whole page source.
                    html = self.driver.page_source
                    html = html.replace("&amp;", "&")
                    with open(name + ".html", "w", encoding="utf-8") as f:
                        f.write(html)
                    break
            except Exception:
                pass
            # This may trigger a `kill -9 chromedriver` command
            time.sleep(1)
            self.driver.switch_to.context("NATIVE_APP")  # switch back to the native page
        # This parsing approach was too slow, so instead the raw page is saved and parsed afterwards:
        # f_file = open("微信公众号文件2.csv", "w", encoding="utf-8")
        #
        # # Oddly I couldn't grab the outer div first and then pull the time and content
        # # out of it, but writing it as below also works:
        #
        # div_list = self.wait.until(
        #     EC.presence_of_all_elements_located(
        #         (By.XPATH, '//*[@class="weui_msg_card_list"]//div[@class="weui_media_bd js_media"]/h4')))
        # time_list = self.wait.until(EC.presence_of_all_elements_located(
        #     (By.XPATH, '//*[@class="weui_msg_card_list"]//div[@class="weui_media_bd js_media"]/p[2]')))
        #
        # for num, div in enumerate(div_list):
        #     info = div.text  # article text
        #     info = str(info).replace(",", ".")  # replace stray commas so the CSV does not break
        #     hrefs = div.get_attribute("hrefs")  # temporary link to the article
        #     out_time = time_list[num].text
        #     f_file.write(str(info) + "," + out_time + "," + str(hrefs) + "\n")
        #     print(out_time)
        #     print(info)
        #     print(hrefs)
    def analysis(self, name):
        with open(name + ".html", "r", encoding="utf-8") as f:
            html = f.read()
        html = etree.HTML(html)
        div_list = html.xpath('//*[@class="weui_msg_card_list"]//div[@class="weui_media_bd js_media"]')
        f_file = open(name + ".csv", "w", encoding="utf-8")
        f_file.write("title,publish time,first 54 characters,article url\n")
        for div in div_list:
            try:
                # Title
                title = div.xpath("./h4/text()")[0]
                title = title.strip()
                print(title)
                # Publish time
                out_time = div.xpath('./p[@class="weui_media_extra_info"]/text()')[0]
                out_time = out_time.replace("年", "-").replace("月", "-").replace("日", "")
                print(out_time)
                # First 54 characters of the article
                info = div.xpath('./p[@class="weui_media_desc"]/text()')[0]
                print(info)
                # Article URL
                href = div.xpath("./h4/@hrefs")[0]
                print(href)
                f_file.write(title + "," + out_time + "," + info + "," + href + "\n")
            except Exception:
                print("Article was taken down for a violation")
        f_file.close()
if __name__ == '__main__':
    """
    1. Follow every account you want to crawl beforehand.
    2. While crawling, do not keep the chrome://inspect/#devices page open; it seems
       to hold on to chromedriver and makes the program misbehave.
    3. The crawling itself could be done with the requests library instead; I'm just
       not yet familiar with how each encrypted parameter is generated. Once that is
       worked out, the content could be fetched without a phone at all.
    """
    wx = WX()
    name_list = ["头号人工智能", "人工智能"]
    for name in name_list:
        wx.enter(name)
        wx.crawl(name)
        wx.analysis(name)
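For reference, here is roughly what point 3 above could look like. This is only a minimal sketch, assuming the history list is served by the mp.weixin.qq.com/mp/profile_ext endpoint and that __biz, uin, key and pass_ticket have already been captured from the app's traffic with a proxy such as mitmproxy; these parameter names come from traffic captures of this era and may well have changed since:

import requests

# Hypothetical values: all four must be sniffed from real WeChat traffic
# (e.g. with mitmproxy), and they expire quickly -- `key` in particular.
PARAMS = {
    "action": "getmsg",
    "__biz": "<captured __biz>",
    "f": "json",
    "offset": 0,
    "count": 10,
    "uin": "<captured uin>",
    "key": "<captured key>",
    "pass_ticket": "<captured pass_ticket>",
}

def fetch_history_page(offset):
    """Fetch one page of an account's article history as JSON (sketch)."""
    PARAMS["offset"] = offset
    resp = requests.get(
        "https://mp.weixin.qq.com/mp/profile_ext",
        params=PARAMS,
        headers={"User-Agent": "Mozilla/5.0 ... MicroMessenger"},  # a WeChat-like UA
        timeout=10,
    )
    # In captures from this period the article list sits in the
    # "general_msg_list" field of the returned JSON.
    return resp.json()

If this works, each page of the history list comes back as plain JSON and the phone plus Appium loop above becomes unnecessary.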