Python+Selenium自动抓取Ajax渲染网页

Python+Selenium自动抓取Ajax渲染网页_第1张图片
image.png

@前言:
工作闲暇之余自学Python,想找个项目练练手,于是瞄准了我们客户阿里给我们下发任务的网站,往常同事都是手动登录网站,手动复制粘贴Case内容到Excel。Kanshan震惊,都9102年了,怎么还要做这么低效(无脑)的工作,于是自学python尝试自动化获取case内容并且保存到本地,想一想,能有多难???

然鹅:人生第一次认真爬的网页有万万个没想到...

@问题和方法

  • 万万没想到①:不是所有的网站都随便逛的,遇到这种拦路虎怎么办,盘他? AVMS网址

    Python+Selenium自动抓取Ajax渲染网页_第2张图片
    image.png

  • 方法①:先登录网站,拿到cookies,放到headers里面请求网页,发现网页是Ajax渲染的,而且提交方式为post,此路不通。

  • 方法②:selenium模拟登录后获取cookies,保存到本地,每次使用时再调用。先上模拟登录的代码:
    @模拟登录

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def login():
    """Log in to the task page and return the names used for the Excel output.

    Relies on the module-level ``driver``, ``WAIT`` and ``url`` that the
    ``__main__`` section creates before calling this function.

    Returns:
        tuple: ``(task_name, case_name)`` text scraped from the page after
        logging in. The caller unpacks exactly these two values.
    """
    driver.get(url)  # load the page
    # Locate the user-name input.
    username = WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#exampleInputUser")))
    # Locate the password input.
    password = WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#exampleInputPassword")))
    # Locate the login button.
    submit = WAIT.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="login-button"]')))
    username.send_keys("XXXX")  # the quoted placeholders stand for the user name and password
    password.send_keys("XXXX")
    submit.click()  # simulate the mouse click
    driver.refresh()  # reload the page
    # The caller does `task_name, case_name = login()`, so both values must be
    # returned; without this the unpacking fails with a TypeError on None.
    task_name = WAIT.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[2]/div/div/div[4]/div[2]/div/div[2]/div/table/tbody/tr/td[2]/span'))).text
    case_name = WAIT.until(EC.presence_of_element_located((By.XPATH, '//*[@id="app"]/div/div/div[2]/table/tr[1]/td[2]/span'))).text
    return task_name, case_name

if __name__ == '__main__':
    # Collect the two user inputs first: the task to scrape and the page count.
    task_id = input("请输入需要抓取的task_id:")
    pages_string = input("请输入需要抓取得页数:")
    pages = int(pages_string)
    url = 'http://www.aliavms.cn:7001/tsmanager/index.html#/detail?task_id=%s' % task_id

    # Headless Chrome alternative (kept for reference):
    #options = webdriver.ChromeOptions()
    #options.add_argument('headless')
    #options.add_argument('disable-gpu')
    #driver = webdriver.Chrome(options=options)
    driver = webdriver.Firefox()
    WAIT = WebDriverWait(driver, timeout=10)

    # The two names are used for the Excel file name and the sheet name.
    task_name, case_name = login()

(下面的获取cookies、保存、读取后来都没有用到)

import os
import json

def get_cookies():
    """Return the cookies of the current browser session.

    Reads them from the module-level ``driver``. The original version stored
    the result in a local variable and dropped it, so callers got nothing.

    Returns:
        The value of ``driver.get_cookies()``.
    """
    return driver.get_cookies()

def save_cookies(cookies):
    """Write ``cookies`` as JSON to ``cookies.txt`` in the working directory."""
    target = open("cookies.txt", "w")
    with target:
        json.dump(obj=cookies, fp=target)

def read_cookie():
    """Return the saved cookies as a ``{name: value}`` dict.

    Cookies are loaded from ``cookies.txt`` when that file exists. Otherwise
    they are fetched from the module-level ``driver``, written to
    ``cookies.txt`` via ``save_cookies`` and then returned.

    Returns:
        dict: cookie name mapped to cookie value.
    """
    # Use one file name for both the existence check and the read. The
    # original checked 'cookies.text' but opened 'cookies.txt', so the cached
    # file was never found.
    cookie_file = "cookies.txt"
    if os.path.exists(cookie_file):
        with open(cookie_file, "r") as fp:
            cookies = json.load(fp)
    else:
        # Persist what we fetch; the original fallback never wrote the file
        # and then recursed into itself without end.
        cookies = driver.get_cookies()
        save_cookies(cookies)
    return {cookie['name']: cookie['value'] for cookie in cookies}
  • 如果是静态网页,那就很简单了

import requests

# Request headers for fetching a static page with an already logged-in session.
# NOTE(review): a Cookie header is a credential; do not publish a live value.
headers = {
    # Pretend to be a browser.
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/73.0.3683.75 Chrome/73.0.3683.75 Safari/537.36',
    # Put the cookie obtained after logging in here.
    'Cookie': 'eda38d470a662ef3606390ac3b84b86f9; Hm_lvt_f1d3b035c559e31c390733e79e080736=1553503899; biihu__user_login=omvZVatKKSlcXbJGmXXew9BmqediJ4lzNoYGzLQjTR%2Fjw1wOz3o4lIacanmcNncX1PsRne5tXpE9r1sqrkdhAYQrugGVfaBICYp8BAQ7yBKnMpAwicq7pZgQ2pg38ZzFyEZVUvOvFHYj3cChZFEWqQ%3D%3D; Hm_lpvt_f1d3b035c559e31c390733e79e080736=1553505597',
}

session = requests.Session()
# Attach the headers to the session. The original built `headers` but never
# passed it on, so the request went out without the User-Agent and Cookie.
session.headers.update(headers)
url = "https://......."
response = session.get(url)
print(response.text)
  • 万万没想到②:driver.page_source抓取的html只有部分代码,因为是基于Ajax渲染的(虽然kanshan很菜,但是kanshan不会这么容易屈服的...)最终使用selenium+xpath定位获取到需要抓取的每页项数。
# Collect the <tr> elements under the result table body to learn how many
# items the current page lists. find_element(By...) replaces the
# find_element_by_* helpers, which Selenium 4.3 removed; the By form also
# works on older releases.
table_body = driver.find_element(By.XPATH, '/html/body/div[2]/div/div/div[8]/div[2]/div/table/tbody')
data = table_body.find_elements(By.TAG_NAME, 'tr')
length = len(data) - 1
  • 接着抓取每一项的内容。每一项都是单独的一个页面,所以流程是:点击抓取项,跳转到新的页面。因为第一次玩爬虫,这里也踩到坑了:虽然浏览器另外打开了一个标签页,但driver的定位还停留在主页面,获取到的仍然是主页面的信息。所以要先切换到新标签页的窗口句柄,做如下操作:
import time

def new_page(button1):
    """Open one item in its own tab, scrape it, then return to the main page.

    Args:
        button1: XPath of the item link to click on the main page.

    Raises:
        selenium.common.exceptions.TimeoutException: if the link never becomes
            clickable or no new tab opens within the ``WAIT`` timeout.
    """
    main_handle = driver.current_window_handle
    handles_before = driver.window_handles
    link = WAIT.until(EC.element_to_be_clickable((By.XPATH, button1)))
    link.click()
    # Wait until the click really produced a new tab. Blindly taking
    # window_handles[-1] could pick the main page, and the close() below
    # would then shut the main window.
    WAIT.until(EC.new_window_is_opened(handles_before))
    time.sleep(2)  # give the new page enough time to load
    child_handle = [h for h in driver.window_handles if h not in handles_before][-1]
    driver.switch_to.window(child_handle)  # move the driver to the child tab
    try:
        save_to_excel()
        time.sleep(1)
    finally:
        # Always close the child tab and restore focus, even when scraping
        # fails, so later lookups run against the main page again.
        driver.close()
        driver.switch_to.window(main_handle)

  • 万万没想到③:紧接着遇到新的问题:子页面里面有框架iframe的嵌套...
    最终解决办法是先定位到iframe并切换进去,读取内容后跳出,再进入下一个iframe,读取后再跳出,所以Kanshan还写了个循环。
    # Local import so this fragment does not depend on an edit elsewhere.
    from selenium.common.exceptions import TimeoutException

    # Visit the three iframes in turn and append one text entry per iframe to
    # tc_data (assumes the surrounding code initialised tc_data as a list).
    for i in range(1, 4):
        # XPath of the i-th iframe.
        button2 = "/html/body/div/div[2]/div/div[4]/div[%d]/div[2]/div/div/div/iframe" % i
        iframe = WAIT.until(EC.presence_of_element_located((By.XPATH, button2)))
        driver.switch_to.frame(iframe)  # enter the iframe
        try:
            try:
                data = WAIT.until(EC.presence_of_all_elements_located((By.TAG_NAME, 'p')))
            except TimeoutException:
                # WAIT.until never returns an empty list; an iframe without
                # <p> elements shows up as a timeout instead.
                data = []
            # Each <p> goes on its own line. An empty iframe yields "", so
            # tc_data stays a list with one entry per iframe.
            tc_data.append("".join(item.text + '\n' for item in data))
        finally:
            # Leave the iframe in every case; otherwise the next iframe
            # lookup would run inside the current one.
            driver.switch_to.default_content()

  • 万万没想到④:此外还遇到本身内容为空的情况,程序会报错,解决方法如下:
    # Best-effort read of the case category: any failure leaves it blank so
    # the run can continue.
    try:
        locator = (By.XPATH, '/html/body/div/div[2]/div/div[2]/table/tr[1]/td[4]/span')
        element = WAIT.until(EC.presence_of_element_located(locator))
        tc_class = element.text
    except Exception:
        tc_class = ""

  • 万万没想到⑤:接着是保存到Excel的部分:这里我写了个判断(if/else),文件已存在时先读取再复制,这样就能在Excel中追加新的sheet,而不是覆盖原文件。
import xlwt
import xlrd
from xlutils.copy import copy as xl_copy

    # Open the workbook named after the task. An existing file is read and
    # copied so that a new sheet is appended instead of overwriting the file.
    book_path = u'%s.xls' % task_name
    if os.path.exists(book_path):
        read_book = xlrd.open_workbook(book_path, formatting_info=True)
        write_book = xl_copy(read_book)
    else:
        # 'utf-8' fixes the original typo 'utft-8', which is not a valid codec.
        write_book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    # Both branches add the sheet the same way, so do it once.
    sheet = write_book.add_sheet(case_name, cell_overwrite_ok=True)

    # Header row.
    for column, title in enumerate(
            ('用例名称', '用例描述', '用例步骤', 'Pass/Fail标准', '用例类别', '备注说明', '结果')):
        sheet.write(0, column, title)

    n = 1  # next data row to write; row 0 holds the header
    tc_num = 1
  • 接上部分
def save_to_excel():
    """Scrape the case page the driver is on and write it to row ``n``.

    Reads the case name, category, comment and the text of three iframes,
    writes them to columns 0-5 of the module-level ``sheet`` and then
    advances the global row counter ``n``.
    """
    global n  # the row counter is shared with the rest of the script
    # Local import keeps this block self-contained.
    from selenium.common.exceptions import TimeoutException, WebDriverException

    tc_name = WAIT.until(EC.presence_of_element_located((By.XPATH, '/html/body/div/div[2]/div/div[2]/table/tr[1]/td[2]/span'))).text
    print("爬取第%d项 tc_name: %s" % (n, tc_name))
    # Category and comment are optional: a browser-side failure leaves them
    # blank, while programming errors are no longer silently swallowed.
    try:
        tc_class = WAIT.until(EC.presence_of_element_located((By.XPATH, '/html/body/div/div[2]/div/div[2]/table/tr[1]/td[4]/span'))).text
    except WebDriverException:
        tc_class = ""
    try:
        tc_comment = WAIT.until(EC.presence_of_element_located((By.XPATH, '/html/body/div/div[2]/div/div[2]/div[2]/div[2]'))).text
    except WebDriverException:
        tc_comment = ""

    tc_data = []
    for i in range(1, 4):
        button2 = "/html/body/div/div[2]/div/div[4]/div[%d]/div[2]/div/div/div/iframe" % i
        iframe = WAIT.until(EC.presence_of_element_located((By.XPATH, button2)))
        driver.switch_to.frame(iframe)
        try:
            try:
                data = WAIT.until(EC.presence_of_all_elements_located((By.TAG_NAME, 'p')))
            except TimeoutException:
                # WAIT.until never returns an empty list; an iframe without
                # <p> elements shows up as a timeout instead.
                data = []
            # One entry per iframe, "" when empty. The original rebound
            # tc_data to a str here, which broke the later append/indexing.
            tc_data.append("".join(item.text + '\n' for item in data))
        finally:
            # Always leave the iframe so the next lookup starts at the top.
            driver.switch_to.default_content()

    tc_description, tc_step, tc_criteria = tc_data

    sheet.write(n, 0, tc_name)
    sheet.write(n, 1, tc_description)
    sheet.write(n, 2, tc_step)
    sheet.write(n, 3, tc_criteria)
    sheet.write(n, 4, tc_class)
    sheet.write(n, 5, tc_comment)

    n += 1

  • 全部代码:
# -*- coding:utf-8 -*-
# Copyright (c) 2019, KanShan. All rights reserved.
# Author:KanShan
#Description:输入阿里avms的task_id和页面数,自动抓取Case_info并保存...
import time
import xlwt
import xlrd
import os
from xlutils.copy import copy as xl_copy
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def login():
    """Log in through the page form and return ``(task_name, case_name)``.

    Uses the module-level ``driver``, ``WAIT`` and ``url``. The main section
    uses the two returned strings for the workbook file name and sheet name.
    """
    def present(by, selector):
        # Block until the element exists in the DOM, then hand it back.
        return WAIT.until(EC.presence_of_element_located((by, selector)))

    driver.get(url)
    user_box = present(By.CSS_SELECTOR, "#exampleInputUser")
    pass_box = present(By.CSS_SELECTOR, "#exampleInputPassword")
    login_button = WAIT.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="login-button"]')))
    user_box.send_keys("XXXX")
    pass_box.send_keys("XXXX")
    login_button.click()
    driver.refresh()
    # Name of the task.
    task_name = present(By.XPATH, '/html/body/div[2]/div/div/div[4]/div[2]/div/div[2]/div/table/tbody/tr/td[2]/span').text
    # Name of the case inside the task.
    case_name = present(By.XPATH, '//*[@id="app"]/div/div/div[2]/table/tr[1]/td[2]/span').text
    return task_name, case_name


def save_to_excel():
    """Scrape the case page the driver is on and write it to row ``n``.

    Reads the case name, category, comment and the text of three iframes,
    writes them to columns 0-5 of the module-level ``sheet`` and then
    advances the global row counter ``n``.
    """
    global n  # the row counter is shared with page_detail()
    # Local import keeps this block self-contained.
    from selenium.common.exceptions import TimeoutException, WebDriverException

    tc_name = WAIT.until(EC.presence_of_element_located((By.XPATH, '/html/body/div/div[2]/div/div[2]/table/tr[1]/td[2]/span'))).text
    print("爬取第%d项 tc_name: %s" % (n, tc_name))
    # Category and comment are optional: a browser-side failure leaves them
    # blank, while programming errors are no longer silently swallowed.
    try:
        tc_class = WAIT.until(EC.presence_of_element_located((By.XPATH, '/html/body/div/div[2]/div/div[2]/table/tr[1]/td[4]/span'))).text
    except WebDriverException:
        tc_class = ""
    try:
        tc_comment = WAIT.until(EC.presence_of_element_located((By.XPATH, '/html/body/div/div[2]/div/div[2]/div[2]/div[2]'))).text
    except WebDriverException:
        tc_comment = ""

    tc_data = []
    for i in range(1, 4):
        button2 = "/html/body/div/div[2]/div/div[4]/div[%d]/div[2]/div/div/div/iframe" % i
        iframe = WAIT.until(EC.presence_of_element_located((By.XPATH, button2)))
        driver.switch_to.frame(iframe)
        try:
            try:
                data = WAIT.until(EC.presence_of_all_elements_located((By.TAG_NAME, 'p')))
            except TimeoutException:
                # WAIT.until never returns an empty list; an iframe without
                # <p> elements shows up as a timeout instead.
                data = []
            # One entry per iframe, "" when empty. The original rebound
            # tc_data to a str here, which broke the later append/indexing.
            tc_data.append("".join(item.text + '\n' for item in data))
        finally:
            # Always leave the iframe so the next lookup starts at the top.
            driver.switch_to.default_content()

    tc_description, tc_step, tc_criteria = tc_data

    sheet.write(n, 0, tc_name)
    sheet.write(n, 1, tc_description)
    sheet.write(n, 2, tc_step)
    sheet.write(n, 3, tc_criteria)
    sheet.write(n, 4, tc_class)
    sheet.write(n, 5, tc_comment)

    n += 1

def new_page(button1):
    """Open one item in its own tab, scrape it, then return to the main page.

    Args:
        button1: XPath of the item link to click on the main page.

    Raises:
        selenium.common.exceptions.TimeoutException: if the link never becomes
            clickable or no new tab opens within the ``WAIT`` timeout.
    """
    main_handle = driver.current_window_handle
    handles_before = driver.window_handles
    link = WAIT.until(EC.element_to_be_clickable((By.XPATH, button1)))
    link.click()
    # Wait until the click really produced a new tab. Blindly taking
    # window_handles[-1] could pick the main page, and the close() below
    # would then shut the main window.
    WAIT.until(EC.new_window_is_opened(handles_before))
    time.sleep(2)  # give the new page enough time to load
    child_handle = [h for h in driver.window_handles if h not in handles_before][-1]
    driver.switch_to.window(child_handle)
    try:
        save_to_excel()
        time.sleep(1)
    finally:
        # Always close the child tab and restore focus, even when scraping
        # fails, so page_detail() keeps working against the main page.
        driver.close()
        driver.switch_to.window(main_handle)

def page_detail():
    """Walk the rows of the current result page and scrape each of them.

    For every row index the item link is opened through ``new_page``. When
    that fails, the row is read as a test result instead: its text is printed
    and written to column 6 of the most recently written sheet row.
    """
    # find_element(By...) replaces the find_element_by_* helpers, which
    # Selenium 4.3 removed; the By form also works on older releases.
    table_body = driver.find_element(By.XPATH, '/html/body/div[2]/div/div/div[8]/div[2]/div/table/tbody')
    rows = table_body.find_elements(By.TAG_NAME, 'tr')
    row_count = len(rows) - 1
    remaining = row_count  # budget of rows still to process
    for index in range(2, row_count + 2):
        if remaining <= 0:
            break
        button1 = ('//*[@id="app"]/div/div/div[8]/div[2]/div/table/tbody/tr[%d]/td[3]/div/div/a' % index)
        try:
            new_page(button1)
        except Exception:
            # Deliberately broad: any failure to open the item means this row
            # is handled as a result row instead.
            button2 = (
                    '/html/body/div[2]/div/div/div[8]/div[2]/div/table/tbody/tr[%d]/td[2]/table/tr/td[2]/div/div/span' % index)
            result = WAIT.until(EC.presence_of_element_located((By.XPATH, button2))).text
            print('抓取测试结果:%s' % result)
            # The result belongs to the case written last, i.e. row n - 1.
            sheet.write(n - 1, 6, result)
            # NOTE(review): the budget drops by 3 here; presumably a result row
            # contributes extra nested <tr> elements to the count - confirm.
            remaining -= 3
        else:
            remaining -= 1

def main():
    """Scrape ``pages`` result pages, then shut the browser down.

    Uses the module-level ``pages``, ``task_name``, ``case_name``, ``driver``
    and ``WAIT``.

    Raises:
        SystemExit: with status 1 when ``pages`` is smaller than 1.
    """
    print("爬取Task_name: %s" % task_name)
    print("爬取Case_name: %s" % case_name)
    if pages < 1:
        print("页数输入错误,请输入大于等于1的整数")
        # Shut the browser down before leaving; the original left it running.
        driver.quit()
        # exit() is a helper meant for the interactive shell; raise directly.
        raise SystemExit(1)

    try:
        print("爬取第1页")
        page_detail()
        if pages > 1:
            print("爬取完成:第1页")
        for page in range(2, pages + 1):
            print("爬取第%d页" % page)
            # NOTE(review): from page 7 on the 7th pager entry is clicked;
            # presumably the pager shows a sliding window - confirm.
            pager_index = 7 if page > 6 else page + 1
            next_page = WAIT.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="app"]/div/div/div[8]/div[2]/div/div[2]/div/div/ul/li[%d]' % pager_index)))
            next_page.click()
            time.sleep(3)
            page_detail()
            print("爬取完成:第%d页" % page)
    finally:
        # quit() ends the whole session, on success and on error alike.
        driver.quit()
    print("爬取完成:共%d页,保存中" % pages)

if __name__ == '__main__':
    # Ask which task to scrape and how many result pages to walk through.
    task_id = input("请输入需要抓取的task_id:")
    url = 'http://www.aliavms.cn:7001/tsmanager/index.html#/detail?task_id=' + task_id
    pages_string = input("请输入需要抓取得页数:")
    pages = int(pages_string)
    # Headless Chrome alternative (kept for reference):
    #chrome_options = webdriver.ChromeOptions()
    #chrome_options.add_argument('headless')
    #chrome_options.add_argument('disable-gpu')
    #driver = webdriver.Chrome(options=chrome_options)
    driver = webdriver.Firefox()
    WAIT = WebDriverWait(driver, 10)
    task_name, case_name = login()

    # Open the workbook named after the task. An existing file is read and
    # copied so that a new sheet is appended instead of overwriting the file.
    book_path = u'%s.xls' % task_name
    if os.path.exists(book_path):
        read_book = xlrd.open_workbook(book_path, formatting_info=True)
        write_book = xl_copy(read_book)
    else:
        # 'utf-8' fixes the original typo 'utft-8', which is not a valid codec.
        write_book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    # Both branches add the sheet the same way, so do it once.
    sheet = write_book.add_sheet(case_name, cell_overwrite_ok=True)

    # Header row.
    for column, title in enumerate(
            ('测试用例名称', '测试用例描述', '测试用例步骤', '测试Pass/Fail标准', '测试用例类别', '备注说明', '测试结果')):
        sheet.write(0, column, title)

    n = 1  # next data row to write; row 0 holds the header
    tc_num = 1
    main()
    # Save the workbook as an .xls file.
    write_book.save(book_path)

你可能感兴趣的:(Python+Selenium自动抓取Ajax渲染网页)