Selenium自动化测试工具——以爬取京东商品信息为例

需要导入的包(其中 selenium 和 pyquery 需要通过 pip 安装,其余为 Python 标准库)

import re
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
import csv
import time

完整代码

# Shared Chrome WebDriver session, created when the module is loaded and used
# by every function in this script.
browser = webdriver.Chrome()
# Explicit-wait helper bound to that session; each wait.until() call gives up
# with TimeoutException after 10 seconds.
wait = WebDriverWait(browser, 10)
def search(max_retries=5):
    """Open the JD home page, submit the keyword search and return the pager text.

    Reads the search term from the module-level ``keywords`` variable and
    drives the shared ``browser`` / ``wait`` objects.

    Args:
        max_retries: Number of additional attempts made after a
            ``TimeoutException`` before giving up. Negative values are
            treated as 0.

    Returns:
        The text of the ``#J_bottomPage > span.p-skip > em:nth-child(1)``
        element; ``main()`` extracts the total page count from it.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    attempts = max(max_retries, 0) + 1
    # Bounded loop instead of unbounded recursion: a permanently failing page
    # now surfaces the timeout instead of ending in RecursionError.
    for attempt in range(1, attempts + 1):
        print('正在搜索')
        try:
            browser.get("https://www.jd.com/")
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#key'))
            )
            search_box.send_keys(keywords)
            submit = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, '#search > div > div.form > button')))
            # Fixed pause before clicking; presumably lets the page settle
            # after typing -- TODO confirm it is still needed.
            time.sleep(3)
            submit.click()
            total = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > em:nth-child(1)')))
            return total.text
        except TimeoutException:
            if attempt == attempts:
                raise


def next_page(page_number, max_retries=5):
    """Jump to result page ``page_number`` and scrape it with ``get_products()``.

    Types the page number into the pager's jump box, clicks the jump link,
    waits until the highlighted pager entry shows that number, then calls
    ``get_products()``.

    Args:
        page_number: The result page to jump to.
        max_retries: Number of additional attempts made after a
            ``TimeoutException`` before giving up. Negative values are
            treated as 0.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    attempts = max(max_retries, 0) + 1
    # Bounded loop instead of unbounded recursion: a page that never loads
    # now surfaces the timeout instead of ending in RecursionError.
    for attempt in range(1, attempts + 1):
        print(f'正在翻第{page_number}页')
        try:
            print('定位到跳转页数')
            page_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > input'))
            )
            print('定位到跳转按钮,确保可点击')
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > a')))
            page_box.clear()
            page_box.send_keys(page_number)
            submit.click()
            # Only scrape once the pager reports that the requested page is
            # the current one.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, '#J_bottomPage > span.p-num > a.curr'), str(page_number)))
            get_products()
            return
        except TimeoutException:
            if attempt == attempts:
                raise

def get_products():
    """Parse the current result page and write one CSV row per product.

    Waits for the ``#J_goodsList > ul`` element, parses
    ``browser.page_source`` with PyQuery, and for every ``.gl-item`` prints
    and writes ``[image, price, title, shop, comment]`` through the
    module-level ``writer``.

    Raises:
        TimeoutException: If the goods list does not appear within the
            timeout configured on ``wait``.
    """
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#J_goodsList > ul')))
    doc = pq(browser.page_source, parser="html")
    for item in doc('#J_goodsList .gl-item').items():
        img = item('.gl-i-wrap .p-img a img')
        # NOTE(review): JD is believed to keep the URL of not-yet-loaded
        # images in ``data-lazy-img`` -- confirm against the live markup.
        src = img.attr('src') or img.attr('data-lazy-img')
        if not src:
            # Previously a missing attribute was written as "http:None".
            image = ''
        elif src.startswith('//'):
            # Protocol-relative URL: add the scheme.
            image = "http:" + src
        else:
            image = str(src)
        price = item.find('.p-price').text()
        title = item.find('.p-name').text().strip('\n')
        shop = item.find('div span a').text()
        comment = item.find('.p-commit a').text()
        product = [image, price, title, shop, comment]
        print(product)
        writer.writerow(product)


def main():
    """Prompt for a keyword, run the search and write results to ``result.csv``.

    Publishes ``keywords``, ``f`` and ``writer`` as module-level globals
    because ``search()`` and ``get_products()`` read them from module scope.

    Raises:
        ValueError: If no page count can be found in the text returned by
            ``search()``.
    """
    # ``keywords`` must be global: search() reads it from module scope, and a
    # local assignment here would leave it undefined there (NameError).
    global keywords, f, writer
    keywords = input('请输入关键字:')
    total_text = search()
    match = re.search(r'(\d+)', total_text)
    if match is None:
        raise ValueError(f'could not find a page count in {total_text!r}')
    total = int(match.group(1))
    # The context manager closes the file even if a page raises.
    with open('result.csv', mode='w', encoding='gbk', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['image', 'price', 'title', 'shop', 'comment'])
        # NOTE(review): the result page shown right after search() (page 1)
        # is not scraped -- the get_products() call for it was commented out
        # in the original. Confirm whether that is intended.
        for i in range(2, total + 1):
            next_page(i)

# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()

你可能感兴趣的:(Python,selenium,京东,自动化测试)