selenium抓取苏宁图书

使用 selenium 驱动真实浏览器执行页面中的 js,并让滚动条自动向下滑动以触发懒加载,从而抓取苏宁图书列表

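目标url:https://list.suning.com/0-502336-0.html?safp=d488778a.46602.advancedFilter.38&safc=cate.0.0&safpn=10006.502282#search-path (烹饪/美食【报价 品牌 口碑评价 测评 正品行货 限时低价 分期】 -苏宁易购)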

from selenium import webdriver
# 延迟模块
import time
# 解析html
from lxml import etree
# 页面加载等等的异常处理
from selenium.common.exceptions import TimeoutException
import re
import json

# Listing page for the cooking / food book category on suning.com.
LIST_URL = (
    'https://list.suning.com/0-502336-0.html'
    '?safp=d488778a.46602.advancedFilter.38&safc=cate.0.0'
    '&safpn=10006.502282#search-path'
)
# Destination file for the scraped records.
OUTPUT_PATH = '苏宁图书01.json'
# Captures everything that precedes the first '元' (yuan) in a label.
PRICE_PATTERN = re.compile(r'(.*?)元')


def extract_price(label):
    """Return the text before the first '元' in *label*.

    Returns None when the label contains no '元', so that one unexpected
    label cannot abort the whole run with an IndexError.
    """
    match = PRICE_PATTERN.search(label)
    return match.group(1) if match else None


def build_records(labels):
    """Pair every label with its extracted price.

    Labels from which no price can be extracted are skipped.

    Returns:
        A list of ``(label, price)`` tuples in the original order.
    """
    records = []
    for label in labels:
        price = extract_price(label)
        if price is not None:
            records.append((label, price))
    return records


def records_to_json(records):
    """Serialize ``(label, price)`` pairs as a JSON array of one-key objects.

    Each element keeps the ``{label: price}`` shape, but the output as a
    whole is a valid JSON document that ``json.load`` can read back.
    """
    payload = [{label: price} for label, price in records]
    return json.dumps(payload, ensure_ascii=False, indent=2)


def fetch_listing_html(url, scroll_steps=12, step_px=1000, pause=2):
    """Open *url* in Chrome, scroll down step by step and return the HTML.

    Args:
        url: Page to load.
        scroll_steps: Number of scroll operations to perform.
        step_px: Pixels added to the scroll offset on every step.
        pause: Seconds to wait after each scroll.

    Returns:
        The page source after the final scroll and pause.
    """
    chrome_obj = webdriver.Chrome()
    try:
        chrome_obj.set_page_load_timeout(5)
        try:
            chrome_obj.get(url)
        except TimeoutException:
            # Best effort: keep going with whatever has loaded so far.
            print('超时了......')

        chrome_obj.maximize_window()
        time.sleep(2)

        for step in range(1, scroll_steps + 1):
            chrome_obj.execute_script(
                f'document.documentElement.scrollTop={step * step_px}'
            )
            # Pause after the scroll (not before it) so that the final scroll
            # is also followed by a wait before the HTML is captured.
            time.sleep(pause)

        return chrome_obj.page_source
    finally:
        # Always shut down the browser and driver process, even on errors.
        chrome_obj.quit()


def main():
    """Scrape the listing page and save label/price records to OUTPUT_PATH."""
    str_data = fetch_listing_html(LIST_URL)

    # The aria-label of each product link is used as the record title.
    html_obj = etree.HTML(str_data)
    title_list = html_obj.xpath('//div[@class="res-info"]//a/@aria-label')
    print(title_list)
    print(len(title_list))

    records = build_records(title_list)
    price_list = [price for _, price in records]
    print(price_list)
    print(len(price_list))

    skipped = len(title_list) - len(records)
    if skipped:
        print(f'skipped {skipped} label(s) without a price')

    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        f.write(records_to_json(records))

    for label, price in records:
        print({label: price})


if __name__ == '__main__':
    main()

你可能感兴趣的:(爬虫,selenium,chrome,python)