使用 Python 爬虫(Selenium)爬取数据,并以 JSON 格式输出到文件

import json
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait


class Test:
    """Scrape exhibition entries from ``url`` with Selenium and dump them as JSON.

    NOTE: the whole scrape runs in the class body, i.e. as a side effect of
    defining this class. Running (or importing) this file launches Chrome,
    loads ``url``, collects one dict per ``div.ex-item`` element and writes
    the list to ``output_path``. The collected data stays available
    afterwards as the class attribute ``result``.

    Each collected dict has the keys ``name``, ``address``, ``exhi_time``
    and ``image`` (the ``src`` attribute of the entry's image).
    """

    url = 'http://www.test.com/hello'
    # Where the scraped data is written (UTF-8 encoded JSON).
    output_path = 'd:\\data.json'

    options = webdriver.ChromeOptions()
    # Do not load images, to speed up page loading.
    options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
    # Drop Chrome's "enable-automation" switch; intended to make it harder
    # for sites to detect that the browser is driven by Selenium.
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Optional: route traffic through a local proxy.
    # options.add_argument("--proxy-server=127.0.0.1:8080")
    # User-Agent sent with every request (desktop Chrome). Alternatives:
    ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    # ua = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)'
    # ua = 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'
    options.add_argument('user-agent=' + ua)

    driver = webdriver.Chrome(options=options)
    # Everything after the browser is started is wrapped in try/finally so
    # the browser and chromedriver processes are shut down even when a
    # lookup or the file write raises.
    try:
        driver.maximize_window()
        wait = WebDriverWait(driver, 10)
        driver.get(url)
        # Fixed pause to give the page's dynamic content time to render.
        time.sleep(3)
        # driver.refresh()

        result = []
        # find_elements(By.CSS_SELECTOR, ...) is used instead of the
        # find_element(s)_by_css_selector helpers, which were removed in
        # Selenium 4; this form works on Selenium 3 as well.
        items = driver.find_elements(By.CSS_SELECTOR, 'div.ex-item')
        for item in items:
            name = item.find_element(By.CSS_SELECTOR, 'div.ex-item-bottom div.title').text
            print(name)
            address = item.find_element(By.CSS_SELECTOR, 'div.ex-item-bottom p.f-toe').text
            print(address)
            exhi_time = item.find_element(By.CSS_SELECTOR, 'div.ex-item-bottom p.item-line').text
            print(exhi_time)
            image = item.find_element(By.CSS_SELECTOR, 'div.ex-item-top img').get_attribute('src')
            print(image)
            result.append({
                'name': name,
                'address': address,
                'exhi_time': exhi_time,
                'image': image,
            })

        # ensure_ascii=False keeps non-ASCII text (e.g. Chinese) readable in
        # the output file instead of \uXXXX escapes.
        with open(output_path, 'w', encoding='utf-8') as file:
            json.dump(result, file, indent=2, ensure_ascii=False)

        # Keep the browser window open briefly before shutting it down.
        time.sleep(10)
    finally:
        # quit() closes every window and terminates the driver process.
        driver.quit()

输出的 JSON 文件内容示例如下:

[
  {
    "name": "2020年上海国际展",
    "address": "上海世博展览馆",
    "exhi_time": "2020/03/24~03/26",
    "image": "https://show.test.com/show/imgs/202003/61a840a1373f45122d4e.jpg"
  },
  {
    "name": "中国国际产业展览会",
    "address": "上海市徐汇区漕宝路88号",
    "exhi_time": "2020/04/10~04/12",
    "image": "https://show.test.com/show/imgs/202003/289f27cb5513fad11.jpg"
  }
]

本文内容到此结束。

 

你可能感兴趣的:(python)