Python crawler: scraping JD.com with Selenium and PyQuery

First, a few words about Selenium: it is a browser automation testing framework. And... that's all the introduction you get here; if you want the details, go read the documentation yourself.

Here it is, the docs: https://www.seleniumhq.org/docs/
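
To get a feel for what Selenium does before diving into the crawler, here is a minimal sketch (it assumes Chrome and a matching ChromeDriver are installed and on your PATH; it is not part of the crawler itself):

from selenium import webdriver

# Start Chrome, open JD's home page, print the page title, then shut everything down
browser = webdriver.Chrome()
browser.get('https://www.jd.com/')
print(browser.title)
browser.quit()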

Step 1: Requirements analysis

1) Search for a product automatically

2) Extract the data from the search results


Step 2: The code

The module that does the work:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException,StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from pyquery import PyQuery as pq
from config import *
import pymongo
import re

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# Create the browser instance
chrome_options = Options()
# Run Chrome in headless (no UI) mode
chrome_options.add_argument('--headless')
browser = webdriver.Chrome(chrome_options=chrome_options)
# Chrome with a visible window:
# browser = webdriver.Chrome()


# Selenium versions after 3.0 no longer support PhantomJS
# browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)

# Set the browser window size
# browser.set_window_size(1400,900)

# Explicit wait: give elements up to 3 seconds to appear
wait = WebDriverWait(browser,3)
def search():
    try:
        browser.get('https://www.jd.com/')
        # Locate the search input box
        # presence_of_element_located: waits until the element is present in the DOM, then returns it
        inputs = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR,'#key'))
        )
        # Locate the search button
        # element_to_be_clickable: waits until the element is visible and enabled, i.e. clickable
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR,'#search > div > div.form > button'))
        )
        # Type the search keyword into the input box ('篮球' = basketball)
        inputs.send_keys('篮球')
        # Click the search button
        submit.click()
        # Grab the element showing the total number of result pages
        total = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR,'#J_bottomPage > span.p-skip > em:nth-child(1)'))
        )
        produce()
        return total.text
    except TimeoutException:
        return search()
    except StaleElementReferenceException as n:
        return search()

def next_page(page_num):
    try:
        # Locate the page-number input box at the bottom of the results page
        inputs = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR,'#J_bottomPage > span.p-skip > input'))
            )
        # Locate the button that confirms the page jump
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR,'#J_bottomPage > span.p-skip > a'))
        )
        # Clear the input box first
        inputs.clear()
        # Then type the target page number
        inputs.send_keys(page_num)
        
        submit.click()
        # text_to_be_present_in_element: waits until the element's text contains the expected string (here, the current page number)
        wait.until(
            EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR,'#J_bottomPage > span.p-num > a.curr'),str(page_num)
            )
        )
        # html = browser.page_source
        # print(html)
        produce()
    except TimeoutException:
        return next_page(page_num)
    except StaleElementReferenceException as n:
        return next_page(page_num)

def produce():
    try:
        wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR,'#J_goodsList .gl-warp .gl-item'))
        )
        # Grab the rendered page source
        html = browser.page_source
        # Parse the page with PyQuery
        doc = pq(html)
        # .items() yields each matched product node as a PyQuery object
        items = doc('#J_goodsList .gl-warp .gl-item').items()
        for item in items:
            product = {
                'image':item.find('.p-img img').attr('src'),
                'price':item.find('.p-price').text(),   
                'title':item.find('.p-name').text(),
                'commit':item.find('.p-commit').text(),         
                'shop':item.find('.p-shop').text()
            }
            # print(product)
            save_to_mongo(product)
    except TimeoutException:
        return produce()

def save_to_mongo(result):
    try:
        # insert_one() is the current PyMongo API (the old insert() was removed in PyMongo 4)
        db[MONGO_TABLE].insert_one(result)
        print('Saved to MongoDB:', result)
    except Exception:
        print('Failed to save:', result)


def main():
    totals = search()
    totals = int(re.compile(r'(\d+)').search(totals).group(1))
    for c in range(2,totals+1):
        next_page(c)
    # quit() closes the browser window and shuts down the driver session
    browser.quit()

if __name__ == '__main__':
    main()
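
A note on main(): search() already scrapes page 1 (it calls produce() before returning) and hands back the text of the total-page-count element, so the regex only needs to pull the first run of digits out of that text before the loop walks pages 2 through the last one. A quick check of that extraction (the '100' here is just an assumed example of what the element text might look like):

import re

# Pull the first run of digits out of the page-count text and turn it into an int
total_pages = int(re.compile(r'(\d+)').search('100').group(1))
print(total_pages)  # 100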

Create a config.py file with the MongoDB settings used for storing the data:

# MongoDB connection host
MONGO_URL = 'localhost'
# Database name
MONGO_DB = 'jingdong'
# Collection name
MONGO_TABLE = 'product'
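
After a run, it is worth a quick sanity check that the documents actually landed in MongoDB. A minimal sketch using the same settings (count_documents requires PyMongo 3.7 or newer):

import pymongo
from config import MONGO_URL, MONGO_DB, MONGO_TABLE

client = pymongo.MongoClient(MONGO_URL)
collection = client[MONGO_DB][MONGO_TABLE]

# How many products were saved, and what does one document look like?
print(collection.count_documents({}))
print(collection.find_one())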

 
