爬虫09 —— Selenium 与 PyQuery 综合练习

import os,re,json,pymongo

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from pyquery import PyQuery as PQ
from config import *


# --- Module-level setup: cookie store, MongoDB connection, headless Chrome ---
cookies_path = "taobao_cookies.json"  # where the login session is persisted
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
chrome_options = Options()
chrome_options.add_argument('--headless')     # run Chrome without a window
chrome_options.add_argument('--disable-gpu')  # recommended with headless on Windows
# `options=` replaces the deprecated `chrome_options=` keyword (Selenium 3.8+).
browser = webdriver.Chrome(options=chrome_options)
wait = WebDriverWait(browser, 60)  # shared explicit wait, 60 s timeout

def search():
    """Open Taobao, restore saved cookies, submit KEYWORD, and scrape page 1.

    Returns:
        str: the pager's total-pages text (e.g. "共 100 页,").

    Retries itself on TimeoutException, e.g. when manual login is required
    and was not completed within the 60-second wait.
    """
    browser.get("https://www.taobao.com/")
    set_cookies()
    browser.refresh()  # reload so the freshly added cookies take effect
    try:
        # renamed from `input` to avoid shadowing the builtin
        search_box = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
        )
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button'))
        )
        search_box.send_keys(KEYWORD)
        submit.click()
        # If we were bounced somewhere other than the results page, the
        # session is not logged in — prompt for a manual login.
        if 'https://s.taobao.com/search?q=' not in browser.current_url:
            print("此时需要手动登录。。。请在60秒内完成登录。。。")
        wait.until(EC.url_contains('https://s.taobao.com/search?q='))
        print("登录成功!")
        save_cookies()  # persist the (possibly fresh) login session
        total = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
        get_products()
        return total.text
    except TimeoutException:
        print("登录超时!正在重新登录!")
        return search()


def next_page(page_number):
    """Jump to *page_number* via the pager's input box and scrape that page.

    Args:
        page_number (int): 1-based page index to load.

    Retries the same page on TimeoutException.
    """
    print("******************** 加载第 %d 页 ********************" % page_number)
    try:
        # renamed from `input` to avoid shadowing the builtin
        page_box = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
        page_box.clear()
        page_box.send_keys(page_number)
        submit.click()
        # wait until the highlighted page number matches the requested one
        wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > ul > li.item.active > span"), str(page_number)))
        get_products()
    except TimeoutException:
        print("本页超时:", page_number, "正在刷新。。。")
        return next_page(page_number)

def set_cookies():
    """Load cookies from ``cookies_path`` (if present) into the browser."""
    if not os.path.exists(cookies_path):
        return
    with open(cookies_path, "r") as f:
        saved_cookies = json.load(f)
    for item in saved_cookies:
        browser.add_cookie(item)
    print("设置cookies成功!")


def save_cookies():
    """Dump the browser's current cookies to ``cookies_path`` as JSON."""
    with open(cookies_path, "w") as f:
        json.dump(browser.get_cookies(), f)

def save_to_mongo(result):
    """Insert one product dict into the MongoDB collection; log the outcome."""
    try:
        inserted = db[MONGO_TABLE].insert_one(result)
        if inserted:
            print(result['title'], "is saved")
    except Exception as e:
        print("存储错误:", result)
        print("错误原因:", type(e), e)


def get_products():
    """Parse the current result page with PyQuery and store every product."""
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist')))
    doc = PQ(browser.page_source)
    for entry in doc('#mainsrp-itemlist .items .item').items():
        save_to_mongo({
            'image': entry.find('.pic img').attr("src"),
            'price': entry.find('.price').text(),
            # drop the trailing 3 characters of the deal-count text
            # (presumably the "人付款" suffix — TODO confirm against the page)
            'deal': entry.find('.deal-cnt').text()[:-3],
            'title': entry.find('.title').text(),
            'shop': entry.find('.shop').text(),
            'location': entry.find('.location').text(),
        })


def main():
    """Scrape page 1, read the total page count, then walk pages 2..N."""
    try:
        total_text = search()
        # raw string avoids the invalid "\d" escape warning; re.search
        # compiles internally, so the explicit re.compile was redundant
        match = re.search(r"\d+", total_text)
        if match:
            total_pages = int(match.group())
            for page in range(2, total_pages + 1):
                next_page(page)
    except Exception as e:
        print(e)
    finally:
        # quit() (not close()) also terminates the chromedriver process
        browser.quit()

if __name__ == '__main__':
    main()

配置文件 config.py 的内容如下:

MONGO_URL = 'localhost'  # MongoDB host passed to pymongo.MongoClient
MONGO_DB = 'taobao'  # database name
MONGO_TABLE = 'product'  # collection that receives scraped products

KEYWORD = '糕点'  # search keyword submitted to Taobao ("pastry")

你可能感兴趣的:(所谓爬虫)