selenium爬取淘宝商品

对于《Python3网络爬虫开发实战》中selenium爬取淘宝商品信息代码进行修改。

原代码

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import quote
from pyquery import PyQuery as pq
from pymongo import MongoClient

# Launch a Chrome browser via Selenium (requires chromedriver on PATH).
browser = webdriver.Chrome()
# Shared explicit wait for all element lookups, 10-second timeout.
wait = WebDriverWait(browser, 10)
# Search keyword to crawl on Taobao.
KEYWORD = 'iPad'

def index_page(page):
    """
    Load one page of Taobao search results and scrape it.

    :param page: 1-based result-page number; pages > 1 are reached by
        typing the number into the pager and clicking "confirm"
    """
    url = 'https://s.taobao.com/search?q=' + quote(KEYWORD)
    # Retry in a loop rather than recursing on TimeoutException:
    # the original self-call had no depth limit and would eventually
    # raise RecursionError if the page kept timing out.
    while True:
        try:
            browser.get(url)
            if page > 1:
                # 'page_input' instead of 'input' — don't shadow the builtin.
                page_input = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
                submit = wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
                page_input.clear()
                page_input.send_keys(page)
                submit.click()
            # Confirm the pager highlights the requested page number,
            # i.e. the jump actually happened before we parse anything.
            wait.until(
                EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item')))
            get_products()
            return
        except TimeoutException:
            # Page did not load in time; retry the same page.
            continue



def get_products():
    """
    Extract product records from the page currently loaded in the
    browser and hand each one to save_to_mongo().
    """
    doc = pq(browser.page_source)
    for entry in doc('#mainsrp-itemlist .items .item').items():
        # All fields except the image URL are plain element text.
        text_of = lambda selector: entry.find(selector).text()
        record = {
            'image': entry.find('.pic .img').attr('data-src'),
            'price': text_of('.price'),
            'deal': text_of('.deal-cnt'),
            'title': text_of('.title'),
            'shop': text_of('.shop'),
            'location': text_of('.location'),
        }
        print(record)
        save_to_mongo(record)


# MongoDB connection settings: local server, database 'taobao',
# collection 'products'.
MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MONGO_COLLECTION = 'products'
client = MongoClient(MONGO_URL)
db = client[MONGO_DB]
def save_to_mongo(result):
    """
    Save one product record to MongoDB.

    :param result: dict of product fields to store
    """
    try:
        # Collection.insert() was deprecated in pymongo 3 and removed in
        # pymongo 4 — insert_one() is the supported API (this is why the
        # original script failed to store anything).
        if db[MONGO_COLLECTION].insert_one(result):
            print('存储到MongoDB成功')
    except Exception as e:
        # Surface the failure reason instead of swallowing it silently.
        print('存储到MongoDB失败', e)


# Total number of result pages to crawl.
MAX_PAGE = 100
if __name__ == '__main__':
    page = 1
    while page <= MAX_PAGE:
        index_page(page)
        page += 1

运行后,出现了两个问题
1.无法跳过淘宝登录页面

selenium爬取淘宝商品_第1张图片
2.数据无法存入MongoDB
selenium爬取淘宝商品_第2张图片

对于第一个问题,在处于登录界面时,直接手机扫码登录后,就可以爬取了(这是我目前能够想出的办法,当然账号登录也行)。

对于第二个问题,对原代码的save_to_mongo()函数进行一定的修改,如下

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import quote
from pyquery import PyQuery as pq
from pymongo import MongoClient
import time

# MongoDB handles. NOTE(review): the database here (and the collection in
# save_to_mongo) is literally named 'MONGO_DB' — presumably names like
# 'taobao'/'products' were intended; confirm before relying on where the
# data lands.
client = MongoClient()
db = client['MONGO_DB']

# My local chromedriver path — adjust for your own machine when copying.
# Raw string: '\p' and '\c' in a plain literal are invalid escape
# sequences and trigger a DeprecationWarning on Python 3.6+.
chrome_driver = r'D:\python3.8.1\chromedriver.exe'
browser = webdriver.Chrome(executable_path=chrome_driver)
time.sleep(1)

wait = WebDriverWait(browser, 10)
KEYWORD = 'iPad'

def get_products():
    """
    Extract product data from the page currently loaded in the browser
    and save each record to MongoDB.
    """
    html = browser.page_source
    doc = pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            'image': item.find('.pic .img').attr('data-src'),
            # Bug fix: selector was 'price' (matching a <price> tag that
            # doesn't exist) instead of the '.price' class, so the price
            # field was always empty.
            'price': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text(),
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)
        save_to_mongo(product)
    

def index_page(page):
    """
    Crawl one page of Taobao search results.

    :param page: 1-based result-page number; pages > 1 are reached by
        typing the number into the pager and clicking "confirm"
    """
    url = 'https://s.taobao.com/search?q=' + quote(KEYWORD)
    # Retry in a loop rather than recursing on TimeoutException:
    # the original self-call had no depth limit and would eventually
    # raise RecursionError if the page kept timing out.
    while True:
        try:
            print('正在爬取第', page, '页')
            browser.get(url)
            if page > 1:
                # 'page_input' instead of 'input' — don't shadow the builtin.
                page_input = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
                submit = wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
                page_input.clear()
                page_input.send_keys(page)
                submit.click()
            # Confirm the pager highlights the requested page number
            # before parsing the item list.
            wait.until(
                EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))
            wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item')))
            get_products()
            return
        except TimeoutException:
            # Page did not load in time; retry the same page.
            continue
        

def save_to_mongo(result):
    """
    Save one product record to MongoDB.

    :param result: dict of product fields to store
    """
    try:
        # Collection.insert() was deprecated in pymongo 3 and removed in
        # pymongo 4; insert_one() is the supported API.
        # NOTE(review): the collection is literally named 'MONGO_DB' —
        # confirm this (rather than e.g. 'products') is intended.
        if db['MONGO_DB'].insert_one(result):
            print('存储到MongoDB成功')  # message typo fixed: 'MongDB'
    except Exception as e:
        # Surface the failure reason instead of swallowing it silently.
        print('存储到MongoDB失败', e)

# Total number of result pages to crawl.
MAX_PAGE = 100


def main():
    """Crawl result pages 1..MAX_PAGE in order."""
    page = 1
    while page <= MAX_PAGE:
        index_page(page)
        page += 1


if __name__ == '__main__':
    main()

再次运行
selenium爬取淘宝商品_第3张图片
可以存储到MongoDB了

你可能感兴趣的:(selenium爬取淘宝商品)