Python Web Scraping: Collecting Taobao Product Listings with Selenium


# Project overview: scraping Taobao product listings with Selenium
"""
思路:
1、先打开浏览器,输入关键字,点击搜索,获取商品页总页数
2、通过遍历所有页面,获取商品页
3、获取页面的时候同时进行解析页面内容
4、将获取到的数据,存入mongodb中

技巧:
1、先通过chrome测试需要的内容,再修改为phatomjs
2、每次需要模拟操作之前,可以设置等待条件,等待加载完毕再操作
3、通过浏览器自带的路径选择器,可以较快的对网页元素进行选择
"""

Implementation

import re
import pymongo
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException


# Configure the MongoDB connection
client = pymongo.MongoClient("localhost")
db = client["taobao"]

# Browser settings: skip image loading to speed up PhantomJS
service_args = ["--load-images=false"]

browser = webdriver.PhantomJS(service_args=service_args)
browser.set_window_size(1400, 900)  # without an explicit size, an alternate layout may load and the selectors below will miss
wait = WebDriverWait(browser, 10)
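
# Note: PhantomJS support was removed in Selenium 4, so on newer versions the
# usual substitute is headless Chrome. A minimal sketch of the swapped-in
# setup (an alternative to the two PhantomJS lines above, not part of the
# original script):
#
#     from selenium.webdriver.chrome.options import Options
#
#     chrome_options = Options()
#     chrome_options.add_argument("--headless")
#     chrome_options.add_argument("--window-size=1400,900")
#     browser = webdriver.Chrome(options=chrome_options)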

# Open the site and submit the search keyword
def search_page():
    print("正在搜索...")
    try:
        browser.get("https://www.taobao.com/")
        # locate the search box and the search button
        search = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
        )
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR,
            '#J_TSearchForm > div.search-button > button'))
        )
        search.send_keys("美食")  # search keyword ("gourmet food")
        submit.click()

        # Read the total page count from the pager once the results have loaded
        total = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR,
            "#mainsrp-pager > div > div > div > div.total")))

        # the pager text reads like "共 100 页"; pull out the number
        total = int(re.compile(r"(\d+)").search(total.text).group(1))
        return total
    except TimeoutException:
        # if the page fails to load in time, retry the whole search
        return search_page()
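
# The recursive retry above has no depth limit, so a persistent timeout would
# recurse forever. A minimal sketch of a bounded alternative (the wrapper name
# and retry count are illustrative assumptions, not part of the original
# design; with the recursive calls removed, usage would look like
# total = with_retries(search_page)):
def with_retries(func, retries=3):
    for attempt in range(1, retries + 1):
        try:
            return func()
        except TimeoutException:
            print("Timed out, retrying...", attempt)
    raise TimeoutException("still timing out after %d attempts" % retries)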


# Jump to a given results page
def next_page(page_num):
    print("正在翻页...", page_num)
    try:
        # locate the page-number input box in the pager
        number = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR,
            "#mainsrp-pager > div > div > div > div.form > input")))

        # locate the confirm button next to it
        confirm = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR,
            "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
        number.clear()
        number.send_keys(page_num)
        confirm.click()

        # navigation is done once the pager highlights the requested page number
        wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR,
        "#mainsrp-pager > div > div > div > ul > li.item.active"), str(page_num)))

        # parse the freshly loaded page
        parse_page()

    except TimeoutException:
        # on timeout, retry the same page number
        next_page(page_num)


# Parse the page and extract product details
def parse_page():
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,
        "#mainsrp-itemlist .items .item")))

    # parse the rendered HTML with pyquery
    doc = pq(browser.page_source)
    items = doc("#mainsrp-itemlist .items .item").items()
    for item in items:
        product = {}
        product["image"] = item.find(".pic .img").attr("src")
        product["title"] = item.find(".title").text()
        product["price"] = item.find(".price").text()
        product["shop"] = item.find(".shop").text()
        product["deal-cnt"] = item.find(".deal-cnt").text()[:-3]
        product["location"] = item.find(".location").text()
        print(product)

        # persist each product
        save_to_mongo(product)
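
# The fields above are raw display text (e.g. the price renders as "¥29.80",
# which is an assumption about Taobao's current markup). A small helper can
# pull a numeric value out of it; adjust the pattern if the layout differs:
def parse_price(price_text):
    match = re.search(r"\d+(?:\.\d+)?", price_text)
    return float(match.group()) if match else None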

# Save a product dict to MongoDB
def save_to_mongo(data):
    try:
        db["taobao"].insert(data)
        print("保存成功", data)
    except Exception:
        print("保存失败")

# Entry point: search once, then walk every results page
def main():
    total = search_page()
    for i in range(1, total+1):
        next_page(i)
    browser.quit()  # quit() also shuts down the PhantomJS process; close() only closes the window

if __name__ == "__main__":
    main()
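
To run this, a MongoDB instance must be listening on localhost's default port (that is what the MongoClient("localhost") call above assumes); the script then prints and stores one dict per product as it walks the result pages.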
