爬取淘宝商品信息

爬取淘宝商品

  • 获取商品列表

获取商品列表

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import quote

# Shared Chrome WebDriver session, created at import time and used by
# index_page() and get_products().
browser=webdriver.Chrome()
# Explicit-wait helper bound to that session; each wait.until() call gives up
# with TimeoutException after 10 seconds.
wait=WebDriverWait(browser,10)
# Search keyword; index_page() URL-quotes it into the Taobao search URL.
KEYWORD='ipad'
def index_page(page, max_retries=3):
    """
    Load the Taobao search results for KEYWORD and scrape one results page.

    The search URL is opened first. For any page after the first, the page
    number is typed into the pager's jump box and submitted. The function
    then waits until the pager's active item shows the requested page number
    and the item list is present, and finally calls get_products().

    :param page: 1-based page number to scrape
    :param max_retries: number of extra attempts made after a
        TimeoutException before giving up on this page
    :return: None
    """
    # A bounded loop replaces the original unbounded self-recursion, which
    # would retry forever (and eventually hit RecursionError) when the page
    # never finishes loading.
    for _ in range(max_retries + 1):
        print('正在爬取第', page, '页')
        try:
            browser.get('https://s.taobao.com/search?q=' + quote(KEYWORD))
            if page > 1:
                # Jump directly to the requested page through the pager form.
                page_input = wait.until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, '#mainsrp-pager div.form>input')))
                submit = wait.until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, '#mainsrp-pager div.form>span.btn.J_Submit')))
                page_input.clear()
                page_input.send_keys(str(page))
                submit.click()
            # Confirm the pager now marks the requested page as active.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '#mainsrp-pager li.item.active>span'), str(page)))
            # '.items .item' matches the selector get_products() parses; the
            # previous '.item .item' did not agree with it.
            wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '.m-itemlist .items .item')))
            get_products()
            return
        except TimeoutException:
            # Reload and try again on the next loop iteration.
            continue
    print('第', page, '页多次超时，已放弃')
# Extract product data
def get_products():
    """
    Parse the page currently loaded in the browser and store each product.

    Every element matching '#mainsrp-itemlist .items .item' is converted into
    a dict with the keys image, price, deal, title, shop and location, which
    is printed and then handed to save_to_mongo().

    :return: None
    """
    # NOTE(review): `pq` is not imported in the visible part of this file;
    # presumably `from pyquery import PyQuery as pq` - confirm.
    doc = pq(browser.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        product = {
            'image': item.find('.pic .img').attr('data-src'),
            'price': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text(),
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)
        # Persist the record; previously it was only printed and
        # save_to_mongo() was never called from anywhere.
        save_to_mongo(product)

# Save to MongoDB
# Connection settings: host, database name and collection name used by
# save_to_mongo().
MONGO_URL='localhost'
MONGO_DB='taobao'
MONGO_COLLECTION='product'
# NOTE(review): `pymongo` is not imported in the visible part of this file -
# confirm an `import pymongo` exists, otherwise this line raises NameError.
client=pymongo.MongoClient(MONGO_URL)
# Handle to the target database; collections are looked up on it by name.
db=client[MONGO_DB]
def save_to_mongo(result):
    """
    Insert one document into the MONGO_COLLECTION collection of `db`.

    This is best-effort: any failure is reported on stdout together with the
    exception instead of being raised, so one bad record does not stop the
    caller.

    :param result: the document (a dict) to store
    :return: None
    """
    try:
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is its replacement for a single document.
        db[MONGO_COLLECTION].insert_one(result)
        print('存储到MongoDB成功')
    except Exception as exc:
        # Include the exception so the cause of the failure is visible.
        print('存储到MongoDB失败', exc)

# Iterate over every page
MAX_PAGE = 30


def main():
    """
    Scrape result pages 1 through MAX_PAGE, then shut the browser down.

    :return: None
    """
    try:
        for page in range(1, MAX_PAGE + 1):
            # Pass the loop variable; the original always requested page 1.
            index_page(page)
    finally:
        # End the WebDriver session so no Chrome/chromedriver process is left
        # running after the script finishes or fails.
        browser.quit()


if __name__ == '__main__':
    main()

你可能感兴趣的:(python爬虫,python,selenium)