selenium爬取 阿里巴巴 商品目录页面

仅仅是获取商品目录,不需要登录
逻辑很简单,打开首页,点击更多,逐层把目录的内容爬下来,同时编好关联的ID

注意:

  1. 使用mongodb存储(因为不需要先建库,省事儿)
  2. tqdm添加进度条,好看点
  3. 不加载图片
  4. 无头浏览器
  5. 忽略一些警告
  6. 模拟手机
from selenium import webdriver
from scrapy import Selector
import time
import pymongo
from tqdm import tqdm


# Connect to the MongoDB server running on this machine.
client = pymongo.MongoClient(host='localhost', port=27017)
# Everything scraped by this script is stored in the "ali1688" database.
db = client['ali1688']

# Experimental Chrome settings:
#   * prefs            -- do not load images
#   * mobile_emulation -- present the browser as a phone
prefs = {"profile.managed_default_content_settings.images": 2}
mobile_emulation = {'deviceName': 'iPhone 6'}

chrome_options = webdriver.ChromeOptions()
for option_name, option_value in (
    ("prefs", prefs),
    ("mobileEmulation", mobile_emulation),
):
    chrome_options.add_experimental_option(option_name, option_value)

# Command-line switches: run headless and silence most log output.
for flag in ('--headless', '--log-level=3'):
    chrome_options.add_argument(flag)

browser = webdriver.Chrome(options=chrome_options)


# Open the mobile home page (the category listing needs no login).
browser.get('http://m.1688.com')
# implicitly_wait() does not pause the script: it sets how long element
# lookups keep polling for an element that is not in the DOM yet.
browser.implicitly_wait(5)

# Locate the "more" entry of the home navigation bar.  Using find_element
# (instead of parsing page_source right away) lets the implicit wait above
# cover the case where the navigation is rendered after the page loads.
more_link = browser.find_element(
    'css selector',
    'body > m-home-nav2 > div > div > a:nth-child(5)')
# The href read through WebDriver is already resolved to an absolute URL,
# so relative or protocol-relative links can be passed to get() directly.
more_url = more_link.get_attribute('href')
if not more_url:
    raise RuntimeError(
        'The "more" link on the home page has no href; '
        'the page layout may have changed.')

# Jump to the "more" page, which lists every category.
browser.get(more_url)
browser.implicitly_wait(3)

# Left-hand pane: one tab per top-level category.
first_container = Selector(
    text=browser.page_source).css('.tab-container').css('.tab-item-container')

# ID scheme: every level numbers its entries from 10 upwards, and a child's
# id is its parent's id followed by the child's own two-digit number
# (e.g. 10 -> 1010 -> 101010), so related records can be linked by id.
try:
    # start=1 because CSS :nth-child() positions are 1-based.
    for position, item in enumerate(tqdm(first_container), start=1):
        first_name = item.css('::text').get()
        first_id = 9 + position  # first tab gets id 10

        # Click the top-level category so the right pane shows its children.
        # find_element(by, value) is used because the
        # find_element_by_css_selector helper no longer exists in Selenium 4.
        browser.find_element(
            'css selector',
            'body > div.left-content > tab-list > div > div > '
            f'div:nth-child({position})',
        ).click()

        print('=================================')
        print(first_name)
        db.alibaba.insert_one({"name": first_name, "id": first_id})

        # Give the right pane time to render the clicked category.
        time.sleep(5)

        # Second level: every card in the right pane.
        second_no = 10
        right_pane = Selector(text=browser.page_source).css('.right-content')
        for container in right_pane.css('.container'):
            title2 = container.css('.card-title::text').get()
            if title2 is None:
                # A card without a title cannot be stored as a category;
                # skip it (and its items) instead of crashing on None.
                continue
            id2 = first_id * 100 + second_no
            second_no += 1
            print('**' + title2, id2)
            db.alibaba.insert_one({"name": title2, "id": id2})

            # Third level: the items listed inside the card.
            third_no = 10
            for entry in container.css('.item'):
                title3 = entry.css('span::text').get()
                if title3 is None:
                    # Item without visible text: nothing to record.
                    continue
                id3 = id2 * 100 + third_no
                third_no += 1
                print('  --' + title3, id3)
                db.alibaba.insert_one({"name": title3, "id": id3})
finally:
    # Always release external resources, even if the crawl fails midway.
    client.close()
    # quit() ends the whole driver session; close() would only close the
    # current window and leave the driver process running.
    browser.quit()

你可能感兴趣的:(python)