Later, while experimenting with Selenium to simulate the dynamic page requests, I finally found a fairly solid solution. The downside is that scraping is noticeably slower, but that hardly matters in practice: the machine can keep working while we sleep.
Our example crawls the first 50 listings for the keyword "iPhone x case" (eBay's page size is set to 50, though it actually returns 60 items); we don't go any deeper into pagination here.
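As an aside, the start URL used below is just eBay's search endpoint with the keyword and page size passed as query parameters (_nkw and _ipg respectively). A minimal sketch of building such a URL yourself; build_search_url is a hypothetical helper, but the parameter names come straight from the spider's start URL:

from urllib.parse import urlencode

def build_search_url(keyword, per_page=50):
    # _nkw carries the search keyword, _ipg the items-per-page setting;
    # both appear verbatim in the spider's start URL below.
    params = {'_from': 'R40', '_nkw': keyword, '_sacat': 0, '_ipg': per_page}
    return 'https://www.ebay.com/sch/i.html?' + urlencode(params)

print(build_search_url('iphone x case'))
# -> https://www.ebay.com/sch/i.html?_from=R40&_nkw=iphone+x+case&_sacat=0&_ipg=50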
Note that products with no sales have no sold-count element to scrape, so Scrapy raises an error on them and moves on to the next item. For those products we simply pass sold="0 sold" on to the pipelines.
Here is the code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
from scrapy.http import Request
from ..items import ebItem
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class Ebt2Spider(scrapy.Spider):
    name = 'ebt2'
    allowed_domains = ['ebay.com']
    start_urls = ['https://www.ebay.com/sch/i.html?_from=R40&_nkw=iphone+x+case&_sacat=0&_ipg=50']
    visited_urls = set()
    count = 0

    def parse(self, response):
        # Each search result is an <li class="s-item "> node (note the trailing space).
        listings = Selector(response=response).xpath('//li[@class="s-item "]')
        for obj in listings:
            href = obj.xpath('.//a[@class="s-item__link"]/@href').extract_first()
            if href is None:
                continue
            href = href.strip()
            # Deduplicate: only request each listing URL once.
            if href not in self.visited_urls:
                self.visited_urls.add(href)
                yield Request(url=href, callback=self.getsold)

    def getsold(self, response):
        self.count += 1
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        driver = webdriver.Chrome(options=options)
        try:
            # Load the listing in a real browser so the JavaScript-rendered
            # content (including the sold count) is actually present.
            driver.get(response.url)
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.ID, 'shippingSummary'))
            )
            # Parse the rendered page source, not the original Scrapy response.
            sold = Selector(text=driver.page_source).xpath(
                '//div[@class="u-flL qtyCntVal vi-bboxrev-posabs vi-bboxrev-dsplinline "]'
                '//a[@class="vi-txt-underline"]/text()'
            ).extract_first().strip()
            yield ebItem(count=self.count, href=response.url, sold=sold)
        except Exception:
            # Listings with no sales have no sold-count node: extract_first()
            # returns None and .strip() raises, so we record them as "0 sold".
            yield ebItem(count=self.count, href=response.url, sold="0 sold")
        finally:
            driver.quit()
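For completeness: the spider imports ebItem from items.py, which isn't shown in this post. A minimal definition consistent with the three fields used above would be:

# items.py -- minimal sketch covering only the fields the spider fills in.
import scrapy

class ebItem(scrapy.Item):
    count = scrapy.Field()
    href = scrapy.Field()
    sold = scrapy.Field()

The pipeline that receives sold="0 sold" isn't shown either; an illustrative placeholder that appends each item to a text file might look like this (remember to enable it in settings.py):

# pipelines.py -- illustrative only; the original pipeline is not shown.
class EbPipeline(object):
    def process_item(self, item, spider):
        with open('sold.txt', 'a') as f:
            f.write('%s\t%s\t%s\n' % (item['count'], item['sold'], item['href']))
        return item

Alternatively, for a quick test you can skip the pipeline entirely and run scrapy crawl ebt2 -o items.csv to dump the items to a CSV feed.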