Overview: Selenium simulates typing a search keyword on the Taobao home page; the spider then collects product links from the search-results page and pages through the results. Product detail pages come in two different layouts (Tmall and Taobao), each of which needs its own extraction logic; finally the scraped data is cleaned up and saved.
The spider code (in TaoBao/spiders) is as follows:
import scrapy
from scrapy import Request
from TaoBao.items import TaobaoItem
import re


class ExampleSpider(scrapy.Spider):
    name = 'taobao'

    def start_requests(self):
        # meta['page'] tells the Selenium middleware what to do:
        # 0 = home page (type the keyword and click search)
        url_str = 'http://www.taobao.com'
        yield Request(url=url_str, callback=self.parse, dont_filter=True, meta={'page': 0})
    def parse(self, response):
        good_urls = response.xpath('//div[@class="row row-2 title"]/a/@href').extract()
        for good_url in good_urls:
            # Search-result hrefs are protocol-relative ("//item.taobao.com/...")
            if good_url.startswith('//'):
                good_url = 'https:' + good_url
            yield Request(url=good_url, callback=self.parse_detail, meta={'page': '1'})
        # Re-request the same search URL; the middleware clicks "next page" (page == 2).
        # dont_filter=True is required here, otherwise the dupefilter drops the
        # request because the URL is identical to the one just crawled.
        if re.search(r'search', response.url):
            yield Request(url=response.url, callback=self.parse, dont_filter=True, meta={'page': '2'})
    def parse_detail(self, response):
        # Tmall detail pages live under detail.tmall.com
        if re.search(r'//detail', response.url):
            good_url = response.url
            goods_name = response.xpath('//div[@class="tb-detail-hd"]/h1/text()').extract()
            shop_name = response.xpath('//div[@class="name"]/a[@class="shopLink"]/text()').extract()
            price = response.xpath('//span[@class="tm-price"]/text()').extract()
            # tm-count holds the sales volume first, then the review count
            count_num = response.xpath('//span[@class="tm-count"]/text()').extract()
            sales_volumn = count_num[0]
            comments = count_num[1]
            names = ''.join(goods_name)
            item = TaobaoItem()
            item["good_url"] = good_url
            item["goods_name"] = ''.join(names.split())  # strip embedded whitespace
            item["shop_name"] = ''.join(shop_name)
            item["price"] = price[-1]  # the last tm-price is the actual selling price
            item["sales_volumn"] = sales_volumn
            item["comments"] = comments
            yield item
        # Taobao detail pages live under item.taobao.com
        if re.search(r'//item', response.url):
            good_url = response.url
            goods_name = response.xpath('//h3[@class="tb-main-title"]/text()').extract()
            shop_name = response.xpath('//div[@class="shop-name-wrap"]/a[@class="shop-name-link"]/text()').extract()
            price = response.xpath('//em[@class="tb-rmb-num"]/text()').extract()
            sales_volumn = response.xpath('//strong[@id="J_SellCounter"]/text()').extract()
            comments = response.xpath('//strong[@id="J_RateCounter"]/text()').extract()
            names = ''.join(goods_name)
            item = TaobaoItem()
            item["good_url"] = good_url
            item["goods_name"] = ''.join(names.split())
            item["shop_name"] = ''.join(shop_name)
            item["price"] = price[-1]
            item["sales_volumn"] = ''.join(sales_volumn)
            item["comments"] = ''.join(comments)
            yield item
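With the remaining project files (shown below) in place, the crawl is started in the standard Scrapy way, using the spider name defined above:

scrapy crawl taobao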
The item definition in TaoBao/items.py:

import scrapy


class TaobaoItem(scrapy.Item):
    good_url = scrapy.Field()
    goods_name = scrapy.Field()
    shop_name = scrapy.Field()
    price = scrapy.Field()
    sales_volumn = scrapy.Field()
    comments = scrapy.Field()
The pipeline in TaoBao/pipelines.py saves each item to a local SQLite database. Note the parameterized INSERT (? placeholders): interpolating the values directly into the SQL string would break on product names that contain quote characters.

import sqlite3


class TaobaoPipeline(object):
    def __init__(self):
        self.conn = sqlite3.connect('taobao.db')
        self.cursor = self.conn.cursor()
        self.cursor.execute(
            "create table IF NOT EXISTS taobagoods("
            "good_url varchar(200), goods_name varchar(200), shop_name varchar(500), "
            "price varchar(100), sales_volumn varchar(100), comments varchar(100))")

    def process_item(self, item, spider):
        self.cursor.execute(
            "insert into taobagoods values(?, ?, ?, ?, ?, ?)",
            (item["good_url"], item["goods_name"], item["shop_name"],
             item["price"], item["sales_volumn"], item["comments"]))
        self.conn.commit()
        return item
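To spot-check the saved rows afterwards, a quick query against taobao.db works (a minimal sketch; the table name taobagoods matches the pipeline above):

import sqlite3

conn = sqlite3.connect('taobao.db')
for row in conn.execute("select goods_name, price, sales_volumn from taobagoods limit 5"):
    print(row)
conn.close()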
The relevant entries in TaoBao/settings.py. This is where the Selenium downloader middleware and the SQLite pipeline are wired in:

BOT_NAME = 'TaoBao'
SPIDER_MODULES = ['TaoBao.spiders']
NEWSPIDER_MODULE = 'TaoBao.spiders'
ROBOTSTXT_OBEY = False
USER_AGENT = 'TaoBao (+http://www.yourdomain.com)'
SPIDER_MIDDLEWARES = {
    'TaoBao.middlewares.TaobaoSpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
    'TaoBao.middlewares.SeleniumMiddlewares': 543,
}
ITEM_PIPELINES = {
    'TaoBao.pipelines.TaobaoPipeline': 300,
}
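The USER_AGENT above is still the Scrapy project-template placeholder. Since every page here is actually fetched by the Firefox instance in the middleware it has little effect, but if any request were to bypass Selenium, a browser-style string would be safer (the value below is only an illustrative example, not part of the original project):

USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0'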
The Selenium downloader middleware in TaoBao/middlewares.py drives a real Firefox instance and branches on meta['page']: 0 = type the keyword on the home page and click search, 1 = simply load a detail page, 2 = scroll the results page and click "下一页" ("next page"). Whichever branch runs, the rendered page source is wrapped in an HtmlResponse and handed back to the spider:

from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FOptions
import time
from scrapy.http import HtmlResponse


class SeleniumMiddlewares(object):
    def __init__(self):
        self.options = FOptions()
        # self.options.add_argument("-headless")  # uncomment to run headless
        self.browser = webdriver.Firefox(executable_path="/home/hello/Downloads/geckodriver",
                                         firefox_options=self.options)

    def process_request(self, request, spider):
        if int(request.meta['page']) == 0:
            # Home page: type the search keyword and click the search button
            self.browser.get(request.url)
            input_name = self.browser.find_element_by_xpath('//*[@id="q"]')
            input_name.click()
            input_name.send_keys('python')
            btn_search = self.browser.find_element_by_xpath('//button[@class="btn-search tb-bg"]')
            btn_search.click()
            time.sleep(3)
        if int(request.meta['page']) == 1:
            # Detail page: just load it and give it time to render
            self.browser.get(request.url)
            time.sleep(3)
        if int(request.meta['page']) == 2:
            # Results page: scroll to the bottom so the pager renders, then click "next page"
            self.browser.get(request.url)
            self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
            time.sleep(1)
            next_page = self.browser.find_element_by_xpath('//span[contains(text(),"下一页")]')
            next_page.click()
            time.sleep(2)
        return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source,
                            encoding="utf-8", request=request)
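One gap in the middleware as written: the Firefox instance is never closed when the crawl ends. A minimal sketch of the usual fix, hooking Scrapy's spider_closed signal via from_crawler (adding these two methods to SeleniumMiddlewares is an assumption of mine, not part of the original code):

from scrapy import signals

class SeleniumMiddlewares(object):
    # ... __init__ and process_request as above ...

    @classmethod
    def from_crawler(cls, crawler):
        # Let Scrapy build the middleware and register the shutdown hook
        mw = cls()
        crawler.signals.connect(mw.spider_closed, signal=signals.spider_closed)
        return mw

    def spider_closed(self, spider):
        # Quit Firefox when the spider finishes
        self.browser.quit()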