Learning Python: scraping Taobao product listings with Scrapy + Selenium

Goal: use the Scrapy framework to crawl a dynamically rendered site, taking Taobao as the example. For each product we collect the description, price, shop, and image link, then save the results to an Excel sheet, a JSON file, and a database.

Open the Taobao home page and search for 一加手机 (OnePlus phone). The first results page has the URL https://s.taobao.com/search?q=%E4%B8%80%E5%8A%A0%E6%89%8B%E6%9C%BA&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20180830&ie=utf8.

Clicking "next page" gives the second page's URL: https://s.taobao.com/search?q=%E4%B8%80%E5%8A%A0%E6%89%8B%E6%9C%BA&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20180830&ie=utf8&bcoffset=4&p4ppushleft=1%2C48&s=44&ntoffset=4

Comparing the two, pagination is driven mainly by the s parameter, which grows by 44 per page (each results page holds 44 listings), so the URL can be simplified to https://s.taobao.com/search?q=%E4%B8%80%E5%8A%A0%E6%89%8B%E6%9C%BA&s=0
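As a quick sanity check, here is a minimal sketch of how those paginated URLs can be generated; quote() produces exactly the %E4%B8%80... encoding seen in the q parameter above:

from urllib.parse import quote

keyword = '一加手机'
urls = ['https://s.taobao.com/search?q={}&s={}'.format(quote(keyword), page * 44)
        for page in range(3)]
# s=0, s=44, s=88 -> the first three results pages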

Now let's write the code.

Create a new project:

scrapy startproject taobao

cd taobao 
scrapy genspider phone s.taobao.com
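
This generates the standard Scrapy project layout; the files edited below live here:

taobao/
    scrapy.cfg
    taobao/
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            phone.py
        main.py        # added by hand in the last step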

items.py

import scrapy

class TaobaoItem(scrapy.Item):

    # one scraped product listing
    img_src = scrapy.Field()  # image URL (stored as a one-element list)
    info = scrapy.Field()     # product title/description
    price = scrapy.Field()    # listed price
    shop = scrapy.Field()     # shop name
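
Field() only declares a key; at runtime the item behaves like a dict:

item = TaobaoItem()
item['price'] = '2999.00'
print(dict(item))  # {'price': '2999.00'}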

spiders/phone.py

import scrapy
from selenium import webdriver
from ..items import TaobaoItem


class PhoneSpider(scrapy.Spider):
    name = 'phone'
    allowed_domains = ['s.taobao.com']

    # 44 listings per results page, so page n starts at s = n * 44
    total_page = 20
    start_urls = ['https://s.taobao.com/search?q=%E4%B8%80%E5%8A%A0%E6%89%8B%E6%9C%BA&s={}'.format(i * 44)
                  for i in range(total_page)]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # configure Chrome to run headless and skip loading images
        self.options = webdriver.ChromeOptions()
        self.prefs = {
            'profile.default_content_setting_values': {'images': 2}
        }
        self.options.add_argument('--headless')
        self.options.add_argument('--disable-gpu')
        self.options.add_experimental_option('prefs', self.prefs)

        # note: Selenium 4 renamed this parameter to options=
        self.driver = webdriver.Chrome(chrome_options=self.options)

    def parse(self, response):

        # one node per product card; the trailing spaces in the class
        # attribute are copied verbatim from Taobao's markup
        data_list = response.xpath('//div[@class="item J_MouserOnverReq  "]')

        for data in data_list:

            # the title is split across several text nodes; '/' is stripped so
            # the description can later double as a file name
            info = ''.join(data.xpath('.//div[@class="row row-2 title"]/a/text()').extract()).strip().replace('/', '')
            price = data.xpath('.//div[@class="price g_price g_price-highlight"]/strong/text()').extract_first()
            shop = data.xpath('.//a[@class="shopname J_MouseEneterLeave J_ShopInfo"]/span[2]/text()').extract_first()
            # images are lazy-loaded, so the real URL sits in data-src
            img_src = "https:" + data.xpath('.//a[@class="pic-link J_ClickStat J_ItemPicA"]/img/@data-src').extract_first()

            item = TaobaoItem()

            item['info'] = info
            item['price'] = price
            item['shop'] = shop
            item['img_src'] = [img_src]

            yield item

        # the total page count could also be read from the page itself:
        # self.total_page = response.xpath('//div[@class="total"]/text()').re_first('\d+')

    @staticmethod
    def close(spider, reason):
        # quit() shuts down the whole browser process, not just one window
        spider.driver.quit()
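
With the downloader middleware below enabled, the spider can be sanity-checked on its own via Scrapy's built-in feed export before writing any pipelines, e.g. scrapy crawl phone -o test.json from the project root.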

pipelines.py 

import os
import sqlite3
import xlwt
from urllib.request import urlretrieve
from scrapy.exporters import JsonItemExporter


class UrllibPipeline(object):
    def process_item(self, item, spider):
        # make sure the target directory exists before urlretrieve writes to it
        os.makedirs('imgs', exist_ok=True)
        urlretrieve(item["img_src"][0], "imgs/" + item["info"] + ".jpg")
        return item
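
# Note (sketch): instead of hand-rolling downloads, Scrapy's built-in
# ImagesPipeline (requires Pillow) can fetch and store the images; in
# settings.py that would look roughly like:
#     ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
#     IMAGES_URLS_FIELD = 'img_src'   # our item keeps its URL list here
#     IMAGES_STORE = 'imgs'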

class JsonFilePipeline(object):

    def __init__(self):
        # JsonItemExporter expects a binary file handle
        self.file = open('taobao.json', 'wb')
        self.exporter = JsonItemExporter(self.file, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()


class TaobaoPipeline(object):
    def __init__(self):
        self.workbook = xlwt.Workbook(encoding='utf-8')
        self.sheet = self.workbook.add_sheet('一加手机')
        self.info_list = ['info', 'price', 'shop', 'img_src']
        self.row = 1

    def open_spider(self, spider):
        # header row
        for index, info in enumerate(self.info_list):
            self.sheet.write(0, index, info)

    def close_spider(self, spider):
        # xlwt produces the legacy .xls format, so don't name the file .xlsx
        self.workbook.save("Taobao.xls")

    def process_item(self, item, spider):
        # img_src is a one-element list; xlwt can only write scalar cell values
        data_list = [item["info"], item["price"], item["shop"], item["img_src"][0]]

        for index, data in enumerate(data_list):
            self.sheet.write(self.row, index, data)
        self.row += 1
        return item
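
# Note: xlwt only writes the legacy .xls format. For a genuine .xlsx file a
# different library such as openpyxl would be needed (hypothetical sketch):
#     from openpyxl import Workbook
#     wb = Workbook()
#     wb.active.append(['info', 'price', 'shop', 'img_src'])
#     wb.save('Taobao.xlsx')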


class SqlitePipeline(object):

    def __init__(self):
        self.conn = sqlite3.connect('taobaoDB')
        self.cursor = self.conn.cursor()

    def open_spider(self, spider):
        self.cursor.execute('create table if not exists phone (img text, info text, price text, shop text)')
        self.conn.commit()

    def process_item(self, item, spider):
        # use placeholders rather than string formatting: quotes in the scraped
        # text would otherwise break the statement (and invite SQL injection)
        self.cursor.execute('insert into phone values (?, ?, ?, ?)',
                            (item["img_src"][0], item["info"], item["price"], item["shop"]))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()
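
After a crawl, the JSON and SQLite outputs can be spot-checked from a plain Python shell; a minimal sketch (JsonItemExporter writes one JSON array, so a single json.load() reads everything back):

import json
import sqlite3

with open('taobao.json', encoding='utf-8') as f:
    items = json.load(f)
print(len(items), items[0]['info'])

conn = sqlite3.connect('taobaoDB')
for row in conn.execute('select info, price, shop from phone limit 5'):
    print(row)
conn.close()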

middlewares.py  This is the middleware file; add a downloader middleware here. Because process_request returns an HtmlResponse, Scrapy treats it as the finished download and skips its own downloader entirely, so every page the spider parses has first been rendered by Chrome.

from scrapy.http.response.html import HtmlResponse
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class SeleniumMiddleware(object):

    def process_request(self, request, spider):

        if spider.name == "phone":
            spider.driver.get(request.url)
            # implicitly_wait() only affects element lookups; instead, wait
            # explicitly until JavaScript has rendered the product cards
            WebDriverWait(spider.driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'J_MouserOnverReq')))

            return HtmlResponse(url=spider.driver.current_url,
                                request=request,
                                body=spider.driver.page_source,
                                encoding='utf-8')

Modify settings.py: disable robots.txt compliance (Taobao's robots.txt would otherwise block the crawl), register the Selenium middleware, and enable the pipelines. Lower numbers run earlier, so items flow through TaobaoPipeline, then JsonFilePipeline, then SqlitePipeline.

ROBOTSTXT_OBEY = False

DOWNLOADER_MIDDLEWARES = {
   # 'taobao.middlewares.TaobaoDownloaderMiddleware': 543,
   'taobao.middlewares.SeleniumMiddleware': 2,
}

ITEM_PIPELINES = {
   'taobao.pipelines.TaobaoPipeline': 1,
   'taobao.pipelines.JsonFilePipeline': 2,
   'taobao.pipelines.SqlitePipeline': 4,
   # to download the images as well, just uncomment the line below
   # 'taobao.pipelines.UrllibPipeline': 56,
}

IMAGES_STORE = 'imgs'  # used by Scrapy's ImagesPipeline; UrllibPipeline hardcodes imgs/ itself

Create a main.py file at the same level as the spiders directory:

from scrapy import cmdline

cmdline.execute('scrapy crawl phone'.split())

Finally, just run main.py; it is equivalent to running scrapy crawl phone from the project root, but lets you start and debug the crawl from an IDE.

The full source code is on GitHub.

 
