Learning goal: use the Scrapy framework to scrape a dynamic website, taking Taobao as the example. For each product we extract the [description, price, shop, image link] and save the results to an Excel sheet, a JSON file, and a database.
Open the Taobao home page and search for 一加手机 (OnePlus phone). The URL of the first results page is https://s.taobao.com/search?q=%E4%B8%80%E5%8A%A0%E6%89%8B%E6%9C%BA&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20180830&ie=utf8,
and clicking "next page" gives the second page URL https://s.taobao.com/search?q=%E4%B8%80%E5%8A%A0%E6%89%8B%E6%9C%BA&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20180830&ie=utf8&bcoffset=4&p4ppushleft=1%2C48&s=44&ntoffset=4.
Comparing the two URLs, the paging is driven mainly by the s parameter, which increases by 44 per page, so the URL can be simplified to https://s.taobao.com/search?q=%E4%B8%80%E5%8A%A0%E6%89%8B%E6%9C%BA&s=0
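As a quick sanity check, all of the page URLs can be generated by stepping s in increments of 44 (a minimal sketch; the page count of 3 here is arbitrary):

base_url = 'https://s.taobao.com/search?q=%E4%B8%80%E5%8A%A0%E6%89%8B%E6%9C%BA&s={}'
for page in range(3):                      # pages 0, 1, 2 ...
    print(base_url.format(page * 44))      # s = 0, 44, 88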
Now let's write the code.
Create a new project:
scrapy startproject taobao
cd taobao
scrapy genspider phone s.taobao.com
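These two commands generate the standard Scrapy project skeleton; the files we will edit below are:

taobao/
    scrapy.cfg
    taobao/
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            phone.py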
items.py
import scrapy


class TaobaoItem(scrapy.Item):
    img_src = scrapy.Field()   # product image link
    info = scrapy.Field()      # product description
    price = scrapy.Field()     # price
    shop = scrapy.Field()      # shop name
spiders/phone.py
import scrapy
from selenium import webdriver

from ..items import TaobaoItem


class PhoneSpider(scrapy.Spider):
    name = 'phone'
    allowed_domains = ['s.taobao.com']
    total_page = 20
    # The s parameter pages through the results in steps of 44
    start_urls = ['https://s.taobao.com/search?q=%E4%B8%80%E5%8A%A0%E6%89%8B%E6%9C%BA&s={}'.format(i * 44)
                  for i in range(total_page)]

    def __init__(self):
        super().__init__()
        # Configure Chrome for headless mode with image loading disabled
        self.options = webdriver.ChromeOptions()
        self.prefs = {
            'profile.default_content_setting_values': {'images': 2}
        }
        self.options.add_argument('--headless')
        self.options.add_argument('--disable-gpu')
        self.options.add_experimental_option('prefs', self.prefs)
        self.driver = webdriver.Chrome(options=self.options)

    def parse(self, response):
        data_list = response.xpath('//div[@class="item J_MouserOnverReq "]')
        for data in data_list:
            info = ''.join(data.xpath('.//div[@class="row row-2 title"]/a/text()').extract()).strip().replace('/', '')
            price = data.xpath('.//div[@class="price g_price g_price-highlight"]/strong/text()').extract_first()
            shop = data.xpath('.//a[@class="shopname J_MouseEneterLeave J_ShopInfo"]/span[2]/text()').extract_first()
            img_src = "https:" + data.xpath('.//a[@class="pic-link J_ClickStat J_ItemPicA"]/img/@data-src').extract_first()
            item = TaobaoItem()
            item['info'] = info
            item['price'] = price
            item['shop'] = shop
            item['img_src'] = [img_src]  # stored as a list so an image pipeline can consume it
            yield item
        # Read the total page count from the page (see the note below)
        # self.total_page = response.xpath('//div[@class="total"]/text()').re_first(r'\d+')

    @staticmethod
    def close(spider, reason):
        # quit() shuts down both the browser window and the chromedriver process
        spider.driver.quit()
        return
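The commented-out line at the end of parse() hints at reading the real page count from the page, but start_urls is built once when the class is defined, so a value scraped later cannot change it. One way to paginate dynamically instead is sketched below (assuming the //div[@class="total"] node actually carries the page count, and that crawling starts from s=0 only):

def parse(self, response):
    # ... extract items exactly as above, then follow the next page ...
    total = int(response.xpath('//div[@class="total"]/text()').re_first(r'\d+') or 1)
    current = int(response.url.split('s=')[-1]) // 44   # index of the page we are on
    if current + 1 < total:
        next_url = 'https://s.taobao.com/search?q=%E4%B8%80%E5%8A%A0%E6%89%8B%E6%9C%BA&s={}'.format((current + 1) * 44)
        yield scrapy.Request(next_url, callback=self.parse)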
pipelines.py
import os
import sqlite3
import xlwt
from urllib.request import urlretrieve
from scrapy.exporters import JsonItemExporter


class UrllibPipeline(object):
    def open_spider(self, spider):
        os.makedirs('imgs', exist_ok=True)  # urlretrieve will not create the directory itself

    def process_item(self, item, spider):
        urlretrieve(item["img_src"][0], "imgs/" + item["info"] + ".jpg")
        return item


class JsonFilePipeline(object):
    def __init__(self):
        self.file = open('taobao.json', 'wb')
        self.exporter = JsonItemExporter(self.file, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()


class TaobaoPipeline(object):
    def __init__(self):
        self.workbook = xlwt.Workbook(encoding='utf-8')
        self.sheet = self.workbook.add_sheet('一加手机')
        self.info_list = ['info', 'price', 'shop', 'img_src']
        self.row = 1

    def open_spider(self, spider):
        # write the header row
        for index, info in enumerate(self.info_list):
            self.sheet.write(0, index, info)

    def close_spider(self, spider):
        # xlwt produces the legacy .xls format, so use that extension
        self.workbook.save("Taobao.xls")

    def process_item(self, item, spider):
        # img_src is a list, so write its first element rather than the list itself
        data_list = [item["info"], item["price"], item["shop"], item["img_src"][0]]
        for index, data in enumerate(data_list):
            self.sheet.write(self.row, index, data)
        self.row += 1
        return item


class SqlitePipeline(object):
    def __init__(self):
        self.conn = sqlite3.connect('taobaoDB')
        self.cursor = self.conn.cursor()

    def open_spider(self, spider):
        self.cursor.execute('create table if not exists phone (img text, info text, price text, shop text)')
        self.conn.commit()

    def process_item(self, item, spider):
        # a parameterized query avoids quoting problems and SQL injection
        self.cursor.execute('insert into phone values (?, ?, ?, ?)',
                            (item["img_src"][0], item["info"], item["price"], item["shop"]))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()
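To spot-check what ended up in the database, a standalone snippet can be run after the crawl against the taobaoDB file created above:

import sqlite3

conn = sqlite3.connect('taobaoDB')
for row in conn.execute('select info, price, shop from phone limit 5'):
    print(row)    # the first five scraped items
conn.close()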
middlewares.py — this is the middleware file; add a downloader middleware here:
from scrapy import signals
from scrapy.http.response.html import HtmlResponse


class SeleniumMiddleware(object):
    def process_request(self, request, spider):
        if spider.name == "phone":
            # Render the page in the headless browser
            spider.driver.get(request.url)
            spider.driver.implicitly_wait(10)
            # Returning a Response from process_request short-circuits the normal
            # downloader, so parse() receives the Selenium-rendered HTML
            response = HtmlResponse(url=spider.driver.current_url,
                                    request=request,
                                    body=spider.driver.page_source,
                                    encoding='utf-8')
            return response
Modify settings.py:
ROBOTSTXT_OBEY = False

DOWNLOADER_MIDDLEWARES = {
    # 'taobao.middlewares.TaobaoDownloaderMiddleware': 543,
    'taobao.middlewares.SeleniumMiddleware': 2,
}

ITEM_PIPELINES = {
    'taobao.pipelines.TaobaoPipeline': 1,
    'taobao.pipelines.JsonFilePipeline': 2,
    'taobao.pipelines.SqlitePipeline': 4,
    # To download the images as well, just uncomment the line below
    # 'taobao.pipelines.UrllibPipeline': 56,
}

IMAGES_STORE = 'imgs'
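Note that IMAGES_STORE is only read by Scrapy's built-in ImagesPipeline; the UrllibPipeline above hard-codes its own imgs/ path. Since item['img_src'] is already a list of URLs, the built-in pipeline would be a drop-in alternative (a sketch, not part of the original project; it requires Pillow to be installed):

# settings.py -- let Scrapy's ImagesPipeline handle the downloads instead
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_URLS_FIELD = 'img_src'   # item field holding the list of image URLs
IMAGES_STORE = 'imgs'           # download directory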
Create a main.py file at the same level as the spiders directory:
from scrapy import cmdline
cmdline.execute('scrapy crawl phone'.split())
Finally, just run main.py.
The source code is available on GitHub --> link