# 1. Architecture overview
# Engine: drives the data flow through the whole system and triggers events; it is the core of the framework.
# Item: defines the data structure of the scraped results; scraped data is assigned to Item objects.
# Scheduler: accepts requests sent by the engine, enqueues them, and hands them back when the engine asks for the next request.
# Downloader: downloads page content and returns it (via the engine) to the spiders.
# Spiders: define the crawling logic and the parsing rules for pages; they are mainly responsible for parsing responses and producing extracted results and new requests.
# Item Pipeline: processes the items extracted from pages by the spiders; its main tasks are cleaning, validating, and storing the data.
# Downloader Middlewares: hooks that sit between the engine and the downloader, mainly processing the requests and responses passed between them.
# Spider Middlewares: hooks that sit between the engine and the spiders, mainly processing the responses fed into the spiders and the results and new requests they produce.
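# The data flow described above can be seen in miniature in any spider callback: items it yields go to the Item Pipeline, requests it yields go back to the Scheduler via the Engine. A minimal sketch (the spider name is illustrative only, not part of the tutorial project):
'''
import scrapy


class FlowSketchSpider(scrapy.Spider):
    # hypothetical spider used only to illustrate the data flow
    name = 'flow_sketch'
    start_urls = ['http://quotes.toscrape.com/']   # turned into the initial Requests via the Engine and Scheduler

    def parse(self, response):
        # the Downloader fetched the page; the Engine handed the Response to this callback
        yield {'title': response.css('title::text').extract_first()}   # extracted data goes on to the Item Pipeline
        yield scrapy.Request(response.url, callback=self.parse)        # new Requests go back to the Scheduler (this one is dropped as a duplicate)
'''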
# Scrapy quick start
# In cmd, change into a folder and run: scrapy startproject tutorial. This command creates a folder named tutorial with the following structure:
'''
scrapy.cfg          # Scrapy deployment configuration: points to the project's settings and holds deployment information
tutorial            # the project's Python module; code is imported from here
    __init__.py
    items.py        # Item definitions: the data structures to be scraped
    middlewares.py  # middleware definitions used during crawling
    pipelines.py    # pipeline definitions: the data-processing pipelines
    settings.py     # project settings
    spiders         # folder that holds the Spiders
        __init__.py
'''
# Creating a Spider
# A spider is a class you define; Scrapy uses it to crawl content from pages and parse the results. The class must inherit from scrapy.Spider, and it must define the spider's name, its initial requests, and a method that handles the crawled results.
# You can also create a Spider from the command line. For example, to generate the quotes spider, run:
# cd tutorial
# scrapy genspider quotes quotes.toscrape.com
# Change into the tutorial folder you just created and run the genspider command; the first argument is the spider's name and the second is the site's domain. Afterwards the spiders folder contains a new quotes.py file with the following content:
'''
# -*- coding: utf-8 -*-
import scrapy


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        pass
'''
# The spider has three attributes, name, allowed_domains, and start_urls, plus one method, parse.
# name: a name unique within the project, used to tell different spiders apart.
# allowed_domains: the domains the spider is allowed to crawl; initial or follow-up request links outside these domains are filtered out.
# start_urls: the list of URLs the spider crawls at startup; the initial requests are generated from it.
# parse: a method of the spider that parses the returned responses, extracts data, or generates further requests to process.
# Creating an Item
# An Item is a container for the scraped data. It is used much like a dict, but adds extra protection that catches misspelled or undefined fields.
# An Item class must inherit from scrapy.Item and define fields of type scrapy.Field. Looking at the target site, the content we can extract is text, author, and tags.
# Define the Item by modifying items.py as follows:
'''
import scrapy


class TutorialItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
'''
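# The field protection mentioned above is easy to verify interactively; a minimal sketch (not part of the project files, assuming the project package is importable):
'''
from tutorial.items import TutorialItem

item = TutorialItem()
item['text'] = 'some quote'    # fine: 'text' is a declared field
print(item['text'])
# item['txt'] = 'oops'         # would raise KeyError, unlike a plain dict
'''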
# Parsing the Response
# As we saw above, the response argument of parse() is the result of crawling the links in start_urls. So inside parse() we can work with the contents of response directly: inspect the page source as a browser would receive it, analyze it further, or find links in it to build the next request.
# Looking at the page structure first: every page contains multiple blocks with class quote, and each block contains text, author, and tags. So we first select all the quotes and then extract the contents of each one. Modify parse() as follows:
'''
def parse(self, response):
    quotes = response.css('.quote')
    for quote in quotes:
        text = quote.css('.text::text').extract_first()
        author = quote.css('.author::text').extract_first()
        tags = quote.css('.tags .tag::text').extract()
'''
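# Selectors like these can be tried out before putting them in the spider; a minimal sketch using the Scrapy shell on the tutorial's target site:
'''
# in a terminal:
#   scrapy shell 'http://quotes.toscrape.com/'
# then at the interactive prompt:
quotes = response.css('.quote')
quotes[0].css('.text::text').extract_first()   # text of the first quote
quotes[0].css('.tags .tag::text').extract()    # list of its tags
'''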
# Using the Item
# The Item defined above can be thought of as a dict, except that it has to be instantiated. We assign each of its fields from the values just parsed and finally yield the Item.
# QuotesSpider is rewritten as follows:
'''
# -*- coding: utf-8 -*-
import scrapy

from tutorial.items import TutorialItem


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        quotes = response.css('.quote')
        for quote in quotes:
            item = TutorialItem()
            item['text'] = quote.css('.text::text').extract_first()
            item['author'] = quote.css('.author::text').extract_first()
            item['tags'] = quote.css('.tags .tag::text').extract()
            yield item
'''
# Follow-up Requests
# The code above scrapes the content of the initial page. How do we scrape the next page? We need to find information on the current page to build the next request, then find information in that response to build the one after it, and so on, until the whole site is crawled.
# Requests are built with scrapy.Request; here we pass two arguments, url and callback.
# url: the address of the next link to request.
# callback: the callback function. When the request with this callback completes and the response is received, the engine passes the response to the callback as an argument; the callback parses it or generates the next request. Append the following code to the parse() method:
'''
        next_page = response.css('.pager .next a::attr(href)').extract_first()
        if next_page:
            url = response.urljoin(next_page)
            yield scrapy.Request(url=url, callback=self.parse)
'''
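# As an aside (not from the original write-up): newer Scrapy versions also provide response.follow(), which accepts a relative URL directly, so the urljoin() step can be dropped; a sketch of the equivalent code:
'''
        next_page = response.css('.pager .next a::attr(href)').extract_first()
        if next_page:
            # response.follow() resolves the relative href against the current page's URL
            yield response.follow(next_page, callback=self.parse)
'''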
# Running the spider
# Change into the project directory and run:
# scrapy crawl quotes
# To save the results to a file:
# scrapy crawl quotes -o quotes.json   (or quotes.csv, quotes.xml, quotes.pickle, quotes.marshal)
# The output can also be sent to a remote FTP server, e.g.: scrapy crawl quotes -o ftp://user:pass@ftp.example.com/path/to/quotes.csv
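# The crawl can also be started from a plain Python script instead of the scrapy command; a minimal sketch using Scrapy's CrawlerProcess (the file name run.py is just an example):
'''
# run.py, placed in the project root next to scrapy.cfg
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())   # load the project's settings.py
process.crawl('quotes')                            # spider name as defined by its `name` attribute
process.start()                                    # blocks until the crawl finishes
'''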
# Using an Item Pipeline
# For more complex operations, such as saving the results to a MongoDB database or keeping only certain useful Items, we can use an Item Pipeline.
# The Item Pipeline is the project's processing pipeline: once an Item is generated, it is automatically sent to the Item Pipeline for processing. Item Pipelines are commonly used to:
# clean HTML data
# validate the scraped data and check the scraped fields
# check for and drop duplicate content
# save the scraped results to a database
# Implementing an Item Pipeline is simple: define a class and implement the process_item() method. Once the pipeline is enabled, Scrapy calls process_item() automatically for every Item; the method must return a dict or an Item object, or raise a DropItem exception.
# process_item() takes two arguments: item, the Item produced by the Spider each time, and spider, the Spider instance itself.
# First we implement a process_item() method that truncates any text longer than 30 characters down to 30 and appends an ellipsis.
# Next we store the processed items in MongoDB by defining another pipeline: in the same pipelines.py file we implement a second class, MongoPipeline.
# The pipelines must also be enabled in settings.py, where the MongoDB connection settings are added as well. Both the settings change and the pipelines.py code are shown below:
'''
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'tutorial.pipelines.TutorialPipeline': 300,
    'tutorial.pipelines.MongoPipeline': 400,
}
MONGO_URI = 'localhost'
MONGO_DB = 'quotes'
'''
# And the corresponding pipelines.py code:
'''
import pymongo
from scrapy.exceptions import DropItem


class TutorialPipeline(object):
    def __init__(self):
        self.limit = 30

    def process_item(self, item, spider):
        # truncate text longer than self.limit characters and append an ellipsis
        if item['text']:
            if len(item['text']) > self.limit:
                item['text'] = item['text'][:self.limit].rstrip() + '......'
            return item
        else:
            raise DropItem('Missing Text')  # drop items that have no text


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # read the connection settings defined in settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)  # connect to MongoDB
        self.db = self.client[self.mongo_db]               # select (or lazily create) the database

    def process_item(self, item, spider):
        name = item.__class__.__name__
        self.db[name].insert_one(dict(item))  # insert into a collection named after the item class
        return item

    def close_spider(self, spider):
        self.client.close()
'''
Here is the code in each of the project files:
items.py:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class TutorialItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
middlewares.py: unchanged.
pipelines.py:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from scrapy.exceptions import DropItem


class TutorialPipeline(object):
    def __init__(self):
        self.limit = 30

    def process_item(self, item, spider):
        # truncate text longer than self.limit characters and append an ellipsis
        if item['text']:
            if len(item['text']) > self.limit:
                item['text'] = item['text'][:self.limit].rstrip() + '......'
            return item
        else:
            raise DropItem('Missing Text')  # drop items that have no text


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # read the connection settings defined in settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)  # connect to MongoDB
        self.db = self.client[self.mongo_db]               # select (or lazily create) the database

    def process_item(self, item, spider):
        name = item.__class__.__name__
        self.db[name].insert_one(dict(item))  # insert into a collection named after the item class
        return item

    def close_spider(self, spider):
        self.client.close()
settings.py:
# -*- coding: utf-8 -*-
# Scrapy settings for tutorial project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'tutorial'
SPIDER_MODULES = ['tutorial.spiders']
NEWSPIDER_MODULE = 'tutorial.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tutorial (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'tutorial.middlewares.TutorialSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'tutorial.middlewares.TutorialDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'tutorial.pipelines.TutorialPipeline': 300,
    'tutorial.pipelines.MongoPipeline': 400,
}
MONGO_URI = 'localhost'
MONGO_DB = 'quotes'
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
quotes.py:
# -*- coding: utf-8 -*-
import scrapy

from tutorial.items import TutorialItem


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        quotes = response.css('.quote')
        for quote in quotes:
            item = TutorialItem()
            item['text'] = quote.css('.text::text').extract_first()
            item['author'] = quote.css('.author::text').extract_first()
            item['tags'] = quote.css('.tags .tag::text').extract()
            yield item
        next_page = response.css('.pager .next a::attr(href)').extract_first()
        if next_page:
            url = response.urljoin(next_page)
            yield scrapy.Request(url=url, callback=self.parse)
Finally, run the scrapy crawl quotes command.
I used the adminMongo visualization tool to inspect the results; you have to connect it to the database created above before the data shows up.
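The stored data can also be checked without a GUI; a minimal sketch querying MongoDB directly with pymongo (database and collection names follow the settings and item class used above):
'''
import pymongo

client = pymongo.MongoClient('localhost')            # MONGO_URI from settings.py
db = client['quotes']                                # MONGO_DB from settings.py
for doc in db['TutorialItem'].find().limit(5):       # collection is named after the item class
    print(doc['author'], doc['text'])
print(db['TutorialItem'].count_documents({}))        # total number of stored quotes
'''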