Table of Contents
Method 1: Use the framework's built-in function
Method 2: Join the URL
Method 3: Simplified URL joining
Scrapy offers quite a few ways to handle pagination; remembering the three common ones below is enough.
The next post will cover some basic CSS selector syntax.
Everything here is configured in spider.py.
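The spiders below import MyScrapyItem from my_scrapy/items.py. A minimal sketch of that file, assuming the field names simply match the keys the spider assigns (text, author, Tag), looks like this:

import scrapy

class MyScrapyItem(scrapy.Item):
    # one Field per key the spider assigns to the item
    text = scrapy.Field()
    author = scrapy.Field()
    Tag = scrapy.Field()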
Method 1: Use the framework's built-in function

Advantage: you choose the page range yourself (from page m to page n), so this approach fits when the page numbers are known ahead of time.
import scrapy
from my_scrapy.items import MyScrapyItem


class SpiderSpider(scrapy.Spider):
    # spider name
    name = 'spider'
    # domain restriction: the scope the spider is allowed to crawl
    # (note: allowed_domains takes bare domains, not full URLs)
    # allowed_domains = ['quotes.toscrape.com']
    base_url = 'https://quotes.toscrape.com/page/{}/'
    # initial request URL (unused here: start_requests() overrides it)
    start_urls = ['https://quotes.toscrape.com/']

    # Method 1: build the paging requests ourselves
    def start_requests(self):
        for page in range(1, 10):  # pages 1 through 9
            url = self.base_url.format(page)
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        quotes = response.xpath('//div[@class="quote"]')
        for quote in quotes:
            # instantiate the item
            item = MyScrapyItem()
            # extract the fields with XPath;
            # get() is the modern replacement for extract_first()
            text = quote.xpath('./span[@class="text"]/text()').get()
            author = quote.xpath('.//small[@class="author"]/text()').get()
            tags = quote.xpath('.//a[@class="tag"]/text()').getall()
            item['text'] = text
            item['author'] = author
            item['Tag'] = tags
            # hand the item over to the item pipeline
            yield item
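With the spider name set to 'spider', a typical run that also exports the scraped items to a feed file (the filename quotes.json is just an example) is:

scrapy crawl spider -o quotes.json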
Method 2: Join the URL

Advantage: the spider automatically follows the "Next" link through to the last page, so you don't have to go check how many pages the site has.
import scrapy
from my_scrapy.items import MyScrapyItem


class SpiderSpider(scrapy.Spider):
    # spider name
    name = 'spider'
    # domain restriction: the scope the spider is allowed to crawl
    # allowed_domains = ['quotes.toscrape.com']
    # initial request URL
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        quotes = response.xpath('//div[@class="quote"]')
        for quote in quotes:
            # instantiate the item
            item = MyScrapyItem()
            # extract the fields with XPath; get() replaces the older extract_first()
            text = quote.xpath('./span[@class="text"]/text()').get()
            author = quote.xpath('.//small[@class="author"]/text()').get()
            tags = quote.xpath('.//a[@class="tag"]/text()').getall()
            item['text'] = text
            item['author'] = author
            item['Tag'] = tags
            # hand the item over to the item pipeline
            yield item
        # join the relative "next" link with the current page's URL
        next_page = response.css('.pager .next a::attr(href)').get()
        if next_page is not None:  # stop when there is no next page
            url = response.urljoin(next_page)
            yield scrapy.Request(url=url, callback=self.parse)
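As an aside, response.follow (part of the standard Scrapy API) resolves the relative href against response.url itself, so the urljoin step can be dropped. The last three lines of parse() above could equivalently be written as:

        # response.follow handles the URL joining internally
        next_page = response.css('.pager .next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)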
Method 3: Simplified URL joining

Advantage: the code is concise.
import scrapy
from my_scrapy.items import MyScrapyItem


class SpiderSpider(scrapy.Spider):
    # spider name
    name = 'spider'
    # domain restriction: the scope the spider is allowed to crawl
    # allowed_domains = ['quotes.toscrape.com']
    # initial request URL
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        quotes = response.xpath('//div[@class="quote"]')
        for quote in quotes:
            # instantiate the item
            item = MyScrapyItem()
            # extract the fields with XPath; get() replaces the older extract_first()
            text = quote.xpath('./span[@class="text"]/text()').get()
            author = quote.xpath('.//small[@class="author"]/text()').get()
            tags = quote.xpath('.//a[@class="tag"]/text()').getall()
            item['text'] = text
            item['author'] = author
            item['Tag'] = tags
            # hand the item over to the item pipeline
            yield item
        # follow_all joins and follows every matched "next" link in one call
        yield from response.follow_all(response.css('.pager .next a::attr(href)'), callback=self.parse)
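response.follow_all also accepts a css shortcut argument and extracts the href from matched anchor elements itself, so the last line can be shortened further (this form is documented Scrapy API, available since Scrapy 2.0):

        yield from response.follow_all(css='.pager .next a', callback=self.parse)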