运行环境:
* Python 2.7.12
* Scrapy 1.2.2
* Mac OS X 10.10.3 Yosemite
继续爬取Scrapy 1.2.2文档提供的练习网址:
"http://quotes.toscrape.com"
可以暂时不用考虑爬虫被封的情况,用于初级爬虫练习。
目标
多级页面爬取时,在什么位置yield items是个问题,结论是可以放入子页面的爬取时yield items。但是要记住scrapy的自动去重。
最终代码
因为本次实验内容较多。因此先给出最终的代码。
items.py声明items
增加声明子页面的抓取内容。
import scrapy
class QuotesItem(scrapy.Item):
    """One scraped quote together with details about its author."""

    # Values read from a quote entry on the listing page.
    quote = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()

    # Values read from the author's own page.
    author_born_date = scrapy.Field()
    author_born_location = scrapy.Field()
    author_description = scrapy.Field()

    # Absolute URL of the author's page.
    author_full_url = scrapy.Field()
爬虫文件
需要进行以下关键内容改写:
- 引入items.py中的类
- 更改爬虫名
- 把作者的介绍页面的链接(author_full_url)放入items中。
- 需要把item的元数据传入子页面的request中:
meta={'item':item}
- 需要把Scrapy的自动去重机制关掉:
callback=self.parse_author,dont_filter=True
最终代码如下:
import scrapy
from quotes_2.items import QuotesItem
class QuotesSpider(scrapy.Spider):
    """Scrape quotes and enrich each one with data from its author's page.

    ``parse`` fills the quote fields from a listing page and passes the
    partially filled item to ``parse_author`` through ``Request.meta``.
    The item is yielded only after the author page has been parsed, so
    every exported record is complete.
    """

    name = 'quotes_2_6'
    start_urls = [
        'http://quotes.toscrape.com',
    ]
    allowed_domains = [
        'toscrape.com',
    ]

    def parse(self, response):
        """Create one item per quote, request its author page, follow pagination.

        Yields ``scrapy.Request`` objects for author pages and for the next
        listing page. A quote without an author link is yielded directly.
        """
        for quote in response.css('div.quote'):
            item = QuotesItem()
            item['quote'] = quote.css('span.text::text').extract_first()
            item['author'] = quote.css('small.author::text').extract_first()
            item['tags'] = quote.css('div.tags a.tag::text').extract()

            # Select relative to this quote. Selecting on ``response`` would
            # return the first author link of the page for every quote.
            author_page = quote.css('small.author+a::attr(href)').extract_first()
            if author_page is None:
                # No author page to visit: emit the quote fields we have.
                yield item
                continue

            item['author_full_url'] = response.urljoin(author_page)
            # Several quotes share an author, so the same URL is requested
            # more than once. Without dont_filter=True the duplicate filter
            # drops the repeats and those quotes are never yielded.
            yield scrapy.Request(
                url=item['author_full_url'],
                meta={'item': item},
                callback=self.parse_author,
                dont_filter=True,
            )

        next_page = response.css('li.next a::attr("href")').extract_first()
        if next_page is not None:
            next_full_url = response.urljoin(next_page)
            yield scrapy.Request(next_full_url, callback=self.parse)

    def parse_author(self, response):
        """Complete the item carried in ``response.meta`` with author details."""
        item = response.meta['item']
        item['author_born_date'] = response.css('.author-born-date::text').extract_first()
        item['author_born_location'] = response.css('.author-born-location::text').extract_first()
        item['author_description'] = response.css('.author-description::text').extract_first()
        yield item
实验内容记录
步骤1:声明items
首先,我们针对items.py进行改写。
爬虫文件在改写前的原始文件如下:
import scrapy
class QuotesSpider(scrapy.Spider):
    """Dict-based spider: yields quote records and author records separately."""

    name = 'quotes_2_3'
    start_urls = [
        'http://quotes.toscrape.com',
    ]
    allowed_domains = [
        'toscrape.com',
    ]

    def parse(self, response):
        """Yield a dict per quote and request the author page linked from it."""
        for quote in response.css('div.quote'):
            yield {
                'quote': quote.css('span.text::text').extract_first(),
                'author': quote.css('small.author::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }
            # Look the link up inside this quote. A page-wide selector would
            # always return the first author on the page.
            author_page = quote.css('small.author+a::attr(href)').extract_first()
            if author_page is not None:
                author_full_url = response.urljoin(author_page)
                yield scrapy.Request(author_full_url, callback=self.parse_author)

    def parse_author(self, response):
        """Yield a dict describing the author shown on this page."""
        yield {
            'author': response.css('.author-title::text').extract_first(),
            'author_born_date': response.css('.author-born-date::text').extract_first(),
            'author_born_location': response.css('.author-born-location::text').extract_first(),
            'author_description': response.css('.author-description::text').extract_first(),
        }
把子页面下要爬取的内容也声明items。
import scrapy
class QuotesItem(scrapy.Item):
    """Item holding a quote and the author details collected for it."""

    # Quote data.
    quote = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()

    # Author introduction data.
    author_born_date = scrapy.Field()
    author_born_location = scrapy.Field()
    author_description = scrapy.Field()
步骤2:爬虫
在子页面再yield item
import scrapy
from quotes_2.items import QuotesItem
class QuotesSpider(scrapy.Spider):
    """Experiment: yield the item only from the author-page callback."""

    name = 'quotes_2_5'
    start_urls = [
        'http://quotes.toscrape.com',
    ]
    allowed_domains = [
        'toscrape.com',
    ]

    def parse(self, response):
        """Fill the quote fields and pass the item to ``parse_author`` via meta."""
        for quote in response.css('div.quote'):
            item = QuotesItem()
            item['quote'] = quote.css('span.text::text').extract_first()
            item['author'] = quote.css('small.author::text').extract_first()
            item['tags'] = quote.css('div.tags a.tag::text').extract()
            # NOTE: this selector runs on the whole response, so
            # extract_first() returns the first author link on the page for
            # every quote. Kept as in the experiment; the recorded result
            # depends on it.
            author_page = response.css('small.author+a::attr(href)').extract_first()
            author_full_url = response.urljoin(author_page)
            # The same variable name is now used for assignment and use; the
            # original spelling mismatch raised NameError.
            yield scrapy.Request(author_full_url, meta={'item': item}, callback=self.parse_author)

    def parse_author(self, response):
        """Add the author fields to the item from ``response.meta`` and yield it."""
        item = response.meta['item']
        item['author_born_date'] = response.css('.author-born-date::text').extract_first()
        item['author_born_location'] = response.css('.author-born-location::text').extract_first()
        # NOTE: the description is read with the born-location selector, so
        # both fields hold the same text, as the recorded result shows.
        item['author_description'] = response.css('.author-born-location::text').extract_first()
        yield item
结果
[
{"author_description": "in Ulm, Germany", "author": "Albert Einstein", "quote": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d", "tags": ["change", "deep-thoughts", "thinking", "world"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany"}
]
parse()函数以及子页面同时yield item
import scrapy
from quotes_2.items import QuotesItem
class QuotesSpider(scrapy.Spider):
    """Experiment: yield the item in ``parse`` and again in ``parse_author``."""

    name = 'quotes_2_5'
    start_urls = [
        'http://quotes.toscrape.com',
    ]
    allowed_domains = [
        'toscrape.com',
    ]

    def parse(self, response):
        """Yield each quote item, then request the author page with the same item."""
        for quote in response.css('div.quote'):
            item = QuotesItem()
            item['quote'] = quote.css('span.text::text').extract_first()
            item['author'] = quote.css('small.author::text').extract_first()
            item['tags'] = quote.css('div.tags a.tag::text').extract()
            yield item
            # NOTE: selected on the whole response, so every quote gets the
            # first author link on the page. Kept as in the experiment.
            author_page = response.css('small.author+a::attr(href)').extract_first()
            author_full_url = response.urljoin(author_page)
            # One spelling for assignment and use; the original mismatch
            # raised NameError.
            yield scrapy.Request(author_full_url, meta={'item': item}, callback=self.parse_author)

    def parse_author(self, response):
        """Add the author fields to the item from ``response.meta`` and yield it."""
        item = response.meta['item']
        item['author_born_date'] = response.css('.author-born-date::text').extract_first()
        item['author_born_location'] = response.css('.author-born-location::text').extract_first()
        # NOTE: uses the born-location selector, so the description repeats
        # the birthplace text, as the recorded result shows.
        item['author_description'] = response.css('.author-born-location::text').extract_first()
        yield item
json文件结果
[
{"quote": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d", "author": "Albert Einstein", "tags": ["change", "deep-thoughts", "thinking", "world"]},
{"quote": "\u201cIt is our choices, Harry, that show what we truly are, far more than our abilities.\u201d", "author": "J.K. Rowling", "tags": ["abilities", "choices"]},
{"quote": "\u201cThere are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.\u201d", "author": "Albert Einstein", "tags": ["inspirational", "life", "live", "miracle", "miracles"]},
{"quote": "\u201cThe person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.\u201d", "author": "Jane Austen", "tags": ["aliteracy", "books", "classic", "humor"]},
{"quote": "\u201cImperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.\u201d", "author": "Marilyn Monroe", "tags": ["be-yourself", "inspirational"]},
{"quote": "\u201cTry not to become a man of success. Rather become a man of value.\u201d", "author": "Albert Einstein", "tags": ["adulthood", "success", "value"]},
{"quote": "\u201cIt is better to be hated for what you are than to be loved for what you are not.\u201d", "author": "Andr\u00e9 Gide", "tags": ["life", "love"]},
{"quote": "\u201cI have not failed. I've just found 10,000 ways that won't work.\u201d", "author": "Thomas A. Edison", "tags": ["edison", "failure", "inspirational", "paraphrased"]},
{"quote": "\u201cA woman is like a tea bag; you never know how strong it is until it's in hot water.\u201d", "author": "Eleanor Roosevelt", "tags": ["misattributed-eleanor-roosevelt"]},
{"quote": "\u201cA day without sunshine is like, you know, night.\u201d", "author": "Steve Martin", "tags": ["humor", "obvious", "simile"]},
{"author_description": "in Ulm, Germany", "author": "Albert Einstein", "quote": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d", "tags": ["change", "deep-thoughts", "thinking", "world"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany"}
]
把作者介绍页面放入items中
如下:
def parse(self, response):
    """Yield every quote item, then store the author URL on it and request that page."""
    for quote_sel in response.css('div.quote'):
        item = QuotesItem()
        item['quote'] = quote_sel.css('span.text::text').extract_first()
        item['author'] = quote_sel.css('small.author::text').extract_first()
        item['tags'] = quote_sel.css('div.tags a.tag::text').extract()
        yield item

        # The link is looked up on the whole response, not on quote_sel.
        author_href = response.css('small.author+a::attr(href)').extract_first()
        item['authro_full_url'] = response.urljoin(author_href)
        yield scrapy.Request(
            url=item['authro_full_url'],
            meta={'item': item},
            callback=self.parse_author,
        )
得到json文件,仍然只有一个作者。
[
{"quote": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d", "author": "Albert Einstein", "tags": ["change", "deep-thoughts", "thinking", "world"]},
{"quote": "\u201cIt is our choices, Harry, that show what we truly are, far more than our abilities.\u201d", "author": "J.K. Rowling", "tags": ["abilities", "choices"]},
{"quote": "\u201cThere are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.\u201d", "author": "Albert Einstein", "tags": ["inspirational", "life", "live", "miracle", "miracles"]},
{"quote": "\u201cThe person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.\u201d", "author": "Jane Austen", "tags": ["aliteracy", "books", "classic", "humor"]},
{"quote": "\u201cImperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.\u201d", "author": "Marilyn Monroe", "tags": ["be-yourself", "inspirational"]},
{"quote": "\u201cTry not to become a man of success. Rather become a man of value.\u201d", "author": "Albert Einstein", "tags": ["adulthood", "success", "value"]},
{"quote": "\u201cIt is better to be hated for what you are than to be loved for what you are not.\u201d", "author": "Andr\u00e9 Gide", "tags": ["life", "love"]},
{"quote": "\u201cI have not failed. I've just found 10,000 ways that won't work.\u201d", "author": "Thomas A. Edison", "tags": ["edison", "failure", "inspirational", "paraphrased"]},
{"quote": "\u201cA woman is like a tea bag; you never know how strong it is until it's in hot water.\u201d", "author": "Eleanor Roosevelt", "tags": ["misattributed-eleanor-roosevelt"]},
{"quote": "\u201cA day without sunshine is like, you know, night.\u201d", "author": "Steve Martin", "tags": ["humor", "obvious", "simile"]},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "Albert Einstein", "quote": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d", "tags": ["change", "deep-thoughts", "thinking", "world"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"}
]
把作者页面先放进items再yield
def parse(self, response):
    """Store the author URL on each item, yield it, then request the author page."""
    for quote_sel in response.css('div.quote'):
        item = QuotesItem()
        item['quote'] = quote_sel.css('span.text::text').extract_first()
        item['author'] = quote_sel.css('small.author::text').extract_first()
        item['tags'] = quote_sel.css('div.tags a.tag::text').extract()

        # The link is looked up on the whole response, not on quote_sel.
        author_href = response.css('small.author+a::attr(href)').extract_first()
        item['authro_full_url'] = response.urljoin(author_href)
        yield item

        yield scrapy.Request(
            url=item['authro_full_url'],
            meta={'item': item},
            callback=self.parse_author,
        )
def parse_author(self, response):
    """Copy the author details from this page onto the item in ``response.meta``."""
    item = response.meta['item']
    # (item field, CSS selector) pairs. The description entry uses the
    # born-location selector, exactly as in the experiment.
    field_selectors = (
        ('author_born_date', '.author-born-date::text'),
        ('author_born_location', '.author-born-location::text'),
        ('author_description', '.author-born-location::text'),
    )
    for field_name, css_query in field_selectors:
        item[field_name] = response.css(css_query).extract_first()
    yield item
结果
[
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "quote": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d", "author": "Albert Einstein", "tags": ["change", "deep-thoughts", "thinking", "world"]},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "quote": "\u201cIt is our choices, Harry, that show what we truly are, far more than our abilities.\u201d", "author": "J.K. Rowling", "tags": ["abilities", "choices"]},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "quote": "\u201cThere are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.\u201d", "author": "Albert Einstein", "tags": ["inspirational", "life", "live", "miracle", "miracles"]},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "quote": "\u201cThe person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.\u201d", "author": "Jane Austen", "tags": ["aliteracy", "books", "classic", "humor"]},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "quote": "\u201cImperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.\u201d", "author": "Marilyn Monroe", "tags": ["be-yourself", "inspirational"]},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "quote": "\u201cTry not to become a man of success. Rather become a man of value.\u201d", "author": "Albert Einstein", "tags": ["adulthood", "success", "value"]},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "quote": "\u201cIt is better to be hated for what you are than to be loved for what you are not.\u201d", "author": "Andr\u00e9 Gide", "tags": ["life", "love"]},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "quote": "\u201cI have not failed. I've just found 10,000 ways that won't work.\u201d", "author": "Thomas A. Edison", "tags": ["edison", "failure", "inspirational", "paraphrased"]},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "quote": "\u201cA woman is like a tea bag; you never know how strong it is until it's in hot water.\u201d", "author": "Eleanor Roosevelt", "tags": ["misattributed-eleanor-roosevelt"]},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "quote": "\u201cA day without sunshine is like, you know, night.\u201d", "author": "Steve Martin", "tags": ["humor", "obvious", "simile"]},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "Albert Einstein", "quote": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d", "tags": ["change", "deep-thoughts", "thinking", "world"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"}
]
交给子页面yield
def parse(self, response):
    """Build an item per quote and leave the yielding to the author-page callback."""
    for quote_sel in response.css('div.quote'):
        item = QuotesItem()
        item['quote'] = quote_sel.css('span.text::text').extract_first()
        item['author'] = quote_sel.css('small.author::text').extract_first()
        item['tags'] = quote_sel.css('div.tags a.tag::text').extract()

        # The link is looked up on the whole response, not on quote_sel.
        author_href = response.css('small.author+a::attr(href)').extract_first()
        item['authro_full_url'] = response.urljoin(author_href)

        # No dont_filter here, so Scrapy's duplicate filter applies.
        yield scrapy.Request(
            url=item['authro_full_url'],
            meta={'item': item},
            callback=self.parse_author,
        )
def parse_author(self, response):
    """Finish the item received through ``response.meta`` and yield it."""
    item = response.meta['item']

    def first_text(css_query):
        # First text match for the selector, as returned by extract_first().
        return response.css(css_query).extract_first()

    item['author_born_date'] = first_text('.author-born-date::text')
    item['author_born_location'] = first_text('.author-born-location::text')
    # Same selector as the birthplace, exactly as in the experiment.
    item['author_description'] = first_text('.author-born-location::text')
    yield item
最后只有一个
[
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "Albert Einstein", "quote": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d", "tags": ["change", "deep-thoughts", "thinking", "world"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"}
]
关闭Scrapy的去重机制
加上参数,不要过滤
def parse(self, response):
    """Build an item per quote and request the author page with filtering disabled."""
    for quote_sel in response.css('div.quote'):
        item = QuotesItem()
        item['quote'] = quote_sel.css('span.text::text').extract_first()
        item['author'] = quote_sel.css('small.author::text').extract_first()
        item['tags'] = quote_sel.css('div.tags a.tag::text').extract()

        # The link is looked up on the whole response, not on quote_sel.
        author_href = response.css('small.author+a::attr(href)').extract_first()
        item['authro_full_url'] = response.urljoin(author_href)

        # dont_filter=True lets repeated requests for the same URL through.
        yield scrapy.Request(
            url=item['authro_full_url'],
            meta={'item': item},
            callback=self.parse_author,
            dont_filter=True,
        )
def parse_author(self, response):
    """Attach the author fields to the item passed in ``response.meta``, then yield it."""
    item = response.meta['item']
    # (item field, CSS selector) pairs. The description entry uses the
    # born-location selector, exactly as in the experiment.
    for field_name, css_query in (
        ('author_born_date', '.author-born-date::text'),
        ('author_born_location', '.author-born-location::text'),
        ('author_description', '.author-born-location::text'),
    ):
        item[field_name] = response.css(css_query).extract_first()
    yield item
最后结果json文件
[
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "Albert Einstein", "quote": "\u201cTry not to become a man of success. Rather become a man of value.\u201d", "tags": ["adulthood", "success", "value"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "Marilyn Monroe", "quote": "\u201cImperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.\u201d", "tags": ["be-yourself", "inspirational"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "Steve Martin", "quote": "\u201cA day without sunshine is like, you know, night.\u201d", "tags": ["humor", "obvious", "simile"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "Andr\u00e9 Gide", "quote": "\u201cIt is better to be hated for what you are than to be loved for what you are not.\u201d", "tags": ["life", "love"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "Albert Einstein", "quote": "\u201cThere are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.\u201d", "tags": ["inspirational", "life", "live", "miracle", "miracles"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "J.K. Rowling", "quote": "\u201cIt is our choices, Harry, that show what we truly are, far more than our abilities.\u201d", "tags": ["abilities", "choices"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "Jane Austen", "quote": "\u201cThe person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.\u201d", "tags": ["aliteracy", "books", "classic", "humor"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "Albert Einstein", "quote": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d", "tags": ["change", "deep-thoughts", "thinking", "world"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "Thomas A. Edison", "quote": "\u201cI have not failed. I've just found 10,000 ways that won't work.\u201d", "tags": ["edison", "failure", "inspirational", "paraphrased"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"},
{"authro_full_url": "http://quotes.toscrape.com/author/Albert-Einstein", "author": "Eleanor Roosevelt", "quote": "\u201cA woman is like a tea bag; you never know how strong it is until it's in hot water.\u201d", "tags": ["misattributed-eleanor-roosevelt"], "author_born_date": "March 14, 1879", "author_born_location": "in Ulm, Germany", "author_description": "in Ulm, Germany"}
]