爬虫训练网站
在学习了Scrapy入门之后,我们可以开始进一步通过实例练习,于是我找到了上面的训练项目,虽然没有教程,但是代码难度设置比较适合初学者训练。
我们从上面下载代码,开始学习。
爬取同一网站的两个页面,并记录爬取过的URL。
import scrapy
class QuotesSpider(scrapy.Spider):
    """Spider that requests two pages of quotes.toscrape.com and logs each visited URL."""

    name = "quotes1"

    def start_requests(self):
        """Return the list of initial requests, one per hard-coded page URL.

        Each request is routed to ``parse`` through its callback.
        """
        page_urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        return [
            scrapy.Request(url=page_url, callback=self.parse)
            for page_url in page_urls
        ]

    def parse(self, response):
        """Log the URL of the response that was just received."""
        visited = 'I just visited {}'.format(response.url)
        self.log(visited)
import scrapy
class QuotesSpider(scrapy.Spider):
    """Same crawl as "quotes1", but the pages are declared via ``start_urls``.

    No ``start_requests`` is defined here; the initial URLs are only listed
    in the ``start_urls`` class attribute.
    """

    name = "quotes2"

    # Pages fetched when the spider starts.
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
        'http://quotes.toscrape.com/page/2/',
    ]

    def parse(self, response):
        """Log the URL of the response that was just received."""
        visited = 'I just visited {}'.format(response.url)
        self.log(visited)
使用CSS选择器提取网页中的信息,然后返回。
由于之前已经学习过Scrapy框架的简单教程,这部分没什么难度,就不详述了。
import scrapy
class QuotesSpider(scrapy.Spider):
    """Spider that extracts text, author and tags for each quote on two pages."""

    name = "quotes3"

    # Pages fetched when the spider starts.
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
        'http://quotes.toscrape.com/page/2/',
    ]

    def parse(self, response):
        """Return a list with one dict per ``div.quote`` element in the response.

        Each dict has the keys ``'text'``, ``'author'`` and ``'tags'``;
        ``'tags'`` is the list of all matching tag texts.
        """
        # spider_4_quotes.py shows this same spider, but it generates
        # the items individually instead of returning all of them in list
        return [
            {
                'text': block.css('span.text::text').extract_first(),
                'author': block.css('span small::text').extract_first(),
                'tags': block.css('div.tags a.tag::text').extract(),
            }
            for block in response.css('div.quote')
        ]
其中返回的数据我们怎么获得呢?
返回的数据我们可以通过json来获得。
我们的控制台也能看到相应的信息:
什么时候yield
主要是理解yield的应用:
import scrapy
class QuotesSpider(scrapy.Spider):
    """Spider that yields one item per quote instead of returning a list."""

    name = "quotes4"

    # Pages fetched when the spider starts.
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
        'http://quotes.toscrape.com/page/2/',
    ]

    def parse(self, response):
        """Yield a dict with ``'text'``, ``'author'`` and ``'tags'`` for each quote.

        Items are produced one at a time as the ``div.quote`` elements are
        iterated, rather than being collected first.
        """
        for block in response.css('div.quote'):
            quote_text = block.css('span.text::text').extract_first()
            author_name = block.css('span small::text').extract_first()
            tag_names = block.css('div.tags a.tag::text').extract()
            yield {
                'text': quote_text,
                'author': author_name,
                'tags': tag_names,
            }
import scrapy
from datetime import datetime
class RedditSpider(scrapy.Spider):
    """Spider that yields one item per non-stickied post on two subreddit pages."""

    name = 'reddit'

    # Listing pages fetched when the spider starts.
    start_urls = [
        'http://reddit.com/r/programming',
        'http://reddit.com/r/python',
    ]

    def parse(self, response):
        """Yield a dict describing each post found in the response.

        Keys: ``'title'``, ``'link'``, ``'user_name'``, ``'user_url'``,
        ``'score'`` (int, 0 when no digits are shown) and ``'time'`` (a
        ``datetime`` parsed from the ``<time datetime=...>`` attribute, or
        ``None`` when that attribute is absent).
        """
        # ':not(.stickied)' avoids scraping announcements that are stuck to the top
        for thing in response.css('.thing:not(.stickied)'):
            posted_at = thing.css('time::attr(datetime)').extract_first()
            yield {
                'title': thing.css('.title::text').extract_first(),
                'link': thing.css('.title > a::attr(href)').extract_first(),
                'user_name': thing.css('a.author::text').extract_first(),
                'user_url': thing.css('a.author::attr(href)').extract_first(),
                # when score is 0, reddit shows a bullet point instead of a 0;
                # the pattern is a raw string so '\d' is not treated as an
                # (invalid) string escape sequence
                'score': int(thing.css('.score.unvoted::text').re_first(r'(\d+)') or 0),
                # strptime() raises TypeError on None, which would abort the
                # remaining posts of the page, so a missing timestamp is
                # reported as None instead
                'time': (
                    datetime.strptime(posted_at, '%Y-%m-%dT%H:%M:%S+00:00')
                    if posted_at is not None
                    else None
                ),
            }