scrapy爬虫--升级练习

scrapy startproject toscrape_book

scrapy genspider books books.toscrape.com

Created spider 'books' using template 'basic' in module:

toscrape_book.spiders.books

这个命令可以生成一个spider，并指定它要爬取的网站域名

自动生成:

class BooksSpider(scrapy.Spider):
    """Spider skeleton as generated by ``scrapy genspider``.

    Only the spider name, the allowed domain and the start URL are filled
    in; ``parse`` is still a placeholder at this stage.
    """

    name = 'books'
    # The sandbox site is served from "books.toscrape.com" (plural).
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    # Parse callback for book-list pages.
    def parse(self, response):
        """Handle a downloaded book-list page (not implemented yet)."""
        pass



编写封装Item

class BooksItem(scrapy.Item):
    """Item class wrapping the information scraped for one book."""

    name = scrapy.Field()           # book title
    price = scrapy.Field()          # price
    review_rating = scrapy.Field()  # review rating level
    review_num = scrapy.Field()     # number of reviews
    upc = scrapy.Field()            # product code
    stock = scrapy.Field()          # quantity in stock



编写spider

#书籍列表页面解析函数

def parse(self, response):
    """Parse a book-list page.

    Yields one request per book link found on the page (handled by
    ``parse_book``) and, when a "next" pager link exists, one request for
    the following list page (handled by this method again).
    """
    # Links to the individual book pages.
    book_extractor = LinkExtractor(restrict_css='article.product_pod h3')
    for link in book_extractor.extract_links(response):
        yield scrapy.Request(link.url, callback=self.parse_book)

    # Link to the next list page; absent on the last page.
    pager_extractor = LinkExtractor(restrict_css='ul.pager li.next')
    links = pager_extractor.extract_links(response)
    if links:
        next_url = links[0].url
        yield scrapy.Request(next_url, callback=self.parse)

#书籍页面的解析函数

def parse_book(self, response):
    """Parse a single book page and yield a populated ``BooksItem``.

    Fills ``name``, ``price`` and ``review_rating`` from the
    ``div.product_main`` section, and ``upc``, ``stock`` and
    ``review_num`` from the striped product-information table.
    """
    book = BooksItem()

    main = response.css('div.product_main')
    book['name'] = main.xpath('./h1/text()').extract_first()
    book['price'] = main.css('p.price_color::text').extract_first()
    # The class attribute is expected to look like "star-rating Three";
    # capture the word after the space.
    book['review_rating'] = main.css('p.star-rating::attr(class)') \
        .re_first(r'star-rating ([A-Za-z]+)')

    table = response.css('table.table.table-striped')
    # First row: product code.
    book['upc'] = table.xpath('(.//tr)[1]/td/text()').extract_first()
    # Second-to-last row: availability text, expected to contain
    # "(<digits> available)"; keep only the digits.
    book['stock'] = table.xpath('(.//tr)[last()-1]/td/text()') \
        .re_first(r'\((\d+) available\)')
    # Last row: number of reviews. Stored under its own field so it does
    # not overwrite the rating extracted above.
    book['review_num'] = table.xpath('(.//tr)[last()]/td/text()').extract_first()

    yield book

设置输出顺序


# Order of the fields (columns) in the exported output.
FEED_EXPORT_FIELDS = [
    'upc',
    'name',
    'price',
    'stock',
    'review_rating',
    'review_num',
]

英文单词和阿拉伯数字的映射关系

class BookPipline(object):
    """Item pipeline converting the textual review rating to an integer."""

    # English rating word -> numeric rating.
    review_rating_map = {
        'One': 1,
        'Two': 2,
        'Three': 3,
        'Four': 4,
        'Five': 5,
    }

    def process_item(self, item, spider):
        """Replace ``item['review_rating']`` with its numeric value.

        Args:
            item: Mapping-like item; only ``review_rating`` is read/written.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object, modified in place when a rating is
            present.

        Raises:
            KeyError: If the rating is a non-empty value that is not one of
                the keys of ``review_rating_map``.
        """
        rating = item.get('review_rating')
        # Missing or empty ratings are left untouched.
        if rating:
            item['review_rating'] = self.review_rating_map[rating]
        return item


scrapy crawl books -o books.csv

你可能感兴趣的:(scrapy爬虫--升级练习)