CrawlSpider
- 基于scrapy进行全站数据抓取的一种技术手段
- CrawlSpider就是spider的一个子类
- 链接提取器:LinkExtractor
- 规则解析器:Rule
- 使用流程:
- 新建一个工程
- cd 工程中
- 新建一个爬虫文件:scrapy genspider -t crawl spiderName www.xxx.com
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from sunCrawlPro.items import SuncrawlproItem, DetailItem
class SunSpider(CrawlSpider):
    """Crawl the sun0769 complaint board: paginate the listing and follow
    each complaint's detail page.

    Yields ``SuncrawlproItem`` (id + title) from listing pages and
    ``DetailItem`` (id + content) from detail pages, so a pipeline can
    join the two streams on ``id``.
    """
    name = 'sun'
    start_urls = ['http://wz.sun0769.com/political/index/politicsNewest?id=1&page=']
    # Pagination links found on the listing page.
    link = LinkExtractor(allow=r'id=1&page=\d+')
    # Links to individual complaint detail pages.
    link_detail = LinkExtractor(allow=r'/political/politics/index\?id=\d+')
    rules = (
        # follow=False: only pages directly reachable from start_urls are
        # paginated; we do not recurse through every discovered page link.
        Rule(link, callback='parse_item', follow=False),
        Rule(link_detail, callback='parse_detail'),
    )

    def parse_item(self, response):
        """Parse one listing page; yield one item per complaint row."""
        li_list = response.xpath('/html/body/div[2]/div[3]/ul[2]/li')
        for li in li_list:
            item = SuncrawlproItem()
            item['id'] = li.xpath('./span[1]/text()').extract_first()
            item['title'] = li.xpath('./span[3]/a/text()').extract_first()
            yield item

    def parse_detail(self, response):
        """Parse one detail page; yield the complaint body plus its id."""
        content = response.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/pre/text()').extract_first()
        raw_id = response.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span[4]/text()').extract_first()
        item = DetailItem()
        item['content'] = content
        # extract_first() returns None when the span is missing; the original
        # unconditional raw_id.split(...) would raise AttributeError then.
        item['id'] = raw_id.split(':')[-1] if raw_id else None
        yield item
class SuncrawlproPipeline:
    """Route items scraped by SunSpider.

    Placeholder persistence: detail items are printed; every item is
    returned unchanged so later pipeline stages can still process it.
    """

    def process_item(self, item, spider):
        # The spider yields two item types; tell them apart by class name.
        # The original also read item['title']/item['id']/item['content']
        # into locals that were never used — dead code removed, which also
        # avoids a KeyError on items missing those fields.
        if item.__class__.__name__ == 'DetailItem':
            print(item)  # TODO: replace with real storage (DB/file)
        return item
class SuncrawlproItem(scrapy.Item):
    """Listing-page item: a complaint's number and headline title."""
    # complaint number shown in the first column of the listing row
    id = scrapy.Field()
    # complaint headline text
    title = scrapy.Field()
class DetailItem(scrapy.Item):
    """Detail-page item: a complaint's number and full body text."""
    # complaint number parsed from the detail-page header
    id = scrapy.Field()
    # full complaint body text
    content = scrapy.Field()