CrawlSpider全站数据爬取

CrawlSpider

  • 基于scrapy进行全站数据抓取的一种技术手段
  • CrawlSpider就是spider的一个子类
  • 连接提取器:LinkExtractor
    • 规则解析器:Rule
  • 使用流程:
    • 新建一个工程
    • cd 工程中
    • 新建一个爬虫文件:scrapy genspider -t crawl spiderName www.xxx.com

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from sunCrawlPro.items import SuncrawlproItem, DetailItem


class SunSpider(CrawlSpider):
    """Site-wide crawler for the wz.sun0769.com complaint board.

    Two link extractors drive the crawl: one for pagination links on the
    listing pages, one for per-complaint detail pages.  Listing rows and
    detail bodies are emitted as separate item types and re-joined later
    (by 'id') in the pipeline.
    """

    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/political/index/politicsNewest?id=1&page=']

    # Link extractor: `allow` is a regex matched against candidate hrefs.
    # This one matches pagination links on the listing pages.
    link = LinkExtractor(allow=r'id=1&page=\d+')
    # This one matches links to individual complaint detail pages.
    link_detail = LinkExtractor(allow=r'/political/politics/index\?id=\d+')

    rules = (
        # Request every extracted pagination link and parse it with
        # parse_item.  follow=False: do not re-apply the extractors to the
        # pages those links lead to.
        Rule(link, callback='parse_item', follow=False),
        # `follow` defaults to False when a callback is given, so detail
        # pages are parsed but not mined for further links.
        Rule(link_detail, callback='parse_detail'),
    )

    def parse_item(self, response):
        """Parse one listing page; yield a SuncrawlproItem per complaint row."""
        li_list = response.xpath('/html/body/div[2]/div[3]/ul[2]/li')
        for li in li_list:
            item = SuncrawlproItem()
            # Complaint number from the first column; title from the link text.
            item['id'] = li.xpath('./span[1]/text()').extract_first()
            item['title'] = li.xpath('./span[3]/a/text()').extract_first()
            yield item

    def parse_detail(self, response):
        """Parse one detail page; yield a DetailItem with the complaint body."""
        content = response.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/pre/text()').extract_first()
        raw_id = response.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span[4]/text()').extract_first()
        item = DetailItem()
        item['content'] = content
        # The span text is expected to look like '<label>:<number>'; keep the
        # part after the colon.  Guard against a missing node so a page-layout
        # change yields an item with id=None instead of raising AttributeError.
        item['id'] = raw_id.split(':')[-1] if raw_id is not None else None
        yield item
class SuncrawlproPipeline:
    """Item pipeline that dispatches on the concrete item class from SunSpider."""

    def process_item(self, item, spider):
        """Route items by type and pass them on.

        Dispatch compares the class *name* rather than using isinstance, which
        avoids importing the items module here.  Scrapy requires the item to be
        returned so any later pipelines still receive it.
        """
        if item.__class__.__name__ == 'DetailItem':
            # Detail-page item: carries 'content' and 'id'.
            print(item)  # TODO: persist (e.g. to a DB) instead of printing
        # else: listing-page SuncrawlproItem with 'title' and 'id' —
        # currently not acted upon.
        return item
class SuncrawlproItem(scrapy.Item):
    # Listing-page record: complaint number and its headline.
    id = scrapy.Field()
    title = scrapy.Field()

class DetailItem(scrapy.Item):
    # Detail-page record: complaint number plus the full body text.
    id = scrapy.Field()
    content = scrapy.Field()

你可能感兴趣的:(python,#,python爬虫)