scrapy CrawlSpider 爬全站数据

# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
# from scrapy.linkextractors.sgml import SgmlLinkExtractor
from scrapy.linkextractors import LinkExtractor

from CrawlSpiderTest.items import CrawlspidertestItem

class CsdnarticleSpider(CrawlSpider):
    """Site-wide crawler for one CSDN author's blog articles.

    Starts from a single article page and follows every link whose URL
    matches the author's article-detail path, yielding one item (the
    article title) per matched page.
    """

    name = 'csdnArticle'
    allowed_domains = ['blog.csdn.net']
    start_urls = ['https://blog.csdn.net/u012150179/article/details/11749017']

    # `allow` takes regex pattern(s). Use a raw string for the regex and a
    # real single-element tuple — the original `('…')` was just a
    # parenthesized string (LinkExtractor accepts that too, but the trailing
    # comma makes the intended tuple explicit).
    pagelink = LinkExtractor(allow=(r'/u012150179/article/details',))

    rules = [
        # follow=True: keep extracting links from every matched page so the
        # crawl spreads across the whole article set.
        Rule(pagelink, callback='parse_item', follow=True),
    ]

    def parse_item(self, response):
        """Extract the article title from one article page.

        Yields a CrawlspidertestItem whose 'title' field is the text of the
        `.title-article` element, or None if the selector matches nothing.

        Note: a CrawlSpider must NOT override `parse` — doing so disables
        the rule-based link following, which is why the commented-out
        `parse` stub was removed rather than kept around.
        """
        item = CrawlspidertestItem()
        item['title'] = response.css('.title-article::text').extract_first()
        yield item
 
 

http://www.waitingfy.com/archives/3937

 

你可能感兴趣的:(scrapy)