抓取

# cnblogs.py
"""Scrapy spider that crawls the news listing on news.cnblogs.com."""
from urllib import parse  # Python 3 home of urljoin (the old urllib2 had no `parse`)

import scrapy
from scrapy import Request


class CnblogsSpider(scrapy.Spider):
    """Crawl listing pages and schedule each article's detail page for download."""

    name = 'cnblogs'
    allowed_domains = ['news.cnblogs.com']
    start_urls = ['http://news.cnblogs.com/']

    def parse(self, response):
        """Extract article URLs from a listing page and hand them to Scrapy.

        For each article the thumbnail URL is passed along in ``meta`` so
        that :meth:`parse_detail` can associate it with the article body.
        """
        # One .news_block node per article on the listing page.
        post_nodes = response.css('.news_block')
        for post in post_nodes:
            # Thumbnail image address (may be None if the article has no image).
            image_url = post.css('.entry_summary a img::attr(src)').extract_first()
            # Article URL, possibly relative to the listing page.
            post_url = post.css('.news_entry a::attr(href)').extract_first()
            # Resolve the article URL against the current page URL.
            request_url = parse.urljoin(response.url, post_url)
            # Scrapy downloads request_url asynchronously and calls
            # parse_detail with the response; meta carries the thumbnail.
            yield Request(url=request_url,
                          meta={"front_image_url": image_url},
                          callback=self.parse_detail)
        # TODO: follow the "next page" link and yield a Request back to
        # self.parse, as the original comments describe but never implemented.

    def parse_detail(self, response):
        """Parse a single article detail page (not implemented yet)."""
        pass

你可能感兴趣的:(抓取)