爬取干货集中营数据(3)

爬取干货集中营数据(3)

http://gank.io/

改进后的妹纸爬虫 ಠ౪ಠ

1 项目结构


爬取干货集中营数据(3)_第1张图片
project gank

2 修改 Item

  • title 标题
  • images 妹子图片
  • leftLink 左部导航
  • rightLink 右部导航
 class GankPageItem(scrapy.Item):
        """Container for the data scraped from one gank.io page."""
        # Page title.
        title = scrapy.Field()
        # Pictures found on the page.
        images = scrapy.Field()
        # Left-hand navigation link(s).
        leftLink = scrapy.Field()
        # Right-hand navigation link(s).
        rightLink = scrapy.Field()

3 新建 gankpage_spider.py (spider 目录下)

# -*- coding: utf-8 -*-

import scrapy

from gank.items import GankPageItem


class GnakPageSpider(scrapy.Spider):
    """Crawl gank.io pages by walking the page navigation in both directions.

    ``parse`` handles the start page and requests both its left and right
    neighbours. Every follow-up page only continues in the direction it was
    reached from (``to_left_page`` keeps going left, ``to_right_page`` keeps
    going right).
    """

    name = "gank"
    allowed_domains = ["gank.io"]
    start_urls = [
        "http://gank.io",
    ]
    # Prepended to the hrefs taken from the navigation links
    # (presumably site-relative paths -- verify against the live markup).
    url_prefix = "http://gank.io"

    def parse(self, response):
        """Yield the start page's item, then requests for both neighbours."""
        pageItem = self.extract_item(response)
        yield pageItem

        # Crawl leftwards when the left navigation holds exactly one link.
        leftRequest = self._follow(pageItem, 'leftLink', self.to_left_page)
        if leftRequest is not None:
            yield leftRequest
        # Crawl rightwards when the right navigation holds exactly one link.
        rightRequest = self._follow(pageItem, 'rightLink', self.to_right_page)
        if rightRequest is not None:
            yield rightRequest

    def extract_item(self, response):
        """Parse ``response`` into a ``GankPageItem``.

        All four fields are always populated, so callers can index the item
        without risking ``KeyError``: ``title`` is a string (empty when the
        page has no ``<title>`` text) and ``images``, ``leftLink`` and
        ``rightLink`` are lists (possibly empty).
        """
        selector = scrapy.Selector(response)
        tagTitle = selector.xpath('head/title')
        tagNavDivs = selector.xpath('(//div[contains(@class, "typo")]/div[contains(@class, "container content")]/div[contains(@class, "row")])[1]/div')
        tagPs = selector.xpath('//div[contains(@class, "typo")]/div[contains(@class, "container content")]/div[contains(@class, "outlink")]/p')

        pageItem = GankPageItem()
        # Page title; fall back to an empty string instead of raising
        # IndexError when no title text is present.
        titles = tagTitle.xpath('text()').extract()
        pageItem['title'] = titles[0] if titles else ''
        # Picture URLs.
        pageItem['images'] = tagPs.xpath('./img/@src').extract()
        # Navigation links. Start from empty lists: reading an unset field of
        # a scrapy.Item raises KeyError, which previously aborted the crawl on
        # any page that did not have exactly two navigation divs.
        pageItem['leftLink'] = []
        pageItem['rightLink'] = []
        if len(tagNavDivs) == 2:
            pageItem['leftLink'] = tagNavDivs[0].xpath('./p/a/@href').extract()
            pageItem['rightLink'] = tagNavDivs[1].xpath('./p/a/@href').extract()
        elif len(tagNavDivs) > 0:
            # As before, a lone navigation div is treated as the right link.
            pageItem['rightLink'] = tagNavDivs[0].xpath('./p/a/@href').extract()

        return pageItem

    def to_left_page(self, response):
        """Yield this page's item and keep crawling leftwards."""
        pageItem = self.extract_item(response)
        yield pageItem

        leftRequest = self._follow(pageItem, 'leftLink', self.to_left_page)
        if leftRequest is not None:
            yield leftRequest

    def to_right_page(self, response):
        """Yield this page's item and keep crawling rightwards."""
        pageItem = self.extract_item(response)
        yield pageItem

        rightRequest = self._follow(pageItem, 'rightLink', self.to_right_page)
        if rightRequest is not None:
            yield rightRequest

    def _follow(self, pageItem, field, callback):
        """Build the request for the single link stored in ``pageItem[field]``.

        Returns ``None`` unless the field holds exactly one link, matching the
        original "follow only when there is exactly one link" rule.
        """
        links = pageItem.get(field) or []
        if len(links) != 1:
            return None
        return scrapy.Request(self.url_prefix + links[0], callback=callback)

4 执行爬虫

$> scrapy crawl gank

你可能感兴趣的:(爬取干货集中营数据(3))