4.爬取翻页

# -*- coding: utf-8 -*-
import scrapy
from Boss.items import BossItem

class ZhipinSpider(scrapy.Spider):
    """Crawl the zhipin.com search results for "python", one page at a time.

    The crawl starts from result page 1. Each call to ``parse`` yields one
    ``BossItem`` per listing on the page and then schedules the request for
    the next result page, until ``max_page`` has been requested.
    """

    name = 'zhipin'
    allowed_domains = ['zhipin.com']

    # URL template of a search-result page; both placeholders take the page number.
    page_url_template = 'https://www.zhipin.com/c101280600/?query=python&page=%d&ka=page-%d'

    # Only the first page is seeded here; the following pages are requested
    # from ``parse``. (Listing every page URL up front in ``start_urls`` would
    # be an alternative to manual pagination.)
    start_urls = [page_url_template % (1, 1)]

    # Number of the result page most recently requested.
    page = 1
    # Last result page to request (inclusive).
    max_page = 20

    def parse(self, response):
        """Yield one item per job listing, then request the next result page.

        Args:
            response: The downloaded search-result page.

        Yields:
            ``BossItem`` objects for every listing found, followed by a
            ``scrapy.Request`` for the next page while ``page`` has not yet
            reached ``max_page``.
        """
        job_list = response.xpath("//div[@class='job-list']//li")
        for job in job_list:
            item = BossItem()
            item["job"] = job.xpath(".//div[@class='job-title']/text()").extract_first()
            item["salary"] = job.xpath(".//span[@class='red']/text()").extract_first()
            item["company"] = job.xpath(".//div[@class='company-text']//a/text()").extract_first()

            primary = ".//div[@class='info-primary']/p//text()"
            item["position"] = job.xpath(primary + "[1]").extract_first()
            # "require" is the second and third text nodes joined together.
            # Default to "" so that a listing missing either node does not
            # raise TypeError (None + str), which would abort this callback
            # and with it the rest of the page and the pagination below.
            item["require"] = (
                job.xpath(primary + "[2]").extract_first(default="")
                + job.xpath(primary + "[3]").extract_first(default="")
            )

            item["info"] = " ".join(job.xpath(".//div[@class='company-text']/p//text()").extract())
            item["hr"] = " ".join(job.xpath(".//div[@class='info-publis']//h3[@class='name']/text()").extract())
            yield item

        # Pagination: the next page is requested only after the current one
        # has been downloaded and parsed, so pages are fetched one after
        # another, each callback scheduling its successor.
        if self.page < self.max_page:
            self.page += 1
            url = self.page_url_template % (self.page, self.page)
            yield scrapy.Request(url=url, callback=self.parse)




你可能感兴趣的:(4.爬取翻页)