# Scraping job listings from 51job

# -*- coding: utf-8 -*-
import scrapy
from ..items import JobspiderItem


class JobSpider(scrapy.Spider):
    """Spider that walks 51job search-result pages and yields one item per listing.

    Each entry in ``start_urls`` is the first result page of one keyword
    search (Python, PHP, html5). ``parse`` extracts the listings from the
    page it is given and then follows the "next page" link, so every result
    page is downloaded exactly once.
    """

    name = 'job'
    allowed_domains = ['51job.com']
    # NOTE: the query string must read ``&degreefrom=99``. An HTML-entity
    # mangled copy of these URLs ("&deg" rendered as a degree sign) corrupts
    # that parameter.
    start_urls = ['http://search.51job.com/list/171900%252C020000%252C010000%252C030200%252C040000%252C00,000000,0000,00,0,08,Python,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
                  'http://search.51job.com/list/171900%252C020000%252C010000%252C030200%252C040000%252C00,000000,0000,00,0,08,PHP,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
                  'http://search.51job.com/list/171900%252C020000%252C010000%252C030200%252C040000%252C00,000000,0000,00,0,08,html5,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=']

    def parse(self, response):
        """Handle one search-result page.

        :param response: a downloaded search-result page
        :return: generator yielding the page's items, then the request for
            the following page (if one is linked)
        """
        # Use the response already in hand. Re-requesting ``response.url``
        # with ``dont_filter=True`` (once per callback) downloaded the same
        # page again for no benefit.
        for item in self.parse_job_info(response):
            yield item
        for request in self.parse_next_page(response):
            yield request

    def parse_next_page(self, response):
        """Yield a request for the page that follows ``response``.

        :param response: a downloaded search-result page
        :return: generator yielding at most one ``scrapy.Request`` whose
            callback is ``parse``; nothing when no next-page link is found
        """
        next_page = response.xpath("//li[@class='bk'][2]/a/@href").extract_first('')
        if not next_page:
            # No href under the second ``li.bk`` element: nothing to follow.
            return
        # One request per page, routed back through ``parse`` so the page is
        # both scraped and paginated from a single download. ``urljoin``
        # keeps this working if the href is relative. The default duplicate
        # filter is left enabled because no URL is requested twice any more.
        yield scrapy.Request(
            url=response.urljoin(next_page),
            callback=self.parse,
        )

    def parse_job_info(self, response):
        """Extract one ``JobspiderItem`` per listing row on a result page.

        Text values are stripped of surrounding whitespace; when a node is
        missing, the corresponding placeholder string is stored instead.

        :param response: a downloaded search-result page
        :return: generator of populated ``JobspiderItem`` objects
        """
        job_div_list = response.xpath("//div[@id='resultList']/div[@class='el']")
        # Same for every row on the page, so compute it once.
        job_type = "51job" if "51job.com" in response.url else "其他"
        for job_div in job_div_list:
            # Commas are replaced with "/" - presumably so the name is safe
            # in comma-separated exports; confirm against the pipeline.
            job_name = job_div.xpath("p/span/a/@title").extract_first(
                "没有工作名称").strip().replace(",", "/")
            job_company_name = job_div.xpath("span[@class='t2']/a/@title").extract_first(
                "没有公司名称").strip()
            job_place = job_div.xpath("span[@class='t3']/text()").extract_first(
                "没有工作地点").strip()
            job_salary = job_div.xpath("span[@class='t4']/text()").extract_first(
                "工资面议").strip()
            # The original read the time from the salary cell (t4) and then
            # discarded it. NOTE(review): t5 is assumed to be the date
            # column - confirm against the live markup. The fallback keeps
            # the placeholder the item stored before.
            job_time = job_div.xpath("span[@class='t5']/text()").extract_first(
                "没有时间").strip()
            self.logger.debug(
                "%s %s %s %s %s %s",
                job_type, job_name, job_company_name, job_place, job_salary, job_time,
            )

            # Data cleaning: remove blank lines, spaces and stray symbols
            # from both ends of each value (usually via strip()). It also
            # covers dropping invalid records, e.g. incomplete or duplicate
            # rows.
            item = JobspiderItem()
            item['job_name'] = job_name
            item['fan_kui_lv'] = "没有反馈率"
            item['job_company_name'] = job_company_name
            item['job_salary'] = job_salary
            item['job_place'] = job_place
            item['job_type'] = job_type
            item['job_time'] = job_time
            yield item

# Related reading: "Scraping job listings from 51job"