Crawling 51job (前程无忧) job postings with Scrapy

Create the project:

scrapy startproject ScrapyDemo
cd ScrapyDemo
scrapy genspider bigqcwy msearch.51job.com
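
After these commands, Scrapy generates roughly the following project skeleton (bigqcwy.py is the spider created by genspider):

ScrapyDemo/
├── scrapy.cfg
└── ScrapyDemo/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── bigqcwy.py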

Add the fields to scrape in items.py:

class ScrapydemoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Job title
    name = scrapy.Field()
    # Salary
    salary = scrapy.Field()
    # Hiring company
    company = scrapy.Field()
    # Work location
    jobPlace = scrapy.Field()
    # Work experience
    jobExperience = scrapy.Field()
    # Education requirement
    education = scrapy.Field()
    # Job description (responsibilities) -- not collected
    # jobContent = scrapy.Field()
    # Job requirements (skills)
    jobRequirement = scrapy.Field()

Edit the spider file bigqcwy.py:
The salary field gets some simple cleaning (normalized to 千/月 where possible).

# -*- coding: utf-8 -*-
import scrapy
import time
from ScrapyDemo.items import ScrapydemoItem
import re


class BigqcwySpider(scrapy.Spider):
    name = 'bigqcwy'
    allowed_domains = ['msearch.51job.com']
    custom_settings = {
        "DEFAULT_REQUEST_HEADERS": {
            'Cookie': 'set your cookie here',
        },
        "AUTOTHROTTLE_ENABLED": True,
        # "DOWNLOAD_DELAY": 1,
        # "ScrapyDemo.pipelines.ScrapydemoPipeline": 300,
    }
    start_urls = ['https://msearch.51job.com/']

    def start_requests(self):
        # URL-encoded funtype (job category) codes to search
        funtype_list = ['0100%2C7700%2C7200%2C7300%2C7800', '7400%2C2700%2C7900%2C7500%2C6600', '8000%2C6100%2C2600%2C2800%2C3300']
        for funtype in funtype_list:
            # Request up to 2000 result pages per category
            for pageno in range(1, 2001):
                time.sleep(2)
                start_url = 'https://msearch.51job.com/job_list.php?funtype=' + str(funtype) + '&jobarea=000000&filttertype=loginmore&pageno=' + str(pageno)
                yield scrapy.Request(url=start_url, callback=self.parse)

    def parse(self, response):
        # Collect the detail-page links on each list page
        list_url = response.xpath('//*[@id="pageContent"]/div[3]/a')
        for link in list_url:
            time.sleep(1)
            url = "https:" + link.xpath('@href').extract()[0]
            # print("Detail URL:", url)
            if url:
                yield scrapy.Request(url=url, callback=self.parse_item)

    def parse_item(self, response):
        # time.sleep(2)
        item = ScrapydemoItem()
        # selector = Selector(response)
        # Job title
        item['name'] = response.xpath('//*[@id="pageContent"]/div[1]/div[1]/p/text()').extract_first()
        # Salary (normalized to 千/月 where possible)
        try:
            sa = response.xpath('//*[@id="pageContent"]/div[1]/p/text()').extract_first()
            num = list(re.findall(r'([0-9]+(\.?[0-9]?)?)-([0-9]+(\.?[0-9]?)?)', sa)[0])
            if '万' in sa and '月' in sa:
                # 万/月 -> 千/月, e.g. '1-1.5万/月' -> '10-15千/月'
                sa1 = float(num[0]) * 10
                sa2 = float(num[2]) * 10
                sa3 = str(sa1).replace('.0', '')
                sa4 = str(sa2).replace('.0', '')
                item['salary'] = sa3 + '-' + sa4 + '千/月'
            elif '万' in sa and '年' in sa:
                # 1. Convert annual 万 to monthly 万
                sa1 = float(num[0]) / 12
                sa2 = float(num[2]) / 12
                # Keep only one decimal place via the number regex
                n1 = list(re.findall(r'([0-9]+(\.?[0-9]?)?)', str(sa1))[0])
                n2 = list(re.findall(r'([0-9]+(\.?[0-9]?)?)', str(sa2))[0])
                sa1 = str(n1[0]).replace('.0', '')
                sa2 = str(n2[0]).replace('.0', '')
                # 2. Convert to 千/月 (thousands per month)
                sa3 = float(sa1) * 10
                sa4 = float(sa2) * 10
                sa5 = str(sa3).replace('.0', '')
                sa6 = str(sa4).replace('.0', '')
                item['salary'] = sa5 + '-' + sa6 + '千/月'
            else:
                item['salary'] = sa
        except:
            item['salary'] = '面议'
        # Hiring company
        item['company'] = response.xpath('//*[@id="pageContent"]/div[2]/a[1]/p/text()').extract_first()
        # Work address
        try:
            dizhi = response.xpath('//*[@id="pageContent"]/div[2]/a[2]/span/text()').extract_first().replace('上班地址 : ', ':')
        except:
            dizhi = ''
        # City
        city = response.xpath('//*[@id="pageContent"]/div[1]/div[1]/em/text()').extract_first()
        # Work location (city + address)
        try:
            item['jobPlace'] = city + dizhi
        except:
            item['jobPlace'] = city
        # Work experience
        try:
            item['jobExperience'] = response.xpath('//*[@id="pageContent"]/div[1]/div[2]/span[2]/text()').extract_first()
        except:
            item['jobExperience'] = '数据缺失'
        # Education requirement
        try:
            item['education'] = response.xpath('//*[@id="pageContent"]/div[1]/div[2]/span[3]/text()').extract_first()
        except:
            item['education'] = '数据缺失'
        # Job description (responsibilities) -- not collected
        # try:
        #     item['jobContent'] = response.xpath('string(//*[@id="pageContent"]/div[3]/div[3]/article)').extract_first().split(':')[1].split(':')[0]
        # except:
        #     item['jobContent'] = '无数据'
        # Job requirements (skills): the article sits in div[3] on some pages and div[2] on others
        try:
            jobR = response.xpath('string(//*[@id="pageContent"]/div[3]/div[3]/article)').extract_first()
            if jobR != '':
                item['jobRequirement'] = jobR
            else:
                item['jobRequirement'] = response.xpath('string(//*[@id="pageContent"]/div[3]/div[2]/article)').extract_first()
        except:
            item['jobRequirement'] = '数据缺失'
        # print("职位名称:", item['name'])
        # print("薪资水平:", item['salary'])
        # print("招聘单位:", item['company'])
        # print("工作地点:", item['jobPlace'])
        # print("工作经验:", item['jobExperience'])
        # print("学历要求:", item['education'])
        # print("任职要求(技能要求):", item['jobRequirement'])
        return item
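
As a quick sanity check of the salary cleaning above, the conversion logic can be pulled out into a small standalone sketch (the sample strings below are made-up examples, not actual responses):

import math
import re

def normalize_salary(sa):
    # Same idea as the cleaning in parse_item: turn 万/月 and 万/年 ranges into 千/月
    num = re.findall(r'([0-9]+(\.?[0-9]?)?)-([0-9]+(\.?[0-9]?)?)', sa)[0]
    low, high = float(num[0]), float(num[2])
    if '万' in sa and '年' in sa:
        # Annual 万 -> monthly 万, truncated to one decimal place
        low = math.floor(low / 12 * 10) / 10
        high = math.floor(high / 12 * 10) / 10
    elif not ('万' in sa and '月' in sa):
        # Anything else is left as-is (parse_item falls back to '面议' on errors)
        return sa
    def fmt(x):
        return str(x * 10).replace('.0', '')
    return fmt(low) + '-' + fmt(high) + '千/月'

print(normalize_salary('1-1.5万/月'))   # -> 10-15千/月
print(normalize_salary('10-20万/年'))   # -> 8-16千/月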

Edit pipelines.py:
The scraped items are stored in MongoDB.

from pymongo import MongoClient

class ScrapydemoPipeline(object):

    def open_spider(self, spider):
        # Connect to the local MongoDB instance when the spider starts
        self.client = MongoClient('localhost', 27017)
        self.db = self.client.bigqcwy_db
        self.collection = self.db.bigqcwy_collection

    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # Close the client (a collection has no close() method)
        self.client.close()
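
After a crawl, the stored documents can be spot-checked from a Python shell (assuming MongoDB is running locally on the default port):

from pymongo import MongoClient

db = MongoClient('localhost', 27017).bigqcwy_db
print(db.bigqcwy_collection.count_documents({}))  # number of stored postings
print(db.bigqcwy_collection.find_one())           # one sample document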

Edit settings.py:

USER_AGENT = 'set your User-Agent here'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 1
# The cookie is supplied via DEFAULT_REQUEST_HEADERS in the spider, so the cookie middleware is disabled
COOKIES_ENABLED = False
ITEM_PIPELINES = {
   'ScrapyDemo.pipelines.ScrapydemoPipeline': 300,
}
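
With the pipeline enabled, start the crawl from the project root:

scrapy crawl bigqcwy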

Crawl results:
[Screenshot of the crawl results]
