scrapy----网易招聘数据提取2(最新)

wangyi.py:

import json
from wangyi2.items import Wangyi2Item

import scrapy


class WangyiSpider(scrapy.Spider):
    """Spider for NetEase HR job postings (hr.163.com).

    The site exposes a paged JSON API that takes a POST body of
    ``{"currentPage": n, "pageSize": 10}``; pages are walked until the
    response reports ``lastPage``.
    """

    name = "wangyi"
    allowed_domains = ["163.com"]
    start_urls = ["https://hr.163.com/api/hr163/position/queryPage"]

    def __init__(self, *args, **kwargs):
        # Forward to Spider.__init__ so Scrapy's normal initialisation
        # (handling of `-a name=value` spider arguments, etc.) still runs.
        super().__init__(*args, **kwargs)
        self.page_ = 1  # page number of the next request to issue

    def _page_request(self, page):
        """Build the POST request for one page of the listing API."""
        post_data = {
            "currentPage": page,
            "pageSize": 10,
        }
        return scrapy.Request(
            url=self.start_urls[0],
            callback=self.parse,
            body=json.dumps(post_data),
            method='POST',
            headers={'Content-Type': 'application/json;charset=UTF-8'},
        )

    def start_requests(self):
        # First page only; subsequent pages are requested from parse().
        yield self._page_request(self.page_)

    def parse(self, response):
        """Extract one page of job items, then queue the next page.

        Yields ``Wangyi2Item`` instances followed by (at most) one
        ``scrapy.Request`` for the next page.
        """
        payload = response.json()

        for data in payload['data']['list']:
            item = Wangyi2Item()
            item['name'] = data['name']
            item['num'] = data['recruitNum']
            # Requirement text arrives with embedded newlines; flatten it.
            item['req'] = data['requirement'].replace('\n', '')
            yield item

        # Request the next page unless the API says this was the last one.
        self.page_ = self.page_ + 1
        if not payload['data']['lastPage']:
            yield self._page_request(self.page_)



items.py: 

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class Wangyi2Item(scrapy.Item):
    """Container for a single NetEase job posting scraped by the spider."""

    name = scrapy.Field()  # job title
    num = scrapy.Field()   # number of openings (API key: recruitNum)
    req = scrapy.Field()   # job requirements text, newlines stripped
    time = scrapy.Field()  # update time (declared but not filled by the spider)

pipelines.py:

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class Wangyi2Pipeline:
    """Append each scraped item to ``wangyi.json``, one JSON object per line."""

    def __init__(self):
        # Explicit UTF-8 is required: items contain Chinese text and
        # ensure_ascii=False writes it verbatim, which fails on platforms
        # whose default encoding is not UTF-8 (e.g. Windows cp936).
        self.file = open('wangyi.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize *item* to the output file and pass it on.

        Returns the item so the engine / later pipelines still receive it.
        """
        # Coerce the scrapy Item into a plain dict so json can serialize it.
        data = dict(item)
        json_data = json.dumps(data, ensure_ascii=False) + ',\n'
        self.file.write(json_data)
        return item

    def close_spider(self, spider):
        # Scrapy calls this once when the spider finishes -- the reliable
        # place to close the file (preferred over relying on __del__).
        self.file.close()

    def __del__(self):
        # Safety net in case close_spider was never invoked.
        if not self.file.closed:
            self.file.close()

 --------------------------------------------------------------------------------------------------------------------------------

效果图:

 scrapy----网易招聘数据提取2(最新)_第1张图片

你可能感兴趣的:(scrapy,python,开发语言)