import json

import scrapy

from wangyi2.items import Wangyi2Item

class WangyiSpider(scrapy.Spider):
    name = "wangyi"
    allowed_domains = ["163.com"]
    start_urls = ["https://hr.163.com/api/hr163/position/queryPage"]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.page_ = 1

    def start_requests(self):
        url = self.start_urls[0]
        # Build the JSON payload for the first page
        post_data = {
            "currentPage": 1,
            "pageSize": 10,
        }
        self.logger.debug(post_data)
        # Send a POST request to the url
        yield scrapy.Request(
            url=url,
            callback=self.parse,
            body=json.dumps(post_data),
            method='POST',
            headers={'Content-Type': 'application/json;charset=UTF-8'},
        )

    def parse(self, response):
        dic = response.json()
        data_list = dic['data']['list']
        for data in data_list:
            item = Wangyi2Item()
            item['name'] = data['name']
            item['num'] = data['recruitNum']
            item['req'] = data['requirement'].replace('\n', '')
            # item['time'] = data['updateTime']  # left disabled: the value
            # appears to need timestamp formatting before it is useful
            yield item
        self.page_ += 1
        # Keep paging until the API reports this is the last page
        if not dic['data']['lastPage']:
            url = self.start_urls[0]
            # Build the payload for the next page
            post_data = {
                "currentPage": self.page_,
                "pageSize": 10,
            }
            self.logger.debug(post_data)
            # Send a POST request to the url
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                body=json.dumps(post_data),
                method='POST',
                headers={'Content-Type': 'application/json;charset=UTF-8'},
            )
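

# --- Optional standalone check (not part of the Scrapy project) ---
# A minimal sketch for probing the same endpoint with plain `requests`,
# useful when debugging the payload outside Scrapy. It assumes the API
# accepts the same JSON body the spider sends and returns the data.list /
# data.lastPage structure that parse() reads; nothing here is guaranteed
# by the site itself.
import json

import requests

url = "https://hr.163.com/api/hr163/position/queryPage"
payload = {"currentPage": 1, "pageSize": 10}
resp = requests.post(
    url,
    data=json.dumps(payload),
    headers={"Content-Type": "application/json;charset=UTF-8"},
    timeout=10,
)
resp.raise_for_status()
data = resp.json()["data"]
print(len(data["list"]), "positions on page 1; lastPage =", data["lastPage"])
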
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class Wangyi2Item(scrapy.Item):
    name = scrapy.Field()  # position title
    num = scrapy.Field()   # number of openings (recruitNum)
    req = scrapy.Field()   # job requirements
    time = scrapy.Field()  # update time (not yet populated by the spider)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class Wangyi2Pipeline:
    def open_spider(self, spider):
        # Open the output file once, when the spider starts
        self.file = open('wangyi.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Cast the item object to a plain dict
        item = dict(item)
        # Serialize the dict; each item becomes one comma-terminated JSON
        # object per line (so the file is not a single valid JSON array)
        json_data = json.dumps(item, ensure_ascii=False) + ',\n'
        # Write it out
        self.file.write(json_data)
        # Return the item so the engine can hand it to later pipelines
        return item

    def close_spider(self, spider):
        # Close the file when the spider finishes; this hook is more
        # reliable than __del__, which is not guaranteed to run
        self.file.close()
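

# --- settings.py (excerpt) ---
# As the note at the top of the pipeline file says, the pipeline only runs
# once it is registered in ITEM_PIPELINES. A minimal sketch, assuming the
# default project layout (wangyi2/pipelines.py); the priority 300 is just
# a conventional value. With this in place, `scrapy crawl wangyi` writes
# the comma-separated JSON objects to wangyi.json.
ITEM_PIPELINES = {
    "wangyi2.pipelines.Wangyi2Pipeline": 300,
}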