# -*- coding: utf-8 -*-
import scrapy
from qcwy.items import QcwyItem
from urllib import parse
import re
class A51jobSpider(scrapy.Spider):
    """Crawl 51job search-result pages for a keyword and follow each posting.

    ``parse`` handles a result list page: it builds one ``QcwyItem`` per listed
    position, requests that position's detail page, and then requests the next
    list page until the page count shown by the pager is reached.
    ``parse_info`` fills the remaining item fields from the detail page.
    """

    name = '51job'
    allowed_domains = ['51job.com']
    keyword = "python开发工程师"  # search keyword; change it to crawl other positions
    # The keyword is percent-encoded twice before it is placed in the URL path.
    kw = parse.quote(parse.quote(keyword))
    # {0} is the encoded keyword, {1} the result page number.
    # NOTE: the query previously contained "°reefrom=99", which is what
    # "&degreefrom=99" turns into when "&deg" is decoded as an HTML entity;
    # the parameter is spelled out again here.
    base_url = (
        "https://search.51job.com/list/020000,000000,0000,00,9,99,{0},2,{1}.html"
        "?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99"
        "&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
    )
    offset = 1  # number of the result page currently being crawled
    start_urls = [base_url.format(kw, offset)]

    @staticmethod
    def _total_pages(response):
        """Return the first number found in the pager element, or None."""
        pager = response.xpath('//span[@class="td"]').extract_first()
        if pager is None:
            return None
        found = re.search(r"\d+", pager)
        return int(found.group()) if found else None

    @staticmethod
    def _join_text(fragments):
        """Join text nodes into one string with whitespace runs collapsed.

        The previous ``replace('\\s', "")`` only removed the literal two
        characters backslash + "s", so it never cleaned any whitespace.
        """
        return " ".join(" ".join(str(part) for part in fragments).split())

    def parse(self, response):
        """Parse one search-result page.

        Yields a detail-page request per listed position (carrying the partly
        filled item in ``meta["iteminfo"]``) and, while more pages remain, a
        request for the next result page.

        :param response: response for a search-result list page
        :return: generator of ``scrapy.Request`` objects
        """
        if not response.body:
            return

        for row in response.xpath('//div[@class="el"]'):
            position = row.xpath('./p/span/a/text()').extract_first()
            href = row.xpath('./p/span/a/@href').extract_first()
            if position is None or not href:
                # Not every "el" block is a job row (presumably the table
                # header is one of them); without a link there is nothing
                # to follow, so skip it instead of raising IndexError.
                continue

            item = QcwyItem()
            item['position'] = str(position).replace('\r\n', "").strip()
            item['position_href'] = href
            # Missing cells become "" rather than aborting the whole page.
            item['company'] = row.xpath('./span/a/text()').extract_first(default="")
            item['company_href'] = row.xpath('./span/a/@href').extract_first(default="")
            item['workplace'] = row.xpath('./span[@class="t3"]/text()').extract_first(default="")
            item['pay'] = row.xpath('./span[@class="t4"]/text()').extract_first(default="")
            item['release_time'] = row.xpath('./span[@class="t5"]/text()').extract_first(default="")
            yield scrapy.Request(
                url=href,
                callback=self.parse_info,
                meta={"iteminfo": item},
                headers={'referer': href},
            )

        total_pages = self._total_pages(response)
        self.offset += 1
        if total_pages is None:
            # Without a page count there is no stop condition; do not paginate.
            self.logger.warning("No page count found on %s; stopping pagination", response.url)
            return
        if self.offset > total_pages:
            return
        next_url = self.base_url.format(self.kw, self.offset)
        yield scrapy.Request(next_url, callback=self.parse, headers={'referer': next_url})

    def parse_info(self, response):
        """Fill in the detail-page fields of an item and emit it.

        :param response: response for a position detail page; the item started
            in ``parse`` is read from ``response.meta["iteminfo"]``
        :return: generator yielding the completed item
        """
        item = response.meta["iteminfo"]
        item['position_ask'] = self._join_text(
            response.xpath('//div[@class="cn"]/p[@class="msg ltype"]/text()').extract())
        item['position_welfare'] = self._join_text(
            response.xpath('//div[@class="cn"]//div[@class="t1"]/span/text()').extract())
        item['position_info'] = self._join_text(
            response.xpath(
                '//div[@class="tCompany_main"]//div[@class="bmsg job_msg inbox"]//text()').extract())
        item['position_el'] = self._join_text(
            response.xpath(
                '//div[@class="tCompany_main"]/div[@class="tBorderTop_box"]'
                '/div[@class="bmsg inbox"]//text()').extract())
        yield item
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class QcwyItem(scrapy.Item):
    """One scraped job posting; every attribute below is a ``scrapy.Field``."""

    position = scrapy.Field()          # job title
    position_href = scrapy.Field()     # link to the job posting
    position_ask = scrapy.Field()      # job requirements
    position_welfare = scrapy.Field()  # benefits
    position_info = scrapy.Field()     # job description
    position_el = scrapy.Field()       # other information about the position
    company = scrapy.Field()           # company name
    company_href = scrapy.Field()      # link to the company page
    workplace = scrapy.Field()         # work location
    pay = scrapy.Field()               # salary
    release_time = scrapy.Field()      # publication time
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient
class QcwyPipeline(object):
    """Store every scraped item as a document in MongoDB.

    A single ``MongoClient`` is shared for the whole crawl instead of opening
    a new, never-closed connection for each item. The client is created in
    ``open_spider`` (or lazily on the first item) and released in
    ``close_spider``.
    """

    # Connection target; override these on a subclass to store elsewhere.
    mongo_host = "127.0.0.1"
    mongo_port = 27017
    mongo_db = 'qcwy'
    mongo_collection = 'python'  # the collection name can be changed freely

    # Set once a connection has been opened.
    client = None
    collection = None

    def open_spider(self, spider):
        """Open the MongoDB connection if it is not open yet.

        :param spider: the spider being started (unused)
        """
        if self.client is None:
            self.client = MongoClient(host=self.mongo_host, port=self.mongo_port)
            self.collection = self.client[self.mongo_db][self.mongo_collection]

    def close_spider(self, spider):
        """Close the MongoDB connection; a no-op when none was opened.

        :param spider: the spider being closed (unused)
        """
        if self.client is not None:
            self.client.close()
            self.client = None
            self.collection = None

    def process_item(self, item, spider):
        """Insert the item into the collection and pass it on.

        :param item: the scraped item; it is stored as ``dict(item)``
        :param spider: the spider that produced the item
        :return: the same item, unchanged
        """
        if self.collection is None:
            # Keeps direct calls working even if open_spider was never invoked.
            self.open_spider(spider)
        self.collection.insert_one(dict(item))
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for qcwy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# Project name, and the package that holds (and receives newly generated) spiders.
BOT_NAME = "qcwy"
NEWSPIDER_MODULE = "qcwy.spiders"
SPIDER_MODULES = [NEWSPIDER_MODULE]
# Value of the User-Agent setting (a desktop Chrome identifier).
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; WOW64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/76.0.3809.132 Safari/537.36"
)
# robots.txt rules are not obeyed by this project.
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# }
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'qcwy.middlewares.QcwySpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'qcwy.middlewares.QcwyDownloaderMiddleware': 543,
# }
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Enabled item pipelines, mapped to their order value.
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {"qcwy.pipelines.QcwyPipeline": 300}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'