Create the project:
scrapy startproject ScrapyDemo
cd ScrapyDemo
scrapy genspider bigqcwy msearch.51job.com
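These commands produce the standard Scrapy project skeleton (listed here for orientation; the exact files may vary slightly between Scrapy versions):

ScrapyDemo/
    scrapy.cfg
    ScrapyDemo/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            bigqcwy.py        # stub created by genspider

The items.py, pipelines.py and settings.py edited below all live inside the inner ScrapyDemo/ package.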
Add the fields to be scraped to items.py:
import scrapy


class ScrapydemoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # job title
    name = scrapy.Field()
    # salary
    salary = scrapy.Field()
    # hiring company
    company = scrapy.Field()
    # work location
    jobPlace = scrapy.Field()
    # work experience required
    jobExperience = scrapy.Field()
    # education required
    education = scrapy.Field()
    # job description (responsibilities)
    # jobContent = scrapy.Field()
    # job requirements (skills)
    jobRequirement = scrapy.Field()
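For reference, a scrapy.Item is filled in like a dict, which is exactly how the spider below uses it (a minimal sketch with made-up values):

item = ScrapydemoItem()
item['name'] = 'Python开发工程师'
item['salary'] = '10-15千/月'
print(dict(item))   # roughly: {'name': 'Python开发工程师', 'salary': '10-15千/月'}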
Edit the spider file bigqcwy.py:
The salary field is lightly cleaned so that every value ends up in 千/月 (thousand RMB per month); for example, '1-1.5万/月' is normalized to '10-15千/月' and '10-20万/年' to roughly '8-16千/月' (see the standalone sketch after the spider code).
# -*- coding: utf-8 -*-
import re
import time

import scrapy

from ScrapyDemo.items import ScrapydemoItem


class BigqcwySpider(scrapy.Spider):
    name = 'bigqcwy'
    allowed_domains = ['msearch.51job.com']
    custom_settings = {
        "DEFAULT_REQUEST_HEADERS": {
            'Cookie': 'set your cookie here',
        },
        "AUTOTHROTTLE_ENABLED": True,
        # "DOWNLOAD_DELAY": 1,
        # "ScrapyDemo.pipelines.ScrapydemoPipeline": 300,
    }
    start_urls = ['https://msearch.51job.com/']
    def start_requests(self):
        # funtype codes for the job categories to search
        funtypes = ['0100%2C7700%2C7200%2C7300%2C7800', '7400%2C2700%2C7900%2C7500%2C6600', '8000%2C6100%2C2600%2C2800%2C3300']
        for i in funtypes:
            # each category has 2000 result pages
            for j in range(1, 2001):
                time.sleep(2)
                start_url = 'https://msearch.51job.com/job_list.php?funtype=' + str(i) + '&jobarea=000000&filttertype=loginmore&pageno=' + str(j)
                yield scrapy.Request(url=start_url, callback=self.parse)
    def parse(self, response):
        # collect the links to the detail pages on this result page
        list_url = response.xpath('//*[@id="pageContent"]/div[3]/a')
        for link in list_url:
            time.sleep(1)
            url = link.xpath('@href').extract_first()
            # print("detail page url:", url)
            if url:
                yield scrapy.Request(url="https:" + url, callback=self.parse_item)
    def parse_item(self, response):
        item = ScrapydemoItem()
        # job title
        item['name'] = response.xpath('//*[@id="pageContent"]/div[1]/div[1]/p/text()').extract_first()
        # salary: normalize everything to 千/月
        try:
            sa = response.xpath('//*[@id="pageContent"]/div[1]/p/text()').extract_first()
            num = list(re.findall(r'([0-9]+(\.?[0-9]?)?)-([0-9]+(\.?[0-9]?)?)', sa)[0])
            if '万' in sa and '月' in sa:
                # 万/月 -> 千/月: multiply both bounds by 10
                sa1 = float(num[0]) * 10
                sa2 = float(num[2]) * 10
                sa3 = str(sa1).replace('.0', '')
                sa4 = str(sa2).replace('.0', '')
                item['salary'] = sa3 + '-' + sa4 + '千/月'
            elif '万' in sa and '年' in sa:
                # 1. convert 万/年 to 万/月 (divide by 12, keep one decimal)
                sa1 = float(num[0]) / 12
                sa2 = float(num[2]) / 12
                n1 = list(re.findall(r'([0-9]+(\.?[0-9]?)?)', str(sa1))[0])
                n2 = list(re.findall(r'([0-9]+(\.?[0-9]?)?)', str(sa2))[0])
                sa1 = str(n1[0]).replace('.0', '')
                sa2 = str(n2[0]).replace('.0', '')
                # 2. convert 万/月 to 千/月 (multiply by 10)
                sa3 = float(sa1) * 10
                sa4 = float(sa2) * 10
                sa5 = str(sa3).replace('.0', '')
                sa6 = str(sa4).replace('.0', '')
                item['salary'] = sa5 + '-' + sa6 + '千/月'
            else:
                item['salary'] = sa
        except:
            # no parseable salary range, e.g. "negotiable"
            item['salary'] = '面议'
        # hiring company
        item['company'] = response.xpath('//*[@id="pageContent"]/div[2]/a[1]/p/text()').extract_first()
        # street address
        try:
            dizhi = response.xpath('//*[@id="pageContent"]/div[2]/a[2]/span/text()').extract_first().replace('上班地址 : ', ':')
        except:
            dizhi = ''
        # city
        city = response.xpath('//*[@id="pageContent"]/div[1]/div[1]/em/text()').extract_first()
        # work location = city + street address
        try:
            item['jobPlace'] = city + dizhi
        except:
            item['jobPlace'] = city
        # work experience required
        try:
            item['jobExperience'] = response.xpath('//*[@id="pageContent"]/div[1]/div[2]/span[2]/text()').extract_first()
        except:
            item['jobExperience'] = '数据缺失'
        # education required
        try:
            item['education'] = response.xpath('//*[@id="pageContent"]/div[1]/div[2]/span[3]/text()').extract_first()
        except:
            item['education'] = '数据缺失'
        # job description (responsibilities)
        # try:
        #     # item['jobContent'] = response.xpath('//*[@id="pageContent"]/div[3]/div[3]/article/br//text()').extract_first()
        #     item['jobContent'] = response.xpath('string(//*[@id="pageContent"]/div[3]/div[3]/article)').extract_first().split(':')[1].split(':')[0]
        # except:
        #     item['jobContent'] = '无数据'
        # job requirements (skills)
        try:
            jobR = response.xpath('string(//*[@id="pageContent"]/div[3]/div[3]/article)').extract_first()
            if jobR != '':
                item['jobRequirement'] = jobR
            else:
                # some pages put the article in div[2] instead of div[3]
                item['jobRequirement'] = response.xpath('string(//*[@id="pageContent"]/div[3]/div[2]/article)').extract_first()
        except:
            item['jobRequirement'] = '数据缺失'
        # print("job title:", item['name'])
        # print("salary:", item['salary'])
        # print("company:", item['company'])
        # print("location:", item['jobPlace'])
        # print("experience:", item['jobExperience'])
        # print("education:", item['education'])
        # print("requirements:", item['jobRequirement'])
        return item
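The salary conversion above is easiest to sanity-check outside of Scrapy. Below is a standalone sketch of the same logic (clean_salary is a hypothetical helper introduced here only for illustration, not part of the project code):

import re

def clean_salary(sa):
    """Normalize a 51job salary string to 千/月; fall back to '面议'."""
    try:
        num = list(re.findall(r'([0-9]+(\.?[0-9]?)?)-([0-9]+(\.?[0-9]?)?)', sa)[0])
        if '万' in sa and '月' in sa:
            # 万/月 -> 千/月: multiply both bounds by 10
            low, high = float(num[0]) * 10, float(num[2]) * 10
            return str(low).replace('.0', '') + '-' + str(high).replace('.0', '') + '千/月'
        elif '万' in sa and '年' in sa:
            # 万/年 -> 万/月 (divide by 12, keep one decimal), then -> 千/月 (x10)
            low = re.findall(r'([0-9]+(\.?[0-9]?)?)', str(float(num[0]) / 12))[0][0]
            high = re.findall(r'([0-9]+(\.?[0-9]?)?)', str(float(num[2]) / 12))[0][0]
            low, high = float(low) * 10, float(high) * 10
            return str(low).replace('.0', '') + '-' + str(high).replace('.0', '') + '千/月'
        return sa
    except Exception:
        return '面议'

print(clean_salary('1-1.5万/月'))   # -> 10-15千/月
print(clean_salary('10-20万/年'))   # -> 8-16千/月
print(clean_salary('6-8千/月'))     # -> 6-8千/月 (already in 千/月, kept as-is)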
Edit pipelines.py:
MongoDB is used to store the scraped data.
from pymongo import MongoClient


class ScrapydemoPipeline(object):
    def open_spider(self, spider):
        self.client = MongoClient('localhost', 27017)
        self.db = self.client.bigqcwy_db
        self.collection = self.db.bigqcwy_collection

    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # close the MongoDB connection (a Collection object has no close())
        self.client.close()
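After a crawl has run, the stored documents can be inspected directly with pymongo (a quick check; the database and collection names follow the pipeline above):

from pymongo import MongoClient

db = MongoClient('localhost', 27017).bigqcwy_db
collection = db.bigqcwy_collection

print(collection.count_documents({}))      # how many jobs were saved
for doc in collection.find().limit(3):     # peek at a few records
    print(doc['name'], doc['salary'], doc['jobPlace'])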
Edit settings.py:
USER_AGENT = 'set your User-Agent here'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 1
# cookie middleware disabled so the raw Cookie header from custom_settings is sent as-is
COOKIES_ENABLED = False
ITEM_PIPELINES = {
    'ScrapyDemo.pipelines.ScrapydemoPipeline': 300,
}
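With the pipeline enabled, the crawl is started from the project root in the usual Scrapy way; the optional -o flag additionally dumps items to a local file for a quick look:

scrapy crawl bigqcwy
scrapy crawl bigqcwy -o jobs.jl   # also export items as JSON Lines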