Tools: Scrapy, MongoDB, Excel, Tableau
1. Analyze the search URL. It contains the key parameters keyword=数据分析师&keywordtype=2&curr_page=1 (keyword is the search term, "data analyst"), which shows that paging only requires changing the value of curr_page.
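As a minimal sketch of that idea (the URL and the page range are taken from the spider code in step 6 below), the list-page URLs can be generated just by substituting the page number:

# Only curr_page changes from page to page; 185 pages are crawled below
base = ('https://search.51job.com/jobsearch/search_result.php'
        '?keyword=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88&keywordtype=2&curr_page={}')
page_urls = [base.format(i) for i in range(1, 185)]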
2. Inspect the page source: the job details are embedded directly in the HTML, so the information can be extracted straight from the source.
3. First define the items. The fields extracted here are job title, salary, company name, company type, company size, company tags, city, education requirement, work experience, benefits, and job-requirement keywords.
import scrapy


class Job61Item(scrapy.Item):
    jobname = scrapy.Field()        # job title
    salary = scrapy.Field()         # salary
    company = scrapy.Field()        # company name
    companytype = scrapy.Field()    # company type
    companyscale = scrapy.Field()   # company size
    companytag = scrapy.Field()     # company tags
    city = scrapy.Field()           # city
    record = scrapy.Field()         # education requirement
    workyear = scrapy.Field()       # work experience
    welfare = scrapy.Field()        # benefits
    requirements = scrapy.Field()   # job-requirement keywords
4. Next write the pipeline, which saves the scraped items to MongoDB.
import pymongo


class MongoPipeline(object):
    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Read the MongoDB connection info from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Insert each scraped item as a document into the collection
        self.db[self.collection_name].insert_one(dict(item))
        return item
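A quick way to sanity-check the pipeline is to query the collection directly with pymongo. The snippet below is not part of the original post; it assumes MongoDB is running locally with the connection values configured in the next step ('localhost' / 'job51'):

import pymongo

client = pymongo.MongoClient('localhost')
collection = client['job51']['scrapy_items']

print(collection.count_documents({}))        # number of job postings stored so far
print(collection.find_one({}, {'_id': 0}))   # peek at one stored item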
5. Configure settings.py:
MONGO_URI = 'localhost'
MONGO_DATABASE = 'job51'
ITEM_PIPELINES = {
    'job61.pipelines.MongoPipeline': 300,
}
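The original post only shows the three settings above. Depending on the project template, the following standard Scrapy settings may also need adjusting for a crawl like this; the values here are suggestions, not part of the original configuration:

# Assumed additions for illustration only, not in the original settings
ROBOTSTXT_OBEY = False        # the default project template sets this to True, which can block the crawl
DOWNLOAD_DELAY = 0.5          # throttle requests to reduce load on the site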
6. Start writing the spider. First loop through the search result pages to collect the detail-page URL of each job, then extract the detailed job information from those pages.
def start_requests(self):
    for i in range(1, 185):
        print('Crawling page ' + str(i))
        url = 'https://search.51job.com/jobsearch/search_result.php?keyword=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88&keywordtype=2&curr_page=' + str(i)
        r = requests.get(url)
        r.encoding = 'gbk'
        # Pull every job detail-page URL out of the list page
        urllist = re.compile('href="(.*?)" onmousedown=""').findall(r.text)
        for joburl in urllist:
            yield scrapy.Request(joburl, callback=self.parse)
7. Use CSS selectors to extract the job details from the page source.
def parse(self, response):
    item = Job61Item()
    doc = pq(response.text)
    items = doc('body > div.tCompanyPage > div.tCompany_center.clearfix').items()
    for ite in items:
        item['jobname'] = ite.find('div.tHeader.tHjob > div > div.cn > h1').text()
        item['salary'] = ite.find('div.tHeader.tHjob > div > div.cn > strong').text()
        item['company'] = ite.find('div.tCompany_sidebar > div:nth-child(1) > div.com_msg > a > p').text()
        item['companytype'] = ite.find('div.tCompany_sidebar > div:nth-child(1) > div.com_tag > p:nth-child(1)').text()
        item['companyscale'] = ite.find('div.tCompany_sidebar > div:nth-child(1) > div.com_tag > p:nth-child(2)').text()
        item['companytag'] = ite.find('div.tCompany_sidebar > div:nth-child(1) > div.com_tag > p:nth-child(3)').text()
        # The p.msg.ltype text looks like "city | experience | education | ...", so split on '|'
        item['city'] = ite.find('div.tHeader.tHjob > div > div.cn > p.msg.ltype').text().split('|')[0]
        item['record'] = ite.find('div.tHeader.tHjob > div > div.cn > p.msg.ltype').text().split('|')[2]
        item['workyear'] = ite.find('div.tHeader.tHjob > div > div.cn > p.msg.ltype').text().split('|')[1]
        item['welfare'] = ite.find('div.tHeader.tHjob > div > div.cn > div > div').text()
        item['requirements'] = ite.find('div.tCompany_main > div:nth-child(1) > div > div.mt10 > p:nth-child(2)').text()
        yield item
8. The complete spider code:
# -*- coding: utf-8 -*-
import re

import requests
import scrapy
from pyquery import PyQuery as pq

from job61.items import Job61Item


class JobSpider(scrapy.Spider):
    name = 'job'
    allowed_domains = ['51job.com']
    start_urls = ['https://51job.com/']

    def start_requests(self):
        # Walk through all 184 result pages by changing curr_page
        for i in range(1, 185):
            print('Crawling page ' + str(i))
            url = 'https://search.51job.com/jobsearch/search_result.php?keyword=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88&keywordtype=2&curr_page=' + str(i)
            r = requests.get(url)
            r.encoding = 'gbk'
            # Pull every job detail-page URL out of the list page
            urllist = re.compile('href="(.*?)" onmousedown=""').findall(r.text)
            for joburl in urllist:
                yield scrapy.Request(joburl, callback=self.parse)

    def parse(self, response):
        item = Job61Item()
        doc = pq(response.text)
        items = doc('body > div.tCompanyPage > div.tCompany_center.clearfix').items()
        for ite in items:
            item['jobname'] = ite.find('div.tHeader.tHjob > div > div.cn > h1').text()
            item['salary'] = ite.find('div.tHeader.tHjob > div > div.cn > strong').text()
            item['company'] = ite.find('div.tCompany_sidebar > div:nth-child(1) > div.com_msg > a > p').text()
            item['companytype'] = ite.find('div.tCompany_sidebar > div:nth-child(1) > div.com_tag > p:nth-child(1)').text()
            item['companyscale'] = ite.find('div.tCompany_sidebar > div:nth-child(1) > div.com_tag > p:nth-child(2)').text()
            item['companytag'] = ite.find('div.tCompany_sidebar > div:nth-child(1) > div.com_tag > p:nth-child(3)').text()
            item['city'] = ite.find('div.tHeader.tHjob > div > div.cn > p.msg.ltype').text().split('|')[0]
            item['record'] = ite.find('div.tHeader.tHjob > div > div.cn > p.msg.ltype').text().split('|')[2]
            item['workyear'] = ite.find('div.tHeader.tHjob > div > div.cn > p.msg.ltype').text().split('|')[1]
            item['welfare'] = ite.find('div.tHeader.tHjob > div > div.cn > div > div').text()
            item['requirements'] = ite.find('div.tCompany_main > div:nth-child(1) > div > div.mt10 > p:nth-child(2)').text()
            yield item
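With the items, pipeline, settings, and spider in place, the crawl is started from the project directory with scrapy crawl job ("job" being the spider's name attribute above), and the scraped postings are written by MongoPipeline into the job51 database.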
1. Export the MongoDB data to a CSV file (9,826 records in total), then organize and clean the data in Excel; the cleaned result is shown below.
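The post does not say which tool performed the export. As one possible sketch in Python (the field names come from the Item definition, and the database and collection names from the pipeline and settings above):

import csv

import pymongo

fields = ['jobname', 'salary', 'company', 'companytype', 'companyscale',
          'companytag', 'city', 'record', 'workyear', 'welfare', 'requirements']

client = pymongo.MongoClient('localhost')
docs = client['job51']['scrapy_items'].find({}, {'_id': 0})

# Write every stored job posting into a CSV file for Excel / Tableau
with open('jobs.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.DictWriter(f, fieldnames=fields)
    writer.writeheader()
    for doc in docs:
        writer.writerow(doc)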
Then import this file into Tableau for some simple visual analysis.
2. Demand for data-analysis positions by company type
3. Salary levels for data-analysis positions in each city
4. Salary compared across levels of work experience
5. Distribution of education requirements
6. Word cloud of job-requirement keywords