Scraping 前程无忧 (51job), storing the data in a database, analyzing it, and drawing a word cloud

Scraping the 前程无忧 (51job) website

Basic crawler workflow
Locate where the data lives on the page → add anti-scraping countermeasures → start the database and save the data

This tutorial uses MongoDB; MySQL works on much the same principle, so look that up yourself if you prefer it.
PS: this was tested on 2020/7/4. If the site later changes its anti-scraping measures and the code stops working, don't blame this tutorial!

1. Fields to collect: job title, salary, hiring company, work location, work experience, education requirement, job description (responsibilities), and job requirements (required skills).
Spider file code:

# -*- coding: utf-8 -*-
import scrapy
import re
from ..items import QianchengwuyouItem


class WuyouSpider(scrapy.Spider):
    name = 'wuyou'
    allowed_domains = ['jobs.51job.com']
    start_urls = ['http://jobs.51job.com/']

    def parse_details(self, response):
        # Parse the job detail page
        print("=" * 100)
        print(response.url)
        item = QianchengwuyouItem()
        # Job title
        item["Job_title"] = response.xpath("//div[@class='cn']/h1/text()").extract_first()
        # Salary
        item["Pay_level"] = response.xpath("//div[@class='cn']/strong/text()").extract_first()
        # Hiring company
        item["Recruitment_unit"] = response.xpath("//div[@class='cn']//a[1]/text()").extract_first()
        # Work location + work experience + education requirement all live in //div[@class='cn']/p[2]
        item["Workplace"] = response.xpath("//div[@class='cn']/p[2]/text()[1]").get().replace('\xa0','')
        # Work experience + education requirement
        all = response.xpath("//div[@class='cn']/p[2]/text()[2]").get().replace('\xa0','')
        # If the second text node is long enough it is the experience field, otherwise it is the education field
        if len(all) >= 4:
            item["hands_background"] = all
            item["Education_requirements"] = response.xpath("//div[@class='cn']/p[2]/text()[3]").get().replace('\xa0','')
            if len(item["Education_requirements"]) != 2:
                item["Education_requirements"] = None
        else:
            item["hands_background"] = None
            item["Education_requirements"] = all
            if len(item["Education_requirements"]) != 2:
                item["Education_requirements"] = None
        # Job description (responsibilities + requirements + experience + education)
        item["Career_information"] = response.xpath("//div[@class='bmsg job_msg inbox']/p/text()").extract()
        item["Career_information"] = [i.strip() for i in item["Career_information"]]
        item["Career_information"] = [i for i in item["Career_information"] if len(i) > 0]
        item["Career_information"] = " ".join(item["Career_information"]).replace("\xa0","").replace(",",",")
        if item["Pay_level"] is None:
            item["Pay_level"] = "无"
        # Keywords
        item["keyword"] = response.xpath("//div[@class='mt10']//p//a/text()").extract()
        yield item

    def industry_perse(self, response):
        # Get the div of every job listed on the current page
        all_list = response.xpath("//div[@class='detlist gbox']//div")
        # Count how many job URLs this page yields
        url_num = 0
        # Follow every job (occupation) link in the big-data category
        for a in all_list:
            occupation_url = a.xpath("./p/span/a/@href").extract_first()
            yield scrapy.Request(
                occupation_url,
                callback=self.parse_details
            )
            url_num += 1
        # Pagination
        print("当前已爬取{}个职业".format(url_num))
        next_url = response.xpath("//div[@class='p_in']/ul//li/a[text()='下一页']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(
                next_url,
                callback=self.industry_perse
            )

    def parse(self, response):
        # Get the links of all job categories on the index page
        dashujukaifa_list = response.xpath("//div[@class='maincenter']/div[2]/div[2]//a")
        # Only follow the big-data development (dashujukaifa) industry link
        for b in dashujukaifa_list:
            industry_url = b.xpath(".//@href").extract_first()
            if industry_url == 'https://jobs.51job.com/dashujukaifa/':
                yield scrapy.Request(
                    industry_url,
                    callback=self.industry_perse
                )
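
The post doesn't show how to start the crawl. From the project root you can simply run `scrapy crawl wuyou`; if you prefer a plain Python entry point, a minimal sketch (assuming the standard layout created by `scrapy startproject qianchengwuyou`) looks like this:

# run.py — minimal sketch for launching the spider without the scrapy CLI
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from qianchengwuyou.spiders.wuyou import WuyouSpider  # assumes the spider lives in spiders/wuyou.py

if __name__ == "__main__":
    process = CrawlerProcess(get_project_settings())  # picks up settings.py (pipeline, UA, delay)
    process.crawl(WuyouSpider)
    process.start()  # blocks until the crawl finishes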

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class QianchengwuyouItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Job title
    Job_title = scrapy.Field()

    # Salary
    Pay_level = scrapy.Field()

    # Hiring company
    Recruitment_unit = scrapy.Field()

    # Work location
    Workplace = scrapy.Field()

    # Work experience
    hands_background = scrapy.Field()

    # Education requirement
    Education_requirements = scrapy.Field()

    # Job description (responsibilities + requirements + experience)
    Career_information = scrapy.Field()

    # Keywords
    keyword = scrapy.Field()
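
A scrapy.Item behaves like a dict keyed by these fields, which is what lets the pipeline below call dict(item) before inserting into MongoDB. A quick illustration (the value is made up):

from qianchengwuyou.items import QianchengwuyouItem

item = QianchengwuyouItem()
item["Job_title"] = "大数据开发工程师"  # hypothetical value
print(dict(item))  # -> {'Job_title': '大数据开发工程师'}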

pipelines.py: saving items into the database

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

from pymongo import MongoClient



class QianchengwuyouPipeline:
    def open_spider(self, spider):
        self.client = MongoClient("localhost", 27017)
        self.db = self.client.qiancheng          # database "qiancheng"
        self.collection = self.db.yc_collection  # collection "yc_collection"

    def process_item(self, item, spider):
        # Insert one document into the yc_collection collection
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # Close the MongoDB client (collections themselves have no close() method)
        self.client.close()
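
To confirm that documents actually landed in MongoDB, a quick pymongo check (same database and collection names as above) could look like this:

from pymongo import MongoClient

client = MongoClient("localhost", 27017)
collection = client["qiancheng"]["yc_collection"]
print(collection.count_documents({}))        # number of stored job postings
print(collection.find_one({}, {"_id": 0}))   # peek at one document without the ObjectId
client.close()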

Anti-scraping-related options go in settings.py; tweak them to taste.

BOT_NAME = 'qianchengwuyou'

SPIDER_MODULES = ['qianchengwuyou.spiders']
NEWSPIDER_MODULE = 'qianchengwuyou.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 ' \
             'Safari/537.36 Edg/83.0.478.54 '

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
FEED_EXPORT_ENCODING = 'utf-8'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Enable the item pipeline
ITEM_PIPELINES = {
    'qianchengwuyou.pipelines.QianchengwuyouPipeline': 300,
}
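
If the single hard-coded USER_AGENT above gets blocked, a common next step is rotating the User-Agent per request with a small downloader middleware. This is only a sketch; the class name RandomUserAgentMiddleware, the UA list, and the middlewares.py location are my own choices, not part of the original project:

# middlewares.py (sketch)
import random


class RandomUserAgentMiddleware:
    # A few sample desktop UA strings; extend as needed
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    ]

    def process_request(self, request, spider):
        # Overwrite the User-Agent header before the request is sent
        request.headers['User-Agent'] = random.choice(self.user_agents)


# and register it in settings.py:
DOWNLOADER_MIDDLEWARES = {
    'qianchengwuyou.middlewares.RandomUserAgentMiddleware': 400,
}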

The MongoDB result looks like this:
(screenshot: documents stored in the yc_collection collection)
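If the screenshot doesn't load, a stored document looks roughly like this (all values are illustrative, not real scraped data):

{
    "Job_title": "大数据开发工程师",
    "Pay_level": "1-1.5万/月",
    "Recruitment_unit": "某科技有限公司",
    "Workplace": "上海-浦东新区",
    "hands_background": "3-4年经验",
    "Education_requirements": "本科",
    "Career_information": "负责数据采集与清洗 熟悉Hadoop/Spark生态 ...",
    "keyword": ["大数据", "Hadoop", "Spark"]
}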
Data analysis and visualization: I put all of the features into a single .py file and commented everything I could, so it should be easy to follow.

import pymongo  # Python driver for MongoDB
import re
from collections import Counter  # word-frequency counting
from pyecharts.charts import Bar, Pie, WordCloud  # Bar: bar charts, Pie: pie charts, WordCloud: word clouds
from pyecharts.render import make_snapshot  # render a chart to an image file (used with a snapshot engine)
from snapshot_phantomjs import snapshot
from pyecharts import options as opts, options
# Detailed pyecharts docs and examples: http://pyecharts.org/#/

myclient = pymongo.MongoClient("127.0.0.1", port=27017)  # MongoDB host
mydb = myclient["qiancheng"]  # database name
mytable = mydb["yc_collection"]  # collection name
# Analyse the average / highest / lowest salary of the chosen positions
# Lowest salaries
min_salary_list = []
# Highest salaries
max_salary_list = []
# Average salaries
average_salary_list = []
# Job titles (x-axis labels)
addr_list = []


class PyMongoDemo(object):

    def shujufenxi_1(self):
        Job = input("请输入你想要分析的职业:")
        # Pull every posting whose title matches the keyword and whose salary is quoted in 万/月
        for i in mytable.find(
                {"$and": [
                    {"Job_title": {"$regex": "{job}".format(job=Job)}},
                    {"Pay_level": {"$regex": "万/月"}}
                ]}):
            # Split the salary string to get the numbers
            min_salary = i["Pay_level"].split("-")[0]
            print(min_salary)
            max_salary = re.findall(r'([\d+\.]+)', (i["Pay_level"].split("-")[1]))[0]
            average_salary = "{:.1f}".format((float(min_salary) + float(max_salary)) / 2)
            # Use the job title as the x-axis label
            company = i["Job_title"]
            print(company)
            # Collect the values for plotting
            min_salary_list.append(min_salary)
            max_salary_list.append(max_salary)
            average_salary_list.append(average_salary)
            addr_list.append(company)
        bar = Bar(
            init_opts=opts.InitOpts(width="10000px", height="800px"),
        )
        bar.set_global_opts(
            title_opts=opts.TitleOpts(title="{}薪资".format(Job), subtitle="单位  万/月"),
            xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": 45}),
        )
        bar.add_xaxis(addr_list)
        bar.add_yaxis("最高薪资", max_salary_list)
        bar.add_yaxis("最低薪资", min_salary_list)
        bar.add_yaxis("平均薪资", average_salary_list)
        bar.render("{}_1.html".format(Job))

    def shujufenxi_2(self):
        Job = input("请输入你想要分析的职业:")
        # Same as above, but for salaries quoted in 千/月
        for i in mytable.find(
                {"$and": [
                    {"Job_title": {"$regex": "{}".format(Job)}},
                    {"Pay_level": {"$regex": "千/月"}}
                ]}):
            min_salary = i["Pay_level"].split("-")[0]
            print(min_salary)
            max_salary = re.findall(r'([\d+\.]+)', (i["Pay_level"].split("-")[1]))[0]
            average_salary = "{:.1f}".format((float(min_salary) + float(max_salary)) / 2)
            company = i["Job_title"]
            print(company)
            min_salary_list.append(min_salary)
            max_salary_list.append(max_salary)
            average_salary_list.append(average_salary)
            addr_list.append(company)
        bar = Bar(
            init_opts=opts.InitOpts(width="10000px", height="800px"),
        )
        bar.set_global_opts(
            title_opts=opts.TitleOpts(title="{}薪资".format(Job), subtitle="单位  千/月"),
            xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": 45}),
        )
        bar.add_xaxis(addr_list)
        bar.add_yaxis("最高薪资", max_salary_list)
        bar.add_yaxis("最低薪资", min_salary_list)
        bar.add_yaxis("平均薪资", average_salary_list)
        bar.render("{}_2.html".format(Job))

    def diqugangweishu(self):
        chengdu_num = 0
        beijing_num = 0
        shanghai_num = 0
        guangzhou_num = 0
        shenzhen_num = 0
        # Count big-data postings per city
        for i in mytable.find({"Job_title": {"$regex": "大数据"}}, {'Job_title', 'Workplace'}):
            Workplace = i["Workplace"].split("-")[0]
            if "成都" in Workplace:
                chengdu_num += 1
            elif "北京" in i["Workplace"]:
                beijing_num += 1
            elif "上海" in i["Workplace"]:
                shanghai_num += 1
            elif "广州" in i["Workplace"]:
                guangzhou_num += 1
            elif "深圳" in i["Workplace"]:
                shenzhen_num += 1
        print(chengdu_num, beijing_num, shanghai_num, guangzhou_num, shenzhen_num)
        data = [("成都", chengdu_num), ("北京", beijing_num), ("上海", shanghai_num), ("广州", guangzhou_num),
                ("深圳", shenzhen_num)]
        print(data)
        # Create the chart object
        pie = Pie()
        # Attach the data
        pie.add(
            # Series name
            series_name="大数据岗位地区分析",
            # The (city, count) pairs to display
            data_pair=data,
            # Inner/outer radius of the ring
            radius=["30%", "70%"],
            # Make the ring sectors irregular (rose chart)
            rosetype="radius"
        )
        # Label format: city name and percentage
        pie.set_series_opts(label_opts=options.LabelOpts(formatter="{b}: {d}%"))
        # Chart title
        pie.set_global_opts(title_opts=options.TitleOpts(title="大数据岗位地区分布"))
        # Render the chart
        pie.render('数据分析地区岗位.html')

    # Word cloud of the skill keywords required by the postings
    def ciyuntu(self):
        keyword_num = 0
        a = []
        # Collect every document's keyword list
        for i in mytable.find({}, {"keyword"}):
            keyword = list(i["keyword"])
            a.append(keyword)
            keyword_num += 1
        print(a)
        # Flatten the list of lists into one flat keyword list
        keyword_list = sum(a, [])
        # Count how often each keyword appears
        word_count = {}
        for word in keyword_list:
            if word in word_count:
                word_count[word] += 1
            else:
                word_count[word] = 1
        print(word_count)
        lst = Counter(word_count)
        result = lst.most_common()
        print(result)

        def wordcloud_chart() -> WordCloud:
            c = (
                WordCloud()
                    .add("",
                         result,
                         shape="cardioid",
                         word_size_range=[20, 55], )
                    .set_global_opts(title_opts=opts.TitleOpts(title="大数据关键字词云图"))
                    .render("大数据关键字词云图.html")
            )
            return c

        # Render the word cloud to HTML
        wordcloud_chart()
        # make_snapshot(snapshot, wordcloud_chart(), "大数据关键字词云图.png")

    # Analyse the salary level of big-data jobs asking for 1-3 years of experience
    def fenxi1_3xinzishuiping(self):
        choice1 = "万/月"
        choice2 = "千/月"
        choice = input("请输入你要分析的薪资单位(1:万/月,2:千/月):")
        if choice == '1':
            choice = choice1
        elif choice == '2':
            choice = choice2
        else:
            return choice
        print(choice)
        # Experience mentions 1, 2 or 3 years, the title mentions 大数据, and the salary uses the chosen unit
        for i in mytable.find(
                {"$or": [
                    {"hands_background": {"$regex": "1"}},
                    {"hands_background": {"$regex": "2"}},
                    {"hands_background": {"$regex": "3"}}
                ],
                 "$and": [
                    {"Job_title": {"$regex": "大数据"}},
                    {"Pay_level": {"$regex": "{}".format(choice)}}
                ]},
                {"Job_title", "Pay_level", "hands_background"}):
            print(i)
            min_salary = i["Pay_level"].split("-")[0]
            max_salary = re.findall(r'([\d+\.]+)', (i["Pay_level"].split("-")[1]))[0]
            average_salary = "{:.1f}".format((float(min_salary) + float(max_salary)) / 2)
            company = i["Job_title"]
            min_salary_list.append(min_salary)
            max_salary_list.append(max_salary)
            average_salary_list.append(average_salary)
            addr_list.append(company)
        bar = Bar(
            init_opts=opts.InitOpts(width="10000px", height="800px"),
        )
        bar.set_global_opts(
            title_opts=opts.TitleOpts(title="大数据相关岗位1-3年工作经验的薪资", subtitle="单位  {}".format(choice)),
            xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": 45}),
        )
        bar.add_xaxis(addr_list)
        bar.add_yaxis("最高薪资", max_salary_list)
        bar.add_yaxis("最低薪资", min_salary_list)
        bar.add_yaxis("平均薪资", average_salary_list)
        bar.render("分析大数据相关岗位1-3年工作经验的薪资水平.html")


if __name__ == "__main__":
    mongo = PyMongoDemo()
    while True:
        a = str(input("请输入你要选择的功能(1:分析输入岗位的薪资水平,2:分析大数据岗位的地区分布,3:分析大数据相关岗位1-3年工作经验的薪资水平,4:绘制大数据关键字词云图):"))
        if a == '1':
            b = str(input("请输入你要选择的工资单位(1:万/月,2:千/月):"))
            if b == '1':
                mongo.shujufenxi_1()
            elif b == '2':
                mongo.shujufenxi_2()
            else:
                print("请输入正确的数字")
        elif a == '2':
            mongo.diqugangweishu()
        elif a == '3':
            mongo.fenxi1_3xinzishuiping()
        elif a == '4':
            mongo.ciyuntu()
        else:
            print("输入错误,请重新输入!")
            break
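
As a quick sanity check of the salary parsing used in the analysis functions above (the sample strings are made up, and mixed-unit ranges such as "8千-1.2万/月" would need extra handling):

import re

for pay in ["1-1.5万/月", "0.8-1.2万/月"]:
    low = pay.split("-")[0]                                 # e.g. "1"
    high = re.findall(r'([\d+\.]+)', pay.split("-")[1])[0]  # e.g. "1.5"
    avg = "{:.1f}".format((float(low) + float(high)) / 2)
    print(pay, "->", low, high, avg)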

Try to install every Python module this file imports (pymongo, pyecharts, snapshot_phantomjs).
The results look like this:
(Figures 2-5: screenshots of the rendered charts)
That's it. Goodbye, everyone!
