java爬虫案例

这是一个基于WebMagic + SpringBoot开发的简单爬虫案例,主要爬取前程无忧的招聘数据并保存到MySQL数据库!

数据图

WebMagic简介
WebMagic是一个简单灵活的Java爬虫框架。基于WebMagic,你可以快速开发出一个高效、易维护的爬虫。

mysql表结构

/*
Navicat MySQL Data Transfer

Source Server         : 本机数据库
Source Server Version : 80017
Source Host           : localhost:3306
Source Database       : crawler

Target Server Type    : MYSQL
Target Server Version : 80017
File Encoding         : 65001

Date: 2019-12-08 23:36:09
*/

SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for jobs_item
-- ----------------------------
DROP TABLE IF EXISTS `jobs_item`;
CREATE TABLE `jobs_item` (
  `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '主键ID',
  `company_name` varchar(100) DEFAULT NULL COMMENT '公司名称',
  `company_addr` varchar(255) DEFAULT NULL COMMENT '公司地址',
  `company_info` text COMMENT '公司简介',
  `job_name` varchar(100) DEFAULT NULL COMMENT '工作名称',
  `job_num` int(11) DEFAULT '0' COMMENT '招聘人数',
  `job_addr` varchar(255) DEFAULT NULL COMMENT '工作地址',
  `job_info` text COMMENT '工作简介',
  `diploma` varchar(20) DEFAULT NULL COMMENT '文凭',
  `salary_min` bigint(10) DEFAULT NULL COMMENT '最小月薪',
  `salary_max` bigint(10) DEFAULT NULL COMMENT '最多月薪',
  `url` varchar(100) DEFAULT NULL COMMENT '招聘信息详情页',
  `time` varchar(20) DEFAULT NULL COMMENT '职位最近发布时间',
  -- FIX: `created` previously carried ON UPDATE CURRENT_TIMESTAMP, which would
  -- overwrite the creation time on every row update. The creation time should
  -- be set once at insert; only `updated` should refresh automatically.
  `created` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
  `updated` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=2844 DEFAULT CHARSET=utf8;

项目目录图

image.png

添加pom.xml依赖



<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.2.1.RELEASE</version>
        <relativePath/>
    </parent>

    <groupId>club.studycode</groupId>
    <artifactId>qcwy-crawler</artifactId>
    <version>1.0.0-SNAPSHOT</version>
    <name>qcwy-crawler</name>
    <description>JOBS</description>

    <properties>
        <java.version>1.8</java.version>
        <mapper.version>2.1.5</mapper.version>
        <webmagic.version>0.7.3</webmagic.version>
    </properties>

    <dependencies>

        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
            <exclusions>
                <exclusion>
                    <groupId>org.junit.vintage</groupId>
                    <artifactId>junit-vintage-engine</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <optional>true</optional>
        </dependency>

        <dependency>
            <groupId>tk.mybatis</groupId>
            <artifactId>mapper-spring-boot-starter</artifactId>
            <version>${mapper.version}</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>

        <!-- WebMagic 核心包 -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
            <version>${webmagic.version}</version>
        </dependency>

        <!-- WebMagic 扩展包 -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>${webmagic.version}</version>
        </dependency>

        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>28.1-jre</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>


MyMapper类 定义一个tkMybatis的总接口

package club.studycode.mapper;


import tk.mybatis.mapper.common.Mapper;
import tk.mybatis.mapper.common.MySqlMapper;

/**
 * Shared base mapper that bundles the generic CRUD methods from tk.mybatis.
 *
 * <p>FIX: the interface previously extended {@code Mapper} and
 * {@code MySqlMapper} as raw types, discarding all generic type information.
 * Declaring a type parameter {@code T} is the documented tk.mybatis pattern
 * and lets DAOs inherit typed CRUD methods; existing raw usages such as
 * {@code extends MyMapper} remain source-compatible.</p>
 *
 * <p>NOTE: this interface must live outside the package scanned by
 * {@code @MapperScan}, otherwise tk.mybatis would try to create a mapper
 * implementation for it.</p>
 *
 * @param <T> the entity type managed by the concrete DAO
 */
public interface MyMapper<T> extends Mapper<T>, MySqlMapper<T> {

}

JobsItemDao接口 继承MyMapper 相当于有了基本的CRUD方法

package club.studycode.qcwy.crawler.dao;

import club.studycode.mapper.MyMapper;
import club.studycode.qcwy.crawler.entity.JobsItem;
import org.springframework.stereotype.Repository;


/**
 *  @ClassName: JobsItemDao.java
 *  @Author: Slayer
 *  @Date: 2019/11/16 0:59
 *  @Description: Data-access interface for the jobs_item table. All basic
 *                CRUD methods are inherited from MyMapper; no extra method
 *                declarations are needed.
 *                NOTE(review): MyMapper is extended as a raw type here; the
 *                tk.mybatis convention is MyMapper&lt;JobsItem&gt; so the
 *                inherited CRUD methods are typed — confirm and parameterize.
 */
@Repository
public interface JobsItemDao extends MyMapper {

}

JobsItem Entity类

package club.studycode.qcwy.crawler.entity;

import java.io.Serializable;
import java.util.Date;

import lombok.Data;

import javax.persistence.Column;
import javax.persistence.Id;
import javax.persistence.Table;


/**
 * @ClassName: QcwyItem.java
 * @Author: Slayer
 * @Date: 2019/11/16 0:51
 * @Description: Entity mapped to the jobs_item table; getters/setters,
 *               equals/hashCode and toString are generated by Lombok's @Data.
 */
@Data
@Table(name = "jobs_item")
public class JobsItem implements Serializable {

    private static final long serialVersionUID = -1274246480063610692L;

    // Primary key (auto-increment in the database).
    @Id
    @Column(name = "id")
    private Long id;

    // Company name.
    @Column(name = "company_name")
    private String companyName;

    // Company address.
    @Column(name = "company_addr")
    private String companyAddr;

    // Company introduction.
    @Column(name = "company_info")
    private String companyInfo;

    // Job title.
    @Column(name = "job_name")
    private String jobName;

    // Number of openings; the crawler stores 9999 for "several" (招若干人).
    @Column(name = "job_num")
    private Integer jobNum;

    // Job address.
    @Column(name = "job_addr")
    private String jobAddr;

    // Job description.
    @Column(name = "job_info")
    private String jobInfo;

    // Required diploma / education level.
    @Column(name = "diploma")
    private String diploma;

    // Minimum monthly salary.
    // NOTE(review): the jobs_item DDL declares salary_min/salary_max as
    // bigint while the entity uses Integer — confirm the intended width.
    @Column(name = "salary_min")
    private Integer salaryMin;

    // Maximum monthly salary.
    @Column(name = "salary_max")
    private Integer salaryMax;

    // URL of the job-detail page.
    @Column(name = "url")
    private String url;

    // Most recent publish time of the posting (kept as raw page text).
    @Column(name = "time")
    private String time;

    // Record creation timestamp (set by the crawler, not the database).
    @Column(name = "created")
    private Date created;

    // Record update timestamp.
    @Column(name = "updated")
    private Date updated;

}

JobsItemService 业务层接口

package club.studycode.qcwy.crawler.service;

import club.studycode.qcwy.crawler.entity.JobsItem;

/**
 *  @ClassName: JobsItemService.java
 *  @Author: Slayer
 *  @Date: 2019/12/8 23:42
 *  @Description: Service-layer contract for persisting crawled job postings.
 */
public interface JobsItemService {

    /**
     * Persists a posting: inserts when the item has no id yet, otherwise
     * updates the existing row by primary key.
     *
     * @param jobsItem the job posting to save
     */
    void save(JobsItem jobsItem);


    /**
     * Looks up a single posting by exact company name.
     *
     * @param companyName company name to match
     * @return the matching record, or {@code null} when none exists
     */
    JobsItem getByCompanyName(String companyName);

}

JobsItemServiceImpl 业务实现类

package club.studycode.qcwy.crawler.service.impl;

import club.studycode.qcwy.crawler.dao.JobsItemDao;
import club.studycode.qcwy.crawler.entity.JobsItem;
import club.studycode.qcwy.crawler.service.JobsItemService;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import tk.mybatis.mapper.entity.Example;

import javax.annotation.Resource;


/**
 * Default {@link JobsItemService} implementation backed by tk.mybatis.
 * Transactions are read-only by default; write methods override this.
 */
@Transactional(readOnly = true)
@Service
public class JobsItemServiceImpl implements JobsItemService {

    @Resource
    private JobsItemDao jobsItemDao;

    /**
     * Persists a posting: an item that already carries a primary key is
     * updated in place, anything else is inserted as a new row.
     *
     * @param jobsItem the job posting to save
     */
    @Override
    @Transactional(readOnly = false)
    public void save(JobsItem jobsItem) {
        if (jobsItem.getId() != null) {
            // Existing row: overwrite it by primary key.
            jobsItemDao.updateByPrimaryKey(jobsItem);
        } else {
            // No id yet: brand-new row.
            jobsItemDao.insert(jobsItem);
        }
    }


    /**
     * Looks up a single posting by exact company name.
     *
     * @param companyName company name to match
     * @return the matching record, or {@code null} when none exists
     */
    @Override
    public JobsItem getByCompanyName(String companyName) {
        Example criteria = new Example(JobsItem.class);
        criteria.createCriteria().andEqualTo("companyName", companyName);
        return jobsItemDao.selectOneByExample(criteria);
    }

}

JobProcessor WebMagic 核心类 主要定义要爬取数据的规则

package club.studycode.qcwy.crawler.task;

import club.studycode.qcwy.crawler.entity.JobsItem;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Selectable;

import java.util.Date;
import java.util.List;


/**
 *  @ClassName: JobProcessor.java
 *  @Author: Slayer
 *  @Date: 2019/12/8 23:43
 *  @Description: WebMagic PageProcessor for 51job.com search results. List
 *                pages are expanded into detail-page requests; detail pages
 *                are parsed into JobsItem objects and handed to the pipeline.
 */
@Component
public class JobProcessor implements PageProcessor {

    // Search URL for "java" jobs nationwide.
    // FIX: the original string contained "°reefrom=99" — the "&deg" of the
    // "&degreefrom" parameter had been HTML-entity-decoded into a degree
    // sign, corrupting the query string.
    private static final String URL_CRAWLER = "https://search.51job.com/list/000000,000000,0000,00,9,99,java,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=";

    // Number of detail pages parsed so far (single crawler thread, so a
    // plain long is sufficient).
    private long num = 1;

    /**
     * Dispatches a downloaded page: list pages enqueue their detail links and
     * the next list page; detail pages are parsed and saved.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        // Result rows exist only on the search-list page.
        List<Selectable> rows = page.getHtml().css("div#resultList div.el").nodes();

        // Empty: this is a job-detail page — extract the posting.
        if (rows.isEmpty()) {
            this.saveJobInfo(page);
        }
        // Non-empty: this is a list page — enqueue every detail-page URL.
        else {
            for (Selectable row : rows) {
                String jobInfoUrl = row.css("p.t1 > span > a[href]").links().get();
                if (jobInfoUrl != null) {
                    page.addTargetRequest(jobInfoUrl);
                }
            }

            // FIX: the original called nodes().get(1) unconditionally, which
            // throws IndexOutOfBoundsException when the pager has fewer than
            // two "li.bk" elements (e.g. on the last results page).
            List<Selectable> pager = page.getHtml().css("li.bk").nodes();
            if (pager.size() > 1) {
                String nextPage = pager.get(1).links().get();
                if (nextPage != null) {
                    page.addTargetRequest(nextPage);
                }
            }
        }

    }

    /**
     * Parses one job-detail page into a JobsItem and exposes it to the
     * pipeline via {@code page.putField}.
     *
     * @param page the detail page being processed
     */
    private void saveJobInfo(Page page) {
        JobsItem jobsItem = new JobsItem();

        // Company name.
        String companyName = page.getHtml().css("p.cname > a", "text").get();
        jobsItem.setCompanyName(companyName);

        // Combined info line (address / diploma / headcount / publish time).
        String info = page.getHtml().css("p.msg", "text").get();
        this.saveCompanyInfo(info, jobsItem);

        // Company introduction.
        String companyInfo = page.getHtml().css("div.tmsg", "text").get();
        jobsItem.setCompanyInfo(companyInfo);

        // Job title.
        String jobName = page.getHtml().css("div.cn > h1", "text").get();
        jobsItem.setJobName(jobName);

        // Job address: taken from the second p.fp node when present.
        List<Selectable> jobAddrSelectables = page.getHtml().css("p.fp").nodes();
        if (jobAddrSelectables.size() > 1) {
            String jobAddrHtml = jobAddrSelectables.get(1).css("p.fp", "text").get();
            jobsItem.setJobAddr(jobAddrHtml);
        }

        // Job description: strip the HTML tags with Jsoup.
        List<Selectable> jobInfoSelectables = page.getHtml().css("div.bmsg").nodes();
        if (!jobInfoSelectables.isEmpty()) {
            String jobInfoHtml = jobInfoSelectables.get(0).get();
            jobsItem.setJobInfo(Jsoup.parse(jobInfoHtml).text());
        }

        // Salary range, e.g. "1.5-2万/月".
        String salary = page.getHtml().css("div.cn > strong", "text").get();
        this.saveSalary(salary, jobsItem);

        // Detail-page URL and audit timestamps.
        jobsItem.setUrl(page.getUrl().get());
        jobsItem.setCreated(new Date());
        jobsItem.setUpdated(jobsItem.getCreated());

        // Hand the parsed item to the pipeline.
        page.putField("jobsItem", jobsItem);

        System.out.println("爬虫次数" + this.num++);

    }

    /**
     * Parses a salary string such as "1.5-2万/月" into a minimum and maximum
     * monthly salary in yuan.
     *
     * @param salary   raw salary text from the page (may be blank)
     * @param jobsItem target entity to populate
     */
    private void saveSalary(String salary, JobsItem jobsItem) {
        if (StringUtils.isBlank(salary)) {
            return;
        }
        String[] split = salary.split("-");

        // FIX: the original indexed split[1] unconditionally, which throws
        // ArrayIndexOutOfBoundsException for salaries without a range
        // (e.g. "2万/月") and StringIndexOutOfBoundsException for short text.
        if (split.length < 2 || split[1].length() < 3) {
            return;
        }

        // The unit character sits three chars from the end: "2万/月" -> '万'.
        char unit = split[1].charAt(split[1].length() - 3);

        double factor = 0;
        switch (unit) {
            case '千':
                factor = 1000;
                break;
            case '万':
                factor = 10000;
                break;
            default:
                break;
        }

        try {
            // Minimum monthly salary.
            jobsItem.setSalaryMin((int) (Double.parseDouble(split[0]) * factor));
            // Maximum monthly salary (unit suffix stripped).
            jobsItem.setSalaryMax((int) (Double.parseDouble(split[1].substring(0, split[1].length() - 3)) * factor));
        } catch (NumberFormatException ignored) {
            // Best-effort crawler: leave the salary fields null rather than
            // aborting the whole page on unexpected salary text.
        }

    }

    /**
     * Splits the p.msg info line (fields separated by four spaces) into
     * company address, headcount, diploma and publish time.
     *
     * @param companyInfo raw info line from the page (may be blank)
     * @param jobsItem    target entity to populate
     */
    private void saveCompanyInfo(String companyInfo, JobsItem jobsItem) {
        if (StringUtils.isBlank(companyInfo)) {
            return;
        }
        String[] companyInfos = companyInfo.split("    ");

        // Company address is always the first field.
        jobsItem.setCompanyAddr(companyInfos[0].trim());

        // FIX: the original indexed companyInfos[3] and companyInfos[2]
        // unconditionally; pages with fewer fields threw
        // ArrayIndexOutOfBoundsException.
        if (companyInfos.length > 3) {
            // Headcount: 9999 stands for "several" (招若干人).
            if ("招若干人".equals(companyInfos[3].trim())) {
                jobsItem.setJobNum(9999);
            } else {
                String num = companyInfos[3].replaceAll("[^0-9]", "");
                if (!StringUtils.isBlank(num)) {
                    jobsItem.setJobNum(Integer.parseInt(num));
                }
            }
        }

        if (companyInfos.length > 2) {
            // Diploma; when the field contains "招" it is actually a
            // headcount, meaning no education requirement is listed.
            if (companyInfos[2].contains("招")) {
                jobsItem.setDiploma("无学历");
                String num = companyInfos[2].replaceAll("[^0-9]", "");
                if (!StringUtils.isBlank(num)) {
                    jobsItem.setJobNum(Integer.parseInt(num));
                }
            } else {
                jobsItem.setDiploma(companyInfos[2].trim());
            }
        }

        // Publish time: the field containing "发布", with that suffix removed.
        for (String time : companyInfos) {
            if (!StringUtils.isBlank(time) && time.indexOf("发布") > 0) {
                jobsItem.setTime(time.replace("发布", ""));
            }
        }

    }

    private Site site = Site.me()
            // 51job serves GBK-encoded pages.
            .setCharset("gbk")
            // Download timeout.
            .setTimeOut(10 * 1000)
            // Delay between retries.
            .setRetrySleepTime(3 * 1000)
            // FIX: the original called setSleepTime(3) under a "retry count"
            // comment; setRetryTimes(3) is the intended call.
            .setRetryTimes(3);


    @Override
    public Site getSite() {
        return site;
    }


    @Autowired
    private SaveDataPipeline saveDataPipeline;

    /**
     * Launches a crawl run periodically.
     * initialDelay: wait after startup before the first run.
     * fixedDelay: interval between the end of one run and the next.
     */
    @Scheduled(initialDelay = 1000, fixedDelay = 10 * 1000)
    public void process() {
        // FIX: the original passed `new JobProcessor()` here, discarding the
        // Spring-managed bean (and its `num` counter); use `this` instead.
        Spider.create(this)
                .addUrl(URL_CRAWLER)
                // BloomFilter-based dedup: low memory, may rarely skip a page.
                // 100000 is the estimated page count.
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
                .thread(1)
                // Pipeline that persists parsed items to the database.
                .addPipeline(this.saveDataPipeline)
                .run();
    }
}

SaveDataPipeline 主要把爬取的数据保存到数据库中

package club.studycode.qcwy.crawler.task;

import club.studycode.qcwy.crawler.entity.JobsItem;
import club.studycode.qcwy.crawler.service.JobsItemService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;


/**
 *  @ClassName: SaveDataPipeline.java
 *  @Author: Slayer
 *  @Date: 2019/12/8 23:46
 *  @Description: WebMagic pipeline that persists crawled JobsItem records,
 *                updating an existing row when the company name already exists.
 */
@Component
public class SaveDataPipeline implements Pipeline {

    @Autowired
    private JobsItemService jobsItemService;

    /**
     * Saves the "jobsItem" field produced by the processor, if present.
     *
     * @param resultItems fields extracted from one page
     * @param task        the owning crawl task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        JobsItem jobsItem = resultItems.get("jobsItem");

        // Skip pages that produced nothing usable.
        if (jobsItem == null || jobsItem.getCompanyName() == null) {
            return;
        }

        // Reuse the id of an already-stored row for the same company, so the
        // save becomes an update instead of an insert.
        JobsItem existing = jobsItemService.getByCompanyName(jobsItem.getCompanyName());
        if (existing != null) {
            jobsItem.setId(existing.getId());
            System.out.println("-----------------------------更新数据啦----------------------------------");
        }
        jobsItemService.save(jobsItem);

    }
}

QcwyCrawlerApplication SpringBoot启动类

package club.studycode.qcwy.crawler;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.transaction.annotation.EnableTransactionManagement;
import tk.mybatis.spring.annotation.MapperScan;

/**
 * Spring Boot entry point for the crawler application.
 */
@SpringBootApplication
// Scan the dao package so tk.mybatis generates mapper implementations.
@MapperScan(basePackages = "club.studycode.qcwy.crawler.dao")
// Enable @Scheduled so the crawl task runs periodically.
@EnableScheduling
// Enable @Transactional processing in the service layer.
@EnableTransactionManagement
public class QcwyCrawlerApplication {
    public static void main(String[] args) {
        SpringApplication.run(QcwyCrawlerApplication.class, args);
    }

}

application.yaml配置

spring:
  datasource:
    type: com.zaxxer.hikari.HikariDataSource
    driver-class-name: com.mysql.cj.jdbc.Driver
    url: jdbc:mysql://localhost:3306/crawler?useUnicode=true&characterEncoding=utf-8&useSSL=false&serverTimezone=Asia/Shanghai
    username: root
    password: "020822" # 注意:示例密码,请勿在公开仓库中提交真实的数据库密码
    hikari:
      minimum-idle: 5
      idle-timeout: 600000
      maximum-pool-size: 10
      auto-commit: true
      pool-name: MyHikariCP
      max-lifetime: 1800000
      connection-timeout: 30000
      connection-test-query: SELECT 1

mybatis:
  type-aliases-package: club.studycode.qcwy.crawler.entity

这个案例主要演示了WebMagic的基本使用!!!

你可能感兴趣的:(java爬虫案例)