This is a simple crawler demo built with WebMagic and Spring Boot. It scrapes job postings from 前程无忧 (51job.com) and stores them in a MySQL database.
About WebMagic
WebMagic is a simple and flexible Java crawler framework. With WebMagic, you can quickly build an efficient, easy-to-maintain crawler.
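Before walking through the project, here is a minimal, self-contained WebMagic sketch that shows the core API (the class name, start URL, and CSS selector are placeholders, not part of this project):

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

public class DemoProcessor implements PageProcessor {
    // Site carries crawl-wide settings such as charset, timeout and retries
    private final Site site = Site.me().setCharset("utf-8").setTimeOut(10 * 1000);

    @Override
    public void process(Page page) {
        // Extract the page title with a CSS selector and hand it to the pipelines
        page.putField("title", page.getHtml().css("title", "text").get());
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        // With no pipeline registered, WebMagic prints results to the console
        Spider.create(new DemoProcessor()).addUrl("https://example.com").run();
    }
}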
MySQL table schema
/*
Navicat MySQL Data Transfer
Source Server : local database
Source Server Version : 80017
Source Host : localhost:3306
Source Database : crawler
Target Server Type : MYSQL
Target Server Version : 80017
File Encoding : 65001
Date: 2019-12-08 23:36:09
*/
SET FOREIGN_KEY_CHECKS=0;
-- ----------------------------
-- Table structure for jobs_item
-- ----------------------------
DROP TABLE IF EXISTS `jobs_item`;
CREATE TABLE `jobs_item` (
`id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'Primary key',
`company_name` varchar(100) DEFAULT NULL COMMENT 'Company name',
`company_addr` varchar(255) DEFAULT NULL COMMENT 'Company address',
`company_info` text COMMENT 'Company profile',
`job_name` varchar(100) DEFAULT NULL COMMENT 'Job title',
`job_num` int(11) DEFAULT '0' COMMENT 'Number of openings',
`job_addr` varchar(255) DEFAULT NULL COMMENT 'Job location',
`job_info` text COMMENT 'Job description',
`diploma` varchar(20) DEFAULT NULL COMMENT 'Education requirement',
`salary_min` bigint(10) DEFAULT NULL COMMENT 'Minimum monthly salary',
`salary_max` bigint(10) DEFAULT NULL COMMENT 'Maximum monthly salary',
`url` varchar(100) DEFAULT NULL COMMENT 'Job detail page URL',
`time` varchar(20) DEFAULT NULL COMMENT 'Most recent posting time',
`created` datetime DEFAULT NULL COMMENT 'Record creation time',
`updated` datetime DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP COMMENT 'Record update time',
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=2844 DEFAULT CHARSET=utf8;
Project directory diagram
Add the pom.xml dependencies
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.2.1.RELEASE</version>
    </parent>
    <groupId>club.studycode</groupId>
    <artifactId>qcwy-crawler</artifactId>
    <version>1.0.0-SNAPSHOT</version>
    <name>qcwy-crawler</name>
    <description>JOBS</description>

    <properties>
        <java.version>1.8</java.version>
        <mapper.version>2.1.5</mapper.version>
        <webmagic.version>0.7.3</webmagic.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
            <exclusions>
                <exclusion>
                    <groupId>org.junit.vintage</groupId>
                    <artifactId>junit-vintage-engine</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>tk.mybatis</groupId>
            <artifactId>mapper-spring-boot-starter</artifactId>
            <version>${mapper.version}</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>
        <!-- Exclude slf4j-log4j12 so it does not clash with Spring Boot's Logback binding -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>${webmagic.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>${webmagic.version}</version>
        </dependency>
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>28.1-jre</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>
MyMapper class: the base interface for tk.mybatis mappers
package club.studycode.mapper;
import tk.mybatis.mapper.common.Mapper;
import tk.mybatis.mapper.common.MySqlMapper;
public interface MyMapper<T> extends Mapper<T>, MySqlMapper<T> {
}
JobsItemDao interface: extending MyMapper gives it the basic CRUD methods
package club.studycode.qcwy.crawler.dao;
import club.studycode.mapper.MyMapper;
import club.studycode.qcwy.crawler.entity.JobsItem;
import org.springframework.stereotype.Repository;
/**
* @ClassName: JobsItemDao.java
* @Author: Slayer
* @Date: 2019/11/16 0:59
* @Description:
*/
@Repository
public interface JobsItemDao extends MyMapper<JobsItem> {
}
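Nothing else is required: by extending MyMapper<JobsItem>, the DAO inherits single-table CRUD methods from tk.mybatis at runtime, with no mapper XML. Below is an illustrative sketch of what becomes available (the JobsItemDaoDemo class and its sample values are hypothetical; the mapper methods themselves come from the tk.mybatis common Mapper API):

package club.studycode.qcwy.crawler.demo;

import club.studycode.qcwy.crawler.dao.JobsItemDao;
import club.studycode.qcwy.crawler.entity.JobsItem;
import org.springframework.stereotype.Component;
import tk.mybatis.mapper.entity.Example;
import javax.annotation.Resource;

@Component
public class JobsItemDaoDemo {
    @Resource
    private JobsItemDao jobsItemDao;

    public void demo() {
        JobsItem item = new JobsItem();
        item.setCompanyName("Example Co."); // hypothetical sample data
        jobsItemDao.insert(item);           // INSERT, inherited from Mapper<T>

        // Query by a non-key column via Example (the same pattern the service layer uses below)
        Example example = new Example(JobsItem.class);
        example.createCriteria().andEqualTo("companyName", "Example Co.");
        JobsItem loaded = jobsItemDao.selectOneByExample(example);

        loaded.setJobName("Java Developer");
        jobsItemDao.updateByPrimaryKey(loaded);         // UPDATE by primary key
        jobsItemDao.deleteByPrimaryKey(loaded.getId()); // DELETE by primary key
    }
}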
JobsItem entity class
package club.studycode.qcwy.crawler.entity;
import java.io.Serializable;
import java.util.Date;
import lombok.Data;
import javax.persistence.Column;
import javax.persistence.Id;
import javax.persistence.Table;
/**
* @ClassName: JobsItem.java
* @Author: Slayer
* @Date: 2019/11/16 0:51
* @Description:
*/
@Data
@Table(name = "jobs_item")
public class JobsItem implements Serializable {
private static final long serialVersionUID = -1274246480063610692L;
// Primary key
@Id
@Column(name = "id")
private Long id;
// Company name
@Column(name = "company_name")
private String companyName;
// Company address
@Column(name = "company_addr")
private String companyAddr;
// Company profile
@Column(name = "company_info")
private String companyInfo;
// Job title
@Column(name = "job_name")
private String jobName;
// Number of openings
@Column(name = "job_num")
private Integer jobNum;
// Job location
@Column(name = "job_addr")
private String jobAddr;
// Job description
@Column(name = "job_info")
private String jobInfo;
// Education requirement
@Column(name = "diploma")
private String diploma;
// Minimum monthly salary
@Column(name = "salary_min")
private Integer salaryMin;
// Maximum monthly salary
@Column(name = "salary_max")
private Integer salaryMax;
// Job detail page URL
@Column(name = "url")
private String url;
// Most recent posting time
@Column(name = "time")
private String time;
@Column(name = "created")
private Date created;
@Column(name = "updated")
private Date updated;
}
JobsItemService: the service-layer interface
package club.studycode.qcwy.crawler.service;
import club.studycode.qcwy.crawler.entity.JobsItem;
/**
* @ClassName: JobsItemService.java
* @Author: Slayer
* @Date: 2019/12/8 23:42
* @Description:
*/
public interface JobsItemService {
void save(JobsItem jobsItem);
JobsItem getByCompanyName(String companyName);
}
JobsItemServiceImpl: the service implementation
package club.studycode.qcwy.crawler.service.impl;
import club.studycode.qcwy.crawler.dao.JobsItemDao;
import club.studycode.qcwy.crawler.entity.JobsItem;
import club.studycode.qcwy.crawler.service.JobsItemService;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import tk.mybatis.mapper.entity.Example;
import javax.annotation.Resource;
@Transactional(readOnly = true)
@Service
public class JobsItemServiceImpl implements JobsItemService {
@Resource
private JobsItemDao jobsItemDao;
/**
* Save or update a record
*
* @param jobsItem the record to persist
*/
@Override
@Transactional(readOnly = false)
public void save(JobsItem jobsItem) {
// New record: insert
if (jobsItem.getId() == null) {
jobsItemDao.insert(jobsItem);
}
// Existing record: update
else {
jobsItemDao.updateByPrimaryKey(jobsItem);
}
}
/**
* Look up a record by company name
*
* @param companyName the company name to match
* @return the matching record, or null if none exists
*/
@Override
public JobsItem getByCompanyName(String companyName) {
Example example = new Example(JobsItem.class);
example.createCriteria().andEqualTo("companyName", companyName);
return jobsItemDao.selectOneByExample(example);
}
}
JobProcessor: the core WebMagic class, which defines the crawl and extraction rules
package club.studycode.qcwy.crawler.task;
import club.studycode.qcwy.crawler.entity.JobsItem;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Selectable;
import java.util.Date;
import java.util.List;
/**
* @ClassName: JobProcessor.java
* @Author: Slayer
* @Date: 2019/12/8 23:43
* @Description:
*/
@Component
public class JobProcessor implements PageProcessor {
private static final String URL_CRAWLER = "https://search.51job.com/list/000000,000000,0000,00,9,99,java,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=";
// private static final String URL_CRAWLER = "https://search.51job.com/list/000000,000000,0000,00,9,99,%25E9%2594%2580%25E5%2594%25AE,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";
private long num = 1;
@Override
public void process(Page page) {
List<Selectable> selectables = page.getHtml().css("div#resultList div.el").nodes();
// Check whether anything was extracted
if (selectables.size() == 0) {
// Empty means this is a job detail page
this.saveJobInfo(page);
}
// Not empty means this is a list page: extract each detail-page URL and queue it
else {
selectables.forEach(selectable -> {
// Extract the detail-page URL
String jobInfoUrl = selectable.css("p.t1 > span > a[href]").links().get();
// Add it to the task queue
page.addTargetRequest(jobInfoUrl);
});
// Extract the URL of the next list page (there may be none on the last page)
List<Selectable> pageLinks = page.getHtml().css("li.bk").nodes();
if (pageLinks.size() > 1) {
String nextPage = pageLinks.get(1).links().get();
// Add it to the task queue
if (nextPage != null) {
page.addTargetRequest(nextPage);
}
}
}
}
/**
* Save the contents of a detail page
*
* @param page the page data
*/
private void saveJobInfo(Page page) {
JobsItem jobsItem = new JobsItem();
// Company name
String companyName = page.getHtml().css("p.cname > a", "text").get();
jobsItem.setCompanyName(companyName);
// The msg line bundles several fields (address, education, headcount, posting date)
String info = page.getHtml().css("p.msg", "text").get();
this.saveCompanyInfo(info, jobsItem);
// Company profile
String companyInfo = page.getHtml().css("div.tmsg", "text").get();
jobsItem.setCompanyInfo(companyInfo);
// Job title
String jobName = page.getHtml().css("div.cn > h1", "text").get();
jobsItem.setJobName(jobName);
// Job location
List<Selectable> jobAddrSelectables = page.getHtml().css("p.fp").nodes();
if (jobAddrSelectables.size() > 1) {
String jobAddrHtml = jobAddrSelectables.get(1).css("p.fp", "text").get();
jobsItem.setJobAddr(jobAddrHtml);
}
// Job description
List<Selectable> jobInfoSelectables = page.getHtml().css("div.bmsg").nodes();
if (jobInfoSelectables.size() >= 1) {
String jobInfoHtml = jobInfoSelectables.get(0).get();
String jobInfo = Jsoup.parse(jobInfoHtml).text();
jobsItem.setJobInfo(jobInfo);
}
// Minimum and maximum monthly salary
String salary = page.getHtml().css("div.cn > strong", "text").get();
this.saveSalary(salary, jobsItem);
// Detail page URL
jobsItem.setUrl(page.getUrl().get());
jobsItem.setCreated(new Date());
jobsItem.setUpdated(jobsItem.getCreated());
// Hand the record to the pipeline
page.putField("jobsItem", jobsItem);
System.out.println("Crawl count: " + this.num++);
}
private void saveSalary(String salary, JobsItem jobsItem) {
// Guard against postings without a salary range (e.g. "面议")
if (!StringUtils.isBlank(salary) && salary.contains("-")) {
// Typical format: "1-1.5万/月"; both bounds are assumed to share the upper bound's unit
String[] split = salary.split("-");
char unit = split[1].charAt(split[1].length() - 3);
double num = 0;
switch (unit) {
case '千':
num = 1000;
break;
case '万':
num = 10000;
break;
default:
break;
}
// Minimum monthly salary
int salaryMin = (int) (Double.parseDouble(split[0]) * num);
jobsItem.setSalaryMin(salaryMin);
// Maximum monthly salary
int salaryMax = (int) (Double.parseDouble(split[1].substring(0, split[1].length() - 3)) * num);
jobsItem.setSalaryMax(salaryMax);
}
}
private void saveCompanyInfo(String companyInfo, JobsItem jobsItem) {
if (!StringUtils.isBlank(companyInfo)) {
// The msg text bundles address, experience, education, headcount and posting date, separated by spaces
String[] companyInfos = companyInfo.split(" ");
// Company address
jobsItem.setCompanyAddr(companyInfos[0].trim());
// Number of openings ("招若干人" means an unspecified headcount)
if (companyInfos.length > 3) {
if ("招若干人".equals(companyInfos[3].trim())) {
jobsItem.setJobNum(9999);
} else {
String num = companyInfos[3].replaceAll("[^0-9]", "");
if (!StringUtils.isBlank(num)) {
jobsItem.setJobNum(Integer.parseInt(num));
}
}
}
// Education requirement; when the posting omits it, the third field holds the headcount instead
if (companyInfos.length > 2) {
if (companyInfos[2].contains("招")) {
jobsItem.setDiploma("无学历");
String num = companyInfos[2].replaceAll("[^0-9]", "");
if (!StringUtils.isBlank(num)) {
jobsItem.setJobNum(Integer.parseInt(num));
}
} else {
jobsItem.setDiploma(companyInfos[2].trim());
}
}
// Most recent posting time
for (String time : companyInfos) {
if (!StringUtils.isBlank(time)) {
if (time.indexOf("发布") > 0) {
time = time.replace("发布", "");
jobsItem.setTime(time);
}
}
}
}
}
private Site site = Site.me()
// Character encoding of the target site (51job pages are GBK-encoded)
.setCharset("gbk")
// Timeout in milliseconds
.setTimeOut(10 * 1000)
// Sleep time between retries
.setRetrySleepTime(3 * 1000)
// Number of retries
.setRetryTimes(3);
@Override
public Site getSite() {
return site;
}
@Autowired
private SaveDataPipeline saveDataPipeline;
/**
* initialDelay: how long to wait after startup before the first run
* fixedDelay: the interval between the end of one run and the start of the next
*/
@Scheduled(initialDelay = 1000, fixedDelay = 10 * 1000)
public void process() {
Spider.create(new JobProcessor())
.addUrl(URL_CRAWLER)
// Deduplicate URLs with a BloomFilter: low memory footprint, but some pages may be missed; 100000 is the estimated page count
.setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
.thread(1)
// Register the pipeline that persists results
.addPipeline(this.saveDataPipeline)
.run();
}
}
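To make the salary arithmetic above concrete, here is a small standalone example (the class name and sample value are illustrative): for "1-1.5万/月", the character three places from the end of the upper bound is the unit '万' (×10,000), so the parsed bounds come out as 10000 and 15000. Note that, like the method above, it assumes both ends share the upper bound's unit.

public class SalaryParseDemo {
    public static void main(String[] args) {
        // Hypothetical input in the 51job display format
        String salary = "1-1.5万/月";
        String[] split = salary.split("-");
        // The unit sits three characters from the end of "1.5万/月"
        char unit = split[1].charAt(split[1].length() - 3);
        double num = unit == '万' ? 10000 : unit == '千' ? 1000 : 0;
        int salaryMin = (int) (Double.parseDouble(split[0]) * num);
        int salaryMax = (int) (Double.parseDouble(split[1].substring(0, split[1].length() - 3)) * num);
        // Prints: salaryMin=10000, salaryMax=15000
        System.out.println("salaryMin=" + salaryMin + ", salaryMax=" + salaryMax);
    }
}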
SaveDataPipeline: persists the crawled data to the database
package club.studycode.qcwy.crawler.task;
import club.studycode.qcwy.crawler.entity.JobsItem;
import club.studycode.qcwy.crawler.service.JobsItemService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
* @ClassName: SaveDataPipeline.java
* @Author: Slayer
* @Date: 2019/12/8 23:46
* @Description:
*/
@Component
public class SaveDataPipeline implements Pipeline {
@Autowired
private JobsItemService jobsItemService;
@Override
public void process(ResultItems resultItems, Task task) {
// Fetch the extracted record
JobsItem jobsItem = resultItems.get("jobsItem");
// Check that the record is valid
if (jobsItem != null && jobsItem.getCompanyName() != null) {
// Check for an existing record with the same company name
JobsItem resultJobsItem = jobsItemService.getByCompanyName(jobsItem.getCompanyName());
// If one exists, reuse its primary key so save() performs an update
if (resultJobsItem != null) {
jobsItem.setId(resultJobsItem.getId());
System.out.println("----------------------------- Updating existing record -----------------------------");
}
jobsItemService.save(jobsItem);
}
}
}
QcwyCrawlerApplication: the Spring Boot bootstrap class
package club.studycode.qcwy.crawler;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.transaction.annotation.EnableTransactionManagement;
import tk.mybatis.spring.annotation.MapperScan;
@SpringBootApplication
@MapperScan(basePackages = "club.studycode.qcwy.crawler.dao")
@EnableScheduling
@EnableTransactionManagement
public class QcwyCrawlerApplication {
public static void main(String[] args) {
SpringApplication.run(QcwyCrawlerApplication.class, args);
}
}
application.yaml configuration
spring:
datasource:
type: com.zaxxer.hikari.HikariDataSource
driver-class-name: com.mysql.cj.jdbc.Driver
url: jdbc:mysql://localhost:3306/crawler?useUnicode=true&characterEncoding=utf-8&useSSL=false&serverTimezone=Asia/Shanghai
username: root
password: "020822"
hikari:
minimum-idle: 5
idle-timeout: 600000
maximum-pool-size: 10
auto-commit: true
pool-name: MyHikariCP
max-lifetime: 1800000
connection-timeout: 30000
connection-test-query: SELECT 1
mybatis:
type-aliases-package: club.studycode.qcwy.crawler.entity