1、pom.xml
<!--
  Maven build descriptor for the job crawler.
  Inherits dependency management from spring-boot-starter-parent, so the
  dependencies below that carry no <version> rely on the parent's managed versions.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.mollen</groupId>
    <artifactId>mollen_job_crawler</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>war</packaging>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.0.5.RELEASE</version>
    </parent>
    <dependencies>
        <!-- Spring MVC + embedded servlet container -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <!-- Spring Data JPA (repository support used by JobInfoDao) -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
        </dependency>
        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
        </dependency>
        <!-- WebMagic crawler core; its log4j binding is excluded from this dependency -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <!-- WebMagic extensions -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>
        <!-- Guava, needed by WebMagic's BloomFilterDuplicateRemover -->
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>16.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
        </dependency>
    </dependencies>
</project>
2、application.properties
# Database connection settings (Spring Boot datasource).
# JDBC driver class used to talk to MySQL.
spring.datasource.driverClassName=com.mysql.jdbc.Driver
# JDBC URL: MySQL server on 127.0.0.1, port 3306, schema "crawler".
spring.datasource.url=jdbc:mysql://127.0.0.1:3306/crawler
# Credentials used for the connection.
spring.datasource.username=root
spring.datasource.password=root
# JPA settings.
# Target database platform.
spring.jpa.database=MySQL
# Log the SQL statements that JPA executes.
spring.jpa.show-sql=true
3、JobInfo.java
package com.mollen.pojo;
import javax.persistence.Entity;
import javax.persistence.GeneratedValue;
import javax.persistence.GenerationType;
import javax.persistence.Id;
/**
 * @ClassName: JobInfo
 * @Author: Mollen
 * @CreateTime: 2019-01-29 15:22:53
 * @Description: JPA entity holding one crawled job posting. All fields are
 * nullable object types, so an instance with only some fields set can be used
 * as a query-by-example probe.
 */
@Entity
public class JobInfo {
    /** Primary key, generated by the database (identity column). */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;
    /** Company name. */
    private String companyName;
    /** Company contact information. */
    private String companyAddr;
    /** Company description. */
    private String companyInfo;
    /** Job title. */
    private String jobName;
    /** Work location. */
    private String jobAddr;
    /** Job description. */
    private String jobInfo;
    /** Lower bound of the salary range. */
    private Integer salaryMin;
    /** Upper bound of the salary range. */
    private Integer salaryMax;
    /** URL of the posting's detail page. */
    private String url;
    /** Most recent publish time of the posting, kept as text. */
    private String time;

    public Long getId() {
        return id;
    }

    public void setId(Long id) {
        this.id = id;
    }

    public String getCompanyName() {
        return companyName;
    }

    public void setCompanyName(String companyName) {
        this.companyName = companyName;
    }

    public String getCompanyAddr() {
        return companyAddr;
    }

    public void setCompanyAddr(String companyAddr) {
        this.companyAddr = companyAddr;
    }

    public String getCompanyInfo() {
        return companyInfo;
    }

    public void setCompanyInfo(String companyInfo) {
        this.companyInfo = companyInfo;
    }

    public String getJobName() {
        return jobName;
    }

    public void setJobName(String jobName) {
        this.jobName = jobName;
    }

    public String getJobAddr() {
        return jobAddr;
    }

    public void setJobAddr(String jobAddr) {
        this.jobAddr = jobAddr;
    }

    public String getJobInfo() {
        return jobInfo;
    }

    public void setJobInfo(String jobInfo) {
        this.jobInfo = jobInfo;
    }

    public Integer getSalaryMin() {
        return salaryMin;
    }

    public void setSalaryMin(Integer salaryMin) {
        this.salaryMin = salaryMin;
    }

    public Integer getSalaryMax() {
        return salaryMax;
    }

    public void setSalaryMax(Integer salaryMax) {
        this.salaryMax = salaryMax;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getTime() {
        return time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    @Override
    public String toString() {
        return "JobInfo{"
                + "id=" + id
                + ", companyName='" + companyName + '\''
                + ", companyAddr='" + companyAddr + '\''
                + ", companyInfo='" + companyInfo + '\''
                + ", jobName='" + jobName + '\''
                + ", jobAddr='" + jobAddr + '\''
                + ", jobInfo='" + jobInfo + '\''
                + ", salaryMin=" + salaryMin
                + ", salaryMax=" + salaryMax
                + ", url='" + url + '\''
                + ", time='" + time + '\''
                + '}';
    }
}
4、JobInfoDao.java
package com.mollen.dao;
import com.mollen.pojo.JobInfo;
import org.springframework.data.jpa.repository.JpaRepository;
/**
 * @ClassName: JobInfoDao
 * @Author: Mollen
 * @CreateTime: 2019-01-29 15:24:33
 * @Description: Spring Data JPA repository for {@link JobInfo} entities keyed
 * by their {@code Long} id. Declares no methods of its own; everything is
 * inherited from {@link JpaRepository}.
 */
public interface JobInfoDao extends JpaRepository<JobInfo,Long>{
}
5、JobInfoService.java
package com.mollen.service;
import com.mollen.pojo.JobInfo;
import java.util.List;
/**
 * @ClassName: JobInfoService
 * @Author: Mollen
 * @CreateTime: 2019-01-29 15:43:01
 * @Description: Service contract for storing and querying job postings.
 */
public interface JobInfoService {

    /**
     * Saves a job posting.
     *
     * @param jobInfo the posting to save
     */
    void save(JobInfo jobInfo);

    /**
     * Queries job postings by condition.
     *
     * @param jobInfo object carrying the query conditions
     * @return the matching postings
     */
    List<JobInfo> findJobInfo(JobInfo jobInfo);
}
6、JobInfoServiceImpl.java
package com.mollen.service.impl;
import com.mollen.dao.JobInfoDao;
import com.mollen.pojo.JobInfo;
import com.mollen.service.JobInfoService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Example;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import java.util.List;
/**
 * @ClassName: JobInfoServiceImpl
 * @Author: Mollen
 * @CreateTime: 2019-01-29 15:59:51
 * @Description: {@link JobInfoService} implementation backed by {@link JobInfoDao}.
 */
@Service
public class JobInfoServiceImpl implements JobInfoService {

    @Autowired
    private JobInfoDao jobInfoDao;

    /**
     * Stores the posting unless a row with the same URL and publish time is
     * already present.
     *
     * @param jobInfo the posting to store
     */
    @Override
    @Transactional
    public void save(JobInfo jobInfo) {
        // Probe carrying only the two fields that identify a posting version.
        JobInfo param = new JobInfo();
        param.setUrl(jobInfo.getUrl());
        param.setTime(jobInfo.getTime());

        List<JobInfo> existing = this.findJobInfo(param);

        // No match: the posting is new, or it was re-published with a new time.
        if (existing.isEmpty()) {
            this.jobInfoDao.saveAndFlush(jobInfo);
        }
    }

    /**
     * Runs a query-by-example lookup using the given object as the probe.
     *
     * @param jobInfo probe whose populated fields form the query conditions
     * @return the matching postings
     */
    @Override
    public List<JobInfo> findJobInfo(JobInfo jobInfo) {
        Example<JobInfo> example = Example.of(jobInfo);
        return this.jobInfoDao.findAll(example);
    }
}
7、MathSalary.java 工具类可以直接使用
package com.mollen.task;
/**
 * Converts salary captions such as {@code "0.8-1.2万/月"}, {@code "5-8千/月"},
 * {@code "5-6万/年"} or {@code "500/天"} into a yearly {min, max} pair in yuan.
 */
public class MathSalary {
    /** Working days per year used to annualise a daily wage. */
    private static final int WORK_DAYS_PER_YEAR = 240;
    /** Months per year used to annualise a monthly wage. */
    private static final int MONTHS_PER_YEAR = 12;
    /** Value of the unit "万" (ten thousand). */
    private static final int TEN_THOUSAND = 10000;
    /** Value of the unit "千" (thousand). */
    private static final int THOUSAND = 1000;

    /**
     * Parses a salary caption into a yearly salary range.
     *
     * <p>A caption ending in "月" or "年" is read as
     * {@code <min>[-<max>]<unit>/<period>}, where the unit is "万" or, for any
     * other character, treated as "千". A missing upper bound makes the range
     * collapse to the single value. Any other caption is read as a daily wage
     * ({@code <amount>/天}, optionally {@code <amount>元/天}) and multiplied by
     * 240 working days.
     *
     * @param salaryStr salary caption; may be {@code null}
     * @return a two-element array {min, max}; both are 0 when the caption is
     *         {@code null}, too short, or not numeric
     */
    public static Integer[] getSalary(String salaryStr) {
        Integer[] salary = {0, 0};
        if (salaryStr == null) {
            return salary;
        }
        String text = salaryStr.trim();
        // Shortest parseable caption is three characters, e.g. "5/天".
        if (text.length() < 3) {
            return salary;
        }

        int len = text.length();
        String date = text.substring(len - 1);

        // Neither monthly nor yearly: treat as a daily wage.
        if (!"月".equals(date) && !"年".equals(date)) {
            String amount = text.substring(0, len - 2);
            if (amount.endsWith("元")) {
                amount = amount.substring(0, amount.length() - 1);
            }
            salary[0] = salary[1] = str2Num(amount, WORK_DAYS_PER_YEAR);
            return salary;
        }

        String unit = text.substring(len - 3, len - 2);
        String[] salarys = text.substring(0, len - 3).split("-");
        // "-".split("-") yields an empty array; nothing to parse then.
        if (salarys.length == 0) {
            return salary;
        }
        salary[0] = mathSalary(date, unit, salarys[0]);
        // A caption without "-" carries a single figure; use it for both bounds.
        salary[1] = salarys.length > 1 ? mathSalary(date, unit, salarys[1]) : salary[0];
        return salary;
    }

    /**
     * Scales one bound of the range by its unit and, for monthly pay, by 12.
     *
     * @param date      period character, "月" or "年"
     * @param unit      unit character; "万" means ten thousand, anything else thousand
     * @param salaryStr numeric part of the bound
     * @return the yearly amount, or 0 if {@code salaryStr} is not numeric
     */
    private static Integer mathSalary(String date, String unit, String salaryStr) {
        int amount = str2Num(salaryStr, "万".equals(unit) ? TEN_THOUSAND : THOUSAND);
        if ("月".equals(date)) {
            amount *= MONTHS_PER_YEAR;
        }
        return amount;
    }

    /**
     * Multiplies a decimal string by {@code num} and truncates to an int.
     * Decimal arithmetic is used so values like "0.7" are not distorted by
     * binary floating point before truncation.
     *
     * @return the product, or 0 when {@code salaryStr} is {@code null} or not a number
     */
    private static int str2Num(String salaryStr, int num) {
        if (salaryStr == null) {
            return 0;
        }
        try {
            return new java.math.BigDecimal(salaryStr.trim())
                    .multiply(java.math.BigDecimal.valueOf(num))
                    .intValue();
        } catch (NumberFormatException ignored) {
            // Best effort: an unparseable figure is reported as 0 rather than failing the crawl.
            return 0;
        }
    }
}
8、JobProcessor.java 网页标签变化需要根据具体情况修改
package com.mollen.task;
import com.mollen.pojo.JobInfo;
import org.jsoup.Jsoup;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.util.List;
/**
 * WebMagic page processor that crawls 51job search results for job postings.
 * List pages feed detail-page and next-page URLs back into the crawl queue;
 * detail pages are parsed into a {@link JobInfo} and handed to the pipeline
 * under the result key {@code "jobInfo"}.
 */
@Component
public class JobProcessor implements PageProcessor {

    /** Seed URL: the first page of the search result list. */
    private String url = "https://search.51job.com/list/000000,000000,0000,01%" +
            "252C32,9,99,java,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&" +
            "cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%" +
            "2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&" +
            "specialarea=00&from=&welfare=";

    /** Download settings shared by every request of this crawler. */
    private Site site = Site.me()
            .setCharset("gbk")          // page encoding
            .setTimeOut(10 * 1000)      // download timeout in milliseconds
            .setRetrySleepTime(3000)    // pause between retries in milliseconds
            .setRetryTimes(3);          // number of retries

    @Autowired
    private SpringDataPipeline springDataPipeline;

    /**
     * Handles one downloaded page. A page with result rows is a list page;
     * a page without them is treated as a detail page.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        List<Selectable> rows = page.getHtml().css("div#resultList div.el").nodes();

        if (rows.isEmpty()) {
            // No result rows: parse as a detail page.
            this.saveJobInfo(page);
            return;
        }

        // List page: queue the detail page of every row.
        for (Selectable row : rows) {
            String jobInfoUrl = row.links().toString();
            if (jobInfoUrl != null) {
                page.addTargetRequest(jobInfoUrl);
            }
        }

        // Queue the next list page; the second "bk" item holds its link.
        // It can be missing, so check before indexing.
        List<Selectable> pagerItems = page.getHtml().css("div.p_in li.bk").nodes();
        if (pagerItems.size() > 1) {
            String bkUrl = pagerItems.get(1).links().toString();
            if (bkUrl != null) {
                page.addTargetRequest(bkUrl);
            }
        }
    }

    /**
     * Parses a detail page into a {@link JobInfo} and stores it in the page's
     * result fields. Pages without a usable publish line are skipped, because
     * location and publish time are cut from it. Other missing elements leave
     * the corresponding field {@code null}.
     *
     * @param page the detail page
     */
    private void saveJobInfo(Page page) {
        Html html = page.getHtml();

        // Publish line: begins with the location and ends with "<MM-dd>发布".
        String content = plainText(html.css("div.cn p").regex(".*发布").toString());
        if (content == null || content.length() < 7) {
            page.setSkip(true);
            return;
        }

        JobInfo jobInfo = new JobInfo();
        jobInfo.setCompanyName(html.css("div.cn p.cname a", "text").toString());

        // The second "bmsg" block holds the company contact information.
        List<Selectable> bmsgBlocks = html.css("div.bmsg").nodes();
        if (bmsgBlocks.size() > 1) {
            jobInfo.setCompanyAddr(plainText(bmsgBlocks.get(1).toString()));
        }

        jobInfo.setCompanyInfo(plainText(html.css("div.tmsg").toString()));
        jobInfo.setJobName(html.css("div.cn h1", "text").toString());
        jobInfo.setJobAddr(content.substring(0, 2));
        jobInfo.setJobInfo(plainText(html.css("div.job_msg").toString()));
        // Five characters before the trailing "发布".
        jobInfo.setTime(content.substring(content.length() - 7, content.length() - 2));
        jobInfo.setUrl(page.getUrl().toString());

        // Salary caption is optional; leave both bounds unset when absent.
        String salaryText = html.css("div.cn strong", "text").toString();
        if (salaryText != null && !salaryText.trim().isEmpty()) {
            Integer[] salary = MathSalary.getSalary(salaryText);
            jobInfo.setSalaryMin(salary[0]);
            jobInfo.setSalaryMax(salary[1]);
        }

        page.putField("jobInfo", jobInfo);
    }

    /** Strips markup from an HTML fragment; returns {@code null} for a {@code null} fragment. */
    private static String plainText(String fragment) {
        return fragment == null ? null : Jsoup.parse(fragment).text();
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Scheduled crawl: first run 1 second after startup, then again 100 seconds
     * after each run finishes. {@code run()} blocks until the crawl is done.
     */
    @Scheduled(initialDelay = 1000, fixedDelay = 100 * 1000)
    public void process() {
        Spider.create(this)
                .addUrl(url)
                // In-memory queue with Bloom-filter de-duplication sized for 100000 URLs.
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
                .thread(10)
                .addPipeline(this.springDataPipeline)
                .run();
    }
}
9、SpringDataPipeline.java
package com.mollen.task;
import com.mollen.pojo.JobInfo;
import com.mollen.service.JobInfoService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * WebMagic pipeline that persists crawled job postings through
 * {@link JobInfoService}.
 */
@Component
public class SpringDataPipeline implements Pipeline {

    @Autowired
    private JobInfoService jobInfoService;

    /**
     * Saves the posting stored under the result key {@code "jobInfo"}, if any.
     *
     * @param resultItems results extracted from one page
     * @param task        the crawl task the page belongs to (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        JobInfo parsed = resultItems.get("jobInfo");
        // Pages that produced no posting have nothing to store.
        if (parsed == null) {
            return;
        }
        this.jobInfoService.save(parsed);
    }
}
10、Application.java springboot入口
package com.mollen;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;
/**
 * Spring Boot entry point.
 * Must sit at the same level as the packages of the three application layers.
 */
@SpringBootApplication
@EnableScheduling // turn on scheduled tasks
public class Application {

    public static void main(String[] args) {
        SpringApplication application = new SpringApplication(Application.class);
        application.run(args);
    }
}
-- Table for crawled job postings: one row per posting.
-- Columns are the snake_case counterparts of the JobInfo entity fields;
-- the per-column COMMENT clauses describe each column.
CREATE TABLE `job_info` (
`id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '主键id',
`company_name` varchar(100) DEFAULT NULL COMMENT '公司名称',
`company_addr` varchar(200) DEFAULT NULL COMMENT '公司联系方式',
`company_info` text COMMENT '公司信息',
`job_name` varchar(100) DEFAULT NULL COMMENT '职位名称',
`job_addr` varchar(50) DEFAULT NULL COMMENT '工作地点',
`job_info` text COMMENT '职位信息',
`salary_min` int(10) DEFAULT NULL COMMENT '薪资范围,最小',
`salary_max` int(10) DEFAULT NULL COMMENT '薪资范围,最大',
`url` varchar(150) DEFAULT NULL COMMENT '招聘信息详情页',
`time` varchar(10) DEFAULT NULL COMMENT '职位最近发布时间',
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COMMENT='招聘信息';