Use Spring Boot + MyBatis-Plus + WebMagic to crawl job postings from 51job and save them to a MySQL database.
Add the Maven dependencies:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.2.5.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>com.hg</groupId>
    <artifactId>spider-demo</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>spider-demo</name>
    <description>Web crawler demo</description>
    <properties>
        <java.version>1.8</java.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>cn.hutool</groupId>
            <artifactId>hutool-all</artifactId>
            <version>5.1.0</version>
        </dependency>
        <!-- Druid database connection pool -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid-spring-boot-starter</artifactId>
            <version>1.1.10</version>
        </dependency>
        <!-- MySQL connector -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>
        <!-- MyBatis-Plus -->
        <dependency>
            <groupId>com.baomidou</groupId>
            <artifactId>mybatis-plus-boot-starter</artifactId>
            <version>3.0.5</version>
        </dependency>
        <!-- WebMagic -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>
        <!-- Guava, required by WebMagic's BloomFilterDuplicateRemover -->
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>16.0</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>
Create a database named spider and a job_info table in it:
CREATE TABLE `job_info` (
  `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
  `company_name` varchar(100) DEFAULT NULL COMMENT 'company name',
  `company_addr` varchar(200) DEFAULT NULL COMMENT 'company address',
  `job_name` varchar(100) DEFAULT NULL COMMENT 'job title',
  `job_addr` varchar(50) DEFAULT NULL COMMENT 'job location',
  `salary` varchar(50) DEFAULT NULL COMMENT 'salary range',
  `url` varchar(150) DEFAULT NULL COMMENT 'URL of the job detail page',
  `time` varchar(10) DEFAULT NULL COMMENT 'most recent posting time',
  `job_detail` text COMMENT 'job description',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='job postings';
Create application.yml:
spring:
  application:
    name: spider-service
  jackson:
    time-zone: GMT+8
    date-format: yyyy-MM-dd HH:mm:ss
  datasource:
    driver-class-name: com.mysql.cj.jdbc.Driver
    url: jdbc:mysql://localhost:3306/spider?useUnicode=true&characterEncoding=utf8&autoReconnect=true&useSSL=false
    username: root
    password: root
    type: com.alibaba.druid.pool.DruidDataSource
    druid:
      initialSize: 10
      minIdle: 10
      maxActive: 50
      maxWait: 60000
      timeBetweenEvictionRunsMillis: 60000
      minEvictableIdleTimeMillis: 300000
      validationQuery: SELECT 1 FROM DUAL
      testWhileIdle: true
      testOnBorrow: false
      testOnReturn: false
      poolPreparedStatements: true
      maxPoolPreparedStatementPerConnectionSize: 20
      filters: stat,wall
      connectionProperties: druid.stat.mergeSql=true;druid.stat.slowSqlMillis=5000

# MyBatis-Plus
mybatis-plus:
  mapper-locations: classpath:mapper/**/*.xml
  typeAliasesPackage: com.hg.*.entity
  global-config:
    db-config:
      id-type: auto
      field-strategy: not_empty
      table-underline: true
      db-type: mysql
    refresh: true
  configuration:
    map-underscore-to-camel-case: true
    cache-enabled: false

logging:
  level:
    org.springframework.web: info
    org.apache.http: info
    us.codecraft.webmagic: info
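Next, create the entity class that maps to the job_info table: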
package com.hg.spider.entity;

import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.Data;

@Data
@TableName("job_info")
public class JobInfo {

    @TableId
    private Long id;
    /**
     * Company name
     */
    private String companyName;
    /**
     * Company address
     */
    private String companyAddr;
    /**
     * Job title
     */
    private String jobName;
    /**
     * Job location
     */
    private String jobAddr;
    /**
     * Job description
     */
    private String jobDetail;
    /**
     * Salary range
     */
    private String salary;
    /**
     * Crawled URL
     */
    private String url;
    /**
     * Posting time
     */
    private String time;
}
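Then the DAO interface; extending MyBatis-Plus's BaseMapper provides the basic CRUD methods without any mapper XML: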
package com.hg.spider.dao;

import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import com.hg.spider.entity.JobInfo;

/**
 * @Author skh
 * @Date 2020/3/21 16:27
 * @Desc Mapper for the job_info table
 */
public interface JobInfoDao extends BaseMapper<JobInfo> {
}
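The post doesn't show the application entry point, but the DAO above is only picked up if mapper scanning is enabled. A minimal sketch, assuming the base package com.hg.spider and a hypothetical class name SpiderDemoApplication:

package com.hg.spider;

import org.mybatis.spring.annotation.MapperScan;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

//hypothetical entry point, not shown in the original post
//@MapperScan is what registers JobInfoDao as a MyBatis-Plus mapper bean
@SpringBootApplication
@MapperScan("com.hg.spider.dao")
public class SpiderDemoApplication {

    public static void main(String[] args) {
        SpringApplication.run(SpiderDemoApplication.class, args);
    }
}

Next comes the service that configures and launches the crawler: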
package com.hg.spider.service;

import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import com.hg.spider.dao.JobInfoDao;
import com.hg.spider.entity.JobInfo;
import com.hg.spider.webmagic.JobProcessor;
import com.hg.spider.webmagic.MysqlPipeline;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;

import java.util.List;

/**
 * @Author skh
 * @Date 2020/3/21 12:10
 * @Desc Configures and launches the crawler
 */
@Service
@Slf4j
public class JobInfoService extends ServiceImpl<JobInfoDao, JobInfo> {

    //entry URL for the crawl (a 51job search-result page)
    String url = "https://search.51job.com/list/080200,000000,0000,26,9,99,%25E6%2588%25BF%25E4%25BA%25A7%25E7%25BB%258F%25E7%25BA%25AA%25E4%25BA%25BA,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";

    @Autowired
    private MysqlPipeline mysqlPipeline;
    @Autowired
    private JobProcessor jobProcessor;

    public void getJobInfo() {
        log.info("Start crawling");
        //configure and start the spider
        Spider.create(jobProcessor)
                .addUrl(url) //seed URL
                //deduplicate URLs with a Bloom filter (requires Guava on the classpath)
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
                .thread(50) //number of crawler threads
                .addPipeline(mysqlPipeline) //persistence pipeline
                .run();
    }

    public List<JobInfo> selectJobInfoByUrl(String url) {
        QueryWrapper<JobInfo> wrapper = new QueryWrapper<>();
        wrapper.eq("url", url);
        return this.baseMapper.selectList(wrapper);
    }
}
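Note that Spider.run() blocks the calling thread until the crawl finishes, so the HTTP request that triggers it will not return until then. If that matters, WebMagic also provides runAsync(), which starts the crawl on a background thread; a sketch of the same method with only the final call changed:

public void getJobInfo() {
    log.info("Start crawling");
    Spider.create(jobProcessor)
            .addUrl(url)
            .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
            .thread(50)
            .addPipeline(mysqlPipeline)
            .runAsync(); //returns immediately; crawling continues on background threads
}

The controller simply exposes the crawl as an HTTP endpoint: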
package com.hg.spider.controller;

import com.hg.spider.service.JobInfoService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RestController;

/**
 * @Author skh
 * @Date 2020/3/21 12:24
 * @Desc Triggers the crawl over HTTP
 */
@RestController
public class JobInfoController {

    @Autowired
    private JobInfoService jobInfoService;

    @GetMapping("/getJobInfo")
    public String getJobInfo() {
        jobInfoService.getJobInfo();
        return "success";
    }
}
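The core of the crawler is the PageProcessor. WebMagic hands it every downloaded page; it checks whether the page is a search-result list or a job detail page, extracts data with CSS selectors, and queues newly discovered URLs: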
package com.hg.spider.webmagic;

import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.util.StrUtil;
import com.hg.spider.entity.JobInfo;
import com.hg.spider.service.JobInfoService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.util.List;

/**
 * @Author skh
 * @Date 2020/3/20 22:56
 * @Desc Parses list and detail pages
 */
@Component
@Slf4j
public class JobProcessor implements PageProcessor {

    @Autowired
    private JobInfoService jobInfoService;

    /**
     * Parse a downloaded page
     * @param page the page handed over by the downloader
     */
    @Override
    public void process(Page page) {
        //try to select the result list of a list page
        List<Selectable> nodes = page.getHtml().css("div#resultList div.el").nodes();
        if (CollUtil.isEmpty(nodes)) {
            //no result list: this is a job detail page, so extract the details and save them
            try {
                this.saveJobInfo(page);
            } catch (Exception e) {
                log.error("Failed to parse page, reason: {}", e.getMessage(), e);
            }
        } else {
            //this is a list page: extract the detail-page URLs and queue them
            for (Selectable node : nodes) {
                //URL of the job detail page
                String jobInfoUrl = node.css("p.t1 span a").links().toString();
                if (StrUtil.isNotBlank(jobInfoUrl)) {
                    //skip URLs that have already been stored
                    List<JobInfo> jobInfoList = jobInfoService.selectJobInfoByUrl(jobInfoUrl);
                    if (CollUtil.isEmpty(jobInfoList)) {
                        //queue the URL for crawling
                        page.addTargetRequest(jobInfoUrl);
                    } else {
                        log.info("Record already exists, url: {}", jobInfoUrl);
                    }
                }
            }
            //URL of the next list page (the last pagination link)
            List<String> all = page.getHtml().css("div.p_in li.bk a").links().all();
            String bkUrl = all.get(all.size() - 1);
            log.info("Next page url: {}", bkUrl);
            if (StrUtil.containsAny(bkUrl, "11.html")) {
                log.info("10 list pages crawled, stopping instead of crawling indefinitely");
                return;
            }
            page.addTargetRequest(bkUrl);
        }
    }

    /**
     * Parse a job detail page
     * @param page the detail page
     */
    private void saveJobInfo(Page page) {
        //extract the fields with CSS selectors
        Html html = page.getHtml();
        String companyName = html.css("div.cn p.cname a", "text").get();
        List<String> text = html.css("div.bmsg.inbox p.fp", "text").all();
        String companyAddr = text.get(text.size() - 1);
        String jobName = html.css("div.cn h1", "text").get();
        String jobStr = html.css("p.msg.ltype", "text").get();
        String[] s = StrUtil.split(jobStr, " ");
        String jobAddr = s[0];
        String time = "";
        for (String s1 : s) {
            //the token containing "发布" ("published") carries the posting date
            if (StrUtil.containsAny(s1, "发布")) {
                time = StrUtil.removeAll(s1, "发布");
                break;
            }
        }
        String jobDetail = html.css("div.bmsg.job_msg.inbox", "allText").get();
        String url = page.getUrl().get();
        String salary = html.css("div.in div.cn strong", "text").get();
        JobInfo jobInfo = new JobInfo();
        jobInfo.setJobName(jobName);
        jobInfo.setJobAddr(jobAddr);
        jobInfo.setJobDetail(jobDetail);
        jobInfo.setSalary(salary);
        jobInfo.setUrl(url);
        jobInfo.setTime(time);
        jobInfo.setCompanyName(companyName);
        jobInfo.setCompanyAddr(companyAddr);
        //store the result in ResultItems so the pipeline can persist it
        page.putField("jobInfo", jobInfo);
    }

    //crawler settings: 51job pages are GBK-encoded
    private Site site = Site.me()
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36")
            .setCharset("gbk")
            .setTimeOut(10 * 1000)
            .setRetryTimes(3)
            .setRetrySleepTime(3000);

    @Override
    public Site getSite() {
        return site;
    }
}
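Finally, the custom Pipeline pulls the JobInfo out of ResultItems and persists it through the service (save() is inherited from ServiceImpl):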
package com.hg.spider.webmagic;

import com.hg.spider.entity.JobInfo;
import com.hg.spider.service.JobInfoService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

/**
 * @Author skh
 * @Date 2020/3/21 16:18
 * @Desc Persists crawl results to MySQL
 */
@Component
@Slf4j
public class MysqlPipeline implements Pipeline {

    @Autowired
    private JobInfoService jobInfoService;

    @Override
    public void process(ResultItems resultItems, Task task) {
        //fetch the JobInfo assembled by the page processor
        JobInfo jobInfo = resultItems.get("jobInfo");
        if (jobInfo != null) {
            jobInfoService.save(jobInfo);
        }
    }
}
Run the project and open the following URL in a browser (the app listens on Spring Boot's default port 8080, since application.yml does not configure one):
http://localhost:8080/getJobInfo
The backend will then start crawling.
This is only a simple WebMagic crawler example, but it can serve as a starting point for learning.