I had some spare time recently and wrote a crawler that integrates the WebMagic crawler framework with Spring Boot. If you are not familiar with WebMagic, take a look at the official site first to see what it is; along the way I will also mention a few pitfalls I ran into with Spring Boot.
The WebMagic official site is linked here; it is very easy to pick up.
First, the project's Spring Boot pom.xml:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>zhy_springboot</groupId>
    <artifactId>zhy_springboot</artifactId>
    <version>1.0.0</version>
    <packaging>jar</packaging>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.0.2.RELEASE</version>
    </parent>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>com.github.lftao</groupId>
            <artifactId>jkami</artifactId>
            <version>1.0.8</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.6</version>
        </dependency>
        <dependency>
            <groupId>com.codeborne</groupId>
            <artifactId>phantomjsdriver</artifactId>
            <version>1.2.1</version>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-devtools</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.apache.tomcat.embed</groupId>
            <artifactId>tomcat-embed-jasper</artifactId>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.elasticsearch.client</groupId>
            <artifactId>transport</artifactId>
        </dependency>
        <dependency>
            <groupId>org.elasticsearch.plugin</groupId>
            <artifactId>delete-by-query</artifactId>
            <version>2.3.2</version>
        </dependency>
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-tomcat</artifactId>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>javax.servlet</groupId>
            <artifactId>jstl</artifactId>
        </dependency>
        <dependency>
            <groupId>taglibs</groupId>
            <artifactId>standard</artifactId>
            <version>1.1.2</version>
        </dependency>
        <dependency>
            <groupId>commons-codec</groupId>
            <artifactId>commons-codec</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-configuration-processor</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis</artifactId>
            <version>3.4.6</version>
        </dependency>
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>1.3.2</version>
        </dependency>
        <dependency>
            <groupId>tk.mybatis</groupId>
            <artifactId>mapper-spring-boot-starter</artifactId>
            <version>2.0.2</version>
        </dependency>
        <dependency>
            <groupId>tk.mybatis</groupId>
            <artifactId>mapper</artifactId>
            <version>4.0.2</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-core</artifactId>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.datatype</groupId>
            <artifactId>jackson-datatype-joda</artifactId>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.module</groupId>
            <artifactId>jackson-module-parameter-names</artifactId>
        </dependency>
        <dependency>
            <groupId>com.github.pagehelper</groupId>
            <artifactId>pagehelper-spring-boot-starter</artifactId>
            <version>1.2.5</version>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-thymeleaf</artifactId>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid-spring-boot-starter</artifactId>
            <version>1.1.9</version>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-log4j2</artifactId>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.dataformat</groupId>
            <artifactId>jackson-dataformat-yaml</artifactId>
        </dependency>
        <dependency>
            <groupId>javax.persistence</groupId>
            <artifactId>persistence-api</artifactId>
            <version>1.0.2</version>
        </dependency>
        <dependency>
            <groupId>javax.servlet</groupId>
            <artifactId>javax.servlet-api</artifactId>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
                <dependencies>
                    <dependency>
                        <groupId>org.springframework</groupId>
                        <artifactId>springloaded</artifactId>
                        <version>1.2.8.RELEASE</version>
                    </dependency>
                </dependencies>
                <configuration>
                    <fork>true</fork>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <configuration>
                    <skipTests>true</skipTests>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
Next, the Spring Boot YAML configuration:
server:
  port: 8080

spring:
  # HTTP encoding
  http:
    encoding.charset: UTF-8
    encoding.enabled: true
    encoding.force: true
  # Thymeleaf
  thymeleaf:
    cache: false
    encoding: UTF-8
    content: text/html
    prefix: /WEB-INF/jsp/
    suffix: .jsp
  # MVC
  mvc:
    static-path-pattern: /WEB-INF/resources/**
  resources:
    static-locations: /WEB-INF/resources/
  # DataSource
  datasource:
    type: com.alibaba.druid.pool.DruidDataSource
    base:
      type: com.alibaba.druid.pool.DruidDataSource
      driver-class-name: com.mysql.jdbc.Driver
      url: jdbc:mysql://localhost:3306/zhy?useUnicode=true&characterEncoding=utf-8
      username: root
      password: 123456
      name: baseDb
      initial-size: 1
      min-idle: 1
      max-active: 20
      # maximum wait time when acquiring a connection
      max-wait: 60000
      # how often to check for idle connections that should be closed
      time-between-eviction-runs-millis: 60000
      # minimum time a connection must sit idle in the pool before it may be evicted
      min-evictable-idle-time-millis: 300000
      validation-query: SELECT 'x'
      test-while-idle: true
      test-on-borrow: false
      test-on-return: false
      # enable PSCache and set its size per connection: true for Oracle, false for MySQL;
      # false is also recommended with heavy sharding
      pool-prepared-statements: false
      max-pool-prepared-statement-per-connection-size: 20
    second:
      type: com.alibaba.druid.pool.DruidDataSource
      driver-class-name: com.mysql.jdbc.Driver
      url: jdbc:mysql://localhost:3306/mvs?useUnicode=true&characterEncoding=utf-8
      username: root
      password: 123456
      name: secondDb
      initial-size: 1
      min-idle: 1
      max-active: 20
      # maximum wait time when acquiring a connection
      max-wait: 60000
      # how often to check for idle connections that should be closed
      time-between-eviction-runs-millis: 60000
      # minimum time a connection must sit idle in the pool before it may be evicted
      min-evictable-idle-time-millis: 300000
      validation-query: SELECT 'x'
      test-while-idle: true
      test-on-borrow: false
      test-on-return: false
      # enable PSCache and set its size per connection: true for Oracle, false for MySQL;
      # false is also recommended with heavy sharding
      pool-prepared-statements: false
      max-pool-prepared-statement-per-connection-size: 20

# PageHelper
pagehelper:
  helperDialect: mysql
  reasonable: true
  supportMethodsArguments: true
  params: count=countSql
  returnPageInfo: check

# custom "person" properties (for testing configuration binding)
person:
  age: 18
  name: Jack
  sex: boy
  hobbies: football,basketball,movies
  family:
    father: Tommy
    mother: Rose
    sister: Tina

# logging
logging:
  config: classpath:log4j.xml
Note: the two databases are only there to test a multi-datasource setup.
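For reference, here is a minimal sketch of how the two Druid pools above could be wired as separate Spring beans; the package, class name and bean names are illustrative and not taken from the project:
package com.zhy.springboot.config;

import javax.sql.DataSource;

import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Primary;

import com.alibaba.druid.spring.boot.autoconfigure.DruidDataSourceBuilder;

// Illustrative only: binds the spring.datasource.base / spring.datasource.second
// sections of the YAML above to two independent Druid pools.
@Configuration
public class DataSourceConfig {

    @Bean(name = "baseDataSource")
    @Primary
    @ConfigurationProperties(prefix = "spring.datasource.base")
    public DataSource baseDataSource() {
        return DruidDataSourceBuilder.create().build();
    }

    @Bean(name = "secondDataSource")
    @ConfigurationProperties(prefix = "spring.datasource.second")
    public DataSource secondDataSource() {
        return DruidDataSourceBuilder.create().build();
    }
}
Each MyBatis SqlSessionFactory (or JdbcTemplate) can then be pointed at one of these beans with @Qualifier.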
I won't cover the startup class and the like; let's get into the crawler itself.
First we need the list page that holds the links to crawl — that is, the novel's index page on Dingdian. We'll test with the link below:
https://www.dingdiann.com/ddk75013/
All of WebMagic's extraction logic lives in a PageProcessor, the interface where you define what to extract. Implement it with your own extraction rules and you have a complete crawler. Without further ado, the code.
Start crawling:
package com.zhy.springboot.crawler;
import java.security.SecureRandom;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import javax.net.ssl.X509TrustManager;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.ui.ModelMap;
import com.zhy.springboot.model.TBookInfo;
import com.zhy.springboot.service.iservice.ITBookInfoService;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
@Component
public class BookInfoCrawler {
@Autowired
private ITBookInfoService bookInfoService;
/**
* @Title: BookInfoCrawler.java
* @Package com.zhy.springboot.crawler
* @Description: save the basic information of a book
* @author John_Hawkings
* @date 2018-11-29
* @version V1.0
*/
public void startCraw(ModelMap model) {
try {
TBookInfo tbi = new TBookInfo();
trustEveryone();
Document document = Jsoup.connect(model.get("baseUrl").toString())
.userAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36")
.get();
tbi.setBookAuthor(StringUtils.isBlank(document.getElementById("info").getElementsByTag("p").get(0).text().split(":")[1])?"":document.getElementById("info").getElementsByTag("p").get(0).text().split(":")[1]);//
tbi.setBookName(StringUtils.isBlank(document.getElementById("info").getElementsByTag("h1").text())?"":document.getElementById("info").getElementsByTag("h1").text());
String text = document.getElementById("list").getElementsByTag("dd").get(0).getElementsByTag("a").text();
tbi.setLastFieldName(StringUtils.isBlank(text)?"":text);
if(StringUtils.isNotBlank(text)) {
Pattern pattern = Pattern.compile("\\d+");
Matcher matcher = pattern.matcher(text);
if (matcher.find()) {
tbi.setBookTotalFields(Integer.valueOf(matcher.group()));
}
}
String dateStr = document.getElementById("info").getElementsByTag("p").get(2).text().split(":")[1];
if(StringUtils.isNotBlank(dateStr)) {
tbi.setLastUpdateTime(new SimpleDateFormat("yy-MM-dd hh:mm:ss").parse(dateStr));
}
tbi.setUpdateTime(new Date());
tbi.setCreateTime(new Date());
bookInfoService.insert(tbi);
if(model.get("baseUrl").toString().endsWith("/")) {
storeDetails(tbi,model.get("baseUrl").toString());
}else {
storeDetails(tbi,model.get("baseUrl").toString()+"/");
}
} catch (Exception e) {
e.printStackTrace();
}
}
/**
* @Title: BookInfoCrawler.java
* @Package com.zhy.springboot.crawler
* @Description: save the chapter details of a book
* @author John_Hawkings
* @date 2018-11-29
* @version V1.0
*
* create(Site.me()
.setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36"),jobInfoDaoPipeline, LieTouJobInfo.class)
.addUrl("https://www.liepin.com/sojob/?dqs=020&curPage=0")
.thread(5)
.run();
*/
private void storeDetails(TBookInfo tbi, String firstUrl) {
Spider mySpider = Spider.create(new DDNovelProcessor(tbi,firstUrl))
//start crawling from the "list page"
.addPipeline(new BookDetailsMapperPipeline())//pipeline that persists the extracted data
.addUrl(firstUrl)
//crawl with 10 threads
.thread(10);
// .run();
//fall back to an IP proxy after a download failure
HttpClientDownloader downloader = new HttpClientDownloader(){
@Override
protected void onError(Request request) {
setProxyProvider(SimpleProxyProvider.from(new Proxy("27.214.112.102",9000)));
}
};
mySpider.setDownloader(downloader)
//start the spider
.run();
}
/**
* Trust every site so that HTTPS pages can be fetched normally.
*
*/
public void trustEveryone() {
try {
HttpsURLConnection.setDefaultHostnameVerifier(new HostnameVerifier() {
public boolean verify(String hostname, SSLSession session) {
return true;
}
});
SSLContext context = SSLContext.getInstance("TLS");
context.init(null, new X509TrustManager[] { new X509TrustManager() {
public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
}
public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
}
public X509Certificate[] getAcceptedIssuers() {
return new X509Certificate[0];
}
} }, new SecureRandom());
HttpsURLConnection.setDefaultSSLSocketFactory(context.getSocketFactory());
} catch (Exception e) {
// e.printStackTrace();
}
}
public static void main(String[] args) throws Exception{
BookInfoCrawler bc = new BookInfoCrawler();
bc.trustEveryone();
//https://www.dingdiann.com/ddk75013/
Document document = Jsoup.connect("https://www.dingdiann.com/ddk75013/")
.userAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36")
.get();
/*String text = document.getElementById("list").getElementsByTag("dd").get(0).getElementsByTag("a").text();
Pattern pattern = Pattern.compile("\\d+");
Matcher matcher = pattern.matcher(text);
if (matcher.find()) {
System.out.println(matcher.group());
}
*/
Date parse = new SimpleDateFormat("yy-MM-dd hh:mm:ss").parse(document.getElementById("info").getElementsByTag("p").get(2).text().split(":")[1]);
System.out.println("DATE:"+parse);
}
}
At the end you can see a block of code that handles HTTPS for Jsoup; without it, Jsoup cannot fetch data from HTTPS pages.
Starting WebMagic:
Spider mySpider = Spider.create(new DDNovelProcessor(tbi,firstUrl))
//start crawling from the "list page"
.addPipeline(new BookDetailsMapperPipeline())//pipeline that persists the extracted data
.addUrl(firstUrl)
//crawl with 10 threads
.thread(10);
// .run();
//fall back to an IP proxy after a download failure
HttpClientDownloader downloader = new HttpClientDownloader(){
@Override
protected void onError(Request request) {
setProxyProvider(SimpleProxyProvider.from(new Proxy("27.214.112.102",9000)));
}
};
mySpider.setDownloader(downloader)
//start the spider
.run();
As you can see, I register a pipeline when building the spider. The pipeline is the module that persists the data: every worker thread, after taking a URL from the target-request queue (filled via addTargetRequests) and finishing the extraction logic, invokes this pipeline once to persist what was extracted. My persistence is MySQL-based. If you crawl too aggressively the site may detect you and block your IP, so it's best to add an IP proxy here; free proxy IPs can be found online.
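If you do go the proxy route, WebMagic 0.7.x also lets you register a small pool of proxies up front instead of only switching after a failure; a minimal sketch (the addresses below are placeholders, not real proxies):
//Sketch only: register several proxies; SimpleProxyProvider cycles through them.
HttpClientDownloader proxyDownloader = new HttpClientDownloader();
proxyDownloader.setProxyProvider(SimpleProxyProvider.from(
        new Proxy("127.0.0.1", 8888),
        new Proxy("127.0.0.1", 8889)));
mySpider.setDownloader(proxyDownloader);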
The extraction logic:
package com.zhy.springboot.crawler;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openqa.selenium.NoSuchElementException;
import org.springframework.stereotype.Component;
import com.alibaba.druid.util.StringUtils;
import com.zhy.springboot.model.TBookDetails;
import com.zhy.springboot.model.TBookInfo;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
@Component
public class DDNovelProcessor implements PageProcessor {
private TBookInfo bookeInfo;
private String baseUrl;
// Part 1: site-level crawl configuration: encoding, crawl interval, retry count, and so on
private Site site = Site.me().setUserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36")
.setCharset("UTF-8")
.setSleepTime(1000)
.setRetrySleepTime(500)
.setRetryTimes(3);
public DDNovelProcessor(TBookInfo bookeInfo , String baseUrl) {
this.bookeInfo = bookeInfo;
this.baseUrl =baseUrl;
}
public DDNovelProcessor() {
}
@Override
public Site getSite() {
return site;
}
// process() is the core method where the custom extraction logic is written
@Override
// Part 2: define how to extract the page information and save it
public void process(Page page) {
try {
Document document = Jsoup.parse(page.getRawText());
//first pass (the list page): collect all chapter links
if(!page.getUrl().regex("https://www.dingdiann.com/ddk75013/\\d+.html").match()) {
Elements elementsByTag = document.getElementById("list").getElementsByTag("a");
Map<String, Integer> map = new HashMap<>();
for (Element element : elementsByTag) {
map.put(baseUrl+element.attr("href").split("/")[2], 0);
}
page.addTargetRequests(new ArrayList<>(map.keySet()));
}else {
//subsequent pages are chapter pages: store the chapter info directly
String title = document.getElementsByClass("bookname").get(0).getElementsByTag("h1").text();
TBookDetails tbd = new TBookDetails();
tbd.setBookId(bookeInfo.getId());
tbd.setBookTitle(title);
tbd.setBookContent(document.getElementById("content").text());
tbd.setCreateTime(new Date());
tbd.setUpdateTime(new Date());
String sortStr = title.split(" ")[0].substring(1, title.split(" ")[0].length()-1);
if(StringUtils.isNumber(sortStr)) {
tbd.setBookFieldSort(Integer.valueOf(sortStr));
}else {
tbd.setBookFieldSort(Integer.valueOf(chineseNumber2Int(sortStr)));
}
page.putField("allFileds", tbd);
}
} catch (Exception e) {
e.printStackTrace();
}
}
/**
* @Title: DDNovelProcessor.java
* @Package com.zhy.springboot.crawler
* @Description: check whether an element exists
* @author John_Hawkings
* @date 2018-11-29
* @version V1.0
*/
@SuppressWarnings("unused")
private boolean doesElementExist(Document document, String id) {
try
{
document.getElementById(id);
return true;
}
catch (NoSuchElementException e)
{
return false;
}
}
/**
* @Title: DDNovelProcessor.java
* @Package com.zhy.springboot.crawler
* @Description: convert a Chinese numeral string to an Arabic number
* @author John_Hawkings
* @date 2018-11-29
* @version V1.0
*/
public static int chineseNumber2Int(String chineseNumber){
int result = 0;
int temp = 1;//holds the value of the current group, e.g. 十万 (100,000)
int count = 0;//whether a unit character (chArr) has been seen in the current group
char[] cnArr = new char[]{'一','二','三','四','五','六','七','八','九'};
char[] chArr = new char[]{'十','百','千','万','亿'};
for (int i = 0; i < chineseNumber.length(); i++) {
boolean b = true;//true if the current character is a unit character (chArr)
char c = chineseNumber.charAt(i);
for (int j = 0; j < cnArr.length; j++) {//not a unit character, i.e. a digit
if (c == cnArr[j]) {
if(0 != count){//before starting the next group, add the previous group's value to the result
result += temp;
temp = 1;
count = 0;
}
// index + 1 is the digit's value
temp = j + 1;
b = false;
break;
}
}
if(b){//a unit character: {'十','百','千','万','亿'}
for (int j = 0; j < chArr.length; j++) {
if (c == chArr[j]) {
switch (j) {
case 0:
temp *= 10;
break;
case 1:
temp *= 100;
break;
case 2:
temp *= 1000;
break;
case 3:
temp *= 10000;
break;
case 4:
temp *= 100000000;
break;
default:
break;
}
count++;
}
}
}
if (i == chineseNumber.length() - 1) {//last character reached
result += temp;
}
}
return result;
}
public static void main(String[] args) {
System.out.println(DDNovelProcessor.chineseNumber2Int("四百五十五"));
}
}
I'm not very familiar with WebMagic's own page-parsing utilities, so I didn't use them; I simply call page.getRawText() to get the raw HTML and hand it to Jsoup. You can see that I branch on the URL after getting the page: on the list page we collect all the chapter links and add them to the crawl queue (the target URLs), so the pages that come in afterwards are no longer the list page but the chapter detail pages taken from that queue, and they go through the detail-page extraction logic instead. Two things worth mentioning: first, on Dingdian the chapter numbers in the titles appear both as Chinese numerals and as Arabic numerals, so I added a Chinese-numeral conversion; second, many chapters are listed twice, so I used a HashMap to drop the duplicate links.
If that still isn't clear, here is WebMagic's execution flow once more: you give it an initial URL; on that page you collect every link you need and add them to the queue with addTargetRequests; the worker threads then take links from that queue and extract data from them. You can even crawl sites with several different layouts — you only need to add a check that matches each page type to its link pattern once the page is fetched. After a thread finishes extracting, you put the data into WebMagic's result container with page.putField("allFileds", tbd); when the thread completes, the pipeline is triggered automatically to persist the data, once per processed page, until the crawl queue has no links left.
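As an aside, the link collection on the list page could also be done with WebMagic's own selectors instead of raw Jsoup; a rough, untested sketch of the equivalent branch:
//Sketch only: page.getHtml().links() returns the page's links as absolute URLs,
//so no manual baseUrl concatenation is needed; the regex keeps only chapter pages.
if (!page.getUrl().regex("https://www\\.dingdiann\\.com/ddk75013/\\d+\\.html").match()) {
    page.addTargetRequests(page.getHtml()
            .links()
            .regex("https://www\\.dingdiann\\.com/ddk75013/\\d+\\.html")
            .all());
}
Note that WebMagic's default scheduler already de-duplicates target URLs, so the HashMap trick above is mainly a belt-and-braces measure.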
The data-persistence pipeline:
package com.zhy.springboot.crawler;
import java.util.Map;
import com.zhy.springboot.model.TBookDetails;
import com.zhy.springboot.service.iservice.ITBookDetailsService;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
public class BookDetailsMapperPipeline implements Pipeline{
private ITBookDetailsService bookDetailsService;
@Override
public void process(ResultItems resultitems, Task task) {
try {
//Spring cannot inject beans here (the pipeline is created with new and runs on the crawler's worker threads), so fetch the bean through a helper class instead
bookDetailsService = ApplicationContextProvider.getBean(ITBookDetailsService.class);
if(null==bookDetailsService) {
return;
}
Map<String, Object> allMap = resultitems.getAll();
TBookDetails tbd = allMap.get("allFileds")==null?null:(TBookDetails)allMap.get("allFileds");
if(null==tbd) {
return;
}
bookDetailsService.insert(tbd);
} catch (Exception e) {
e.printStackTrace();
}
}
}
Here comes the first Spring Boot pitfall: in this multi-threaded crawl, even though the class is annotated and picked up by component scanning, the bean you want is never injected (the pipeline is instantiated with new rather than obtained from the Spring container, so @Autowired has nothing to work on). Don't panic: if it can't be injected, we can fetch it from the container ourselves. The lookup code is as follows:
package com.zhy.springboot.crawler;
import org.springframework.beans.BeansException;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.stereotype.Component;
/**
* Author:ZhuShangJin
* Date:2018/7/3
*/
@Component
public class ApplicationContextProvider implements ApplicationContextAware {
/**
* The ApplicationContext instance
*/
private static ApplicationContext applicationContext;
@SuppressWarnings("static-access")
@Override
public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
this.applicationContext = applicationContext;
}
/**
* Get the ApplicationContext
*
* @return
*/
public static ApplicationContext getApplicationContext() {
return applicationContext;
}
/**
* Get a bean by name.
*
* @param name
* @return
*/
public static Object getBean(String name) {
return getApplicationContext().getBean(name);
}
/**
* Get a bean by class.
*
* @param clazz
* @param <T>
* @return
*/
public static <T> T getBean(Class<T> clazz) {
return getApplicationContext().getBean(clazz);
}
/**
* Get a bean by name and class.
*
* @param name
* @param clazz
* @param <T>
* @return
*/
public static <T> T getBean(String name, Class<T> clazz) {
return getApplicationContext().getBean(name, clazz);
}
}
We use the application context to fetch the bean directly. With that, the crawler is essentially complete; I won't go into the remaining details.
Finally, the results of the crawl: the more threads you open the faster it goes, but keep it within reason — a novel of nearly 400 chapters takes only three or four seconds.
And finally, the project source code: the crawler integration code.
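An alternative that avoids the static lookup altogether is to let Spring build the pipeline and hand the managed instance to the Spider, so @Autowired works inside it as usual. This is not how the project above is wired, just a sketch of the option:
//Sketch only: annotate BookDetailsMapperPipeline with @Component and inject it,
//instead of creating it with new inside storeDetails().
@Component
public class BookInfoCrawler {

    @Autowired
    private BookDetailsMapperPipeline bookDetailsMapperPipeline;

    private void storeDetails(TBookInfo tbi, String firstUrl) {
        Spider.create(new DDNovelProcessor(tbi, firstUrl))
              .addPipeline(bookDetailsMapperPipeline) //Spring-managed, so its own dependencies are injected
              .addUrl(firstUrl)
              .thread(10)
              .run();
    }
}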