WebMagic is a simple, flexible Java crawler framework. Its API lets you implement a crawler with very little code, and crawlers built on WebMagic are efficient and easy to maintain.
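To make the "very little code" claim concrete, here is a minimal sketch of a WebMagic crawler. It is illustrative only: MinimalProcessor and the start URL are placeholders, and the package names follow the repackaged imports (cn.dofuntech.spider.webmagic.*) used later in this article rather than the upstream us.codecraft.webmagic.* ones.

import cn.dofuntech.spider.webmagic.Page;
import cn.dofuntech.spider.webmagic.Site;
import cn.dofuntech.spider.webmagic.Spider;
import cn.dofuntech.spider.webmagic.processor.PageProcessor;

// Minimal illustrative crawler: fetches one page and extracts its <title>.
public class MinimalProcessor implements PageProcessor {

    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    public void process(Page page) {
        // Store the page title in the result items
        page.putField("title", page.getHtml().xpath("//title/text()").toString());
    }

    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        Spider.create(new MinimalProcessor())
                .addUrl("https://jbk.99.com.cn/")   // placeholder start URL
                .thread(1)
                .run();
    }
}

The rest of this article builds a real crawler for symptom data from jbk.99.com.cn. Its settings can be kept in a config.json such as the following: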
{
  "site": {
    # Site domain
    "domain": "139.159.3.18",
    # Request headers, mainly to mimic a browser request
    "headers": {
      "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
      "authorization": "Your own authorization here."
    },
    # If the target site requires login, put the cookie information here
    "cookie": {
      "JSESSIONID": "FBCC0D50EC568B1A7A6EF7FD94C50079"
    },
    "retryTimes": 3,
    "sleepTime": 500
  },
  "base_dir": "/Users/zz/"
}
There are two ways to supply this configuration:
1. Create a Configuration class that reads config.json (a sketch of such a class appears after the example below).
2. Skip config.json entirely and configure everything in code with the Site class. For example:
Site site = Site.me()
        .setRetryTimes(3)
        .setSleepTime(2000)
        .setTimeOut(60000)
        .setCharset("utf-8")
        .addCookie("domain", "name", "value")
        .addCookie("domain", "name", "value");
import java.util.List;
import cn.dofuntech.spider.collector.site99.Configuration;
import cn.dofuntech.spider.webmagic.Page;
import cn.dofuntech.spider.webmagic.Site;
import cn.dofuntech.spider.webmagic.Spider;
import cn.dofuntech.spider.webmagic.pipeline.Pipeline;
import cn.dofuntech.spider.webmagic.processor.PageProcessor;
import cn.dofuntech.spider.webmagic.scheduler.BloomFilterDuplicateRemover;
import cn.dofuntech.spider.webmagic.scheduler.FileCacheQueueScheduler;
import cn.dofuntech.spider.webmagic.selector.Selectable;
import com.hs2e.common.collect.ListUtils;
import com.hs2e.common.lang.StringUtils;
/**
 * PageProcessor that crawls symptom detail information.
 */
public class ZzPageProcessor2 implements PageProcessor {

    // Load the crawler settings via the Configuration class
    private Site site = new Configuration().getSite();

    // Regex matching the pinyin-indexed symptom list pages
    public static final String URL_LIST_PINYIN = "https://jbk.99.com.cn/zz/py/[A-Z]-[0-9]\\.html";

    public void process(Page page) {
        // Pinyin symptom list page
        if (page.getUrl().regex(URL_LIST_PINYIN).match()) {
            // Collect the URLs of all symptom detail pages
            page.addTargetRequests(page.getHtml().xpath("//div[@class=\"part-cont3\"]/dl/dt").links().all());
        }
        // Symptom detail page
        else {
            Selectable selectable = page.getHtml().xpath("//div[@id='d-top2']//li/font");
            List<Selectable> nodes = selectable.nodes();
            // System.out.println("Symptom: " + nodes.get(0).$("font", "text"));
            // System.out.println("Body part: " + nodes.get(1).$("font", "text"));
            List<String> a2 = ListUtils.newArrayList();
            List<String> a3 = ListUtils.newArrayList();
            nodes.get(2).$("font > a").nodes().forEach(a -> {
                a2.add(a.$("a", "text").get());
            });
            nodes.get(3).$("font > a").nodes().forEach(a -> {
                a3.add(a.$("a", "text").get());
            });
            // System.out.println("Departments: " + StringUtils.join(a2, " "));
            // System.out.println("Diseases: " + StringUtils.join(a3, " "));
            page.putField("name", nodes.get(0).$("font", "text").toString());
            page.putField("bw", nodes.get(1).$("font", "text").toString());
            page.putField("dept", StringUtils.join(a2, ",").trim());
            page.putField("disease", StringUtils.join(a3, ",").trim());
        }
    }

    public Site getSite() {
        return site;
    }

    @SuppressWarnings("resource")
    public void start(Pipeline pipeline, String url) {
        String pipelinePath = new Configuration().getZzPath();
        int crawlSize = 100_0000;
        Spider.create(new ZzPageProcessor2())
                .setScheduler(new FileCacheQueueScheduler(pipelinePath)
                        .setDuplicateRemover(new BloomFilterDuplicateRemover(crawlSize)))
                .addUrl(url)
                .addPipeline(pipeline)
                .thread(300)
                .run();
    }

    /**
     * Standalone entry point: starts a crawl from a single pinyin list page.
     * @param args unused
     */
    public static void main(String[] args) {
        String pipelinePath = new Configuration().getZzPath();
        int crawlSize = 100_0000;
        System.out.println(crawlSize);
        Spider.create(new ZzPageProcessor2())
                .setScheduler(// new QueueScheduler()
                        new FileCacheQueueScheduler(pipelinePath)
                                .setDuplicateRemover(new BloomFilterDuplicateRemover(crawlSize)))
                .addPipeline(new ZzPipeline())
                .addUrl("https://jbk.99.com.cn/zz/py/Z-2.html")
                .thread(200)
                .run();
    }
}
WebMagic ships with a number of built-in pipelines that cover most needs: for example, ResultItemsCollectorPipeline keeps every extracted ResultItems object in an in-memory collection, and FilePipeline writes the extracted data (or URLs) to files. You can also implement a custom pipeline:
package cn.dofuntech.spider.collector.site99.download;

import java.util.ArrayList;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import cn.dofuntech.spider.webmagic.ResultItems;
import cn.dofuntech.spider.webmagic.Task;
import cn.dofuntech.spider.webmagic.pipeline.ResultItemsCollectorPipeline;

/**
 * Pipeline that collects symptom detail results in memory.
 *
 * Copyright (C) 2019 dofuntech. All Rights Reserved.
 * @author
 * @version 1.0
 * filename:ZzPipeline.java
 */
public class ZzPipeline extends ResultItemsCollectorPipeline {

    private Logger logger = LoggerFactory.getLogger(getClass());

    List<ResultItems> collector = new ArrayList<ResultItems>();

    @Override
    public void process(ResultItems resultItems, Task task) {
        collector.add(resultItems);
    }

    @Override
    public List<ResultItems> getCollected() {
        return collector;
    }
}
Alternatively, ZzPipeline can extend FilePipeline and write each result to a file on disk (the imports below assume the same repackaged webmagic and commons-codec's DigestUtils):

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Map;

import org.apache.commons.codec.digest.DigestUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import cn.dofuntech.spider.webmagic.ResultItems;
import cn.dofuntech.spider.webmagic.Task;
import cn.dofuntech.spider.webmagic.pipeline.FilePipeline;

public class ZzPipeline extends FilePipeline {

    private Logger logger = LoggerFactory.getLogger(getClass());

    static final String URL = "url";
    static final String RESPONSE = "response";

    /**
     * Create a ZzPipeline with the default path "/data/webporter/".
     */
    public ZzPipeline() {
        setPath("/data/webporter/");
    }

    public ZzPipeline(String path) {
        setPath(path);
    }

    @Override
    public void process(ResultItems resultItems, Task task) {
        String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
        try {
            PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(
                    new FileOutputStream(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")), "UTF-8"));
            Map<String, Object> results = resultItems.getAll();
            printWriter.println(results.get(URL));
            printWriter.println(results.get(RESPONSE));
            printWriter.close();
        } catch (IOException e) {
            logger.warn("write file error", e);
        }
    }
}
Finally, wire the processor and pipeline together and persist the extracted fields:

ZzPipeline zzPipeline = new ZzPipeline();
new ZzPageProcessor2().start(zzPipeline, url);
List<ResultItems> resultItems = zzPipeline.getCollected();
List<BasicZz> list = new ArrayList<>();
if (ListUtils.isNotEmpty(resultItems)) {
    for (ResultItems r : resultItems) {
        try {
            BasicZz z = new BasicZz();
            z.setName(r.get("name").toString());
            z.setDept(r.get("dept").toString());
            z.setDisease(r.get("disease").toString());
            z.setBw(r.get("bw").toString());
            list.add(z);
        } catch (Exception e) {
            continue;
        }
    }
    baseZzService.saveBatch(list);
}
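BasicZz and baseZzService are not shown in the original source: BasicZz is the entity the extracted fields are mapped into, and baseZzService.saveBatch persists the batch. A minimal, hypothetical version of the entity could look like this; the real class likely carries more columns and persistence annotations.

// Hypothetical minimal entity for the extracted symptom fields.
public class BasicZz {

    private String name;    // symptom name
    private String bw;      // body part
    private String dept;    // departments, comma separated
    private String disease; // related diseases, comma separated

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }

    public String getBw() { return bw; }
    public void setBw(String bw) { this.bw = bw; }

    public String getDept() { return dept; }
    public void setDept(String dept) { this.dept = dept; }

    public String getDisease() { return disease; }
    public void setDisease(String disease) { this.disease = disease; }
}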
That is all it takes to put together a simple crawler.