<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>
设置请求header、cookie,很多网站都要专门的header才能正确访问网站,否则会出现访问错误
使用skip来阻止pipeline处理
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
public class GithubRepoPageProcessor implements PageProcessor {

    /**
     * Shared crawl configuration: 3 retries, 1s politeness delay, 10s timeout,
     * and a browser-like User-Agent header (many sites reject requests without one).
     */
    private Site site = Site.me()
            .setRetryTimes(3)
            .setSleepTime(1000)
            .setTimeOut(10000)
            .addHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36");

    @Override
    public void process(Page page) {
        boolean inScope = page.getUrl().toString().startsWith("https://github.com/code4craft");
        if (!inScope) {
            // skip == true means this page is never handed to any pipeline
            page.setSkip(true);
        }
        // Enqueue every link on the current page that matches the target scope.
        page.addTargetRequests(page.getHtml().links().regex("https://github\\.com/code4craft.*").all());
    }

    @Override
    public Site getSite() {
        // Headers and cookies are configured on the Site object above.
        return site;
    }

    public static void main(String[] args) {
        Spider.create(new GithubRepoPageProcessor())
                .addUrl("https://github.com/code4craft")
                .thread(1)
                .run();
    }
}
自定义实现爬取数据的输出,比如调用http接口上传数据
package com.lenovo.spider;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.lenovo.exception.NetException;
import com.lenovo.spider.http.HttpInterface;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
public class QiChaChaPipline implements Pipeline {

    private Logger logger = LoggerFactory.getLogger("spider");

    // BUGFIX: this was declared but never initialized, so the first call to
    // uploadCount.incrementAndGet() below threw a NullPointerException.
    private static final AtomicInteger uploadCount = new AtomicInteger();

    private static ScheduledExecutorService threadPool = Executors.newScheduledThreadPool(1);

    // Bounded hand-off buffer between the crawler thread and the uploader thread
    // (capacity 1,000,000). Typed element parameter replaces the previous raw type.
    private static final LinkedBlockingQueue<JSONObject> dataQueue = new LinkedBlockingQueue<>(100 * 10000);

    public QiChaChaPipline() {
        // Start the single background uploader loop.
        threadPool.execute(() -> upload());
    }

    @Override
    public void process(ResultItems resultItems, Task task) {
        // NOTE(review): assumes the "company_map" result is a Map accepted by the
        // JSONObject constructor — confirm against the producing PageProcessor.
        JSONObject company = new JSONObject(resultItems.get("company_map"));
        if (!dataQueue.offer(company)) {
            // offer() drops silently when the queue is full; at least record the loss.
            logger.warn("dataQueue is full, dropping record: {}", company);
        }
    }

    /**
     * Consumer loop: take one record, upload it via HttpInterface, and re-queue
     * it on network failure. Exits when the thread is interrupted (e.g. on
     * executor shutdown).
     */
    public void upload() {
        while (true) {
            JSONObject company;
            try {
                company = dataQueue.take();
            } catch (InterruptedException e1) {
                // BUGFIX: the interrupt was previously swallowed, so the loop could
                // never terminate. Restore the flag and stop the worker.
                Thread.currentThread().interrupt();
                return;
            }
            try {
                HttpInterface.uploadCompanyInfo(company);
                logger.info(company.toString());
                logger.debug("上传统计:" + uploadCount.incrementAndGet());
            } catch (NetException e) {
                // Network failure: put the record back so it is retried later.
                dataQueue.offer(company);
            }
        }
    }
}
// Register the custom pipeline so extracted results flow into it.
QiChaChaPipline pipline = new QiChaChaPipline();
spider.addPipeline(pipline);
WebMagic默认使用HttpClientDownloader,如果要在下载失败后切换代理ip如下:
// Build the spider: one seed URL, single worker thread.
Spider spider = Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(1);
// Subclass the default downloader so a failed download switches to a proxy IP.
HttpClientDownloader downloader = new HttpClientDownloader(){
@Override
protected void onError(Request request) {
// NOTE(review): this installs a new proxy provider on EVERY failure,
// replacing any previous one — confirm that is the intended policy.
setProxyProvider(SimpleProxyProvider.from(new Proxy("10.10.10.10", 8888)));
}
};
spider.setDownloader(downloader);
使用SeleniumDownloader下载js动态渲染过后的网页,不过SeleniumDownloader有点软肋就是默认的WebDriverPool不能设置代理ip,要能设置代理ip必须重写WebDriverPool和SeleniumDownloader
默认的scheduler就会自动去重已经爬取过的url
使用RedisScheduler实现分布式爬虫,共享爬取队列,重启爬虫不会导致从头开始爬
// Redis-backed scheduler: the URL queue and de-dup set are shared, which enables
// distributed crawling and resuming after a restart instead of starting over.
RedisScheduler scheduler = new RedisScheduler(new JedisPool("10.100.124.206", 6379));
// Alternatives: file-backed queue, or the default in-memory queue.
// FileCacheQueueScheduler scheduler = new FileCacheQueueScheduler("urls");
// QueueScheduler scheduler = new QueueScheduler();
spider.setScheduler(scheduler);
设置页面下载成功、失败的监听器,方便失败的时候做一些善后处理,比如把失败的url再加入到爬取队列里面,这样就不会遗漏一些页面的爬取
手动添加爬取url到待爬队列,通过设置Request.CYCLE_TRIED_TIMES设置失败重试次数,可以强制把url加到待爬队列里面,避免去重机制把url看成已爬过的
// Program to the interface with an explicit element type — the raw
// "ArrayList listeners" forced an unchecked call to setSpiderListeners.
List<SpiderListener> listeners = new ArrayList<>();
listeners.add(new SpiderListener() {
    @Override
    public void onSuccess(Request request) {}

    @Override
    public void onError(Request request) {
        // Bump CYCLE_TRIED_TIMES and re-queue the failed request; setting this
        // extra is what bypasses de-duplication so the URL is actually retried.
        Integer cycleTriedTimes =
                (Integer) request.getExtra(Request.CYCLE_TRIED_TIMES);
        request.putExtra(Request.CYCLE_TRIED_TIMES,
                cycleTriedTimes == null ? 1 : cycleTriedTimes + 1);
        spider.addRequest(request);
    }
});
spider.setSpiderListeners(listeners);
因为css选择器语法在各开源实现中的支持有很多细节不统一,而xpath有统一的标准,一般不会出错
WebMagic使用的是slf4j api打印日志,只需加入log4j.xml日志配置文件到classpath路径就可以打印日志