// 这系列文章主要分析webmagic框架,没有实战内容,如有实战问题可以讨论,也可以提供技术支持。
// 欢迎加群313557283(刚创建),小白互相学习~
package us.codecraft.webmagic;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.SerializationUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.pipeline.CollectorPipeline;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.thread.CountableThreadPool;
import us.codecraft.webmagic.utils.UrlUtils;
import us.codecraft.webmagic.utils.WMCollections;
import java.io.Closeable;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
/**
* Entrance of a crawler.
* A spider contains four modules: Downloader, Scheduler, PageProcessor and
* Pipeline.
* Every module is a field of Spider.
* The modules are defined in interface.
* You can customize a spider with various implementations of them.
* Examples:
*
* A simple crawler:
* Spider.create(new SimplePageProcessor("http://my.oschina.net/",
* "http://my.oschina.net/*blog/*")).run();
*
* Store results to files by FilePipeline:
* Spider.create(new SimplePageProcessor("http://my.oschina.net/",
* "http://my.oschina.net/*blog/*"))
* .pipeline(new FilePipeline("/data/temp/webmagic/")).run();
*
* Use FileCacheQueueScheduler to store urls and cursor in files, so that a
* Spider can resume the status when shutdown.
* Spider.create(new SimplePageProcessor("http://my.oschina.net/",
* "http://my.oschina.net/*blog/*"))
* .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
*
* @author [email protected]
* @see Downloader
* @see Scheduler
* @see PageProcessor
* @see Pipeline
* @since 0.1.0
*/
// Spider implements two interfaces: Runnable and Task.
// For reference, the Task interface declares these two methods:
//
//     // Returns a uuid for each task.
//     public String getUUID();
//
//     // Returns the site object.
//     public Site getSite();
public class Spider implements Runnable, Task {
/** Downloader that fetches pages; {@link #initComponent()} creates an HttpClientDownloader when this is null. */
protected Downloader downloader;
/** Pipelines that receive the result items of each processed page. */
protected List<Pipeline> pipelines = new ArrayList<Pipeline>();
/** Processor that parses a downloaded page and extracts results and follow-up requests. */
protected PageProcessor pageProcessor;
/** Initial requests; pushed to the scheduler and then cleared by {@link #initComponent()}. */
protected List<Request> startRequests;
/** Site settings, taken from the page processor in the constructor. */
protected Site site;
/** Explicit identifier of this spider; see {@link #getUUID()} for the fallbacks used when it is null. */
protected String uuid;
/** Scheduler that queues the requests still to be crawled; a QueueScheduler unless replaced. */
protected Scheduler scheduler = new QueueScheduler();
/** Logger named after the runtime class of the spider. */
protected Logger logger = LoggerFactory.getLogger(getClass());
/** Pool that runs one task per polled request; created in {@link #initComponent()}. */
protected CountableThreadPool threadPool;
/** Optional caller-supplied executor that the thread pool wraps when it is set and not shut down. */
protected ExecutorService executorService;
/** Number of worker threads, also handed to the downloader. */
protected int threadNum = 1;
/** Current state, one of the STAT_* constants below. */
protected AtomicInteger stat = new AtomicInteger(STAT_INIT);
/** When true, run() ends once the scheduler is empty and no worker is alive. */
protected boolean exitWhenComplete = true;
/** State: created, never run. */
protected static final int STAT_INIT = 0;
/** State: the run() loop is active. */
protected static final int STAT_RUNNING = 1;
/** State: run() has finished or stop() was called. */
protected static final int STAT_STOPPED = 2;
/** When true, requests extracted from processed pages are added to the scheduler. */
protected boolean spawnUrl = true;
/** When true, run() calls close() before it returns. */
protected boolean destroyWhenExit = true;
/** Lock guarding {@link #newUrlCondition}. */
private ReentrantLock newUrlLock = new ReentrantLock();
/** Signalled when a request was added or a worker finished, to wake the waiting run() loop. */
private Condition newUrlCondition = newUrlLock.newCondition();
/** Listeners notified after each request succeeds or fails; may be null. */
private List<SpiderListener> spiderListeners;
/** Number of requests handled so far, counted whether they succeeded or failed. */
private final AtomicLong pageCount = new AtomicLong(0);
/** Time at which {@link #initComponent()} last ran. */
private Date startTime;
/** Maximum time in milliseconds the run() loop waits for a new request (30 seconds by default). */
private int emptySleepTime = 30000;
/**
 * Static factory: builds a spider around the given page processor.
 *
 * @param pageProcessor processor used to parse downloaded pages
 * @return a newly constructed spider
 * @see PageProcessor
 */
public static Spider create(PageProcessor pageProcessor) {
    Spider spider = new Spider(pageProcessor);
    return spider;
}
//默认要初始化site
/**
* create a spider with pageProcessor.
*
* @param pageProcessor pageProcessor
*/
public Spider(PageProcessor pageProcessor) {
this.pageProcessor = pageProcessor;
this.site = pageProcessor.getSite();
}
/**
 * Sets the start urls of the spider; they take precedence over the start
 * urls of the Site.
 *
 * @param startUrls urls to start crawling from
 * @return this
 * @throws IllegalStateException if the spider is already running
 */
public Spider startUrls(List<String> startUrls) {
    // configuration may not change while the spider is running
    checkIfRunning();
    // each url is wrapped in a Request
    this.startRequests = UrlUtils.convertToRequests(startUrls);
    return this;
}
/**
 * Sets the start requests of the spider; they take precedence over the
 * start urls of the Site.
 *
 * @param startRequests requests to start crawling from, stored as given
 * @return this
 * @throws IllegalStateException if the spider is already running
 */
public Spider startRequest(List<Request> startRequests) {
    // configuration may not change while the spider is running
    checkIfRunning();
    // already Request objects, so no conversion is needed
    this.startRequests = startRequests;
    return this;
}
/**
 * Assigns an explicit uuid to this spider. Without one, {@link #getUUID()}
 * falls back to the domain of the site.
 *
 * @param uuid identifier of this spider
 * @return this
 */
public Spider setUUID(String uuid) {
    this.uuid = uuid;
    return this;
}
/**
 * Sets the scheduler of the spider by delegating to
 * {@link #setScheduler(us.codecraft.webmagic.scheduler.Scheduler)}.
 *
 * @param scheduler scheduler
 * @return the result of setScheduler
 * @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler)
 */
@Deprecated
public Spider scheduler(Scheduler scheduler) {
    Spider configured = setScheduler(scheduler);
    return configured;
}
/**
 * Replaces the scheduler of the spider. Requests still queued in the
 * previous scheduler are polled one by one and pushed to the new one.
 *
 * @param scheduler scheduler to use from now on
 * @return this
 * @throws IllegalStateException if the spider is already running
 * @see Scheduler
 * @since 0.2.1
 */
public Spider setScheduler(Scheduler scheduler) {
    checkIfRunning();
    final Scheduler previous = this.scheduler;
    this.scheduler = scheduler;
    if (previous == null) {
        return this;
    }
    // drain the previous scheduler into the new one
    for (Request pending = previous.poll(this); pending != null; pending = previous.poll(this)) {
        this.scheduler.push(pending, this);
    }
    return this;
}
/**
 * Adds a pipeline to the spider by delegating to
 * {@link #addPipeline(us.codecraft.webmagic.pipeline.Pipeline)}.
 *
 * @param pipeline pipeline
 * @return the result of addPipeline
 * @see #addPipeline(us.codecraft.webmagic.pipeline.Pipeline)
 * @deprecated use {@link #addPipeline(us.codecraft.webmagic.pipeline.Pipeline)} instead
 */
@Deprecated
public Spider pipeline(Pipeline pipeline) {
    return addPipeline(pipeline);
}
/**
 * Appends a pipeline to the pipelines of the spider.
 *
 * @param pipeline pipeline to append
 * @return this
 * @throws IllegalStateException if the spider is already running
 * @see Pipeline
 * @since 0.2.1
 */
public Spider addPipeline(Pipeline pipeline) {
    checkIfRunning();
    pipelines.add(pipeline);
    return this;
}
/**
 * Replaces all pipelines of the spider with the given list. The list is
 * stored as given, not copied.
 *
 * @param pipelines pipelines to use
 * @return this
 * @throws IllegalStateException if the spider is already running
 * @see Pipeline
 * @since 0.4.1
 */
public Spider setPipelines(List<Pipeline> pipelines) {
    checkIfRunning();
    this.pipelines = pipelines;
    return this;
}
/**
 * Discards all configured pipelines by installing a new empty list.
 *
 * @return this
 */
public Spider clearPipeline() {
    pipelines = new ArrayList<Pipeline>();
    return this;
}
/**
 * Sets the downloader of the spider by delegating to
 * {@link #setDownloader(us.codecraft.webmagic.downloader.Downloader)}.
 *
 * @param downloader downloader
 * @return the result of setDownloader
 * @see #setDownloader(us.codecraft.webmagic.downloader.Downloader)
 * @deprecated use {@link #setDownloader(us.codecraft.webmagic.downloader.Downloader)} instead
 */
@Deprecated
public Spider downloader(Downloader downloader) {
    return setDownloader(downloader);
}
/**
 * Sets the downloader the spider uses to fetch pages.
 *
 * @param downloader downloader to use
 * @return this
 * @throws IllegalStateException if the spider is already running
 * @see Downloader
 */
public Spider setDownloader(Downloader downloader) {
    checkIfRunning();
    this.downloader = downloader;
    return this;
}
/**
 * Prepares the spider for a run: fills in the default downloader and
 * pipeline, creates the worker pool if needed, pushes the start requests
 * to the scheduler and records the start time.
 */
protected void initComponent() {
    if (downloader == null) {
        downloader = new HttpClientDownloader();
    }
    if (pipelines.isEmpty()) {
        pipelines.add(new ConsolePipeline());
    }
    // the downloader is told how many threads will use it (one by default)
    downloader.setThread(threadNum);
    // a new pool is needed on first use or after the previous one was shut down
    final boolean poolUnusable = threadPool == null || threadPool.isShutdown();
    if (poolUnusable) {
        final boolean executorUsable = executorService != null && !executorService.isShutdown();
        threadPool = executorUsable
                ? new CountableThreadPool(threadNum, executorService)
                : new CountableThreadPool(threadNum);
    }
    // hand the start requests to the scheduler, then empty the list
    if (startRequests != null) {
        for (Request startRequest : startRequests) {
            addRequest(startRequest);
        }
        startRequests.clear();
    }
    startTime = new Date();
}
/**
 * Runs the crawl loop on the calling thread.
 * Requests are polled from the scheduler and each one is handed to the
 * worker pool. The loop ends when the thread is interrupted, when the
 * state leaves RUNNING (see stop()), or, with exitWhenComplete set, when
 * the scheduler is empty and no worker is alive.
 * The state is always set to STOPPED on the way out, even when
 * initialisation or the loop throws, so the spider is never left stuck in
 * RUNNING.
 *
 * @throws IllegalStateException if the spider is already running
 */
@Override
public void run() {
    // atomically move the state to RUNNING
    checkRunningStat();
    try {
        initComponent();
        logger.info("Spider {} started!", getUUID());
        while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
            final Request request = scheduler.poll(this);
            if (request == null) {
                if (threadPool.getThreadAlive() == 0 && exitWhenComplete) {
                    break;
                }
                // wait until new url added
                waitNewUrl();
            } else {
                // process the request asynchronously on the worker pool
                threadPool.execute(new Runnable() {
                    @Override
                    public void run() {
                        try {
                            processRequest(request);
                            // notify listeners of the success
                            onSuccess(request);
                        } catch (Exception e) {
                            onError(request);
                            logger.error("process request " + request + " error", e);
                        } finally {
                            // counted whether the request succeeded or failed
                            pageCount.incrementAndGet();
                            // wake the polling loop: new requests may have been added
                            signalNewUrl();
                        }
                    }
                });
            }
        }
    } finally {
        stat.set(STAT_STOPPED);
        // release some resources; threadPool is still null only when
        // initComponent() failed before creating it
        if (destroyWhenExit && threadPool != null) {
            close();
        }
    }
    logger.info("Spider {} closed! {} pages downloaded.", getUUID(), pageCount.get());
}
/**
 * Notifies every registered listener that processing the request failed.
 * Does nothing when no listeners are registered.
 *
 * @param request the request that failed
 */
protected void onError(Request request) {
    if (CollectionUtils.isEmpty(spiderListeners)) {
        return;
    }
    for (SpiderListener listener : spiderListeners) {
        listener.onError(request);
    }
}
/**
 * Notifies every registered listener that the request was processed
 * without an exception. Does nothing when no listeners are registered.
 *
 * @param request the request that succeeded
 */
protected void onSuccess(Request request) {
    if (CollectionUtils.isEmpty(spiderListeners)) {
        return;
    }
    for (SpiderListener listener : spiderListeners) {
        listener.onSuccess(request);
    }
}
/**
 * Atomically switches the state to RUNNING.
 * The compare-and-set is retried until it succeeds, so a concurrent state
 * change between the read and the write is not lost.
 *
 * @throws IllegalStateException if the state is already RUNNING
 */
private void checkRunningStat() {
    int observed;
    do {
        observed = stat.get();
        if (observed == STAT_RUNNING) {
            throw new IllegalStateException("Spider is already running!");
        }
        // compareAndSet writes RUNNING only if the state is still the observed value
    } while (!stat.compareAndSet(observed, STAT_RUNNING));
}
/**
 * Releases the resources of the spider: closes the downloader, page
 * processor, scheduler and every pipeline that implements Closeable, then
 * shuts down the worker pool if one has been created.
 */
public void close() {
    destroyEach(downloader);
    destroyEach(pageProcessor);
    destroyEach(scheduler);
    for (Pipeline pipeline : pipelines) {
        destroyEach(pipeline);
    }
    // the pool only exists after initComponent(); close() may be called before that
    if (threadPool != null) {
        threadPool.shutdown();
    }
}
/**
 * Closes the object if it implements Closeable; other objects, including
 * null, are ignored. An IOException from close() is logged and not
 * propagated, so the remaining components can still be closed.
 *
 * @param object component to close
 */
private void destroyEach(Object object) {
    if (object instanceof Closeable) {
        try {
            ((Closeable) object).close();
        } catch (IOException e) {
            logger.warn("close " + object + " error", e);
        }
    }
}
/**
 * Processes the given urls one after another on the calling thread,
 * bypassing the scheduler loop of run().
 *
 * @param urls urls to process
 */
public void test(String... urls) {
    initComponent();
    for (String url : urls) {
        processRequest(new Request(url));
    }
}
/**
 * Downloads the page for the request and dispatches to the success or
 * failure handler depending on the download result.
 *
 * @param request request to download and process
 */
private void processRequest(Request request) {
    final Page page = downloader.download(request, this);
    if (!page.isDownloadSuccess()) {
        onDownloaderFail(request);
        return;
    }
    onDownloadSuccess(request, page);
}
/**
 * Handles a downloaded page. If the site accepts the status code, the page
 * is processed, its target requests are queued (when spawnUrl is set) and
 * its result items go to every pipeline unless they are marked as skipped.
 * Otherwise the rejected status code is logged. Either way the thread then
 * sleeps for the site's sleep time.
 *
 * @param request the request that was downloaded
 * @param page    the downloaded page
 */
private void onDownloadSuccess(Request request, Page page) {
    final boolean statusAccepted = site.getAcceptStatCode().contains(page.getStatusCode());
    if (!statusAccepted) {
        logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
    } else {
        pageProcessor.process(page);
        // queue the requests the processor discovered on this page
        extractAndAddRequests(page, spawnUrl);
        if (!page.getResultItems().isSkip()) {
            // every pipeline sees the same result items
            for (Pipeline pipeline : pipelines) {
                pipeline.process(page.getResultItems(), this);
            }
        }
    }
    sleep(site.getSleepTime());
}
/**
 * Handles a failed download: schedules a cycle retry when the site has
 * cycle retries configured, otherwise gives up on the request and sleeps
 * for the site's sleep time.
 *
 * @param request the request whose download failed
 */
private void onDownloaderFail(Request request) {
    if (site.getCycleRetryTimes() != 0) {
        // for cycle retry
        doCycleRetry(request);
        return;
    }
    sleep(site.getSleepTime());
}
/**
 * Re-queues a clone of the failed request with an updated retry counter,
 * then sleeps for the site's retry sleep time.
 * The counter is stored in the request extras under
 * Request.CYCLE_TRIED_TIMES. A request without a counter is always
 * re-queued with counter 1; otherwise it is re-queued only while the
 * incremented counter is below the site's cycle retry times.
 *
 * @param request the request whose download failed
 */
private void doCycleRetry(Request request) {
    final Object triedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
    final boolean firstRetry = triedTimesObject == null;
    final int nextTriedTimes = firstRetry ? 1 : ((Integer) triedTimesObject) + 1;
    // the limit is consulted only from the second retry on
    if (firstRetry || nextTriedTimes < site.getCycleRetryTimes()) {
        addRequest(SerializationUtils.clone(request).setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, nextTriedTimes));
    }
    sleep(site.getRetrySleepTime());
}
/**
 * Pauses the current thread for the given time. If the thread is
 * interrupted while sleeping, the interruption is logged and the interrupt
 * flag is restored so that callers can still observe it.
 *
 * @param time time to sleep, in milliseconds
 */
protected void sleep(int time) {
    try {
        Thread.sleep(time);
    } catch (InterruptedException e) {
        // Thread.sleep cleared the flag when it threw; set it again
        Thread.currentThread().interrupt();
        logger.error("Thread interrupted when sleep",e);
    }
}
/**
 * Adds the target requests of the page to the scheduler.
 * Nothing is added when spawnUrl is false or the page has no target
 * requests.
 *
 * @param page     processed page whose target requests are queued
 * @param spawnUrl whether the target requests should be queued at all
 */
protected void extractAndAddRequests(Page page, boolean spawnUrl) {
    if (!spawnUrl || CollectionUtils.isEmpty(page.getTargetRequests())) {
        return;
    }
    for (Request target : page.getTargetRequests()) {
        addRequest(target);
    }
}
/**
 * Pushes a request to the scheduler. If the site has no domain yet, the
 * domain is first derived from the url of the request.
 *
 * @param request request to queue
 */
private void addRequest(Request request) {
    final boolean domainMissing = site.getDomain() == null;
    if (domainMissing && request != null && request.getUrl() != null) {
        String domain = UrlUtils.getDomain(request.getUrl());
        site.setDomain(domain);
    }
    scheduler.push(request, this);
}
/**
 * Guards configuration methods against changes while the spider runs.
 *
 * @throws IllegalStateException if the state is RUNNING
 */
protected void checkIfRunning() {
    final boolean running = stat.get() == STAT_RUNNING;
    if (running) {
        throw new IllegalStateException("Spider is already running!");
    }
}
/**
 * Starts the spider on a new non-daemon thread and returns immediately.
 */
public void runAsync() {
    final Thread spiderThread = new Thread(this);
    spiderThread.setDaemon(false);
    spiderThread.start();
}
/**
 * Adds urls to crawl. Each url is wrapped in a Request and pushed to the
 * scheduler, then a waiting run() loop is woken up. Calls can be chained,
 * for example addUrl(url1).addUrl(url2).
 *
 * @param urls urls to crawl
 * @return this
 */
public Spider addUrl(String... urls) {
    for (int i = 0; i < urls.length; i++) {
        addRequest(new Request(urls[i]));
    }
    signalNewUrl();
    return this;
}
/**
 * Downloads the given urls synchronously and returns what a temporary
 * collector pipeline gathered.
 * While the crawl runs, url spawning and destroy-on-exit are switched off.
 * Both flags are switched back on and the collector pipeline is removed
 * even when the crawl throws.
 *
 * @param urls urls to download
 * @param <T>  type of process result
 * @return list of collected results
 */
public <T> List<T> getAll(Collection<String> urls) {
    destroyWhenExit = false;
    spawnUrl = false;
    CollectorPipeline collectorPipeline = null;
    try {
        // only the urls given here are crawled
        if (startRequests != null) {
            startRequests.clear();
        }
        for (Request request : UrlUtils.convertToRequests(urls)) {
            addRequest(request);
        }
        collectorPipeline = getCollectorPipeline();
        pipelines.add(collectorPipeline);
        run();
    } finally {
        spawnUrl = true;
        destroyWhenExit = true;
        // otherwise every call would leave one more collector in the pipeline list
        if (collectorPipeline != null) {
            pipelines.remove(collectorPipeline);
        }
    }
    // the element type is decided by the collector pipeline in use
    @SuppressWarnings("unchecked")
    List<T> collected = collectorPipeline.getCollected();
    return collected;
}
/**
 * Creates the pipeline getAll() uses to collect results.
 *
 * @return a new ResultItemsCollectorPipeline
 */
protected CollectorPipeline getCollectorPipeline() {
    CollectorPipeline collector = new ResultItemsCollectorPipeline();
    return collector;
}
/**
 * Downloads a single url synchronously through getAll().
 *
 * @param url url to download
 * @param <T> type of process result
 * @return the first collected result, or null when nothing was collected
 */
public <T> T get(String url) {
    List<String> urls = WMCollections.newArrayList(url);
    List<T> resultItemses = getAll(urls);
    if (resultItemses != null && resultItemses.size() > 0) {
        return resultItemses.get(0);
    } else {
        return null;
    }
}
/**
 * Adds requests to crawl. Every request is pushed to the scheduler, then a
 * waiting run() loop is woken up.
 *
 * @param requests requests to crawl
 * @return this
 */
public Spider addRequest(Request... requests) {
    for (int i = 0; i < requests.length; i++) {
        addRequest(requests[i]);
    }
    signalNewUrl();
    return this;
}
/**
 * Blocks until new work is signalled or emptySleepTime milliseconds have
 * passed. Returns at once when no worker is alive and exitWhenComplete is
 * set. If the thread is interrupted while waiting, the interrupt flag is
 * restored so that the isInterrupted() check in the run() loop can see it.
 */
private void waitNewUrl() {
    newUrlLock.lock();
    try {
        // checked again under the lock: the last worker may have finished meanwhile
        if (threadPool.getThreadAlive() == 0 && exitWhenComplete) {
            return;
        }
        newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
    } catch (InterruptedException e) {
        // await() cleared the flag when it threw; set it again
        Thread.currentThread().interrupt();
        logger.warn("waitNewUrl - interrupted", e);
    } finally {
        newUrlLock.unlock();
    }
}
/**
 * Wakes every thread waiting in waitNewUrl().
 */
private void signalNewUrl() {
    // acquired before the try block so that unlock() only runs when the lock is held
    newUrlLock.lock();
    try {
        newUrlCondition.signalAll();
    } finally {
        newUrlLock.unlock();
    }
}
/**
 * Starts the spider asynchronously; same as {@link #runAsync()}.
 */
public void start() {
    runAsync();
}
/**
 * Asks a running spider to stop by switching the state from RUNNING to
 * STOPPED, which ends the run() loop. Logs whether the switch happened; it
 * fails when the spider is not running.
 */
public void stop() {
    final boolean stopped = stat.compareAndSet(STAT_RUNNING, STAT_STOPPED);
    if (!stopped) {
        logger.info("Spider " + getUUID() + " stop fail!");
        return;
    }
    logger.info("Spider " + getUUID() + " stop success!");
}
/**
 * Sets the number of worker threads.
 * The value is validated before it is stored, so a rejected value leaves
 * the previous thread number in place.
 *
 * @param threadNum number of threads, at least one
 * @return this
 * @throws IllegalStateException    if the spider is already running
 * @throws IllegalArgumentException if threadNum is zero or negative
 */
public Spider thread(int threadNum) {
    checkIfRunning();
    if (threadNum <= 0) {
        throw new IllegalArgumentException("threadNum should be at least one!");
    }
    this.threadNum = threadNum;
    return this;
}
/**
 * Sets the number of worker threads and a caller-supplied executor
 * service to run them on.
 * The thread number is validated before anything is stored, so a rejected
 * value leaves the previous settings in place.
 *
 * @param executorService executorService to run the spider
 * @param threadNum       number of threads, at least one
 * @return this
 * @throws IllegalStateException    if the spider is already running
 * @throws IllegalArgumentException if threadNum is zero or negative
 */
public Spider thread(ExecutorService executorService, int threadNum) {
    checkIfRunning();
    if (threadNum <= 0) {
        throw new IllegalArgumentException("threadNum should be at least one!");
    }
    this.threadNum = threadNum;
    this.executorService = executorService;
    return this;
}
/**
 * Tells whether run() ends once the scheduler is empty and no worker is
 * alive.
 *
 * @return the exitWhenComplete flag
 */
public boolean isExitWhenComplete() {
    return this.exitWhenComplete;
}
/**
 * Chooses whether the spider exits when it runs out of work.
 * True: exit when all urls of the site are downloaded.
 * False: do not exit until stop() is called manually.
 *
 * @param exitWhenComplete exitWhenComplete
 * @return this
 */
public Spider setExitWhenComplete(boolean exitWhenComplete) {
    this.exitWhenComplete = exitWhenComplete;
    return this;
}
/**
 * Tells whether requests extracted from processed pages are queued.
 *
 * @return the spawnUrl flag
 */
public boolean isSpawnUrl() {
    return this.spawnUrl;
}
/**
 * Returns the number of requests the spider has handled so far. A request
 * is counted whether its processing succeeded or failed.
 *
 * @return total handled page count
 * @since 0.4.1
 */
public long getPageCount() {
    final long handled = pageCount.get();
    return handled;
}
/**
 * Returns the running status of the spider, mapped from the internal
 * state code.
 *
 * @return running status
 * @see Status
 * @since 0.4.1
 */
public Status getStatus() {
    final int code = stat.get();
    return Status.fromValue(code);
}
/**
 * Running status of a spider. Each constant carries the integer code used
 * for the state: 0 for Init, 1 for Running, 2 for Stopped.
 */
public enum Status {
    Init(0), Running(1), Stopped(2);

    /** Integer code of this status; fixed at construction. */
    private final int value;

    private Status(int value) {
        this.value = value;
    }

    /**
     * @return the integer code of this status
     */
    int getValue() {
        return value;
    }

    /**
     * Looks up the status for an integer code.
     *
     * @param value integer code
     * @return the matching status, or Init when no status has that code
     */
    public static Status fromValue(int value) {
        for (Status status : Status.values()) {
            if (status.getValue() == value) {
                return status;
            }
        }
        //default value
        return Init;
    }
}
/**
 * Returns the number of worker threads currently alive, as reported by
 * the thread pool.
 *
 * @return alive thread count, or 0 when no pool has been created yet
 * @since 0.4.1
 */
public int getThreadAlive() {
    return threadPool == null ? 0 : threadPool.getThreadAlive();
}
/**
 * Chooses whether urls extracted from pages are added for download.
 * True: extracted urls are downloaded as well.
 * False: only the seed urls are downloaded.
 * DO NOT set it unless you know what it means!
 *
 * @param spawnUrl spawnUrl
 * @return this
 * @since 0.4.0
 */
public Spider setSpawnUrl(boolean spawnUrl) {
    this.spawnUrl = spawnUrl;
    return this;
}
/**
 * Returns the identifier of this spider.
 * An explicitly set uuid wins. Without one, the domain of the site is
 * returned as is when a site exists. Only when there is no site either is
 * a random UUID generated and remembered.
 *
 * @return uuid of this spider
 */
@Override
public String getUUID() {
    if (uuid == null) {
        if (site != null) {
            return site.getDomain();
        }
        uuid = UUID.randomUUID().toString();
    }
    return uuid;
}
/**
 * Sets the executor service that the worker pool wraps when it is created.
 *
 * @param executorService executor service to run the workers on
 * @return this
 * @throws IllegalStateException if the spider is already running
 */
public Spider setExecutorService(ExecutorService executorService) {
    checkIfRunning();
    this.executorService = executorService;
    return this;
}
/**
 * Returns the site settings of this spider.
 *
 * @return site
 */
@Override
public Site getSite() {
    return this.site;
}
/**
 * Returns the listeners notified after each request.
 *
 * @return the registered listeners, or null when none were set
 */
public List<SpiderListener> getSpiderListeners() {
    return spiderListeners;
}
/**
 * Sets the listeners notified after each request succeeds or fails. The
 * list is stored as given, not copied.
 *
 * @param spiderListeners listeners to notify
 * @return this
 */
public Spider setSpiderListeners(List<SpiderListener> spiderListeners) {
    this.spiderListeners = spiderListeners;
    return this;
}
/**
 * Returns the time recorded when the spider components were last
 * initialised.
 *
 * @return start time, or null when the spider was never initialised
 */
public Date getStartTime() {
    return this.startTime;
}
/**
 * Returns the scheduler that queues the requests of this spider.
 *
 * @return scheduler
 */
public Scheduler getScheduler() {
    return this.scheduler;
}
/**
 * Sets how long the run() loop waits at most when no url is polled.
 *
 * @param emptySleepTime wait time, in MILLISECONDS
 */
public void setEmptySleepTime(int emptySleepTime) {
    this.emptySleepTime = emptySleepTime;
}
}