Heritrix架构学习笔记(三)

3 Frontier:链接制造工厂

在 heritrix-1.12.1/docs/articles/developer_manual/frontier.html 中,可以找到 Heritrix 官方文档给出的一个 Frontier 示例:

/**

 * A simple Frontier implementation for tutorial purposes

 */

public class MyFrontier extends ModuleType implements Frontier,

        FetchStatusCodes {

// A list of the discovered URIs that should be crawled.

// 列表用来保存还未抓取的链接

    List pendingURIs = new ArrayList();

   

    // A list of prerequisites that needs to be met before any other URI is

// allowed to be crawled, e.g. DNS-lookups

//这个列表中保存了一系列的链接,它们的优先级要高于pendingURIs那个List中的任何一个链接,

//表中的链接表示一些需要被满足的先决条件

    List prerequisites = new ArrayList();

   

// A hash of already crawled URIs so that every URI is crawled only once.

//一个HashMap,用于存储那些已经抓取过的链接

    Map alreadyIncluded = new HashMap();

   

// Reference to the CrawlController.

// CrawlController对象

    CrawlController controller;

 

// Flag to note if a URI is being processed.

//用于标识是否一个链接正在被处理

    boolean uriInProcess = false;

   

// top-level stats

//成功下载的数量

long successCount = 0;

//失败的数量

long failedCount = 0;

//抛弃掉链接的数量

long disregardedCount = 0;

//总共下载的字节数

    long totalProcessedBytes = 0;

 

    public MyFrontier(String name) {

        super(Frontier.ATTR_NAME, "A simple frontier.");

    }

 

    public void initialize(CrawlController controller)

            throws FatalConfigurationException, IOException {

        //注入

        this.controller = controller;

       

        // Initialize the pending queue with the seeds

        //把种子文件中的链接加入到pengdingURIs中去

        this.controller.getScope().refreshSeeds();

        List seeds = this.controller.getScope().getSeedlist();

        synchronized(seeds) {

            for (Iterator i = seeds.iterator(); i.hasNext();) {

                UURI u = (UURI) i.next();

                CandidateURI caUri = new CandidateURI(u);

                caUri.setSeed();

                schedule(caUri);

            }

        }

    }

 

    //该方法是给线程池中的线程调用的,用以取出下一个准备处理的链接

    public synchronized CrawlURI next(int timeout) throws InterruptedException {

        if (!uriInProcess && !isEmpty()) {

            uriInProcess = true;

            CrawlURI curi;

            /*

             先看prerequistes队列中是否有要处理的链接,如果有,就先处理,如果没有,再看pengdingURIs队列中是否有链接。每次在处理的时候,总是取出队列中的第一个链接

            */

            if (!prerequisites.isEmpty()) {

                curi = CrawlURI.from((CandidateURI) prerequisites.remove(0));

            } else {

                curi = CrawlURI.from((CandidateURI) pendingURIs.remove(0));

            }

            curi.setServer(controller.getServerCache().getServerFor(curi));

            return curi;

        } else {

            wait(timeout);

            return null;

        }

    }

       public boolean isEmpty() {

        return pendingURIs.isEmpty() && prerequisites.isEmpty();

    }

 

     //该方法用于将新链接加入到pengdingURIs队列中,等待处理

    public synchronized void schedule(CandidateURI caURI) {

        // Schedule a uri for crawling if it is not already crawled

       /*

          首先判断要加入的链接是否已经被抓取过,如果已经包含在alreadyIncluded这个HashMap中则说明处理过了,就可以放弃处理

       */

        if (!alreadyIncluded.containsKey(caURI.getURIString())) {

            if(caURI.needsImmediateScheduling()) {

                prerequisites.add(caURI);

            } else {

                pendingURIs.add(caURI);

            }

            //HashMap中使用url的字符串来作为key,而将实际的CadidateURI对象作为value

            alreadyIncluded.put(caURI.getURIString(), caURI);

        }

    }

 

    public void batchSchedule(CandidateURI caURI) {

        schedule(caURI);

    }

 

    public void batchFlush() {

    }

 

    //一次抓取结束后所执行的操作,该操作由线程池中的线程来进行调用

    public synchronized void finished(CrawlURI cURI) {

        uriInProcess = false;

        //成功下载

        if (cURI.isSuccess()) {

           

            successCount++;

            //统计下载总数

            totalProcessedBytes += cURI.getContentSize();

            //如果成功,则触发一个成功事件,比如将Extractor解析出来的新URL加入队列中

            controller.fireCrawledURISuccessfulEvent(cURI);

            cURI.stripToMinimal();

        }

         //需要推迟下载

else if (cURI.getFetchStatus() == S_DEFERRED) {

            cURI.processingCleanup();

            alreadyIncluded.remove(cURI.getURIString());

            schedule(cURI);

        }

        //其他状态

else if (cURI.getFetchStatus() == S_ROBOTS_PRECLUDED

                || cURI.getFetchStatus() == S_OUT_OF_SCOPE

                || cURI.getFetchStatus() == S_BLOCKED_BY_USER

                || cURI.getFetchStatus() == S_TOO_MANY_EMBED_HOPS

                || cURI.getFetchStatus() == S_TOO_MANY_LINK_HOPS

                || cURI.getFetchStatus() == S_DELETED_BY_USER) {

            //抛弃当前URI

            controller.fireCrawledURIDisregardEvent(cURI);

            disregardedCount++;

            cURI.stripToMinimal();

        } else {

            controller.fireCrawledURIFailureEvent(cURI);

            failedCount++;

            cURI.stripToMinimal();

        }

        cURI.processingCleanup();

    }

 

    //返回所有已经处理过的链接数量

    public long discoveredUriCount() {

        return alreadyIncluded.size();

    }

 

   //返回所有等待处理的链接数量

    public long queuedUriCount() {

        return pendingURIs.size() + prerequisites.size();

    }

 

    //返回所有已经完成的链接数量

    public long finishedUriCount() {

        return successCount + failedCount + disregardedCount;

    }

   

    //返回所有成功处理的链接数量

    public long successfullyFetchedCount() {

        return successCount;

    }

  

    //返回所有失败的链接数量

    public long failedFetchCount() {

        return failedCount;

    }

    //返回所有抛弃的链接数量

    public long disregardedFetchCount() {

        return disregardedCount;

    }

   //返回总共下载的字节数

    public long totalBytesWritten() {

        return totalProcessedBytes;

    }

 

    public String report() {

        return "This frontier does not return a report.";

    }

 

    public void importRecoverLog(String pathToLog) throws IOException {

        throw new UnsupportedOperationException();

    }

 

    public FrontierMarker getInitialMarker(String regexpr,

            boolean inCacheOnly) {

        return null;

    }

 

    public ArrayList getURIsList(FrontierMarker marker, int numberOfMatches,

            boolean verbose) throws InvalidFrontierMarkerException {

        return null;

    }

 

    public long deleteURIs(String match) {

        return 0;

    }

 

}

注意:上面的代码只是一个最基础的示例,仅用于从结构上说明 Frontier 的作用。

你可能感兴趣的:(Heritrix架构学习笔记(三))