Heritrix 3.1.0 源码解析(十六)

 我们接下来分析与BdbFrontier对象的CrawlURI next()方法相关的方法

/**
     * Return the next CrawlURI eligible to be processed (and presumably
     * visited/fetched) by a worker thread.
     *
     * Relies on the readyClassQueues having been loaded with
     * any work queues that are eligible to provide a URI.
     *
     * Outline: wake snoozed queues and re-receive due future URIs; poll
     * readyClassQueues for a queue key (activating an inactive queue when
     * none is ready and precedence permits); skip queues that are empty,
     * already in process, or over budget; then peek the chosen queue's
     * head URI. A URI whose freshly computed class key differs from its
     * stored one is moved to the matching queue instead of being emitted.
     * When nothing can be emitted, the method may sleep for one second
     * before returning null; if that sleep is interrupted, the thread's
     * interrupt status is restored so the caller can observe it.
     *
     * @return next CrawlURI eligible to be processed, or null if none available
     *
     * @see org.archive.crawler.framework.Frontier#next()
     */
    protected CrawlURI findEligibleURI() {
            // wake any snoozed queues
            wakeQueues();
            // consider rescheduled URIS
            checkFutures();

            // find a non-empty ready queue, if any
            // TODO: refactor to untangle these loops, early-exits, etc!
            WorkQueue readyQ = null;
            findauri: while(true) {
                // inner do-loop: keep polling until a usable queue is chosen
                // (readyQ != null) or it is left via 'break findaqueue'
                findaqueue: do {
                    String key = readyClassQueues.poll();
                    if(key==null) {
                        // no ready queues; try to activate one
                        if(!getInactiveQueuesByPrecedence().isEmpty()
                            && highestPrecedenceWaiting < getPrecedenceFloor()) {
                            activateInactiveQueue();
                            // re-tests the do-while condition, then polls again
                            continue findaqueue;
                        } else {
                            // nothing ready or readyable
                            break findaqueue;
                        }
                    }
                    readyQ = getQueueFor(key);
                    if(readyQ==null) {
                         // readyQ key wasn't in all queues: unexpected
                        logger.severe("Key "+ key +
                            " in readyClassQueues but not allQueues");
                        break findaqueue;
                    }
                    if(readyQ.getCount()==0) {
                        // readyQ is empty and ready: it's exhausted
                        readyQ.noteExhausted();
                        readyQ.makeDirty();
                        readyQ = null;
                        continue;
                    }
                    if(!inProcessQueues.add(readyQ)) {
                        // double activation; discard this and move on
                        // (this guard allows other enqueuings to ready or
                        // the various inactive-by-precedence queues to
                        // sometimes redundantly enqueue a queue key)
                        readyQ = null;
                        continue;
                    }
                    // queue has gone 'in process'
                    readyQ.considerActive();
                    readyQ.setWakeTime(0); // clear obsolete wake time, if any

                    // refresh budgets from current settings before testing them
                    readyQ.setSessionBudget(getBalanceReplenishAmount());
                    readyQ.setTotalBudget(getQueueTotalBudget());
                    if (readyQ.isOverSessionBudget()) {
                        deactivateQueue(readyQ);
                        readyQ.makeDirty();
                        readyQ = null;
                        continue;
                    }
                    if (readyQ.isOverTotalBudget()) {
                        retireQueue(readyQ);
                        readyQ.makeDirty();
                        readyQ = null;
                        continue;
                    }
                } while (readyQ == null);

                if (readyQ == null) {
                    // no queues left in ready or readiable
                    break findauri;
                }

                returnauri: while(true) { // loop left by explicit return or break on empty
                    CrawlURI curi = readyQ.peek(this);
                    if(curi == null) {
                        // should not reach
                        logger.severe("No CrawlURI from ready non-empty queue "
                                + readyQ.classKey + "\n"
                                + readyQ.shortReportLegend() + "\n"
                                + readyQ.shortReportLine() + "\n");
                        // NOTE(review): readyQ is not cleared on this path, so
                        // the next findauri pass may come back here with the
                        // same queue if no other key is ready -- confirm
                        // whether that is intended.
                        break returnauri;
                    }

                    // from queues, override names persist but not map source
                    curi.setOverlayMapsSource(sheetOverlaysManager);
                    // TODO: consider optimizations avoiding this recalc of
                    // overrides when not necessary
                    sheetOverlaysManager.applyOverlaysTo(curi);
                    // check if curi belongs in different queue
                    String currentQueueKey;
                    try {
                        KeyedProperties.loadOverridesFrom(curi);
                        currentQueueKey = getClassKey(curi);
                    } finally {
                        // always undo the override load, even if getClassKey throws
                        KeyedProperties.clearOverridesFrom(curi);
                    }
                    if (currentQueueKey.equals(curi.getClassKey())) {
                        // curi was in right queue, emit
                        noteAboutToEmit(curi, readyQ);
                        return curi;
                    }
                    // URI's assigned queue has changed since it
                    // was queued (eg because its IP has become
                    // known). Requeue to new queue.
                    // TODO: consider synchronization on readyQ
                    readyQ.dequeue(this,curi);
                    doJournalRelocated(curi);
                    curi.setClassKey(currentQueueKey);
                    decrementQueuedCount(1);
                    curi.setHolderKey(null);
                    sendToQueue(curi);
                    if(readyQ.getCount()==0) {
                        // readyQ is empty and ready: it's exhausted
                        // release held status, allowing any subsequent
                        // enqueues to again put queue in ready
                        // FIXME: tiny window here where queue could
                        // receive new URI, be readied, fail not-in-process?
                        inProcessQueues.remove(readyQ);
                        readyQ.noteExhausted();
                        readyQ.makeDirty();
                        readyQ = null;
                        continue findauri;
                    }
                    // otherwise loop and peek the same queue's next URI
                }
            }

            if(inProcessQueues.size()==0) {
                // Nothing was ready or in progress or imminent to wake; ensure
                // any piled-up pending-scheduled URIs are considered
                uriUniqFilter.requestFlush();
            }

            // if truly nothing ready, wait a moment before returning null
            // so that loop in surrounding next() has a chance of getting something
            // next time
            if(getTotalEligibleInactiveQueues()==0) {
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException e) {
                    // Do not lose the interrupt: put the flag back so the
                    // caller (or the next blocking call) can react to it.
                    Thread.currentThread().interrupt();
                }
            }

            // nothing eligible
            return null;
    }

这个方法有点长,我们先看一下void wakeQueues() 方法

     /**
     * Wake any queues sitting in the snoozed queue whose time has come.
     *
     * Every entry handed out by snoozedClassQueues.poll(), and every entry of
     * snoozedOverflow keyed before the current time, has its wake time
     * cleared, is marked dirty, and is passed to reenqueueQueue.
     */
    protected void wakeQueues() {
        // drain whatever the snoozed queue is willing to release right now
        for (DelayedWorkQueue expired = snoozedClassQueues.poll();
                expired != null;
                expired = snoozedClassQueues.poll()) {
            WorkQueue revived = expired.getWorkQueue(this);
            revived.setWakeTime(0);
            revived.makeDirty();
            reenqueueQueue(revived);
        }

        // also consider overflow (usually empty)
        if (snoozedOverflow.isEmpty()) {
            return;
        }
        synchronized (snoozedOverflow) {
            long now = System.currentTimeMillis();
            // headMap(now): only entries whose key is strictly before 'now'
            Iterator<DelayedWorkQueue> due =
                snoozedOverflow.headMap(now).values().iterator();
            while (due.hasNext()) {
                DelayedWorkQueue overflowed = due.next();
                // remove through the iterator so the backing map shrinks too
                due.remove();
                snoozedOverflowCount.decrementAndGet();
                WorkQueue revived = overflowed.getWorkQueue(this);
                revived.setWakeTime(0);
                revived.makeDirty();
                reenqueueQueue(revived);
            }
        }
    }

snoozedClassQueues.poll()方法是从休眠队列中取出时间到期的元素,重置睡眠时间为0,然后重置WorkQueue wq的队列归属(非活动状态队列或已经准备好被爬取的队列)

/**
     * Enqueue the given queue to either readyClassQueues or inactiveQueues,
     * as appropriate.
     *
     * The queue's precedence is first re-evaluated by the precedence policy.
     * The queue is readied only when its precedence value does not exceed
     * highestPrecedenceWaiting and is below the precedence floor; otherwise
     * it is deactivated.
     *
     * @param wq the work queue to place
     */
    protected void reenqueueQueue(WorkQueue wq) {
        //TODO:SPRINGY set overrides by queue?
        getQueuePrecedencePolicy().queueReevaluate(wq);
        if (logger.isLoggable(Level.FINE)) {
            logger.fine("queue reenqueued: " + wq.getClassKey());
        }

        // ready only if no inactive queue outranks it and it is under the floor
        boolean mayRunNow = highestPrecedenceWaiting >= wq.getPrecedence()
                && wq.getPrecedence() < getPrecedenceFloor();
        if (mayRunNow) {
            readyQueue(wq);
            return;
        }
        // outranked by a waiting queue, or at/over the precedence floor
        deactivateQueue(wq);
    }

 首先是重置队列的优先级,然后是将WorkQueue wq归入非活动状态队列或已经准备好被爬取的队列

deactivateQueue(wq)方法我们上文已经分析过(将WorkQueue wq加入非活动状态队列),这里看一下readyQueue(wq)方法

/**
     * Put the given queue on the readyClassQueues queue.
     *
     * Only the queue's class key is enqueued. If the calling thread is
     * interrupted while waiting for space, its interrupt status is restored,
     * the failure is logged, and the interruption is rethrown wrapped in a
     * RuntimeException.
     *
     * @param wq the work queue whose class key should be readied
     * @throws RuntimeException wrapping the InterruptedException raised while
     *         waiting to enqueue the key
     */
    protected void readyQueue(WorkQueue wq) {
//        assert Thread.currentThread() == managerThread;

        try {
            readyClassQueues.put(wq.getClassKey());
            if(logger.isLoggable(Level.FINE)) {
                logger.log(Level.FINE,
                        "queue readied: " + wq.getClassKey());
            }
        } catch (InterruptedException e) {
            // Wrapping alone would drop the interrupt flag; restore it so
            // code further up the stack can still see the interruption.
            Thread.currentThread().interrupt();
            // report through the class logger rather than raw stderr
            logger.log(Level.SEVERE, "unable to ready queue " + wq, e);
            // propagate interrupt up
            throw new RuntimeException(e);
        }
    }

该方法是将WorkQueue wq加入已经准备好被爬取的队列readyClassQueues

重新回到void wakeQueues()方法,后面是从snoozedOverflow容器中取出休眠到期的队列(snoozedOverflow是一个有序Map,以唤醒时间为键,存储着溢出的休眠状态的队列;headMap(System.currentTimeMillis())取出的就是唤醒时间早于当前时间的那部分),然后同样通过reenqueueQueue方法决定WorkQueue wq归入哪个队列

回到CrawlURI findEligibleURI()方法,其中调用的void checkFutures()方法用于检测延迟时间已到的CrawlURI对象,并通过receive(curi)将其重新调度(加入BDB数据库)

/**
     * Check for any future-scheduled URIs now eligible for reenqueuing.
     *
     * Each entry of futureUris keyed before the current time is removed, has
     * its reschedule time reset to -1, and is handed to receive.
     */
    protected void checkFutures() {
//        assert Thread.currentThread() == managerThread;
        // TODO: consider only checking this every set interval
        if (futureUris.isEmpty()) {
            // nothing scheduled for later; skip taking the lock
            return;
        }
        synchronized (futureUris) {
            long now = System.currentTimeMillis();
            for (Iterator<CrawlURI> due = futureUris.headMap(now).values().iterator();
                    due.hasNext(); ) {
                CrawlURI revived = due.next();
                revived.setRescheduleTime(-1); // unless again set elsewhere
                // remove through the iterator so the backing map shrinks too
                due.remove();
                futureUriCount.decrementAndGet();
                receive(revived);
            }
        }
    }

继续往下面看,String key = readyClassQueues.poll()方法为从已经准备好被爬取的队列readyClassQueues中取出队头元素(WorkQueue wq的classkey)

如果预备队列中不存在元素,则激活非活动状态队列inactiveQueues,将合适的WorkQueue wq放入已经准备好被爬取的队列readyClassQueues中

activateInactiveQueue()

/**
     * Activate an inactive queue, if any are available.
     *
     * Walks the inactive-queue buckets in the map's iteration order and polls
     * keys from each bucket. A queue whose current precedence value is
     * greater than its bucket's is deactivated again and skipped. The first
     * acceptable key is put on readyClassQueues after highestPrecedenceWaiting
     * has been recalculated.
     *
     * @return true if a queue key was put on readyClassQueues, false if no
     *         inactive queue could be activated
     * @throws RuntimeException wrapping the InterruptedException raised while
     *         waiting to enqueue the key; the thread's interrupt status is
     *         restored before throwing
     */
    protected boolean activateInactiveQueue() {
        for (Entry<Integer, Queue<String>> entry: getInactiveQueuesByPrecedence().entrySet()) {
            int expectedPrecedence = entry.getKey();
            Queue<String> queueOfWorkQueueKeys = entry.getValue();

            while (true) {
                synchronized (getInactiveQueuesByPrecedence()) {
                    String workQueueKey = queueOfWorkQueueKeys.poll();
                    if (workQueueKey == null) {
                        // this bucket is drained; move on to the next one
                        break;
                    }

                    WorkQueue candidateQ = (WorkQueue) this.allQueues.get(workQueueKey);
                    if (candidateQ.getPrecedence() > expectedPrecedence) {
                        // queue demoted since placed; re-deactivate
                        deactivateQueue(candidateQ);
                        candidateQ.makeDirty();
                        continue;
                    }

                    updateHighestWaiting(expectedPrecedence);
                    try {
                        // readyClassQueues holds the keys of queues that are
                        // ready to be crawled
                        readyClassQueues.put(workQueueKey);
                    } catch (InterruptedException e) {
                        // restore the flag that the wrapped exception would
                        // otherwise hide from code further up the stack
                        Thread.currentThread().interrupt();
                        throw new RuntimeException(e);
                    }

                    return true;
                }
            }
        }

        return false;
    }

更新非活动状态队列inactiveQueues中最高优先级的值(最小值) 

/**
     * Recalculate the value of the highest-precedence queue waiting
     * among inactive queues.
     *
     * Scans the precedence keys from startFrom upward and records the first
     * one whose bucket is non-empty; records Integer.MAX_VALUE when every
     * scanned bucket is empty.
     *
     * @param startFrom start looking at this precedence value
     */
    protected void updateHighestWaiting(int startFrom) {
        // default when nothing is waiting
        int firstOccupied = Integer.MAX_VALUE;
        // probe for new highestWaiting
        for (int candidate : getInactiveQueuesByPrecedence().tailMap(startFrom).keySet()) {
            boolean bucketEmpty = getInactiveQueuesByPrecedence().get(candidate).isEmpty();
            if (!bucketEmpty) {
                firstOccupied = candidate;
                break;
            }
        }
        highestPrecedenceWaiting = firstOccupied;
    }

 上面方法为从非活动状态队列inactiveQueues中获取precedence大于或等于指定值的队列元素集合,然后将highestPrecedenceWaiting值设置为其中第一个非空队列对应的precedence(即最小值,inactiveQueues是有序的);如果这些队列全部为空,则设置为Integer.MAX_VALUE

---------------------------------------------------------------------------

本系列Heritrix 3.1.0 源码解析系本人原创

转载请注明出处 博客园 刺猬的温驯

本文链接 http://www.cnblogs.com/chenying99/archive/2013/04/21/3033510.html

你可能感兴趣的:(Heritrix)