【Heritrix基础教程之4】开始一个爬虫抓取的全流程代码分析


在创建一个job后,就要开始job的运行,运行的全流程如下:

1、在界面上启动job

【Heritrix基础教程之4】开始一个爬虫抓取的全流程代码分析_第1张图片

2、index.jsp

查看上述页面对应的源代码,其中"Start"链接的定义如下:

<a href='"+request.getContextPath()+"/console/action.jsp?action=start'>Start</a>

3、action.jsp


    // Route the "action" request parameter to the matching CrawlJobHandler
    // operation. A missing parameter matches none of the branches below
    // (equalsIgnoreCase(null) is false), so the page simply redirects.
    String sAction = request.getParameter("action");
    if("start".equalsIgnoreCase(sAction)) {
        // Ask the handler to begin crawling
        handler.startCrawler();
    } else if("stop".equalsIgnoreCase(sAction)) {
        // Ask the handler to stop crawling
        handler.stopCrawler();
    } else if("terminate".equalsIgnoreCase(sAction)) {
        // Remove the job that is currently selected, if there is one
        if(handler.getCurrentJob() != null) {
            handler.deleteJob(handler.getCurrentJob().getUID());
        }
    } else if("pause".equalsIgnoreCase(sAction)) {
        // Ask the handler to pause the job
        handler.pauseJob();
    } else if("resume".equalsIgnoreCase(sAction)) {
        // Ask the handler to resume the job
        handler.resumeJob();
    } else if("checkpoint".equalsIgnoreCase(sAction)) {
        // Checkpointing is only attempted when a current job exists
        if(handler.getCurrentJob() != null) {
            handler.checkpointJob();
        }
    }
    // Whatever happened above, send the browser back to the front page.
    response.sendRedirect(request.getContextPath() + "/index.jsp");

4、CrawlJobHandler.java

(1)

    /**
     * Marks the handler as running and, when possible, kicks off the next
     * pending job.
     *
     * Nothing is started if there are no pending jobs or if a crawl is
     * already in progress; the running flag is set in either case.
     */
    public void startCrawler() {
        running = true;
        if (pendingCrawlJobs.size() <= 0) {
            // Nothing queued, so there is nothing to start.
            return;
        }
        if (isCrawling()) {
            // A crawl is already under way; leave it alone.
            return;
        }
        startNextJob();
    }

(2)

    /**
     * Launches {@link #startNextJobInternal()} on a new thread named
     * "StartNextJob".
     *
     * If a starter thread from an earlier call is still referenced, this
     * method first waits for it to finish, so at most one starter thread
     * is launched at a time through this method. The whole sequence runs
     * while holding this object's monitor.
     *
     * If the calling thread is interrupted while waiting, no new starter
     * thread is created and the caller's interrupt status is restored so
     * that code further up the stack can still observe the interruption.
     */
    protected final void startNextJob() {
        synchronized (this) {
            if(startingNextJob != null) {
                try {
                    // Wait for the previous starter thread to complete.
                    startingNextJob.join();
                } catch (InterruptedException e) {
                    e.printStackTrace();
                    // join() cleared the interrupt flag when it threw;
                    // set it again instead of silently swallowing it.
                    Thread.currentThread().interrupt();
                    return;
                }
            }
            startingNextJob = new Thread(new Runnable() {
                public void run() {
                    startNextJobInternal();
                }
            }, "StartNextJob");
            startingNextJob.start();
        }
    }

(3)

   /**
    * Takes the first pending job, makes it the current job and asks its
    * controller to start crawling.
    *
    * Returns immediately when no job is pending or a crawl is already
    * running. If setting the job up throws InitializationException, the
    * job is reloaded from its state file, the current-job reference is
    * cleared, and the method calls itself to try the next pending job.
    */
   protected void startNextJobInternal() {
        if (pendingCrawlJobs.size() == 0) {
            // Queue is empty.
            return;
        }
        if (isCrawling()) {
            // Something is already being crawled.
            return;
        }

        // Promote the head of the pending queue to "current".
        this.currentJob = (CrawlJob)pendingCrawlJobs.first();
        assert pendingCrawlJobs.contains(currentJob) :
            "pendingCrawlJobs is in an illegal state";
        pendingCrawlJobs.remove(currentJob);

        try {
            this.currentJob.setupForCrawlStart();
            // Register this handler as a status listener before starting, so
            // that the crawl-ending callback can clear the currentJob
            // reference, update the completed-jobs list and possibly start
            // the next job.
            this.currentJob.getController().addCrawlStatusListener(this);
            // Everything is wired up: request the actual start.
            this.currentJob.getController().requestCrawlStart();
        } catch (InitializationException e) {
            // NOTE(review): the exception itself is not logged here; confirm
            // that the failure is recorded elsewhere.
            loadJob(getStateJobFile(this.currentJob.getDirectory()));
            this.currentJob = null;
            // Move on to the next pending job, if any.
            startNextJobInternal();
        }
    }

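(4) 控制器类(Heritrix 中为 CrawlController)的 requestCrawlStart(),即上一步中 getController().requestCrawlStart() 所调用的方法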

    /**
     * Moves the crawl into the running state and starts its worker pieces.
     *
     * In order: runs the processors' initial tasks, announces STARTED with
     * the pending job status, switches the state to RUNNING and announces
     * that, presets the exit status, starts the statistics thread, and
     * finally starts the frontier.
     */
    public void requestCrawlStart() {
        runProcessorInitialTasks();

        // First notification: crawl started, job still reported as pending.
        sendCrawlStateChangeEvent(STARTED, CrawlJob.STATUS_PENDING);

        // Second notification: the crawl is now running.
        this.state = RUNNING;
        sendCrawlStateChangeEvent(this.state, CrawlJob.STATUS_RUNNING);

        // Assume an abnormal finish until a proper exit overwrites this.
        this.sExit = CrawlJob.STATUS_FINISHED_ABNORMAL;

        // Run the statistics object on its own thread named "StatLogger".
        Thread statLogger = new Thread(statistics);
        statLogger.setName("StatLogger");
        statLogger.start();

        frontier.start();
    }



你可能感兴趣的:(【Heritrix基础教程之4】开始一个爬虫抓取的全流程代码分析)