1. Fetcher的Map模型
Fetcher.java代码中可以看到,Fetcher继承自MapRunnable,它是Mapper的抽象接口,实现这个接口的子类能够更好的对Map的流程进行控制,包括多线程与异步Mapper。// 对配置进行检测,看一些必要的配置是否已经配置了,如http.agent.name等参数 checkConfiguration(); // 记录fetch的开始时间 SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); if (LOG.isInfoEnabled()) { LOG.info("Fetcher: starting at " + sdf.format(start)); LOG.info("Fetcher: segment: " + segment); } // 这里对抓取的时间进行限制,在FetchItemQueue中会用到这个参数 // set the actual time for the timelimit relative // to the beginning of the whole job and not of a specific task // otherwise it keeps trying again if a task fails long timelimit = getConf().getLong("fetcher.timelimit.mins", -1); if (timelimit != -1) { timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000); LOG.info("Fetcher Timelimit set for : " + timelimit); getConf().setLong("fetcher.timelimit", timelimit); } // 生成一个Nutch的Map-Reduce配置 JobConf job = new NutchJob(getConf()); job.setJobName("fetch " + segment); // 配置抓取线程数, job.setInt("fetcher.threads.fetch", threads); job.set(Nutch.SEGMENT_NAME_KEY, segment.getName()); // 配置是否对抓取的内容进行解析 job.setBoolean("fetcher.parse", parsing); // for politeness, don't permit parallel execution of a single task job.setSpeculativeExecution(false); // 配置输入的路径名 FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME)); // 配置输入的文件格式,这里类继承自SequenceFileInputFormat // 它主要是覆盖了其getSplits方法,其作用是不对文件进行切分,以文件数量作为splits的依据 // 就是有几个文件,就有几个Map操作 job.setInputFormat(InputFormat.class); // 配置Map操作的类 job.setMapRunnerClass(Fetcher.class); // 配置输出路径 FileOutputFormat.setOutputPath(job, segment); // 这里配置输出文件方法,这个类在前面已经分析过 job.setOutputFormat(FetcherOutputFormat.class); // 配置输出<key,value>类型 job.setOutputKeyClass(Text.class); job.setOutputValueClass(NutchWritable.class); JobClient.runJob(job);
// 生成生产者,用于读取Generate出来的CrawlDatum,把它们放到共享队列中 feeder = new QueueFeeder(input, fetchQueues, threadCount * 50); //feeder.setPriority((Thread.MAX_PRIORITY + Thread.NORM_PRIORITY) / 2); // the value of the time limit is either -1 or the time where it should finish long timelimit = getConf().getLong("fetcher.timelimit", -1); if (timelimit != -1) feeder.setTimeLimit(timelimit); feeder.start(); // set non-blocking & no-robots mode for HTTP protocol plugins. getConf().setBoolean(Protocol.CHECK_BLOCKING, false); getConf().setBoolean(Protocol.CHECK_ROBOTS, false); // 启动消费者线程 for (int i = 0; i < threadCount; i++) { // spawn threads new FetcherThread(getConf()).start(); } // select a timeout that avoids a task timeout long timeout = getConf().getInt("mapred.task.timeout", 10*60*1000)/2; // 这里用一个循环来等待线程结束 do { // wait for threads to exit try { Thread.sleep(1000); } catch (InterruptedException e) {} // 这个函数是得到当前线程的抓取状态,如抓取了多少网页,多少网页抓取失败,抓取速度是多少 reportStatus(); LOG.info("-activeThreads=" + activeThreads + ", spinWaiting=" + spinWaiting.get() + ", fetchQueues.totalSize=" + fetchQueues.getTotalSize()); // 输出抓取队列中的信息 if (!feeder.isAlive() && fetchQueues.getTotalSize() < 5) { fetchQueues.dump(); } // 查看timelimit的值,这里只要返回的hitByTimeLimit不为0,checkTimelimit方法会清空抓取队列中的所有数据 // check timelimit if (!feeder.isAlive()) { int hitByTimeLimit = fetchQueues.checkTimelimit(); if (hitByTimeLimit != 0) reporter.incrCounter("FetcherStatus", "hitByTimeLimit", hitByTimeLimit); } // 查看抓取线程是否超时,如果超时,就退出等待 // some requests seem to hang, despite all intentions if ((System.currentTimeMillis() - lastRequestStart.get()) > timeout) { if (LOG.isWarnEnabled()) { LOG.warn("Aborting with "+activeThreads+" hung threads."); } return; } } while (activeThreads.get() > 0); LOG.info("-activeThreads=" + activeThreads);