inject -- inject new urls into the database
inject runs at the very beginning of a Nutch crawl, and it runs only once: it injects the URLs listed under a given seed directory into the crawldb.
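For context, the seed URLs are just plain text files (one URL per line; lines starting with # are ignored, as the mapper code below shows) placed in a directory, and inject is invoked with the crawldb path and that directory. The paths here are only examples:

$ cat urls/seed.txt
http://www.163.com/
http://nutch.apache.org/

$ bin/nutch inject crawl/crawldb urls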
The inject code lives in the org.apache.nutch.crawl.Injector class, which implements Tool, so execution starts in its run method; run parses the command-line arguments and then delegates to inject(crawlDb, urlDir).
Let's look at what the inject method does:
public void inject(Path crawlDb, Path urlDir) throws IOException {
  // ...

  // set up a job that sorts/injects the urls
  JobConf sortJob = new NutchJob(getConf());
  // job name
  sortJob.setJobName("inject " + urlDir);
  // job input path
  FileInputFormat.addInputPath(sortJob, urlDir);
  // no input format is set, so the default TextInputFormat is used
  // map class
  sortJob.setMapperClass(InjectMapper.class);
  // job output path
  FileOutputFormat.setOutputPath(sortJob, tempDir);
  // output file format
  sortJob.setOutputFormat(SequenceFileOutputFormat.class);
  // output key type
  sortJob.setOutputKeyClass(Text.class);
  // output value type
  // CrawlDatum holds the per-url information: fetch status, fetch time, score, etc.
  sortJob.setOutputValueClass(CrawlDatum.class);
  // current time
  sortJob.setLong("injector.current.time", System.currentTimeMillis());
  // run the job; it is map-only
  RunningJob mapJob = JobClient.runJob(sortJob);

  /*
   * Read the job's counters after it finishes.
   * Counters record statistics about the map/reduce execution;
   * they are organized in groups, each group holding several counters.
   */
  // how many urls were injected
  long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue();
  // how many urls were filtered out
  long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue();

  // ...

  /*
   * Create the merge job. createJob applies the common crawldb settings:
   *
   *   job.setInputFormat(SequenceFileInputFormat.class);
   *   job.setMapperClass(CrawlDbFilter.class);
   *   job.setReducerClass(CrawlDbReducer.class);
   *   FileOutputFormat.setOutputPath(job, newCrawlDb);
   *   job.setOutputFormat(MapFileOutputFormat.class);
   *   job.setOutputKeyClass(Text.class);
   *   job.setOutputValueClass(CrawlDatum.class);
   */
  JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
  FileInputFormat.addInputPath(mergeJob, tempDir);
  mergeJob.setReducerClass(InjectReducer.class);
  JobClient.runJob(mergeJob);

  /*
   * Rename the crawldb directory of the previous run to "old" and install
   * the new one as "current". If db.preserve.backup is false, only the
   * current version is kept.
   */
  CrawlDb.install(mergeJob, crawlDb);

  // ...
}
As you can see, inject first runs a map-only job that injects the records found under the seed URL directory and writes the result to a temporary directory. With the default TextInputFormat, the map input key-value pair is the byte offset of the line and the line text; the output key-value pair is the URL (a Text, after normalization and filtering) and a CrawlDatum object, which holds the per-URL information such as fetch status, fetch interval and score. The second job, mergeJob, then merges these records into the existing crawldb and updates it.
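The directory swap performed by CrawlDb.install at the end can be pictured roughly as follows. This is a simplified sketch of the behavior described in the comments above, written against the plain Hadoop FileSystem API; it is a paraphrase, not the actual Nutch source:

// Sketch: install the newly built crawldb as "current", keeping the old one as backup.
import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;

public class InstallSketch {
  public static void install(JobConf job, Path crawlDb) throws IOException {
    FileSystem fs = FileSystem.get(job);
    Path newDb = FileOutputFormat.getOutputPath(job); // output of the merge job
    Path current = new Path(crawlDb, "current");
    Path old = new Path(crawlDb, "old");
    if (fs.exists(current)) {
      if (fs.exists(old)) fs.delete(old, true); // drop the previous backup
      fs.rename(current, old);                  // last run becomes "old"
    }
    fs.mkdirs(crawlDb);
    fs.rename(newDb, current);                  // new db becomes "current"
    if (!job.getBoolean("db.preserve.backup", true) && fs.exists(old)) {
      fs.delete(old, true);                     // keep only "current"
    }
  }
}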
Let's look at what InjectMapper does:
public void configure(JobConf job) {
  this.jobConf = job;
  urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
  // default re-fetch interval, 30 days by default
  interval = jobConf.getInt("db.fetch.interval.default", 2592000);
  // url filters, see the urlfilter.order property in nutch-default.xml
  filters = new URLFilters(jobConf);
  // scoring filters, see the scoring.filter.order property in nutch-default.xml
  scfilters = new ScoringFilters(jobConf);
  // default score for injected urls
  scoreInjected = jobConf.getFloat("db.score.injected", 1.0f);
  curTime = job.getLong("injector.current.time", System.currentTimeMillis());
}

public void close() {}

public void map(WritableComparable key, Text value,
    OutputCollector<Text, CrawlDatum> output, Reporter reporter)
    throws IOException {
  String url = value.toString(); // value is line of text

  if (url != null && url.trim().startsWith("#")) {
    /* Ignore line that start with # */
    return;
  }

  // if tabs : metadata that could be stored
  // must be name=value and separated by \t
  float customScore = -1f;
  int customInterval = interval;
  int fixedInterval = -1;
  Map<String, String> metadata = new TreeMap<String, String>();
  /*
   * A line in the seed file may carry not only the url to fetch but also
   * per-url settings, e.g.:
   *   www.163.com  nutch.score=2.5f  nutch.fetchInterval=10
   */
  if (url.indexOf("\t") != -1) {
    String[] splits = url.split("\t");
    // the url itself
    url = splits[0];
    // the url's metadata
    for (int s = 1; s < splits.length; s++) {
      // find separation between name and value
      int indexEquals = splits[s].indexOf("=");
      if (indexEquals == -1) {
        // skip anything without a =
        continue;
      }
      String metaname = splits[s].substring(0, indexEquals);
      String metavalue = splits[s].substring(indexEquals + 1);
      // custom score for this url
      if (metaname.equals(nutchScoreMDName)) {
        try { customScore = Float.parseFloat(metavalue); }
        catch (NumberFormatException nfe) {}
      }
      // custom fetch interval
      else if (metaname.equals(nutchFetchIntervalMDName)) {
        try { customInterval = Integer.parseInt(metavalue); }
        catch (NumberFormatException nfe) {}
      }
      // fixed fetch interval
      else if (metaname.equals(nutchFixedFetchIntervalMDName)) {
        try { fixedInterval = Integer.parseInt(metavalue); }
        catch (NumberFormatException nfe) {}
      }
      else metadata.put(metaname, metavalue);
    }
  }
  try {
    // normalize the url
    url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
    // filter the url; returns null if the url is filtered out
    url = filters.filter(url); // filter the url
  } catch (Exception e) {
    if (LOG.isWarnEnabled()) { LOG.warn("Skipping " + url + ":" + e); }
    url = null;
  }
  if (url == null) {
    /*
     * If the group does not exist yet it is created, and so is the counter;
     * add one to the count of filtered urls.
     */
    reporter.getCounter("injector", "urls_filtered").increment(1);
  } else { // if it passes
    value.set(url); // collect it
    CrawlDatum datum = new CrawlDatum();
    // url status
    datum.setStatus(CrawlDatum.STATUS_INJECTED);
    // Is interval custom? Then set as meta data
    if (fixedInterval > -1) {
      // Set writable using float. Float is used by AdaptiveFetchSchedule
      datum.getMetaData().put(Nutch.WRITABLE_FIXED_INTERVAL_KEY,
          new FloatWritable(fixedInterval));
      datum.setFetchInterval(fixedInterval);
    } else {
      datum.setFetchInterval(customInterval);
    }
    // current time
    datum.setFetchTime(curTime);
    // now add the metadata
    Iterator<String> keysIter = metadata.keySet().iterator();
    while (keysIter.hasNext()) {
      String keymd = keysIter.next();
      String valuemd = metadata.get(keymd);
      datum.getMetaData().put(new Text(keymd), new Text(valuemd));
    }
    if (customScore != -1) datum.setScore(customScore);
    else datum.setScore(scoreInjected);
    try {
      // let the scoring filter plugins adjust the initial score of the injected url
      scfilters.injectedScore(value, datum);
    } catch (ScoringFilterException e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Cannot filter injected score for url " + url
            + ", using default (" + e.getMessage() + ")");
      }
    }
    // add one to the count of successfully injected urls
    reporter.getCounter("injector", "urls_injected").increment(1);
    output.collect(value, datum);
  }
}
The mapper's main work is normalizing and filtering the URLs; the surviving URLs are written to the temporary directory for the reducer of the merge job to consume.
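As the mapper code shows, a seed line may carry tab-separated name=value pairs after the URL; nutch.score and nutch.fetchInterval (the names used in the comment above) set a custom score and fetch interval, and any other pair is stored as-is in the CrawlDatum metadata. A seed file might therefore look like this, with <TAB> standing for a real tab character and the URLs and extra key purely illustrative:

http://www.163.com/<TAB>nutch.score=2.5<TAB>nutch.fetchInterval=10
http://nutch.apache.org/<TAB>category=docs
http://example.com/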
InjectReducer is fairly simple: it merges the injected URLs with any entries already in the crawldb and updates their status.
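A minimal sketch of that merge, using the old mapred API seen throughout this post; it paraphrases the behavior rather than reproducing the actual InjectReducer source:

// Sketch: for each url key, prefer the CrawlDatum already in the crawldb so that
// re-injecting a url does not wipe out its fetch history.
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.nutch.crawl.CrawlDatum;

public class InjectReducerSketch
    implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {

  public void configure(JobConf job) {}
  public void close() {}

  public void reduce(Text key, Iterator<CrawlDatum> values,
      OutputCollector<Text, CrawlDatum> output, Reporter reporter)
      throws IOException {
    CrawlDatum old = null;       // entry already present in the crawldb
    CrawlDatum injected = null;  // entry produced by InjectMapper
    while (values.hasNext()) {
      CrawlDatum val = values.next();
      if (val.getStatus() == CrawlDatum.STATUS_INJECTED) {
        injected = new CrawlDatum();
        injected.set(val);
        // a freshly injected url enters the db as "unfetched"
        injected.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
      } else {
        old = new CrawlDatum();
        old.set(val);
      }
    }
    // keep the existing entry if there is one, otherwise emit the injected one
    output.collect(key, old != null ? old : injected);
  }
}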
One open question: why is this split into two MapReduce jobs? Is it only so that the standard crawldb job setup (CrawlDb.createJob) can be reused?