Usage: LinkDb <linkdb> (-dir <segmentsDir> | <seg1> <seg2> ...) [-force] [-noNormalize] [-noFilter]
	linkdb	output LinkDb to create or update
	-dir segmentsDir	parent directory of several segments, OR
	seg1 seg2 ...	list of segment directories
	-force	force update even if LinkDb appears to be locked (CAUTION advised)
	-noNormalize	don't normalize link URLs
	-noFilter	don't apply URLFilters to link URLs
lemo@debian:~/Workspace/java/Apache/Nutch/nutch-1.3$ bin/nutch invertlinks db/linkdb/ db/segments/20110822105243/
LinkDb: starting at 2011-08-29 09:21:36
LinkDb: linkdb: db/linkdb
LinkDb: URL normalize: true
LinkDb: URL filter: true
LinkDb: adding segment: db/segments/20110822105243   // adding the new segment
LinkDb: merging with existing linkdb: db/linkdb      // merging with the existing linkdb
LinkDb: finished at 2011-08-29 09:21:40, elapsed: 00:00:03
// create a new MapReduce job
JobConf job = LinkDb.createJob(getConf(), linkDb, normalize, filter);
// add each segment's parse_data directory to the input paths; there may be several
for (int i = 0; i < segments.length; i++) {
  if (LOG.isInfoEnabled()) {
    LOG.info("LinkDb: adding segment: " + segments[i]);
  }
  FileInputFormat.addInputPath(job, new Path(segments[i], ParseData.DIR_NAME));
}
// submit the MapReduce job
try {
  JobClient.runJob(job);
} catch (IOException e) {
  LockUtil.removeLockFile(fs, lock);
  throw e;
}
private static JobConf createJob(Configuration config, Path linkDb,
    boolean normalize, boolean filter) {
  // generate a temporary output directory
  Path newLinkDb = new Path("linkdb-"
      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  JobConf job = new NutchJob(config);
  job.setJobName("linkdb " + linkDb);
  // configure the input format
  job.setInputFormat(SequenceFileInputFormat.class);

  // configure the Mapper and Combiner classes
  job.setMapperClass(LinkDb.class);
  job.setCombinerClass(LinkDbMerger.class);
  // if filtering or normalization is requested and no old linkdb directory exists
  // (i.e. the merge job won't run), perform normalization/filtering in this job
  if (normalize || filter) {
    try {
      FileSystem fs = FileSystem.get(config);
      if (!fs.exists(linkDb)) {
        job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
        job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
      }
    } catch (Exception e) {
      LOG.warn("LinkDb createJob: " + e);
    }
  }
  job.setReducerClass(LinkDbMerger.class);

  // configure the MapReduce output path
  FileOutputFormat.setOutputPath(job, newLinkDb);
  // configure the output format
  job.setOutputFormat(MapFileOutputFormat.class);
  // compress the output (note: mapred.output.compress applies to the job output,
  // not to the intermediate map output)
  job.setBoolean("mapred.output.compress", true);
  // configure the output <key, value> types
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Inlinks.class);

  return job;
}
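createJob registers LinkDb.class itself as the Mapper, and the map phase is where the inversion actually happens: each outlink recorded in a page's ParseData as fromUrl → toUrl is flipped into an inlinks entry keyed by toUrl, carrying the source URL and the anchor text. A minimal, self-contained sketch of that idea in plain Java; the Outlink record and invert method below are illustrative stand-ins, not the Nutch classes:

import java.util.*;

public class InvertSketch {
  // one outgoing link: target URL plus the anchor text of the <a> tag
  record Outlink(String toUrl, String anchor) {}

  // invert: (fromUrl -> outlinks) becomes (toUrl -> list of "fromUrl anchor")
  static Map<String, List<String>> invert(Map<String, List<Outlink>> pages) {
    Map<String, List<String>> inlinks = new HashMap<>();
    for (Map.Entry<String, List<Outlink>> page : pages.entrySet()) {
      String fromUrl = page.getKey();
      for (Outlink out : page.getValue()) {
        inlinks.computeIfAbsent(out.toUrl(), k -> new ArrayList<>())
               .add("fromUrl: " + fromUrl + " anchor: " + out.anchor());
      }
    }
    return inlinks;
  }

  public static void main(String[] args) {
    Map<String, List<Outlink>> pages = Map.of(
        "http://www.baidu.com/",
        List.of(new Outlink("http://baike.baidu.com/", "百科"),
                new Outlink("http://hi.baidu.com/", "空间")));
    invert(pages).forEach((to, in) -> System.out.println(to + " Inlinks: " + in));
  }
}

The output of this toy inversion mirrors the structure of the readlinkdb dump shown later: one toUrl per entry, followed by its inlink sources and anchors.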
if (fs.exists(currentLinkDb)) {
  // if an old inverted-link database exists, merge with it
  if (LOG.isInfoEnabled()) {
    LOG.info("LinkDb: merging with existing linkdb: " + linkDb);
  }
  // try to merge
  Path newLinkDb = FileOutputFormat.getOutputPath(job);
  job = LinkDbMerger.createMergeJob(getConf(), linkDb, normalize, filter);
  // add the input paths: the existing db and the freshly built one
  FileInputFormat.addInputPath(job, currentLinkDb);
  FileInputFormat.addInputPath(job, newLinkDb);
  try {
    JobClient.runJob(job);
  } catch (IOException e) {
    LockUtil.removeLockFile(fs, lock);
    fs.delete(newLinkDb, true);
    throw e;
  }
  fs.delete(newLinkDb, true);
}
LinkDb.install(job, linkDb); // install the newly generated inverted-link database
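The final install step promotes the freshly built database: the previous current directory is set aside as old, the new job output is renamed into its place, and the lock file is removed. A rough sketch of that rotation, assuming the usual current/old layout and ".locked" lock-file name of Nutch databases (an illustration of the idea, not the exact 1.3 source):

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.nutch.util.LockUtil;

public static void install(JobConf job, Path linkDb) throws IOException {
  Path newLinkDb = FileOutputFormat.getOutputPath(job);
  FileSystem fs = new JobClient(job).getFs();
  Path old = new Path(linkDb, "old");
  Path current = new Path(linkDb, "current");
  if (fs.exists(current)) {          // keep the previous version as "old"
    if (fs.exists(old)) fs.delete(old, true);
    fs.rename(current, old);
  }
  fs.mkdirs(linkDb);
  fs.rename(newLinkDb, current);     // promote the new db to "current"
  if (fs.exists(old)) fs.delete(old, true);
  LockUtil.removeLockFile(fs, new Path(linkDb, ".locked")); // release the lock
}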
public static JobConf createMergeJob(Configuration config, Path linkDb,
    boolean normalize, boolean filter) {
  // generate a temporary output directory
  Path newLinkDb = new Path("linkdb-merge-"
      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  JobConf job = new NutchJob(config);
  job.setJobName("linkdb merge " + linkDb);

  // configure the input format
  job.setInputFormat(SequenceFileInputFormat.class);

  // configure the Mapper and Reducer classes; the Reducer is the same as above,
  // aggregating the values for each key (toUrl) and emitting at most a configured
  // number of them, while LinkDbFilter presumably filters and normalizes the URLs
  // in both keys and values
  job.setMapperClass(LinkDbFilter.class);
  job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
  job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
  job.setReducerClass(LinkDbMerger.class);

  // configure the output path
  FileOutputFormat.setOutputPath(job, newLinkDb);
  job.setOutputFormat(MapFileOutputFormat.class);
  job.setBoolean("mapred.output.compress", true);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Inlinks.class);

  return job;
}
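The aggregation done by LinkDbMerger's reduce method is straightforward: concatenate all Inlinks that arrived for one toUrl and stop once a configured cap (the db.max.inlinks setting) is reached. A hedged sketch of that logic against the old mapred API, with maxInlinks assumed to be a field read from db.max.inlinks in configure():

import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.nutch.crawl.Inlink;
import org.apache.nutch.crawl.Inlinks;

// inside LinkDbMerger
public void reduce(Text key, Iterator<Inlinks> values,
    OutputCollector<Text, Inlinks> output, Reporter reporter) throws IOException {
  Inlinks result = new Inlinks();
  while (values.hasNext()) {
    Inlinks inlinks = values.next();
    // copy at most (maxInlinks - result.size()) entries from this chunk
    int end = Math.min(maxInlinks - result.size(), inlinks.size());
    Iterator<Inlink> it = inlinks.iterator();
    int i = 0;
    while (it.hasNext() && i++ < end) {
      result.add(it.next());
    }
  }
  if (result.size() == 0) return; // nothing to emit for this key
  output.collect(key, result);
}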
The readlinkdb command is mainly used to dump the contents of the linkdb to a specified directory. Its help output is as follows:
Usage: LinkDbReader <linkdb> (-dump <out_dir> | -url <url>)
	-dump <out_dir>	dump whole link db to a text file in <out_dir>
	-url <url>	print information about <url> to System.out
lemo@debian:~/Workspace/java/Apache/Nutch/nutch-1.3$ bin/nutch readlinkdb db/linkdb/ -dump output2
LinkDb dump: starting at 2011-08-29 09:54:08
LinkDb dump: db: db/linkdb/
LinkDb dump: finished at 2011-08-29 09:54:09, elapsed: 00:00:01
lemo@debian:~/Workspace/java/Apache/Nutch/nutch-1.3$ head output2/part-00000
http://baike.baidu.com/	Inlinks:
 fromUrl: http://www.baidu.com/ anchor: 百科
http://hi.baidu.com/	Inlinks:
 fromUrl: http://www.baidu.com/ anchor: 空间
http://hi.baidu.com/baidu/	Inlinks:
 fromUrl: http://www.baidu.com/ anchor:
http://home.baidu.com/	Inlinks:
The LinkDbMerger tool is mainly used to merge several different linkdb databases into one. Its usage is as follows (see the example command after the help text):
Usage: LinkDbMerger <output_linkdb> <linkdb1> [<linkdb2> <linkdb3> ...] [-normalize] [-filter]
	output_linkdb	output LinkDb
	linkdb1 ...	input LinkDb-s (single input LinkDb is ok)
	-normalize	use URLNormalizer on both fromUrls and toUrls in linkdb(s) (usually not needed)
	-filter	use URLFilters on both fromUrls and toUrls in linkdb(s)
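For example, merging two linkdbs into a fresh one might look like the following; the bin/nutch script exposes this class as the mergelinkdb command, and the paths here are illustrative:

lemo@debian:~/Workspace/java/Apache/Nutch/nutch-1.3$ bin/nutch mergelinkdb db/linkdb_merged db/linkdb db/linkdb2 -filter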