mapred包升级为mapreduce包后,一个NutchJob的主要修改

将引用的包从 org.apache.hadoop.mapred(旧 API)改为 org.apache.hadoop.mapreduce(新 API)后,一个 NutchJob 需要做的相关代码修改如下。

1. Job设置和运行

旧API

    // mapred 包中有 JobConf、JobClient,在 mapreduce 包中都取消了
    /**
     * Configures and runs the job using the old {@code mapred} API.
     *
     * <p>The job is described by a {@code JobConf} and run through
     * {@code JobClient.runJob}; the returned {@code RunningJob} is then
     * checked for success.
     *
     * @throws Exception if running the job throws, or if the finished job
     *         reports that it was not successful
     */
    public void myTask() throws Exception {
        JobConf job = new NutchJob(getConf());
        job.setJobName("MyTool");
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Custom settings are written directly onto the JobConf.
        job.set(key, value);

        // Exceptions propagate to the caller unchanged; a catch block that
        // only rethrows would add nothing.
        RunningJob runningJob = JobClient.runJob(job);
        if (!runningJob.isSuccessful()) {
            throw new Exception("@@JOB FAILED");
        }
    }

新API

    // mapreduce 包中配置使用 Configuration 类,运行通过 Job 类
    /**
     * Configures and runs the job using the new {@code mapreduce} API.
     *
     * <p>Settings live in the {@code Configuration} obtained from the
     * {@code Job}, and the job is run with {@code Job.waitForCompletion}.
     *
     * @throws Exception if running the job throws, or if
     *         {@code waitForCompletion} returns {@code false}
     */
    public void myTask() throws Exception {
        Job job = NutchJob.getInstance(getConf());
        Configuration conf = job.getConfiguration();
        job.setJobName("MyTool");
        job.setInputFormatClass(SequenceFileInputFormat.class);
        // Locates the job jar from the class that is passed in.
        job.setJarByClass(MyTool.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Custom settings go onto the job's Configuration.
        conf.set(key, value);

        // Exceptions propagate to the caller unchanged; a catch block that
        // only rethrows would add nothing.
        boolean success = job.waitForCompletion(true);
        if (!success) {
            throw new Exception("@@JOB FAILED");
        }
    }

2. Mapper

	// 旧 API 中 Mapper 和 Reducer 是接口
    /**
     * Old-API mapper: implements the {@code Mapper} interface and emits
     * records through an {@code OutputCollector}.
     *
     * <p>The type arguments are spelled out so that {@code map} really
     * overrides the interface method; with the raw {@code Mapper} type the
     * {@code @Override} annotation is rejected by the compiler.
     */
    public static class MyMapper implements Mapper<Text, CrawlDatum, Text, Text> {
        /** No per-job settings are needed by this example mapper. */
        @Override
        public void configure(JobConf job) {
            // nothing to configure
        }

        /**
         * Maps one input record to one output record.
         *
         * @param key input key
         * @param value input value
         * @param output collector that receives the emitted pair
         * @param reporter progress reporter (unused here)
         * @throws IOException if the collector fails to accept the pair
         */
        @Override
        public void map(Text key, CrawlDatum value, OutputCollector<Text, Text> output,
                Reporter reporter) throws IOException {
            // Placeholder values so the locals are definitely assigned;
            // replace them with the real mapping logic.
            String mapOutputKey = key.toString();
            String mapOutputValue = value.toString();
            // some operations
            output.collect(new Text(mapOutputKey), new Text(mapOutputValue));
        }

        /** No resources to release in this example mapper. */
        @Override
        public void close() throws IOException {
            // nothing to close
        }
    }
	// 新 API 中 Mapper 和 Reducer 是抽象类
    // 新 API 广泛使用 Context ,允许用户代码与 MapReduce 系统进行通信。
    /**
     * New-API mapper: extends the {@code Mapper} class and emits records
     * through the {@code Context}.
     *
     * <p>The type arguments are spelled out so that {@code map} really
     * overrides the base-class method; with the raw {@code Mapper} type the
     * {@code @Override} annotation is rejected by the compiler.
     */
    public static class MyMapper extends Mapper<Text, CrawlDatum, Text, Text> {
        /**
         * Maps one input record to one output record.
         *
         * @param key input key
         * @param value input value
         * @param context context used to write the emitted pair
         * @throws IOException if writing the pair fails
         * @throws InterruptedException if the write is interrupted
         */
        @Override
        public void map(Text key, CrawlDatum value, Context context)
                throws IOException, InterruptedException {
            // Placeholder values so the locals are definitely assigned;
            // replace them with the real mapping logic.
            String mapOutputKey = key.toString();
            String mapOutputValue = value.toString();
            // some operations
            context.write(new Text(mapOutputKey), new Text(mapOutputValue));
        }
    }

3. Reducer

    // 旧 API 通过实现 JobConfigurable 接口的 configure(JobConf) 方法读取作业配置
	public static class MyReducer implements Reducer{
        @Override
        public void reduce(Text key, Iterator values, OutputCollector output, Reporter reporter) 
        		throws IOException {
            String reduceOutputKey;
            String reduceOutoutValue;
            while (values.hasNext()){
                // some operations
            }
            output.collect(new Text(reduceOutputKey), new Text(reduceOutoutValue));
        }
        @Override
        public void configure(JobConf job) {
            String someValue= job.get(SOME_KEY);
        }
}
    // 新 API 的配置函数 setup 在Mapper 和 Reducer 中,通过 context 传递上下文
    /**
     * New-API reducer: extends the {@code Reducer} class, reads its settings
     * in {@code setup(Context)} and emits records through the {@code Context}.
     *
     * <p>The type arguments are spelled out so that {@code reduce} really
     * overrides the base-class method and so that {@code values} can be
     * iterated as {@code Text}; with raw types neither compiles.
     */
    public static class MyReducer extends Reducer<Text, Text, Text, Text> {
        /** Value of {@code SOME_KEY}, captured in {@link #setup(Context)}. */
        private String someValue;

        /**
         * Reduces all values of one key to a single output record.
         *
         * @param key the key shared by every value in {@code values}
         * @param values the values grouped under {@code key}
         * @param context context used to write the emitted pair
         * @throws IOException if writing the pair fails
         * @throws InterruptedException if the write is interrupted
         */
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Placeholder values so the locals are definitely assigned;
            // replace them with the real reduce logic.
            String reduceOutputKey = key.toString();
            String reduceOutputValue = "";
            for (Text value : values) {
                // some operations on value
            }
            context.write(new Text(reduceOutputKey), new Text(reduceOutputValue));
        }

        /**
         * Reads the job settings this reducer needs from the context.
         *
         * @param context context giving access to the job configuration
         */
        @Override
        public void setup(Context context) {
            Configuration conf = context.getConfiguration();
            // Kept in a field so that reduce() can use it; a local variable
            // would be discarded as soon as this method returns.
            someValue = conf.get(SOME_KEY);
        }
    }

你可能感兴趣的:(Java)