Reducer的实现
map任务读取数据,解析数据,按照键值将数据分成一组一组的,reduce任务收集map任务的输出,通过合并、排序和归约三个过程对map的输出数据进行进一步的处理。现在我们只关心归约过程即reduce函数的实现。
实际上我们不用重新去实现,只需继承Hadoop提供的Reducer类即可,Reducer类的几个主要函数如下:protected void setup(Context context ) throws IOException, InterruptedException { //添加自己的初始化程序,比如读取作业的配置,自定义参数,读取DistributedCache等 } protected void reduce(KEYIN key, Iterable<VALUEIN> values, Context context ) throws IOException, InterruptedException { //reduce主要业务流,下面的是默认实现,即老版本的IdentityReducer,数据原样输出。通过覆写,实现自己的业务流程 for(VALUEIN value: values) { context.write((KEYOUT) key, (VALUEOUT) value); } } protected void cleanup(Context context ) throws IOException, InterruptedException { // 所有的清理操作 }
/**
 * Reducer that rewrites each incoming (key, value) pair into a new pair built
 * from selected fields.
 *
 * <p>The field separator and the output key/value specification are read from
 * the job configuration in {@link #setup}; the actual field extraction is
 * delegated to {@code FieldSelectionHelper}.
 */
public class FieldSelectionReducer<K, V> extends Reducer<Text, Text, Text, Text> {

  public static final Log LOG = LogFactory.getLog("FieldSelectionMapReduce");

  /** Separator placed between the key and the value, and handed to the helper. */
  private String fieldSeparator = "\t";

  /** Raw output specification string as read from the configuration. */
  private String reduceOutputKeyValueSpec;

  /** Populated by {@code FieldSelectionHelper.parseOutputKeyValueSpec} during setup. */
  private List<Integer> reduceOutputKeyFieldList = new ArrayList<Integer>();

  /** Populated by {@code FieldSelectionHelper.parseOutputKeyValueSpec} during setup. */
  private List<Integer> reduceOutputValueFieldList = new ArrayList<Integer>();

  /**
   * Value returned by {@code parseOutputKeyValueSpec}; -1 until setup runs.
   * NOTE(review): presumably the index from which all remaining fields go to
   * the output value -- confirm against FieldSelectionHelper.
   */
  private int allReduceValueFieldsFrom = -1;

  /**
   * Loads the field separator (default {@code "\t"}) and the reduce output
   * specification (default {@code "0-:"}) from the job configuration, then
   * parses the specification into the key/value field lists.
   *
   * @param context task context supplying the job configuration
   * @throws IOException declared by the reducer contract
   * @throws InterruptedException declared by the reducer contract
   */
  public void setup(Context context) throws IOException, InterruptedException {
    Configuration jobConf = context.getConfiguration();

    fieldSeparator = jobConf.get(FieldSelectionHelper.DATA_FIELD_SEPERATOR, "\t");
    reduceOutputKeyValueSpec =
        jobConf.get(FieldSelectionHelper.REDUCE_OUTPUT_KEY_VALUE_SPEC, "0-:");

    int parsedValueFieldsFrom =
        FieldSelectionHelper.parseOutputKeyValueSpec(
            reduceOutputKeyValueSpec,
            reduceOutputKeyFieldList,
            reduceOutputValueFieldList);
    allReduceValueFieldsFrom = parsedValueFieldsFrom;
  }

  /**
   * Emits one output record per input value: the key text (with the separator
   * appended) and the value text are passed to a fresh
   * {@code FieldSelectionHelper}, whose resulting key and value are written to
   * the context.
   *
   * @param key the grouping key for this call
   * @param values all values associated with {@code key}
   * @param context task context used to write the output records
   * @throws IOException if writing to the context fails
   * @throws InterruptedException if writing to the context is interrupted
   */
  public void reduce(Text key, Iterable<Text> values, Context context)
      throws IOException, InterruptedException {
    // Computed once per key: the key text followed by the separator.
    final String keyWithSeparator = key.toString() + fieldSeparator;

    for (Text current : values) {
      // A new helper is used for every value, so no state carries over between records.
      FieldSelectionHelper selection = new FieldSelectionHelper();
      // NOTE(review): the meaning of the two trailing boolean flags is defined by
      // FieldSelectionHelper.extractOutputKeyValue -- confirm there.
      selection.extractOutputKeyValue(
          keyWithSeparator,
          current.toString(),
          fieldSeparator,
          reduceOutputKeyFieldList,
          reduceOutputValueFieldList,
          allReduceValueFieldsFrom,
          false,
          false);
      context.write(selection.getKey(), selection.getValue());
    }
  }
}