【Avro三】Hadoop MapReduce读写Avro文件

Avro由Doug Cutting(此人绝对是神一般的存在)牵头开发。它从开发之初就是为了完善Hadoop生态系统的数据处理能力(例如Hadoop MapReduce中需要对数据进行序列化和反序列化的场景),因此Hadoop MapReduce集成Avro也就是自然而然的事情。

这个例子是一个简单的Hadoop MapReduce作业:读取Avro格式的源文件进行计数统计,然后将计算结果以Avro格式写到目标文件中。主要目的是体会Hadoop MapReduce操作Avro的基本流程以及Avro提供的API。


1. Maven依赖

        <!--avro core-->

        <!--avro rpc support-->

        <!--avro utilities for Hadoop MapReduce to process avro files -->

        <!--Avro and Hadoop Map Reduce-->




2. MapReduce代码:

package examples.avro.mapreduce;

import examples.avro.simple.User;
import org.apache.avro.Schema;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.avro.mapreduce.AvroKeyValueOutputFormat;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

public class MapReduceColorCount extends Configured implements Tool {

    public static class ColorCountMapper extends
            Mapper<AvroKey<User>, NullWritable, Text, IntWritable> {

        public void map(AvroKey<User> key, NullWritable value, Context context)
                throws IOException, InterruptedException {

            CharSequence color = key.datum().getFavoriteColor();
            if (color == null) {
                color = "none";
            context.write(new Text(color.toString()), new IntWritable(1));

    public static class ColorCountReducer extends
            Reducer<Text, IntWritable, AvroKey<CharSequence>, AvroValue<Integer>> {

        public void reduce(Text key, Iterable<IntWritable> values,
                           Context context) throws IOException, InterruptedException {

            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            context.write(new AvroKey<CharSequence>(key.toString()), new AvroValue<Integer>(sum));

    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: MapReduceColorCount <input path> <output path>");
            return -1;

        Job job = new Job(getConf());
        job.setJobName("Color Count");

        FileInputFormat.setInputPaths(job, new Path(args[0]));

        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        //AvroKeyInputFormat: A MapReduce InputFormat that can handle Avro container files.
        AvroJob.setInputKeySchema(job, User.getClassSchema());

        //AvroKeyValueOutputFormat: FileOutputFormat for writing Avro container files of key/value pairs
        AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
        AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT));

        return (job.waitForCompletion(true) ? 0 : 1);

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new MapReduceColorCount(), args);



3. 主要类注释

3.1 AvroKey

/** The wrapper of keys for jobs configured with {@link AvroJob} . */


3.2 AvroValue

/** The wrapper of values for jobs configured with {@link AvroJob} . */


3.3 AvroJob

/** Setters to configure jobs for Avro data. */


3.4 AvroKeyInputFormat

/**
 * A MapReduce InputFormat that can handle Avro container files.
 * <p>Keys are AvroKey wrapper objects that contain the Avro data.  Since Avro
 * container files store only records (not key/value pairs), the value from
 * this InputFormat is a NullWritable.</p>
 */


3.5 AvroKeyValueOutputFormat

/**
 * FileOutputFormat for writing Avro container files of key/value pairs.
 * <p>Since Avro container files can only contain records (not key/value pairs), this
 * output format puts the key and value into an Avro generic record with two fields, named
 * 'key' and 'value'.</p>
 * <p>The keys and values given to this output format may be Avro objects wrapped in
 * <code>AvroKey</code> or <code>AvroValue</code> objects.  The basic Writable types are
 * also supported (e.g., IntWritable, Text); they will be converted to their corresponding
 * Avro types.</p>
 * @param <K> The type of key. If an Avro type, it must be wrapped in an <code>AvroKey</code>.
 * @param <V> The type of value. If an Avro type, it must be wrapped in an <code>AvroValue</code>.
 */



   * Sets the job input key schema.
   * @param job The job to configure.
   * @param schema The input key schema.
  public static void setInputKeySchema(Job job, Schema schema) {
    job.getConfiguration().set(CONF_INPUT_KEY_SCHEMA, schema.toString());

   * Sets the job input value schema.
   * @param job The job to configure.
   * @param schema The input value schema.
  public static void setInputValueSchema(Job job, Schema schema) {
    job.getConfiguration().set(CONF_INPUT_VALUE_SCHEMA, schema.toString());



   * Sets the map output key schema.
   * @param job The job to configure.
   * @param schema The map output key schema.
  public static void setMapOutputKeySchema(Job job, Schema schema) {
    AvroSerialization.setKeyWriterSchema(job.getConfiguration(), schema);
    AvroSerialization.setKeyReaderSchema(job.getConfiguration(), schema);

   * Sets the map output value schema.
   * @param job The job to configure.
   * @param schema The map output value schema.
  public static void setMapOutputValueSchema(Job job, Schema schema) {
    AvroSerialization.setValueWriterSchema(job.getConfiguration(), schema);
    AvroSerialization.setValueReaderSchema(job.getConfiguration(), schema);



