Avro 数据序列化

Apache Avro 是一个独立于编程语言的数据序列化系统,它旨在解决 Hadoop 中 Writable 类型的不足:缺乏语言可移植性。Avro 模式(schema)通常用 JSON 编写,数据通常采用二进制格式编码。


Avro 的使用可以分为两种:编译Schema和非编译Schema

  • 定义schema:emp.avsc
{
	"namespace": "tutorialspoint.com",
	"type": "record",
	"name": "emp",
	"fields": [
		{"name": "name", "type": "string"},
		{"name": "id", "type": "int"},
		{"name": "salary", "type": "int"},
		{"name": "age", "type": "int"},
		{"name": "address", "type": "string"}
	]
}
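  • 编译Schema,生成Java类,并将源码导入idea(注意:下面生成的 Employee 类以及后续读写示例对应的 schema 是 namespace 为 "Tutorialspoint"、name 为 "Employee"、字段为 Name(string) 和 age(int) 的版本;若要复现这些示例,emp.avsc 需与之保持一致)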
// . 表示当前目录
cmd> java -jar avro-tools-1.8.2.jar compile schema emp.avsc .



package Tutorialspoint;

import org.apache.avro.specific.SpecificData;
import org.apache.avro.message.BinaryMessageEncoder;
import org.apache.avro.message.BinaryMessageDecoder;
import org.apache.avro.message.SchemaStore;

public class Employee extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord {
  private static final long serialVersionUID = -8873171083721622992L;
  public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"Employee\",\"namespace\":\"Tutorialspoint\",\"fields\":[{\"name\":\"Name\",\"type\":\"string\"},{\"name\":\"age\",\"type\":\"int\"}]}");
  public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; }

  private static SpecificData MODEL$ = new SpecificData();

  private static final BinaryMessageEncoder<Employee> ENCODER =
      new BinaryMessageEncoder<Employee>(MODEL$, SCHEMA$);

  private static final BinaryMessageDecoder<Employee> DECODER =
      new BinaryMessageDecoder<Employee>(MODEL$, SCHEMA$);

   * Return the BinaryMessageDecoder instance used by this class.
  public static BinaryMessageDecoder<Employee> getDecoder() {
    return DECODER;

   * Create a new BinaryMessageDecoder instance for this class that uses the specified {@link SchemaStore}.
   * @param resolver a {@link SchemaStore} used to find schemas by fingerprint
  public static BinaryMessageDecoder<Employee> createDecoder(SchemaStore resolver) {
    return new BinaryMessageDecoder<Employee>(MODEL$, SCHEMA$, resolver);

  /** Serializes this Employee to a ByteBuffer. */
  public java.nio.ByteBuffer toByteBuffer() throws java.io.IOException {
    return ENCODER.encode(this);

  /** Deserializes a Employee from a ByteBuffer. */
  public static Employee fromByteBuffer(
      java.nio.ByteBuffer b) throws java.io.IOException {
    return DECODER.decode(b);

  @Deprecated public CharSequence Name;
  @Deprecated public int age;

   * Default constructor.  Note that this does not initialize fields
   * to their default values from the schema.  If that is desired then
   * one should use newBuilder().
  public Employee() {}

   * All-args constructor.
   * @param Name The new value for Name
   * @param age The new value for age
  public Employee(CharSequence Name, Integer age) {
    this.Name = Name;
    this.age = age;

  public org.apache.avro.Schema getSchema() { return SCHEMA$; }
  // Used by DatumWriter.  Applications should not call.
  public Object get(int field$) {
    switch (field$) {
    case 0: return Name;
    case 1: return age;
    default: throw new org.apache.avro.AvroRuntimeException("Bad index");

  // Used by DatumReader.  Applications should not call.
  public void put(int field$, Object value$) {
    switch (field$) {
    case 0: Name = (CharSequence)value$; break;
    case 1: age = (Integer)value$; break;
    default: throw new org.apache.avro.AvroRuntimeException("Bad index");

   * Gets the value of the 'Name' field.
   * @return The value of the 'Name' field.
  public CharSequence getName() {
    return Name;

   * Sets the value of the 'Name' field.
   * @param value the value to set.
  public void setName(CharSequence value) {
    this.Name = value;

   * Gets the value of the 'age' field.
   * @return The value of the 'age' field.
  public Integer getAge() {
    return age;

   * Sets the value of the 'age' field.
   * @param value the value to set.
  public void setAge(Integer value) {
    this.age = value;

   * Creates a new Employee RecordBuilder.
   * @return A new Employee RecordBuilder
  public static Builder newBuilder() {
    return new Builder();

   * Creates a new Employee RecordBuilder by copying an existing Builder.
   * @param other The existing builder to copy.
   * @return A new Employee RecordBuilder
  public static Builder newBuilder(Builder other) {
    return new Builder(other);

   * Creates a new Employee RecordBuilder by copying an existing Employee instance.
   * @param other The existing instance to copy.
   * @return A new Employee RecordBuilder
  public static Builder newBuilder(Employee other) {
    return new Builder(other);

   * RecordBuilder for Employee instances.
  public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBase<Employee>
    implements org.apache.avro.data.RecordBuilder<Employee> {

    private CharSequence Name;
    private int age;

    /** Creates a new Builder */
    private Builder() {

     * Creates a Builder by copying an existing Builder.
     * @param other The existing Builder to copy.
    private Builder(Builder other) {
      if (isValidValue(fields()[0], other.Name)) {
        this.Name = data().deepCopy(fields()[0].schema(), other.Name);
        fieldSetFlags()[0] = true;
      if (isValidValue(fields()[1], other.age)) {
        this.age = data().deepCopy(fields()[1].schema(), other.age);
        fieldSetFlags()[1] = true;

     * Creates a Builder by copying an existing Employee instance
     * @param other The existing instance to copy.
    private Builder(Employee other) {
      if (isValidValue(fields()[0], other.Name)) {
        this.Name = data().deepCopy(fields()[0].schema(), other.Name);
        fieldSetFlags()[0] = true;
      if (isValidValue(fields()[1], other.age)) {
        this.age = data().deepCopy(fields()[1].schema(), other.age);
        fieldSetFlags()[1] = true;

      * Gets the value of the 'Name' field.
      * @return The value.
    public CharSequence getName() {
      return Name;

      * Sets the value of the 'Name' field.
      * @param value The value of 'Name'.
      * @return This builder.
    public Builder setName(CharSequence value) {
      validate(fields()[0], value);
      this.Name = value;
      fieldSetFlags()[0] = true;
      return this;

      * Checks whether the 'Name' field has been set.
      * @return True if the 'Name' field has been set, false otherwise.
    public boolean hasName() {
      return fieldSetFlags()[0];

      * Clears the value of the 'Name' field.
      * @return This builder.
    public Builder clearName() {
      Name = null;
      fieldSetFlags()[0] = false;
      return this;

      * Gets the value of the 'age' field.
      * @return The value.
    public Integer getAge() {
      return age;

      * Sets the value of the 'age' field.
      * @param value The value of 'age'.
      * @return This builder.
    public Builder setAge(int value) {
      validate(fields()[1], value);
      this.age = value;
      fieldSetFlags()[1] = true;
      return this;

      * Checks whether the 'age' field has been set.
      * @return True if the 'age' field has been set, false otherwise.
    public boolean hasAge() {
      return fieldSetFlags()[1];

      * Clears the value of the 'age' field.
      * @return This builder.
    public Builder clearAge() {
      fieldSetFlags()[1] = false;
      return this;

    public Employee build() {
      try {
        Employee record = new Employee();
        record.Name = fieldSetFlags()[0] ? this.Name : (CharSequence) defaultValue(fields()[0]);
        record.age = fieldSetFlags()[1] ? this.age : (Integer) defaultValue(fields()[1]);
        return record;
      } catch (Exception e) {
        throw new org.apache.avro.AvroRuntimeException(e);

  private static final org.apache.avro.io.DatumWriter<Employee>
    WRITER$ = (org.apache.avro.io.DatumWriter<Employee>)MODEL$.createDatumWriter(SCHEMA$);

  @Override public void writeExternal(java.io.ObjectOutput out)
    throws java.io.IOException {
    WRITER$.write(this, SpecificData.getEncoder(out));

  private static final org.apache.avro.io.DatumReader<Employee>
    READER$ = (org.apache.avro.io.DatumReader<Employee>)MODEL$.createDatumReader(SCHEMA$);

  @Override public void readExternal(java.io.ObjectInput in)
    throws java.io.IOException {
    READER$.read(this, SpecificData.getDecoder(in));


  • 串行化
 * 串行化
public void write() throws IOException {
    SpecificDatumWriter empDatumWriter = new SpecificDatumWriter<Employee>(Employee.class);
    DataFileWriter<Employee> empFileWriter = new DataFileWriter<Employee>(empDatumWriter);
    Employee e = new Employee();
    empFileWriter.create(e.getSchema(), new File("F:/hadoop/avro/e.avro"));

  • 反串行化
 * 反串行化
public void read() throws IOException {
    SpecificDatumReader empDatumWriter = new SpecificDatumReader(Employee.class);
    DataFileReader<Employee> dataReader = new DataFileReader<Employee>(new File("F:/hadoop/avro/e.avro"), empDatumWriter);

    Iterator<Employee> it = dataReader.iterator();
    while(it.hasNext()) {

{"Name": "Bob", "age": 12}
{"Name": "Bob", "age": 12}
{"Name": "Bob", "age": 12}

可以不用编译 emp.avsc 文件,直接使用

  • 串行化
 * 直接使用Schema文件进行读写,不用编译
public void writeInSchme() throws IOException {
    Schema schema = new Schema.Parser().parse(new File("F:/hadoop/avro/emp.avsc"));

    GenericRecord e = new GenericData.Record(schema);
    e.put("Name", "tomas");
    e.put("age", 25);

    DatumWriter<GenericRecord> w1 = new SpecificDatumWriter(schema);
    DataFileWriter<GenericRecord> w2 = new DataFileWriter(w1);
    w2.create(schema, new File("F:/hadoop/avro/e2.avro"));
  • 反串行化
 * 反串行avro数据
public void readInSchema() throws  Exception {
    Schema schema = new Schema.Parser().parse(new File("F:/hadoop/avro/emp.avsc"));

    GenericRecord e1 = new GenericData.Record(schema);
    DatumReader r1 = new SpecificDatumReader (schema);
    DataFileReader r2 = new DataFileReader(new File("F:/hadoop/avro/e2.avro"),r1);
        GenericRecord rec = (GenericRecord)r2.next();

