Serializing and Deserializing Common Java Data Types with Apache Avro

Background

In the big data world you constantly run into the need to serialize common data types into byte arrays, or to deserialize byte arrays back into those types. For example, Spark recommends Kryo; HBase provides its own utilities for serialization and deserialization, and internally uses Google's protobuf for network communication; Hadoop uses Apache Avro. Each serialization approach has its own performance and efficiency trade-offs (not compared in this article).
In all of those cases the framework does the serialization and deserialization for us. In real work, however, we sometimes have to do it by hand: for example, serializing a Java object into a byte array before storing it in HBase, or reading a byte array back out of HBase and deserializing it into an object. For needs like these, Avro can do the work for us.
Below are examples of serializing and deserializing the common types with Apache Avro.
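
All of the test methods below share one set of imports. For reference, this is roughly what they look like (a sketch based on Avro 1.8+ and JUnit 4; adjust to your versions):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.sql.Date;
import java.sql.Timestamp;
import java.time.LocalDate;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import org.apache.avro.Conversion;
import org.apache.avro.Conversions;
import org.apache.avro.LogicalType;
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.io.EncoderFactory;
import org.apache.avro.specific.SpecificData;
import org.apache.avro.specific.SpecificDatumReader;
import org.apache.avro.specific.SpecificDatumWriter;
import org.apache.avro.util.Utf8;
import org.junit.Assert;
import org.junit.Test;

import com.google.common.collect.Lists; // Guava, used only in the record example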

1. Integer
  @Test
  public void serdesIntType() throws IOException {
    // Build a schema for the int type
    Schema intSchema = SchemaBuilder.builder().intType();
    SpecificData specificData = SpecificData.get();
    DatumWriter writer = specificData.createDatumWriter(intSchema);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    
    // Serialize the Integer into a byte array
    writer.write(1, EncoderFactory.get().directBinaryEncoder(baos, null));
    byte[] bytes = baos.toByteArray();
    
    // Deserialize the byte array back into an Integer
    DatumReader reader = specificData.createDatumReader(intSchema);
    ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
    Object deResult = reader.read(null, DecoderFactory.get().directBinaryDecoder(bais, null));
    Assert.assertEquals(1, deResult);
  }
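A note on the encoder used above: directBinaryEncoder writes straight through to the output stream. If you switch to the buffered binaryEncoder variant (often a better fit for many small writes), you must flush it before reading the stream, or the byte array may be empty. A minimal sketch of the same write:

    // Buffered variant of the write above; the flush is mandatory
    BinaryEncoder buffered = EncoderFactory.get().binaryEncoder(baos, null);
    writer.write(1, buffered);
    buffered.flush();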
2. Long
  @Test
  public void serdesLongType() throws IOException {
    Schema longSchema = SchemaBuilder.builder().longType();
    SpecificData specificData = SpecificData.get();
    DatumWriter writer = specificData.createDatumWriter(longSchema);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    writer.write(1L, EncoderFactory.get().directBinaryEncoder(baos, null));
    byte[] bytes = baos.toByteArray();

    DatumReader reader = specificData.createDatumReader(longSchema);
    ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
    Object deResult = reader.read(null, DecoderFactory.get().directBinaryDecoder(bais, null));
    Assert.assertEquals(1L, deResult);
  }
3. Float
  @Test
  public void serdesFloatType() throws IOException {
    Schema floatSchema = SchemaBuilder.builder().floatType();
    SpecificData specificData = SpecificData.get();
    DatumWriter writer = specificData.createDatumWriter(floatSchema);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    writer.write((1.0f), EncoderFactory.get().directBinaryEncoder(baos, null));
    byte[] bytes = baos.toByteArray();

    DatumReader reader = specificData.createDatumReader(floatSchema);
    ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
    Object deResult = reader.read(null, DecoderFactory.get().directBinaryDecoder(bais, null));
    Assert.assertEquals(1.0f, deResult);
  }
4. Double
  @Test
  public void serdesDoubleType() throws IOException {
    Schema doubleSchema = SchemaBuilder.builder().doubleType();
    SpecificData specificData = SpecificData.get();
    DatumWriter writer = specificData.createDatumWriter(doubleSchema);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    writer.write((1.0d), EncoderFactory.get().directBinaryEncoder(baos, null));
    byte[] bytes = baos.toByteArray();

    DatumReader reader = specificData.createDatumReader(doubleSchema);
    ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
    Object deResult = reader.read(null, DecoderFactory.get().directBinaryDecoder(bais, null));
    Assert.assertEquals(1.0d, deResult);
  }
5. Boolean
  @Test
  public void serdesBooleanType() throws IOException {
    Schema booleanSchema = SchemaBuilder.builder().booleanType();
    SpecificData specificData = SpecificData.get();
    DatumWriter writer = specificData.createDatumWriter(booleanSchema);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    writer.write(true, EncoderFactory.get().directBinaryEncoder(baos, null));
    byte[] bytes = baos.toByteArray();

    DatumReader reader = specificData.createDatumReader(booleanSchema);
    ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
    Object deResult = reader.read(null, DecoderFactory.get().directBinaryDecoder(bais, null));
    Assert.assertEquals(true, deResult);
  }
6. String
  @Test
  public void serdesStringType() throws IOException {
    Schema stringSchema = SchemaBuilder.builder().stringType();
    SpecificData specificData = SpecificData.get();
    DatumWriter writer = specificData.createDatumWriter(stringSchema);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    writer.write("hello", EncoderFactory.get().directBinaryEncoder(baos, null));
    byte[] bytes = baos.toByteArray();

    DatumReader reader = specificData.createDatumReader(stringSchema);
    ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
    Utf8 deResult = (Utf8) reader.read(null, DecoderFactory.get().directBinaryDecoder(bais, null));
    Assert.assertEquals("hello", deResult.toString());
  }
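Note that Avro decodes the Avro string type as org.apache.avro.util.Utf8 rather than java.lang.String, which is why the assertion goes through toString(). If you would rather get a String back directly, one option (worth verifying against your Avro version) is to tag the schema with a string-type hint before creating the reader:

    // Ask Avro to decode strings as java.lang.String instead of Utf8
    GenericData.setStringType(stringSchema, GenericData.StringType.String);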
7. byte[]
  @Test
  public void serdesBytesType() throws IOException {
    Schema bytesSchema = SchemaBuilder.builder().bytesType();
    SpecificData specificData = SpecificData.get();
    DatumWriter writer = specificData.createDatumWriter(bytesSchema);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    writer.write(ByteBuffer.wrap("hello".getBytes()), EncoderFactory.get().directBinaryEncoder(baos, null));
    byte[] bytes = baos.toByteArray();

    DatumReader reader = specificData.createDatumReader(bytesSchema);
    ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
    ByteBuffer deResult = (ByteBuffer) reader.read(null, DecoderFactory.get().directBinaryDecoder(bais, null));
    Assert.assertTrue(Arrays.equals("hello".getBytes(), deResult.array()));
  }
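A small caveat on the assertion above: ByteBuffer.array() returns the entire backing array, which is exactly sized here only because the reader allocates a fresh buffer when the reuse argument is null. If you reuse buffers, a safer extraction respects the buffer's position and limit:

    // Defensive copy that works even when the backing array is larger than the content
    byte[] payload = new byte[deResult.remaining()];
    deResult.get(payload);
    Assert.assertArrayEquals("hello".getBytes(), payload);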
8. java.sql.Date

Note: Avro deserializes dates using the UTC time zone, and there is no way to change that. If the time zone used at serialization time is not UTC, the round-tripped result can be wrong, so we provide a custom conversion class for Avro to use internally.
Define SqlDateConversion, extending Avro's built-in Conversion base class.

public class SqlDateConversion extends Conversion<Date> {
  private static final LocalDate EPOCH = LocalDate.of(1970, 1, 1);

  public SqlDateConversion() {
  }

  @Override
  public Class<Date> getConvertedType() {
    return Date.class;
  }

  @Override
  public String getLogicalTypeName() {
    // Note: Avro's logical type name for dates is "date", so this must return "date"
    return "date";
  }

  @Override
  public Date fromInt(Integer value, Schema schema, LogicalType type) {
    LocalDate localDate = EPOCH.plusDays(value);
    return Date.valueOf(localDate);
  }

  @Override
  public Integer toInt(Date value, Schema schema, LogicalType type) {
    return Math.toIntExact(value.toLocalDate().toEpochDay());
  }
}

The serialization and deserialization code is as follows:

  @Test
  public void serdesDateType() throws IOException {
    Schema dateSchema = SchemaBuilder.builder().intType();
    LogicalTypes.date().addToSchema(dateSchema);
    SpecificData specificData = SpecificData.get();
    // Register our custom Conversion for Avro to use when converting dates internally
    specificData.addLogicalTypeConversion(new SqlDateConversion());
    DatumWriter writer = specificData.createDatumWriter(dateSchema);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    writer.write(Date.valueOf("2020-01-26"), EncoderFactory.get().directBinaryEncoder(baos, null));
    byte[] bytes = baos.toByteArray();

    DatumReader reader = specificData.createDatumReader(dateSchema);
    ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
    Object deResult = reader.read(null, DecoderFactory.get().directBinaryDecoder(bais, null));
    Assert.assertEquals(Date.valueOf("2020-01-26"), deResult);
  }
9. java.sql.Timestamp

As with Date, to avoid wrong results when the time zone Avro uses during deserialization differs from the one we used during serialization, we define a custom conversion class.

public class SqlTimestampConversion extends Conversion<Timestamp> {
  public SqlTimestampConversion() {
  }

  @Override
  public Class<Timestamp> getConvertedType() {
    return Timestamp.class;
  }

  @Override
  public String getLogicalTypeName() {
    // Note: for timestamps Avro defines two logical type names, timestamp-millis
    // and timestamp-micros; here we use timestamp-millis
    return "timestamp-millis";
  }

  @Override
  public Timestamp fromLong(Long value, Schema schema, LogicalType type) {
    return new Timestamp(value);
  }

  @Override
  public Long toLong(Timestamp value, Schema schema, LogicalType type) {
    return value.getTime();
  }
}

Serialization and deserialization:

  @Test
  public void serdesTimestampType() throws IOException {
    Schema timestampSchema = SchemaBuilder.builder().longType();
    LogicalTypes.timestampMillis().addToSchema(timestampSchema);
    SpecificData specificData = SpecificData.get();
    specificData.addLogicalTypeConversion(new SqlTimestampConversion());
    DatumWriter writer = specificData.createDatumWriter(timestampSchema);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    writer.write(Timestamp.valueOf("2021-01-25 17:39:46"), EncoderFactory.get().directBinaryEncoder(baos, null));
    byte[] bytes = baos.toByteArray();

    DatumReader reader = specificData.createDatumReader(timestampSchema);
    ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
    Object deResult = reader.read(null, DecoderFactory.get().directBinaryDecoder(bais, null));
    Assert.assertEquals(Timestamp.valueOf("2021-01-25 17:39:46"), deResult);
  }
10. BigDecimal
  @Test
  public void serdesDecimalType() throws IOException {
    Schema decimalSchema = SchemaBuilder.builder().bytesType();
    LogicalTypes.decimal(10, 2).addToSchema(decimalSchema);
    SpecificData specificData = SpecificData.get();
    // Be sure to register a decimal conversion; without it, serialization fails with a conversion exception
    specificData.addLogicalTypeConversion(new Conversions.DecimalConversion());
    DatumWriter writer = specificData.createDatumWriter(decimalSchema);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    writer.write(BigDecimal.valueOf(18.67), EncoderFactory.get().directBinaryEncoder(baos, null));
    byte[] bytes = baos.toByteArray();

    DatumReader reader = specificData.createDatumReader(decimalSchema);
    ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
    Object deResult = reader.read(null, DecoderFactory.get().directBinaryDecoder(bais, null));
    Assert.assertEquals(BigDecimal.valueOf(18.67), deResult);
  }
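One pitfall with the decimal logical type (based on how Conversions.DecimalConversion behaves; verify against your Avro version): the BigDecimal's scale must match the scale declared in the schema, otherwise Avro throws an AvroTypeException during writing. BigDecimal.valueOf(18.67) happens to have scale 2, matching decimal(10, 2) above; for arbitrary values it is safer to normalize the scale first:

    // Normalize the scale to the schema's decimal(10, 2) before writing
    // (requires java.math.RoundingMode)
    BigDecimal raw = BigDecimal.valueOf(18.6); // scale 1, would not match
    writer.write(raw.setScale(2, RoundingMode.HALF_UP), EncoderFactory.get().directBinaryEncoder(baos, null));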
11. A combined example: serializing and deserializing the complex types record, array, and map
  @Test
  public void serdesRecordType() throws IOException {
    SchemaBuilder.TypeBuilder<Schema> builder = SchemaBuilder.builder();
    SchemaBuilder.FieldAssembler<Schema> fieldAssembler = builder.record("Record").namespace("com.bugboy")
            .fields();

    Schema f1Schema = builder.bytesType();
    LogicalTypes.decimal(10, 2).addToSchema(f1Schema);

    Schema f2Schema = builder.intType();
    LogicalTypes.date().addToSchema(f2Schema);

    Schema f3Schema = builder.longType();
    LogicalTypes.timestampMillis().addToSchema(f3Schema);

    Schema subRecordSchema = builder.record("SubRecord")
            .namespace("com.bugboy")
            .fields()
            .name("f1")
            .type(f1Schema)
            .noDefault()
            .name("f2")
            .type(f2Schema)
            .noDefault()
            .name("f3")
            .type(f3Schema)
            .noDefault()
            .endRecord();

    Schema itemsSchema = builder.array().items(f1Schema);

    Schema eleSchema = builder.intType();
    LogicalTypes.date().addToSchema(eleSchema);
    Schema valueSchema = builder.map().values(eleSchema);
    
    // Assemble the full record schema
    Schema recordSchema = fieldAssembler.name("id")
            .type(builder.stringType())
            .noDefault()
            .name("subRecord")
            .type(subRecordSchema)
            .noDefault()
            .name("array")
            .type(itemsSchema)
            .noDefault()
            .name("map")
            .type(valueSchema)
            .noDefault()
            .endRecord();

    // Build the record
    GenericData.Record record = new GenericData.Record(recordSchema);
    record.put("id", "0001");

    GenericData.Record subRecord = new GenericData.Record(subRecordSchema);
    subRecord.put("f1", BigDecimal.valueOf(6867.68));
    subRecord.put("f2", Date.valueOf("2021-02-01"));
    subRecord.put("f3", Timestamp.valueOf("2021-02-01 10:47:56"));
    record.put("subRecord", subRecord);

    GenericData.Array array = new GenericData.Array(itemsSchema,
            Lists.newArrayList(BigDecimal.valueOf(6573.89),
                    BigDecimal.valueOf(2347.56), BigDecimal.valueOf(6543.12)));
    record.put("array", array);
    Map<String, Date> map = new HashMap<>();
    record.put("map", map);
    map.put("today", Date.valueOf("2021-02-01"));
    map.put("tomorrow", Date.valueOf("2021-02-02"));
    map.put("yesterday", Date.valueOf("2021-02-02"));
    SpecificDatumWriter<GenericData.Record> writer = new SpecificDatumWriter<>();
    SpecificData specificData = writer.getSpecificData();
    // Register the conversions
    specificData.addLogicalTypeConversion(new SqlDateConversion());
    specificData.addLogicalTypeConversion(new SqlTimestampConversion());
    specificData.addLogicalTypeConversion(new Conversions.DecimalConversion());
    writer.setSchema(recordSchema); // don't forget to set the schema here
    
    // Serialize the record
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    BinaryEncoder encoder = EncoderFactory.get().directBinaryEncoder(baos, null);
    writer.write(record, encoder);
    byte[] bytes = baos.toByteArray();
    
    // Deserialize back into a record
    ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
    BinaryDecoder decoder = DecoderFactory.get().directBinaryDecoder(bais, null);
    SpecificDatumReader<Object> reader = new SpecificDatumReader<>();
    reader.setSchema(recordSchema);
    Object read = reader.read(null, decoder);
    // Print the result for inspection
    System.out.println(read);
  }

Finally

Apache Avro also supports types not shown above, such as enum and fixed. I haven't needed them in my own work, so I left them out (grin). If you do need them, feel free to explore on your own, or reach out and we can work through them together.
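To give you a head start, here is a minimal, untested sketch of what an enum round trip might look like, following the same pattern as above (the "Suit" name and its symbols are made up for illustration; it additionally needs org.apache.avro.generic.GenericDatumWriter and GenericDatumReader imports):

  @Test
  public void serdesEnumType() throws IOException {
    // Hypothetical enum schema; the name and symbols are illustrative only
    Schema enumSchema = SchemaBuilder.builder().enumeration("Suit")
        .symbols("SPADES", "HEARTS", "DIAMONDS", "CLUBS");
    DatumWriter<Object> writer = new GenericDatumWriter<>(enumSchema);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    writer.write(new GenericData.EnumSymbol(enumSchema, "HEARTS"),
        EncoderFactory.get().directBinaryEncoder(baos, null));
    byte[] bytes = baos.toByteArray();

    DatumReader<Object> reader = new GenericDatumReader<>(enumSchema);
    ByteArrayInputStream bais = new ByteArrayInputStream(bytes);
    Object deResult = reader.read(null, DecoderFactory.get().directBinaryDecoder(bais, null));
    Assert.assertEquals("HEARTS", deResult.toString());
  }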
Lastly, Chinese New Year is just around the corner; wishing everyone a bullish, prosperous Year of the Ox. biu biu biu ~ ~ ~
