Serialization
Serialization is the process of turning structured objects into a byte stream so they can be transmitted over a network or written to persistent storage. Deserialization is the reverse process: turning a byte stream back into a series of structured objects.
Desirable properties of an RPC serialization format
1. Compact: makes the best use of network bandwidth and storage space.
2. Fast: serialization and deserialization impose as little overhead as possible.
3. Extensible: the protocol can evolve to meet new requirements.
4. Interoperable: clients and servers are not tied to a particular implementation language.
Hadoop uses Writables, which satisfy the compact and fast requirements (the quick sketch below illustrates the compactness), but not extensibility or interoperability.
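As a rough illustration of the compactness claim, here is a minimal sketch (my own example, not from Hadoop itself; exact byte counts depend on the JDK version) comparing an IntWritable with standard Java serialization of the same value. The Writable encoding is just the raw 4 bytes of the int, while ObjectOutputStream also writes stream and class metadata.

package com.bigdata.io;

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;

import org.apache.hadoop.io.IntWritable;

public class CompactnessDemo {
    public static void main(String[] args) throws IOException {
        // Writable encoding: exactly the 4 bytes of the int.
        ByteArrayOutputStream writableOut = new ByteArrayOutputStream();
        DataOutputStream dataOut = new DataOutputStream(writableOut);
        new IntWritable(163).write(dataOut);
        dataOut.close();

        // Standard Java serialization of the same value carries stream and class metadata as well.
        ByteArrayOutputStream javaOut = new ByteArrayOutputStream();
        ObjectOutputStream objOut = new ObjectOutputStream(javaOut);
        objOut.writeObject(Integer.valueOf(163));
        objOut.close();

        System.out.println("Writable: " + writableOut.size() + " bytes");
        System.out.println("Java serialization: " + javaOut.size() + " bytes");
    }
}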
The Writable interface
package org.apache.hadoop.io;

import java.io.DataOutput;
import java.io.DataInput;
import java.io.IOException;

public interface Writable {
    void write(DataOutput out) throws IOException;
    void readFields(DataInput in) throws IOException;
}
package com.bigdata.io;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Writable;

public final class WritableHelper {

    // Serialize a Writable into a byte array.
    public static byte[] serialize(Writable writable) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        DataOutputStream dataOut = new DataOutputStream(out);
        writable.write(dataOut);
        dataOut.close();
        return out.toByteArray();
    }

    // Populate a Writable from a byte array and return the bytes unchanged.
    public static byte[] deserialize(Writable writable, byte[] bytes) throws IOException {
        ByteArrayInputStream in = new ByteArrayInputStream(bytes);
        DataInputStream dataIn = new DataInputStream(in);
        writable.readFields(dataIn);
        dataIn.close();
        return bytes;
    }

    public static void main(String[] args) throws IOException {
        IntWritable writable = new IntWritable();
        writable.set(163);

        byte[] bytes = serialize(writable);
        System.out.println(bytes.length + "," + Bytes.toInt(bytes));

        deserialize(writable, bytes);
        System.out.println(bytes.length + "," + Bytes.toInt(bytes));
    }
}
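To make your own type serializable in this way, implement write() and readFields() yourself. Below is a minimal sketch of a hypothetical IntPairWritable (not a Hadoop class) that can be exercised with the WritableHelper above; it is suitable as a MapReduce value, while keys additionally need WritableComparable, covered next.

package com.bigdata.io;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

// Hypothetical example type: a pair of ints serialized as two 4-byte fields.
public class IntPairWritable implements Writable {
    private int first;
    private int second;

    public IntPairWritable() {}                       // no-arg constructor required for deserialization

    public IntPairWritable(int first, int second) {
        this.first = first;
        this.second = second;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(first);
        out.writeInt(second);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        first = in.readInt();
        second = in.readInt();
    }

    public int getFirst()  { return first; }
    public int getSecond() { return second; }
}

Serializing an IntPairWritable with WritableHelper.serialize() should produce exactly 8 bytes: the two ints back to back, with no extra metadata.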
WritableComparable and comparators
package org.apache.hadoop.io;

public interface WritableComparable<T> extends Writable, Comparable<T> {
}
package org.apache.hadoop.io;

import java.util.Comparator;

public interface RawComparator<T> extends Comparator<T> {
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2);
}
package com.bigdata.io;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparator;

public final class WritableHelper {

    public static byte[] serialize(Writable writable) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        DataOutputStream dataOut = new DataOutputStream(out);
        writable.write(dataOut);
        dataOut.close();
        return out.toByteArray();
    }

    public static byte[] deserialize(Writable writable, byte[] bytes) throws IOException {
        ByteArrayInputStream in = new ByteArrayInputStream(bytes);
        DataInputStream dataIn = new DataInputStream(in);
        writable.readFields(dataIn);
        dataIn.close();
        return bytes;
    }

    public static void main(String[] args) throws IOException {
        IntWritable writable = new IntWritable();
        writable.set(163);

        byte[] bytes = serialize(writable);
        System.out.println(bytes.length + "," + Bytes.toInt(bytes));

        deserialize(writable, bytes);
        System.out.println(bytes.length + "," + Bytes.toInt(bytes));

        // Compare two IntWritables, first as objects, then directly on their serialized bytes.
        RawComparator<IntWritable> comparator = WritableComparator.get(IntWritable.class);
        IntWritable w1 = new IntWritable(163);
        IntWritable w2 = new IntWritable(67);
        int result = comparator.compare(w1, w2);
        System.out.println(result);

        byte[] b1 = serialize(w1);
        byte[] b2 = serialize(w2);
        result = comparator.compare(b1, 0, b1.length, b2, 0, b2.length);
        System.out.println(result);
    }
}
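WritableComparator.get() returns a registered, optimized comparator when one exists (as it does for IntWritable); otherwise it falls back to deserializing both objects and calling compareTo(). For a custom key type you can supply your own byte-level comparator. The following is my own minimal sketch, not Hadoop's built-in implementation, showing the pattern for a 4-byte int key:

package com.bigdata.io;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.WritableComparator;

// Hypothetical raw comparator that orders serialized IntWritables without deserializing them.
public class IntRawComparator extends WritableComparator {

    public IntRawComparator() {
        super(IntWritable.class);
    }

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        // readInt() is a static helper on WritableComparator that decodes 4 big-endian bytes.
        int i1 = readInt(b1, s1);
        int i2 = readInt(b2, s2);
        return (i1 < i2) ? -1 : (i1 == i2 ? 0 : 1);
    }
}

An instance of this class can be used exactly like the comparator obtained from WritableComparator.get() in the byte-level compare() call above.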
Java primitive | Writable implementation | Serialized size (bytes) |
boolean | BooleanWritable | 1 |
byte | ByteWritable | 1 |
short | ShortWritable | 2 |
int | IntWritable | 4 |
int | VIntWritable | 1-5 |
float | FloatWritable | 4 |
long | LongWritable | 8 |
long | VLongWritable | 1-9 |
double | DoubleWritable | 8 |
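The variable-length types trade a fixed size for space savings on small values: in Hadoop's VInt encoding, values between -112 and 127 fit in a single byte, and larger values take an extra marker byte plus however many data bytes are needed. A quick sketch (my own, reusing the WritableHelper.serialize() method from above):

package com.bigdata.io;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.VIntWritable;

public class VIntSizeDemo {
    public static void main(String[] args) throws IOException {
        // Fixed-width encoding: always 4 bytes.
        System.out.println(WritableHelper.serialize(new IntWritable(163)).length);   // 4

        // Variable-length encoding: 1 byte for small values, up to 5 bytes for large ones.
        System.out.println(WritableHelper.serialize(new VIntWritable(1)).length);    // 1
        System.out.println(WritableHelper.serialize(new VIntWritable(163)).length);  // 2 (marker byte + data byte)
    }
}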
Type | Description | Schema |
null | The absence of a value | "null" |
boolean | A binary value | "boolean" |
int | 32-bit signed integer | "int" |
long | 64-bit signed integer | "long" |
float | Single precision(32-bit) IEEE 754 floating-point number | "float" |
double | Double precision(64-bit) IEEE 754 floating-point number | "double" |
bytes | Sequence of 8-bit unsigned bytes | "bytes" |
string | Sequence of Unicode characters | "string" |
Type | Description | Schema example |
array | An ordered collection of objects. All objects in a particular array must have the same schema. | { "type":"array", "items":"long" } |
map | An unordered collection of key-value pairs. Keys must be strings, values may be of any type, although within a particular map all values must have the same schema. | { "type":"map", "values":"string" } |
record | A collection of named fields of any type. | { "type":"record", "name":"WeatherRecord", "doc":"A weather reading.", "fields":[ {"name":"year","type":"int"}, {"name":"temperature","type":"int"}, {"name":"stationId","type":"string"} ] } |
enum | A set of named values. | { "type":"enum", "name":"Cutlery", "doc":"An eating utensil.", "symbols":["KNIFE","FORK","SPOON"] } |
fixed | A fixed number of 8-bit unsigned bytes. | { "type":"fixed", "name":"Md5Hash", "size":16 } |
union | A union of schemas. A union is represented by a JSON array, where each element in the array is a schema. Data represented by a union must match one of the schemas in the union. | [ "null", "string", {"type":"map","values":"string"} ] |
package com.bigdata.io.avro;

import java.io.ByteArrayOutputStream;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.io.Decoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.io.Encoder;
import org.apache.avro.io.EncoderFactory;

public class StringPair {

    public static void main(String[] args) throws IOException {
        Schema.Parser parser = new Schema.Parser();
        Schema schema = parser.parse(StringPair.class.getResourceAsStream("/StringPair.avsc"));

        // Create an instance of an Avro record using the generic API.
        GenericRecord datum = new GenericData.Record(schema);
        datum.put("left", "L");
        datum.put("right", "R");

        // Serialize the record to an output stream.
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(schema);
        Encoder encoder = EncoderFactory.get().binaryEncoder(out, null);
        writer.write(datum, encoder);
        encoder.flush();
        out.close();

        // Read the record back from the serialized bytes.
        DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(schema);
        Decoder decoder = DecoderFactory.get().binaryDecoder(out.toByteArray(), null);
        GenericRecord result = reader.read(null, decoder);
        String r1 = result.get("left").toString();
        String r2 = result.get("right").toString();
        System.out.println(r1 + "," + r2);
    }
}
{
    "type": "record",
    "name": "StringPair",
    "doc": "A pair of strings.",
    "fields": [
        {"name": "left", "type": "string"},
        {"name": "right", "type": "string"}
    ]
}
package com.bigdata.io.avro;

import java.io.File;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;

public class AvroWriteToFile {

    public static void main(String[] args) throws IOException {
        Schema.Parser parser = new Schema.Parser();
        Schema schema = parser.parse(AvroWriteToFile.class.getResourceAsStream("/StringPair.avsc"));

        GenericRecord datum = new GenericData.Record(schema);
        datum.put("left", "L");
        datum.put("right", "R");

        // Write two records to an Avro data file; the schema is stored in the file's metadata.
        File file = new File("data.avro");
        DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(schema);
        DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(writer);
        dataFileWriter.create(schema, file);
        dataFileWriter.append(datum);
        datum.put("left", "is left");
        datum.put("right", "is right");
        dataFileWriter.append(datum);
        dataFileWriter.close();

        // Read the records back with a for-each loop, then seek with sync(0) and iterate again.
        DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(schema);
        DataFileReader<GenericRecord> fileReader = new DataFileReader<GenericRecord>(file, reader);
        GenericRecord result = null;
        for (GenericRecord record : fileReader) {
            System.out.println(record.get("left") + "," + record.get("right"));
        }

        fileReader.sync(0);
        System.out.println(fileReader.getBlockCount());
        while (fileReader.hasNext()) {
            result = fileReader.next();
            System.out.println(result.get("left") + "," + result.get("right"));
        }
        fileReader.close();
    }
}
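An existing Avro data file can also be reopened for appending rather than recreated. The following is a minimal sketch of my own, assuming data.avro was already written with the same schema and that the Avro version in use provides DataFileWriter.appendTo():

package com.bigdata.io.avro;

import java.io.File;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumWriter;

public class AvroAppendToFile {
    public static void main(String[] args) throws IOException {
        Schema schema = new Schema.Parser()
                .parse(AvroAppendToFile.class.getResourceAsStream("/StringPair.avsc"));

        GenericRecord datum = new GenericData.Record(schema);
        datum.put("left", "appended L");
        datum.put("right", "appended R");

        DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(schema);
        DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(writer);
        // Reopen the existing file for appending instead of calling create().
        dataFileWriter.appendTo(new File("data.avro"));
        dataFileWriter.append(datum);
        dataFileWriter.close();
    }
}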
When adding a description field to StringPair.avsc, be sure to give it a default value. That way data written with the original schema can still be read with this new schema, and data written with the new schema works as well.
{
    "type": "record",
    "name": "StringPair",
    "doc": "A pair of strings.",
    "fields": [
        {"name": "left", "type": "string"},
        {"name": "right", "type": "string"},
        {"name": "description", "type": ["null", "string"], "default": null}
    ]
}
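To see this resolution in action, here is a small sketch of my own, following the generic API used above. It deserializes bytes that were written with the original two-field schema by passing GenericDatumReader both the writer's (old) schema and the new schema as the reader's schema; the missing description field is then filled in from its default:

package com.bigdata.io.avro;

import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.Decoder;
import org.apache.avro.io.DecoderFactory;

public class SchemaResolutionDemo {

    // bytes: a record serialized with the original two-field StringPair schema
    public static GenericRecord readWithNewSchema(byte[] bytes, Schema oldSchema, Schema newSchema)
            throws IOException {
        // First argument is the writer's schema, second is the reader's schema;
        // Avro resolves the difference and fills missing fields from their defaults.
        DatumReader<GenericRecord> reader =
                new GenericDatumReader<GenericRecord>(oldSchema, newSchema);
        Decoder decoder = DecoderFactory.get().binaryDecoder(bytes, null);
        GenericRecord result = reader.read(null, decoder);
        System.out.println(result.get("description"));  // prints null, the declared default
        return result;
    }
}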