RCFile分为两部分:Header和Record
version:由4个字节组成,前3个字节为‘R’,‘C’,‘F’(之前也直接沿用过SequenceFile的‘S’,‘E’,‘Q’)。第4个字节为版本号。这部分主要表明这是一个RCFile。
compression:一个boolean,表明数据是否被压缩。
compression codec:若compression的值为true,此字段表明了编解码器。若compression为false,则没有此字段。
metadata:元数据信息,在内存中以TreeMap存放信息
sync:表明Header结束,由唯一的UID+@+当前时间的毫秒形式组成。
Record是存放实际数据的部分,其分为两类数据:Key part和Value part。key和value是配对出现的,Record由多个key part和Value part组成。
Key part:
Record length in bytes:key part和value part所占用的总字节
Key length in bytes:key part占用的总字节
Number_of_rows_in_this_record(vint):行数
Column_1_ondisk_length(vint):第1列所占用的总字节数 Column_1_row_1_value_plain_length:第1列第1行(即二维坐标所确定的一个元素)占用的字节数
Column_1_row_2_value_plain_length:第1列第2行占用的字节
...
Column_2_ondisk_length(vint):第2列所占用的总字节数 Column_2_row_1_value_plain_length:第2列第1行(即二维坐标所确定的一个元素)占用的字节数
Column_2_row_2_value_plain_length:第2列第2行占用的字节数
...
Value part:
value part存放的即是实际的数据。直接上代码。
import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hive.ql.io.RCFile; import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable; import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable; import org.apache.hadoop.io.LongWritable; /** * * @author patrick * */ public class RCFileDemo { public static void main(String[] args) throws IOException { conf = new Configuration(); Path src = new Path("/rcfile"); createRcFile(src, conf); readRcFile(src, conf); } private static Configuration conf; private static final String TAB = "\t"; private static String strings[] = { "1,true,123.123,2012-10-24 08:55:00", "2,false,1243.5,2012-10-25 13:40:00", "3,false,24453.325,2008-08-22 09:33:21.123", "4,false,243423.325,2007-05-12 22:32:21.33454", "5,true,243.325,1953-04-22 09:11:33 " }; /** * 生成一个RCF file * * @param src * @param conf * @throws IOException */ private static void createRcFile(Path src, Configuration conf) throws IOException { conf.setInt(RCFile.COLUMN_NUMBER_CONF_STR, 4);// 列数 conf.setInt(RCFile.Writer.COLUMNS_BUFFER_SIZE_CONF_STR, 4 * 1024 * 1024);// 决定行数参数一 conf.setInt(RCFile.RECORD_INTERVAL_CONF_STR, 3);// 决定行数参数二 FileSystem fs = FileSystem.get(conf); RCFile.Writer writer = new RCFile.Writer(fs, conf, src); BytesRefArrayWritable cols = new BytesRefArrayWritable(4);// 列数,可以动态获取 BytesRefWritable col = null; for (String s : strings) { String splits[] = s.split(","); int count = 0; for (String split : splits) { col = new BytesRefWritable(Bytes.toBytes(split), 0, Bytes.toBytes(split).length); cols.set(count, col); count++; } writer.append(cols); } writer.close(); fs.close(); } /** * 读取解析一个RCF file * * @param src * @param conf * @throws IOException */ private static void readRcFile(Path src, Configuration conf) throws IOException { // 
需要获取的列,必须指定,具体看ColumnProjectionUtils中的设置方法 ColumnProjectionUtils.setFullyReadColumns(conf); FileSystem fs = FileSystem.get(conf); RCFile.Reader reader = new RCFile.Reader(fs, src, conf); // readerByRow(reader); readerByCol(reader); reader.close(); } protected static void readerByRow(RCFile.Reader reader) throws IOException { // 已经读取的行数 LongWritable rowID = new LongWritable(); // 一个行组的数据 BytesRefArrayWritable cols = new BytesRefArrayWritable(); while (reader.next(rowID)) { reader.getCurrentRow(cols); // 包含一列的数据 BytesRefWritable brw = null; StringBuilder sb = new StringBuilder(); for (int i = 0; i < cols.size(); i++) { brw = cols.get(i); // 根据start 和 length 获取指定行-列数据 sb.append(Bytes.toString(brw.getData(), brw.getStart(), brw.getLength())); if (i < cols.size() - 1) { sb.append(TAB); } } System.out.println(sb.toString()); } } protected static void readerByCol(RCFile.Reader reader) throws IOException { // 一个行组的数据 BytesRefArrayWritable cols = new BytesRefArrayWritable(); while (reader.nextBlock()) { for (int count = 0; count < 4; count++) { cols = reader.getColumn(count, cols); BytesRefWritable brw = null; StringBuilder sb = new StringBuilder(); for (int i = 0; i < cols.size(); i++) { brw = cols.get(i); // 根据start 和 length 获取指定行-列数据 sb.append(Bytes.toString(brw.getData(), brw.getStart(), brw.getLength())); if (i < cols.size() - 1) { sb.append(TAB); } } System.out.println(sb.toString()); } } } }