ORC file layout:
MAGIC
stripe1 {
data
index
footer
},
stripe2 {
data
index
footer
},
...
metadata
footer
PostScript + size(PostScript)
Call relationships among the main classes involved
An ORC file is written by calling the static method OrcFile.createWriter(), which applies the writer options and creates a WriterImpl object:
Writer writer = OrcFile.createWriter()
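For context, a minimal sketch of the full call against the Hive 0.13-era org.apache.hadoop.hive.ql.io.orc API (MyRow, the path, and the option values are illustrative, not part of the source above):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.CompressionKind;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.Writer;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;

public class OrcWriteExample {
  // hypothetical row type; any class works with the reflection inspector
  static class MyRow {
    String name;
    MyRow(String name) { this.name = name; }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector(
        MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    Writer writer = OrcFile.createWriter(new Path("/tmp/example.orc"),
        OrcFile.writerOptions(conf)
            .inspector(inspector)           // drives createTreeWriter()
            .stripeSize(64L * 1024 * 1024)  // flushStripe() target size
            .compress(CompressionKind.ZLIB)
            .rowIndexStride(10000));        // rows per ROW_INDEX entry
    writer.addRow(new MyRow("01"));
    writer.addRow(new MyRow("234"));
    writer.close(); // triggers the final flushStripe() + footer/postscript
  }
}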
The dynamic array is a central piece of the ORC read/write path. It is designed for memory optimization: data lives in fixed-size chunks, so the array can grow without reallocating and copying one ever-larger buffer. DynamicIntArray serves as the example below.
DEFAULT_CHUNKSIZE = 8 * 1024
data = new int[128][];
data[0] : 0 - 8191
data[1] : 8192 - 16383
...
data[n] : n*8192 - ((n+1)*8192 - 1)
/**
 * Ensure that the given chunk index is valid (allocated).
 */
private void grow(int chunkIndex) {
  if (chunkIndex >= initializedChunks) {
    if (chunkIndex >= data.length) {
      // double the chunk directory (or grow straight to chunkIndex + 1)
      int newSize = Math.max(chunkIndex + 1, 2 * data.length);
      int[][] newChunk = new int[newSize][];
      System.arraycopy(data, 0, newChunk, 0, data.length);
      data = newChunk;
    }
    // allocate every chunk up to and including chunkIndex
    for (int i = initializedChunks; i <= chunkIndex; ++i) {
      data[i] = new int[chunkSize];
    }
    initializedChunks = chunkIndex + 1;
  }
}
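To make the chunk addressing concrete, here is a minimal self-contained sketch (ChunkedIntArray is a hypothetical name; Hive's DynamicIntArray adds bounds checks, clear(), and size bookkeeping):

class ChunkedIntArray {
  private final int chunkSize = 8 * 1024;
  private int[][] data = new int[128][];
  private int initializedChunks = 0;
  private int length = 0;

  void add(int value) {
    int chunk = length / chunkSize;   // which 8K chunk the next slot falls in
    int offset = length % chunkSize;  // position inside that chunk
    grow(chunk);                      // same logic as grow() above
    data[chunk][offset] = value;
    length += 1;
  }

  int get(int index) {
    // element i lives at data[i / chunkSize][i % chunkSize]
    return data[index / chunkSize][index % chunkSize];
  }

  private void grow(int chunkIndex) {
    if (chunkIndex >= initializedChunks) {
      if (chunkIndex >= data.length) {
        int[][] bigger = new int[Math.max(chunkIndex + 1, 2 * data.length)][];
        System.arraycopy(data, 0, bigger, 0, data.length);
        data = bigger;
      }
      for (int i = initializedChunks; i <= chunkIndex; ++i) {
        data[i] = new int[chunkSize];
      }
      initializedChunks = chunkIndex + 1;
    }
  }
}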
The addRow() method below is the entry point for everything that follows:
@Override
public void addRow(Object row) throws IOException {
synchronized (this) {
// updates indexStatistics plus the NULL/bit-level statistics along the way
treeWriter.write(row);
rowsInStripe += 1;
if (buildIndex) {
rowsInIndex += 1;
// a new index entry is due every rowIndexStride rows
if (rowsInIndex >= rowIndexStride) {
createRowIndexEntry();
}
}
}
memoryManager.addedRow();
}
// StringTreeWriter
@Override
void write(Object obj) throws IOException {
super.write(obj); // the parent TreeWriter is abstract and only does shared bookkeeping such as indexStatistics updates
if (obj != null) {
Text val = getTextValue(obj);
if (useDictionaryEncoding || !strideDictionaryCheck) {
rows.add(dictionary.add(val));
} else {
// write data and length
directStreamOutput.write(val.getBytes(), 0, val.getLength());
directLengthOutput.write(val.getLength());
}
indexStatistics.updateString(val);
}
}
rows.add(dictionary.add(val));
For example, suppose we now write two strings, "01" and "234" (an object is ultimately stored as the UTF-8 encoded bytes of its Writable form, so digit strings keep the example concrete); a trace follows the addNewKey() listing below.
// StringRedBlackTree
private int addNewKey() {
// if the newKey is actually new, add it to our byteArray and store the offset & length
if (add()) {
int len = newKey.getLength();
keyOffsets.add(byteArray.add(newKey.getBytes(), 0, len));
}
return lastAdd;
}
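A hypothetical trace of the "01" / "234" example above, assuming an initially empty StringRedBlackTree (the offsets follow from byteArray.add() returning the append position):

dictionary.add(val) with val = "01"   returns 0   // first distinct key
  byteArray  : ['0', '1']                         // UTF-8 bytes appended
  keyOffsets : [0]                                // key 0 starts at offset 0
dictionary.add(val) with val = "234"  returns 1
  byteArray  : ['0', '1', '2', '3', '4']
  keyOffsets : [0, 2]                             // key 1 starts at offset 2

Re-adding "01" would find the existing node and return 0 again; a key's length is recovered from the gap to the next offset (or to the end of byteArray).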
Take the streams created by StringTreeWriter as an example:
private final StreamFactory streamFactory = new StreamFactory(); // the single, writer-wide StreamFactory
treeWriter = createTreeWriter(inspector, streamFactory, false);
new StringTreeWriter(streamFactory.getNextColumnId(), inspector, streamFactory, nullable); // the TreeWriter subclass is chosen from inspector.getCategory()
super(columnId, inspector, writer, nullable); // invokes the parent TreeWriter constructor
rowIndexStream = streamFactory.createStream(id, OrcProto.Stream.Kind.ROW_INDEX); // HIVE_ORC_DEFAULT_ROW_INDEX_STRIDE ("hive.exec.orc.default.row.index.stride", 10000) makes the default stride > 0 (the stride is the number of rows an index entry represents), so a (id=0, ROW_INDEX) OutStream is created
The other three streams are created in the StringTreeWriter constructor.
createStream() builds an OutStream keyed by column and kind and records it in the streams map; flushStripe() later processes every recorded stream. A paraphrased sketch of that method:
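// A sketch of StreamFactory.createStream's shape (paraphrased; newer Hive
// versions also pass codec modifiers into the BufferedStream):
public OutStream createStream(int column, OrcProto.Stream.Kind kind)
    throws IOException {
  StreamName name = new StreamName(column, kind);  // (column id, kind) key
  BufferedStream result = streams.get(name);
  if (result == null) {
    result = new BufferedStream(name.toString(), bufferSize, codec);
    streams.put(name, result);   // flushStripe() iterates this map later
  }
  return result.outStream;       // the TreeWriter writes to this
}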
A BufferedStream holds two objects: an OutStream outStream and a List<ByteBuffer> output. TreeWriters write data directly to outStream; the codec compresses the data into buffers that are stored in output.
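A paraphrased sketch of BufferedStream's shape (field and method names per the description above; the real inner class also has flush/suppress/clear logic):

private class BufferedStream implements OutStream.OutputReceiver {
  private final OutStream outStream;                     // TreeWriters write here
  private final List<ByteBuffer> output = new ArrayList<ByteBuffer>();

  BufferedStream(String name, int bufferSize,
                 CompressionCodec codec) throws IOException {
    outStream = new OutStream(name, bufferSize, codec, this);
  }

  @Override
  public void output(ByteBuffer buffer) {
    output.add(buffer);   // the codec hands back compressed buffers
  }

  // flushStripe() calls this to copy the saved buffers into the file
  void spillTo(OutputStream out) throws IOException {
    for (ByteBuffer buffer : output) {
      out.write(buffer.array(), buffer.arrayOffset() + buffer.position(),
                buffer.remaining());
    }
  }
}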
How the streams inside StringTreeWriter reference each other:
Every TreeWriter has a rowIndexStream.
LENGTH streams are wrapped with createIntegerWriter.
DATA streams are either wrapped with createIntegerWriter or written as a direct stream.
Stream kinds (OrcProto.Stream.Kind) a column may create:
StringTreeWriter : void createRowIndexEntry() | void writeStripe() — both paths lead into flushDictionary() below:
private void flushDictionary() {
....
final int[] dumpOrder = new int[dictionary.size()];
if (useDictionaryEncoding) {
// Write the dictionary by traversing the red-black tree writing out
// the bytes and lengths; and creating the map from the original order
// to the final sorted order.
// visit every node of the red-black tree, writing its data into the BufferedStreams
dictionary.visit(new StringRedBlackTree.Visitor() {
private int currentId = 0;
@Override
public void visit(StringRedBlackTree.VisitorContext context
) throws IOException {
// write the current node's bytes to stringOutput (DICTIONARY_DATA)
context.writeBytes(stringOutput);
// write the current node's length to lengthOutput (LENGTH)
lengthOutput.write(context.getLength());
// map the node's original insertion position to its rank in sorted order
dumpOrder[context.getOriginalPosition()] = currentId++;
}
});
} else {
// for direct encoding, we don't want the dictionary data stream
stringOutput.suppress();
}
int length = rows.size();
int rowIndexEntry = 0;
OrcProto.RowIndex.Builder rowIndex = getRowIndex();
Text text = new Text();
// write the values translated into the dump order.
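// note: i runs to length inclusive, so the savedRowIndex entries that
// point at the very end of the data are still flushed below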
for(int i = 0; i <= length; ++i) {
// now that we are writing out the row values, we can finalize the
// row index
if (buildIndex) {
// for each pending row-index entry, wrap its builder as a PositionRecorder and record the current stream positions into it
while (i == rowIndexValueCount.get(rowIndexEntry) &&
rowIndexEntry < savedRowIndex.size()) {
OrcProto.RowIndexEntry.Builder base =
savedRowIndex.get(rowIndexEntry++).toBuilder();
if (useDictionaryEncoding) {
rowOutput.getPosition(new RowIndexPositionRecorder(base));
} else {
PositionRecorder posn = new RowIndexPositionRecorder(base);
directStreamOutput.getPosition(posn);
directLengthOutput.getPosition(posn);
}
// base.build() creates a RowIndexEntry instance, which is appended to rowIndex
rowIndex.addEntry(base.build());
}
}
if (i != length) {
if (useDictionaryEncoding) {
// with dictionary encoding the dictionary itself has already been written out, so only the value's sorted-order index (dumpOrder) goes into the DATA stream
rowOutput.write(dumpOrder[rows.get(i)]);
} else {
// without a dictionary, write the text bytes to directStreamOutput and the length to directLengthOutput
dictionary.getText(text, rows.get(i));
directStreamOutput.write(text.getBytes(), 0, text.getLength());
directLengthOutput.write(text.getLength());
}
}
}
rows.clear();
}
After the data has been flushed, recordDirectStreamPosition() records the current positions of the direct streams.
Back at the addRow() entry point: every 5,000 rows the MemoryManager checks whether estimated memory use exceeds the limit; when it does, it calls checkMemory(), which flushes the stripe:
@Override
public synchronized boolean checkMemory(double newScale) throws IOException {
long limit = (long) Math.round(adjustedStripeSize * newScale);
long size = estimateStripeSize();
...
if (size > limit) {
flushStripe();
return true;
}
return false;
}
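The 5,000-row cadence comes from MemoryManager.addedRow(); a paraphrased sketch (constant name per Hive's MemoryManager; the real class also rescales the pool as writers register and unregister):

private static final int ROWS_BETWEEN_CHECKS = 5000;
private int rowsAddedSinceCheck = 0;

void addedRow() throws IOException {
  if (++rowsAddedSinceCheck >= ROWS_BETWEEN_CHECKS) {
    notifyWriters();          // invokes writer.checkMemory(currentScale)
    rowsAddedSinceCheck = 0;
  }
}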
The physical output stream is created lazily: flushStripe(), invoked either when checkMemory() trips or when the file is close()d, first calls getStream():
@VisibleForTesting
FSDataOutputStream getStream() throws IOException {
if (rawWriter == null) {
rawWriter = fs.create(path, false, HDFS_BUFFER_SIZE,
fs.getDefaultReplication(), blockSize);
rawWriter.writeBytes(OrcFile.MAGIC);
headerLength = rawWriter.getPos();
writer = new OutStream("metadata", bufferSize, codec,
new DirectStream(rawWriter));
protobufWriter = CodedOutputStream.newInstance(writer);
}
return rawWriter;
}
private void flushStripe() throws IOException {
getStream();
if (buildIndex && rowsInIndex != 0) {
createRowIndexEntry();
}
if (rowsInStripe != 0) {
if (callback != null) {
callback.preStripeWrite(callbackContext);
}
// finalize the data for the stripe
int requiredIndexEntries = rowIndexStride == 0 ? 0 :
(int) ((rowsInStripe + rowIndexStride - 1) / rowIndexStride);
OrcProto.StripeFooter.Builder builder =
OrcProto.StripeFooter.newBuilder();
treeWriter.writeStripe(builder, requiredIndexEntries);
long indexSize = 0;
long dataSize = 0;
for(Map.Entry<StreamName, BufferedStream> pair: streams.entrySet()) {
BufferedStream stream = pair.getValue();
if (!stream.isSuppressed()) {
stream.flush();
StreamName name = pair.getKey();
long streamSize = pair.getValue().getOutputSize();
builder.addStreams(OrcProto.Stream.newBuilder()
.setColumn(name.getColumn())
.setKind(name.getKind())
.setLength(streamSize));
if (StreamName.Area.INDEX == name.getArea()) {
indexSize += streamSize;
} else {
dataSize += streamSize;
}
}
}
OrcProto.StripeFooter footer = builder.build();
// Do we need to pad the file so the stripe doesn't straddle a block
// boundary?
long start = rawWriter.getPos();
final long currentStripeSize = indexSize + dataSize + footer.getSerializedSize();
final long available = blockSize - (start % blockSize);
final long overflow = currentStripeSize - adjustedStripeSize;
final float availRatio = (float) available / (float) defaultStripeSize;
if (availRatio > 0.0f && availRatio < 1.0f
&& availRatio > paddingTolerance) {
// adjust default stripe size to fit into remaining space, also adjust
// the next stripe for correction based on the current stripe size
// and user specified padding tolerance. Since stripe size can overflow
// the default stripe size we should apply this correction to avoid
// writing portion of last stripe to next hdfs block.
float correction = overflow > 0 ? (float) overflow
/ (float) adjustedStripeSize : 0.0f;
// correction should not be greater than user specified padding
// tolerance
correction = correction > paddingTolerance ? paddingTolerance
: correction;
// adjust next stripe size based on current stripe estimate correction
adjustedStripeSize = (long) ((1.0f - correction) * (availRatio * defaultStripeSize));
} else if (availRatio >= 1.0) {
adjustedStripeSize = defaultStripeSize;
}
if (availRatio < paddingTolerance && addBlockPadding) {
long padding = blockSize - (start % blockSize);
byte[] pad = new byte[(int) Math.min(HDFS_BUFFER_SIZE, padding)];
LOG.info(String.format("Padding ORC by %d bytes (<= %.2f * %d)",
padding, availRatio, defaultStripeSize));
start += padding;
while (padding > 0) {
int writeLen = (int) Math.min(padding, pad.length);
rawWriter.write(pad, 0, writeLen);
padding -= writeLen;
}
adjustedStripeSize = defaultStripeSize;
} else if (currentStripeSize < blockSize
&& (start % blockSize) + currentStripeSize > blockSize) {
// even if you don't pad, reset the default stripe size when crossing a
// block boundary
adjustedStripeSize = defaultStripeSize;
}
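// Worked example with illustrative numbers (paddingTolerance defaults to
// 0.05 via hive.exec.orc.block.padding.tolerance): blockSize = 256 MB,
// defaultStripeSize = 64 MB, and the stripe starts 250 MB into the block.
// Then available = 6 MB and availRatio = 6 / 64 ~= 0.094. Since
// 0 < 0.094 < 1 and 0.094 > 0.05, the branch above shrinks the next stripe
// to fit the remaining 6 MB; had availRatio fallen below 0.05 with
// addBlockPadding on, the padding branch would zero-fill to the boundary.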
// write out the data streams: spill each stream's saved compressed buffers
// (its List<ByteBuffer> output) to rawWriter
for(Map.Entry<StreamName, BufferedStream> pair: streams.entrySet()) {
BufferedStream stream = pair.getValue();
if (!stream.isSuppressed()) {
stream.spillTo(rawWriter);
}
stream.clear();
}
// write the stripe footer
footer.writeTo(protobufWriter);
protobufWriter.flush();
// protobufWriter wraps writer with its own buffer, so writer must be flushed as well
writer.flush();
long footerLength = rawWriter.getPos() - start - dataSize - indexSize;
OrcProto.StripeInformation dirEntry =
OrcProto.StripeInformation.newBuilder()
.setOffset(start)
.setNumberOfRows(rowsInStripe)
.setIndexLength(indexSize)
.setDataLength(dataSize)
.setFooterLength(footerLength).build();
// dirEntry carries the stripe's metadata
stripes.add(dirEntry);
rowCount += rowsInStripe;
rowsInStripe = 0;
}
}
@Override
public void close() throws IOException {
// remove us from the memory manager so that we don't get any callbacks
memoryManager.removeWriter(path);
// actually close the file
synchronized (this) {
flushStripe();
// write out all the stripe metadata collected in treeWriter.stripeStatsBuilders
int metadataLength = writeMetadata(rawWriter.getPos());
int footerLength = writeFooter(rawWriter.getPos() - metadataLength);
rawWriter.writeByte(writePostScript(footerLength, metadataLength));
rawWriter.close();
}
}
The RedBlackTree add() method
// RedBlackTree and StringRedBlackTree
// TODO: handling of the corresponding int-keyed red-black tree
/**
* Insert or find a given key in the tree and rebalance the tree correctly.
* Rebalancing restores the red-black aspect of the tree to maintain the
* invariants:
* 1. If a node is red, both of its children are black.
* 2. Each child of a node has the same black height (the number of black
* nodes between it and the leaves of the tree).
*
* Inserted nodes are at the leaves and are red, therefore there is at most a
* violation of rule 1 at the node we just put in. Instead of always keeping
* the parents, this routine passes the context down.
*
* The fix is broken down into 6 cases (1.{1,2,3} and 2.{1,2,3} that are
* left-right mirror images of each other). See Algorithms by Cormen,
* Leiserson, and Rivest for the explanation of the subcases.
*
* @param node The node that we are fixing right now.
* @param fromLeft Did we come down from the left?
* @param parent The node's parent
* @param grandparent Parent's parent
* @param greatGrandparent Grandparent's parent
* @return Does parent also need to be checked and/or fixed?
*/
private boolean add(int node, boolean fromLeft, int parent,
int grandparent, int greatGrandparent) {
if (node == NULL) {
if (root == NULL) {
lastAdd = insert(NULL, NULL, false);
root = lastAdd;
wasAdd = true;
return false;
} else {
lastAdd = insert(NULL, NULL, true);
node = lastAdd;
wasAdd = true;
// connect the new node into the tree
if (fromLeft) {
setLeft(parent, node);
} else {
setRight(parent, node);
}
}
} else {
int compare = compareValue(node);
boolean keepGoing;
// Recurse down to find where the node needs to be added
if (compare < 0) {
keepGoing = add(getLeft(node), true, node, parent, grandparent);
} else if (compare > 0) {
keepGoing = add(getRight(node), false, node, parent, grandparent);
} else {
lastAdd = node;
wasAdd = false;
return false;
}
// we don't need to fix the root (because it is always set to black)
if (node == root || !keepGoing) {
return false;
}
}
// Do we need to fix this node? Only if there are two reds right under each
// other.
if (isRed(node) && isRed(parent)) {
if (parent == getLeft(grandparent)) {
int uncle = getRight(grandparent);
if (isRed(uncle)) {
// case 1.1
setRed(parent, false);
setRed(uncle, false);
setRed(grandparent, true);
return true;
} else {
if (node == getRight(parent)) {
// case 1.2
// swap node and parent
int tmp = node;
node = parent;
parent = tmp;
// left-rotate on node
setLeft(grandparent, parent);
setRight(node, getLeft(parent));
setLeft(parent, node);
}
// case 1.2 and 1.3
setRed(parent, false);
setRed(grandparent, true);
// right-rotate on grandparent
if (greatGrandparent == NULL) {
root = parent;
} else if (getLeft(greatGrandparent) == grandparent) {
setLeft(greatGrandparent, parent);
} else {
setRight(greatGrandparent, parent);
}
setLeft(grandparent, getRight(parent));
setRight(parent, grandparent);
return false;
}
} else {
int uncle = getLeft(grandparent);
if (isRed(uncle)) {
// case 2.1
setRed(parent, false);
setRed(uncle, false);
setRed(grandparent, true);
return true;
} else {
if (node == getLeft(parent)) {
// case 2.2
// swap node and parent
int tmp = node;
node = parent;
parent = tmp;
// right-rotate on node
setRight(grandparent, parent);
setLeft(node, getRight(parent));
setRight(parent, node);
}
// case 2.2 and 2.3
setRed(parent, false);
setRed(grandparent, true);
// left-rotate on grandparent
if (greatGrandparent == NULL) {
root = parent;
} else if (getRight(greatGrandparent) == grandparent) {
setRight(greatGrandparent, parent);
} else {
setLeft(greatGrandparent, parent);
}
setRight(grandparent, getLeft(parent));
setLeft(parent, grandparent);
return false;
}
}
} else {
return true;
}
}
/**
* Add the new key to the tree.
* @return true if the element is a new one.
*/
protected boolean add() {
add(root, false, NULL, NULL, NULL);
if (wasAdd) {
setRed(root, false);
return true;
} else {
return false;
}
}
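A hypothetical usage sketch tying the two methods together (StringRedBlackTree is package-private in Hive, so this is purely illustrative):

StringRedBlackTree dict = new StringRedBlackTree(10000);
int a = dict.add("234");  // 0 -- first distinct key, numbered in insertion order
int b = dict.add("01");   // 1
int c = dict.add("234");  // 0 again -- the duplicate finds the existing node
// dict.size() == 2; visit() walks the keys in sorted order: "01", "234"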
TODO