Reposted from: http://iwinit.iteye.com/blog/1827527
When HBase puts data, it first writes it into memory (the memstore). The in-memory structure is a ConcurrentSkipListMap whose Comparator is a KVComparator.
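The real MemStore wraps this map in a KeyValueSkipListSet, but the core idea fits in a minimal sketch (illustrative code, not the actual MemStore class; it only assumes KeyValue.COMPARATOR and the public KeyValue constructor):

```java
// Minimal sketch: a ConcurrentSkipListMap keyed by KeyValue.COMPARATOR (a
// KVComparator) keeps KeyValues sorted as they are inserted.
import java.util.concurrent.ConcurrentSkipListMap;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.util.Bytes;

public class MemstoreOrderingSketch {
  public static void main(String[] args) {
    ConcurrentSkipListMap<KeyValue, KeyValue> kvset =
        new ConcurrentSkipListMap<KeyValue, KeyValue>(KeyValue.COMPARATOR);

    KeyValue a = new KeyValue(Bytes.toBytes("row2"), Bytes.toBytes("cf"),
        Bytes.toBytes("q"), 1L, Bytes.toBytes("v1"));
    KeyValue b = new KeyValue(Bytes.toBytes("row1"), Bytes.toBytes("cf"),
        Bytes.toBytes("q"), 2L, Bytes.toBytes("v2"));

    kvset.put(a, a);
    kvset.put(b, b);

    // Iteration follows KVComparator order: row1 comes out before row2.
    for (KeyValue kv : kvset.keySet()) {
      System.out.println(kv);
    }
  }
}
```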
KeyValue object structure

How KVComparator compares two KeyValue objects (a simplified sketch follows the list):
1. Compare the rowkeys using KeyComparator; rowkeys sort in ascending byte order.
2. If the rowkeys are equal, compare the column families; families sort in ascending byte order.
3. If the column family is also equal, compare family+qualifier; qualifiers sort in ascending byte order.
4. If the qualifier is equal too, sort by timestamp, in descending order (newest first).
5. If the timestamps are equal, sort by type; a delete sorts before a put.
6. If everything above is equal, sort by memstoreTS. memstoreTS is an atomically incremented id, so two entries can never tie; it sorts in descending order, putting the newest edit first, which is convenient for scans.
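Below is a simplified sketch of these six rules, written as a plain comparator over an illustrative POJO. The field names and the reduced type comparison ("larger type code first", which is what puts deletes ahead of puts) are assumptions for clarity, not the real KVComparator:

```java
import java.util.Comparator;
import org.apache.hadoop.hbase.util.Bytes;

// Illustrative cell with just the fields that participate in the ordering.
class CellSketch {
  byte[] row, family, qualifier;
  long timestamp;
  int type;          // e.g. Delete has a larger type code than Put
  long memstoreTS;
}

class KVComparatorSketch implements Comparator<CellSketch> {
  @Override
  public int compare(CellSketch a, CellSketch b) {
    int cmp = Bytes.compareTo(a.row, b.row);            // 1. rowkey, ascending
    if (cmp != 0) return cmp;
    cmp = Bytes.compareTo(a.family, b.family);          // 2. column family, ascending
    if (cmp != 0) return cmp;
    cmp = Bytes.compareTo(a.qualifier, b.qualifier);    // 3. qualifier, ascending
    if (cmp != 0) return cmp;
    cmp = Long.compare(b.timestamp, a.timestamp);       // 4. timestamp, descending
    if (cmp != 0) return cmp;
    cmp = Integer.compare(b.type, a.type);              // 5. type: larger code (delete) first
    if (cmp != 0) return cmp;
    return Long.compare(b.memstoreTS, a.memstoreTS);    // 6. memstoreTS, descending
  }
}
```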
So KeyValue objects are already sorted in memory. When a flush produces a file, it simply scans them, applies maxVersions (puts beyond maxVersions are dropped at this point), and writes each KeyValue to HDFS.
The flush process that produces an HFile roughly works as follows (a usage-level sketch of the writer follows the list):
1. Construct a writer; the latest implementation is HFileWriterV2, i.e. format version 2.
2. Append KeyValue objects to the writer in a loop. Data is buffered block by block (64 KB by default); every time a block is finished a new one is started, and an index entry is added to the block index. Once the block index exceeds its limit (128 KB by default) it is written out as a special inline block, i.e. an index block. An HFile is therefore an alternating sequence of data blocks and inline blocks.
3. After all KeyValue objects have been written, the remaining index data is written as inline blocks, followed by the root index, file info, and so on.
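From the outside, writing an HFile looks roughly like the sketch below. This is a hedged example against the 0.94-era HFile.getWriterFactory builder API; method names vary across versions, and the flush path itself goes through Store.createWriterInTmp rather than this code:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.util.Bytes;

public class HFileWriteSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    FileSystem fs = FileSystem.get(conf);

    // 64 KB data blocks, as described above.
    HFile.Writer writer = HFile.getWriterFactory(conf, new CacheConfig(conf))
        .withPath(fs, new Path("/tmp/example.hfile"))
        .withBlockSize(64 * 1024)
        .create();
    try {
      // KeyValues must be appended in KVComparator order.
      writer.append(new KeyValue(Bytes.toBytes("row1"), Bytes.toBytes("cf"),
          Bytes.toBytes("q"), 1L, Bytes.toBytes("v1")));
      writer.append(new KeyValue(Bytes.toBytes("row2"), Bytes.toBytes("cf"),
          Bytes.toBytes("q"), 1L, Bytes.toBytes("v2")));
    } finally {
      // close() writes the remaining index blocks, file info and the trailer.
      writer.close();
    }
  }
}
```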
HFile V2 structure: ![HFile V2 structure](http://img.e-com-net.com/image/info5/9b427eb9666a412586d76b2b4dba5e24.png)
The implementation class diagram: ![Implementation class diagram](http://img.e-com-net.com/image/info5/0361e57da19d4a47897ab9efe41f71e4.jpg)
Main flow
```java
// Store.internalFlushCache (excerpt): scan the memstore snapshot and append
// every KeyValue to the HFile writer.
Scan scan = new Scan();
scan.setMaxVersions(scanInfo.getMaxVersions());

// Scan the snapshot set; versions beyond maxVersions are filtered out here.
InternalScanner scanner = new StoreScanner(this, scan, Collections
    .singletonList(new CollectionBackedScanner(set, this.comparator)),
    ScanType.MINOR_COMPACT, this.region.getSmallestReadPoint(),
    HConstants.OLDEST_TIMESTAMP);
try {
  synchronized (flushLock) {
    status.setStatus("Flushing " + this + ": creating writer");
    // Create the HFile writer in the region's .tmp directory.
    writer = createWriterInTmp(set.size());
    writer.setTimeRangeTracker(snapshotTimeRangeTracker);
    pathName = writer.getPath();
    try {
      List<KeyValue> kvs = new ArrayList<KeyValue>();
      boolean hasMore;
      do {
        hasMore = scanner.next(kvs);
        if (!kvs.isEmpty()) {
          for (KeyValue kv : kvs) {
            // If no scanner can still see this edit (its memstoreTS is at or
            // below the smallest read point), the memstoreTS is no longer
            // needed and is reset to 0 before being persisted.
            if (kv.getMemstoreTS() <= smallestReadPoint) {
              kv = kv.shallowCopy();
              kv.setMemstoreTS(0);
            }
            writer.append(kv);
            flushed += this.memstore.heapSizeChange(kv, true);
          }
          kvs.clear();
        }
      } while (hasMore);
    } finally {
      // Append metadata (e.g. the flush sequence id) and close the file.
      status.setStatus("Flushing " + this + ": appending metadata");
      writer.appendMetadata(logCacheFlushId, false);
      status.setStatus("Flushing " + this + ": closing flushed file");
      writer.close();
    }
  }
} finally {
  flushedSize.set(flushed);
  scanner.close();
}
```
The append path
```java
// HFileWriterV2.append (excerpt): write one key/value into the current block buffer.
private void append(final long memstoreTS, final byte[] key, final int koffset,
    final int klength, final byte[] value, final int voffset, final int vlength)
    throws IOException {
  boolean dupKey = checkKey(key, koffset, klength);
  checkValue(value, voffset, vlength);

  // Only check for a block boundary when the key changes, so that all versions
  // of the same key stay in the same block.
  if (!dupKey) {
    checkBlockBoundary();
  }

  if (!fsBlockWriter.isWriting())
    newBlock();

  {
    // Append key length, value length, key, value (and optionally the
    // memstoreTS) to the in-memory block stream.
    DataOutputStream out = fsBlockWriter.getUserDataStream();
    out.writeInt(klength);
    totalKeyLength += klength;
    out.writeInt(vlength);
    totalValueLength += vlength;
    out.write(key, koffset, klength);
    out.write(value, voffset, vlength);
    if (this.includeMemstoreTS) {
      WritableUtils.writeVLong(out, memstoreTS);
    }
  }

  // Remember the first key of the block; it becomes the block index entry.
  if (firstKeyInBlock == null) {
    firstKeyInBlock = new byte[klength];
    System.arraycopy(key, koffset, firstKeyInBlock, 0, klength);
  }

  lastKeyBuffer = key;
  lastKeyOffset = koffset;
  lastKeyLength = klength;
  entryCount++;
}
```
Initializing a new block for writing
```java
// HFileBlock.Writer.startWriting: reset the in-memory buffer and begin a new block.
public DataOutputStream startWriting(BlockType newBlockType)
    throws IOException {
  if (state == State.BLOCK_READY && startOffset != -1) {
    // Remember the offset of the previous block of this type, so the header of
    // the new block can record it.
    prevOffsetByType[blockType.getId()] = startOffset;
  }

  startOffset = -1;
  blockType = newBlockType;

  baosInMemory.reset();
  // Reserve space for the header; it is filled in when the block is finished.
  baosInMemory.write(DUMMY_HEADER);

  state = State.WRITING;

  // The user data stream wraps the in-memory buffer.
  userDataStream = new DataOutputStream(baosInMemory);
  return userDataStream;
}
```
As writing proceeds the block eventually fills up; checkBlockBoundary tests whether the current data block is full:
```java
private void checkBlockBoundary() throws IOException {
  // Not full yet (default block size 64 KB): keep writing.
  if (fsBlockWriter.blockSizeWritten() < blockSize)
    return;
  // Flush the current data block to the HDFS output stream.
  finishBlock();
  // Give the inline block writers (block index, bloom filters) a chance to flush.
  writeInlineBlocks(false);
  // Start a new data block.
  newBlock();
}
```
The finishBlock path flushes the buffered data to the HDFS output stream:
```java
private void finishBlock() throws IOException {
  if (!fsBlockWriter.isWriting() || fsBlockWriter.blockSizeWritten() == 0)
    return;

  long startTimeNs = System.nanoTime();

  // Record the offsets of the first and last data blocks for the trailer.
  if (firstDataBlockOffset == -1) {
    firstDataBlockOffset = outputStream.getPos();
  }
  lastDataBlockOffset = outputStream.getPos();

  // Serialize the block (header + data) into the HDFS stream.
  fsBlockWriter.writeHeaderAndData(outputStream);

  int onDiskSize = fsBlockWriter.getOnDiskSizeWithHeader();

  // Add an index entry: first key of the block, its offset and on-disk size.
  dataBlockIndexWriter.addEntry(firstKeyInBlock, lastDataBlockOffset,
      onDiskSize);

  totalUncompressedBytes += fsBlockWriter.getUncompressedSizeWithHeader();

  HFile.offerWriteLatency(System.nanoTime() - startTimeNs);
  HFile.offerWriteData(onDiskSize);

  if (cacheConf.shouldCacheDataOnWrite()) {
    doCacheOnWrite(lastDataBlockOffset);
  }
}
```
Writing into the HDFS stream:
```java
public void writeHeaderAndData(FSDataOutputStream out) throws IOException {
  long offset = out.getPos();
  if (startOffset != -1 && offset != startOffset) {
    throw new IOException("A " + blockType + " block written to a "
        + "stream twice, first at offset " + startOffset + ", then at "
        + offset);
  }
  // Remember where this block starts in the file.
  startOffset = offset;

  writeHeaderAndData((DataOutputStream) out);
}

private void writeHeaderAndData(DataOutputStream out) throws IOException {
  // Make sure the block has been finished (encoded, compressed, checksummed,
  // header filled in).
  ensureBlockReady();
  out.write(onDiskBytesWithHeader);
  if (compressAlgo == NONE) {
    if (onDiskChecksum == HConstants.EMPTY_BYTE_ARRAY) {
      throw new IOException("A " + blockType
          + " without compression should have checksums "
          + " stored separately.");
    }
    // Uncompressed blocks keep their checksums in a separate array,
    // written right after the block.
    out.write(onDiskChecksum);
  }
}
```
Processing the buffered data, including encoding and compression:
```java
private void finishBlock() throws IOException {
  userDataStream.flush();

  // Snapshot the uncompressed bytes (header placeholder + data).
  uncompressedBytesWithHeader = baosInMemory.toByteArray();

  prevOffset = prevOffsetByType[blockType.getId()];

  state = State.BLOCK_READY;

  // Optionally apply data block encoding, then compression and checksums.
  encodeDataBlockForDisk();
  doCompressionAndChecksumming();
}
```
Compression and checksumming: if the block is compressed, the checksum bytes are placed directly inside onDiskBytesWithHeader; otherwise they go into the separate onDiskChecksum array. In both cases the block header is written:
```java
private void doCompressionAndChecksumming() throws IOException {
  if (compressAlgo != NONE) {
    compressedByteStream.reset();
    compressedByteStream.write(DUMMY_HEADER);

    compressionStream.resetState();
    // Compress everything after the header placeholder.
    compressionStream.write(uncompressedBytesWithHeader, HEADER_SIZE,
        uncompressedBytesWithHeader.length - HEADER_SIZE);
    compressionStream.flush();
    compressionStream.finish();

    onDiskDataSizeWithHeader = compressedByteStream.size();

    // Reserve room for the checksums right after the compressed data.
    ChecksumUtil.reserveSpaceForChecksums(compressedByteStream,
        onDiskDataSizeWithHeader, bytesPerChecksum);

    // Header + compressed data + checksum space in a single array.
    onDiskBytesWithHeader = compressedByteStream.toByteArray();
    putHeader(onDiskBytesWithHeader, 0, onDiskBytesWithHeader.length,
        uncompressedBytesWithHeader.length, onDiskDataSizeWithHeader);

    // Generate the checksums in place, after the compressed data.
    ChecksumUtil.generateChecksums(
        onDiskBytesWithHeader, 0, onDiskDataSizeWithHeader,
        onDiskBytesWithHeader, onDiskDataSizeWithHeader,
        checksumType, bytesPerChecksum);

    // Checksums live inside onDiskBytesWithHeader, so this stays empty.
    onDiskChecksum = HConstants.EMPTY_BYTE_ARRAY;

    // Also fix up the header of the uncompressed copy.
    putHeader(uncompressedBytesWithHeader, 0,
        onDiskBytesWithHeader.length + onDiskChecksum.length,
        uncompressedBytesWithHeader.length, onDiskDataSizeWithHeader);

  } else {
    // No compression: the on-disk bytes are the uncompressed bytes.
    onDiskBytesWithHeader = uncompressedBytesWithHeader;
    onDiskDataSizeWithHeader = onDiskBytesWithHeader.length;

    int numBytes = (int) ChecksumUtil.numBytes(
        uncompressedBytesWithHeader.length,
        bytesPerChecksum);

    // Checksums go into a separate array, written after the block.
    onDiskChecksum = new byte[numBytes];

    putHeader(uncompressedBytesWithHeader, 0,
        onDiskBytesWithHeader.length + onDiskChecksum.length,
        uncompressedBytesWithHeader.length, onDiskDataSizeWithHeader);

    ChecksumUtil.generateChecksums(
        uncompressedBytesWithHeader, 0, uncompressedBytesWithHeader.length,
        onDiskChecksum, 0,
        checksumType, bytesPerChecksum);
  }
}
```
After a data block is finished, the block index is updated. An index entry consists of the block's first key, its start offset, and its on-disk size. There are two kinds of index chunks: leaf-level chunks and the root index chunk.
```java
void add(byte[] firstKey, long blockOffset, int onDiskDataSize,
    long curTotalNumSubEntries) {
  // Record the offset of this entry within the non-root (leaf) block; these
  // marks form the secondary index used for binary search inside the block.
  secondaryIndexOffsetMarks.add(curTotalNonRootEntrySize);
  curTotalNonRootEntrySize += SECONDARY_INDEX_ENTRY_OVERHEAD
      + firstKey.length;

  // Size this entry would occupy in a root-level index block.
  curTotalRootSize += Bytes.SIZEOF_LONG + Bytes.SIZEOF_INT
      + WritableUtils.getVIntSize(firstKey.length) + firstKey.length;

  blockKeys.add(firstKey);
  blockOffsets.add(blockOffset);
  onDiskDataSizes.add(onDiskDataSize);

  if (curTotalNumSubEntries != -1) {
    numSubEntriesAt.add(curTotalNumSubEntries);

    if (numSubEntriesAt.size() != blockKeys.size()) {
      throw new IllegalStateException("Only have key/value count " +
          "stats for " + numSubEntriesAt.size() + " block index " +
          "entries out of " + blockKeys.size());
    }
  }
}
```
Back in the main loop: after finishBlock, the buffered data has been flushed to the HDFS stream. The inline block writers then get their chance to check whether they are full; if so, the index block is flushed to HDFS as well:
```java
private void writeInlineBlocks(boolean closing) throws IOException {
  for (InlineBlockWriter ibw : inlineBlockWriters) {
    while (ibw.shouldWriteBlock(closing)) {
      long offset = outputStream.getPos();
      boolean cacheThisBlock = ibw.cacheOnWrite();
      // Serialize the inline block (e.g. a leaf index chunk) into the block
      // writer, then flush it to the HDFS stream.
      ibw.writeInlineBlock(fsBlockWriter.startWriting(
          ibw.getInlineBlockType()));
      fsBlockWriter.writeHeaderAndData(outputStream);
      // Tell the inline writer where its block landed and how big it is.
      ibw.blockWritten(offset, fsBlockWriter.getOnDiskSizeWithHeader(),
          fsBlockWriter.getUncompressedSizeWithoutHeader());
      totalUncompressedBytes += fsBlockWriter.getUncompressedSizeWithHeader();

      if (cacheThisBlock) {
        doCacheOnWrite(offset);
      }
    }
  }
}
```
For the BlockIndexWriter, shouldWriteBlock boils down to a size check against maxChunkSize (128 KB by default):

```java
curInlineChunk.getNonRootSize() >= maxChunkSize;
```
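A rough back-of-the-envelope sketch of when that threshold fires; the 12-byte per-entry overhead mirrors SECONDARY_INDEX_ENTRY_OVERHEAD in the add() code above, while the 50-byte first-key length and 64 KB data block size are assumptions for illustration:

```java
public class LeafIndexChunkEstimate {
  public static void main(String[] args) {
    int maxChunkSize = 128 * 1024;        // default leaf index chunk limit
    int entryOverhead = 8 + 4;            // long block offset + int on-disk size
    int assumedFirstKeyLen = 50;          // assumption: average first-key length
    int assumedDataBlockSize = 64 * 1024; // assumption: default data block size

    int entriesPerLeaf = maxChunkSize / (entryOverhead + assumedFirstKeyLen);
    long dataPerLeaf = (long) entriesPerLeaf * assumedDataBlockSize;

    // Roughly 2,000 data blocks, i.e. on the order of 130 MB of data,
    // before one leaf index chunk is flushed as an inline block.
    System.out.println(entriesPerLeaf + " data blocks ~ "
        + (dataPerLeaf >> 20) + " MB per leaf index chunk");
  }
}
```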
Here is how BlockIndexWriter writes an inline (leaf) block:
```java
public void writeInlineBlock(DataOutput out) throws IOException {
  if (singleLevelOnly)
    throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED);

  // Write the current chunk in non-root (leaf) format.
  curInlineChunk.writeNonRoot(out);

  // Remember the first key of the chunk; blockWritten() uses it to add an
  // entry to the root chunk.
  firstKey = curInlineChunk.getBlockKey(0);

  // Start a fresh chunk for subsequent data blocks.
  curInlineChunk.clear();
}
```
Writing the leaf chunk:
```java
void writeNonRoot(DataOutput out) throws IOException {
  // Number of entries in this leaf block.
  out.writeInt(blockKeys.size());

  if (secondaryIndexOffsetMarks.size() != blockKeys.size()) {
    throw new IOException("Corrupted block index chunk writer: " +
        blockKeys.size() + " entries but " +
        secondaryIndexOffsetMarks.size() + " secondary index items");
  }

  // Secondary index: the relative offset of every entry, so a reader can
  // binary-search inside the leaf block without scanning it.
  for (int currentSecondaryIndex : secondaryIndexOffsetMarks)
    out.writeInt(currentSecondaryIndex);

  // Total size of all entries, marking the end of the secondary index.
  out.writeInt(curTotalNonRootEntrySize);

  // The entries themselves: block offset, on-disk size, first key.
  for (int i = 0; i < blockKeys.size(); ++i) {
    out.writeLong(blockOffsets.get(i));
    out.writeInt(onDiskDataSizes.get(i));
    out.write(blockKeys.get(i));
  }
}
```
After an index block has been written to the HDFS stream, the root chunk is updated. The root chunk is an index over the data block index blocks; it is only flushed to the HDFS stream after all KeyValues have been written, at which point it may be split into further levels, but during the write loop there are only two levels.
```java
public void blockWritten(long offset, int onDiskSize, int uncompressedSize)
{
  totalBlockOnDiskSize += onDiskSize;
  totalBlockUncompressedSize += uncompressedSize;

  if (singleLevelOnly)
    throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED);

  if (firstKey == null) {
    throw new IllegalStateException("Trying to add second-level index " +
        "entry with offset=" + offset + " and onDiskSize=" + onDiskSize +
        "but the first key was not set in writeInlineBlock");
  }

  if (rootChunk.getNumEntries() == 0) {
    // The first leaf chunk has just been written, so the index now has two levels.
    expectNumLevels(1);
    numLevels = 2;
  }

  // Add a root-level entry pointing at the leaf index block just written.
  rootChunk.add(firstKey, offset, onDiskSize, totalNumEntries);
  firstKey = null;
}
```
That is the gist of the write loop: data blocks and data index blocks are written out alternately. Once all the data has been written, the file is closed; here is HFileWriterV2.close():
```java
public void close() throws IOException {
  if (outputStream == null) {
    return;
  }

  // Flush the last data block, then force the inline block writers to write
  // whatever they still hold (closing=true).
  finishBlock();
  writeInlineBlocks(true);

  FixedFileTrailer trailer = new FixedFileTrailer(2,
      HFileReaderV2.MAX_MINOR_VERSION);

  // Write meta blocks, if any, and index them in the meta block index.
  if (!metaNames.isEmpty()) {
    for (int i = 0; i < metaNames.size(); ++i) {
      long offset = outputStream.getPos();
      DataOutputStream dos = fsBlockWriter.startWriting(BlockType.META);
      metaData.get(i).write(dos);
      fsBlockWriter.writeHeaderAndData(outputStream);
      totalUncompressedBytes += fsBlockWriter.getUncompressedSizeWithHeader();

      metaBlockIndexWriter.addEntry(metaNames.get(i), offset,
          fsBlockWriter.getOnDiskSizeWithHeader());
    }
  }

  // Write the data block index: any remaining intermediate levels plus the
  // root index block. Its offset is recorded in the trailer as the start of
  // the load-on-open section.
  long rootIndexOffset = dataBlockIndexWriter.writeIndexBlocks(outputStream);
  trailer.setLoadOnOpenOffset(rootIndexOffset);

  // Meta block index (single level).
  metaBlockIndexWriter.writeSingleLevelIndex(fsBlockWriter.startWriting(
      BlockType.ROOT_INDEX), "meta");
  fsBlockWriter.writeHeaderAndData(outputStream);
  totalUncompressedBytes += fsBlockWriter.getUncompressedSizeWithHeader();

  if (this.includeMemstoreTS) {
    appendFileInfo(MAX_MEMSTORE_TS_KEY, Bytes.toBytes(maxMemstoreTS));
    appendFileInfo(KEY_VALUE_VERSION, Bytes.toBytes(KEY_VALUE_VER_WITH_MEMSTORE));
  }

  // File info block.
  writeFileInfo(trailer, fsBlockWriter.startWriting(BlockType.FILE_INFO));
  fsBlockWriter.writeHeaderAndData(outputStream);
  totalUncompressedBytes += fsBlockWriter.getUncompressedSizeWithHeader();

  // Additional load-on-open blocks, e.g. bloom filter metadata.
  for (BlockWritable w : additionalLoadOnOpenData){
    fsBlockWriter.writeBlock(w, outputStream);
    totalUncompressedBytes += fsBlockWriter.getUncompressedSizeWithHeader();
  }

  // Fill in the trailer and finish the file.
  trailer.setNumDataIndexLevels(dataBlockIndexWriter.getNumLevels());
  trailer.setUncompressedDataIndexSize(
      dataBlockIndexWriter.getTotalUncompressedSize());
  trailer.setFirstDataBlockOffset(firstDataBlockOffset);
  trailer.setLastDataBlockOffset(lastDataBlockOffset);
  trailer.setComparatorClass(comparator.getClass());
  trailer.setDataIndexCount(dataBlockIndexWriter.getNumRootEntries());

  finishClose(trailer);

  fsBlockWriter.releaseCompressor();
}
```