Our project uses Flink SQL to write both offline (batch) and streaming data into Hudi. This article analyzes the Hudi write path bottom-up. Hudi version: 0.10.0-patch.
HoodieWriteHandle processes records and writes them to the underlying DFS.
- FlinkCreateHandle: creates a new parquet file and writes a batch of records into it, e.g. batch insert into a COW table.
- FlinkAppendHandle: appends records to an existing hoodie log file, e.g. upsert/delete records of a MOR table.
- FlinkMergeHandle: merges historical data with incoming data and writes out a new parquet file, e.g. update of a COW table, compaction of a MOR table.
HoodieFileWriter: the record writer for a given file format, which also maintains a bloom filter. HoodieMergeHandle and HoodieCreateHandle depend on this component.
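To make the role of that bloom filter concrete: while writing a base file, the writer adds every record key to the filter so that an index lookup (e.g. Spark's BloomIndex) can skip files that definitely do not contain a key. A minimal conceptual sketch, using Guava's BloomFilter as a stand-in for Hudi's own implementation (the sizing numbers are illustrative):
import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;
import java.nio.charset.StandardCharsets;

public class KeyBloomFilterSketch {
  public static void main(String[] args) {
    // stand-in for the filter a file writer keeps while writing a base file
    BloomFilter<String> keys = BloomFilter.create(
        Funnels.stringFunnel(StandardCharsets.UTF_8), 60_000, 0.000001);
    keys.put("u001");
    keys.put("u002");
    // an index lookup can skip this file when mightContain(...) is false;
    // false positives are possible, false negatives are not
    System.out.println(keys.mightContain("u001")); // true
    System.out.println(keys.mightContain("u999")); // very likely false
  }
}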
FlinkCreateHandle
Creates a parquet file, fills in the Hudi metadata columns for each record, and writes the records out. Main steps:
- Writes the metadata file .hoodie_partition_metadata into the partition, recording the partition creation time and the partition depth (a small read sketch of this file follows the list):
#partition metadata
#Thu Mar 10 03:07:01 UTC 2022
commitTime=20220310030659727
partitionDepth=1
- Sends a create-marker request to the timeline server, recording that a CREATE was performed on a given file group of a given partition at a given instant; on rollback the target parquet file is deleted based on this marker.
- Creates a HoodieFileWriter for the output file format; the default is Parquet, and ORC is also supported. HoodieParquetWriter maintains a BloomFilter, which is of little use to Flink but allows Spark to use the BloomIndex.
- Adds the Hudi metadata columns to each record and writes it to the target file.
- Fills a WriteStatus as the final result; once StreamWriteOperatorCoordinator has received the WriteStatus of all tasks, it writes the commit metadata under .hoodie.
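The .hoodie_partition_metadata file shown above is a plain Java properties file, so it can be inspected with nothing but the JDK. A small sketch (the path below is hypothetical):
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Properties;

public class ReadPartitionMetadata {
  public static void main(String[] args) throws IOException {
    // point this at <base_path>/<partition>/.hoodie_partition_metadata
    String path = "/tmp/hudi/t1/202203/.hoodie_partition_metadata";
    Properties props = new Properties();
    try (FileInputStream in = new FileInputStream(path)) {
      props.load(in); // the file is key=value pairs plus # comment lines
    }
    System.out.println("commitTime     = " + props.getProperty("commitTime"));
    System.out.println("partitionDepth = " + props.getProperty("partitionDepth"));
  }
}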
public HoodieCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable,
    String partitionPath, String fileId, Option<Schema> overriddenSchema,
    TaskContextSupplier taskContextSupplier, boolean preserveHoodieMetadata) {
  super(config, instantTime, partitionPath, fileId, hoodieTable, overriddenSchema,
      taskContextSupplier);
  this.preserveHoodieMetadata = preserveHoodieMetadata;
  writeStatus.setFileId(fileId);
  writeStatus.setPartitionPath(partitionPath);
  writeStatus.setStat(new HoodieWriteStat());
  // build the parquet path, e.g. xxx/202203/710c791b-2358-4903-bef8-b885f48a40b8_1-2-0_20220318111533929.parquet
  this.path = makeNewPath(partitionPath);
  try {
    /**
     * Write the partition metadata file, recording the partition creation time and depth, e.g.:
     * #partition metadata
     * #Thu Mar 17 16:55:10 CST 2022
     * commitTime=20220317165442543
     * partitionDepth=1
     */
    HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, instantTime,
        new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath));
    partitionMetadata.trySave(getPartitionId());
    // create the marker file: which instant performed which operation on which partition / file group
    createMarkerFile(partitionPath, FSUtils.makeDataFileName(this.instantTime, this.writeToken, this.fileId, hoodieTable.getBaseFileExtension()));
    // create the file writer: FileWriter --> HoodieFileWriter --> HoodieParquetWriter
    this.fileWriter = HoodieFileWriterFactory.getFileWriter(instantTime, path, hoodieTable, config,
        writeSchemaWithMetaFields, this.taskContextSupplier);
  } catch (IOException e) {
    throw new HoodieInsertException("Failed to initialize HoodieStorageWriter for path " + path, e);
  }
  LOG.info("New CreateHandle for partition :" + partitionPath + " with fileId " + fileId);
}
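The base-file name built by makeNewPath / FSUtils.makeDataFileName above follows the pattern fileId_writeToken_instantTime.extension. A tiny illustration (not Hudi's own utility) that reproduces the example from the comment:
public class DataFileNameExample {
  // illustrative only: mirrors the naming convention, not FSUtils.makeDataFileName itself
  static String makeDataFileName(String instantTime, String writeToken, String fileId, String extension) {
    return String.format("%s_%s_%s%s", fileId, writeToken, instantTime, extension);
  }

  public static void main(String[] args) {
    System.out.println(makeDataFileName("20220318111533929", "1-2-0",
        "710c791b-2358-4903-bef8-b885f48a40b8", ".parquet"));
    // -> 710c791b-2358-4903-bef8-b885f48a40b8_1-2-0_20220318111533929.parquet
  }
}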
// Entry point: HoodieWriteHandle#write
public void write(HoodieRecord record, Option<IndexedRecord> avroRecord) {
  Option<Map<String, String>> recordMetadata = record.getData().getMetadata();
  if (HoodieOperation.isDelete(record.getOperation())) {
    avroRecord = Option.empty();
  }
  try {
    if (avroRecord.isPresent()) {
      if (avroRecord.get().equals(IGNORE_RECORD)) {
        return;
      }
      // rewrite the record against the schema that contains the Hudi metadata fields
      IndexedRecord recordWithMetadataInSchema = rewriteRecord((GenericRecord) avroRecord.get());
      if (preserveHoodieMetadata) {
        fileWriter.writeAvro(record.getRecordKey(), recordWithMetadataInSchema);
      } else {
        // populate the Hudi metadata columns and write the record
        fileWriter.writeAvroWithMetadata(recordWithMetadataInSchema, record);
      }
      // update the new location of record, so we know where to find it next
      record.unseal();
      record.setNewLocation(new HoodieRecordLocation(instantTime, writeStatus.getFileId()));
      record.seal();
      recordsWritten++;
      insertRecordsWritten++;
    } else {
      recordsDeleted++;
    }
    writeStatus.markSuccess(record, recordMetadata);
    // deflate record payload after recording success. This will help users access payload as a
    // part of marking record successful.
    record.deflate();
  } catch (Throwable t) {
    // Not throwing exception from here, since we don't want to fail the entire job
    // for a single record
    writeStatus.markFailure(record, t, recordMetadata);
    LOG.error("Error writing record " + record, t);
  }
}
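What rewriteRecord and writeAvroWithMetadata achieve, conceptually, is to prepend the five Hudi metadata columns to each record before it reaches the parquet file. A self-contained sketch with plain Avro; the schema, class name and field values are illustrative, only the _hoodie_* column names come from Hudi:
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;

public class MetaColumnExample {
  // a demo schema: the five Hudi metadata columns followed by two business columns
  private static final String SCHEMA_JSON = "{\"type\":\"record\",\"name\":\"demo\",\"fields\":["
      + "{\"name\":\"_hoodie_commit_time\",\"type\":[\"null\",\"string\"],\"default\":null},"
      + "{\"name\":\"_hoodie_commit_seqno\",\"type\":[\"null\",\"string\"],\"default\":null},"
      + "{\"name\":\"_hoodie_record_key\",\"type\":[\"null\",\"string\"],\"default\":null},"
      + "{\"name\":\"_hoodie_partition_path\",\"type\":[\"null\",\"string\"],\"default\":null},"
      + "{\"name\":\"_hoodie_file_name\",\"type\":[\"null\",\"string\"],\"default\":null},"
      + "{\"name\":\"id\",\"type\":\"string\"},"
      + "{\"name\":\"name\",\"type\":\"string\"}]}";

  public static void main(String[] args) {
    Schema schema = new Schema.Parser().parse(SCHEMA_JSON);
    GenericRecord record = new GenericData.Record(schema);
    // business columns
    record.put("id", "u001");
    record.put("name", "foo");
    // metadata columns filled in by the write handle (values here are made up)
    record.put("_hoodie_commit_time", "20220318111533929");
    record.put("_hoodie_commit_seqno", "20220318111533929_1_1");
    record.put("_hoodie_record_key", "u001");
    record.put("_hoodie_partition_path", "202203");
    record.put("_hoodie_file_name", "710c791b-2358-4903-bef8-b885f48a40b8_1-2-0_20220318111533929.parquet");
    System.out.println(record);
  }
}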
FlinkAppendHandle
Appends records to the log file of a MOR table to perform upserts/deletes, by writing HoodieLogBlocks through HoodieLogFormatWriter.
Main steps:
- Looks up the latest fileSlice from the HoodieTableFileSystemView by partition and fileId.
- Asks the timeline server to create a marker.
- If this is a new partition, writes .hoodie_partition_metadata to record the partition metadata.
- Writes HoodieLogBlocks to the fileSlice's log file through HoodieLogFormatWriter; records are buffered first and flushed on close() or when the configured maxBlockSize is reached.
- Fills a WriteStatus as the result.
public void write(HoodieRecord record, Option<IndexedRecord> insertValue) { ... }
Buffered records are wrapped into a HoodieDataBlock and written to the hudi log by HoodieLogFormatWriter.
protected void appendDataAndDeleteBlocks(Map<HeaderMetadataType, String> header) {
  try {
    // Header metadata for a log block
    header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, instantTime);
    header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, writeSchemaWithMetaFields.toString());
    // at most one DataBlock and one DeleteBlock
    List<HoodieLogBlock> blocks = new ArrayList<>(2);
    if (recordList.size() > 0) {
      if (config.populateMetaFields()) {
        // flink ==> the key field is taken from the metadata columns; build a HoodieAvroDataBlock
        HoodieLogBlock dataBlock = HoodieDataBlock.getBlock(hoodieTable.getLogDataBlockFormat(), recordList, header);
        blocks.add(dataBlock);
      } else {
        final String keyField = hoodieTable.getMetaClient().getTableConfig().getRecordKeyFieldProp();
        blocks.add(HoodieDataBlock.getBlock(hoodieTable.getLogDataBlockFormat(), recordList, header, keyField));
      }
    }
    // if there are records to delete, create a HoodieDeleteBlock carrying keysToDelete
    if (keysToDelete.size() > 0) {
      blocks.add(new HoodieDeleteBlock(keysToDelete.toArray(new HoodieKey[keysToDelete.size()]), header));
    }
    if (blocks.size() > 0) {
      // write the HoodieLogBlocks through HoodieLogFormatWriter
      AppendResult appendResult = writer.appendBlocks(blocks);
      // record the append result in the write status
      processAppendResult(appendResult);
      recordList.clear();
      keysToDelete.clear();
    }
  } catch (Exception e) {
    throw new HoodieAppendException("Failed while appending records to " + writer.getLogFile().getPath(), e);
  }
}
HoodieLogBlock
Hudi writes its logs in its own HoodieLogBlock format; different block types are handled differently at read time. A block contains a Header, Footer, Content, Version, etc.
- HoodieDataBlock: a data block holding serialized records, HoodieAvroDataBlock by default.
- HoodieDeleteBlock: a block made up of the HoodieKeys to delete.
- HoodieCorruptBlock: a corrupt block; a block that fails validation at read time is treated as a HoodieCorruptBlock.
- HoodieCommandBlock: a command block; the only type is ROLLBACK_PREVIOUS_BLOCK, written when a rollback happens. When the scanner hits such a block it skips the target blocks (see the simplified sketch after this list).
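A simplified model of that skip behaviour: when the scanner meets a ROLLBACK_PREVIOUS_BLOCK block, the blocks written by the rolled-back instant are dropped from the result. The classes below are illustrative stand-ins, not Hudi's scanner:
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Deque;
import java.util.List;

public class LogScanSketch {
  enum Type { DATA, DELETE, COMMAND_ROLLBACK }

  static class Block {
    final Type type;
    final String instantTime;   // instant that wrote this block
    final String targetInstant; // for rollback blocks: instant whose blocks must be skipped
    Block(Type type, String instantTime, String targetInstant) {
      this.type = type;
      this.instantTime = instantTime;
      this.targetInstant = targetInstant;
    }
  }

  static Deque<Block> scan(List<Block> blocksInLogOrder) {
    Deque<Block> valid = new ArrayDeque<>();
    for (Block b : blocksInLogOrder) {
      if (b.type == Type.COMMAND_ROLLBACK) {
        // drop previously read blocks written by the rolled-back instant
        while (!valid.isEmpty() && valid.peekLast().instantTime.equals(b.targetInstant)) {
          valid.pollLast();
        }
      } else {
        valid.addLast(b);
      }
    }
    return valid; // only blocks from successful instants remain
  }

  public static void main(String[] args) {
    Deque<Block> result = scan(Arrays.asList(
        new Block(Type.DATA, "t1", null),
        new Block(Type.DATA, "t2", null),               // instant t2 later failed
        new Block(Type.COMMAND_ROLLBACK, "t3", "t2"))); // rollback of t2
    System.out.println(result.size()); // 1 -> only the t1 data block survives
  }
}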
HoodieAvroDataBlock#serializeRecords
protected byte[] serializeRecords() throws IOException {
  Schema schema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA));
  GenericDatumWriter<IndexedRecord> writer = new GenericDatumWriter<>(schema);
  ByteArrayOutputStream baos = new ByteArrayOutputStream();
  DataOutputStream output = new DataOutputStream(baos);
  // 1. Write out the log block version
  output.writeInt(HoodieLogBlock.version);
  // 2. Write total number of records
  output.writeInt(records.size());
  // 3. Write the records, one avro record at a time
  Iterator<IndexedRecord> itr = records.iterator();
  while (itr.hasNext()) {
    IndexedRecord s = itr.next();
    ByteArrayOutputStream temp = new ByteArrayOutputStream();
    BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(temp, encoderCache.get());
    encoderCache.set(encoder);
    try {
      // Encode the record into bytes
      writer.write(s, encoder);
      encoder.flush();
      // Get the size of the bytes
      int size = temp.toByteArray().length;
      // Write the record size
      output.writeInt(size);
      // Write the content
      output.write(temp.toByteArray());
      itr.remove();
    } catch (IOException e) {
      throw new HoodieIOException("IOException converting HoodieAvroDataBlock to bytes", e);
    }
  }
  output.close();
  return baos.toByteArray();
}
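The inverse operation can be derived directly from the layout above: an int version, an int record count, then size-prefixed avro bytes per record. A minimal reader sketch (plain Avro, not Hudi's own deserialization code):
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.DecoderFactory;

public class AvroDataBlockReaderSketch {
  static List<GenericRecord> deserialize(byte[] content, Schema schema) throws IOException {
    List<GenericRecord> records = new ArrayList<>();
    GenericDatumReader<GenericRecord> reader = new GenericDatumReader<>(schema);
    try (DataInputStream input = new DataInputStream(new ByteArrayInputStream(content))) {
      int version = input.readInt();      // 1. log block version (not used further here)
      int totalRecords = input.readInt(); // 2. number of records
      for (int i = 0; i < totalRecords; i++) {
        int size = input.readInt();       // 3. size of the next record
        byte[] bytes = new byte[size];
        input.readFully(bytes);
        BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(bytes, null);
        records.add(reader.read(null, decoder));
      }
    }
    return records;
  }
}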
HoodieLogFormatWriter
Writes HoodieLogBlocks into the log file in a specific layout; if the underlying file system does not support append, a new log file is created instead.
Builds the OutputStream for the log file. If the DFS does not support append, the writer rolls over to a newly created file; note the name of the new file.
private FSDataOutputStream getOutputStream() throws IOException, InterruptedException {
  if (this.output == null) {
    Path path = logFile.getPath();
    if (fs.exists(path)) {
      // does the file system support append? only a few do, HDFS among them
      boolean isAppendSupported = StorageSchemes.isAppendSupported(fs.getScheme());
      if (isAppendSupported) {
        LOG.info(logFile + " exists. Appending to existing file");
        try {
          // open the path for append and record the offset
          this.output = fs.append(path, bufferSize);
        } catch (RemoteException e) {
          LOG.warn("Remote Exception, attempting to handle or recover lease", e);
          handleAppendExceptionOrRecoverLease(path, e);
        } catch (IOException ioe) {
          if (ioe.getMessage().toLowerCase().contains("not supported")) {
            // may still happen if scheme is viewfs.
            isAppendSupported = false;
          } else {
            close();
            throw ioe;
          }
        }
      }
      // file systems that do not support append get a brand-new log file
      if (!isAppendSupported) {
        // roll over to a new log file: max version + 1
        rollOver();
        // create the new log file
        createNewFile();
        LOG.info("Append not supported.. Rolling over to " + logFile);
      }
    } else {
      LOG.info(logFile + " does not exist. Create a new file");
      // Block size does not matter as we will always manually autoflush
      createNewFile();
    }
  }
  return output;
}
The on-disk layout of a block in the log file (fields in write order):

| magic |
| --- |
| block length |
| log version |
| block type |
| block header |
| block content length |
| block content |
| block footer |
| block length |
public AppendResult appendBlocks(List<HoodieLogBlock> blocks) throws IOException, InterruptedException {
  // Find current version of the log format
  HoodieLogFormat.LogFormatVersion currentLogFormatVersion =
      new HoodieLogFormatVersion(HoodieLogFormat.CURRENT_VERSION);
  // build the outputStream for the log file
  FSDataOutputStream outputStream = getOutputStream();
  long startPos = outputStream.getPos();
  long sizeWritten = 0;
  for (HoodieLogBlock block : blocks) {
    long startSize = outputStream.size();
    // 1. Write the magic header for the start of the block
    outputStream.write(HoodieLogFormat.MAGIC);
    // bytes for the header: instant time, schema, ...
    byte[] headerBytes = HoodieLogBlock.getLogMetadataBytes(block.getLogBlockHeader());
    // content bytes: the records serialized in avro format
    byte[] content = block.getContentBytes();
    // bytes for the footer (an empty map here)
    byte[] footerBytes = HoodieLogBlock.getLogMetadataBytes(block.getLogBlockFooter());
    // 2. Write the total size of the block (excluding Magic)
    outputStream.writeLong(getLogBlockLength(content.length, headerBytes.length, footerBytes.length));
    // 3. Write the version of this log block
    outputStream.writeInt(currentLogFormatVersion.getVersion());
    // 4. Write the block type
    outputStream.writeInt(block.getBlockType().ordinal());
    // 5. Write the headers for the log block
    outputStream.write(headerBytes);
    // 6. Write the size of the content block
    outputStream.writeLong(content.length);
    // 7. Write the contents of the data block
    outputStream.write(content);
    // 8. Write the footers for the log block
    outputStream.write(footerBytes);
    // 9. Write the total size of the log block (including magic) which is everything written
    // until now (for reverse pointer)
    // Update: this information is now used in determining if a block is corrupt by comparing to the
    // block size in header. This change assumes that the block size will be the last data written
    // to a block. Read will break if any data is written past this point for a block.
    outputStream.writeLong(outputStream.size() - startSize);
    // Fetch the size again, so it accounts also (9).
    sizeWritten += outputStream.size() - startSize;
  }
  // Flush all blocks to disk
  flush();
  // the result of this append: log file, start offset, bytes written
  AppendResult result = new AppendResult(logFile, startPos, sizeWritten);
  // roll over if size is past the threshold
  rolloverIfNeeded();
  return result;
}
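Step 9 above is the "reverse pointer" used for corrupt-block detection: the size written at the end of the block must agree with what the header implies, otherwise the block (e.g. one left by a partial write) is treated as corrupt. A simplified illustration with a deliberately smaller frame than Hudi's real layout:
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

public class ReversePointerSketch {
  static final byte[] MAGIC = "#MAGIC".getBytes(StandardCharsets.UTF_8);

  // frame: [magic][content length][content][total block size]
  static byte[] writeBlock(byte[] content) throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(baos);
    out.write(MAGIC);
    out.writeLong(content.length);
    out.write(content);
    out.writeLong(MAGIC.length + 8 + content.length + 8); // the reverse pointer
    return baos.toByteArray();
  }

  static boolean isCorrupt(byte[] block) {
    try (DataInputStream in = new DataInputStream(new ByteArrayInputStream(block))) {
      in.skipBytes(MAGIC.length);
      long contentLen = in.readLong();
      in.skipBytes((int) contentLen);
      long reversePointer = in.readLong();
      // mismatch between trailing size and what the header implies -> corrupt
      return reversePointer != MAGIC.length + 8 + contentLen + 8;
    } catch (IOException e) {
      return true; // truncated block: cannot even read the trailing size
    }
  }

  public static void main(String[] args) throws IOException {
    byte[] ok = writeBlock("hello".getBytes(StandardCharsets.UTF_8));
    byte[] truncated = Arrays.copyOf(ok, ok.length - 3); // simulate a partial write
    System.out.println(isCorrupt(ok));        // false
    System.out.println(isCorrupt(truncated)); // true
  }
}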
FlinkMergeHandle
Merges incoming records with historical data and writes out a new base file, e.g. update of a COW table, compaction of a MOR table.
Main steps:
- Builds an ExternalSpillableMap from the incoming records. The map offers O(1) lookups and can spill to disk; the disk map is backed by RocksDB or BitCask, BitCask by default. In the compaction case the incoming records are the log-file records read and forwarded by the upstream operator.
- Receives the historical records and matches them against the incoming records in the ExternalSpillableMap. On a match, the columns are merged according to the payload merge strategy (Flink by default takes all columns of the new record); otherwise the historical record is forwarded as-is into the new parquet file.
- On close, the incoming records that matched no historical record are written out (a stripped-down sketch of the whole flow follows this list).
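The sketch below captures those three steps, with a plain HashMap standing in for the ExternalSpillableMap and strings standing in for records and payloads; all names here are illustrative:
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class MergeFlowSketch {
  private final Map<String, String> keyToNewRecords = new HashMap<>();
  private final Set<String> writtenKeys = new HashSet<>();
  private final List<String> newBaseFile = new ArrayList<>();

  MergeFlowSketch(Map<String, String> incoming) {
    keyToNewRecords.putAll(incoming); // step 1: buffer the incoming records
  }

  // step 2: called once per record of the old base file
  void write(String key, String oldRecord) {
    String newRecord = keyToNewRecords.get(key);
    if (newRecord != null) {
      newBaseFile.add(newRecord); // Flink's default merge: take the new record's columns
      writtenKeys.add(key);
    } else {
      newBaseFile.add(oldRecord); // no update for this key: copy the old record over
    }
  }

  // step 3: flush incoming records that matched no historical record (i.e. inserts)
  List<String> close() {
    for (Map.Entry<String, String> e : keyToNewRecords.entrySet()) {
      if (!writtenKeys.contains(e.getKey())) {
        newBaseFile.add(e.getValue());
      }
    }
    return newBaseFile;
  }
}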
public HoodieMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTable hoodieTable,
    Iterator<HoodieRecord<T>> recordItr, String partitionPath, String fileId,
    TaskContextSupplier taskContextSupplier, HoodieBaseFile baseFile, Option<BaseKeyGenerator> keyGeneratorOpt) {
  super(config, instantTime, partitionPath, fileId, hoodieTable, taskContextSupplier);
  // buffer the incoming records into the ExternalSpillableMap
  init(fileId, recordItr);
  // prepare the fileWriter, paths, write status, etc.
  init(fileId, partitionPath, baseFile);
  validateAndSetAndKeyGenProps(keyGeneratorOpt, config.populateMetaFields());
}
protected void init(String fileId, Iterator<HoodieRecord<T>> newRecordsItr) {
  // initialize keyToNewRecords, a map that can spill to disk
  initializeIncomingRecordsMap();
  while (newRecordsItr.hasNext()) {
    HoodieRecord<T> record = newRecordsItr.next();
    // update the new location of the record, so we know where to find it next
    if (needsUpdateLocation()) {
      record.unseal();
      record.setNewLocation(new HoodieRecordLocation(instantTime, fileId));
      record.seal();
    }
    // NOTE: Once Records are added to map (spillable-map), DO NOT change it as they won't persist
    keyToNewRecords.put(record.getRecordKey(), record);
  }
}
private void init(String fileId, String partitionPath, HoodieBaseFile baseFileToMerge) {
  LOG.info("partitionPath:" + partitionPath + ", fileId to be merged:" + fileId);
  this.baseFileToMerge = baseFileToMerge;
  this.writtenRecordKeys = new HashSet<>();
  writeStatus.setStat(new HoodieWriteStat());
  try {
    // the latest base file (parquet file)
    String latestValidFilePath = baseFileToMerge.getFileName();
    // the previous commit time
    writeStatus.getStat().setPrevCommit(FSUtils.getCommitTime(latestValidFilePath));
    HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, instantTime,
        new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath));
    partitionMetadata.trySave(getPartitionId());
    // name of the new parquet file: fileId_writeToken_instantTime.fileExtension
    String newFileName = FSUtils.makeDataFileName(instantTime, writeToken, fileId, hoodieTable.getBaseFileExtension());
    // build oldFilePath and newFilePath
    makeOldAndNewFilePaths(partitionPath, latestValidFilePath, newFileName);
    LOG.info(String.format("Merging new data into oldPath %s, as newPath %s", oldFilePath.toString(),
        newFilePath.toString()));
    // file name is same for all records, in this bunch
    writeStatus.setFileId(fileId);
    writeStatus.setPartitionPath(partitionPath);
    writeStatus.getStat().setPartitionPath(partitionPath);
    writeStatus.getStat().setFileId(fileId);
    setWriteStatusPath();
    // Create Marker file
    createMarkerFile(partitionPath, newFileName);
    // Create the writer for writing the new version file
    fileWriter = createNewFileWriter(instantTime, newFilePath, hoodieTable, config,
        writeSchemaWithMetaFields, taskContextSupplier);
  } catch (IOException io) {
    LOG.error("Error in update task at commit " + instantTime, io);
    writeStatus.setGlobalError(io);
    throw new HoodieUpsertException("Failed to initialize HoodieUpdateHandle for FileId: " + fileId + " on commit "
        + instantTime + " on path " + hoodieTable.getMetaClient().getBasePath(), io);
  }
}
Record processing flow:
public void write(GenericRecord oldRecord) {
  String key = KeyGenUtils.getRecordKeyFromGenericRecord(oldRecord, keyGeneratorOpt);
  boolean copyOldRecord = true;
  if (keyToNewRecords.containsKey(key)) {
    // the old record matches an incoming record by key
    // If we have duplicate records that we are updating, then the hoodie record will be deflated after
    // writing the first record. So make a copy of the record to be merged
    HoodieRecord<T> hoodieRecord = new HoodieRecord<>(keyToNewRecords.get(key));
    try {
      // for a delete the Option is empty;
      // for flink, the combined result is the new record's avro, i.e. the old record is overwritten
      Option<IndexedRecord> combinedAvroRecord =
          hoodieRecord.getData().combineAndGetUpdateValue(oldRecord,
            useWriterSchema ? tableSchemaWithMetaFields : tableSchema,
            config.getPayloadConfig().getProps());
      if (combinedAvroRecord.isPresent() && combinedAvroRecord.get().equals(IGNORE_RECORD)) {
        // If it is an IGNORE_RECORD, just copy the old record, and do not update the new record.
        copyOldRecord = true;
      } else if (writeUpdateRecord(hoodieRecord, oldRecord, combinedAvroRecord)) {
        // writeUpdateRecord writes the combined record straight into the new parquet file
        /*
         * ONLY WHEN
         * 1) we have an update for this key AND
         * 2) We are able to successfully write the combined new value
         * We no longer need to copy the old record over.
         */
        copyOldRecord = false;
      }
      // remember the record keys that have already been written
      writtenRecordKeys.add(key);
    } catch (Exception e) {
      throw new HoodieUpsertException("Failed to combine/merge new record with old value in storage, for new record {"
          + keyToNewRecords.get(key) + "}, old value {" + oldRecord + "}", e);
    }
  }
  if (copyOldRecord) {
    // this should work as it is, since this is an existing record
    try {
      // the incoming data holds no change for this historical record, so write the oldRecord as-is
      fileWriter.writeAvro(key, oldRecord);
    } catch (IOException | RuntimeException e) {
      String errMsg = String.format("Failed to merge old record into new file for key %s from old file %s to new file %s with writerSchema %s",
          key, getOldFilePath(), newFilePath, writeSchemaWithMetaFields.toString(true));
      LOG.debug("Old record is " + oldRecord);
      throw new HoodieUpsertException(errMsg, e);
    }
    recordsWritten++;
  }
}
On close, the incoming records that were not matched are written out, and the write status is returned.
public List<WriteStatus> close() {
  try {
    // write out the incoming records that did not match any historical record
    writeIncomingRecords();
    if (keyToNewRecords instanceof ExternalSpillableMap) {
      ((ExternalSpillableMap) keyToNewRecords).close();
    } else {
      keyToNewRecords.clear();
    }
    writtenRecordKeys.clear();
    if (fileWriter != null) {
      fileWriter.close();
      fileWriter = null;
    }
    long fileSizeInBytes = FSUtils.getFileSize(fs, newFilePath);
    HoodieWriteStat stat = writeStatus.getStat();
    stat.setTotalWriteBytes(fileSizeInBytes);
    stat.setFileSizeInBytes(fileSizeInBytes);
    stat.setNumWrites(recordsWritten);
    stat.setNumDeletes(recordsDeleted);
    stat.setNumUpdateWrites(updatedRecordsWritten);
    stat.setNumInserts(insertRecordsWritten);
    stat.setTotalWriteErrors(writeStatus.getTotalErrorRecords());
    RuntimeStats runtimeStats = new RuntimeStats();
    runtimeStats.setTotalUpsertTime(timer.endTimer());
    stat.setRuntimeStats(runtimeStats);
    performMergeDataValidationCheck(writeStatus);
    LOG.info(String.format("MergeHandle for partitionPath %s fileID %s, took %d ms.", stat.getPartitionPath(),
        stat.getFileId(), runtimeStats.getTotalUpsertTime()));
    return Collections.singletonList(writeStatus);
  } catch (IOException e) {
    throw new HoodieUpsertException("Failed to close UpdateHandle", e);
  }
}
protected void writeIncomingRecords() throws IOException {
  // write out any pending records (this can happen when inserts are turned into updates)
  Iterator<HoodieRecord<T>> newRecordsItr = (keyToNewRecords instanceof ExternalSpillableMap)
      ? ((ExternalSpillableMap) keyToNewRecords).iterator() : keyToNewRecords.values().iterator();
  while (newRecordsItr.hasNext()) {
    HoodieRecord<T> hoodieRecord = newRecordsItr.next();
    // if a buffered incoming record was never written during the merge, write it now
    if (!writtenRecordKeys.contains(hoodieRecord.getRecordKey())) {
      writeInsertRecord(hoodieRecord);
    }
  }
}