Spark-Parquet Columnar Storage: Source Code Analysis of the Data Write Path

Source Code Analysis of the Data Write Path

The data write process

Constructing a ParquetRecordWriter constructs an InternalParquetRecordWriter:
public ParquetRecordWriter(
...){
 internalWriter = new InternalParquetRecordWriter(w, writeSupport, schema,
        extraMetaData, blockSize, pageSize, compressor, dictionaryPageSize, enableDictionary, validating, writerVersion);
}
Constructing the InternalParquetRecordWriter in turn calls initStore():
public InternalParquetRecordWriter(
    ...) {
  ...
  initStore();
}
Calling ParquetRecordWriter.write delegates to InternalParquetRecordWriter.write:
  public void write(Void key, T value) throws IOException, InterruptedException {
    internalWriter.write(value);
  }
InternalParquetRecordWriter.write is implemented as follows:
  public void write(T value) throws IOException, InterruptedException {
    writeSupport.write(value);
    ++ recordCount;
    checkBlockSizeReached();
  }

The write path therefore breaks down into four stages: the initStore stage, the write-preparation stage, the writeSupport data-writing stage, and the check-and-flush stage.
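For context, this whole path is exercised whenever Spark saves a DataFrame as Parquet. The following is a minimal sketch, assuming the Spark 1.x API that the code in this article comes from (application name and output path are just examples):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

val sc = new SparkContext(new SparkConf().setAppName("parquet-write-demo").setMaster("local[2]"))
val sqlContext = new SQLContext(sc)
import sqlContext.implicits._

// Each row written here eventually flows through ParquetRecordWriter.write ->
// InternalParquetRecordWriter.write -> writeSupport.write.
val df = sc.parallelize(Seq((1, "a"), (2, "b"))).toDF("id", "name")
df.saveAsParquetFile("/tmp/parquet-write-demo")   // example output path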

The initStore stage

 int initialBlockBufferSize = max(MINIMUM_BUFFER_SIZE, rowGroupSize / schema.getColumns().size() / 5);
 int initialPageBufferSize = max(MINIMUM_BUFFER_SIZE, min(pageSize + pageSize / 10, initialBlockBufferSize));
The two lines above compute the initial block buffer size and the initial page buffer size.
pageStore = new ColumnChunkPageWriteStore(compressor, schema, initialBlockBufferSize);
columnStore = new ColumnWriteStoreImpl(pageStore, pageSize, initialPageBufferSize, dictionaryPageSize, enableDictionary, writerVersion);
Next, pageStore (a ColumnChunkPageWriteStore) and columnStore (a ColumnWriteStoreImpl) are created; their inner PageWriter and ColumnWriter classes perform the actual write operations.
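As a rough illustration of the two formulas, here is what they work out to for a 10-column schema with typical defaults (128 MB row group, 1 MB pages; MINIMUM_BUFFER_SIZE is assumed to be 64 KB):

// Illustrative only: plugging typical defaults into the formulas above.
val MINIMUM_BUFFER_SIZE = 64 * 1024        // assumed value of the parquet-mr constant
val rowGroupSize = 128 * 1024 * 1024       // 128 MB row group (block) size
val pageSize = 1024 * 1024                 // 1 MB page size
val numColumns = 10

val initialBlockBufferSize =
  math.max(MINIMUM_BUFFER_SIZE, rowGroupSize / numColumns / 5)
// = max(64 KB, 128 MB / 10 / 5) = 2,684,354 bytes (~2.6 MB)

val initialPageBufferSize =
  math.max(MINIMUM_BUFFER_SIZE, math.min(pageSize + pageSize / 10, initialBlockBufferSize))
// = max(64 KB, min(~1.1 MB, ~2.6 MB)) = 1,153,433 bytes (~1.1 MB)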

The write-preparation stage

    MessageColumnIO columnIO = new ColumnIOFactory(validating).getColumnIO(schema);
    writeSupport.prepareForWrite(columnIO.getRecordWriter(columnStore));

Here a MessageColumnIO is created from the schema, and writeSupport.prepareForWrite is handed a RecordConsumer built on top of the columnStore, preparing for the actual writes.

The validating flag is determined when the ParquetRecordWriter is constructed and defaults to false, as the following code shows:

public RecordWriter<Void, T> getRecordWriter(Configuration conf, Path file, CompressionCodecName codec)
        throws IOException, InterruptedException {
...
 boolean validating = getValidation(conf);
    if (INFO) LOG.info("Validation is " + (validating ? "on" : "off"));
....
return new ParquetRecordWriter(
        w,
        writeSupport,
        init.getSchema(),
        init.getExtraMetaData(),
        blockSize, pageSize,
        codecFactory.getCompressor(codec, pageSize),
        dictionaryPageSize,
        enableDictionary,
        validating,
        writerVersion);
  }
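A minimal sketch of turning validation on, assuming the Hadoop configuration key read by getValidation is ParquetOutputFormat.VALIDATION ("parquet.validation"):

import org.apache.hadoop.conf.Configuration

val conf = new Configuration()
// getValidation(conf) reads this flag when the record writer is created
conf.setBoolean("parquet.validation", true)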

   
getRecordWriter obtains the RecordConsumer (the record consumer) from the ColumnWriteStore:
  public RecordConsumer getRecordWriter(ColumnWriteStore columns) {
    RecordConsumer recordWriter = new MessageColumnIORecordConsumer(columns);
    if (DEBUG) recordWriter = new RecordConsumerLoggingWrapper(recordWriter);
    return validating ? new ValidatingRecordConsumer(recordWriter, getType()) : recordWriter;
  }

The writeSupport data-writing stage

writeSupport.write (Spark's RowWriteSupport.write) is defined as follows:
def write(record: Row): Unit = {
    val attributesSize = attributes.size
    if (attributesSize > record.size) {
      throw new IndexOutOfBoundsException(
        s"Trying to write more fields than contained in row (${attributesSize}>${record.size})")
    }
    var index = 0
    writer.startMessage()
    while(index < attributesSize) {
      // null values indicate optional fields but we do not check currently
      if (record(index) != null && record(index) != Nil) {
        writer.startField(attributes(index).name, index)
        consumeType(attributes(index).dataType, record, index)
        writer.endField(attributes(index).name, index)
      }
      index = index + 1
    }
    writer.endMessage()
  }
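To make the loop concrete, here is the sequence of RecordConsumer calls it would issue for a hypothetical two-column schema {name: String, age: Int} and the row Row("alice", 30) (an illustrative trace, not code to run):

// writer.startMessage()
// writer.startField("name", 0)
// writer.addBinary(Binary.fromByteArray("alice".getBytes("utf-8")))
// writer.endField("name", 0)
// writer.startField("age", 1)
// writer.addInteger(30)
// writer.endField("age", 1)
// writer.endMessage()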

The core of startField is:
public void startField(String field, int index) {
      try {
        if (DEBUG) log("startField(" + field + ", " + index + ")");
        currentColumnIO = ((GroupColumnIO)currentColumnIO).getChild(index);
        emptyField = true;
        if (DEBUG) printState();
      } catch (RuntimeException e) {
        throw new ParquetEncodingException("error starting field " + field + " at " + index, e);
      }
    }
The core of consumeType is shown below. Note that strings are handled as binary, because Parquet has no dedicated string primitive: the UTF-8 bytes of the string are written as a BYTE_ARRAY value.

private def consumeType(
      ctype: DataType,
      record: Row,
      index: Int): Unit = {
    ctype match {
      case StringType => writer.addBinary(
        Binary.fromByteArray(
          record(index).asInstanceOf[String].getBytes("utf-8")
        )
      )
      case BinaryType => writer.addBinary(
        Binary.fromByteArray(record(index).asInstanceOf[Array[Byte]]))
      case IntegerType => writer.addInteger(record.getInt(index))
      case ShortType => writer.addInteger(record.getShort(index))
      case LongType => writer.addLong(record.getLong(index))
      case ByteType => writer.addInteger(record.getByte(index))
      case DoubleType => writer.addDouble(record.getDouble(index))
      case FloatType => writer.addFloat(record.getFloat(index))
      case BooleanType => writer.addBoolean(record.getBoolean(index))
      case d: DecimalType =>
        if (d.precisionInfo == None || d.precisionInfo.get.precision > 18) {
          sys.error(s"Unsupported datatype $d, cannot write to consumer")
        }
        writeDecimal(record(index).asInstanceOf[Decimal], d.precisionInfo.get.precision)
      case _ => sys.error(s"Unsupported datatype $ctype, cannot write to consumer")
    }
  }
addInteger is defined as follows:
@Override
    public void addInteger(int value) {
      if (DEBUG) log("addInt(" + value + ")");
      emptyField = false;
      getColumnWriter().write(value, r[currentLevel], currentColumnIO.getDefinitionLevel());

      setRepetitionLevel();
      if (DEBUG) printState();
    }
ColumnWriterImpl.write is defined as follows; the actual value is written by dataColumn, while the repetition and definition levels go to their own level columns:
  public void write(int value, int repetitionLevel, int definitionLevel) {
    if (DEBUG) log(value, repetitionLevel, definitionLevel);
    repetitionLevelColumn.writeInteger(repetitionLevel);
    definitionLevelColumn.writeInteger(definitionLevel);
    dataColumn.writeInteger(value);
    updateStatistics(value);
    accountForValueWritten();
  }

The core of endField is:
    public void endField(String field, int index) {
      if (DEBUG) log("endField(" + field + ", " + index + ")");
      currentColumnIO = currentColumnIO.getParent();
      if (emptyField) {
        throw new ParquetEncodingException("empty fields are illegal, the field should be ommited completely instead");
      }
      fieldsWritten[currentLevel].markWritten(index);
      r[currentLevel] = currentLevel == 0 ? 0 : r[currentLevel - 1];
      if (DEBUG) printState();
    }

If the write sequence follows the pseudocode below:
 startField("A", 0)
 addValue(1)
 addValue(2)
 endField("A", 0)
 startField("B", 1)
 startGroup()
 startField("C", 0)
 addValue(3)
 endField("C", 0)
 endGroup()
 endField("B", 1)

then the resulting record has the following shape:
{
     A:[1,2]
     B:{C:3}
}
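For reference, a message schema that could drive this call sequence might look like the following (a sketch with assumed field names; MessageTypeParser is from parquet-mr, package parquet.schema in the 1.6.x line):

import parquet.schema.MessageTypeParser

val schema = MessageTypeParser.parseMessageType(
  """message example {
    |  repeated int32 A;
    |  optional group B {
    |    required int32 C;
    |  }
    |}""".stripMargin)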



The check-and-flush stage

checkBlockSizeReached is defined as:
private void checkBlockSizeReached() throws IOException {
    if (recordCount >= recordCountForNextMemCheck) { // checking the memory size is relatively expensive, so let's not do it for every record.
      long memSize = columnStore.memSize();
      if (memSize > rowGroupSize) {
        LOG.info(format("mem size %,d > %,d: flushing %,d records to disk.", memSize, rowGroupSize, recordCount));
        flushRowGroupToStore();
        initStore();
        recordCountForNextMemCheck = min(max(MINIMUM_RECORD_COUNT_FOR_CHECK, recordCount / 2), MAXIMUM_RECORD_COUNT_FOR_CHECK);
      } else {
        float recordSize = (float) memSize / recordCount;
        recordCountForNextMemCheck = min(
            max(MINIMUM_RECORD_COUNT_FOR_CHECK, (recordCount + (long)(rowGroupSize / recordSize)) / 2), // will check halfway
            recordCount + MAXIMUM_RECORD_COUNT_FOR_CHECK // will not look more than max records ahead
            );
        if (DEBUG) LOG.debug(format("Checked mem at %,d will check again at: %,d ", recordCount, recordCountForNextMemCheck));
      }
    }
  }
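To see the scheduling in action, suppose the first check happens after 100 records that occupy about 1 MB in memory, with a 128 MB row group (constants assumed: MINIMUM_RECORD_COUNT_FOR_CHECK = 100, MAXIMUM_RECORD_COUNT_FOR_CHECK = 10000):

val MINIMUM_RECORD_COUNT_FOR_CHECK = 100L     // assumed parquet-mr constant
val MAXIMUM_RECORD_COUNT_FOR_CHECK = 10000L   // assumed parquet-mr constant
val rowGroupSize = 128L * 1024 * 1024

val recordCount = 100L
val memSize = 1L * 1024 * 1024                // measured columnStore.memSize()
val recordSize = memSize.toFloat / recordCount

// memSize < rowGroupSize, so the next check is scheduled roughly halfway to the
// projected row-group boundary, but never more than MAXIMUM_RECORD_COUNT_FOR_CHECK ahead:
val recordCountForNextMemCheck = math.min(
  math.max(MINIMUM_RECORD_COUNT_FOR_CHECK, (recordCount + (rowGroupSize / recordSize).toLong) / 2),
  recordCount + MAXIMUM_RECORD_COUNT_FOR_CHECK)
// (100 + 12800) / 2 = 6450 -> memory is checked again at record 6450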


flushRowGroupToStore is defined below. startBlock fetches the block (row group) metadata and records the recordCount of the current block:
private void flushRowGroupToStore()
      throws IOException {
    LOG.info(format("Flushing mem columnStore to file. allocated memory: %,d", columnStore.allocatedSize()));
    if (columnStore.allocatedSize() > 3 * (long)rowGroupSize) {
      LOG.warn("Too much memory used: " + columnStore.memUsageString());
    }
    if (recordCount > 0) {
      parquetFileWriter.startBlock(recordCount);
      columnStore.flush();
      pageStore.flushToFileWriter(parquetFileWriter);
      recordCount = 0;
      parquetFileWriter.endBlock();
    }
    columnStore = null;
    pageStore = null;
  }
ColumnWriteStoreImpl.flush is simply a flush of every ColumnWriterImpl:
public void flush() {
    Collection<ColumnWriterImpl> values = columns.values();
    for (ColumnWriterImpl memColumn : values) {
      memColumn.flush();
    }
  }
ColumnWriterImpl.flush is implemented as:
  public void flush() {
    if (valueCount > 0) {
      writePage();
    }
    final DictionaryPage dictionaryPage = dataColumn.createDictionaryPage();
    if (dictionaryPage != null) {
      if (DEBUG) LOG.debug("write dictionary");
      try {
        pageWriter.writeDictionaryPage(dictionaryPage);
      } catch (IOException e) {
        throw new ParquetEncodingException("could not write dictionary page for " + path, e);
      }
      dataColumn.resetDictionary();
    }
  }
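Whether a dictionary page is written at all depends on the enableDictionary flag passed down from the output format. A minimal sketch of toggling it, assuming the configuration key read by ParquetOutputFormat is "parquet.enable.dictionary":

import org.apache.hadoop.conf.Configuration

val conf = new Configuration()
conf.setBoolean("parquet.enable.dictionary", true)   // assumed key; surfaces here as enableDictionary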

writePage converts everything accumulated for the column into bytes and hands them to the page writer:
  private void writePage() {
    if (DEBUG) LOG.debug("write page");
    try {
      pageWriter.writePage(
          concat(repetitionLevelColumn.getBytes(), definitionLevelColumn.getBytes(), dataColumn.getBytes()),
          valueCount,
          statistics,
          repetitionLevelColumn.getEncoding(),
          definitionLevelColumn.getEncoding(),
          dataColumn.getEncoding());
    } catch (IOException e) {
      throw new ParquetEncodingException("could not write page for " + path, e);
    }
    repetitionLevelColumn.reset();
    definitionLevelColumn.reset();
    dataColumn.reset();
    valueCount = 0;
    resetStatistics();
  }
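Conceptually, each writePage call emits one data page into the current column chunk: the page body is the byte-wise concatenation of the three streams shown above, while the value count, statistics, and encodings go into the page header (a simplified view):

// [ page header: value_count, statistics, RL/DL/data encodings, sizes ]
// [ repetition-level bytes ][ definition-level bytes ][ encoded values ]
// (the compressor configured on the ColumnChunkPageWriteStore is applied
//  before the page bytes are stored)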




