Flink 1.10: A FileSystem SQL Connector Based on BucketingSink

Flink 1.10 SQL ships with a FileSystem SQL connector. The syntax is as follows:

CREATE TABLE MyUserTable (
  ...
) WITH (
  'connector.type' = 'filesystem',                -- required: specify the connector type
  'connector.path' = 'file:///path/to/whatever',  -- required: path to a file or directory
  'format.type' = '...',                          -- required: the file system connector requires a format;
  ...                                             -- currently only the 'csv' format is supported.
                                                  -- Please refer to the old CSV format part of the Table Formats section for more details.
)  

However, the connector provided out of the box has several limitations:

  • No way to customize the data partitioning
  • No option to overwrite existing files (when writing to the same path a second time, the previously written files must be deleted first)
  • No file rolling policy

BucketingSink

BucketingSink is the FileSystem connector that Flink ships with. It supports every file system that Hadoop's FileSystem abstraction supports and offers several file rolling policies, but it has no SQL version, so below I build a SQL-capable connector on top of BucketingSink.
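
For orientation, here is a minimal sketch of how BucketingSink is normally used from the DataStream API. The path, bucket format, and thresholds are placeholder values for illustration, not settings taken from this article:

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.connectors.fs.bucketing.BucketingSink;
import org.apache.flink.streaming.connectors.fs.bucketing.DateTimeBucketer;

public class BucketingSinkSketch {

  // Attaches a BucketingSink to an existing DataStream<String>.
  public static void attach(DataStream<String> stream) {
    BucketingSink<String> sink = new BucketingSink<>("hdfs://namenode/flink/output"); // placeholder path
    sink.setBucketer(new DateTimeBucketer<>("yyyyMMddHH"));  // time-based bucket directories
    sink.setBatchSize(1024L * 1024 * 128);                   // roll the part file at 128 MB
    sink.setBatchRolloverInterval(10 * 60 * 1000L);          // or after 10 minutes
    sink.setInactiveBucketCheckInterval(60 * 1000L);         // check for inactive buckets every minute
    sink.setInactiveBucketThreshold(5 * 60 * 1000L);         // close buckets idle for 5 minutes
    stream.addSink(sink);
  }
}

The SQL connector below simply exposes these same knobs through the table's WITH properties.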

TableFactory

Anyone who has read the Flink source code will know the TableFactory mechanism. It is an SPI that lets Flink be extended with external data sources. When implementing a custom TableSource or TableSink, you add an SPI service file on the classpath, META-INF/services/org.apache.flink.table.factories.TableFactory, whose content is the fully-qualified name of your TableFactory implementation; in my case that is com.szc.streaming.connectors.hdfs26.StreamingHDFS26TableSinkFactory. After Flink parses the SQL, it turns the table's schema and WITH properties into a Map, scans all TableFactory implementations in the jars on the classpath, and matches each one against the key/value pairs of the WITH properties. If you are interested, have a look at the relevant source: module flink-table-common, class org.apache.flink.table.factories.TableFactoryService.
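
To make that matching step concrete: for the hdfs26 table shown at the end of this article, the key/value map that reaches the factory looks roughly like this (an illustrative sketch, not a verbatim dump):

connector.type=hdfs26
connector.path=hdfs://xxx/flink/xxxx
update-mode=append
bucket.assigner.field=birth_day
bucket.assigner.type=date
...
schema.0.name=id
schema.0.data-type=BIGINT
schema.1.name=birth_day
schema.1.data-type=BIGINT
schema.2.name=name
schema.2.data-type=VARCHAR

requiredContext() must match connector.type (and the property version), and every other key has to be covered by supportedProperties(), which is why the schema keys below are declared with the '#' index wildcard.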
Now for my own TableFactory implementation. First, a look at the code structure (the package name and some company-related class names are blurred because they contain company information):

(image: code structure)

  • StreamingHDFS26TableSinkFactory.java - the TableFactory implementation; validates the connector and instantiates the sink
public class StreamingHDFS26TableSinkFactory implements StreamTableSinkFactory<Row> {

  @Override
  public Map<String, String> requiredContext() {
    Map<String, String> context = new HashMap<>();
    context.put(CONNECTOR_TYPE, CONNECTOR_HDFS_TYPE_VALUE);
    context.put(CONNECTOR_PROPERTY_VERSION, "1");
    return context;
  }

  @Override
  public List<String> supportedProperties() {
    List<String> properties = new ArrayList<>();
    // update mode
    properties.add(UPDATE_MODE);
    // hdfs
    properties.add(CONNECTOR_PATH);
    properties.add(ROLLING_POLICY_PARTSIZE);
    properties.add(ROLLING_POLICY_ROLL_OVER_INTERVAL);
    properties.add(ROLLING_POLICY_INACTIVITY_INTERVAL);
    properties.add(ROLLING_POLICY_INACTIVITY_THRESHOLD);
    properties.add(BUCKET_ASSIGNER_FIELD);
    properties.add(BUCKET_ASSIGNER_TYPE);
    properties.add(BUCKET_ASSIGNER_FIELD_FORMATTER);
    properties.add(BUCKET_ASSIGNER_OUTPUT_FORMATTER);

    // schema
    properties.add(SCHEMA + ".#." + SCHEMA_DATA_TYPE);
    properties.add(SCHEMA + ".#." + SCHEMA_TYPE);
    properties.add(SCHEMA + ".#." + SCHEMA_NAME);
    properties.add(SCHEMA + ".#." + SCHEMA_FROM);

    return properties;
  }

  /**
   * Creates the TableSink instance.
   * @param properties the table's schema and WITH options as key/value pairs
   * @return the configured FileSystemSink
   */
  @Override
  public StreamTableSink<Row> createStreamTableSink(Map<String, String> properties) {
    // Flink SQL ultimately turns the table definition into a key/value Map
    final DescriptorProperties descriptorProperties = getValidatedProperties(properties);

    // builder for FileSystemSink
    FileSystemBuilder builder = FileSystemSink.builder();
    // schema
    final TableSchema schema = TableSchemaUtils.getPhysicalSchema(descriptorProperties.getTableSchema(SCHEMA));
    builder.setSchema(schema);

    // bucket assigner: check bucket.assigner.field to see whether a custom partition field is configured
    final String assignerField = descriptorProperties.getOptionalString(BUCKET_ASSIGNER_FIELD).orElse(null);
    if (StringUtils.isNotEmpty(assignerField)) {
      final String assignerFieldType = descriptorProperties.getString(BUCKET_ASSIGNER_TYPE);
      builder.setAssignerField(assignerField);
      builder.setAssignerFieldType(assignerFieldType);
      if (BUCKET_ASSIGNER_FIELD_TYPE_DATE_VALUE.equalsIgnoreCase(assignerFieldType)) {
        builder.setInputFormatter(descriptorProperties.getString(BUCKET_ASSIGNER_FIELD_FORMATTER));
        builder.setOutputFormatter(descriptorProperties.getString(BUCKET_ASSIGNER_OUTPUT_FORMATTER));
      }
    }

    // rolling policy configuration
    if (descriptorProperties.containsKey(ROLLING_POLICY_PARTSIZE)) {
      builder.setMaxPartSize(descriptorProperties.getMemorySize(ROLLING_POLICY_PARTSIZE).getBytes());
    }
    if (descriptorProperties.containsKey(ROLLING_POLICY_ROLL_OVER_INTERVAL)) {
      builder.setRollOverInterval(descriptorProperties.getDuration(ROLLING_POLICY_ROLL_OVER_INTERVAL).toMillis());
    }
    if (descriptorProperties.containsKey(ROLLING_POLICY_INACTIVITY_INTERVAL)) {
      builder.setInactivityInterval(descriptorProperties.getDuration(ROLLING_POLICY_INACTIVITY_INTERVAL).toMillis());
    }
    if (descriptorProperties.containsKey(ROLLING_POLICY_INACTIVITY_THRESHOLD)) {
      builder.setInactivityThreshold(descriptorProperties.getDuration(ROLLING_POLICY_INACTIVITY_THRESHOLD).toMillis());
    }

    builder.setOutputPath(descriptorProperties.getString(CONNECTOR_PATH));
    return builder.build();
  }

  private DescriptorProperties getValidatedProperties(Map<String, String> properties) {
    final DescriptorProperties descriptorProperties = new DescriptorProperties(true);
    descriptorProperties.putProperties(properties);

    // streaming environment; source timestamps and watermarks are not supported
    new SchemaValidator(true, false, false).validate(descriptorProperties);
    new HDFSValidator().validate(descriptorProperties);
    return descriptorProperties;
  }
}
  • HDFSValidator.java - validates the WITH properties
public class HDFSValidator extends ConnectorDescriptorValidator {

  public static final String CONNECTOR_HDFS_TYPE_VALUE = "hdfs26";
  public static final String CONNECTOR_PATH = "connector.path";

  /**
   * Rolling policy options.
   */
  public static final String ROLLING_POLICY_PARTSIZE = "rolling.policy.part.size";
  // maximum lifetime of an open bucket; the bucket is closed after this interval even if part.size has not been reached
  public static final String ROLLING_POLICY_ROLL_OVER_INTERVAL = "rolling.policy.over.interval";
  // how often to check for inactive buckets
  public static final String ROLLING_POLICY_INACTIVITY_INTERVAL = "rolling.policy.inactivity.interval";
  // how long a bucket may stay inactive before it is closed
  public static final String ROLLING_POLICY_INACTIVITY_THRESHOLD = "rolling.policy.inactivity.threshold";

  /**
   * Uses one field of the data as the bucket assigner, i.e. the partition directory the data is written into.
   */
  public static final String BUCKET_ASSIGNER_FIELD = "bucket.assigner.field";
  // "date" partitions by time; "string" partitions by the raw value of a field in the data
  public static final String BUCKET_ASSIGNER_TYPE = "bucket.assigner.type";
  public static final String BUCKET_ASSIGNER_FIELD_TYPE_DATE_VALUE = "date";
  public static final String BUCKET_ASSIGNER_FIELD_TYPE_STRING_VALUE = "string";
  // input time format: use BIGINT if the field is already a long (epoch millis), otherwise a regular date pattern
  public static final String BUCKET_ASSIGNER_FIELD_FORMATTER = "bucket.assigner.field.formatter";
  public static final String BUCKET_ASSIGNER_OUTPUT_FORMATTER = "bucket.assigner.output.formatter";

  @Override
  public void validate(DescriptorProperties properties) {
    super.validate(properties);

    properties.validateEnumValues(UPDATE_MODE, true, Collections.singletonList(UPDATE_MODE_VALUE_APPEND));
    properties.validateValue(CONNECTOR_TYPE, CONNECTOR_HDFS_TYPE_VALUE, false);
    properties.validateString(CONNECTOR_PATH, false, 1);

    validateBucket(properties);
  }

  private void validateBucket(DescriptorProperties properties) {
    properties.validateEnumValues(UPDATE_MODE, true, Collections.singletonList(UPDATE_MODE_VALUE_APPEND));

    final Map<String, Consumer<String>> assignerFieldTypeValidation = new HashMap<>();
    assignerFieldTypeValidation.put(BUCKET_ASSIGNER_FIELD_TYPE_DATE_VALUE, noValidation());
    assignerFieldTypeValidation.put(BUCKET_ASSIGNER_FIELD_TYPE_STRING_VALUE, noValidation());
    properties.validateEnum(BUCKET_ASSIGNER_TYPE, true, assignerFieldTypeValidation);

    properties.validateString(ROLLING_POLICY_PARTSIZE, true, 2);
    properties.validateString(ROLLING_POLICY_ROLL_OVER_INTERVAL, true, 2);
    properties.validateString(ROLLING_POLICY_INACTIVITY_INTERVAL, true, 2);
    properties.validateString(CONNECTOR_PATH, false, 1);
  }

}
  • FileSystemSink.java - the main sink logic; defines how records are emitted to the external system
public class FileSystemSink implements AppendStreamTableSink<Row> {

  public static final Long DEFAULT_BUCKET_CHECK_INTERVAL = 60 * 1000L;

  private TableSchema schema;

  // partition field
  private String assignerField;
  // whether the partition field is a time type ("date") or a plain string ("string")
  private String assignerFieldType;
  // input time format; must be set when the assigner type is "date"
  private String inputFormatter;
  // output format of the partition directory when the assigner type is "date"
  private String outputFormatter;
  // output base directory
  private String outputPath;
  // rolling policy: maximum part file size, in bytes
  private Long maxPartSize;
  // inactivity check interval, in milliseconds
  private Long inactivityInterval;
  // inactivity threshold, in milliseconds
  private Long inactivityThreshold;
  // rollover interval, in milliseconds
  private Long rollOverInterval;

  private FileSystemSink(
      TableSchema schema,
      String assignerField,
      String assignerFieldType,
      String inputFormatter,
      String outputFormatter,
      String outputPath,
      Long maxPartSize,
      Long inactivityInterval,
      Long rollOverInterval,
      Long inactivityThreshold) {
    this.schema = schema;
    this.assignerField = assignerField;
    this.assignerFieldType = assignerFieldType;
    this.inputFormatter = inputFormatter;
    this.outputFormatter = outputFormatter;
    this.outputPath = outputPath;
    this.maxPartSize = maxPartSize;
    this.inactivityInterval = inactivityInterval;
    this.rollOverInterval = rollOverInterval;
    this.inactivityThreshold = inactivityThreshold;
  }

  @Override
  public void emitDataStream(DataStream<Row> dataStream) {
    // deprecated in Flink 1.10; consumeDataStream below is used instead
  }

  @Override
  public DataStreamSink<?> consumeDataStream(DataStream<Row> dataStream) {

    BucketingSink<HDFSDomain> sink = new BucketingSink<>(outputPath);
    // rolling policy
    if (maxPartSize != null) {
      sink.setBatchSize(maxPartSize);
    }
    if (inactivityInterval != null) {
      sink.setInactiveBucketCheckInterval(inactivityInterval);
    }
    if (inactivityThreshold != null) {
      sink.setInactiveBucketThreshold(inactivityThreshold);
    }
    if (rollOverInterval != null) {
      sink.setBatchRolloverInterval(rollOverInterval);
    }

    Bucketer<HDFSDomain> bucketer;
    int fieldIndex = -1;
    // bucket assigner: if no field is configured, fall back to the time-based DateTimeBucketer
    if (StringUtils.isEmpty(assignerField)) {
      bucketer = StringUtils.isEmpty(outputFormatter)
          ? new DateTimeBucketer<>()
          : new DateTimeBucketer<>(outputFormatter);
    } else {
      bucketer = new CustomBucketer();
      String[] fieldNames = schema.getFieldNames();
      for (int i = 0; i < fieldNames.length; i++) {
        if (assignerField.equals(fieldNames[i])) {
          // index of the partition field within the row
          fieldIndex = i;
          break;
        }
      }
    }
    sink.setBucketer(bucketer);
    // convert each Row into an HDFSDomain before handing it to the BucketingSink
    return dataStream
        .map(new HDFSStringMapFunction(fieldIndex, assignerFieldType, inputFormatter, outputFormatter))
        .addSink(sink)
        .setParallelism(dataStream.getParallelism())
        .name(TableConnectorUtils.generateRuntimeName(this.getClass(), schema.getFieldNames()));
  }

  @Override
  public DataType getConsumedDataType() {
    return schema.toRowDataType();
  }

  @Override
  public TableSchema getTableSchema() {
    return schema;
  }

  @Override
  public TableSink<Row> configure(String[] fieldNames, TypeInformation<?>[] fieldTypes) {
    // the schema is fixed by the DDL, so return this sink unchanged
    return this;
  }

  public static FileSystemBuilder builder() {
    return new FileSystemBuilder();
  }

  public static final class FileSystemBuilder {
    private TableSchema schema;
    private String assignerField;
    private String assignerFieldType;
    private String inputFormatter;
    private String outputFormatter;
    private String outputPath;
    private Long maxPartSize;
    private Long inactivityInterval;
    private Long inactivityThreshold;
    private Long rollOverInterval;

    private FileSystemBuilder() {}

    public FileSystemBuilder setSchema(TableSchema schema) {
      this.schema = schema;
      return this;
    }

    public FileSystemBuilder setAssignerField(String assignerField) {
      this.assignerField = assignerField;
      return this;
    }

    public FileSystemBuilder setAssignerFieldType(String assignerFieldType) {
      this.assignerFieldType = assignerFieldType;
      return this;
    }

    public FileSystemBuilder setInputFormatter(String inputFormatter) {
      this.inputFormatter = inputFormatter;
      return this;
    }

    public FileSystemBuilder setOutputFormatter(String outputFormatter) {
      this.outputFormatter = outputFormatter;
      return this;
    }

    public FileSystemBuilder setOutputPath(String outputPath) {
      this.outputPath = outputPath;
      return this;
    }

    public FileSystemBuilder setMaxPartSize(Long maxPartSize) {
      this.maxPartSize = maxPartSize;
      return this;
    }

    public FileSystemBuilder setInactivityInterval(Long inactivityInterval) {
      this.inactivityInterval = inactivityInterval;
      return this;
    }

    public FileSystemBuilder setRollOverInterval(Long rollOverInterval) {
      this.rollOverInterval = rollOverInterval;
      return this;
    }

    public FileSystemBuilder setInactivityThreshold(Long inactivityThreshold) {
      this.inactivityThreshold = inactivityThreshold;
      return this;
    }

    public FileSystemSink build() {
      return new FileSystemSink(
          schema,
          assignerField,
          assignerFieldType,
          inputFormatter,
          outputFormatter,
          outputPath,
          maxPartSize,
          inactivityInterval,
          rollOverInterval,
          inactivityThreshold);
    }
  }
}
  • HDFSStringMapFunction.java - converts each Row into an HDFSDomain
public class HDFSStringMapFunction extends RichMapFunction<Row, HDFSDomain> {

  private int assignerIndex;

  private String assignerFieldType;

  /**
   * Input time format; must be set when the assigner type is "date".
   */
  private String inputFormatter;

  /**
   * Output format of the partition directory.
   */
  private String outputFormatter;

  public HDFSStringMapFunction(int assignerIndex, String assignerFieldType, String inputFormatter, String outputFormatter) {
    this.assignerIndex = assignerIndex;
    this.assignerFieldType = assignerFieldType;
    this.inputFormatter = inputFormatter;
    this.outputFormatter = outputFormatter;
  }

  @Override
  public HDFSDomain map(Row row) throws Exception {
    String partition;
    if (assignerIndex >= 0) {
      Object dateObj = row.getField(assignerIndex);
      String assignerValue = null;
      if (dateObj != null) {
        assignerValue = dateObj.toString();
      }
      if (assignerFieldType.equalsIgnoreCase(HDFSValidator.BUCKET_ASSIGNER_FIELD_TYPE_DATE_VALUE)) {
        long millis;
        // the field already holds epoch milliseconds, no parsing needed
        if (inputFormatter.equalsIgnoreCase("bigint")) {
          millis = StringUtils.isEmpty(assignerValue) ? System.currentTimeMillis() : Long.valueOf(assignerValue);
        } else {
          Date date = StringUtils.isEmpty(assignerValue) ? new Date() : DateUtils.parseDate(assignerValue, inputFormatter);
          millis = date.getTime();
        }
        partition = DateFormatUtils.format(millis, outputFormatter);
      } else {
        partition = StringUtils.isEmpty(assignerValue) ? "undefined" : assignerValue;
      }
    } else {
      partition = "default";
    }

    // concatenate all fields with tab separators
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < row.getArity(); i++) {
      Object o = Objects.isNull(row.getField(i)) ? "" : row.getField(i);
      sb.append(o).append("\t");
    }
    sb.deleteCharAt(sb.length() - 1);
    return new HDFSDomain(partition, sb.toString());
  }
}
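
To make the transformation concrete, here is a small hypothetical illustration; the input values are invented for this sketch, and HDFSDomain is the output class shown next:

import org.apache.flink.types.Row;

public class HDFSStringMapFunctionDemo {
  public static void main(String[] args) throws Exception {
    // hypothetical input row: id, birth_day as epoch millis, name
    Row row = Row.of(1L, 1577836800000L, "abc");
    // assignerIndex = 1 (birth_day), type "date", input already a long, output pattern yyyyMMdd
    HDFSStringMapFunction fn = new HDFSStringMapFunction(1, "date", "bigint", "yyyyMMdd");
    HDFSDomain out = fn.map(row);
    System.out.println(out.getBucketId()); // "20200101" in time zones at or east of UTC
    System.out.println(out.getValue());    // "1\t1577836800000\tabc" (tab-separated fields)
  }
}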

  • HDFSDomain.java - the output record; BucketingSink uses StringWriter by default, so toString must be implemented
public class HDFSDomain {

  private String bucketId;

  private String value;

  public HDFSDomain(String bucketId, String value) {
    this.bucketId = bucketId;
    this.value = value;
  }

  public String getBucketId() {
    return bucketId;
  }

  public void setBucketId(String bucketId) {
    this.bucketId = bucketId;
  }

  public String getValue() {
    return value;
  }

  public void setValue(String value) {
    this.value = value;
  }

  @Override
  public String toString() {
    return value;
  }
}

  • CustomBucketer.java - determines the directory the data is written into
public class CustomBucketer implements Bucketer<HDFSDomain> {

  @Override
  public Path getBucketPath(Clock clock, Path basePath, HDFSDomain element) {
    // the bucketId computed in HDFSStringMapFunction becomes the sub-directory under the base path
    return new Path(basePath + "/" + element.getBucketId());
  }
}

How to Use It

That is the complete implementation of the BucketingSink-based SQL connector. It is used as follows:

CREATE TABLE binlog_locker_pay_detail_m(
 id bigint,
 birth_day bigint,
 name varchar
) WITH (
  'connector.type'='hdfs26',
  'connector.path' = 'hdfs://xxx/flink/xxxx',
  'update-mode' = 'append',
  -- optional: bucket by a field of the data; if unset, the default bucketing strategy is used
  'bucket.assigner.field' = 'birth_day',
  -- required when bucket.assigner.field is set. Allowed values: 1. date: the field is a time attribute; 2. string: the field is used as a plain string.
  'bucket.assigner.type' = 'date',
  -- original time format of the bucketing field. Required when bucket.assigner.type is date. Allowed values: 1. bigint: the field is already a long (epoch millis) and needs no parsing; 2. a standard date pattern such as yyyy-MM-dd HH:mm:ss
  'bucket.assigner.field.formatter' = 'bigint',
  -- time format of the bucket directory. Required when bucket.assigner.type is date. A standard date pattern such as yyyyMMdd
  'bucket.assigner.output.formatter' = 'yyyyMMdd',
  -- rolling policy: roll at 200 MB; see MemoryUnit for the supported units
  'rolling.policy.part.size' = '200m',
  -- file rollover interval; see TimeUtils for the supported syntax
  'rolling.policy.over.interval' = '5min',
  -- how often to check for inactive buckets
  'rolling.policy.inactivity.interval' = '1min',
  -- threshold after which an inactive bucket is closed
  'rolling.policy.inactivity.threshold' = '5min'
);
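
For completeness, here is a sketch of how this DDL might be wired into a Flink 1.10 job. The source table some_source_table and the job wiring are assumptions for this example, not part of the connector:

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.java.StreamTableEnvironment;

public class Hdfs26SinkJob {
  public static void main(String[] args) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    EnvironmentSettings settings = EnvironmentSettings.newInstance()
        .useBlinkPlanner()
        .inStreamingMode()
        .build();
    StreamTableEnvironment tEnv = StreamTableEnvironment.create(env, settings);

    // register the sink table with the DDL shown above (shortened to the required options)
    tEnv.sqlUpdate("CREATE TABLE binlog_locker_pay_detail_m (\n"
        + "  id bigint,\n"
        + "  birth_day bigint,\n"
        + "  name varchar\n"
        + ") WITH (\n"
        + "  'connector.type' = 'hdfs26',\n"
        + "  'connector.path' = 'hdfs://xxx/flink/xxxx',\n"
        + "  'update-mode' = 'append',\n"
        + "  'bucket.assigner.field' = 'birth_day',\n"
        + "  'bucket.assigner.type' = 'date',\n"
        + "  'bucket.assigner.field.formatter' = 'bigint',\n"
        + "  'bucket.assigner.output.formatter' = 'yyyyMMdd',\n"
        + "  'rolling.policy.part.size' = '200m'\n"
        + ")");

    // some_source_table is assumed to be registered elsewhere (e.g. a Kafka source table)
    tEnv.sqlUpdate("INSERT INTO binlog_locker_pay_detail_m "
        + "SELECT id, birth_day, name FROM some_source_table");

    tEnv.execute("hdfs26-sink-demo");
  }
}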

That's all.
