Flink writing to HDFS (text, Parquet, Parquet + Snappy)

Flink version: 1.10.0

code:

    // Build the streaming execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.enableCheckpointing(1000 * 60) // checkpoint every 60 s (the interval is in milliseconds)
    env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)
    env.setStateBackend(new RocksDBStateBackend(checkpointPath, true))
    env.getCheckpointConfig.enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION) // keep checkpoints when the job is cancelled
    

    // On failure, restart the job up to 5 times with a 50 s delay between attempts
    env.setRestartStrategy(RestartStrategies.fixedDelayRestart(5, 50000))
    // Kafka consumer properties
    val properties = new Properties()
    properties.setProperty("bootstrap.servers", TEST_BROKERS)
    properties.setProperty("group.id", "toHdfs-Parquet-group")
    val kafkaConsumer = new FlinkKafkaConsumer[String](LAUNCHS_TOPIC, new SimpleStringSchema(), properties)
    val source = env.addSource(kafkaConsumer)
    // Parse the JSON payload into a LogSchema record
    val kafkaSource = source.flatMap(new RichFlatMapFunction[String, LogSchema] {
      var gson: Gson = _
      override def open(parameters: Configuration): Unit = {
        gson = new Gson
      }
      override def flatMap(value: String, out: Collector[LogSchema]): Unit = {
        val record = gson.fromJson(value, classOf[LogSchema])
        out.collect(record)
      }
    })

    // Part file naming
    val config = OutputFileConfig.builder()
      .withPartPrefix("prefix")
      .withPartSuffix(".txt")
      .build()
    // Bucket assigner: one bucket per hour, in Asia/Shanghai time
    val assigner = new DateTimeBucketAssigner[LogSchema]("yyyy-MM-dd--HH", ZoneId.of("Asia/Shanghai"))

    // Text (row-format) sink
    val sinkRow = StreamingFileSink
      .forRowFormat(new Path(outputPath), new SimpleStringEncoder[LogSchema]("UTF-8"))
      //      .withBucketAssigner(new MyBucketAssigner())
      .withRollingPolicy(
        DefaultRollingPolicy.builder()
          .withRolloverInterval(TimeUnit.MINUTES.toMillis(10)) // roll a new part file every 10 min
          .withInactivityInterval(TimeUnit.MINUTES.toMillis(5)) // roll after 5 min without incoming data
          .withMaxPartSize(1024 * 1024 * 1024) // roll when the part file reaches 1 GB
          .build())
      .withOutputFileConfig(config) // apply the "prefix"/".txt" part file naming defined above
      .build()

    // Parquet (bulk-format) sink
    val sinkClo = StreamingFileSink
      .forBulkFormat(
        new org.apache.flink.core.fs.Path(outputPath),
        // the cast helps Scala's type inference; ParquetWriterFactory implements BulkWriter.Factory
        ParquetAvroWriters.forReflectRecord(classOf[LogSchema]).asInstanceOf[BulkWriter.Factory[LogSchema]]
      )
      .withBucketAssigner(assigner)
      .build()
    // Parquet + Snappy compression sink
    val sinkCloCom = StreamingFileSink
      .forBulkFormat(
        new org.apache.flink.core.fs.Path(outputPathParquetSnappy),
        PaulParquetAvroWriters.forReflectRecord(classOf[LogSchema], CompressionCodecName.SNAPPY)
      )
      .withBucketAssigner(assigner)
      .build()

    // Swap in whichever of the three sinks above matches the desired output format
    kafkaSource.addSink(sinkCloCom)
    env.execute("KafkaLogToHdfs")

Of the three sinks above, the plain-text (row-format) sink does not suffer from the small-file problem, because it can roll files according to the rolling policy configured above. The Parquet (bulk-format) sinks, however, roll a new part file on every checkpoint for every parallel subtask, so they produce many small files. You can mitigate this by increasing the checkpoint interval or reducing the sink parallelism, but under heavy load both options hurt performance and can lead to backpressure. The recommended approach is therefore to write to HDFS as-is and run a separate Spark job afterwards that coalesces the small files.
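As a rough sketch of such a compaction job (the paths, the `CompactParquet` object name, and the coalesce factor below are illustrative assumptions, not part of the original pipeline), a Spark batch could read one hourly bucket, merge its small Parquet files, and write the result back with Snappy compression:

    import org.apache.spark.sql.SparkSession

    // Minimal compaction sketch: merge the many small Parquet part files that the
    // bulk-format StreamingFileSink produced for one hourly bucket into a few larger files.
    object CompactParquet {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().appName("CompactParquet").getOrCreate()

        // Hypothetical bucket layout matching the "yyyy-MM-dd--HH" DateTimeBucketAssigner above
        val inputPath  = "hdfs:///logs/parquet/2020-08-19--15"
        val outputPath = "hdfs:///logs/parquet_compacted/2020-08-19--15"

        spark.read.parquet(inputPath)
          .coalesce(8)                     // collapse the small part files into a handful of larger ones
          .write
          .mode("overwrite")
          .option("compression", "snappy") // keep Snappy on the compacted files
          .parquet(outputPath)

        spark.stop()
      }
    }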

The PaulParquetAvroWriters class:


    import org.apache.avro.Schema;
    import org.apache.avro.generic.GenericData;
    import org.apache.avro.generic.GenericRecord;
    import org.apache.avro.reflect.ReflectData;
    import org.apache.avro.specific.SpecificData;
    import org.apache.avro.specific.SpecificRecordBase;
    import org.apache.flink.formats.parquet.ParquetBuilder;
    import org.apache.flink.formats.parquet.ParquetWriterFactory;
    import org.apache.parquet.avro.AvroParquetWriter;
    import org.apache.parquet.hadoop.ParquetWriter;
    import org.apache.parquet.hadoop.metadata.CompressionCodecName;
    import org.apache.parquet.io.OutputFile;

    import java.io.IOException;

    /**
     * @description: Parquet writer factories with a configurable compression codec
     * @author: guaiguaizhu
     * @create: 2020-08-19 15
     **/

    public class PaulParquetAvroWriters {

        // Factory for Avro SpecificRecord types; compressionCodecName selects the compression codec
        public static <T extends SpecificRecordBase> ParquetWriterFactory<T> forSpecificRecord(Class<T> type, CompressionCodecName compressionCodecName) {
            final String schemaString = SpecificData.get().getSchema(type).toString();
            final ParquetBuilder<T> builder = (out) -> createAvroParquetWriter(schemaString, SpecificData.get(), out, compressionCodecName);
            return new ParquetWriterFactory<>(builder);
        }

        // Factory for Avro GenericRecord; compressionCodecName selects the compression codec
        public static ParquetWriterFactory<GenericRecord> forGenericRecord(Schema schema, CompressionCodecName compressionCodecName) {
            final String schemaString = schema.toString();
            final ParquetBuilder<GenericRecord> builder = (out) -> createAvroParquetWriter(schemaString, GenericData.get(), out, compressionCodecName);
            return new ParquetWriterFactory<>(builder);
        }

        // Factory for arbitrary POJOs via Avro reflection; compressionCodecName selects the compression codec
        public static <T> ParquetWriterFactory<T> forReflectRecord(Class<T> type, CompressionCodecName compressionCodecName) {
            final String schemaString = ReflectData.get().getSchema(type).toString();
            final ParquetBuilder<T> builder = (out) -> createAvroParquetWriter(schemaString, ReflectData.get(), out, compressionCodecName);
            return new ParquetWriterFactory<>(builder);
        }

        // Builds the underlying AvroParquetWriter with the requested compression codec
        private static <T> ParquetWriter<T> createAvroParquetWriter(
                String schemaString,
                GenericData dataModel,
                OutputFile out,
                CompressionCodecName compressionCodecName) throws IOException {
            final Schema schema = new Schema.Parser().parse(schemaString);
            return AvroParquetWriter.<T>builder(out)
                    .withSchema(schema)
                    .withDataModel(dataModel)
                    .withCompressionCodec(compressionCodecName) // compression codec
                    .build();
        }

        private PaulParquetAvroWriters() {
        }
    }
