Below is some code I wrote for a company project, with the business logic stripped out so only the generic parts remain.
I'll share some other Flink pipelines in follow-up posts.
The first step, of course, is to import the Maven dependencies.
Mine are as follows:
<properties>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <encoding>UTF-8</encoding>
    <scala.version>2.11.8</scala.version>
    <scala.binary.version>2.11</scala.binary.version>
    <hadoop.version>3.0.0-cdh6.3.0</hadoop.version>
    <flink.version>1.12.0</flink.version>
    <kafka.version>1.1.1</kafka.version>
    <hive.version>2.1.1-cdh6.3.0</hive.version>
    <hbase.version>1.2.0</hbase.version>
    <mysql.connector.version>5.1.40</mysql.connector.version>
    <kudu.version>1.10.0</kudu.version>
</properties>
<profiles>
    <profile>
        <id>dev</id>
        <activation>
            <activeByDefault>true</activeByDefault>
        </activation>
        <properties>
            <maven.dependency.scope>compile</maven.dependency.scope>
        </properties>
    </profile>
    <profile>
        <id>prod</id>
        <properties>
            <maven.dependency.scope>provided</maven.dependency.scope>
        </properties>
    </profile>
</profiles>
<repositories>
    <repository>
        <id>apache.snapshots</id>
        <name>Apache Development Snapshot Repository</name>
        <url>https://repository.apache.org/content/repositories/snapshots/</url>
        <releases>
            <enabled>false</enabled>
        </releases>
        <snapshots>
            <enabled>true</enabled>
        </snapshots>
    </repository>
</repositories>
<dependencies>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-filesystem_2.12</artifactId>
        <version>1.11.3</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.62</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-avro</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-avro-confluent-registry</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.kudu</groupId>
        <artifactId>kudu-client</artifactId>
        <version>${kudu.version}</version>
    </dependency>
    <dependency>
        <groupId>com.google.protobuf</groupId>
        <artifactId>protobuf-java</artifactId>
        <version>2.6.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-protocol</artifactId>
        <version>${hbase.version}</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-client</artifactId>
        <version>${hbase.version}</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-common</artifactId>
        <version>${hbase.version}</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>3.0.0</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-java</artifactId>
        <version>${flink.version}</version>
        <scope>${maven.dependency.scope}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
        <scope>${maven.dependency.scope}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-core</artifactId>
        <version>${flink.version}</version>
        <scope>${maven.dependency.scope}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-runtime_2.11</artifactId>
        <version>${flink.version}</version>
        <scope>${maven.dependency.scope}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka-clients</artifactId>
        <version>${kafka.version}</version>
        <scope>${maven.dependency.scope}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-hive_2.11</artifactId>
        <version>${flink.version}</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-files</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-api-java</artifactId>
        <version>${flink.version}</version>
        <scope>${maven.dependency.scope}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-planner-blink_2.11</artifactId>
        <version>${flink.version}</version>
        <scope>${maven.dependency.scope}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-api-java-bridge_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-common</artifactId>
        <version>${flink.version}</version>
        <scope>${maven.dependency.scope}</scope>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
        <version>1.7.7</version>
        <scope>runtime</scope>
    </dependency>
    <dependency>
        <groupId>commons-logging</groupId>
        <artifactId>commons-logging-api</artifactId>
        <version>1.1</version>
    </dependency>
    <dependency>
        <groupId>log4j</groupId>
        <artifactId>log4j</artifactId>
        <version>1.2.17</version>
        <scope>runtime</scope>
    </dependency>
</dependencies>
I won't paste the project-specific settings in the <build> section here.
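With the dev/prod profiles above, the Flink, Kafka and Table dependencies are compiled into the jar when developing locally and marked provided for the cluster build. A typical way to package for production is simply to activate the prod profile, for example:

mvn clean package -Pprod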
Next is the main class. Its only program argument is the path to a configuration file; just put the configuration keys referenced in the code into that file.
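For reference, a config file matching the keys read below might look like this (topic, group, broker addresses and HDFS paths are placeholders for illustration):

kafka.topic.name=demo_topic
kafka.group.name=demo_group
kafka.ips=kafka01:9092,kafka02:9092,kafka03:9092
hdfs.outpath=hdfs:///user/flink/kafka_to_hdfs/out
hdfs.checkpoint.path=hdfs:///user/flink/kafka_to_hdfs/checkpoints
flink.job.name=KafkaToHdfs
file.header.name=demo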
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.serialization.SimpleStringEncoder;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.connector.file.sink.FileSink;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig;
import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.DateTimeBucketAssigner;
import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.DefaultRollingPolicy;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.util.Collector;
import org.apache.flink.core.fs.Path;
import java.io.BufferedReader;
import java.io.FileReader;
import java.math.BigInteger;
import java.util.Properties;
import java.util.concurrent.TimeUnit;
public class KafkaToHdfs {
    public static void main(String[] args) throws Exception {
        //Program argument: path to the config file
        String config_path = args[0];
        //Load the config file
        Properties properties = new Properties();
        BufferedReader bufferedReader = new BufferedReader(new FileReader(config_path));
        properties.load(bufferedReader);
        //Parse the config file
        String topic_name = properties.getProperty("kafka.topic.name");
        String group_name = properties.getProperty("kafka.group.name");
        String kafka_ips = properties.getProperty("kafka.ips");
        String out_path = properties.getProperty("hdfs.outpath");
        String check_path = properties.getProperty("hdfs.checkpoint.path");
        String job_name = properties.getProperty("flink.job.name");
        String head_name = properties.getProperty("file.header.name");
        //Set up the Flink environment
        StreamExecutionEnvironment env = FlinkEnvUtils.creatEnv(check_path);
        //Set up the Kafka consumer
        Properties props = new Properties();
        props.setProperty("bootstrap.servers", kafka_ips);
        props.setProperty("group.id", group_name);
        FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<>(topic_name, new SimpleStringSchema(), props);
        consumer.setCommitOffsetsOnCheckpoints(true);
        consumer.setStartFromGroupOffsets();
        //Create the stream
        DataStream<String> stream = env.addSource(consumer);
        //Output file naming: configurable prefix plus a ".dat" suffix
        OutputFileConfig config = OutputFileConfig
                .builder()
                .withPartPrefix(head_name)
                .withPartSuffix(".dat")
                .build();
        //One bucket (directory) per hour, named yyyyMMddHH
        DateTimeBucketAssigner<String> dateTimeBucketAssigner = new DateTimeBucketAssigner<>("yyyyMMddHH");
        //File sink: row format, rolled by time, inactivity or size
        FileSink<String> sink = FileSink
                .forRowFormat(new Path(out_path), new SimpleStringEncoder<String>("UTF-8"))
                .withBucketAssigner(dateTimeBucketAssigner)
                .withRollingPolicy(
                        DefaultRollingPolicy.builder()
                                .withRolloverInterval(TimeUnit.MINUTES.toMillis(5))
                                .withInactivityInterval(TimeUnit.MINUTES.toMillis(5))
                                .withMaxPartSize(1024 * 1024 * 1024)
                                .build())
                .withOutputFileConfig(config)
                .build();
        //Business processing
        stream.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public void flatMap(String value, Collector<String> out) {
                if (value != null) {
                    //TODO: put the actual business logic here; it runs once per record read from Kafka
                    out.collect(value);
                }
            }
        }).sinkTo(sink);
        //Run the job
        env.execute(job_name);
    }
}
The FileSink rolling policy can be customized:
withRolloverInterval: the part file has been collecting data for at least this long
withInactivityInterval: no new data has arrived for this long
withMaxPartSize: the part file has reached this size
When any one of these three conditions is met, the in-progress part file is rolled and turned into a finalized file.
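If instead you want part files to be rolled only when a checkpoint completes, a minimal sketch would look like the following. It reuses out_path, dateTimeBucketAssigner and config from the main class above and needs an extra import of org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.OnCheckpointRollingPolicy; the variable name is just for illustration.

//Roll part files only on successful checkpoints instead of by time/inactivity/size
FileSink<String> checkpointRolledSink = FileSink
        .forRowFormat(new Path(out_path), new SimpleStringEncoder<String>("UTF-8"))
        .withBucketAssigner(dateTimeBucketAssigner)
        .withRollingPolicy(OnCheckpointRollingPolicy.build())
        .withOutputFileConfig(config)
        .build();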
The FlinkEnvUtils.creatEnv method used above looks like this; it just sets a few environment options:
import org.apache.flink.runtime.state.filesystem.FsStateBackend;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class FlinkEnvUtils {
    public static StreamExecutionEnvironment creatEnv(String check_path) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        //Checkpoint every 5 minutes, exactly-once, state stored on the filesystem at check_path
        env.enableCheckpointing(5 * 60 * 1000L);
        env.setStateBackend(new FsStateBackend(check_path));
        env.getCheckpointConfig().setCheckpointInterval(5 * 60 * 1000L);
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        env.getCheckpointConfig().setCheckpointTimeout(5 * 60000L);
        env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
        env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        return env;
    }
}
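One thing to note: the FileSink only moves pending part files to the finished state when a checkpoint completes, so the checkpointing enabled in creatEnv is required for the output files on HDFS to become visible to downstream readers.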