Flink 1.12: Reading Kafka Data and Writing to HDFS | Maven Dependencies Included | FileSink

The following is code I wrote for a project at work, with the business logic removed so only the generic, reusable parts remain.

I'll share some other Flink pipelines in later posts.

First, of course, you need to import the Maven dependencies.

Mine are as follows:

    <properties>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <encoding>UTF-8</encoding>
        <scala.version>2.11.8</scala.version>
        <scala.binary.version>2.11</scala.binary.version>
        <hadoop.version>3.0.0-cdh6.3.0</hadoop.version>
        <flink.version>1.12.0</flink.version>
        <kafka.version>1.1.1</kafka.version>
        <hive.version>2.1.1-cdh6.3.0</hive.version>
        <hbase.version>1.2.0</hbase.version>
        <mysql.connector.version>5.1.40</mysql.connector.version>
        <kudu.version>1.10.0</kudu.version>
    </properties>

    <profiles>
        <profile>
            <id>dev</id>
            <activation>
                <activeByDefault>true</activeByDefault>
            </activation>
            <properties>
                <maven.dependency.scope>compile</maven.dependency.scope>
            </properties>
        </profile>
        <profile>
            <id>prod</id>
            <properties>
                <maven.dependency.scope>provided</maven.dependency.scope>
            </properties>
        </profile>
    </profiles>

    <repositories>
        <repository>
            <id>apache.snapshots</id>
            <name>Apache Development Snapshot Repository</name>
            <url>https://repository.apache.org/content/repositories/snapshots/</url>
            <releases>
                <enabled>false</enabled>
            </releases>
            <snapshots>
                <enabled>true</enabled>
            </snapshots>
        </repository>
    </repositories>

    <dependencies>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-filesystem_2.12</artifactId>
            <version>1.11.3</version>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.62</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-avro</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-avro-confluent-registry</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.kudu</groupId>
            <artifactId>kudu-client</artifactId>
            <version>${kudu.version}</version>
        </dependency>

        <dependency>
            <groupId>com.google.protobuf</groupId>
            <artifactId>protobuf-java</artifactId>
            <version>2.6.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-protocol</artifactId>
            <version>${hbase.version}</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>${hbase.version}</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-common</artifactId>
            <version>${hbase.version}</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>3.0.0</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>${flink.version}</version>
            <scope>${maven.dependency.scope}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>${maven.dependency.scope}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-core</artifactId>
            <version>${flink.version}</version>
            <scope>${maven.dependency.scope}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-runtime_2.11</artifactId>
            <version>${flink.version}</version>
            <scope>${maven.dependency.scope}</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>${kafka.version}</version>
            <scope>${maven.dependency.scope}</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-hive_2.11</artifactId>
            <version>${flink.version}</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-files</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java</artifactId>
            <version>${flink.version}</version>
            <scope>${maven.dependency.scope}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner-blink_2.11</artifactId>
            <version>${flink.version}</version>
            <scope>${maven.dependency.scope}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java-bridge_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-common</artifactId>
            <version>${flink.version}</version>
            <scope>${maven.dependency.scope}</scope>
        </dependency>

        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.7</version>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>commons-logging</groupId>
            <artifactId>commons-logging-api</artifactId>
            <version>1.1</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
            <scope>runtime</scope>
        </dependency>
    </dependencies>

I won't paste the project-specific settings from the build section here.
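One detail worth pointing out from the POM above: the dev/prod profiles set the maven.dependency.scope property, so the Flink and Kafka dependencies are compile-scoped for local runs and switched to provided when packaging for the cluster. For example, to build with the prod profile:

mvn clean package -Pprod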

Next is the main class. It takes a single argument, the path to a properties file; just write the configuration keys the code reads into that file.
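For reference, here is a minimal sketch of what that properties file could look like, using the exact keys the code below reads (the keys keep the original spelling, e.g. "kakfa" rather than "kafka", to match the code); all values are placeholders, not the original project's settings:

# Kafka source
kakfa.topic.name=example_topic
kakfa.group.name=example_group
kakfa.ips=broker1:9092,broker2:9092,broker3:9092

# HDFS output and checkpoint locations
hdfs.outpath=hdfs:///data/example/output
hdfs.checkpoint.path=hdfs:///data/example/checkpoints

# Job metadata
flink.job.name=kafka-to-hdfs
file.header.name=example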

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.serialization.SimpleStringEncoder;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.connector.file.sink.FileSink;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig;
import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.DateTimeBucketAssigner;
import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.DefaultRollingPolicy;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.util.Collector;
import org.apache.flink.core.fs.Path;

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.Properties;
import java.util.concurrent.TimeUnit;


public class KafkaToHdfs {

    public static void main(String[] args) throws Exception {

        // The single program argument: path to the properties file
        String config_path = args[0];

        // Load the configuration file
        Properties properties = new Properties();
        BufferedReader bufferedReader = new BufferedReader(new FileReader(config_path));
        properties.load(bufferedReader);

        // Read the configuration entries
        String topic_name = properties.getProperty("kakfa.topic.name");
        String group_name = properties.getProperty("kakfa.group.name");
        String kafka_ips = properties.getProperty("kakfa.ips");
        String out_path = properties.getProperty("hdfs.outpath");
        String check_path = properties.getProperty("hdfs.checkpoint.path");
        String job_name = properties.getProperty("flink.job.name");
        String head_name = properties.getProperty("file.header.name");

        // Set up the Flink environment
        StreamExecutionEnvironment env = FlinkEnvUtils.creatEnv(check_path);

        // Set up the Kafka consumer
        Properties props = new Properties();
        props.setProperty("bootstrap.servers", kafka_ips);
        props.setProperty("group.id", group_name);
        FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<>(topic_name, new SimpleStringSchema(), props);
        consumer.setCommitOffsetsOnCheckpoints(true);
        consumer.setStartFromGroupOffsets();

        // Create the stream
        DataStream<String> stream = env.addSource(consumer);

        // Part file naming
        OutputFileConfig config = OutputFileConfig
                .builder()
                .withPartPrefix(head_name)
                .withPartSuffix(".dat")
                .build();

        // Bucket assignment: one directory per hour
        DateTimeBucketAssigner<String> dateTimeBucketAssigner = new DateTimeBucketAssigner<>("yyyyMMddHH");

        // File sink definition
        FileSink<String> sink = FileSink
                .forRowFormat(new Path(out_path), new SimpleStringEncoder<String>("UTF-8"))
                .withBucketAssigner(dateTimeBucketAssigner)
                .withRollingPolicy(
                        DefaultRollingPolicy.builder()
                                .withRolloverInterval(TimeUnit.MINUTES.toMillis(5))
                                .withInactivityInterval(TimeUnit.MINUTES.toMillis(5))
                                .withMaxPartSize(1024 * 1024 * 1024)
                                .build())
                .withOutputFileConfig(config)
                .build();

        // Processing
        stream.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public void flatMap(String value, Collector<String> out) {
                if (value != null) {
                    // TODO: put the real business logic here; this runs once for every record read from Kafka
                    out.collect(value);
                }
            }
        }).sinkTo(sink);

        // Run the job
        env.execute(job_name);

    }

}
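To run it, the only program argument is the path to that properties file. A hedged example of submitting the job with the Flink CLI (the jar name and paths are placeholders):

flink run -c KafkaToHdfs kafka-to-hdfs-job.jar /path/to/job.properties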

The FileSink's rolling policy can be customized:

withRolloverInterval - the maximum time a part file stays open before it is rolled (here, at most 5 minutes of data per file)

withInactivityInterval - how long the bucket may go without receiving data before the current part file is rolled (here, 5 minutes)

withMaxPartSize - the size at which a part file is rolled (here, 1 GB)

When any one of these three conditions is met, the in-progress file is rolled; once the next checkpoint completes, the rolled (pending) file is committed as a finished file.
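If you would rather roll part files strictly on checkpoints instead of tuning these thresholds, Flink also provides OnCheckpointRollingPolicy. A minimal sketch of swapping it in (not part of the original job):

import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.OnCheckpointRollingPolicy;

// Roll a new part file on every successful checkpoint
FileSink<String> checkpointRolledSink = FileSink
        .forRowFormat(new Path(out_path), new SimpleStringEncoder<String>("UTF-8"))
        .withBucketAssigner(dateTimeBucketAssigner)
        .withRollingPolicy(OnCheckpointRollingPolicy.build())
        .withOutputFileConfig(config)
        .build();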

The FlinkEnvUtils.creatEnv method used above is shown below; it just sets a few checkpointing and environment options.

import org.apache.flink.runtime.state.filesystem.FsStateBackend;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class FlinkEnvUtils {

    public static StreamExecutionEnvironment creatEnv(String check_path) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Checkpoint every 5 minutes, exactly-once, at most one checkpoint in flight
        env.enableCheckpointing(5 * 60 * 1000L);
        env.setStateBackend(new FsStateBackend(check_path));
        env.getCheckpointConfig().setCheckpointInterval(5 * 60 * 1000L);
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        env.getCheckpointConfig().setCheckpointTimeout(5 * 60000L);
        env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
        // Keep externalized checkpoints when the job is cancelled so it can be resumed later
        env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        return env;
    }
}
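Because externalized checkpoints are retained on cancellation, a cancelled or failed job can be resumed from the last retained checkpoint with the -s option of flink run; a hedged example (the checkpoint path and jar name are placeholders):

flink run -s hdfs:///data/example/checkpoints/<job-id>/chk-100 -c KafkaToHdfs kafka-to-hdfs-job.jar /path/to/job.properties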
