Big Data with Flink Stream Processing: stream-quickstart

Preface

This quickstart was written against Flink 1.14 and aims to get you up and running quickly so you can feel what Flink does well.
Note: the APIs of Flink 1.13 and earlier differ from those of 1.14+ in quite a few places.

Dependencies

The build uses Scala 2.12.

 
    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <flink.version>1.14.5</flink.version>
        <scala.binary.version>2.12</scala.binary.version>
        <java.version>1.8</java.version>
        <scala.version>2.12.15</scala.version>
        <slf4j.version>1.7.36</slf4j.version>
        <hadoop.version>2.7.5</hadoop.version>
        <flink.jdbc.version>1.10.3</flink.jdbc.version>
        <hutool.version>5.8.12</hutool.version>
        <mysql.connector.version>5.1.47</mysql.connector.version>
        <lombok.version>1.18.20</lombok.version>
    </properties>

    <dependencies>
        <!-- flink client, supplied by the cluster at runtime -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>provided</scope>
        </dependency>

        <!-- local web UI for debugging in the IDE -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-runtime-web_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>provided</scope>
        </dependency>

        <!-- hutool utility library -->
        <dependency>
            <groupId>cn.hutool</groupId>
            <artifactId>hutool-all</artifactId>
            <version>${hutool.version}</version>
        </dependency>

        <!-- elasticsearch 7 connector -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-elasticsearch7_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <!-- kafka connector -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <!-- legacy jdbc connector (flink-jdbc was last released for 1.10.x) -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-jdbc_${scala.binary.version}</artifactId>
            <version>${flink.jdbc.version}</version>
        </dependency>

        <!-- mysql driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>${mysql.connector.version}</version>
        </dependency>

        <!-- lombok -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>${lombok.version}</version>
        </dependency>

        <!-- logging -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
    </dependencies>
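Note that flink-clients and flink-runtime-web are marked provided: on a real cluster the Flink runtime supplies them, so they must not be bundled into the job jar. When running in IDEA, tick "Add dependencies with 'provided' scope to classpath" in the run configuration so these classes are on the local classpath.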

Launch template

package flink.model;

import org.apache.flink.api.java.utils.ParameterTool;

import java.io.IOException;
import java.io.InputStream;

/**
 * @author zhangxuecheng4441
 * @date 2022/2/24/024 17:31
 */
public interface FlinkModel {
    String ENV_PARAM = "env";
    String DEV_ENV = "dev";
    String LOCAL_ENV_PARAM = "local";

    /**
     * Load the config file under resources for the given environment
     *
     * @param envConf environment name
     * @return ParameterTool
     */
    static ParameterTool getInitConfig(String envConf) {
        InputStream resourceAsStream = FlinkStreamModel.class.getResourceAsStream("/" + String.format("application-%s.properties", envConf));
        ParameterTool parameterTool = null;
        try {
            parameterTool = ParameterTool.fromPropertiesFile(resourceAsStream);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return parameterTool;
    }
}
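getInitConfig looks up one properties file per environment under src/main/resources, named application-<env>.properties; the environment is chosen with the -env program argument and defaults to dev (see initEnv below). A hypothetical application-dev.properties, with made-up keys purely for illustration:

# application-dev.properties -- example only; these key names are placeholders, not from the repo
kafka.bootstrap.servers=localhost:9092
mysql.url=jdbc:mysql://localhost:3306/demo
mysql.user=root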
package flink.model;

import lombok.extern.slf4j.Slf4j;
import lombok.val;
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.api.java.utils.MultipleParameterTool;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.client.program.StreamContextEnvironment;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import java.util.Map;
import java.util.concurrent.TimeUnit;

/**
 * Flink streaming job template
 *
 * @author zhangxuecheng4441
 * @date 2022/10/31/031 19:36
 */
@Slf4j
public class FlinkStreamModel implements FlinkModel {

    /**
     * environment
     */
    public static StreamExecutionEnvironment env;
    /**
     * cli args
     */
    public static MultipleParameterTool param;
    /**
     * default config
     */
    public static Map<String, String> config;

    /**
     * @param args params
     * @return StreamExecutionEnvironment
     */
    public static StreamExecutionEnvironment initEnv(String[] args) {
        //get params
        val params = MultipleParameterTool.fromArgs(args);
        param = params;

        long checkpointInterval = params.getLong("checkpointInterval", 120 * 1000);
        int parallelism = params.getInt("parallelism", 2);
        // set up the execution environment
        env = StreamEnvBuilder.builder()
                .setCheckpointInterval(checkpointInterval)
                .setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)
                .setCheckpointTimeout(600 * 1000L)
                .setMinPauseBetweenCheckpoints(5 * 1000)
                .setTolerableCheckpointFailureNumber(2)
                .setMaxConcurrentCheckpoints(1)
                .setDefaultRestartStrategy(3, Time.of(3, TimeUnit.MINUTES), Time.of(2, TimeUnit.MINUTES))
                .setParallelism(parallelism)
                .build();
        //network buffer flush timeout (ms)
        env.setBufferTimeout(1000);

        //todo for local debugging: run in IDEA with program argument [-local local] to enable the web UI at http://localhost:8081/
        if (params.has(LOCAL_ENV_PARAM)) {
            env = StreamContextEnvironment.createLocalEnvironmentWithWebUI(new Configuration());
        }

        //run in streaming mode
        env.setRuntimeMode(RuntimeExecutionMode.STREAMING);

        //load the properties file for the selected environment
        String envConf = params.get(ENV_PARAM, DEV_ENV);
        ParameterTool parameterTool = FlinkModel.getInitConfig(envConf);

        config = parameterTool.toMap();

        // make parameters available in the web interface
        env.getConfig().setGlobalJobParameters(parameterTool);

        return env;
    }
}
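initEnv relies on StreamEnvBuilder, a small fluent helper from the repo whose source is not shown here. A minimal sketch of what it plausibly looks like, assuming each setter simply forwards to StreamExecutionEnvironment and its CheckpointConfig (this class is a reconstruction, not the repo's exact code):

package flink.model;

import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * Hypothetical sketch of the fluent builder used by FlinkStreamModel.
 * Every setter forwards to the environment or its CheckpointConfig and
 * returns this, so the calls can be chained as in initEnv above.
 */
public class StreamEnvBuilder {
    private final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    public static StreamEnvBuilder builder() {
        return new StreamEnvBuilder();
    }

    public StreamEnvBuilder setCheckpointInterval(long intervalMs) {
        env.enableCheckpointing(intervalMs);
        return this;
    }

    public StreamEnvBuilder setCheckpointingMode(CheckpointingMode mode) {
        env.getCheckpointConfig().setCheckpointingMode(mode);
        return this;
    }

    public StreamEnvBuilder setCheckpointTimeout(long timeoutMs) {
        env.getCheckpointConfig().setCheckpointTimeout(timeoutMs);
        return this;
    }

    public StreamEnvBuilder setMinPauseBetweenCheckpoints(long pauseMs) {
        env.getCheckpointConfig().setMinPauseBetweenCheckpoints(pauseMs);
        return this;
    }

    public StreamEnvBuilder setTolerableCheckpointFailureNumber(int failures) {
        env.getCheckpointConfig().setTolerableCheckpointFailureNumber(failures);
        return this;
    }

    public StreamEnvBuilder setMaxConcurrentCheckpoints(int max) {
        env.getCheckpointConfig().setMaxConcurrentCheckpoints(max);
        return this;
    }

    public StreamEnvBuilder setDefaultRestartStrategy(int failureRate, Time failureInterval, Time delayInterval) {
        //restart at most failureRate times per failureInterval, waiting delayInterval between attempts
        env.setRestartStrategy(RestartStrategies.failureRateRestart(failureRate, failureInterval, delayInterval));
        return this;
    }

    public StreamEnvBuilder setParallelism(int parallelism) {
        env.setParallelism(parallelism);
        return this;
    }

    public StreamExecutionEnvironment build() {
        return env;
    }
}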

Data source

A custom source only needs to implement the SourceFunction interface:

//
// Source code recreated from a .class file by IntelliJ IDEA
// (powered by FernFlower decompiler)
//

package org.apache.flink.streaming.api.functions.source;

import java.io.Serializable;
import org.apache.flink.annotation.Public;
import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.api.common.functions.Function;
import org.apache.flink.streaming.api.watermark.Watermark;

@Public
public interface SourceFunction<T> extends Function, Serializable {
    void run(SourceFunction.SourceContext<T> var1) throws Exception;

    void cancel();

    @Public
    public interface SourceContext<T> {
        void collect(T var1);

        @PublicEvolving
        void collectWithTimestamp(T var1, long var2);

        @PublicEvolving
        void emitWatermark(Watermark var1);

        @PublicEvolving
        void markAsTemporarilyIdle();

        Object getCheckpointLock();

        void close();
    }
}
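Two details of this contract matter in practice: Flink calls cancel() from a different thread than run(), so the stop flag the loop checks should be declared volatile (as in the demos below), and a source whose state must stay consistent with checkpoints should emit records while holding getCheckpointLock().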

quick-start

package flink.launch;

import cn.hutool.core.date.DateUtil;
import cn.hutool.core.thread.ThreadUtil;
import flink.model.FlinkStreamModel;
import flink.sink.GenericSink;
import lombok.extern.slf4j.Slf4j;
import lombok.val;
import org.apache.flink.streaming.api.functions.source.SourceFunction;

import java.util.List;

/**
 * flink stream demo
 * todo when running main() in IDEA: enable [Add dependencies with "provided" scope to classpath] AND add [-local local] to the program arguments
 *
 * @author zhangxuecheng4441
 * @date 2022/2/22/022 11:43
 */
@Slf4j
public class FlinkStreamDemoApp extends FlinkStreamModel {


    /**
     * Running from IDEA requires enabling [Add dependencies with "provided" scope to classpath]
     *
     * @param args args
     * @throws Exception Exception
     */
    public static void main(String[] args) throws Exception {
        initEnv(args);

        //create the data source
        val source = env.addSource(new SourceFunction<String>() {
            volatile boolean out = true;

            @Override
            public void run(SourceContext<String> sourceContext) throws Exception {
                while (out) {
                    ThreadUtil.sleep(1.2 * 1000);
                    val str = "print-time:" + DateUtil.now();
                    log.warn("add string:{}", str);
                    sourceContext.collect(str);
                }
            }

            @Override
            public void cancel() {
                out = false;
            }
        }).setParallelism(1).name("string-source");

        //print the stream
        source.print().setParallelism(2).name("print-time");

        //flush a batch of output once every 5 records
        source.addSink(new GenericSink<String>(5) {
            @Override
            public void flush(List<String> elements) {
                log.error("output str:{}", elements);
            }
        }).setParallelism(4).name("branch-sink");

        //todo for local debugging add program argument [-local local] to enable the web UI at http://localhost:8081/
        env.execute("DemoStreamApp");
    }
}

The flink-runtime-web dependency provides a local web UI for debugging:


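The same main class also runs unchanged on a cluster. A typical submission with the Flink CLI (the jar name and argument values here are illustrative, not from the repo):

flink run -c flink.launch.FlinkStreamDemoApp flink-java-quickstart-1.14.jar -env dev -parallelism 2 -checkpointInterval 120000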

Operators

Beyond the everyday operators, process is probably the most flexible one of all.

package flink.launch.stream;

import cn.hutool.core.collection.ListUtil;
import cn.hutool.core.thread.ThreadUtil;
import cn.hutool.core.util.RandomUtil;
import flink.function.check.JsonStrCheckFunc;
import flink.model.FlinkStreamModel;
import flink.pojo.AccountUploadPojo;
import flink.sink.GenericSink;
import lombok.extern.slf4j.Slf4j;
import lombok.val;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.OutputTag;

import java.util.List;

/**
 * When one stream has to be split into several categories for further processing, use side outputs.
 * Example: read records, print the ones that are valid JSON, and route the invalid ones to an error sink.
 * todo when running main() in IDEA: enable [Add dependencies with "provided" scope to classpath] AND add [-local local] to the program arguments
 *
 * @author zhangxuecheng4441
 * @date 2022/3/13/013 15:21
 */
@Slf4j
public class ProcessApp extends FlinkStreamModel {


    public static void main(String[] args) throws Exception {
        initEnv(args);
        val errorTag = "error";

        //generate fake data
        val source = env.addSource(new SourceFunction<String>() {
            volatile boolean out = true;

            @Override
            public void run(SourceContext<String> sourceContext) throws Exception {
                val strList = ListUtil.toList("{\"uid\":1}", "1", "2");
                while (out) {
                    ThreadUtil.sleep(1.2 * 1000);
                    val str = strList.get(RandomUtil.randomInt(0, 3));
                    log.warn("add string:{}", str);
                    sourceContext.collect(str);
                }
            }

            @Override
            public void cancel() {
                out = false;
            }
        }).setParallelism(1).name("string-source");


        //split the stream: records that parse as json go to the main output, everything else to the side output named errorTag
        val process = source.process(new JsonStrCheckFunc<>(AccountUploadPojo.class, errorTag))
                .returns(TypeInformation.of(AccountUploadPojo.class))
                .setParallelism(2).name("json-parse");

        //handle records from the main output
        process.map(String::valueOf)
                .addSink(new GenericSink<String>(1) {
                    @Override
                    public void flush(List<String> elements) {
                        log.warn(">>>>>> right str:{}", elements);
                    }
                }).setParallelism(1).name("right-sink");

        //handle records from the errorTag side output
        process.getSideOutput(new OutputTag<>(errorTag, TypeInformation.of(String.class)))
                        .addSink(new GenericSink<String>(1) {
                            @Override
                            public void flush(List<String> elements) {
                                log.error(">>>>>> error str:{}", elements);
                            }
                        }).setParallelism(1).name("error-sink");

        env.execute("ProcessApp");
    }
}
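JsonStrCheckFunc and AccountUploadPojo come from the repo and are not shown above. A minimal sketch of what such a splitting ProcessFunction could look like, assuming hutool's JSONUtil for the validity check (a reconstruction, not the repo's exact code):

package flink.function.check;

import cn.hutool.json.JSONUtil;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;

/**
 * Hypothetical sketch: parse valid JSON strings into T on the main output,
 * route everything else to a String side output identified by errorTagName.
 */
public class JsonStrCheckFunc<T> extends ProcessFunction<String, T> {
    private final Class<T> clazz;
    private final OutputTag<String> errorTag;

    public JsonStrCheckFunc(Class<T> clazz, String errorTagName) {
        this.clazz = clazz;
        //must use the same tag id and type as the getSideOutput() call
        this.errorTag = new OutputTag<>(errorTagName, TypeInformation.of(String.class));
    }

    @Override
    public void processElement(String value, Context ctx, Collector<T> out) {
        if (JSONUtil.isTypeJSONObject(value)) {
            //valid json: deserialize and emit on the main output
            out.collect(JSONUtil.toBean(value, clazz));
        } else {
            //anything else goes to the side output
            ctx.output(errorTag, value);
        }
    }
}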

Sink

Implementing SinkFunction is all it takes; here a generic batching sink extends RichSinkFunction.

package flink.sink;

import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;

import java.util.ArrayList;
import java.util.List;

/**
 * @author zhangxuecheng4441
 * @date 2022/10/14/014 15:25
 */
public abstract class GenericSink<T> extends RichSinkFunction<T> {
    private static final long serialVersionUID = -7594648194757224332L;

    public final List<T> dataCache;
    private final int batchInterval;
    private int batchCount = 0;

    public GenericSink(int batchInterval) {
        this.batchInterval = batchInterval;
        dataCache = new ArrayList<>();
    }


    @Override
    public void invoke(T record, Context context) {
        if (record != null) {
            ++this.batchCount;
            this.addBatch(record);

            if (this.batchCount >= this.batchInterval) {
                this.flush(dataCache);
                dataCache.clear();
                this.batchCount = 0;
            }
        }
    }


    /**
     * Buffer one element for a later batched write
     *
     * @param element element
     */
    public void addBatch(T element) {
        dataCache.add(element);
    }


    /**
     * Write the buffered elements to the external store
     *
     * @param elements elements
     */
    public abstract void flush(List<T> elements);

    @Override
    public void close() throws Exception {
        if (!this.dataCache.isEmpty()) {
            try {
                this.flush(dataCache);
            } finally {
                this.dataCache.clear();
            }
        }
        super.close();
    }
}
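As a usage sketch, a concrete subclass could batch records into MySQL through plain JDBC; everything below (URL, credentials, table name) is made up for illustration:

package flink.sink;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.List;

/**
 * Hypothetical example: batch-insert strings into MySQL every 100 records.
 * The JDBC URL, credentials, and table are placeholders.
 */
public class MysqlBatchSink extends GenericSink<String> {
    private static final long serialVersionUID = 1L;

    public MysqlBatchSink() {
        //flush once every 100 buffered records
        super(100);
    }

    @Override
    public void flush(List<String> elements) {
        String sql = "INSERT INTO demo_table (content) VALUES (?)";
        try (Connection conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/demo", "root", "secret");
             PreparedStatement ps = conn.prepareStatement(sql)) {
            for (String element : elements) {
                ps.setString(1, element);
                ps.addBatch();
            }
            ps.executeBatch();
        } catch (Exception e) {
            throw new RuntimeException("flush to mysql failed", e);
        }
    }
}

One caveat: dataCache lives only in memory and is not checkpointed, so records buffered between flushes can be lost on failure; for end-to-end exactly-once delivery you would build the sink on Flink's two-phase-commit facilities instead.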

Repository

https://github.com/opop32165455/flink-java-quickstart-1.14.git
