Flink DataStream API Programming

Consuming data from a Kafka topic

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;

import java.util.Arrays;
import java.util.Properties;

public class KafkaDemo01 {
    /**
     * Common style for Flink 1.13 and later (the unified KafkaSource API)
     *
     * @param env StreamExecutionEnvironment
     * @throws Exception e
     */
    private static void afterVersion13(StreamExecutionEnvironment env) throws Exception {
        KafkaSource<String> source = KafkaSource.<String>builder()
                .setBootstrapServers("vm01:9092")
                .setTopics(Arrays.asList("test01"))
                .setValueOnlyDeserializer(new SimpleStringSchema())
                .setStartingOffsets(OffsetsInitializer.earliest())
                .build();
        DataStream<String> stream = env.fromSource(source, WatermarkStrategy.noWatermarks(), "kafka_source");
        stream.print();
    }

    /**
     * Common style before Flink 1.13 (FlinkKafkaConsumer, deprecated in newer releases)
     *
     * @param env StreamExecutionEnvironment
     * @throws Exception e
     */
    private static void beforeVersion13(StreamExecutionEnvironment env) throws Exception {
        // consumer properties
        Properties props = new Properties();
        props.setProperty("bootstrap.servers", "vm01:9092");
        props.setProperty("group.id", "testGroup");
        props.setProperty("auto.offset.reset", "latest");
        props.setProperty("enable.auto.commit", "true");
        props.setProperty("auto.commit.interval.ms", "3000");
        FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<>("test01", new SimpleStringSchema(), props);
        DataStreamSource<String> stream = env.addSource(consumer);
        stream.print();
    }

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//        beforeVersion13(env);
        afterVersion13(env);
        env.execute("Demo");
    }

}
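
If the KafkaSource variant also needs a consumer group id or other Kafka client options (as the FlinkKafkaConsumer properties above do), the builder accepts them directly. A minimal sketch assuming the same broker and topic; the group id and discovery interval are illustrative values only:

        KafkaSource<String> source = KafkaSource.<String>builder()
                .setBootstrapServers("vm01:9092")
                .setTopics("test01")
                // consumer group used when committing offsets on checkpoints
                .setGroupId("testGroup")
                // pass-through connector/client property (illustrative value)
                .setProperty("partition.discovery.interval.ms", "10000")
                .setValueOnlyDeserializer(new SimpleStringSchema())
                .setStartingOffsets(OffsetsInitializer.earliest())
                .build();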

Producing data to a Kafka topic

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema;
import org.apache.flink.connector.kafka.sink.KafkaSink;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer;

import java.util.Arrays;
import java.util.Properties;


public class KafkaDemo02 {

    /**
     * Generate Data Source
     *
     * @param env StreamExecutionEnvironment
     * @return DataStream
     */
    private static DataStream<String> generateSource(StreamExecutionEnvironment env) {
        // shared Kafka source used by both sink variants
        KafkaSource<String> source = KafkaSource.<String>builder()
                .setBootstrapServers("vm01:9092")
                .setTopics(Arrays.asList("test01"))
                .setValueOnlyDeserializer(new SimpleStringSchema())
                .setStartingOffsets(OffsetsInitializer.earliest())
                .build();
        return env.fromSource(source, WatermarkStrategy.noWatermarks(), "kafka_source");
    }

    /**
     * FlinkKafkaProducer is deprecated and removed in later Flink releases; prefer KafkaSink
     *
     * @param env StreamExecutionEnvironment
     * @throws Exception e
     */
    private static void beforeVersion14(StreamExecutionEnvironment env) throws Exception {
        DataStream<String> stream = generateSource(env);
        // Use FlinkKafkaProducer to build the producer (only producer-side properties are needed)
        Properties props = new Properties();
        props.setProperty("bootstrap.servers", "vm01:9092");
        FlinkKafkaProducer<String> producer = new FlinkKafkaProducer<>("test02", new SimpleStringSchema(), props);
        stream.addSink(producer);
    }

    /**
     * Recommended KafkaSink style for Flink 1.14 and later
     *
     * @param env StreamExecutionEnvironment
     * @throws Exception e
     */
    private static void afterVersion14(StreamExecutionEnvironment env) throws Exception {
        DataStream<String> stream = generateSource(env);
        // Use KafkaSink to build the producer
        KafkaSink<String> sink = KafkaSink.<String>builder()
                .setBootstrapServers("vm01:9092")
                .setRecordSerializer(KafkaRecordSerializationSchema.builder()
                        .setTopic("test02")
                        .setValueSerializationSchema(new SimpleStringSchema())
                        .build()
                )
                .build();

        stream.sinkTo(sink);
    }

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//        beforeVersion14(env);
        afterVersion14(env);
        env.execute("Demo");
    }
}
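
KafkaSink also lets you state the delivery guarantee explicitly. A minimal sketch assuming the same broker and topic (DeliveryGuarantee comes from org.apache.flink.connector.base.DeliveryGuarantee; exactly-once additionally requires checkpointing and a transactional id prefix):

        KafkaSink<String> sink = KafkaSink.<String>builder()
                .setBootstrapServers("vm01:9092")
                .setRecordSerializer(KafkaRecordSerializationSchema.builder()
                        .setTopic("test02")
                        .setValueSerializationSchema(new SimpleStringSchema())
                        .build())
                // AT_LEAST_ONCE flushes pending records on checkpoint; EXACTLY_ONCE uses Kafka transactions
                .setDeliveryGuarantee(DeliveryGuarantee.AT_LEAST_ONCE)
                .build();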

Checkpoint

        env.setStateBackend(new FsStateBackend("file:///root/checkpoints"));
        // checkpoint interval (ms)
        env.enableCheckpointing(10000);
        // restart strategy: fixed delay, failure rate, or no restart
        env.setRestartStrategy(RestartStrategies.fixedDelayRestart(3, org.apache.flink.api.common.time.Time.of(10, TimeUnit.SECONDS)));
        // exactly-once checkpointing mode
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        // checkpoint timeout (ms)
        env.getCheckpointConfig().setCheckpointTimeout(60000);
        // maximum number of concurrent checkpoints
        env.getCheckpointConfig().setMaxConcurrentCheckpoints(2);
        // minimum pause between checkpoints (ms)
        env.getCheckpointConfig().setMinPauseBetweenCheckpoints(500);
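
On Flink 1.13 and later, FsStateBackend is deprecated in favor of HashMapStateBackend plus a separate checkpoint storage location. A minimal sketch of the equivalent configuration (HashMapStateBackend lives in org.apache.flink.runtime.state.hashmap):

        env.setStateBackend(new HashMapStateBackend());
        env.getCheckpointConfig().setCheckpointStorage("file:///root/checkpoints");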

Event time

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;

import java.time.Duration;

/**
 * Sample input, formatted as name,timestamp:
 * jordan,1547718210
 * jordan,1547718211
 * jordan,1547718212
 * jordan,1547718215
 * kobe,1547718216
 * kobe,1547718217
 * kobe,1547718218
 * andy,1547718219
 * andy,1547718220
 * jordan,1547718221
 * jordan,1547718222
 * jordan,1547718223
 * jordan,1547718224
 * jordan,1547718225   // fires the 1547718210-1547718219 window (10s window + 5s out-of-orderness = 15s = 1547718225s - 1547718210s)
 * jordan,1547718226
 * jordan,1547718231
 * jordan,1547718235  // fires the 1547718220-1547718229 window (10s window + 5s out-of-orderness = 15s = 1547718235s - 1547718220s)
 */
public class FlinkEventTimeDemo {
    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // With sparse input, watermarks from different subtasks may fail to align;
        // either run with parallelism 1 or use withIdleness (see below).
        // env.setParallelism(1);
        DataStream<String> dataStream = env.socketTextStream("192.168.61.128", 9999);
        dataStream.flatMap(new SplitterFunction())
                .assignTimestampsAndWatermarks(WatermarkStrategy
                        .<Tuple3<String, Integer, Long>>forBoundedOutOfOrderness(Duration.ofSeconds(5))
                        .withTimestampAssigner((event, timestamp) -> {
//                            System.out.println(timestamp);
                            return event.f2 * 1000;
                        })
                        // let idle subtasks advance so watermarks can align
                        .withIdleness(Duration.ofSeconds(1)))
                .keyBy(value -> value.f0)
                .window(TumblingEventTimeWindows.of(Time.seconds(10)))
                .sum(1)
                .print();
        env.execute();
    }

    static class SplitterFunction implements FlatMapFunction<String, Tuple3<String, Integer, Long>> {

        @Override
        public void flatMap(String value, Collector<Tuple3<String, Integer, Long>> out) throws Exception {
            String[] fields = value.split(",");
            out.collect(new Tuple3<>(fields[0], 1, Long.valueOf(fields[1])));
        }
    }
}

Setting the parallelism of a single operator

package org.example.flink;

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;

import java.time.Duration;

/**
 * jordan,1547718210
 * jordan,1547718211
 * jordan,1547718212
 * jordan,1547718215
 * kobe,1547718216
 * kobe,1547718217
 * kobe,1547718218
 * andy,1547718219
 * andy,1547718220
 * jordan,1547718221
 * jordan,1547718222
 * jordan,1547718223
 * jordan,1547718224
 * jordan,1547718225
 * jordan,1547718226
 * jordan,1547718231
 * jordan,1547718235
 * michael,1547718236
 * michael,1547718237
 * jordan,1547718238
 * jordan,1547718239
 * tom,1547718240
 */
public class FlinkDemo04 {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        env.socketTextStream("vm01", 9999)
                .map(new SplitterFunction())
                .setParallelism(1)
                .assignTimestampsAndWatermarks(
                        WatermarkStrategy.<Tuple3<String, Integer, Long>>forBoundedOutOfOrderness(Duration.ofSeconds(5))
                                .withTimestampAssigner((event, timestamp) -> event.f2 * 1000))
                .keyBy(value -> value.f0)
                .window(TumblingEventTimeWindows.of(Time.seconds(10)))
                .sum(1)
                .print();

        env.execute("FlinkDemo04");
    }


    private static class SplitterFunction extends RichMapFunction<String, Tuple3<String, Integer, Long>> {

        @Override
        public Tuple3<String, Integer, Long> map(String value) throws Exception {
            String[] fields = value.split(",");
            return new Tuple3<>(fields[0], 1, Long.valueOf(fields[1]));
        }
    }
}

Accumulator

package org.example.flink;

import org.apache.flink.api.common.accumulators.LongCounter;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;


public class FlinkDemo01 {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(5);
        env.socketTextStream("vm01", 9999)
                .map(new SplitterFunction())
                .print();
        env.execute("FlinkDemo01");
    }

    private static class SplitterFunction extends RichMapFunction<String, Tuple3<String, Integer, Long>> {

        // create the counter (one per parallel subtask instance)
        private final LongCounter numLines = new LongCounter();

        @Override
        public void open(Configuration parameters) throws Exception {
            // register the accumulator with the runtime context
            getRuntimeContext().addAccumulator("num-lines", this.numLines);
            super.open(parameters);
        }

        @Override
        public Tuple3<String, Integer, Long> map(String value) throws Exception {
            String[] fields = value.split(",");
            // increment the counter by one
            this.numLines.add(1);
            System.out.println("++++");
            System.out.println(getRuntimeContext().getAccumulator("num-lines"));
            return new Tuple3<>(fields[0], 1, Long.valueOf(fields[1]));
        }

    }
}
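
The printouts inside map() only show each subtask's local counter. The value merged across all parallel subtasks is returned through the JobExecutionResult once execute() returns, so the last line of main() could be replaced with the following sketch (JobExecutionResult is org.apache.flink.api.common.JobExecutionResult; for this unbounded socket job, execute() only returns when the job ends):

        JobExecutionResult result = env.execute("FlinkDemo01");
        // merged "num-lines" value from all subtasks
        Long totalLines = result.getAccumulatorResult("num-lines");
        System.out.println("num-lines = " + totalLines);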

Hive as a sink

This example uses Flink's DataStream API: the source consumes a Kafka topic and the result is written to Hive through hive-jdbc, so the Maven project must include the following dependencies:


<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-java_2.11</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka_2.11</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-jdbc</artifactId>
    <version>${hive.version}</version>
</dependency>

The code is as follows:

1) Create the Hive sink class by extending RichSinkFunction

import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
 
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
 
 
public class HiveWriter extends RichSinkFunction<String> {
    private Connection connection = null;
    private PreparedStatement pstmt = null;
 
    @Override
    public void open(Configuration parameters) throws Exception {
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        connection = DriverManager.getConnection("jdbc:hive2://host1:2181,host2:2181,host3:2181/;serviceDiscoveryMode=zookeeper;zookeeperNamespace=hiveserver2", "username", "password");
        // prepare the statement once; re-preparing it for every record would leak statements
        pstmt = connection.prepareStatement("INSERT INTO testDb.testTable SELECT ?, ?");
    }
 
    @Override
    public void invoke(String value, Context context) throws SQLException {
        String[] fields = value.split(",");
        if (fields.length < 2) return;
        pstmt.setString(1, fields[0]);
        pstmt.setString(2, fields[1]);
        pstmt.executeUpdate();
    }
 
    @Override
    public void close() throws Exception {
        if (pstmt != null) pstmt.close();
        if (connection != null) connection.close();
    }
}

2) Run the Flink job

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
 
 
public class FlinkHiveDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Kafka source
        KafkaSource<String> source = KafkaSource.<String>builder()
                .setBootstrapServers("vm01:9092")
                .setTopics("test01")
                .setValueOnlyDeserializer(new SimpleStringSchema())
                .setStartingOffsets(OffsetsInitializer.earliest())
                .build();
        DataStream<String> stream = env.fromSource(source, WatermarkStrategy.noWatermarks(), "kafka_source");
        // write each record to Hive through the sink defined above
        stream.addSink(new HiveWriter());
        env.execute("FlinkHiveDemo");
    }
}
