import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import java.util.Arrays;
import java.util.Properties;
public class KafkaDemo01 {
    /**
     * The common style before Flink 1.13: the legacy FlinkKafkaConsumer.
     *
     * @param env StreamExecutionEnvironment
     * @throws Exception e
     */
    private static void beforeVersion13(StreamExecutionEnvironment env) throws Exception {
        // kafka consumer properties
        Properties props = new Properties();
        props.setProperty("bootstrap.servers", "vm01:9092");
        props.setProperty("group.id", "testGroup");
        props.setProperty("auto.offset.reset", "latest");
        props.setProperty("enable.auto.commit", "true");
        props.setProperty("auto.commit.interval.ms", "3000");
        FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<>("test01", new SimpleStringSchema(), props);
        DataStreamSource<String> stream = env.addSource(consumer);
        stream.print();
    }
    /**
     * The common style for Flink 1.13 and later: the unified KafkaSource.
     *
     * @param env StreamExecutionEnvironment
     * @throws Exception e
     */
    private static void afterVersion13(StreamExecutionEnvironment env) throws Exception {
        KafkaSource<String> source = KafkaSource.<String>builder()
                .setBootstrapServers("vm01:9092")
                .setTopics(Arrays.asList("test01"))
                .setValueOnlyDeserializer(new SimpleStringSchema())
                .setStartingOffsets(OffsetsInitializer.earliest())
                .build();
        DataStream<String> stream = env.fromSource(source, WatermarkStrategy.noWatermarks(), "kafka_source");
        stream.print();
    }
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// beforeVersion13(env);
afterVersion13(env);
env.execute("Demo");
}
}
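Both variants need the Kafka connector on the classpath. A sketch of the Maven dependency, hedged on the artifact name: up to Flink 1.14 it carries a Scala suffix (e.g. flink-connector-kafka_2.11, as in the Hive example later in this article); from Flink 1.15 on it is unsuffixed:
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka</artifactId>
    <version>${flink.version}</version>
</dependency>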
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema;
import org.apache.flink.connector.kafka.sink.KafkaSink;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer;
import java.util.Arrays;
import java.util.Properties;
public class KafkaDemo02 {
/**
* Generate Data Source
*
* @param env StreamExecutionEnvironment
 * @return DataStream<String>
 */
    private static DataStream<String> generateSource(StreamExecutionEnvironment env) {
        // shared KafkaSource used by both sink variants
        KafkaSource<String> source = KafkaSource.<String>builder()
                .setBootstrapServers("vm01:9092")
                .setTopics(Arrays.asList("test01"))
                .setValueOnlyDeserializer(new SimpleStringSchema())
                .setStartingOffsets(OffsetsInitializer.earliest())
                .build();
return env.fromSource(source, WatermarkStrategy.noWatermarks(), "kafka_source");
}
/**
 * FlinkKafkaProducer is deprecated since Flink 1.14 and removed in Flink 1.15.
*
* @param env StreamExecutionEnvironment
* @throws Exception e
*/
private static void beforeVersion14(StreamExecutionEnvironment env) throws Exception {
        DataStream<String> stream = generateSource(env);
        // Use FlinkKafkaProducer to build the producer; only producer-side
        // properties are needed (group.id and auto-commit are consumer settings)
        Properties props = new Properties();
        props.setProperty("bootstrap.servers", "vm01:9092");
        FlinkKafkaProducer<String> producer = new FlinkKafkaProducer<>("test02", new SimpleStringSchema(), props);
        stream.addSink(producer);
}
/**
* KafkaSink
*
* @param env StreamExecutionEnvironment
* @throws Exception e
*/
private static void afterVersion14(StreamExecutionEnvironment env) throws Exception {
        DataStream<String> stream = generateSource(env);
        // Use KafkaSink to build the producer
        KafkaSink<String> sink = KafkaSink.<String>builder()
                .setBootstrapServers("vm01:9092")
                .setRecordSerializer(KafkaRecordSerializationSchema.<String>builder()
                        .setTopic("test02")
                        .setValueSerializationSchema(new SimpleStringSchema())
                        .build()
                )
                .build();
stream.sinkTo(sink);
}
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// beforeVersion14(env);
afterVersion14(env);
env.execute("Demo");
}
}
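By default (as of Flink 1.14/1.15) KafkaSink writes with DeliveryGuarantee.NONE. A hedged sketch of making the guarantee explicit; note the builder method is spelled setDeliverGuarantee in these versions, and DeliveryGuarantee is imported from org.apache.flink.connector.base. EXACTLY_ONCE additionally requires checkpointing plus setTransactionalIdPrefix:
// same sink as afterVersion14, with an explicit delivery guarantee
KafkaSink<String> sink = KafkaSink.<String>builder()
        .setBootstrapServers("vm01:9092")
        .setRecordSerializer(KafkaRecordSerializationSchema.<String>builder()
                .setTopic("test02")
                .setValueSerializationSchema(new SimpleStringSchema())
                .build())
        .setDeliverGuarantee(DeliveryGuarantee.AT_LEAST_ONCE)
        .build();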
env.setStateBackend(new FsStateBackend("file:///root/checkpoints"));
// checkpoint interval
env.enableCheckpointing(10000);
// restart strategy: fixed delay, failure rate, or no restart
env.setRestartStrategy(RestartStrategies.fixedDelayRestart(3, org.apache.flink.api.common.time.Time.of(10, TimeUnit.SECONDS)));
// exactly-once mode
env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
// checkpoint timeout
env.getCheckpointConfig().setCheckpointTimeout(60000);
// maximum number of concurrent checkpoints
env.getCheckpointConfig().setMaxConcurrentCheckpoints(2);
// minimum pause between two checkpoints
env.getCheckpointConfig().setMinPauseBetweenCheckpoints(500);
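By default these checkpoints are deleted when the job is cancelled. A hedged addition for keeping them for a manual restore (method and enum names per the CheckpointConfig of the same Flink generation as FsStateBackend; newer releases rename this to setExternalizedCheckpointCleanup; CheckpointConfig is in org.apache.flink.streaming.api.environment):
// retain checkpoints on cancellation so the job can be restored from them
env.getCheckpointConfig().enableExternalizedCheckpoints(
        CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
A retained checkpoint can then serve as the restore point when resubmitting, e.g. bin/flink run -s file:///root/checkpoints/<job-id>/chk-<n> ...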
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;
import java.time.Duration;
/**
 * Sample input, one record per line (name,epochSeconds):
* jordan,1547718210
* jordan,1547718211
* jordan,1547718212
* jordan,1547718215
* kobe,1547718216
* kobe,1547718217
* kobe,1547718218
* andy,1547718219
* andy,1547718220
* jordan,1547718221
* jordan,1547718222
* jordan,1547718223
* jordan,1547718224
 * jordan,1547718225 // triggers the 1547718210-1547718219 window (10s window + 5s delay = 15s = 1547718225s - 1547718210s)
* jordan,1547718226
* jordan,1547718231
 * jordan,1547718235 // triggers the 1547718220-1547718229 window (10s window + 5s delay = 15s = 1547718235s - 1547718220s)
*/
public class FlinkEventTimeDemo {
public static void main(String[] args) throws Exception {
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // With sparse input, the watermarks of different subtasks may not advance together;
        // either run single-threaded or use withIdleness
        // env.setParallelism(1);
        DataStream<String> dataStream = env.socketTextStream("192.168.61.128", 9999);
dataStream.flatMap(new SplitterFunction())
.assignTimestampsAndWatermarks(WatermarkStrategy
.<Tuple3<String, Integer, Long>>forBoundedOutOfOrderness(Duration.ofSeconds(5))
.withTimestampAssigner((event, timestamp) -> {
// System.out.println(timestamp);
return event.f2 * 1000;
})
// let idle subtasks stop holding back the watermark
.withIdleness(Duration.ofSeconds(1)))
.keyBy(value -> value.f0)
.window(TumblingEventTimeWindows.of(Time.seconds(10)))
.sum(1)
.print();
env.execute();
}
    static class SplitterFunction implements FlatMapFunction<String, Tuple3<String, Integer, Long>> {
        @Override
        public void flatMap(String value, Collector<Tuple3<String, Integer, Long>> out) throws Exception {
String[] fields = value.split(",");
out.collect(new Tuple3<>(fields[0], 1, Long.valueOf(fields[1])));
}
}
}
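To make the trigger annotations in the sample data concrete, here is the arithmetic behind them as a standalone sketch (constants mirror the sample input above; Flink's bounded-out-of-orderness watermark is maxSeenEventTime - delay - 1ms, and an event-time window fires once the watermark reaches the last millisecond it covers):
// latest event seen so far: jordan,1547718225
long maxSeenMs = 1547718225_000L;
long watermarkMs = maxSeenMs - 5_000L - 1L;   // 1547718219999
// tumbling window [1547718210000, 1547718220000) covers up to end - 1
long windowMaxTs = 1547718220_000L - 1L;      // 1547718219999
boolean fires = watermarkMs >= windowMaxTs;   // true -> the first window is evaluated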
package org.example.flink;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import java.time.Duration;
/**
* jordan,1547718210
* jordan,1547718211
* jordan,1547718212
* jordan,1547718215
* kobe,1547718216
* kobe,1547718217
* kobe,1547718218
* andy,1547718219
* andy,1547718220
* jordan,1547718221
* jordan,1547718222
* jordan,1547718223
* jordan,1547718224
* jordan,1547718225
* jordan,1547718226
* jordan,1547718231
* jordan,1547718235
* michael,1547718236
* michael,1547718237
* jordan,1547718238
* jordan,1547718239
* tom,1547718240
*/
public class FlinkDemo04 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.socketTextStream("vm01", 9999)
.map(new SplitterFunction())
.setParallelism(1)
.assignTimestampsAndWatermarks(
WatermarkStrategy.<Tuple3<String, Integer, Long>>forBoundedOutOfOrderness(Duration.ofSeconds(5))
.withTimestampAssigner((event, timestamp) -> event.f2 * 1000))
.keyBy(value -> value.f0)
.window(TumblingEventTimeWindows.of(Time.seconds(10)))
.sum(1)
.print();
env.execute("FlinkDemo04");
}
    private static class SplitterFunction extends RichMapFunction<String, Tuple3<String, Integer, Long>> {
        @Override
        public Tuple3<String, Integer, Long> map(String value) throws Exception {
String[] fields = value.split(",");
return new Tuple3<>(fields[0], 1, Long.valueOf(fields[1]));
}
}
}
package org.example.flink;
import org.apache.flink.api.common.accumulators.LongCounter;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
public class FlinkDemo01 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(5);
env.socketTextStream("vm01", 9999)
.map(new SplitterFunction())
.print();
env.execute("FlinkDemo01");
}
    private static class SplitterFunction extends RichMapFunction<String, Tuple3<String, Integer, Long>> {
        // initialize the counter; one instance per parallel subtask, so not static
        private final LongCounter numLines = new LongCounter();
@Override
public void open(Configuration parameters) throws Exception {
// register the accumulator under a job-wide name
getRuntimeContext().addAccumulator("num-lines", this.numLines);
super.open(parameters);
}
@Override
public Tuple3<String, Integer, Long> map(String value) throws Exception {
String[] fields = value.split(",");
// increment the counter
this.numLines.add(1);
System.out.println("++++");
System.out.println(getRuntimeContext().getAccumulator("num-lines"));
return new Tuple3<>(fields[0], 1, Long.valueOf(fields[1]));
}
}
}
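The System.out calls above only show each subtask's local view. The merged value across all subtasks becomes available from the JobExecutionResult once the job finishes; a minimal sketch for main() (getAccumulatorResult is the standard Flink API; the import is org.apache.flink.api.common.JobExecutionResult):
// env.execute returns the job result, which carries the merged accumulators
JobExecutionResult result = env.execute("FlinkDemo01");
Long totalLines = result.getAccumulatorResult("num-lines");
System.out.println("lines processed: " + totalLines);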
This example uses Flink's DataStream API, consuming a Kafka topic as the data source and sinking the results to Hive (via hive-jdbc). The Maven project must therefore include the following dependencies:
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-java_2.11</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka_2.11</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-jdbc</artifactId>
    <version>${hive.version}</version>
</dependency>
The code is as follows:
1) Create the Hive sink class by extending RichSinkFunction
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
public class HiveWriter extends RichSinkFunction<String> {
private Connection connection = null;
private PreparedStatement pstmt = null;
@Override
public void open(Configuration parameters) throws Exception {
Class.forName("org.apache.hive.jdbc.HiveDriver");
connection = DriverManager.getConnection("jdbc:hive2://host1:2181,host2:2181,host3:2181/;serviceDiscoveryMode=zookeeper;zookeeperNamespace=hiveserver2", "username", "password");
}
@Override
public void invoke(String value, Context context) throws SQLException {
String[] fields = value.split(",");
if (fields.length < 2) return;
        // Hive JDBC: a one-row insert expressed as INSERT ... SELECT
        String sql = "INSERT INTO testDb.testTable SELECT ?,?";
        pstmt = connection.prepareStatement(sql);
        pstmt.setString(1, fields[0]);
        pstmt.setString(2, fields[1]);
        pstmt.executeUpdate();
        pstmt.close();
}
@Override
public void close() throws Exception {
if (pstmt != null) pstmt.close();
if (connection != null) connection.close();
}
}
2) Run the Flink job
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
public class FlinkHiveDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Kafka source, same pattern as in the demos above
        KafkaSource<String> source = KafkaSource.<String>builder()
                .setBootstrapServers("vm01:9092")
                .setTopics("test01")
                .setValueOnlyDeserializer(new SimpleStringSchema())
                .setStartingOffsets(OffsetsInitializer.earliest())
                .build();
        DataStream<String> stream = env.fromSource(source, WatermarkStrategy.noWatermarks(), "kafka_source");
        stream.addSink(new HiveWriter());
        env.execute("FlinkHiveDemo");
    }
}