1. Map (map)
map is a transformation operator familiar from other big-data frameworks. It transforms each element of a data stream to produce a new stream. Simply put, it is a one-to-one mapping: one element is consumed and exactly one element is produced, as shown in Figure 5-5.
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class TransMapTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStreamSource<Event> stream = env.fromElements(
                new Event("Mary", "./home", 1000L),
                new Event("Bob", "./cart", 2000L)
        );

        // Pass in an anonymous class that implements MapFunction
        stream.map(new MapFunction<Event, String>() {
            @Override
            public String map(Event e) throws Exception {
                return e.user;
            }
        });

        // Pass in an implementation class of MapFunction
        stream.map(new UserExtractor()).print();

        env.execute();
    }

    public static class UserExtractor implements MapFunction<Event, String> {
        @Override
        public String map(Event e) throws Exception {
            return e.user;
        }
    }
}
The map() method on DataStream has the following signature; it returns a SingleOutputStreamOperator of the new element type:

public <R> SingleOutputStreamOperator<R> map(MapFunction<T, R> mapper)
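Because MapFunction has a single abstract method, the same extraction of the user field can also be written with a lambda expression. This is a minimal sketch, not part of the original example, and it reuses the stream of Event defined above; the caveats of using lambdas with generic return types are discussed in the UDF section below.

// A minimal sketch: the lambda's return type (String) is simple enough for Flink to infer
stream.map(e -> e.user).print();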
2. Filter (filter)
The filter transformation, as its name suggests, filters the data stream. The filter condition is a boolean expression evaluated for every element: if it returns true the element is passed on normally, and if it returns false the element is discarded, as shown in Figure 5-6.
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class TransFilterTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStreamSource<Event> stream = env.fromElements(
                new Event("Mary", "./home", 1000L),
                new Event("Bob", "./cart", 2000L)
        );

        // Pass in an anonymous class that implements FilterFunction
        stream.filter(new FilterFunction<Event>() {
            @Override
            public boolean filter(Event e) throws Exception {
                return e.user.equals("Mary");
            }
        });

        // Pass in an implementation class of FilterFunction
        stream.filter(new UserFilter()).print();

        env.execute();
    }

    public static class UserFilter implements FilterFunction<Event> {
        @Override
        public boolean filter(Event e) throws Exception {
            return e.user.equals("Mary");
        }
    }
}
3. FlatMap (flatMap)
flatMap, or flat mapping, consumes one element and produces zero or more output elements, which are emitted through a Collector; it can be thought of as a map followed by a flatten.
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
public class TransFlatmapTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStreamSource<Event> stream = env.fromElements(
                new Event("Mary", "./home", 1000L),
                new Event("Bob", "./cart", 2000L)
        );

        stream.flatMap(new MyFlatMap()).print();

        env.execute();
    }

    public static class MyFlatMap implements FlatMapFunction<Event, String> {
        @Override
        public void flatMap(Event value, Collector<String> out) throws Exception {
            // For "Mary" emit only the user; for "Bob" emit both the user and the url
            if (value.user.equals("Mary")) {
                out.collect(value.user);
            } else if (value.user.equals("Bob")) {
                out.collect(value.user);
                out.collect(value.url);
            }
        }
    }
}
1. Partitioning by Key (keyBy)
We can use the user field (the user name) as the key to partition the stream. The code is as follows:
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class TransKeyByTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStreamSource<Event> stream = env.fromElements(
                new Event("Mary", "./home", 1000L),
                new Event("Bob", "./cart", 2000L)
        );

        // Using a lambda expression
        KeyedStream<Event, String> keyedStream = stream.keyBy(e -> e.user);

        // Using an anonymous class that implements KeySelector
        KeyedStream<Event, String> keyedStream1 = stream.keyBy(new KeySelector<Event, String>() {
            @Override
            public String getKey(Event e) throws Exception {
                return e.user;
            }
        });

        env.execute();
    }
}
2. Simple Aggregation
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
public class TransTupleAggreationTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStreamSource<Tuple2<String, Integer>> stream = env.fromElements(
                Tuple2.of("a", 1),
                Tuple2.of("a", 3),
                Tuple2.of("b", 3),
                Tuple2.of("b", 4)
        );

        // The aggregation field can be specified by position or by field name
        stream.keyBy(r -> r.f0).sum(1).print();
        stream.keyBy(r -> r.f0).sum("f1").print();
        stream.keyBy(r -> r.f0).max(1).print();
        stream.keyBy(r -> r.f0).max("f1").print();
        stream.keyBy(r -> r.f0).min(1).print();
        stream.keyBy(r -> r.f0).min("f1").print();
        stream.keyBy(r -> r.f0).maxBy(1).print();
        stream.keyBy(r -> r.f0).maxBy("f1").print();
        stream.keyBy(r -> r.f0).minBy(1).print();
        stream.keyBy(r -> r.f0).minBy("f1").print();

        env.execute();
    }
}
If the stream's data type is a POJO class, the aggregation field can only be specified by field name, not by position.
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class TransPojoAggregationTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStreamSource<Event> stream = env.fromElements(
                new Event("Mary", "./home", 1000L),
                new Event("Bob", "./cart", 2000L)
        );

        stream.keyBy(e -> e.user).max("timestamp").print();   // specify the field by name

        env.execute();
    }
}
3. Reduce Aggregation (reduce)
The reduce transformation requires a ReduceFunction, whose interface is defined as follows:

public interface ReduceFunction<T> extends Function, Serializable {
    T reduce(T value1, T value2) throws Exception;
}
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class TransReduceTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // ClickSource() is the custom source defined in the earlier section on custom sources
        env.addSource(new ClickSource())
                // Convert the Event type into a tuple type
                .map(new MapFunction<Event, Tuple2<String, Long>>() {
                    @Override
                    public Tuple2<String, Long> map(Event e) throws Exception {
                        return Tuple2.of(e.user, 1L);
                    }
                })
                .keyBy(r -> r.f0)   // partition the stream by user name
                .reduce(new ReduceFunction<Tuple2<String, Long>>() {
                    @Override
                    public Tuple2<String, Long> reduce(Tuple2<String, Long> value1,
                                                       Tuple2<String, Long> value2) throws Exception {
                        // For every incoming record, add 1 to the user's pv count
                        return Tuple2.of(value1.f0, value1.f1 + value2.f1);
                    }
                })
                .keyBy(r -> true)   // assign the same key to every record so all results go to one stream
                .reduce(new ReduceFunction<Tuple2<String, Long>>() {
                    @Override
                    public Tuple2<String, Long> reduce(Tuple2<String, Long> value1,
                                                       Tuple2<String, Long> value2) throws Exception {
                        // Keep the record with the largest pv count and emit it downstream
                        return value1.f1 > value2.f1 ? value1 : value2;
                    }
                })
                .print();

        env.execute();
    }
}
1. Function Classes
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class TransFunctionUDFTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStreamSource<Event> clicks = env.fromElements(
                new Event("Mary", "./home", 1000L),
                new Event("Bob", "./cart", 2000L)
        );

        DataStream<Event> stream = clicks.filter(new FlinkFilter());
        stream.print();

        env.execute();
    }

    public static class FlinkFilter implements FilterFunction<Event> {
        @Override
        public boolean filter(Event value) throws Exception {
            return value.url.contains("home");
        }
    }
}
Of course, the FilterFunction interface can also be implemented with an anonymous class:
DataStream<Event> stream = clicks.filter(new FilterFunction<Event>() {
    @Override
    public boolean filter(Event value) throws Exception {
        return value.url.contains("home");
    }
});
To make the class more reusable, we can also extract the filter keyword "home" as a class field and pass it in through the constructor:
DataStream<Event> stream = clicks.filter(new KeyWordFilter("home"));

public static class KeyWordFilter implements FilterFunction<Event> {
    private String keyWord;

    KeyWordFilter(String keyWord) { this.keyWord = keyWord; }

    @Override
    public boolean filter(Event value) throws Exception {
        return value.url.contains(this.keyWord);
    }
}
2. Anonymous Functions (Lambda)
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class TransFunctionLambdaTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStreamSource<Event> clicks = env.fromElements(
                new Event("Mary", "./home", 1000L),
                new Event("Bob", "./cart", 2000L)
        );

        // map() with a lambda expression: the return type is a simple type,
        // so no explicit type declaration is needed
        DataStream<String> stream1 = clicks.map(event -> event.url);
        stream1.print();

        env.execute();
    }
}
// flatMap() with a lambda expression: this throws an exception
DataStream<String> stream2 = clicks.flatMap((event, out) -> {
    out.collect(event.url);
});
stream2.print();
If we run this program, Flink throws the following exception:
org.apache.flink.api.common.functions.InvalidTypesException: The generic type parameters of 'Collector' are missing.
In many cases lambda methods don't provide enough information for automatic type extraction when Java generics are involved.
An easy workaround is to use an (anonymous) class instead that implements the 'org.apache.flink.api.common.functions.FlatMapFunction' interface.
Otherwise the type has to be specified explicitly using type information.
// flatMap() with a lambda expression: the return type must be declared explicitly via returns()
DataStream<String> stream2 = clicks.flatMap((Event event, Collector<String> out) -> {
    out.collect(event.url);
}).returns(Types.STRING);
stream2.print();
A similar problem occurs when map() returns one of Flink's own tuple types. In the example below, the declared return type Tuple2<String, Long> is erased to a raw Tuple2 at runtime, so Flink cannot infer the types of the tuple's fields.
// map() runs into the same problem; the following code fails
DataStream<Tuple2<String, Long>> stream3 = clicks
        .map(event -> Tuple2.of(event.user, 1L));
stream3.print();
In general, this problem can be solved in several ways:
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class ReturnTypeResolve {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStreamSource<Event> clicks = env.fromElements(
                new Event("Mary", "./home", 1000L),
                new Event("Bob", "./cart", 2000L)
        );

        // To convert to a two-field tuple type, one of the following is required:
        // 1) use an explicit ".returns(...)"
        DataStream<Tuple2<String, Long>> stream3 = clicks
                .map(event -> Tuple2.of(event.user, 1L))
                .returns(Types.TUPLE(Types.STRING, Types.LONG));
        stream3.print();

        // 2) use an implementation class instead of a lambda expression
        clicks.map(new MyTuple2Mapper())
                .print();

        // 3) use an anonymous class instead of a lambda expression
        clicks.map(new MapFunction<Event, Tuple2<String, Long>>() {
            @Override
            public Tuple2<String, Long> map(Event value) throws Exception {
                return Tuple2.of(value.user, 1L);
            }
        }).print();

        env.execute();
    }

    // A custom implementation class of MapFunction
    public static class MyTuple2Mapper implements MapFunction<Event, Tuple2<String, Long>> {
        @Override
        public Tuple2<String, Long> map(Event value) throws Exception {
            return Tuple2.of(value.user, 1L);
        }
    }
}
These approaches work for other cases of generic type erasure as well.
3. Rich Function Classes
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class RichFunctionTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(2);

        DataStreamSource<Event> clicks = env.fromElements(
                new Event("Mary", "./home", 1000L),
                new Event("Bob", "./cart", 2000L),
                new Event("Alice", "./prod?id=1", 5 * 1000L),
                new Event("Cary", "./home", 60 * 1000L)
        );

        // Convert each click event into its timestamp (a Long) and print it
        clicks.map(new RichMapFunction<Event, Long>() {
            @Override
            public void open(Configuration parameters) throws Exception {
                super.open(parameters);
                System.out.println("Subtask " + getRuntimeContext().getIndexOfThisSubtask() + " started");
            }

            @Override
            public Long map(Event value) throws Exception {
                return value.timestamp;
            }

            @Override
            public void close() throws Exception {
                super.close();
                System.out.println("Subtask " + getRuntimeContext().getIndexOfThisSubtask() + " finished");
            }
        }).print();

        env.execute();
    }
}
The output is:
Subtask 0 started
Subtask 1 started
1> 1000
2> 2000
2> 60000
1> 5000
Subtask 0 finished
Subtask 1 finished
public class MyFlatMap extends RichFlatMapFunction<IN, OUT> {
    @Override
    public void open(Configuration configuration) {
        // Do some setup work here,
        // e.g. establish a connection to MySQL
    }

    @Override
    public void flatMap(IN in, Collector<OUT> out) {
        // Read from and write to the database
    }

    @Override
    public void close() {
        // Cleanup work, e.g. close the connection to MySQL
    }
}
1. Random Partitioning (shuffle)
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class ShuffleTest {
    public static void main(String[] args) throws Exception {
        // Create the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // Read from the source with parallelism 1
        DataStreamSource<Event> stream = env.addSource(new ClickSource());

        // Print after shuffling, with parallelism 4
        stream.shuffle().print("shuffle").setParallelism(4);

        env.execute();
    }
}
2. Round-Robin Partitioning (rebalance)
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class RebalanceTest {
    public static void main(String[] args) throws Exception {
        // Create the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // Read from the source with parallelism 1
        DataStreamSource<Event> stream = env.addSource(new ClickSource());

        // Print after round-robin repartitioning, with parallelism 4
        stream.rebalance().print("rebalance").setParallelism(4);

        env.execute();
    }
}
3. Rescale Partitioning (rescale)
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;

public class RescaleTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // Use the rich-function version of the parallel source here,
        // so that getRuntimeContext() can be called to access runtime context information
        env
                .addSource(new RichParallelSourceFunction<Integer>() {
                    @Override
                    public void run(SourceContext<Integer> sourceContext) throws Exception {
                        for (int i = 0; i < 8; i++) {
                            // Send odd numbers to the parallel subtask with index 1
                            // and even numbers to the parallel subtask with index 0
                            if ((i + 1) % 2 == getRuntimeContext().getIndexOfThisSubtask()) {
                                sourceContext.collect(i + 1);
                            }
                        }
                    }

                    @Override
                    public void cancel() {
                    }
                })
                .setParallelism(2)
                .rescale()
                .print().setParallelism(4);

        env.execute();
    }
}
Here the rescale() method is used to redistribute the data. The output is:
4> 3
3> 1
1> 2
1> 6
3> 5
4> 7
2> 4
2> 8
You can replace the rescale() call with rebalance() to see the difference between the two methods.
4. Broadcast (broadcast)
Strictly speaking, this should not be called "repartitioning": after broadcasting, every partition keeps a copy of the data, so elements may be processed repeatedly. Calling the broadcast() method of DataStream copies the input data and sends it to all parallel tasks of the downstream operator. The test code is as follows:
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class BroadcastTest {
    public static void main(String[] args) throws Exception {
        // Create the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // Read from the source with parallelism 1
        DataStreamSource<Event> stream = env.addSource(new ClickSource());

        // Print after broadcasting, with parallelism 4
        stream.broadcast().print("broadcast").setParallelism(4);

        env.execute();
    }
}
5. Global Partitioning (global)
Global partitioning is another special kind of partitioning. It is quite extreme: calling the .global() method sends all input data to the first parallel subtask of the downstream operator, which effectively forces the downstream parallelism down to 1. Use this operation with great care, as it can put a lot of pressure on the program.
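As a minimal sketch (not part of the source text, modeled on the shuffle and rebalance examples above, so ClickSource and Event are assumed to be the same classes used there), global partitioning could be tested like this:

import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class GlobalTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // Read from the source with parallelism 1
        DataStreamSource<Event> stream = env.addSource(new ClickSource());

        // After global(), every record goes to the first downstream subtask,
        // so even with print parallelism 4 only one subtask produces output
        stream.global().print("global").setParallelism(4);

        env.execute();
    }
}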
6. Custom Partitioning (custom)
import org.apache.flink.api.common.functions.Partitioner;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class CustomPartitionTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // Partition the natural numbers by odd/even
        env.fromElements(1, 2, 3, 4, 5, 6, 7, 8)
                .partitionCustom(new Partitioner<Integer>() {
                    @Override
                    public int partition(Integer key, int numPartitions) {
                        return key % 2;
                    }
                }, new KeySelector<Integer, Integer>() {
                    @Override
                    public Integer getKey(Integer value) throws Exception {
                        return value;
                    }
                })
                .print().setParallelism(2);

        env.execute();
    }
}
In fact, the print() method we have been calling is itself a Sink: it simply wraps a PrintSinkFunction and passes it to addSink(), as its implementation shows:

public DataStreamSink<T> print(String sinkIdentifier) {
    PrintSinkFunction<T> printFunction = new PrintSinkFunction<>(sinkIdentifier, false);
    return addSink(printFunction).name("Print to Std. Out");
}
Much like with Source operators, apart from a few Sinks that Flink implements out of the box, a Sink operator is generally created by calling the .addSink() method of DataStream:
stream.addSink(new SinkFunction(…));
Within SinkFunction, the key method to implement is invoke(), which is called once for every record of the stream:

default void invoke(IN value, Context context) throws Exception
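As a minimal sketch (an illustration, not from the source text, assuming stream is a DataStream of Event as in the examples above), a SinkFunction can be implemented inline with an anonymous class; here it simply writes every record to standard error:

stream.addSink(new SinkFunction<Event>() {
    @Override
    public void invoke(Event value, Context context) throws Exception {
        // One side effect per record; a real sink would write to an external system here
        System.err.println(value);
    }
});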
import org.apache.flink.api.common.serialization.SimpleStringEncoder;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink;
import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.DefaultRollingPolicy;

import java.util.concurrent.TimeUnit;

public class SinkToFileTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(4);

        DataStreamSource<Event> stream = env.fromElements(
                new Event("Mary", "./home", 1000L),
                new Event("Bob", "./cart", 2000L),
                new Event("Alice", "./prod?id=100", 3000L),
                new Event("Alice", "./prod?id=200", 3500L),
                new Event("Bob", "./prod?id=2", 2500L),
                new Event("Alice", "./prod?id=300", 3600L),
                new Event("Bob", "./home", 3000L),
                new Event("Bob", "./prod?id=1", 2300L),
                new Event("Bob", "./prod?id=3", 3300L)
        );

        StreamingFileSink<String> fileSink = StreamingFileSink
                .<String>forRowFormat(new Path("./output"), new SimpleStringEncoder<>("UTF-8"))
                .withRollingPolicy(
                        DefaultRollingPolicy.builder()
                                .withRolloverInterval(TimeUnit.MINUTES.toMillis(15))
                                .withInactivityInterval(TimeUnit.MINUTES.toMillis(5))
                                .withMaxPartSize(1024 * 1024 * 1024)
                                .build())
                .build();

        // Convert each Event to a String and write it to the file sink
        stream.map(Event::toString).addSink(fileSink);

        env.execute();
    }
}
Here we create a simple file Sink and specify a "rolling policy" via the .withRollingPolicy() method. "Rolling" is a familiar concept from log file writing: since content is written to a file continuously, we need a rule that decides when to start a new file and archive the previous one. The code above rolls the part files in any of the following three situations:
⚫ the file contains at least 15 minutes of data;
⚫ no new data has been received for the last 5 minutes;
⚫ the file size has reached 1 GB.
Now we want to write the data out to Kafka. This closes the loop of the whole data pipeline, so we can run a complete test as follows:
(1) Add the Kafka connector dependency
Since we have already tested reading from a Kafka source, the connector dependency has been added; it is not repeated here.
(2) Start the Kafka cluster
(3) Write the example code for outputting to Kafka
We can simply read the user behavior data saved in the file clicks.csv and, without any transformation, write it to Kafka under the topic named "clicks".
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer;

import java.util.Properties;

public class SinkToKafkaTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        Properties properties = new Properties();
        properties.put("bootstrap.servers", "hadoop102:9092");

        DataStreamSource<String> stream = env.readTextFile("input/clicks.csv");
        stream
                .addSink(new FlinkKafkaProducer<String>(
                        "clicks",
                        new SimpleStringSchema(),
                        properties
                ));

        env.execute();
    }
}
(4) Run the code, then start a console consumer to check whether the data has been received:
bin/kafka-console-consumer.sh --bootstrap-server hadoop102:9092 --topic clicks
(1) Add the JDBC connector and MySQL driver dependencies
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-jdbc_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.47</version>
</dependency>
(2) Start MySQL and create the clicks table in the database the sink will write to
mysql> create table clicks(
-> user varchar(20) not null,
-> url varchar(100) not null);
(3) Write the example code for outputting to MySQL
import org.apache.flink.connector.jdbc.JdbcConnectionOptions;
import org.apache.flink.connector.jdbc.JdbcExecutionOptions;
import org.apache.flink.connector.jdbc.JdbcSink;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class SinkToMySQL {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStreamSource<Event> stream = env.fromElements(
                new Event("Mary", "./home", 1000L),
                new Event("Bob", "./cart", 2000L),
                new Event("Alice", "./prod?id=100", 3000L),
                new Event("Alice", "./prod?id=200", 3500L),
                new Event("Bob", "./prod?id=2", 2500L),
                new Event("Alice", "./prod?id=300", 3600L),
                new Event("Bob", "./home", 3000L),
                new Event("Bob", "./prod?id=1", 2300L),
                new Event("Bob", "./prod?id=3", 3300L)
        );

        stream.addSink(
                JdbcSink.sink(
                        "INSERT INTO clicks (user, url) VALUES (?, ?)",
                        (statement, r) -> {
                            statement.setString(1, r.user);
                            statement.setString(2, r.url);
                        },
                        JdbcExecutionOptions.builder()
                                .withBatchSize(1000)
                                .withBatchIntervalMs(200)
                                .withMaxRetries(5)
                                .build(),
                        new JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
                                .withUrl("jdbc:mysql://localhost:3306/userbehavior")
                                // For MySQL 5.7, use "com.mysql.jdbc.Driver"
                                .withDriverName("com.mysql.cj.jdbc.Driver")
                                .withUsername("username")
                                .withPassword("password")
                                .build()
                )
        );

        env.execute();
    }
}
(4) Run the code, connect to MySQL with a client, and check whether the data has been written successfully.
mysql> select * from clicks;
+-------+---------------+
| user  | url           |
+-------+---------------+
| Mary  | ./home        |
| Alice | ./prod?id=300 |
| Bob   | ./prod?id=3   |
+-------+---------------+
3 rows in set (0.00 sec)
What if we want to store the data in our own storage system, for which Flink does not provide a ready-made connector? Just as with Sources, Flink provides the generic SinkFunction interface and the corresponding RichSinkFunction abstract class; by implementing one of them and simply calling the .addSink() method of DataStream, we can write to any external storage. The connections to external systems shown earlier were really connectors implementing SinkFunction for us; since there is no ready-made connector here, we have to do it ourselves. For example, Flink does not provide an HBase connector, so we need to write one.
When implementing SinkFunction, the key method to override is invoke(), where we put the logic that sends the stream's data out.
Here we use the rich-function version of SinkFunction, because we again need the lifecycle methods: creating the HBase connection belongs in open(), and closing it belongs in close().
(1) Import the dependency
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>${hbase.version}</version>
</dependency>
(2) Write the example code for outputting to HBase
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;

import java.nio.charset.StandardCharsets;

public class SinkCustomtoHBase {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        env
                .fromElements("hello", "world")
                .addSink(new RichSinkFunction<String>() {
                    // Holds the HBase configuration; referenced by its full path here
                    // to avoid the name clash with Flink's own Configuration class
                    public org.apache.hadoop.conf.Configuration configuration;
                    // Manages the HBase connection
                    public Connection connection;

                    @Override
                    public void open(Configuration parameters) throws Exception {
                        super.open(parameters);
                        configuration = HBaseConfiguration.create();
                        configuration.set("hbase.zookeeper.quorum", "hadoop102:2181");
                        connection = ConnectionFactory.createConnection(configuration);
                    }

                    @Override
                    public void invoke(String value, Context context) throws Exception {
                        Table table = connection.getTable(TableName.valueOf("test"));  // table named "test"
                        Put put = new Put("rowkey".getBytes(StandardCharsets.UTF_8));  // specify the rowkey
                        put.addColumn("info".getBytes(StandardCharsets.UTF_8),         // column family
                                value.getBytes(StandardCharsets.UTF_8),                // column qualifier (the stream element)
                                "1".getBytes(StandardCharsets.UTF_8));                 // cell value
                        table.put(put);   // execute the put operation
                        table.close();    // close the table
                    }

                    @Override
                    public void close() throws Exception {
                        super.close();
                        connection.close();   // close the connection
                    }
                });

        env.execute();
    }
}
(3) Check the inserted data in HBase.
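For example (an illustrative check, not from the source text), you can open the HBase shell and scan the test table that the code above writes to:

hbase shell
scan 'test'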