A real-time stream usually needs to be joined against a dimension table to enrich its attributes. Spark Streaming can join a stream directly with a Hive table; I have not found an equivalent in Flink, so the dimension table is kept in Elasticsearch instead and queried from the stream through Flink's async I/O. The relevant Maven settings and dependencies:
<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <flink.version>1.6.2</flink.version>
    <fastjson.version>1.2.47</fastjson.version>
    <elasticsearch.version>6.3.0</elasticsearch.version>
    <guava.version>25.1-jre</guava.version>
</properties>

<dependencies>
    <!-- ... Flink core/connector dependencies elided in the original ... -->
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>${fastjson.version}</version>
    </dependency>
    <dependency>
        <groupId>org.elasticsearch.client</groupId>
        <artifactId>elasticsearch-rest-high-level-client</artifactId>
        <version>${elasticsearch.version}</version>
    </dependency>
    <dependency>
        <groupId>com.google.guava</groupId>
        <artifactId>guava</artifactId>
        <version>${guava.version}</version>
    </dependency>
</dependencies>
Note: the dimension data is looked up in ES by key, and a Guava cache sits in front of ES so that repeated keys do not trigger repeated fetches.
Create a class AsyncEsDataRequest that extends RichAsyncFunction. The original listing breaks off inside open(), right after the removal listener is introduced; everything from that point onward below is a reconstruction of the evident intent (check the cache first, then query ES asynchronously), and the index and field names ("city_dim", "percent") are placeholders to adapt to your own mapping.
package com.tc.flink.demo.es;

import com.alibaba.fastjson.JSONObject;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.RemovalListener;
import com.google.common.cache.RemovalNotification;
import com.tc.flink.util.CommonUtil;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.async.ResultFuture;
import org.apache.flink.streaming.api.functions.async.RichAsyncFunction;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import java.util.Collections;
import java.util.concurrent.TimeUnit;

public class AsyncEsDataRequest extends RichAsyncFunction<Tuple2<Tuple2<String, String>, Integer>, Tuple2<Tuple2<String, String>, String>> {
    private transient RestHighLevelClient restHighLevelClient;
    private transient volatile Cache<Tuple2<String, String>, String> cityPercent;

    @Override
    public void open(Configuration parameters) throws Exception {
        // Initialize the Elasticsearch client
        restHighLevelClient = CommonUtil.getRestHighLevelClient();
        // Cache at most 10 entries for 5 minutes; the removal listener only logs evictions for observation and can be dropped in production
        cityPercent = CacheBuilder.<Tuple2<String, String>, String>newBuilder().maximumSize(10).expireAfterWrite(5, TimeUnit.MINUTES)
                .removalListener(new RemovalListener<Tuple2<String, String>, String>() {
                    @Override
                    public void onRemoval(RemovalNotification<Tuple2<String, String>, String> notification) {
                        System.out.println(notification.getKey() + " removed, cause: " + notification.getCause());
                    }
                }).build();
    }

    @Override
    public void asyncInvoke(Tuple2<Tuple2<String, String>, Integer> input, ResultFuture<Tuple2<Tuple2<String, String>, String>> resultFuture) {
        // Serve repeated keys from the Guava cache instead of querying ES again
        String cached = cityPercent.getIfPresent(input.f0);
        if (cached != null) {
            resultFuture.complete(Collections.singleton(Tuple2.of(input.f0, cached)));
            return;
        }
        // Index and field names ("city_dim", "percent") are placeholders; adjust to your mapping
        SearchRequest searchRequest = new SearchRequest("city_dim");
        QueryBuilder queryBuilder = QueryBuilders.boolQuery()
                .must(QueryBuilders.termQuery("fromCity", input.f0.f0))
                .must(QueryBuilders.termQuery("toCity", input.f0.f1));
        searchRequest.source(new SearchSourceBuilder().query(queryBuilder));
        // Asynchronous lookup: the ActionListener completes Flink's ResultFuture
        restHighLevelClient.searchAsync(searchRequest, new ActionListener<SearchResponse>() {
            @Override
            public void onResponse(SearchResponse response) {
                SearchHit[] hits = response.getHits().getHits();
                String percent = hits.length > 0 ? JSONObject.parseObject(hits[0].getSourceAsString()).getString("percent") : "";
                cityPercent.put(input.f0, percent);
                resultFuture.complete(Collections.singleton(Tuple2.of(input.f0, percent)));
            }
            @Override
            public void onFailure(Exception e) {
                // Emit an empty attribute on failure so the record is not lost
                resultFuture.complete(Collections.singleton(Tuple2.of(input.f0, "")));
            }
        });
    }
}
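CommonUtil.getRestHighLevelClient() is referenced above but not shown in the original. A minimal sketch, assuming a single Elasticsearch node on localhost:9200 (the host, port, and the method body are assumptions; only the class and method names come from the code above):

package com.tc.flink.util;

import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;

public class CommonUtil {

    // Hypothetical helper assumed by AsyncEsDataRequest; point host/port at your own cluster
    public static RestHighLevelClient getRestHighLevelClient() {
        return new RestHighLevelClient(RestClient.builder(new HttpHost("localhost", 9200, "http")));
    }
}

Because AsyncEsDataRequest builds the client in open(), each parallel subtask gets its own client instance, which is the usual lifecycle for rich functions. Next comes the job that wires everything together: the Kafka source, the async ES lookup, and the window join.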
package com.tc.flink.demo.es;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.tc.flink.conf.KafkaConfig;
import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.AsyncDataStream;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011;
import java.util.Properties;
import java.util.concurrent.TimeUnit;
public class StreamJoinStaticData {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment envStream = StreamExecutionEnvironment.createLocalEnvironment();
        envStream.setStreamTimeCharacteristic(TimeCharacteristic.IngestionTime);

        Properties propsConsumer = new Properties();
        propsConsumer.setProperty("bootstrap.servers", KafkaConfig.KAFKA_BROKER_LIST);
        propsConsumer.setProperty("group.id", "test");
        FlinkKafkaConsumer011<String> consumer = new FlinkKafkaConsumer011<>("topic-test", new SimpleStringSchema(), propsConsumer);
        consumer.setStartFromLatest();

        DataStream<String> stream = envStream.addSource(consumer).setParallelism(2);
        stream.print();

        // Parse each JSON record into ((fromCity, toCity), ticketNum)
        DataStream<Tuple2<Tuple2<String, String>, Integer>> tuple2Stream = stream.map(s -> {
            JSONObject jsonObject = JSON.parseObject(s);
            String fromCity = jsonObject.getString("fromCity");
            String toCity = jsonObject.getString("toCity");
            Integer ticketNum = jsonObject.getInteger("ticketNum");
            return Tuple2.of(Tuple2.of(fromCity, toCity), ticketNum);
        }).returns(Types.TUPLE(Types.TUPLE(Types.STRING, Types.STRING), Types.INT));

        // Set the async timeout generously, or slow lookups will surface as errors.
        // This step effectively builds a stream that reads the ES table.
        DataStream<Tuple2<Tuple2<String, String>, String>> dimTable =
                AsyncDataStream.unorderedWait(tuple2Stream, new AsyncEsDataRequest(), 2, TimeUnit.SECONDS, 100);

        // Join the real-time stream with the ES lookup stream
        DataStream<Tuple3<Tuple2<String, String>, String, Integer>> finalResult =
                tuple2Stream.join(dimTable).where(new FirstKeySelector()).equalTo(new SecondKeySelector())
                        .window(TumblingEventTimeWindows.of(Time.milliseconds(1000))).apply(
                                new JoinFunction<Tuple2<Tuple2<String, String>, Integer>, Tuple2<Tuple2<String, String>, String>, Tuple3<Tuple2<String, String>, String, Integer>>() {
                                    @Override
                                    public Tuple3<Tuple2<String, String>, String, Integer> join(Tuple2<Tuple2<String, String>, Integer> first, Tuple2<Tuple2<String, String>, String> second) throws Exception {
                                        return Tuple3.of(first.f0, second.f1, first.f1);
                                    }
                                }
                        );
        finalResult.print();
        envStream.execute("this-test");
    }

    private static class FirstKeySelector implements KeySelector<Tuple2<Tuple2<String, String>, Integer>, Tuple2<String, String>> {
        @Override
        public Tuple2<String, String> getKey(Tuple2<Tuple2<String, String>, Integer> value) throws Exception {
            return value.f0;
        }
    }

    private static class SecondKeySelector implements KeySelector<Tuple2<Tuple2<String, String>, String>, Tuple2<String, String>> {
        @Override
        public Tuple2<String, String> getKey(Tuple2<Tuple2<String, String>, String> value) throws Exception {
            return value.f0;
        }
    }
}
This completes the join of a real-time stream with an Elasticsearch dimension table, with a Guava cache in front of ES to cut down repeated reads.
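For a quick local smoke test, push a record matching the schema implied by the map() function into topic-test. The class below is illustrative (its name and the city codes are made up; KafkaConfig is the same constants holder used above), and it assumes the ES dimension index already holds documents with fromCity, toCity, and a percent field:

package com.tc.flink.demo.es;

import com.tc.flink.conf.KafkaConfig;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

import java.util.Properties;

public class SendTestRecord {

    public static void main(String[] args) {
        Properties props = new Properties();
        props.setProperty("bootstrap.servers", KafkaConfig.KAFKA_BROKER_LIST);
        props.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        KafkaProducer<String, String> producer = new KafkaProducer<>(props);
        // Field names match what StreamJoinStaticData parses; the values are placeholders
        producer.send(new ProducerRecord<>("topic-test", "{\"fromCity\":\"SHA\",\"toCity\":\"PEK\",\"ticketNum\":2}"));
        producer.close();
    }
}

Each joined element printed by finalResult is then a Tuple3 of the city pair, the percent attribute fetched from ES, and the ticket count.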