Getting Started with Flink (5): Joining a Real-Time Stream with an ElasticSearch 6 Dimension Table

Requirements

A real-time stream needs to be joined with a dimension table to enrich its records with additional attributes.
Spark Streaming can join a stream directly against a Hive table,
but Flink offers no equivalent built-in feature, so the dimension table is kept in ElasticSearch instead.
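
For concreteness, the code below assumes record shapes like the following. The field names come from the parsing and query code further down; the concrete values are made up for illustration.

A Kafka message on topic-test:
{"fromCity": "shanghai", "toCity": "beijing", "ticketNum": 10}

A matching dimension document in the ES index trafficwisdom.train_section_percent:
{"from_city": "shanghai", "to_city": "beijing", "section_search_percent": "0.35"}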


Maven dependencies


  
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <flink.version>1.6.2</flink.version>
        <fastjson.version>1.2.47</fastjson.version>
        <elasticsearch.version>6.3.0</elasticsearch.version>
        <guava.version>25.1-jre</guava.version>
    </properties>
     ...

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>${fastjson.version}</version>
        </dependency>

        <dependency>
            <groupId>org.elasticsearch.client</groupId>
            <artifactId>elasticsearch-rest-high-level-client</artifactId>
            <version>${elasticsearch.version}</version>
        </dependency>

        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>${guava.version}</version>
        </dependency>
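The "..." above elides the rest of the pom, which must also declare the Flink artifacts imported by the code below. They are not shown in the original; for Flink 1.6.2 built against Scala 2.11, the usual declarations would be:

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka-0.11_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>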

Note: dimension rows are read from ES by key, and a Guava cache holds recently fetched rows to avoid pulling the same data from ES repeatedly.
Create a class AsyncEsDataRequest that extends RichAsyncFunction.


package com.tc.flink.demo.es;

import com.alibaba.fastjson.JSONObject;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.RemovalListener;
import com.google.common.cache.RemovalNotification;
import com.tc.flink.util.CommonUtil;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.async.ResultFuture;
import org.apache.flink.streaming.api.functions.async.RichAsyncFunction;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;

import java.util.Collections;
import java.util.concurrent.TimeUnit;

public class AsyncEsDataRequest extends RichAsyncFunction<Tuple2<Tuple2<String, String>, Integer>, Tuple2<Tuple2<String, String>, String>> {

    private transient RestHighLevelClient restHighLevelClient;

    private transient volatile Cache<Tuple2<String, String>, String> cityPercent;

    @Override
    public void open(Configuration parameters) throws Exception {
        //Initialize the ElasticSearch client
        restHighLevelClient = CommonUtil.getRestHighLevelClient();
        //Cache setup: at most 10 entries, expiring 5 minutes after write
        cityPercent = CacheBuilder.newBuilder().maximumSize(10).expireAfterWrite(5, TimeUnit.MINUTES)
                .removalListener(
                        //For observing cache behavior during testing; can be removed in production
                        new RemovalListener<Tuple2<String, String>, String>() {
                            @Override
                            public void onRemoval(RemovalNotification<Tuple2<String, String>, String> notification) {
                                System.out.println(notification.getKey() + " was removed, cause: " + notification.getCause());
                            }
                        }
                ).build();
    }

    @Override
    public void close() throws Exception {
        restHighLevelClient.close();
    }


    @Override
    public void asyncInvoke(Tuple2<Tuple2<String, String>, Integer> input, ResultFuture<Tuple2<Tuple2<String, String>, String>> resultFuture) throws Exception {
        Tuple2<String, String> fromToCity = input.f0;
        //If the key is already in the cache, read the value from the cache directly
        String stationPercent = cityPercent.getIfPresent(fromToCity);
        if (stationPercent != null) {
            System.out.println("get data from the cache :" + stationPercent);
            resultFuture.complete(Collections.singleton(new Tuple2<Tuple2<String, String>, String>(input.f0, stationPercent)));
        } else {
            search(input, resultFuture);
        }

    }
    //Query the ES table asynchronously
    private void search(Tuple2<Tuple2<String, String>, Integer> input, ResultFuture<Tuple2<Tuple2<String, String>, String>> resultFuture) {
        //The index holding the dimension table
        SearchRequest searchRequest = new SearchRequest("trafficwisdom.train_section_percent");
        String fromCity = input.f0.f0;
        String toCity = input.f0.f1;
        QueryBuilder builder = QueryBuilders.boolQuery().must(QueryBuilders.termQuery("from_city", fromCity)).must(QueryBuilders.termQuery("to_city", toCity));
        SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
        sourceBuilder.query(builder);
        searchRequest.source(sourceBuilder);
        ActionListener<SearchResponse> listener = new ActionListener<SearchResponse>() {
            //On success
            @Override
            public void onResponse(SearchResponse searchResponse) {
                String stationPercent = null;
                SearchHit[] searchHits = searchResponse.getHits().getHits();
                if (searchHits.length > 0) {
                    JSONObject jsonObject = JSONObject.parseObject(searchHits[0].getSourceAsString());
                    stationPercent = jsonObject.getString("section_search_percent");
                    cityPercent.put(input.f0, stationPercent);
                }
                System.out.println("get data from the es :" + stationPercent);
                resultFuture.complete(Collections.singleton(new Tuple2<Tuple2<String, String>, String>(input.f0, stationPercent)));
            }

            //On failure
            @Override
            public void onFailure(Exception e) {
                resultFuture.complete(Collections.singleton(new Tuple2<Tuple2<String, String>, String>(input.f0, null)));
            }
        };
        restHighLevelClient.searchAsync(searchRequest, listener);
    }
}
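
CommonUtil.getRestHighLevelClient() above is the author's own helper and is not shown in the post. A minimal sketch, assuming a single ES node reachable at localhost:9200 (both the class body and the address are assumptions):

package com.tc.flink.util;

import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;

public class CommonUtil {

    //Hypothetical helper: builds a high-level client against a single local ES node
    public static RestHighLevelClient getRestHighLevelClient() {
        return new RestHighLevelClient(
                RestClient.builder(new HttpHost("localhost", 9200, "http")));
    }
}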

Main program


package com.tc.flink.demo.es;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.tc.flink.conf.KafkaConfig;
import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.AsyncDataStream;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011;

import java.util.Properties;
import java.util.concurrent.TimeUnit;


public class StreamJoinStaticData {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment envStream = StreamExecutionEnvironment.createLocalEnvironment();
        envStream.setStreamTimeCharacteristic(TimeCharacteristic.IngestionTime);
        Properties propsConsumer = new Properties();
        propsConsumer.setProperty("bootstrap.servers", KafkaConfig.KAFKA_BROKER_LIST);
        propsConsumer.setProperty("group.id", "test");
        FlinkKafkaConsumer011<String> consumer = new FlinkKafkaConsumer011<>("topic-test", new SimpleStringSchema(), propsConsumer);
        consumer.setStartFromLatest();
        DataStream<String> stream = envStream.addSource(consumer).setParallelism(2);
        stream.print();
        DataStream<Tuple2<Tuple2<String, String>, Integer>> tuple2Stream = stream.map(s -> {
            JSONObject jsonObject = JSON.parseObject(s);
            String fromCity = jsonObject.getString("fromCity");
            String toCity = jsonObject.getString("toCity");
            Integer ticketNum = jsonObject.getInteger("ticketNum");
            return Tuple2.of(Tuple2.of(fromCity, toCity), ticketNum);
        }).returns(Types.TUPLE(Types.TUPLE(Types.STRING, Types.STRING), Types.INT));

        //Set the timeout generously, otherwise requests tend to fail; this step effectively builds a lookup stream over the ES table
        DataStream<Tuple2<Tuple2<String, String>, String>> dimTable = AsyncDataStream.unorderedWait(tuple2Stream, new AsyncEsDataRequest(), 2, TimeUnit.SECONDS, 100);
        //Join the real-time stream with the ES lookup stream
        DataStream<Tuple3<Tuple2<String, String>, String, Integer>> finalResult = tuple2Stream.join(dimTable).where(new FirstKeySelector()).equalTo(new SecondKeySelector())
                .window(TumblingEventTimeWindows.of(Time.milliseconds(1000))).apply(
                        new JoinFunction<Tuple2<Tuple2<String, String>, Integer>, Tuple2<Tuple2<String, String>, String>, Tuple3<Tuple2<String, String>, String, Integer>>() {
                            @Override
                            public Tuple3<Tuple2<String, String>, String, Integer> join(Tuple2<Tuple2<String, String>, Integer> first, Tuple2<Tuple2<String, String>, String> second) throws Exception {
                                return Tuple3.of(first.f0, second.f1, first.f1);
                            }
                        }
                );

        finalResult.print();

        envStream.execute("this-test");
    }

    private static class FirstKeySelector implements KeySelector<Tuple2<Tuple2<String, String>, Integer>, Tuple2<String, String>> {
        @Override
        public Tuple2<String, String> getKey(Tuple2<Tuple2<String, String>, Integer> value) throws Exception {
            return value.f0;
        }
    }

    private static class SecondKeySelector implements KeySelector<Tuple2<Tuple2<String, String>, String>, Tuple2<String, String>> {
        @Override
        public Tuple2<String, String> getKey(Tuple2<Tuple2<String, String>, String> value) throws Exception {
            return value.f0;
        }
    }

}
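
KafkaConfig.KAFKA_BROKER_LIST above is another helper the post does not show. A minimal stand-in, with a placeholder broker address (both the class body and the address are assumptions):

package com.tc.flink.conf;

//Hypothetical stand-in for the author's KafkaConfig; point it at your own brokers
public class KafkaConfig {
    public static final String KAFKA_BROKER_LIST = "localhost:9092";
}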

That completes the real-time join between a stream and an ElasticSearch dimension table,
with a Guava cache in front of the ES lookups to cut down on repeated reads.
