Flink: A Real-Time ETL Case Study

ETL: extract, transform, load.

Requirement

Map country codes to area (region) codes, a common need in cross-border businesses.

Format

Source data:

{"dt":"2020-08-12 17:19:40","countryCode":"SA","data":[{"type":"s2","score":0.1,"level":"A+"},{"type":"s5","score":0.5,"level":"C"}]}

Country-to-area mapping; the second column is the area code, the third column lists the country codes it covers:

areass   AREA_US   US
areass   AREA_CT   TW,HK
areass   AREA_AR   PK,KW,SA
areass   AREA_IN   IN

Transformed output: dt, the area code, and the fields of each element of data[] (one output record per element):

{"area":"AREA_AR","dt":"2020-08-12 17:20:25","score":0.8,"level":"C","type":"s5"}

Implementation approach

1. Generate the source data.

2. Store the area-to-country mapping in Redis (the initialization commands are shown after this list).

3. In code, read the mapping from Redis and keep it in a HashMap.

4. Send every source record to Kafka, into the allData topic.

5. Pull the records from the allData topic with a Flink Kafka consumer.

6. Pull the mapping from Redis with a custom Flink source.

7. Transform the records in Flink.

8. Write the transformed records back to Kafka, into the allDataClean topic.
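
For step 2, the mapping lives in a Redis hash named areas. It can be initialized with the same commands that also appear in the MyRedisSource comments later on:

hset areas AREA_US US
hset areas AREA_CT TW,HK
hset areas AREA_AR PK,KW,SA
hset areas AREA_IN IN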

Analysis

  • Flume collects the data.

  • Flume sends the collected data to the Kafka topic allData.

  • Flink reads the allData topic and transforms the records in real time.

  • Once a record is parsed, a FlinkKafkaProducer writes it to the Kafka topic allDataClean.

  • For later offline processing, Flume also categorizes the data and writes it to disk.

Dependencies

<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-java</artifactId>
    <version>1.7.2</version>
</dependency>

<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-java_2.11</artifactId>
    <version>1.7.2</version>
</dependency>

<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-redis_2.11</artifactId>
    <version>1.1.5</version>
</dependency>

<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka_2.11</artifactId>
    <version>1.7.2</version>
</dependency>

<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.73</version>
</dependency>
Implementation

Flume configuration file: file-kafka-allData.conf

agent1.sources = source1
agent1.sinks = sink1
agent1.channels = channel1

# Describe/configure the tail -F source
agent1.sources.source1.type = exec
agent1.sources.source1.command = tail -F /data/log/allData.log
agent1.sources.source1.channels = channel1

# Configure a host interceptor for the source
agent1.sources.source1.interceptors = i1
agent1.sources.source1.interceptors.i1.type = host
agent1.sources.source1.interceptors.i1.hostHeader = hostname

# Describe the Kafka sink
agent1.sinks.sink1.type = org.apache.flume.sink.kafka.KafkaSink
agent1.sinks.sink1.kafka.bootstrap.servers = 192.168.25.129:9092,192.168.25.128:9092,192.168.25.127:9092
agent1.sinks.sink1.kafka.topic = allData
agent1.sinks.sink1.kafka.flumeBatchSize = 20
agent1.sinks.sink1.kafka.producer.acks = 1
agent1.sinks.sink1.kafka.producer.linger.ms = 1
agent1.sinks.sink1.kafka.producer.compression.type = snappy

# Use a channel which buffers events in memory
agent1.channels.channel1.type = memory
agent1.channels.channel1.keep-alive = 120
agent1.channels.channel1.capacity = 500000
agent1.channels.channel1.transactionCapacity = 600

# Bind the source and sink to the channel
agent1.sources.source1.channels = channel1
agent1.sinks.sink1.channel = channel1
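
Assuming the file is saved under Flume's conf directory, the agent can be started with:

bin/flume-ng agent --conf conf --conf-file conf/file-kafka-allData.conf --name agent1 -Dflume.root.logger=INFO,console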

Simulating the streaming data source

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Properties;
import java.util.Random;

public class KafkaProducerDemo {
    public static void main(String[] args) throws Exception {
        Properties prop = new Properties();
        // Kafka broker address
        prop.put("bootstrap.servers", "192.168.25.129:9092");
        // key/value serializers
        prop.put("key.serializer", StringSerializer.class.getName());
        prop.put("value.serializer", StringSerializer.class.getName());
        // target topic
        String topic = "allData";

        // create the producer
        KafkaProducer<String, String> producer = new KafkaProducer<>(prop);

        //{"dt":"2018-01-01 10:11:11","countryCode":"US","data":[{"type":"s1","score":0.3,"level":"A"},{"type":"s2","score":0.2,"level":"B"}]}

        // produce one message every two seconds
        while (true) {
            String message = "{\"dt\":\"" + getCurrentTime() + "\",\"countryCode\":\"" + getCountryCode() + "\",\"data\":[{\"type\":\"" + getRandomType() + "\",\"score\":" + getRandomScore() + ",\"level\":\"" + getRandomLevel() + "\"},{\"type\":\"" + getRandomType() + "\",\"score\":" + getRandomScore() + ",\"level\":\"" + getRandomLevel() + "\"}]}";
            System.out.println(message);
            producer.send(new ProducerRecord<>(topic, message));
            Thread.sleep(2000);
        }
        // close the producer (never reached in this endless demo loop)
        //producer.close();
    }

    public static String getCurrentTime() {
        // return the current time formatted as "yyyy-MM-dd HH:mm:ss"
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        return sdf.format(new Date());
    }

    // pick a random country code
    public static String getCountryCode() {
        String[] types = {"US", "TW", "HK", "PK", "KW", "SA", "IN"};
        Random random = new Random();
        int i = random.nextInt(types.length);
        return types[i];
    }


    public static String getRandomType() {
        String[] types = {"s1", "s2", "s3", "s4", "s5"};
        Random random = new Random();
        int i = random.nextInt(types.length);
        return types[i];
    }

    // pick a random score
    public static double getRandomScore() {
        double[] types = {0.3, 0.2, 0.1, 0.5, 0.8};
        Random random = new Random();
        int i = random.nextInt(types.length);
        return types[i];
    }

    // pick a random level
    public static String getRandomLevel() {
        String[] types = {"A", "A+", "B", "C", "D"};
        Random random = new Random();
        int i = random.nextInt(types.length);
        return types[i];
    }

}

Custom RedisSource

Reads the country/area data from Redis and assembles the country-to-area mapping into a Java HashMap.

A custom source implements the SourceFunction interface and overrides two methods:

run: fetches and emits the data

cancel: releases resources

import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.exceptions.JedisConnectionException;

import java.util.HashMap;
import java.util.Map;

/**
 *
 * Data initialization in Redis:
 *
 * hset areas AREA_US US
 * hset areas AREA_CT TW,HK
 * hset areas AREA_AR PK,KW,SA
 * hset areas AREA_IN IN
 *
 * Redis stores the area-to-country relationship;
 * this source assembles it into a Java HashMap of country -> area.
 *
 */
public class MyRedisSource implements SourceFunction<HashMap<String, String>> {
    private Logger logger = LoggerFactory.getLogger(MyRedisSource.class);

    private final long SLEEP_MILLION = 60000;

    private boolean isRunning = true;
    private Jedis jedis = null;

    public void run(SourceContext<HashMap<String, String>> ctx) throws Exception {

        this.jedis = new Jedis("192.168.25.129", 6379);
        // holds the country -> area mapping
        HashMap<String, String> keyValueMap = new HashMap<>();
        while (isRunning){
            try{
                keyValueMap.clear();
                Map<String, String> areas = jedis.hgetAll("areas");
                for (Map.Entry<String, String> entry: areas.entrySet()) {
                    String key = entry.getKey();
                    String value = entry.getValue();
                    String[] splits = value.split(",");
                    for (String split: splits) {
                        keyValueMap.put(split,key);
                    }
                }
                if(keyValueMap.size()>0){
                    ctx.collect(keyValueMap);
                }else{
                    logger.warn("从redis中获取的数据为空!!!");
                }
                Thread.sleep(SLEEP_MILLION);
            }catch (JedisConnectionException e){
                logger.error("Lost the Redis connection, reconnecting", e);
                jedis = new Jedis("192.168.25.129", 6379);
            }catch (Exception e){
                logger.error("Unexpected error in the Redis source", e);
            }

        }

    }

    public void cancel() {
        isRunning = false;
        if(jedis!=null){
            jedis.close();
        }
    }
}
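
To sanity-check the source on its own, a minimal test driver (the class name MyRedisSourceTest is illustrative) simply prints whatever mapping the source emits, once per minute:

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class MyRedisSourceTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // print every HashMap emitted by the Redis source
        env.addSource(new MyRedisSource()).print();
        env.execute("MyRedisSourceTest");
    }
}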

Flink data transformation

Create the two Kafka topics first:

bin/kafka-topics.sh --create --zookeeper 192.168.25.129:2181 --replication-factor 1 --partitions 5 --topic allData

bin/kafka-topics.sh --create --zookeeper 192.168.25.129:2181 --replication-factor 1 --partitions 5 --topic allDataClean

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoFlatMapFunction;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer;
import org.apache.flink.streaming.util.serialization.KeyedSerializationSchemaWrapper;
import org.apache.flink.util.Collector;

import java.util.HashMap;
import java.util.Properties;

/**
 *
 * Commands used to create the Kafka topics:
 * bin/kafka-topics.sh  --create --topic allData --zookeeper localhost:2181 --partitions 5 --replication-factor 1
 * bin/kafka-topics.sh  --create --topic allDataClean --zookeeper localhost:2181 --partitions 5 --replication-factor 1
 *
 *
 * Data cleaning job: wires the Kafka source, the Redis mapping source
 * and the transformation together.
 *
 */
public class DataClean {

    public static void main(String[] args) throws Exception{

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // set the parallelism to match the number of Kafka topic partitions
        env.setParallelism(5);

        // checkpoint configuration
        env.enableCheckpointing(60000);
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        env.getCheckpointConfig().setMinPauseBetweenCheckpoints(30000);
        env.getCheckpointConfig().setCheckpointTimeout(10000);
        env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
        env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);

        // state backend (optional, e.g. RocksDB on HDFS)

        //env.setStateBackend(new RocksDBStateBackend("hdfs://192.168.25.129:9000/flink/checkpoints",true));


        // Kafka source
        String topic = "allData";
        Properties prop = new Properties();
        prop.setProperty("bootstrap.servers","192.168.25.129:9092");
        prop.setProperty("group.id","con1");
        FlinkKafkaConsumer<String> myConsumer = new FlinkKafkaConsumer<>(topic, new SimpleStringSchema(), prop);

        // read the raw records from Kafka
        //{"dt":"2018-01-01 11:11:11","countryCode":"US","data":[{"type":"s1","score":0.3,"level":"A"},{"type":"s2","score":0.1,"level":"B"}]}
        DataStreamSource<String> data = env.addSource(myConsumer);

//        data.print();
        // latest country-code -> area mapping, broadcast to every parallel instance of the downstream operator
        DataStream<HashMap<String, String>> mapData = env.addSource(new MyRedisSource()).broadcast();

        DataStream<String> resData = data.connect(mapData).flatMap(new CoFlatMapFunction<String, HashMap<String, String>, String>() {


            // country -> area mapping, refreshed by flatMap2
            private HashMap<String, String> allMap = new HashMap<>();

            // flatMap1 handles the records coming from Kafka
            public void flatMap1(String value, Collector<String> out) throws Exception {
                JSONObject jsonObject = JSONObject.parseObject(value);
                String dt = jsonObject.getString("dt");
                String countryCode = jsonObject.getString("countryCode");
                // look up the area for this country code
                String area = allMap.get(countryCode);

                JSONArray jsonArray = jsonObject.getJSONArray("data");
                for (int i = 0; i < jsonArray.size(); i++) {
                    JSONObject jsonObject1 = jsonArray.getJSONObject(i);
                    System.out.println("area:----"+area+"--------------");
                    jsonObject1.put("area", area);
                    jsonObject1.put("dt", dt);
                    out.collect(jsonObject1.toJSONString());
                }
            }

            // flatMap2 handles the mapping emitted by the Redis source
            public void flatMap2(HashMap<String, String> value, Collector<String> out) throws Exception {
                this.allMap = value;
            }
        });
//        resData.print();

        String outTopic = "allDataClean";
        Properties outprop = new Properties();
        outprop.setProperty("bootstrap.servers","192.168.25.129:9092");
        // Option 1 for EXACTLY_ONCE semantics: set the producer's transaction timeout
        outprop.setProperty("transaction.timeout.ms", 60000 * 15 + "");

        // Option 2: raise the broker-side maximum transaction timeout
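        // e.g. on every broker, in server.properties: transaction.max.timeout.ms=3600000
        // (the broker default is 900000 ms / 15 min; it must be >= the producer's transaction.timeout.ms)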

//        FlinkKafkaProducer<String> myProducer = new FlinkKafkaProducer<>(outTopic, new KeyedSerializationSchemaWrapper<>(new SimpleStringSchema()), outprop, FlinkKafkaProducer.Semantic.EXACTLY_ONCE);
//        resData.addSink(myProducer);

        // at-least-once producer writing to the allDataClean topic
        FlinkKafkaProducer<String> myProducer = new FlinkKafkaProducer<>("192.168.25.129:9092", outTopic, new SimpleStringSchema());
        resData.addSink(myProducer);


        env.execute("DataClean");

    }


}
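
Assuming the project is packaged as a fat jar (the jar name below is hypothetical), the job can be submitted with:

bin/flink run -c DataClean flink-etl-demo.jar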

To check the raw records flowing into Kafka:

bin/kafka-console-consumer.sh --bootstrap-server 192.168.25.129:9092 --from-beginning --topic allData
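
The cleaned output can then be checked by consuming the sink topic:

bin/kafka-console-consumer.sh --bootstrap-server 192.168.25.129:9092 --from-beginning --topic allDataClean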
