ETL: extract, transform, load.
A common ETL task in multinational businesses: convert country codes into area (region) codes.
Source data:
{"dt":"2020-08-12 17:19:40","countryCode":"SA","data":[{"type":"s2","score":0.1,"level":"A+"},{"type":"s5","score":0.5,"level":"C"}]}
Country-to-area mapping; the first field is the area code, the second is the country code(s):
areas AREA_US US
areas AREA_CT TW,HK
areas AREA_AR PK,KW,SA
areas AREA_IN IN
The transformed data looks like this:
dt, area (region code), data[]
{"area":"AREA_AR","dt":"2020-08-12 17:20:25","score":0.8,"level":"C","type":"s5"}
1. Generate the source data.
2. Store the area-to-country mapping in Redis (see the redis-cli commands right after this list).
3. In code, read the mapping from Redis and keep it in a HashMap.
4. Send every record of the source data to the Kafka topic allData.
5. Pull the data from the allData topic with a Flink Kafka consumer.
6. Pull the mapping data from Redis with a custom Flink source.
7. Transform the data inside Flink.
8. Write the transformed data to the Kafka topic allDataClean.
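For step 2, the mapping is stored as a Redis hash. It can be initialized with redis-cli; the hash key areas and these fields are exactly what the custom source reads later:

hset areas AREA_US US
hset areas AREA_CT TW,HK
hset areas AREA_AR PK,KW,SA
hset areas AREA_IN IN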
Flume collects the raw data.
The data collected by Flume is sent to the Kafka topic named allData.
Flink reads the allData topic from Kafka and transforms the data in real time.
Once the data is parsed and enriched, a FlinkKafkaProducer writes it to the Kafka topic allDataClean.
For later offline processing, Flume picks up the cleaned data, classifies it, and lands it on disk.
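The last step (classifying the cleaned data and landing it on disk with Flume) is not shown in the rest of this article. A minimal sketch of such an agent, assuming a Flume Kafka source on allDataClean and a plain file_roll sink (agent name, consumer group, and directory below are placeholders; real classification would need interceptors or a partitioned sink on top of this):

agent2.sources = kafkaSource
agent2.channels = memChannel
agent2.sinks = fileSink

# read the cleaned records back out of Kafka
agent2.sources.kafkaSource.type = org.apache.flume.source.kafka.KafkaSource
agent2.sources.kafkaSource.kafka.bootstrap.servers = 192.168.25.129:9092
agent2.sources.kafkaSource.kafka.topics = allDataClean
agent2.sources.kafkaSource.kafka.consumer.group.id = flume-landing
agent2.sources.kafkaSource.channels = memChannel

agent2.channels.memChannel.type = memory

# roll the records into local files for later offline processing
agent2.sinks.fileSink.type = file_roll
agent2.sinks.fileSink.sink.directory = /data/log/allDataClean
agent2.sinks.fileSink.channel = memChannel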
<dependencies>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-java</artifactId>
        <version>1.7.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-java_2.12</artifactId>
        <version>1.7.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-redis_2.11</artifactId>
        <version>1.1.5</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-kafka_2.11</artifactId>
        <version>1.7.2</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.73</version>
    </dependency>
</dependencies>
agent1.sources = source1
agent1.sinks = sink1
agent1.channels = channel1
# Describe/configure tail -F source1
agent1.sources.source1.type = exec
agent1.sources.source1.command = tail -F /data/log/allData.log
agent1.sources.source1.channels = channel1
#configure host for source
agent1.sources.source1.interceptors = i1
agent1.sources.source1.interceptors.i1.type = host
agent1.sources.source1.interceptors.i1.hostHeader = hostname
# Describe sink1 (Kafka sink)
agent1.sinks.sink1.type = org.apache.flume.sink.kafka.KafkaSink
agent1.sinks.sink1.kafka.bootstrap.servers = 192.168.25.129:9092,192.168.25.128:9092,192.168.25.127:9092
agent1.sinks.sink1.kafka.topic = allData
agent1.sinks.sink1.kafka.flumeBatchSize = 20
agent1.sinks.sink1.kafka.producer.acks = 1
agent1.sinks.sink1.kafka.producer.linger.ms = 1
agent1.sinks.sink1.kafka.producer.compression.type = snappy
# Use a channel which buffers events in memory
agent1.channels.channel1.type = memory
agent1.channels.channel1.keep-alive = 120
agent1.channels.channel1.capacity = 500000
agent1.channels.channel1.transactionCapacity = 600
# Bind the source and sink to the channel
agent1.sources.source1.channels = channel1
agent1.sinks.sink1.channel = channel1
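The agent can then be started with the standard flume-ng launcher; the config file name below is a placeholder for wherever this configuration is saved, while the agent name must match agent1:

bin/flume-ng agent --name agent1 --conf conf --conf-file conf/flume-kafka-alldata.conf -Dflume.root.logger=INFO,console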
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Properties;
import java.util.Random;
public class KafkaProducerDemo {
    public static void main(String[] args) throws Exception {
        Properties prop = new Properties();
        // Kafka broker address
        prop.put("bootstrap.servers", "192.168.25.129:9092");
        // key/value serializers
        prop.put("key.serializer", StringSerializer.class.getName());
        prop.put("value.serializer", StringSerializer.class.getName());
        // topic name
        String topic = "allData";
        // create the producer
        KafkaProducer<String, String> producer = new KafkaProducer<>(prop);
        //{"dt":"2018-01-01 10:11:11","countryCode":"US","data":[{"type":"s1","score":0.3,"level":"A"},{"type":"s2","score":0.2,"level":"B"}]}
        // produce one message every two seconds
        while (true) {
            String message = "{\"dt\":\"" + getCurrentTime() + "\",\"countryCode\":\"" + getCountryCode() + "\",\"data\":[{\"type\":\"" + getRandomType() + "\",\"score\":" + getRandomScore() + ",\"level\":\"" + getRandomLevel() + "\"},{\"type\":\"" + getRandomType() + "\",\"score\":" + getRandomScore() + ",\"level\":\"" + getRandomLevel() + "\"}]}";
            System.out.println(message);
            producer.send(new ProducerRecord<>(topic, message));
            Thread.sleep(2000);
        }
        // close the producer (unreachable while the loop runs)
        //producer.close();
    }

    // return the current time as a formatted string
    public static String getCurrentTime() {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        return sdf.format(new Date());
    }

    // return a random country code
    public static String getCountryCode() {
        String[] types = {"US", "TW", "HK", "PK", "KW", "SA", "IN"};
        Random random = new Random();
        int i = random.nextInt(types.length);
        return types[i];
    }

    // return a random type
    public static String getRandomType() {
        String[] types = {"s1", "s2", "s3", "s4", "s5"};
        Random random = new Random();
        int i = random.nextInt(types.length);
        return types[i];
    }

    // return a random score
    public static double getRandomScore() {
        double[] types = {0.3, 0.2, 0.1, 0.5, 0.8};
        Random random = new Random();
        int i = random.nextInt(types.length);
        return types[i];
    }

    // return a random level
    public static String getRandomLevel() {
        String[] types = {"A", "A+", "B", "C", "D"};
        Random random = new Random();
        int i = random.nextInt(types.length);
        return types[i];
    }
}
Read the country/area data from Redis and assemble the area-to-country mapping into a Java HashMap.
Custom source: implement the SourceFunction interface and override its two methods, run and cancel.
run: fetches the data.
cancel: releases resources.
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.exceptions.JedisConnectionException;

import java.util.HashMap;
import java.util.Map;

/**
 * Initialize the data in Redis:
 *
 * hset areas AREA_US US
 * hset areas AREA_CT TW,HK
 * hset areas AREA_AR PK,KW,SA
 * hset areas AREA_IN IN
 *
 * Redis stores the relationship between countries and areas (regions).
 *
 * This source assembles that relationship into a Java HashMap
 * keyed by country code, with the area code as the value.
 */
public class MyRedisSource implements SourceFunction<HashMap<String, String>> {

    private Logger logger = LoggerFactory.getLogger(MyRedisSource.class);

    private final long SLEEP_MILLIS = 60000;

    private boolean isRunning = true;
    private Jedis jedis = null;

    public void run(SourceContext<HashMap<String, String>> ctx) throws Exception {
        this.jedis = new Jedis("192.168.25.129", 6379);
        // holds the mapping of every country code to its area
        HashMap<String, String> keyValueMap = new HashMap<String, String>();
        while (isRunning) {
            try {
                keyValueMap.clear();
                Map<String, String> areas = jedis.hgetAll("areas");
                for (Map.Entry<String, String> entry : areas.entrySet()) {
                    String key = entry.getKey();
                    String value = entry.getValue();
                    String[] splits = value.split(",");
                    for (String split : splits) {
                        keyValueMap.put(split, key);
                    }
                }
                if (keyValueMap.size() > 0) {
                    ctx.collect(keyValueMap);
                } else {
                    logger.warn("The data fetched from Redis is empty!");
                }
                Thread.sleep(SLEEP_MILLIS);
            } catch (JedisConnectionException e) {
                logger.error("Redis connection error, reconnecting", e.getCause());
                jedis = new Jedis("192.168.25.129", 6379);
            } catch (Exception e) {
                logger.error("Source error", e.getCause());
            }
        }
    }

    public void cancel() {
        isRunning = false;
        if (jedis != null) {
            jedis.close();
        }
    }
}
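A quick way to sanity-check the source on its own, assuming Redis is reachable, is a tiny Flink job that just prints whatever the source emits (the class name LocalRedisSourceTest is illustrative only):

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class LocalRedisSourceTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // MyRedisSource emits a HashMap<countryCode, areaCode> roughly once a minute
        env.addSource(new MyRedisSource()).print();
        env.execute("MyRedisSource sanity check");
    }
}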
bin/kafka-topics.sh --create --zookeeper 192.168.25.129:2181 --replication-factor 1 --partitions 1 --topic allData
bin/kafka-topics.sh --create --zookeeper 192.168.25.129:2181 --replication-factor 1 --partitions 1 --topic allDataClean
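To confirm that both topics exist, the topic list can be checked against the same ZooKeeper address:

bin/kafka-topics.sh --list --zookeeper 192.168.25.129:2181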
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoFlatMapFunction;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer;
import org.apache.flink.streaming.util.serialization.KeyedSerializationSchemaWrapper;
import org.apache.flink.util.Collector;
import java.util.HashMap;
import java.util.Properties;
/**
 * Commands to create the Kafka topics:
 * bin/kafka-topics.sh --create --topic allData --zookeeper localhost:2181 --partitions 5 --replication-factor 1
 * bin/kafka-topics.sh --create --topic allDataClean --zookeeper localhost:2181 --partitions 5 --replication-factor 1
 *
 * Data-cleaning requirement: enrich each record with its area code.
 *
 * This class wires all the pieces together.
 */
public class DataClean {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // set the parallelism to match the number of partitions of the Kafka topic
        env.setParallelism(5);

        // checkpoint configuration
        env.enableCheckpointing(60000);
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        env.getCheckpointConfig().setMinPauseBetweenCheckpoints(30000);
        env.getCheckpointConfig().setCheckpointTimeout(10000);
        env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
        env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
        // state backend (optional)
        //env.setStateBackend(new RocksDBStateBackend("hdfs://192.168.25.129:9000/flink/checkpoints",true));

        // Kafka source
        String topic = "allData";
        Properties prop = new Properties();
        prop.setProperty("bootstrap.servers", "192.168.25.129:9092");
        prop.setProperty("group.id", "con1");
        FlinkKafkaConsumer<String> myConsumer = new FlinkKafkaConsumer<>(topic, new SimpleStringSchema(), prop);

        // read the raw data from Kafka
        //{"dt":"2018-01-01 11:11:11","countryCode":"US","data":[{"type":"s1","score":0.3,"level":"A"},{"type":"s2","score":0.1,"level":"B"}]}
        DataStreamSource<String> data = env.addSource(myConsumer);
        // data.print();

        // latest mapping between country codes and areas;
        // broadcast() sends it to every parallel instance of the downstream operator
        DataStream<HashMap<String, String>> mapData = env.addSource(new MyRedisSource()).broadcast();

        DataStream<String> resData = data.connect(mapData).flatMap(new CoFlatMapFunction<String, HashMap<String, String>, String>() {

            // holds the country-to-area mapping
            private HashMap<String, String> allMap = new HashMap<String, String>();

            // flatMap1 processes the records coming from Kafka
            public void flatMap1(String value, Collector<String> out) throws Exception {
                JSONObject jsonObject = JSONObject.parseObject(value);
                String dt = jsonObject.getString("dt");
                String countryCode = jsonObject.getString("countryCode");
                // look up the area for this country code
                String area = allMap.get(countryCode);
                JSONArray jsonArray = jsonObject.getJSONArray("data");
                for (int i = 0; i < jsonArray.size(); i++) {
                    JSONObject jsonObject1 = jsonArray.getJSONObject(i);
                    System.out.println("area:----" + area + "--------------");
                    jsonObject1.put("area", area);
                    jsonObject1.put("dt", dt);
                    out.collect(jsonObject1.toJSONString());
                }
            }

            // flatMap2 processes the map emitted by the Redis source
            public void flatMap2(HashMap<String, String> value, Collector<String> out) throws Exception {
                this.allMap = value;
            }
        });
        // resData.print();

        String outTopic = "allDataClean";
        Properties outprop = new Properties();
        outprop.setProperty("bootstrap.servers", "192.168.25.129:9092");
        // Option 1: raise the transaction timeout passed to the Kafka producer
        outprop.setProperty("transaction.timeout.ms", 60000 * 15 + "");
        // Option 2: raise transaction.max.timeout.ms on the Kafka brokers instead
        // FlinkKafkaProducer<String> myProducer = new FlinkKafkaProducer<>(outTopic, new KeyedSerializationSchemaWrapper<>(new SimpleStringSchema()), outprop, FlinkKafkaProducer.Semantic.EXACTLY_ONCE);
        // resData.addSink(myProducer);

        FlinkKafkaProducer<String> myProducer = new FlinkKafkaProducer<String>("192.168.25.129:9092", outTopic, new SimpleStringSchema());
        resData.addSink(myProducer);

        env.execute("DataClean");
    }
}
bin/kafka-console-consumer.sh --bootstrap-server 192.168.25.129:9092 --from-beginning --topic allData
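To verify the cleaned output as well, the same consumer can be pointed at the output topic written by the Flink job:

bin/kafka-console-consumer.sh --bootstrap-server 192.168.25.129:9092 --from-beginning --topic allDataClean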