This case study builds on the previous post. I am writing it up because most of what you find online simply calls the API, and an optimized custom sink is nowhere to be found. Obviously, in production we cannot settle for a tutorial-style sink that merely manages to insert data into HBase: performance matters a great deal for a production system.
Let's get started.
1. Preparation
1.1 Add the required dependencies
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-hbase_2.11</artifactId>
    <version>1.10.1</version>
</dependency>
<dependency>
    <groupId>xyz.downgoon</groupId>
    <artifactId>snowflake</artifactId>
    <version>1.0.0</version>
</dependency>
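The main class below also uses FlinkKafkaConsumer011. If the Kafka connector is not already on your classpath from the previous post, you would presumably also need something like the following (version assumed to match the Flink version above):
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka-0.11_2.11</artifactId>
    <version>1.10.1</version>
</dependency>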
2. Code implementation
2.1 The main class
package com.nfdw;
import com.nfdw.entity.Employees;
import com.nfdw.sink.MyHBaseSinkFunction;
import com.nfdw.utils.*;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011;
import java.util.Date;
import java.util.Properties;
public class App {
public static void main(String[] args) throws Exception {
// 1. Obtain the execution environment
StreamExecutionEnvironment env = GetStreamExecutionEnvironment.getEnv();
// Kafka consumer properties
Properties prop = new Properties();
prop.setProperty("bootstrap.servers","cdh101:9092");
prop.setProperty("group.id","cloudera_mirrormaker");
prop.setProperty("value.deserializer","org.apache.kafka.common.serialization.StringDeserializer");
FlinkKafkaConsumer011<String> myConsumer = new FlinkKafkaConsumer011<>("luchangyin", new SimpleStringSchema(), prop);
myConsumer.setStartFromLatest(); // start from the latest offsets
// consume from Kafka
DataStreamSource<String> dataStream = env.addSource(myConsumer);
//dataStream.print(); // {"id":"226","name":"tang tang - 226","sal":280751,"dept":"美女部","ts":1615191802523}
DataStream<Employees> result = dataStream.map(new MapFunction<String, Employees>() {
@Override
public Employees map(String s) throws Exception {
Employees emp = MyJsonUtils.str2JsonObj(s);
emp.setEmpStartTime(new Date(emp.getTs()));
emp.setDt(MyDateUtils.getDate2Second(emp.getEmpStartTime()));
return emp;
}
});
result.print();
// Employees(eId=257, eName=fei fei - 257, eSal=97674.0, eDept=美女部, ts=1615251002894, empStartTime=Tue Mar 09 08:50:02 GMT+08:00 2021, dt=2021-03-09)
// 2. Write to HBase with the custom sink
result.addSink(new MyHBaseSinkFunction());
env.execute("wo xi huan ni");
}
}
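The Employees entity and the MyJsonUtils / MyDateUtils helpers come from the previous post and are not repeated here. Purely for reference, here is a minimal sketch of what the code above assumes the entity looks like; the field names and types are inferred from the printed sample and the sink code, the JSON field mapping is presumably handled inside MyJsonUtils, and the real class should be adjusted accordingly:
package com.nfdw.entity;

import lombok.Data;
import java.util.Date;

@Data
public class Employees {
    private Integer eId;        // "id"   -> 257
    private String eName;       // "name" -> "fei fei - 257"
    private Double eSal;        // "sal"  -> 97674.0
    private String eDept;       // "dept" -> "美女部"
    private Long ts;            // "ts"   -> 1615251002894
    private Date empStartTime;  // set in the map function from ts
    private String dt;          // set in the map function, e.g. "2021-03-09"
}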
2.1.1 Obtaining the execution environment
package com.nfdw.utils;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
public class GetStreamExecutionEnvironment {
public static StreamExecutionEnvironment getEnv(){
// obtain the Flink execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// checkpoint configuration
// trigger a checkpoint every 5000 ms
env.enableCheckpointing(5000);
env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime);
// exactly-once checkpointing mode
env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
// a checkpoint must complete within one minute or it is discarded
env.getCheckpointConfig().setCheckpointTimeout(60000);
// keep at least 500 ms between checkpoints
env.getCheckpointConfig().setMinPauseBetweenCheckpoints(500);
// allow only one checkpoint at a time
env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
// retain externalized checkpoints when the job is cancelled
env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
// state backend
//env.setStateBackend(new FsStateBackend("hdfs://10.122.1.112:40010/flink/checkpoints", false));
// parallelism
env.setParallelism(3);
return env;
}
}
2.1.2 The custom HBase sink
package com.nfdw.sink;
import com.nfdw.entity.Employees;
import com.nfdw.utils.MyDateUtils;
import com.nfdw.utils.SnowflakeIdUtil;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.MD5Hash;
public class MyHBaseSinkFunction extends RichSinkFunction<Employees> {
private transient Connection conn = null;
private transient Table table = null;
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
org.apache.hadoop.conf.Configuration conf = HBaseConfiguration.create();
// connect to the HBase cluster (ZooKeeper quorum)
conf.set("hbase.zookeeper.quorum", "10.122.1.112");
conf.set("hbase.zookeeper.property.clientPort", "2181");
if (null == conn) {
this.conn = ConnectionFactory.createConnection(conf);
}
}
@Override
public void invoke(Employees value, Context context) throws Exception {
// table name
TableName tableName = TableName.valueOf("employees");
// obtain the table object
table = conn.getTable(tableName);
// generate the rowkey
String pkId = String.valueOf(SnowflakeIdUtil.getdidId(SnowflakeIdUtil.DCD_SNOWFLAKE)); // msg.getPkId()
byte[] originKey = Bytes.toBytes(pkId);
// prefix with only the first 4 characters of the MD5 so the rowkey does not get too long
String md5AsHex = MD5Hash.getMD5AsHex(originKey).substring(0, 4);
String rowkey = md5AsHex + pkId;
Put put = new Put(Bytes.toBytes(rowkey));
// column family, column qualifier, value
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("eId"), Bytes.toBytes(String.valueOf(value.getEId())));
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("eName"), Bytes.toBytes(value.getEName()));
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("eSal"), Bytes.toBytes(value.getESal()));
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("eDept"), Bytes.toBytes(value.getEDept()));
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("ts"), Bytes.toBytes(value.getTs()));
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("empStartTime"), Bytes.toBytes(MyDateUtils.getDate2Str(value.getEmpStartTime())));
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("dt"), Bytes.toBytes(value.getDt()));
table.put(put);
// release the lightweight Table handle obtained for this record
table.close();
}
@Override
public void close() throws Exception {
super.close();
if (table != null){
table.close();
}
if (conn != null){
conn.close();
}
}
}
2.1.3 Snowflake utility class for generating rowkeys
package com.nfdw.utils;
import xyz.downgoon.snowflake.Snowflake;
public class SnowflakeIdUtil {
/** group id and worker id for the Snowflake generator */
public static long groupId = Long.parseLong("6");
public static long workId = Long.parseLong("10");
public static Snowflake DCD_SNOWFLAKE = new Snowflake(groupId, workId);
/**
 * Return the next snowflake id
 * @return the generated id
 */
public static long getdidId(Snowflake snowflake){
return snowflake.nextId();
}
}
3. Run the program and check the results
3.1 First create the corresponding HBase table and column family
hbase(main):108:0* create 'employees','cf'
3.2 Run the program and inspect the result
hbase(main):115:0* scan "employees"
4. Refactoring and optimization
The hbaseSink above simply inserts records one at a time, which does not hold up well under large data volumes and is inefficient, so we rework it to buffer mutations and write them to HBase in batches. The refactored code is as follows:
package com.nfdw.sink;
import com.nfdw.entity.Employees;
import com.nfdw.utils.MyDateUtils;
import com.nfdw.utils.SnowflakeIdUtil;
import lombok.extern.slf4j.Slf4j;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.state.FunctionInitializationContext;
import org.apache.flink.runtime.state.FunctionSnapshotContext;
import org.apache.flink.runtime.util.ExecutorThreadFactory;
import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.TableNotFoundException;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.MD5Hash;
import java.io.IOException;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
@Slf4j
public class MyHBaseSinkFunction02 extends RichSinkFunction<Employees> implements CheckpointedFunction, BufferedMutator.ExceptionListener {
private transient Connection conn = null;
private transient BufferedMutator mutator;
private String qualifier;
private long bufferFlushMaxMutations;
private transient ScheduledExecutorService executor;
private transient ScheduledFuture<?> scheduledFuture;
private transient AtomicLong numPendingRequests;
private transient volatile boolean closed = false;
private final AtomicReference<Throwable> failureThrowable = new AtomicReference<>();
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
org.apache.hadoop.conf.Configuration conf = HBaseConfiguration.create();
String zkQuorum = "10.122.1.112";
String clientPort = "2181";
String tableName = "employees";
// tune the following three parameters according to the target environment
long bufferFlushMaxSizeInBytes = 10000L;   // flush when the write buffer reaches this many bytes
long bufferFlushIntervalMillis = 1L;       // periodic flush interval, in milliseconds
bufferFlushMaxMutations = 10L;             // flush after this many buffered mutations
qualifier = "cf"; // column family (Bytes.toBytes("cf"))
if (zkQuorum == null || tableName == null || bufferFlushMaxSizeInBytes <= 0L || bufferFlushMaxMutations <= 0L || qualifier == null) {
throw new RuntimeException("please set legal property, check zkQuorum/tableName/bufferFlushMaxSizeInBytes/bufferFlushMaxMutations/qualifier property");
}
// connect to the HBase cluster (ZooKeeper quorum)
conf.set("hbase.zookeeper.quorum", zkQuorum);
conf.set("hbase.zookeeper.property.clientPort", clientPort);
try {
this.numPendingRequests = new AtomicLong(0);
if (null == conn) {
this.conn = ConnectionFactory.createConnection(conf);
}
BufferedMutatorParams params = new BufferedMutatorParams(TableName.valueOf(tableName))
.listener(this)
.writeBufferSize(bufferFlushMaxSizeInBytes);
mutator = conn.getBufferedMutator(params);
if (bufferFlushIntervalMillis > 0) {
this.executor = Executors.newScheduledThreadPool(
1, new ExecutorThreadFactory("hbase-sink-flusher"));
this.scheduledFuture = this.executor.scheduleWithFixedDelay(() -> {
if (closed) {
return;
}
try {
flush();
} catch (Exception e) {
failureThrowable.compareAndSet(null, e);
}
}, bufferFlushIntervalMillis, bufferFlushIntervalMillis, TimeUnit.MILLISECONDS);
}
} catch (TableNotFoundException tnfe) {
log.error("The table " + tableName + " not found ", tnfe);
throw new RuntimeException("HBase table '" + tableName + "' not found.", tnfe);
} catch (IOException ioe) {
log.error("Exception while creating connection to HBase.", ioe);
throw new RuntimeException("Cannot create connection to HBase.", ioe);
}
}
@Override
public void invoke(Employees value, Context context) throws Exception {
checkErrorAndRethrow();
// generate the rowkey (the target table is already bound to the BufferedMutator)
String pkId = String.valueOf(SnowflakeIdUtil.getdidId(SnowflakeIdUtil.DCD_SNOWFLAKE)); // msg.getPkId()
byte[] originKey = Bytes.toBytes(pkId);
// prefix with only the first 4 characters of the MD5 so the rowkey does not get too long
String md5AsHex = MD5Hash.getMD5AsHex(originKey).substring(0, 4);
String rowkey = md5AsHex + pkId;
Put put = new Put(Bytes.toBytes(rowkey));
// column family, column qualifier, value
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("eId"), Bytes.toBytes(String.valueOf(value.getEId())));
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("eName"), Bytes.toBytes(value.getEName()));
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("eSal"), Bytes.toBytes(value.getESal()));
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("eDept"), Bytes.toBytes(value.getEDept()));
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("ts"), Bytes.toBytes(value.getTs()));
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("empStartTime"), Bytes.toBytes(MyDateUtils.getDate2Str(value.getEmpStartTime())));
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("dt"), Bytes.toBytes(value.getDt()));
mutator.mutate(put);
// flush when the buffer number of mutations greater than the configured max size.
if (bufferFlushMaxMutations > 0 && numPendingRequests.incrementAndGet() >= bufferFlushMaxMutations) {
flush();
}
}
private void flush() throws Exception {
mutator.flush();
numPendingRequests.set(0);
checkErrorAndRethrow();
}
private void checkErrorAndRethrow() {
Throwable cause = failureThrowable.get();
if (cause != null) {
throw new RuntimeException("An error occurred in HBaseSink.", cause);
}
}
@Override
public void close() throws Exception {
closed = true;
if (mutator != null) {
try {
mutator.close();
} catch (IOException e) {
log.warn("Exception occurs while closing HBase BufferedMutator.", e);
}
this.mutator = null;
}
if (conn != null) {
try {
conn.close();
} catch (IOException e) {
log.warn("Exception occurs while closing HBase Connection.", e);
}
this.conn = null;
}
if (scheduledFuture != null) {
scheduledFuture.cancel(false);
if (executor != null) {
executor.shutdownNow();
}
}
}
@Override
public void snapshotState(FunctionSnapshotContext functionSnapshotContext) throws Exception {
while (numPendingRequests.get() != 0) {
flush();
}
}
@Override
public void initializeState(FunctionInitializationContext functionInitializationContext) throws Exception {
// nothing to restore; pending mutations are flushed in snapshotState
}
@Override
public void onException(RetriesExhaustedWithDetailsException e, BufferedMutator bufferedMutator) throws RetriesExhaustedWithDetailsException {
failureThrowable.compareAndSet(null, e);
}
}
After this refactoring, we only need to adjust those parameters for each environment. Running the program produces the same result as before, but with large data volumes the writes are noticeably faster.
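As a sketch of how those parameters might be supplied from the command line instead of being hard-coded in open(), here is a minimal example using Flink's ParameterTool; the property names are made up for illustration and the defaults simply mirror the demo values above:
import org.apache.flink.api.java.utils.ParameterTool;

// inside main(String[] args), e.g.:
//   flink run ... --hbase.zkQuorum 10.122.1.112 --hbase.flushMaxMutations 1000
ParameterTool params = ParameterTool.fromArgs(args);
String zkQuorum = params.get("hbase.zkQuorum", "10.122.1.112");
String tableName = params.get("hbase.tableName", "employees");
long bufferFlushMaxSizeInBytes = params.getLong("hbase.flushMaxSizeInBytes", 10000L);
long bufferFlushIntervalMillis = params.getLong("hbase.flushIntervalMillis", 1L);
long bufferFlushMaxMutations = params.getLong("hbase.flushMaxMutations", 10L);
// these values could then be handed to MyHBaseSinkFunction02 through a constructor
// instead of the literals currently hard-coded in open()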