I. Flink 1.11 introduced CDC support. In the official documentation we can find:
1. Canal: https://ci.apache.org/projects/flink/flink-docs-release-1.11/zh/dev/table/connectors/formats/canal.html
2. Debezium: https://ci.apache.org/projects/flink/flink-docs-release-1.11/zh/dev/table/connectors/formats/debezium.html
In addition, the community provides:
3. mysql-cdc: https://github.com/ververica/flink-cdc-connectors/wiki/MySQL-CDC-Connector
4. changelog-json: https://github.com/ververica/flink-cdc-connectors/wiki/Changelog-JSON-Format
So in this post I will write up my notes and walk through a typical canal-json example, a mysql-cdc example, and a changelog-json example.
II. First, the use cases. They all revolve around MySQL data changes: the binlog is read so that changes are available in real time.
1. Read the binlog, aggregate the data, and write the results to Kafka.
2. Previously, dimension-table joins relied on caching or on periodically reloading the dimension data, which is not very fresh; with mysql-cdc the dimension table can be kept up to date in real time.
3. Data synchronization.
III. Code examples, one by one
1. canal-json
Required dependency:
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-json</artifactId>
    <version>${flink.version}</version>
</dependency>
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;

public class Cannal2kafkaTest {

    // print sink used for ad-hoc debugging
    private static final String PRINT_SINK_SQL = "create table sink_print ( \n" +
            " aaa DECIMAL(10, 2) \n" +
            ") with ('connector' = 'print' )";

    // Kafka source in canal-json format, fed by Canal from the MySQL products table
    private static final String CANAL_JSON_SQL = "CREATE TABLE topic_products (" +
            " id BIGINT," +
            " name STRING," +
            " description STRING," +
            " weight DECIMAL(10, 2)" +
            ") WITH (" +
            " 'connector' = 'kafka'," +
            " 'topic' = 'products_test'," +
//            " 'topic' = 'products'," +
            " 'properties.bootstrap.servers' = 'dev-ct6-dc-worker01:9092,dev-ct6-dc-worker02:9092,dev-ct6-dc-worker03:9092'," +
            " 'properties.group.id' = 'test1'," +
            " 'format' = 'canal-json'," +
            " 'scan.startup.mode' = 'earliest-offset'" +
            ")";

    // plain JSON Kafka source, kept here for comparison (not used below)
    private static final String ODS_SQL = "CREATE TABLE ods_topic (\n" +
            " user_id VARCHAR ," +
            " item_id VARCHAR," +
            " category_id VARCHAR," +
            " behavior VARCHAR," +
            " proctime TIMESTAMP(3)," +
            " ts VARCHAR" +
            ") WITH (" +
            " 'connector' = 'kafka'," +
            " 'topic' = 'ods_kafka'," +
            " 'properties.bootstrap.servers' = 'dev-ct6-dc-worker01:9092,dev-ct6-dc-worker02:9092,dev-ct6-dc-worker03:9092'," +
            " 'properties.group.id' = 'test1'," +
            " 'format' = 'json'," +
            " 'scan.startup.mode' = 'earliest-offset'" +
            ")";

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment bsEnv = StreamExecutionEnvironment.getExecutionEnvironment();
        EnvironmentSettings bsSettings = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build();
        StreamTableEnvironment tEnv = StreamTableEnvironment.create(bsEnv, bsSettings);
        bsEnv.enableCheckpointing(5000);
        bsEnv.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        tEnv.executeSql(CANAL_JSON_SQL);
        tEnv.executeSql(PRINT_SINK_SQL);
//        tEnv.executeSql(ODS_SQL);
//        tEnv.executeSql("insert into sink_print select sum(weight) as last_values from topic_products");

        // aggregate over the changelog and print the retract stream
        Table table = tEnv.sqlQuery("select id, sum(weight) from topic_products group by id");
        tEnv.toRetractStream(table, Row.class).print("output:");

        bsEnv.execute("canal-json-test");
    }
}
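For reference, the records in the products_test topic need to be in Canal's JSON envelope. A trimmed-down sample for the products table above might look like this (the values are made up for illustration, and Canal emits a few more metadata fields that are omitted here):

{
  "data": [
    { "id": 111, "name": "scooter", "description": "Big 2-wheel scooter", "weight": 5.18 }
  ],
  "old": [
    { "weight": 5.15 }
  ],
  "type": "UPDATE",
  "database": "inventory",
  "table": "products",
  "isDdl": false
}

The canal-json format turns such a record into the corresponding changelog rows (an UPDATE becomes a retraction of the old row plus an insertion of the new one), which is what keeps the sum(weight) aggregation above correct.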
2. changelog-json: it lets you write aggregated results and update streams to Kafka, so the whole pipeline can use Kafka as storage and stay in SQL.
Required dependency:
<dependency>
    <groupId>com.alibaba.ververica</groupId>
    <artifactId>flink-format-changelog-json</artifactId>
    <version>1.0.0</version>
</dependency>
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.EnvironmentSettings
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment

object CdcSinkKafka {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.enableCheckpointing(30001)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    val bsSettings = EnvironmentSettings.newInstance.useBlinkPlanner.inStreamingMode.build
    val stenv = StreamTableEnvironment.create(env, bsSettings)

    // plain JSON Kafka source
    val source =
      s"""
         |CREATE TABLE kafka_table (
         |  category_id STRING,
         |  user_id INT,
         |  item_id STRING,
         |  behavior STRING,
         |  ts STRING
         |) WITH (
         |  'connector' = 'kafka',
         |  'topic' = 'user_behavior',
         |  'properties.bootstrap.servers' = 'dev-ct6-dc-worker01:9092,dev-ct6-dc-worker02:9092,dev-ct6-dc-worker03:9092',
         |  'properties.group.id' = 'test1',
         |  'format' = 'json',
         |  'scan.startup.mode' = 'earliest-offset'
         |)
       """.stripMargin
    stenv.executeSql(source)

    // Kafka sink in changelog-json format, so the update stream can be stored in Kafka
    val sink =
      s"""
         |CREATE TABLE kafka_gmv (
         |  id STRING,
         |  gmv DECIMAL(10, 5)
         |) WITH (
         |  'connector' = 'kafka',
         |  'topic' = 'kafka_gmv',
         |  'scan.startup.mode' = 'earliest-offset',
         |  'properties.bootstrap.servers' = 'dev-ct6-dc-worker01:9092,dev-ct6-dc-worker02:9092,dev-ct6-dc-worker03:9092',
         |  'format' = 'changelog-json'
         |)
       """.stripMargin
    stenv.executeSql(sink)

    // continuously aggregate and write the update stream to Kafka
    val insert =
      s"""
         |INSERT INTO kafka_gmv
         |SELECT behavior, SUM(user_id) AS gmv
         |FROM kafka_table
         |GROUP BY behavior
       """.stripMargin
    stenv.executeSql(insert)

    // read the changelog back and print it
    val query =
      s"""
         |SELECT * FROM kafka_gmv
       """.stripMargin
    stenv.executeSql(query).print()
  }
}
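After the INSERT job has been running for a while, the kafka_gmv topic holds the aggregate as a changelog. Based on the format's data/op envelope, the records should look roughly like the following (values are illustrative):

{"data":{"id":"pv","gmv":18.00000},"op":"+I"}
{"data":{"id":"pv","gmv":18.00000},"op":"-U"}
{"data":{"id":"pv","gmv":45.00000},"op":"+U"}

Each change to the aggregate is written as a retraction of the old value (-U) followed by the new value (+U), so a downstream job reading the topic with 'format' = 'changelog-json' can reconstruct the latest result, which is what the final SELECT in the example prints.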
3. mysql-cdc (not recommended for production yet, but worth testing; it should be great for dimension-table joins. Note that it first loads the full contents of the matched tables.)
Required dependency:
<dependency>
    <groupId>com.alibaba.ververica</groupId>
    <artifactId>flink-connector-mysql-cdc</artifactId>
    <version>1.0.0</version>
</dependency>
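Before trying it, the MySQL instance has to have the binlog enabled in ROW format, and the connector user needs replication-related privileges (mysql-cdc embeds the Debezium MySQL connector, so its requirements apply). A quick sanity-check sketch, with 'flink_user' as a placeholder account name:

-- binlog must be ON and in ROW format
SHOW VARIABLES LIKE 'log_bin';
SHOW VARIABLES LIKE 'binlog_format';
-- privileges typically required by the Debezium-based connector
GRANT SELECT, RELOAD, SHOW DATABASES, REPLICATION SLAVE, REPLICATION CLIENT ON *.* TO 'flink_user'@'%';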
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.EnvironmentSettings
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment

object Test1 {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.enableCheckpointing(30001)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    val bsSettings = EnvironmentSettings.newInstance.useBlinkPlanner.inStreamingMode.build
    val stenv = StreamTableEnvironment.create(env, bsSettings)

    // mysql-cdc source: reads a full snapshot first, then keeps reading the binlog
    val mysql_cdc =
      s"""
         |CREATE TABLE orders (
         |  id INT,
         |  name STRING,
         |  region_id INT,
         |  area_code STRING
         |) WITH (
         |  'connector' = 'mysql-cdc',
         |  'hostname' = '192.168.6.143',
         |  'port' = '3306',
         |  'username' = 'root',
         |  'password' = '12345678',
         |  'database-name' = 'flink_test2',
         |  'table-name' = 'base.*'  -- a regular expression is allowed here
         |)
       """.stripMargin

    val sink_print =
      """
        |create table sink_print ( aaa INT, bbb STRING, ccc INT, ddd STRING ) with ('connector' = 'print')
      """.stripMargin

    stenv.executeSql(mysql_cdc)
    stenv.executeSql(sink_print)
    stenv.executeSql("insert into sink_print select * from orders")
  }
}
I wrote these in Scala because submitting jobs from Zeppelin does not support Java; it makes no real difference either way.
I will add more later if anything comes up.
Here is an additional example showing a mysql-cdc dimension-table join:
import static org.apache.flink.table.api.Expressions.$;

import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

import java.util.Random;

// todo: join a stream with a mysql-cdc dimension table
public class Mysql_cdc_join {

    private static final String PRINT_SINK_SQL = "create table sink_print ( \n" +
            " id INT," +
            " name STRING," +
            " region_id INT," +
            " area_code STRING" +
            ") with ('connector' = 'print' )";

    private static final String PRINT_SINK_SQL2 = "create table sink_print2 (" +
            " s_order_id INT," +
            " m_id INT " +
            ") with ('connector' = 'print' )";

    private static final String MYSQL_CDC_SQL = "CREATE TABLE orders (" +
            " id INT," +
            " name STRING," +
            " region_id INT," +
            " area_code STRING " +
            ") WITH (" +
            " 'connector' = 'mysql-cdc'," +
            " 'hostname' = '192.168.6.143'," +
            " 'port' = '3306'," +
            " 'username' = 'root'," +
            " 'password' = '12345678'," +
            " 'database-name' = 'flink_test2'," +
            " 'table-name' = 'base_province'" +
            ")";

    public static void main(String[] args) {
        StreamExecutionEnvironment bsEnv = StreamExecutionEnvironment.getExecutionEnvironment();
        EnvironmentSettings bsSettings = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build();
        StreamTableEnvironment stenv = StreamTableEnvironment.create(bsEnv, bsSettings);
        bsEnv.enableCheckpointing(5000);
        bsEnv.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        // mock order stream: one record per second with an increasing order id
        DataStream<Tuple4<Integer, String, String, Long>> ds = bsEnv.addSource(
                new SourceFunction<Tuple4<Integer, String, String, Long>>() {
                    @Override
                    public void run(SourceContext<Tuple4<Integer, String, String, Long>> out) throws Exception {
                        Random random = new Random();
                        Integer id = 0;
                        while (true) {
                            int sale = random.nextInt(1000);
                            id++;
                            out.collect(new Tuple4<>(id, "user", "product", Long.valueOf(sale)));
                            Thread.sleep(1000L);
                        }
                    }

                    @Override
                    public void cancel() {
                    }
                });

        // todo: register the DataStream as a table and add a processing-time field (a rowtime could be used here as well)
        stenv.createTemporaryView("stream_order", ds,
                $("order_id"), $("users"), $("product"), $("number"), $("proctime").proctime());

        // todo: dimension table
        stenv.executeSql(MYSQL_CDC_SQL);
//        stenv.executeSql(PRINT_SINK_SQL);
        stenv.executeSql(PRINT_SINK_SQL2);
//        stenv.executeSql("insert into sink_print select * from orders");

        // todo: dimension-table join (a regular join here; the temporal-join line is left commented out)
        String joinSql = "insert into sink_print2 SELECT" +
                " s.order_id," +
                " m.id " +
                "FROM " +
                "stream_order s " +
//                " JOIN orders FOR SYSTEM_TIME AS OF s.proctime as m " +
                " JOIN orders m " +
                " ON m.id = s.order_id";
        stenv.executeSql(joinSql);
    }
}
Note: previously we did dimension-table joins as shown in the figure below; that approach only supports joining against a CDC table from Flink 1.12 on.
In the code above we used a regular join instead.
Result demonstration:
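For reference, the temporal-join variant (the line left commented out in the code above) would read as follows as a full statement. Besides Flink 1.12, as far as I understand the orders table also needs a PRIMARY KEY declared (and watermarks for the event-time flavor); treat this as a sketch rather than something verified here:

INSERT INTO sink_print2
SELECT s.order_id, m.id
FROM stream_order AS s
JOIN orders FOR SYSTEM_TIME AS OF s.proctime AS m
ON m.id = s.order_id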
At the start, MySQL holds the rows with id = 5, 6, 7.
Midway I deleted the row with id = 5 and inserted a row with id = 100; only after a little while
was +I(100,100) printed, which is correct (the regular join had to wait until the stream side reached order_id = 100).
By then the stream-side ids were already well past 100, so when I inserted a row with id = 55 into MySQL the console printed the match immediately, because the regular join keeps the earlier stream records in state.