Flink Table API & SQL Programming

Table of Contents

Maven Dependencies

DEMO1: Consuming from a Kafka Topic and Producing to Another

DEMO2: Converting Between Table and DataStream

DEMO3: Time Windows

Using sql-client

Executing CREATE TABLE Statements

filesystem source

kafka source

Executing Queries



Maven Dependencies

    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java-bridge_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner-blink_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-common</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <!-- JSON format, used by the Kafka tables in DEMO1 -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-json</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <!-- CSV format, used by the Kafka and filesystem tables below -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-csv</artifactId>
            <version>${flink.version}</version>
        </dependency>
    </dependencies>
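
These entries reference a flink.version property that must be defined in the POM. The exact version is your choice, but note that the APIs used below (the blink planner artifact together with toDataStream) line up with the 1.13.x line, so a plausible properties block is:

    <properties>
        <flink.version>1.13.6</flink.version>
    </properties>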

DEMO1: Consuming from a Kafka Topic and Producing to Another

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

/**
 * Description: reads data from the Kafka topic kafkaInput, filters it,
 * and writes the rows with status = '1' to the topic kafkaOutput.
 * Sample data:
 * {"id":"1","user_id":"1001","status":"1"}
 * {"id":"2","user_id":"1002","status":"1"}
 * {"id":"3","user_id":"1003","status":"1"}
 * {"id":"4","user_id":"1004","status":"1"}
 * {"id":"5","user_id":"1005","status":"0"}
 */
public class Demo01 {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
        tableEnv.executeSql("CREATE TABLE kafka_input(id BIGINT,user_id BIGINT," +
                "status STRING) " +
                "WITH ('connector' = 'kafka','topic' = 'kafkaInput'," +
                "'properties.bootstrap.servers' = 'vm01:9092'," +
                "'properties.group.id' = 'testGroup'," +
                "'scan.startup.mode' = 'latest-offset'," +
                "'format' = 'json')");
        Table table = tableEnv.sqlQuery("SELECT * FROM kafka_input WHERE status='1'");
        tableEnv.executeSql("CREATE TABLE kafka_output(id BIGINT,user_id BIGINT," +
                "status STRING)" +
                " WITH ('connector'='kafka','topic'='kafkaOutput'," +
                "'properties.bootstrap.servers' = 'vm01:9092'," +
                "'format'='json'," +
                "'sink.partitioner'='round-robin')");
        tableEnv.executeSql("INSERT INTO kafka_output SELECT * FROM " + table);
        // Calling execute here would fail with "No operators defined in streaming topology.
        // Cannot execute.", because the INSERT INTO above is already submitted by executeSql.
        // tableEnv.execute("Run TableapiKafkaDemo");
    }
}
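
To try the job end to end, you can feed the input topic and tail the output topic with Kafka's console tools (the script names and the vm01:9092 address assume the same broker setup as above):

# Type JSON records like the samples above into the input topic
kafka-console-producer.sh --bootstrap-server vm01:9092 --topic kafkaInput
# Watch the filtered records arrive on the output topic
kafka-console-consumer.sh --bootstrap-server vm01:9092 --topic kafkaOutput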

DEMO2: Converting Between Table and DataStream

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;


public class Demo02 {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);
        // kafka table -> stream
//        table2Stream(tEnv);
        // stream -> table -> stream
        stream2Table(env, tEnv);
        env.execute();
    }

    /**
     * kafka table -> stream, printed to stdout.
     * Data format: 1,Jordan,male
     * @param tEnv StreamTableEnvironment
     */
    private static void table2Stream(StreamTableEnvironment tEnv) {
        String createTableSql = "CREATE TABLE user_info(" +
                "id INT," +
                "name STRING," +
                "gender STRING" +
                ")WITH(" +
                "'connector'='kafka'," +
                "'topic'='user_info'," +
                "'properties.bootstrap.servers'='vm01:9092'," +
                "'properties.group.id'='testGroup'," +
                "'scan.startup.mode'='latest-offset'," +
                "'format'='csv'" +
                ")";
        tEnv.executeSql(createTableSql);
        Table table = tEnv.sqlQuery("SELECT * FROM user_info");
        tEnv.toDataStream(table).print();
    }

    /**
     * stream -> table -> stream
     * 
     * @param env  StreamExecutionEnvironment
     * @param tEnv StreamTableEnvironment
     */
    private static void stream2Table(StreamExecutionEnvironment env, StreamTableEnvironment tEnv) {
        DataStream<String> dataStream = env.fromElements("Jordan", "Alice", "Tom");
        Table inputTable = tEnv.fromDataStream(dataStream);
        tEnv.createTemporaryView("INPUT_TABLE", inputTable);
        Table resultTable = tEnv.sqlQuery("SELECT UPPER(f0) FROM INPUT_TABLE");
        resultTable.printSchema();
        tEnv.toDataStream(resultTable).print();
    }
}
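
Note that toDataStream only handles insert-only results like the queries above. A query that produces updates, an aggregation for instance, must be converted with toChangelogStream instead; a minimal sketch against the user_info table (the aggregation itself is illustrative):

        // Updating results must be converted as a changelog stream, which emits
        // insert/update/delete rows instead of plain inserts
        Table aggTable = tEnv.sqlQuery("SELECT gender, COUNT(*) AS cnt FROM user_info GROUP BY gender");
        tEnv.toChangelogStream(aggTable).print();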

DEMO3: Time Windows

When creating the table, define a TIMESTAMP(3) column to serve as the event-time attribute, and declare the watermark with WATERMARK FOR row_time AS row_time - INTERVAL '5' SECOND, which tolerates 5 seconds of out-of-order data. Windows and aggregate functions can then be used as usual.

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Slide;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.Tumble;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

import static org.apache.flink.table.api.Expressions.$;
import static org.apache.flink.table.api.Expressions.lit;

/**
 * Kafka data format: 1,jordan,79,2022-03-05 00:00:30
 */
public class Demo03 {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);
        String createTableSql = "CREATE TABLE user_info(" +
                "id INT," +
                "name STRING," +
                "score INT," +
                "row_time TIMESTAMP(3)," +
                "WATERMARK FOR row_time AS row_time - INTERVAL '5' SECOND" +
                ")WITH(" +
                "'connector'='kafka'," +
                "'topic'='t_score'," +
                "'properties.bootstrap.servers'='vm01:9092'," +
                "'properties.group.id'='testGroup'," +
                "'scan.startup.mode'='latest-offset'," +
                "'format'='csv'" +
                ")";
        tEnv.executeSql(createTableSql);
        Table table = tEnv.sqlQuery("SELECT * FROM user_info");
        Table resultTable = table.window(Tumble.over(lit(10).seconds()).on($("row_time")).as("w"))
                // Sliding alternative: a 20-second window that slides every 10 seconds
//                table.window(Slide.over(lit(20).seconds()).every(lit(10).seconds()).on($("row_time")).as("w"))
                .groupBy($("name"), $("w"))
                .select($("name"), $("score").avg(), $("w").end().as("window_end"));
        resultTable.printSchema();
        tEnv.toDataStream(resultTable).print();

        env.execute("Demo03");
    }
}
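
The same tumbling-window aggregation can also be expressed in SQL with the group-window functions; a sketch (the column aliases are mine):

SELECT name,
       AVG(score) AS avg_score,
       TUMBLE_END(row_time, INTERVAL '10' SECOND) AS window_end
FROM user_info
GROUP BY name, TUMBLE(row_time, INTERVAL '10' SECOND);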

Using sql-client

# Start the cluster
${FLINK_HOME}/bin/start-cluster.sh
# Start the SQL command-line client
${FLINK_HOME}/bin/sql-client.sh

You can now execute SQL statements interactively.
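
By default results open in an interactive viewer; for plain inline output you can switch the result mode, e.g. (depending on the Flink version, the option key may need to be quoted):

SET sql-client.execution.result-mode=tableau;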

You can also run a SQL script non-interactively:

# ~/file.sql is a runnable SQL script; -f can also be written as --file
${FLINK_HOME}/bin/sql-client.sh -f ~/file.sql
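
Such a script is simply a sequence of statements executed top to bottom. A minimal hypothetical ~/file.sql (the datagen table is a placeholder for illustration):

-- Create a throwaway source table, then query it
CREATE TABLE demo_source(
    id INT,
    name STRING
) WITH (
    'connector' = 'datagen'
);
SELECT * FROM demo_source;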

Executing CREATE TABLE Statements

If you want to use time windows, the table must declare a watermark.

filesystem source

CREATE TABLE bid(
    bidtime TIMESTAMP(3),
    price DECIMAL(10, 2),
    item STRING,
    WATERMARK FOR bidtime AS bidtime - INTERVAL '10' MINUTES
) WITH ( 
    'connector' = 'filesystem',
    'path' = '/root/data/bid.csv',
    'format' = 'csv'
);
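
Rows in /root/data/bid.csv must follow the column order of the schema; illustrative sample rows (values made up):

2022-03-05 00:01:00,3.50,apple
2022-03-05 00:04:00,5.00,banana
2022-03-05 00:12:00,2.25,apple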

kafka source

CREATE TABLE user_info(
    id INT,
    name STRING,
    score INT,
    row_time TIMESTAMP(3),
    WATERMARK FOR row_time AS row_time - INTERVAL '5' SECOND
) WITH (
    'connector'='kafka',
    'topic'='t_score',
    'properties.bootstrap.servers'='vm01:9092',
    'properties.group.id'='testGroup',
    'scan.startup.mode'='latest-offset',
    'format'='csv'
);

Executing Queries

Ordinary queries look much like standard SQL; the built-in functions can be listed with SHOW FUNCTIONS.

The following demonstrates the windowing table-valued functions.

-- The table queried below is the one created under [Executing CREATE TABLE Statements -> filesystem source]
SELECT window_start, window_end, SUM(price) FROM TABLE
 (TUMBLE(TABLE bid, DESCRIPTOR(bidtime), INTERVAL '10' MINUTES))
GROUP BY window_start, window_end;
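
For sliding windows there is the analogous HOP function, which takes the slide step before the window size; a sketch over the same table:

SELECT window_start, window_end, SUM(price) FROM TABLE
 (HOP(TABLE bid, DESCRIPTOR(bidtime), INTERVAL '5' MINUTES, INTERVAL '10' MINUTES))
GROUP BY window_start, window_end;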
