Official documentation: https://ci.apache.org/projects/flink/flink-docs-release-1.8/dev/table/
Flink's Table API lets us develop both stream and batch processing with SQL-style statements. Because a DataStream or a DataSet can be converted into a Table, we can read data from almost anywhere, turn it into a Table, and then process it with the Table API or SQL.
Flink Table API and SQL programs can also connect to external systems to read and write batch tables and streaming tables. A TableSource provides access to data stored in external systems such as databases, key-value stores, message queues, or file systems; a TableSink emits a table to an external storage system.
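A minimal sketch of that overall flow, assuming the Flink 1.8 Scala Table API used throughout this document (the Click case class, the table name and the query are made up for illustration): register a DataStream as a table, query it with SQL, and convert the result back into a DataStream.
import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.table.api.Table
import org.apache.flink.table.api.scala.StreamTableEnvironment

// Hypothetical record type used only for this sketch
case class Click(userId: Int, url: String)

object TableApiSketch {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val tableEnv = StreamTableEnvironment.create(env)
    // A DataStream from anywhere (here: a small in-memory collection)
    val clicks: DataStream[Click] = env.fromElements(Click(1, "/home"), Click(2, "/cart"))
    // DataStream -> Table, query with SQL, then Table -> DataStream again
    tableEnv.registerDataStream("clicks", clicks)
    val result: Table = tableEnv.sqlQuery("select userId, url from clicks where userId = 1")
    tableEnv.toAppendStream[Click](result).print()
    env.execute("table api sketch")
  }
}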
3.1: Use Flink SQL to read CSV file data and query it
Add the following Maven dependencies:
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-table-planner_2.11</artifactId>
    <version>1.8.1</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-table-api-scala-bridge_2.11</artifactId>
    <version>1.8.1</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-table-api-scala_2.11</artifactId>
    <version>1.8.1</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-table-common</artifactId>
    <version>1.8.1</version>
</dependency>
import org.apache.flink.core.fs.FileSystem.WriteMode
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.{Table, Types}
import org.apache.flink.table.api.scala.StreamTableEnvironment
import org.apache.flink.table.sinks.CsvTableSink
import org.apache.flink.table.sources.CsvTableSource

object FlinkStreamSQL {
  def main(args: Array[String]): Unit = {
    // Streaming SQL: get the stream execution environment
    val streamEnvironment: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // Streaming table environment
    val tableEnvironment: StreamTableEnvironment = StreamTableEnvironment.create(streamEnvironment)
    // Build the CSV table source
    val source: CsvTableSource = CsvTableSource.builder()
      .field("id", Types.INT)
      .field("name", Types.STRING)
      .field("age", Types.INT)
      .fieldDelimiter(",")
      .ignoreFirstLine()
      .ignoreParseErrors()
      .lineDelimiter("\r\n")
      .path("D:\\flinksql.csv")
      .build()
    // Register the table source as a table
    tableEnvironment.registerTableSource("user", source)
    // Query all people older than 20
    val result: Table = tableEnvironment.scan("user").filter("age > 20")
    // Print the table's metadata, i.e. its field names and types
    result.printSchema()
    // Write the query result to a CSV file
    val sink = new CsvTableSink("D:\\sink.csv", "===", 1, WriteMode.OVERWRITE)
    result.writeToSink(sink)
    streamEnvironment.execute()
  }
}
3.2: Converting between DataStream and Table
import org.apache.flink.core.fs.FileSystem.WriteMode
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.table.api._
import org.apache.flink.api.scala._
import org.apache.flink.table.api.scala.StreamTableEnvironment
import org.apache.flink.table.sinks.CsvTableSink

object FlinkStreamSQL {
  def main(args: Array[String]): Unit = {
    val environment: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val streamSQLEnvironment: StreamTableEnvironment = StreamTableEnvironment.create(environment)
    // Expected socket input, e.g.:
    // 101,zhangsan,18
    // 102,lisi,20
    // 103,wangwu,25
    // 104,zhaoliu,8
    val socketStream: DataStream[String] = environment.socketTextStream("node01", 8000)
    val userStream: DataStream[User] = socketStream.map(x => {
      val fields = x.split(",")
      User(fields(0).toInt, fields(1), fields(2).toInt)
    })
    // Register the DataStream as a table
    streamSQLEnvironment.registerDataStream("userTable", userStream)
    // Query either with the Table API (expression strings) ...
    // val table: Table = streamSQLEnvironment.scan("userTable").filter("age > 10")
    // ... or with a SQL query
    val table: Table = streamSQLEnvironment.sqlQuery("select * from userTable")
    val sink3 = new CsvTableSink("D:\\sink3.csv", "===", 1, WriteMode.OVERWRITE)
    table.writeToSink(sink3)
    // Append mode: Table -> DataStream for insert-only results; it cannot be used for
    // updating queries such as sum, count or avg
    val appendStream: DataStream[User] = streamSQLEnvironment.toAppendStream[User](table)
    // Retract mode: Table -> DataStream; the Boolean flag marks additions (true) and
    // retractions (false) — see the aggregation sketch after the sample data below
    val retractStream: DataStream[(Boolean, User)] = streamSQLEnvironment.toRetractStream[User](table)
    environment.execute()
  }
}

case class User(id: Int, name: String, age: Int)
Sample data to send over the socket (for example with nc -lk 8000 on node01):
101,zhangsan,18
102,lisi,20
103,wangwu,25
104,zhaoliu,8
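The append/retract distinction matters as soon as the query aggregates: an aggregating result is updated per key, so toAppendStream would fail for it, while toRetractStream emits (true, row) for additions and (false, row) for retracted old values. A minimal sketch under the same setup (it reuses the User case class above; the grouped count query is made up for illustration):
import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.table.api.Table
import org.apache.flink.table.api.scala.StreamTableEnvironment

object RetractStreamSketch {
  def main(args: Array[String]): Unit = {
    val environment = StreamExecutionEnvironment.getExecutionEnvironment
    val tableEnv = StreamTableEnvironment.create(environment)
    // Same comma-separated socket input as above: id,name,age
    val users: DataStream[User] = environment.socketTextStream("node01", 8000)
      .map(x => {
        val f = x.split(",")
        User(f(0).toInt, f(1), f(2).toInt)
      })
    tableEnv.registerDataStream("userTable", users)
    // An aggregating query: its result changes whenever a name receives another row,
    // so it cannot be converted with toAppendStream
    val counts: Table = tableEnv.sqlQuery("select name, count(1) as cnt from userTable group by name")
    // Retract mode: true = add this row, false = retract a previously emitted row
    val retract: DataStream[(Boolean, (String, Long))] = tableEnv.toRetractStream[(String, Long)](counts)
    retract.print()
    environment.execute("retract sketch")
  }
}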
3.3: Converting between DataSet and Table
import org.apache.flink.api.scala._
import org.apache.flink.api.scala.ExecutionEnvironment
import org.apache.flink.core.fs.FileSystem.WriteMode
import org.apache.flink.table.api.scala.BatchTableEnvironment
import org.apache.flink.table.sinks.CsvTableSink

object FlinkBatchSQL {
  def main(args: Array[String]): Unit = {
    val environment: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    val batchSQL: BatchTableEnvironment = BatchTableEnvironment.create(environment)
    val sourceSet: DataSet[String] = environment.readTextFile("D:\\dataSet.csv")
    val userSet: DataSet[User2] = sourceSet.map(x => {
      println(x)
      val line: Array[String] = x.split(",")
      User2(line(0).toInt, line(1), line(2).toInt)
    })
    import org.apache.flink.table.api._
    // Register the DataSet as a table
    batchSQL.registerDataSet("user", userSet)
    // Table API alternative:
    // val table: Table = batchSQL.scan("user").filter("age > 18")
    // Note: user is a reserved keyword in Flink SQL; reserved keywords must be escaped with backticks
    val table: Table = batchSQL.sqlQuery("select id,name,age from `user`")
    val sink = new CsvTableSink("D:\\batchSink.csv", "===", 1, WriteMode.OVERWRITE)
    table.writeToSink(sink)
    // Convert the Table back into a DataSet
    val tableSet: DataSet[User2] = batchSQL.toDataSet[User2](table)
    // print() already triggers execution for DataSet programs, so no extra execute() call is needed
    tableSet.map(x => x.age).print()
  }
}

case class User2(id: Int, name: String, age: Int)
// Flink DataStream/DataSet code needs the implicit conversion import
import org.apache.flink.api.scala._
// Flink Table API / SQL code additionally needs its own implicit conversion import
import org.apache.flink.table.api._
// Reserved keywords (such as user) must be escaped with backticks when used in SQL
val table: Table = batchSQL.sqlQuery("select id,name,age from `user`")
3.4: Reading JSON data from Kafka with Flink SQL
Add the following Maven dependencies:
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-json</artifactId>
    <version>1.8.1</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka-0.11_2.11</artifactId>
    <version>1.8.1</version>
</dependency>
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka-clients</artifactId>
    <version>1.1.0</version>
</dependency>
<dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-api</artifactId>
    <version>1.7.25</version>
</dependency>
<dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-log4j12</artifactId>
    <version>1.7.25</version>
</dependency>
Create the Kafka source topic:
cd /opt/install/kafka_2.11-1.1.0
bin/kafka-topics.sh --create --topic kafka_source_table --partitions 3 --replication-factor 1 --zookeeper node01:2181,node02:2181,node03:2181
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.core.fs.FileSystem.WriteMode
import org.apache.flink.streaming.api.CheckpointingMode
import org.apache.flink.streaming.api.environment.CheckpointConfig
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api._
import org.apache.flink.table.api.scala.StreamTableEnvironment
import org.apache.flink.table.descriptors.{Json, Kafka, Schema}
import org.apache.flink.table.sinks.CsvTableSink

object KafkaJsonSource {
  def main(args: Array[String]): Unit = {
    val streamEnvironment: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // Checkpoint configuration
    streamEnvironment.enableCheckpointing(100)
    streamEnvironment.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)
    streamEnvironment.getCheckpointConfig.setMinPauseBetweenCheckpoints(500)
    streamEnvironment.getCheckpointConfig.setCheckpointTimeout(60000)
    streamEnvironment.getCheckpointConfig.setMaxConcurrentCheckpoints(1)
    streamEnvironment.getCheckpointConfig.enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)
    val tableEnvironment: StreamTableEnvironment = StreamTableEnvironment.create(streamEnvironment)
    // Kafka connector descriptor
    val kafka: Kafka = new Kafka()
      .version("0.11")
      .topic("kafka_source_table")
      .startFromLatest()
      .property("group.id", "test_group")
      .property("bootstrap.servers", "node01:9092,node02:9092,node03:9092")
    // JSON format: derive the format fields from the table schema
    val json: Json = new Json().failOnMissingField(false).deriveSchema()
    // Expected message format:
    // {"userId":1119,"day":"2017-03-02","begintime":1488326400000,"endtime":1488327000000,"data":[{"package":"com.browser","activetime":120000}]}
    val schema: Schema = new Schema()
      .field("userId", Types.INT)
      .field("day", Types.STRING)
      .field("begintime", Types.LONG)
      .field("endtime", Types.LONG)
    tableEnvironment
      .connect(kafka)
      .withFormat(json)
      .withSchema(schema)
      .inAppendMode()
      .registerTableSource("user_log")
    // Query the data with SQL (day is a reserved keyword, so it is escaped)
    val table: Table = tableEnvironment.sqlQuery("select userId,`day`,begintime,endtime from user_log")
    table.printSchema()
    // Define the sink: where the data is written to
    val sink = new CsvTableSink("D:\\flink_kafka.csv", "====", 1, WriteMode.OVERWRITE)
    // Register the sink as the output destination
    tableEnvironment.registerTableSink("csvSink",
      Array[String]("f0", "f1", "f2", "f3"),
      Array[TypeInformation[_]](Types.INT, Types.STRING, Types.LONG, Types.LONG), sink)
    // Insert the query result into the sink
    table.insertInto("csvSink")
    streamEnvironment.execute("kafkaSource")
  }
}
Start a console producer and send test messages:
cd /opt/install/kafka_2.11-1.1.0
bin/kafka-console-producer.sh --topic kafka_source_table --broker-list node01:9092,node02:9092,node03:9092
# Messages to send:
{"userId":19,"day":"2017-03-02","begintime":1585184021,"endtime":1585184041}
{"userId":20,"day":"2017-03-02","begintime":1585184021,"endtime":1585184041}
{"userId":21,"day":"2017-03-02","begintime":1585184021,"endtime":1585184041}
{"userId":22,"day":"2017-03-02","begintime":1585184021,"endtime":1585184041}
{"userId":23,"day":"2017-03-02","begintime":1585184021,"endtime":1585184041}