1 map
1.1 MapFunction
Sample contents of input/sensor-data.log (fields: id, timestamp, reading):
sensor_1,1549044122,10
sensor_1,1549044123,20
sensor_1,1549044124,30
sensor_2,1549044125,40
sensor_1,1549044126,50
sensor_2,1549044127,60
sensor_1,1549044128,70
sensor_3,1549044129,80
sensor_3,1549044130,90
sensor_3,1549044130,100
import org.apache.flink.streaming.api.scala._

object SourceFileMap {
  def main(args: Array[String]): Unit = {
    // 1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // 2. Read data from the given path
    val fileDS: DataStream[String] = env.readTextFile("input/sensor-data.log")
    val mapDS: DataStream[WaterSensor] = fileDS.map(
      lines => {
        // Split on commas to get the individual fields
        val datas: Array[String] = lines.split(",")
        WaterSensor(datas(0), datas(1).toLong, datas(2).toDouble)
      }
    )
    // 3. Print
    mapDS.print()
    // 4. Execute
    env.execute("map")
  }

  /**
   * Case class for the water level sensor; receives air-gap readings
   *
   * @param id sensor id
   * @param ts timestamp
   * @param vc air-gap reading
   */
  case class WaterSensor(id: String, ts: Long, vc: Double)
}
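The lambda above is shorthand for Flink's MapFunction interface. As a minimal sketch of the explicit form (Transform_MapFunction and MyMapFunction are illustrative names, not from the original), the same parsing can live in a function class, which is handy when the transformation needs to be reused or unit tested:
import org.apache.flink.api.common.functions.MapFunction
import org.apache.flink.streaming.api.scala._

object Transform_MapFunction {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val fileDS: DataStream[String] = env.readTextFile("input/sensor-data.log")
    // Pass a function class instead of a lambda
    val mapDS: DataStream[WaterSensor] = fileDS.map(new MyMapFunction)
    mapDS.print()
    env.execute("mapfunction")
  }

  // MapFunction[T, O]: T is the input type, O the output type
  class MyMapFunction extends MapFunction[String, WaterSensor] {
    override def map(value: String): WaterSensor = {
      val datas: Array[String] = value.split(",")
      WaterSensor(datas(0), datas(1).toLong, datas(2).toDouble)
    }
  }

  case class WaterSensor(id: String, ts: Long, vc: Double)
}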
1.2 RichMapFunction
import org.apache.flink.api.common.functions.RichMapFunction
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.scala._

object Transform_RichMapFunction {
  def main(args: Array[String]): Unit = {
    // 1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // 2. Read data from the given path
    val sensorDS: DataStream[String] = env.readTextFile("input/sensor-data.log")
    val myMapDS: DataStream[WaterSensor] = sensorDS.map(new MyRichMapFunction)
    // 3. Print
    myMapDS.print()
    // 4. Execute
    env.execute("map")
  }

  /**
   * Custom RichMapFunction[T, O]:
   * T is the input type, O the output type
   */
  class MyRichMapFunction extends RichMapFunction[String, WaterSensor] {
    override def map(value: String): WaterSensor = {
      val datas: Array[String] = value.split(",")
      // WaterSensor(datas(0), datas(1).toLong, datas(2).toDouble)
      // The runtime context is only available in rich functions
      WaterSensor(getRuntimeContext.getTaskName, datas(1).toLong, datas(2).toDouble)
    }

    // Rich functions provide lifecycle methods
    override def open(parameters: Configuration): Unit = {}
    override def close(): Unit = {}
  }

  /**
   * Case class for the water level sensor; receives air-gap readings
   *
   * @param id sensor id
   * @param ts timestamp
   * @param vc air-gap reading
   */
  case class WaterSensor(id: String, ts: Long, vc: Double)
}
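A sketch of what the lifecycle methods are typically used for: open() runs once per parallel subtask before the first map() call, which makes it the natural place to acquire resources such as database connections, and close() runs once on shutdown. The class name SubtaskTagMapFunction is illustrative:
import org.apache.flink.api.common.functions.RichMapFunction
import org.apache.flink.configuration.Configuration

class SubtaskTagMapFunction extends RichMapFunction[String, String] {
  private var subtask: Int = _

  override def open(parameters: Configuration): Unit = {
    // Called once per parallel subtask, before any map() call;
    // open connections or load configuration here
    subtask = getRuntimeContext.getIndexOfThisSubtask
  }

  override def map(value: String): String = s"subtask-$subtask: $value"

  override def close(): Unit = {
    // Called once when the task shuts down; release resources here
  }
}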
2 filter
import org.apache.flink.streaming.api.scala._

object Transform_Filter {
  def main(args: Array[String]): Unit = {
    // 1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    // 2. Read the data
    val listDS: DataStream[List[Int]] = env.fromCollection(
      List(
        List(1, 2, 3, 4, 1, 2, 3, 4),
        List(5, 6, 7, 1, 1, 1, 1, 2, 3, 4, 1, 2, 3, 4),
        List(1, 2, 3, 4),
        List(5, 6, 7, 1, 1, 1),
        List(1, 2, 3, 4),
        List(5, 6, 7, 1, 1, 1)
      )
    )
    // 3. Filter: true keeps the element, false drops it
    listDS.filter(_.size > 5)
      .print("filter")
    // 4. Execute
    env.execute()
  }
}
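The same operator applies to the sensor stream from section 1. A minimal sketch (Transform_FilterSensor and the threshold 40 are illustrative choices, not from the original): parse each line, then keep only the readings at or above the threshold:
import org.apache.flink.streaming.api.scala._

object Transform_FilterSensor {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    val sensorDS: DataStream[String] = env.readTextFile("input/sensor-data.log")
    sensorDS
      .map(lines => {
        val datas = lines.split(",")
        WaterSensor(datas(0), datas(1).toLong, datas(2).toDouble)
      })
      .filter(_.vc >= 40) // keep readings at or above the threshold
      .print("filter-sensor")
    env.execute("filter")
  }

  case class WaterSensor(id: String, ts: Long, vc: Double)
}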
3 keyBy
import org.apache.flink.streaming.api.scala._

object Transform_KeyBy {
  def main(args: Array[String]): Unit = {
    // 1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    // 2. Read the data
    val sensorDS: DataStream[String] = env.readTextFile("input/sensor-data.log")
    // 3. Convert to the case class
    val mapDS = sensorDS.map(
      lines => {
        val datas = lines.split(",")
        WaterSensor(datas(0), datas(1).toLong, datas(2).toDouble)
      }
    )
    // 4. Group with keyBy
    // TODO About the type of the returned key:
    //  1. With a position index or a field name, the key type cannot be
    //     inferred, so a Java Tuple is returned
    //  2. With an anonymous function or a KeySelector, the key type can be
    //     inferred; this is the recommended approach
    // *** Grouping is purely logical: records are tagged with a key, and it
    //     has no strict relationship to parallelism
    //     - records with the same key always stay together
    //     - one partition may contain several different key groups
    // val sensorKS: KeyedStream[WaterSensor, Tuple] = mapDS.keyBy(0)
    // val sensorKS: KeyedStream[WaterSensor, Tuple] = mapDS.keyBy("id")
    val sensorKS: KeyedStream[WaterSensor, String] = mapDS.keyBy(_.id)
    // val sensorKS: KeyedStream[WaterSensor, String] = mapDS.keyBy(
    //   new KeySelector[WaterSensor, String] {
    //     override def getKey(value: WaterSensor): String = {
    //       value.id
    //     }
    //   }
    // )
    sensorKS.print().setParallelism(5)
    // 5. Execute
    env.execute()
  }

  /**
   * Case class for the water level sensor; receives air-gap readings
   *
   * @param id sensor id
   * @param ts timestamp
   * @param vc air-gap reading
   */
  case class WaterSensor(id: String, ts: Long, vc: Double)
}
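keyBy by itself only tags and redistributes records; its payoff comes from the keyed operations that follow. A minimal sketch (Transform_KeyByAgg is an illustrative name) using the built-in rolling aggregation maxBy, which emits, for each input record, the reading with the largest vc seen so far for that sensor id:
import org.apache.flink.streaming.api.scala._

object Transform_KeyByAgg {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    val sensorDS: DataStream[String] = env.readTextFile("input/sensor-data.log")
    val mapDS: DataStream[WaterSensor] = sensorDS.map(
      lines => {
        val datas = lines.split(",")
        WaterSensor(datas(0), datas(1).toLong, datas(2).toDouble)
      }
    )
    // Rolling max per key: one output per input record, carrying the
    // largest vc seen so far for that sensor id
    mapDS
      .keyBy(_.id)
      .maxBy("vc")
      .print("maxBy")
    env.execute("keyByAgg")
  }

  case class WaterSensor(id: String, ts: Long, vc: Double)
}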