Flink的Transform【博学谷学习记录】

1 map

1.1 MapFunction

示例输入文件 input/sensor-data.log 的内容(每行格式:传感器编号,时间戳,空高):

sensor_1,1549044122,10
sensor_1,1549044123,20
sensor_1,1549044124,30
sensor_2,1549044125,40
sensor_1,1549044126,50
sensor_2,1549044127,60
sensor_1,1549044128,70
sensor_3,1549044129,80
sensor_3,1549044130,90
sensor_3,1549044130,100
import org.apache.flink.streaming.api.scala._

/**
 * Example of the `map` transformation: reads sensor records from a text file,
 * turns every comma-separated line into a `WaterSensor` and prints the result.
 */
object SourceFileMap {

  def main(args: Array[String]): Unit = {
    // 1. Create the execution environment.
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

    // 2. Read the input file as a stream of strings.
    val fileDS: DataStream[String] = env.readTextFile("input/sensor-data.log")

    // 3. Split each line on commas and build a WaterSensor from the three fields.
    val sensorDS: DataStream[WaterSensor] = fileDS.map { line =>
      val fields: Array[String] = line.split(",")
      // vc is declared as Double, so parse it as one: toInt would throw on a
      // fractional reading such as "10.5", while integer text parses the same.
      WaterSensor(fields(0), fields(1).toLong, fields(2).toDouble)
    }

    // 4. Print the converted records.
    sensorDS.print()

    // 5. Trigger execution; "map" is the job name handed to the environment.
    env.execute("map")
  }

  /**
   * Water-level sensor record used to carry one parsed input line.
   *
   * @param id sensor identifier
   * @param ts timestamp
   * @param vc measured clearance height ("空高" in the original notes)
   */
  case class WaterSensor(id: String, ts: Long, vc: Double)

}

1.2 RichMapFunction

import org.apache.flink.api.common.functions.{MapFunction, RichMapFunction}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.scala._


/**
 * Example of `map` driven by a user-defined `RichMapFunction`: reads sensor
 * records from a text file, converts each line with `MyRichMapFunction` and
 * prints the result.
 */
object Transform_RichMapFunction {

  def main(args: Array[String]): Unit = {
    // 1. Create the execution environment.
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

    // 2. Read the input file as a stream of strings.
    val sensorDS: DataStream[String] = env.readTextFile("input/sensor-data.log")

    // 3. Convert every line with the custom rich function.
    val myMapDS: DataStream[WaterSensor] = sensorDS.map(new MyRichMapFunction)

    // 4. Print the converted records.
    myMapDS.print()

    // 5. Trigger execution; "map" is the job name handed to the environment.
    env.execute("map")
  }

  /**
   * Custom function extending `RichMapFunction[String, WaterSensor]`: the first
   * type parameter is the input type (a raw text line), the second is the
   * output type.
   *
   * Unlike a plain `MapFunction`, a rich function exposes the `open`/`close`
   * lifecycle hooks and `getRuntimeContext`.
   */
  class MyRichMapFunction extends RichMapFunction[String, WaterSensor] {

    // Lifecycle hook provided by rich functions; nothing to initialise here.
    override def open(parameters: Configuration): Unit = {}

    /**
     * Parses one comma-separated line into a `WaterSensor`.
     *
     * @param value raw line; fields at index 1 and 2 are read as timestamp and vc
     * @return the record, with `id` set to the task name from the runtime context
     */
    override def map(value: String): WaterSensor = {
      val fields: Array[String] = value.split(",")
      // The id is deliberately the task name rather than fields(0), to show that
      // the runtime context is reachable from a rich function. Pass fields(0)
      // as the first argument to keep the sensor id from the record instead.
      // vc is declared as Double, so parse it as one: toInt would throw on a
      // fractional reading such as "10.5".
      WaterSensor(getRuntimeContext.getTaskName, fields(1).toLong, fields(2).toDouble)
    }

    // Lifecycle hook provided by rich functions; nothing to release here.
    override def close(): Unit = {}

  }

  /**
   * Water-level sensor record used to carry one parsed input line.
   *
   * @param id sensor identifier
   * @param ts timestamp
   * @param vc measured clearance height ("空高" in the original notes)
   */
  case class WaterSensor(id: String, ts: Long, vc: Double)

}

2 filter

import org.apache.flink.streaming.api.scala._


/**
 * Example of the `filter` transformation: builds a stream of integer lists
 * from an in-memory collection and keeps only the lists with more than five
 * elements.
 */
object Transform_Filter {

  def main(args: Array[String]): Unit = {

    // 1. Create the execution environment and run everything with parallelism 1.
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // 2. Build the source stream from a fixed collection of lists.
    val samples: List[List[Int]] = List(
      List(1, 2, 3, 4, 1, 2, 3, 4),
      List(5, 6, 7, 1, 1, 1, 1, 2, 3, 4, 1, 2, 3, 4),
      List(1, 2, 3, 4),
      List(5, 6, 7, 1, 1, 1),
      List(1, 2, 3, 4),
      List(5, 6, 7, 1, 1, 1)
    )
    val listDS: DataStream[List[Int]] = env.fromCollection(samples)

    // 3. Keep an element when the predicate is true, drop it when false;
    //    the survivors are printed with the "filter" prefix.
    listDS
      .filter(_.size > 5)
      .print("filter")

    // 4. Trigger execution.
    env.execute()
  }
}

3 keyBy

import org.apache.flink.streaming.api.scala._

/**
 * Example of the `keyBy` transformation: reads sensor records from a text
 * file, converts them to `WaterSensor` and groups the stream by sensor id.
 */
object Transform_KeyBy {

  def main(args: Array[String]): Unit = {

    // 1. Create the execution environment and default to parallelism 1.
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // 2. Read the input file as a stream of strings.
    val sensorDS: DataStream[String] = env.readTextFile("input/sensor-data.log")

    // 3. Convert every line into the case class.
    val mapDS = sensorDS.map { line =>
      val fields = line.split(",")
      // vc is declared as Double, so parse it as one: toInt would throw on a
      // fractional reading such as "10.5".
      WaterSensor(fields(0), fields(1).toLong, fields(2).toDouble)
    }

    // 4. Group with keyBy.
    // Notes on the type of the returned key:
    //   1. With a position index or a field name the program cannot infer the
    //      key type, so the key is a Java Tuple.
    //   2. With an anonymous function or a function class the key type can be
    //      inferred; this is the recommended form.
    // About grouping: it is purely logical -- records are tagged with their
    // key -- and has no fixed relationship to parallelism.
    //   - Records of the same group always stay together.
    //   - One partition may hold several different groups.

    // Alternative: key by position index.
    //    val sensorKS: KeyedStream[WaterSensor, Tuple] = mapDS.keyBy(0)
    // Alternative: key by field name.
    //    val sensorKS: KeyedStream[WaterSensor, Tuple] = mapDS.keyBy("id")
    val sensorKS: KeyedStream[WaterSensor, String] = mapDS.keyBy(_.id)
    // Alternative: key with an explicit KeySelector.
    //    val sensorKS: KeyedStream[WaterSensor, String] = mapDS.keyBy(
    //      new KeySelector[WaterSensor, String] {
    //        override def getKey(value: WaterSensor): String = {
    //          value.id
    //        }
    //      }
    //    )

    // 5. Print the keyed stream; the print sink is given parallelism 5, unlike
    //    the environment default of 1 set above.
    sensorKS.print().setParallelism(5)

    // 6. Trigger execution.
    env.execute()
  }

  /**
   * Water-level sensor record used to carry one parsed input line.
   *
   * @param id sensor identifier
   * @param ts timestamp
   * @param vc measured clearance height ("空高" in the original notes)
   */
  case class WaterSensor(id: String, ts: Long, vc: Double)
}

你可能感兴趣的:(flink,学习,scala)