Apache Flink: DataStream DataSources and DataSinks (Part 2)

DataStream DataSources and DataSinks

DataSource

A DataSource is where Flink's data comes from. A SourceFunction is added to a program via env.addSource(SourceFunction). Flink ships with many ready-made SourceFunction implementations, but users can also implement the SourceFunction interface (non-parallel) or the ParallelSourceFunction interface (parallel) themselves, and, when state needs to be managed, the RichParallelSourceFunction interface.

File Based

  • readTextFile(path) - reads a text file line by line (once), i.e. a file that conforms to the TextInputFormat specification, and returns each line as a String.
//1. Create the streaming execution environment
val env = StreamExecutionEnvironment.getExecutionEnvironment
//2. Create the DataStream
val text: DataStream[String] = env.readTextFile("hdfs://CentOS:9000/demo/words")
//3. Apply transformation operators to the DataStream
val counts = text.flatMap(line => line.split("\\s+"))
  .map(word => (word, 1))
  .keyBy(0)
  .sum(1)
//4. Print the results to the console
counts.print()
//5. Execute the streaming job
env.execute("Window Stream WordCount")
  • readFile(fileInputFormat, path) - reads a file (once) according to the specified file input format.
//1. Create the streaming execution environment
val env = StreamExecutionEnvironment.getExecutionEnvironment
//2. Create the DataStream
val inputFormat: FileInputFormat[String] = new TextInputFormat(null)
val text: DataStream[String] = env.readFile(inputFormat, "hdfs://CentOS:9000/demo/words")
//3. Apply transformation operators to the DataStream
val counts = text.flatMap(line => line.split("\\s+"))
  .map(word => (word, 1))
  .keyBy(0)
  .sum(1)
//4. Print the results to the console
counts.print()
//5. Execute the streaming job
env.execute("Window Stream WordCount")
  • readFile(fileInputFormat, path, watchType, interval, pathFilter, typeInfo) - monitors the files under the given directory; when a file changes, the system reads it again, which may cause records of that file to be processed more than once. In general, do not modify existing files; upload new files instead.
//1. Create the streaming execution environment
val env = StreamExecutionEnvironment.getExecutionEnvironment
//2. Create the DataStream
val inputFormat: FileInputFormat[String] = new TextInputFormat(null)
val text: DataStream[String] = env.readFile(inputFormat,
  "hdfs://CentOS:9000/demo/words", FileProcessingMode.PROCESS_CONTINUOUSLY, 1000)
//3. Apply transformation operators to the DataStream
val counts = text.flatMap(line => line.split("\\s+"))
  .map(word => (word, 1))
  .keyBy(0)
  .sum(1)
//4. Print the results to the console
counts.print()
//5. Execute the streaming job
env.execute("Window Stream WordCount")

Socket Based

  • socketTextStream - reads from a socket; generally used for testing.
//1. Create the streaming execution environment
val env = StreamExecutionEnvironment.getExecutionEnvironment
//2. Create the DataStream
val text = env.socketTextStream("CentOS", 9999, '\n', 3)
//3. Apply transformation operators to the DataStream
val counts = text.flatMap(line => line.split("\\s+"))
  .map(word => (word, 1))
  .keyBy(0)
  .sum(1)
//4. Print the results to the console
counts.print()
//5. Execute the streaming job
env.execute("Window Stream WordCount")

Collection Based

  • fromCollection(Collection) - reads elements from a collection.
//1. Create the streaming execution environment
val env = StreamExecutionEnvironment.getExecutionEnvironment
//2. Create the DataStream
val text = env.fromCollection(List("this is a demo","hello word"))
//3. Apply transformation operators to the DataStream
val counts = text.flatMap(line => line.split("\\s+"))
  .map(word => (word, 1))
  .keyBy(0)
  .sum(1)
//4. Print the results to the console
counts.print()
//5. Execute the streaming job
env.execute("Window Stream WordCount")

Custom SourceFunction

  • SourceFunction
import org.apache.flink.streaming.api.functions.source.SourceFunction

import scala.util.Random

// user-defined non-parallel SourceFunction
class UserDefinedNonParallelSourceFunction extends SourceFunction[String] {

  @volatile // keep changes to the cancel flag visible across threads
  var isRunning: Boolean = true
  val lines: Array[String] = Array("this is a demo", "hello world", "ni hao ma")

  // run() emits data downstream through the SourceContext's collect() method
  override def run(ctx: SourceFunction.SourceContext[String]): Unit = {
    while (isRunning) {
      println(Thread.currentThread().getId)
      Thread.sleep(1000)
      // send one random line downstream
      ctx.collect(lines(new Random().nextInt(lines.length)))
    }
  }

  override def cancel(): Unit = {
    println("==========isRunning++++++++++")
    isRunning = false
  }
}
  • ParallelSourceFunction
import org.apache.flink.streaming.api.functions.source.{ParallelSourceFunction, SourceFunction}

import scala.util.Random

// user-defined parallel SourceFunction: one instance runs per parallel subtask
class UserDefinedParallelSourceFunction extends ParallelSourceFunction[String] {

  @volatile // keep changes to the cancel flag visible across threads
  var isRunning: Boolean = true
  var lines: Array[String] = Array("this is a demo", "i love you", "hello world")

  override def run(ctx: SourceFunction.SourceContext[String]): Unit = {
    while (isRunning) {
      println("current thread id: " + Thread.currentThread().getId)
      Thread.sleep(3000)
      ctx.collect(lines(new Random().nextInt(lines.length)))
    }
  }

  override def cancel(): Unit = {
    isRunning = false
  }
}
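  • RichParallelSourceFunction - mentioned in the introduction but not shown above. A minimal sketch (class name and log messages are illustrative; it reuses the same random-sentence data as the examples above):
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction}

import scala.util.Random

// A rich, parallel source: open()/close() lifecycle hooks and getRuntimeContext
// (subtask index, metrics, managed state) are available here, unlike in a plain
// SourceFunction or ParallelSourceFunction.
class UserDefinedRichParallelSourceFunction extends RichParallelSourceFunction[String] {

  @volatile // keep changes to the cancel flag visible across threads
  var isRunning: Boolean = true
  val lines: Array[String] = Array("this is a demo", "hello world", "ni hao ma")

  override def open(parameters: Configuration): Unit = {
    println("open, subtask: " + getRuntimeContext.getIndexOfThisSubtask)
  }

  override def run(ctx: SourceFunction.SourceContext[String]): Unit = {
    while (isRunning) {
      Thread.sleep(1000)
      // emit one random line downstream
      ctx.collect(lines(new Random().nextInt(lines.length)))
    }
  }

  override def cancel(): Unit = {
    isRunning = false
  }

  override def close(): Unit = {
    println("close")
  }
}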

Testing the Custom Source

// test the user-defined source
object UserDefinedSource {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val text = env.addSource(new UserDefinedParallelSourceFunction)
    val counts = text.flatMap(_.split(" "))
      .map((_, 1))
      .keyBy(0)
      .sum(1)
    counts.print()
    env.execute()
  }
}

KafkaSource

  • Add the Maven dependency
 
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka_2.11</artifactId>
    <version>1.10.0</version>
</dependency>
  • SimpleStringSchema - deserializes only the value of the Kafka record.
//1. Create the streaming execution environment
val env = StreamExecutionEnvironment.getExecutionEnvironment
//2. Create the DataStream from Kafka
val props = new Properties()
props.setProperty("bootstrap.servers", "CentOS:9092")
props.setProperty("group.id", "g1")
val text = env.addSource(new FlinkKafkaConsumer[String]("topic01", new SimpleStringSchema(), props))
//3. Apply transformation operators to the DataStream
val counts = text.flatMap(line => line.split("\\s+"))
  .map(word => (word, 1))
  .keyBy(0)
  .sum(1)
//4. Print the results to the console
counts.print()
//5. Execute the streaming job
env.execute("Window Stream WordCount")
  • KafkaDeserializationSchema
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.scala._
import org.apache.flink.streaming.connectors.kafka.KafkaDeserializationSchema
import org.apache.kafka.clients.consumer.ConsumerRecord

// user-defined deserialization schema: exposes (key, value, topic, partition, offset)
class UserDefinedKafkaDeserializationSchema extends KafkaDeserializationSchema[(String, String, String, Int, Long)] {
  override def isEndOfStream(t: (String, String, String, Int, Long)): Boolean = false

  override def deserialize(cr: ConsumerRecord[Array[Byte], Array[Byte]]): (String, String, String, Int, Long) = {
    if (cr.key() != null) {
      (new String(cr.key()), new String(cr.value()), cr.topic(), cr.partition(), cr.offset())
    } else {
      // fall back to a constant key when the record has no key
      ("key", new String(cr.value()), cr.topic(), cr.partition(), cr.offset())
    }
  }

  override def getProducedType: TypeInformation[(String, String, String, Int, Long)] = {
    createTypeInformation[(String, String, String, Int, Long)]
  }
}
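A minimal sketch of plugging the schema above into a FlinkKafkaConsumer (the object name is illustrative; broker and topic follow the other examples):
import java.util.Properties

import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer

object KafkaSourceWithCustomDeserialization {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val props = new Properties()
    props.setProperty("bootstrap.servers", "CentOS:9092")
    props.setProperty("group.id", "g1")
    // the stream carries (key, value, topic, partition, offset) tuples
    val records = env.addSource(new FlinkKafkaConsumer[(String, String, String, Int, Long)](
      "topic01", new UserDefinedKafkaDeserializationSchema(), props))
    records.print()
    env.execute("Kafka Consumer with custom deserialization")
  }
}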
  • JSONKeyValueDeserializationSchema - requires both the key and the value in the Kafka topic to be JSON; when constructing it, you can also specify whether to read the metadata (topic, partition, offset, etc.).
// Flink consuming JSON records from Kafka
object KafkaSource {
  def main(args: Array[String]): Unit = {
    // build the streaming execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(4)
    // Kafka consumer properties
    val prop = new Properties()
    prop.setProperty("bootstrap.servers", "CentOS:9092")
    prop.setProperty("group.id", "g1")
    // implicit conversions for the Scala DataStream API
    import org.apache.flink.api.scala._
    // Kafka source: JSONKeyValueDeserializationSchema(true) also exposes the metadata
    val kafkaSource = new FlinkKafkaConsumer[ObjectNode]("topic01",new JSONKeyValueDeserializationSchema(true),prop)
    // only the last start-position call before addSource takes effect
    kafkaSource.setStartFromEarliest()      // start from the earliest record possible
    kafkaSource.setStartFromLatest()        // start from the latest record
    //kafkaSource.setStartFromTimestamp(...)  // start from the given epoch timestamp (milliseconds)
    kafkaSource.setStartFromGroupOffsets()  // the default: resume from the committed group offsets
    val text = env.addSource(kafkaSource)
    //3. Apply transformation operators to the DataStream
    val counts = text.map(t => {
      println("meta:" + t)
      (t.get("value").get("id"), t.get("value").get("name"))
    })
    counts.print()
    println(env.getExecutionPlan)
    env.execute("Kafka Consumer")
  }
}

DataSink

Flink provides a number of built-in output formats; they are exposed as methods on DataStream:

File Based

  • writeAsText() / TextOutputFormat - writes elements line by line as strings, obtained by calling toString() on each element.
  • writeAsCsv(…) / CsvOutputFormat - writes tuples as comma-separated CSV files; row and field delimiters are configurable, and each field value comes from the object's toString() method (a short writeAsText/writeAsCsv sketch follows the example below).
  • writeUsingOutputFormat() / FileOutputFormat - method and base class for custom file output; supports custom object-to-bytes conversion.
// write the word counts to a file using an OutputFormat
object FileSink {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val text = env.socketTextStream("CentOS",9998)
    val counts = text.flatMap(_.split(" "))
      .map((_, 1))
      .keyBy(0)
      .sum(1)
    counts.print()
    counts.writeUsingOutputFormat(new TextOutputFormat[(String, Int)](new Path("file:///E:/flink_results")))
    env.execute()
  }
}
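For the writeAsText / writeAsCsv variants listed above, a minimal sketch (output paths are placeholders):
import org.apache.flink.streaming.api.scala._

object TextCsvSink {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    val counts = env.socketTextStream("CentOS", 9998)
      .flatMap(_.split("\\s+"))
      .map((_, 1))
      .keyBy(0)
      .sum(1)
    counts.writeAsText("file:///E:/flink_text_results")  // one toString() line per element
    counts.writeAsCsv("file:///E:/flink_csv_results")    // tuple fields as comma-separated rows
    env.execute()
  }
}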

When writing to HDFS you need to produce a fairly large amount of data before any output becomes visible, because the file system write buffer is quite large. Moreover, the file sinks above do not participate in Flink's checkpointing, so in production the flink-connector-filesystem connector is normally used to write to external systems.


<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-filesystem_2.11</artifactId>
    <version>1.10.0</version>
</dependency>

Newer API (StreamingFileSink)

// write to HDFS using the newer StreamingFileSink API
object FileSystemSinkNew {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    val text = env.socketTextStream("CentOS",9998)
    val counts = text.flatMap(_.split(" "))
      .map((_, 1))
      .keyBy(0)
      .sum(1)
    counts.print()
    // build the sink: row format with date-based bucketing
    val fileSink = StreamingFileSink.forRowFormat(new Path("hdfs://CentOS:9000/flink_results"), new SimpleStringEncoder[(String, Int)]("UTF-8"))
      .withBucketAssigner(new DateTimeBucketAssigner[(String, Int)]("yyyy-MM-dd"))
      .build()
    counts.addSink(fileSink)
    env.execute()
  }
}

Older API (BucketingSink)

// write to HDFS using the older BucketingSink API
object FileSystemSinkOld {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    val text = env.socketTextStream("CentOS",9998)
    val counts = text.flatMap(_.split(" "))
      .map((_, 1))
      .keyBy(0)
      .sum(1)
    counts.print("test")
    val bucketSink = new BucketingSink[(String,Int)]("hdfs://CentOS:9000/flink-results_bucket")
    bucketSink.setBucketer(new DateTimeBucketer[(String, Int)]("yyyy-MM-dd"))
    bucketSink.setBatchSize(1024)
    counts.addSink(bucketSink)
    env.execute()

  }
}

print() & printToErr()

Prints the toString() value of each element to standard output / standard error. Optionally, a prefix (msg) can be provided and is prepended to the output, which helps distinguish different print() calls. If the parallelism is greater than 1, the output is also prefixed with the identifier of the task that produced it.
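For example (a minimal sketch; the prefix strings are arbitrary):
import org.apache.flink.streaming.api.scala._

object PrintSinkDemo {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val counts = env.socketTextStream("CentOS", 9998)
      .flatMap(_.split("\\s+"))
      .map((_, 1))
      .keyBy(0)
      .sum(1)
    counts.print("wc")        // stdout; lines are prefixed with "wc" (plus the subtask id when parallelism > 1)
    counts.printToErr("err")  // the same records, written to stderr with the "err" prefix
    env.execute()
  }
}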

Custom SinkFunction

import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.sink.{RichSinkFunction, SinkFunction}

class UserDefinedSinkFunction extends RichSinkFunction[(String, Int)] {
  override def open(parameters: Configuration): Unit = {
    println("open connection")
  }

  override def invoke(value: (String, Int), context: SinkFunction.Context[_]): Unit = {
    println("output: " + value)
  }

  override def close(): Unit = {
    println("close connection")
  }
}
object UserDefinedSink {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    val text = env.socketTextStream("CentOS",9998)
    val counts = text.flatMap(_.split(" "))
      .map((_, 1))
      .keyBy(0)
      .sum(1)
    counts.addSink(new UserDefinedSinkFunction)
    env.execute()
  }
}

RedisSink


<dependency>
    <groupId>org.apache.bahir</groupId>
    <artifactId>flink-connector-redis_2.11</artifactId>
    <version>1.0</version>
</dependency>
// user-defined RedisMapper: stores each (word, count) as a field of the Redis hash "wordcounts"
class UserDefinedRedisMapper extends RedisMapper[(String,Int)]{
  override def getCommandDescription: RedisCommandDescription = {
    new RedisCommandDescription(RedisCommand.HSET,"wordcounts")
  }

  override def getKeyFromData(t: (String, Int)): String = t._1

  override def getValueFromData(t: (String, Int)): String = t._2.toString
}
// write the results to Redis
object RedisSink {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(2)
    val text = env.readTextFile("hdfs://CentOS:9000/demo/words")
    val counts = text.flatMap(_.split(" "))
      .map((_, 1))
      .keyBy(0)
      .sum(1)
    counts.print()
    val jedisConf = new FlinkJedisPoolConfig.Builder()
        .setHost("192.168.192.19")
        .setPort(6379)
        .build()
    counts.addSink(new RedisSink(jedisConf,new UserDefinedRedisMapper))
    env.execute()
  }
}

KafkaSink


<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka_2.11</artifactId>
    <version>1.10.0</version>
</dependency>

Option 1 - KafkaSerializationSchema

import java.lang
import org.apache.flink.streaming.connectors.kafka.KafkaSerializationSchema
import org.apache.kafka.clients.producer.ProducerRecord

class UserDefinedKafkaSerializationSchema extends KafkaSerializationSchema[(String, Int)] {
  override def serialize(element: (String, Int), timestamp: lang.Long): ProducerRecord[Array[Byte], Array[Byte]] = {
    new ProducerRecord("topic01", element._1.getBytes(), element._2.toString.getBytes())
  }
}

With this approach, the "default topic" argument of FlinkKafkaProducer is effectively meaningless, because serialize() already sets the target topic on every ProducerRecord.
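A minimal sketch of wiring this schema into a FlinkKafkaProducer (the object name and the default-topic string are placeholders):
import java.util.Properties

import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer

object KafkaSinkWithSerializationSchema {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val counts = env.socketTextStream("CentOS", 9998)
      .flatMap(_.split("\\s+"))
      .map((_, 1))
      .keyBy(0)
      .sum(1)
    val prop = new Properties()
    prop.setProperty("bootstrap.servers", "CentOS:9092")
    // the first argument is only a fallback topic: it is used when the
    // KafkaSerializationSchema does not set a topic on the ProducerRecord
    val kafkaSink = new FlinkKafkaProducer[(String, Int)](
      "default_topic",
      new UserDefinedKafkaSerializationSchema(),
      prop,
      FlinkKafkaProducer.Semantic.AT_LEAST_ONCE)
    counts.addSink(kafkaSink)
    env.execute()
  }
}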

Option 2 - KeyedSerializationSchema

class UserDefinedKeyedSerializationSchema extends KeyedSerializationSchema[(String,Int)]{
  override def serializeKey(element: (String, Int)): Array[Byte] = {
    println("key._1:" + element._1 + "key._2:" + element._2)
    element._1.getBytes()
  }

  override def serializeValue(element: (String, Int)): Array[Byte] = {
    println("value._1:" + element._1 + "value._2:" + element._2)
    element._2.toString.getBytes()
  }

  override def getTargetTopic(element: (String, Int)): String = "topic01"
}
// write the word counts to Kafka
object KafkaSink {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(2)
    val text = env.socketTextStream("CentOS",9998)
    val counts = text.flatMap(_.split(" "))
      .map((_, 1))
      .keyBy(0)
      .sum(1)
   // counts.print()
    val prop = new Properties()
    prop.setProperty(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG,"CentOS:9092")
    prop.setProperty(ProducerConfig.LINGER_MS_CONFIG,"500")
    prop.setProperty(ProducerConfig.BATCH_SIZE_CONFIG,"100")
    //sink
    val kafkaSink = new FlinkKafkaProducer[(String,Int)]("topic01",new UserDefinedKeyedSerializationSchema(),prop,Semantic.AT_LEAST_ONCE)
    kafkaSink.setWriteTimestampToKafka(true)
    counts.addSink(kafkaSink)
    env.execute()
  }
}
