A DataSource is where a Flink job's data comes from. A SourceFunction is added to the program via env.addSource(SourceFunction). Flink ships with many ready-made SourceFunction implementations, but users can also implement the SourceFunction interface themselves (non-parallel), implement the ParallelSourceFunction interface (parallel), or, when state needs to be managed, extend RichParallelSourceFunction (a sketch of that variant follows the parallel source example below).
//1. Create the stream execution environment
val env = StreamExecutionEnvironment.getExecutionEnvironment
//2. Create the DataStream - read a text file from HDFS once
val text:DataStream[String] = env.readTextFile("hdfs://CentOS:9000/demo/words")
//3. Apply the DataStream transformation operators
val counts = text.flatMap(line=>line.split("\\s+"))
  .map(word=>(word,1))
  .keyBy(0)
  .sum(1)
//4. Print the result to the console
counts.print()
//5. Execute the streaming job
env.execute("Window Stream WordCount")
//1. Create the stream execution environment
val env = StreamExecutionEnvironment.getExecutionEnvironment
//2. Create the DataStream - read a file once using an explicit FileInputFormat
val inputFormat:FileInputFormat[String] = new TextInputFormat(null)
val text:DataStream[String] =
  env.readFile(inputFormat,"hdfs://CentOS:9000/demo/words")
//3. Apply the DataStream transformation operators
val counts = text.flatMap(line=>line.split("\\s+"))
  .map(word=>(word,1))
  .keyBy(0)
  .sum(1)
//4. Print the result to the console
counts.print()
//5. Execute the streaming job
env.execute("Window Stream WordCount")
//1. Create the stream execution environment
val env = StreamExecutionEnvironment.getExecutionEnvironment
//2. Create the DataStream - continuously monitor the path, re-scanning every 1000 ms
val inputFormat:FileInputFormat[String] = new TextInputFormat(null)
val text:DataStream[String] = env.readFile(inputFormat,
  "hdfs://CentOS:9000/demo/words",FileProcessingMode.PROCESS_CONTINUOUSLY,1000)
//3. Apply the DataStream transformation operators
val counts = text.flatMap(line=>line.split("\\s+"))
  .map(word=>(word,1))
  .keyBy(0)
  .sum(1)
//4. Print the result to the console
counts.print()
//5. Execute the streaming job
env.execute("Window Stream WordCount")
//1. Create the stream execution environment
val env = StreamExecutionEnvironment.getExecutionEnvironment
//2. Create the DataStream - read from a socket (host, port, delimiter, max retries)
val text = env.socketTextStream("CentOS", 9999, '\n', 3)
//3. Apply the DataStream transformation operators
val counts = text.flatMap(line=>line.split("\\s+"))
  .map(word=>(word,1))
  .keyBy(0)
  .sum(1)
//4. Print the result to the console
counts.print()
//5. Execute the streaming job
env.execute("Window Stream WordCount")
//1. Create the stream execution environment
val env = StreamExecutionEnvironment.getExecutionEnvironment
//2. Create the DataStream - from an in-memory collection
val text = env.fromCollection(List("this is a demo","hello word"))
//3. Apply the DataStream transformation operators
val counts = text.flatMap(line=>line.split("\\s+"))
  .map(word=>(word,1))
  .keyBy(0)
  .sum(1)
//4. Print the result to the console
counts.print()
//5. Execute the streaming job
env.execute("Window Stream WordCount")
import org.apache.flink.streaming.api.functions.source.SourceFunction
import scala.util.Random

// User-defined non-parallel SourceFunction
class UserDefinedNonParallelSourceFunction extends SourceFunction[String] {
  @volatile // keep the flag visible across threads
  var isRunning:Boolean = true
  val lines:Array[String] = Array("this is a demo","hello world","ni hao ma")
  // run() emits data downstream through SourceContext.collect
  override def run(ctx: SourceFunction.SourceContext[String]): Unit = {
    while(isRunning){
      println(Thread.currentThread().getId)
      Thread.sleep(1000)
      // emit a random line downstream
      ctx.collect(lines(new Random().nextInt(lines.length)))
    }
  }
  override def cancel(): Unit = {
    println("==========isRunning++++++++++")
    isRunning = false
  }
}
import org.apache.flink.streaming.api.functions.source.{ParallelSourceFunction, SourceFunction}
import scala.util.Random

// User-defined parallel SourceFunction: one instance runs per parallel subtask
class UserDefinedParallelSourceFunction extends ParallelSourceFunction[String]{
  @volatile // keep the flag visible across threads
  var isRunning:Boolean = true
  val lines:Array[String] = Array("this is a demo","i love you","hello world")
  override def run(ctx: SourceFunction.SourceContext[String]): Unit = {
    while(isRunning){
      println("current thread id: " + Thread.currentThread().getId)
      Thread.sleep(3000)
      ctx.collect(lines(new Random().nextInt(lines.length)))
    }
  }
  override def cancel(): Unit = {
    isRunning = false
  }
}
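The introduction above also mentions RichParallelSourceFunction for sources that need to manage state or other runtime resources. The following is a minimal sketch of that variant, not part of the original examples (the class name and emitted lines are made up for illustration): it adds open()/close() lifecycle hooks and access to the runtime context on top of ParallelSourceFunction.
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction}
import scala.util.Random

// Hypothetical user-defined rich parallel SourceFunction
class UserDefinedRichParallelSourceFunction extends RichParallelSourceFunction[String] {
  @volatile var isRunning:Boolean = true
  val lines:Array[String] = Array("this is a demo","hello world")
  override def open(parameters: Configuration): Unit = {
    // open connections or initialise state here; the runtime context is available
    println("open, subtask " + getRuntimeContext.getIndexOfThisSubtask)
  }
  override def run(ctx: SourceFunction.SourceContext[String]): Unit = {
    while(isRunning){
      Thread.sleep(1000)
      ctx.collect(lines(new Random().nextInt(lines.length)))
    }
  }
  override def cancel(): Unit = {
    isRunning = false
  }
  override def close(): Unit = {
    println("close")
  }
}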
Testing the custom Source
import org.apache.flink.streaming.api.scala._

// Driver program for the user-defined sources
object UserDefinedSource {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val text = env.addSource(new UserDefinedParallelSourceFunction)
    val counts = text.flatMap(_.split(" "))
      .map((_, 1))
      .keyBy(0)
      .sum(1)
    counts.print()
    env.execute()
  }
}
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka_2.11</artifactId>
    <version>1.10.0</version>
</dependency>
//1. Create the stream execution environment
val env = StreamExecutionEnvironment.getExecutionEnvironment
//2. Create the DataStream - consume from Kafka
val props = new Properties()
props.setProperty("bootstrap.servers", "CentOS:9092")
props.setProperty("group.id", "g1")
val text = env.addSource(new FlinkKafkaConsumer[String]("topic01",new SimpleStringSchema(),props))
//3. Apply the DataStream transformation operators
val counts = text.flatMap(line=>line.split("\\s+"))
  .map(word=>(word,1))
  .keyBy(0)
  .sum(1)
//4. Print the result to the console
counts.print()
//5. Execute the streaming job
env.execute("Window Stream WordCount")
// User-defined deserialization schema: exposes (key, value, topic, partition, offset)
class UserDefinedKafkaDeserializationSchema extends KafkaDeserializationSchema[(String,String,String,Int,Long)]{
  override def isEndOfStream(t: (String, String, String, Int, Long)): Boolean = false
  override def deserialize(cr: ConsumerRecord[Array[Byte], Array[Byte]]): (String, String, String, Int, Long) = {
    if(cr.key() != null){
      (new String(cr.key()),new String(cr.value()),cr.topic(),cr.partition(),cr.offset())
    }else{
      ("key",new String(cr.value()),cr.topic(),cr.partition(),cr.offset())
    }
  }
  override def getProducedType: TypeInformation[(String, String, String, Int, Long)] = {
    createTypeInformation[(String, String, String, Int, Long)]
  }
}
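This schema can be plugged into FlinkKafkaConsumer in place of SimpleStringSchema. A minimal usage sketch, assuming the same props and Scala API imports as the earlier Kafka snippet:
// consume (key, value, topic, partition, offset) tuples with the custom schema
val detailed = env.addSource(new FlinkKafkaConsumer[(String, String, String, Int, Long)](
  "topic01", new UserDefinedKafkaDeserializationSchema(), props))
detailed.print()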
// Flink consuming data from Kafka
object KafkaSource {
  def main(args: Array[String]): Unit = {
    // build the stream execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(4)
    // Kafka source configuration
    val prop = new Properties()
    prop.setProperty("bootstrap.servers", "CentOS:9092")
    // consumer group id, used when resuming from committed group offsets
    prop.setProperty("group.id", "g1")
    // implicit conversions for the Scala API
    import org.apache.flink.api.scala._
    // Kafka source
    val kafkaSource = new FlinkKafkaConsumer[ObjectNode]("topic01",new JSONKeyValueDeserializationSchema(true),prop)
    // the start-position calls below override one another; only the last one takes effect
    kafkaSource.setStartFromEarliest()       // start from the earliest available record
    kafkaSource.setStartFromLatest()         // start from the latest record
    //kafkaSource.setStartFromTimestamp(...) // start from the given epoch timestamp (milliseconds)
    kafkaSource.setStartFromGroupOffsets()   // the default: resume from committed group offsets
    val text = env.addSource(kafkaSource)
    //3. Apply the DataStream transformation operators
    val counts = text.map(t => {
      println("meta:" + t)
      (t.get("value").get("id"), t.get("value").get("name"))
    })
    counts.print()
    println(env.getExecutionPlan)
    env.execute("Kafka Consumer")
  }
}
Flink provides a number of built-in output formats; they are exposed as methods on DataStream:
// Writing results to a file with Flink
object FileSink {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val text = env.socketTextStream("CentOS",9998)
    val counts = text.flatMap(_.split(" "))
      .map((_, 1))
      .keyBy(0)
      .sum(1)
    counts.print()
    counts.writeUsingOutputFormat(new TextOutputFormat[(String, Int)](new Path("file:///E:/flink_results")))
    env.execute()
  }
}
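Besides writeUsingOutputFormat, DataStream also offers a few other built-in convenience sinks. A minimal sketch (the paths are placeholders; these simple sinks are mainly intended for debugging and are not fault tolerant):
// other built-in convenience sinks
counts.writeAsText("file:///E:/flink_text_results") // one line per element, via toString()
counts.writeAsCsv("file:///E:/flink_csv_results")   // tuple streams only, fields comma separated
counts.printToErr()                                 // like print(), but on standard error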
When writing to HDFS you need to produce a fairly large amount of data before the output becomes visible, because the HDFS client buffers writes heavily. Moreover, the file sinks shown above do not participate in Flink's checkpointing, so in production the flink-connector-filesystem connector is normally used to write to external systems.
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-filesystem_2.11</artifactId>
    <version>1.10.0</version>
</dependency>
New API (StreamingFileSink)
// Write to HDFS - new API (StreamingFileSink)
object FileSystemSinkNew {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    val text = env.socketTextStream("CentOS",9998)
    val counts = text.flatMap(_.split(" "))
      .map((_, 1))
      .keyBy(0)
      .sum(1)
    counts.print()
    // build the sink: row format, one bucket per day
    val fileSink = StreamingFileSink.forRowFormat(new Path("hdfs://CentOS:9000/flink_results"), new SimpleStringEncoder[(String, Int)]("UTF-8"))
      .withBucketAssigner(new DateTimeBucketAssigner[(String, Int)]("yyyy-MM-dd"))
      .build()
    counts.addSink(fileSink)
    env.execute()
  }
}
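One practical note not stated above: with the row format, StreamingFileSink only rolls and commits part files on checkpoints, so checkpointing usually has to be enabled before finished files show up. A minimal sketch; the interval is an arbitrary example value:
// enable checkpointing so StreamingFileSink can commit finished part files
env.enableCheckpointing(5000) // checkpoint every 5 seconds (example value)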
Old API (BucketingSink)
// Write to HDFS - old API (BucketingSink)
object FileSystemSinkOld {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    val text = env.socketTextStream("CentOS",9998)
    val counts = text.flatMap(_.split(" "))
      .map((_, 1))
      .keyBy(0)
      .sum(1)
    counts.print("test")
    val bucketSink = new BucketingSink[(String,Int)]("hdfs://CentOS:9000/flink-results_bucket")
    bucketSink.setBucketer(new DateTimeBucketer[(String, Int)]("yyyy-MM-dd"))
    bucketSink.setBatchSize(1024)
    counts.addSink(bucketSink)
    env.execute()
  }
}
print() / printToErr() write the toString() value of each element to standard output / standard error. Optionally, a prefix (msg) can be supplied and is prepended to the output, which helps distinguish different print calls. If the parallelism is greater than 1, the output is also prefixed with the identifier of the task that produced it.
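A minimal sketch of the prefixed variants (the prefix strings are arbitrary):
counts.print("stdout-sink")      // output looks roughly like "stdout-sink:3> (hello,2)" when parallelism > 1
counts.printToErr("stderr-sink") // same format, written to standard error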
class UserDefinedSinkFunction extends RichSinkFunction[(String,Int)]{
  override def open(parameters: Configuration): Unit = {
    println("opening connection")
  }
  override def invoke(value: (String, Int), context: SinkFunction.Context[_]): Unit = {
    println("output: " + value)
  }
  override def close(): Unit = {
    println("closing connection")
  }
}
object UserDefinedSink {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    val text = env.socketTextStream("CentOS",9998)
    val counts = text.flatMap(_.split(" "))
      .map((_, 1))
      .keyBy(0)
      .sum(1)
    counts.addSink(new UserDefinedSinkFunction)
    env.execute()
  }
}
<dependency>
    <groupId>org.apache.bahir</groupId>
    <artifactId>flink-connector-redis_2.11</artifactId>
    <version>1.0</version>
</dependency>
// User-defined RedisMapper: store word counts in a Redis hash named "wordcounts"
class UserDefinedRedisMapper extends RedisMapper[(String,Int)]{
  override def getCommandDescription: RedisCommandDescription = {
    new RedisCommandDescription(RedisCommand.HSET,"wordcounts")
  }
  override def getKeyFromData(t: (String, Int)): String = t._1
  override def getValueFromData(t: (String, Int)): String = t._2 + ""
}
// Write results to Redis
object RedisSink {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(2)
    val text = env.readTextFile("hdfs://CentOS:9000/demo/words")
    val counts = text.flatMap(_.split(" "))
      .map((_, 1))
      .keyBy(0)
      .sum(1)
    counts.print()
    val jedisConf = new FlinkJedisPoolConfig.Builder()
      .setHost("192.168.192.19")
      .setPort(6379)
      .build()
    counts.addSink(new RedisSink(jedisConf,new UserDefinedRedisMapper))
    env.execute()
  }
}
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka_2.11</artifactId>
    <version>1.10.0</version>
</dependency>
Option 1 - KafkaSerializationSchema
class UserDefinedKafkaSerializationSchema extends KafkaSerializationSchema[(String,Int)]{
  override def serialize(element: (String, Int), timestamp: lang.Long): ProducerRecord[Array[Byte], Array[Byte]] = {
    new ProducerRecord("topic01",element._1.getBytes(),element._2.toString.getBytes())
  }
}
With this approach, the default topic passed to the FlinkKafkaProducer constructor is effectively meaningless, because the schema assigns a topic to every record.
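A minimal usage sketch for this variant, assuming the same prop and Semantic setup as the KafkaSink example below; the fallback topic name is arbitrary:
// the default topic is only a fallback; UserDefinedKafkaSerializationSchema already sets
// the topic on every ProducerRecord, so the fallback is never used
val kafkaSink = new FlinkKafkaProducer[(String,Int)](
  "default_topic",
  new UserDefinedKafkaSerializationSchema(),
  prop,
  Semantic.AT_LEAST_ONCE)
counts.addSink(kafkaSink)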
Option 2 - KeyedSerializationSchema
class UserDefinedKeyedSerializationSchema extends KeyedSerializationSchema[(String,Int)]{
  override def serializeKey(element: (String, Int)): Array[Byte] = {
    println("key._1:" + element._1 + "key._2:" + element._2)
    element._1.getBytes()
  }
  override def serializeValue(element: (String, Int)): Array[Byte] = {
    println("value._1:" + element._1 + "value._2:" + element._2)
    element._2.toString.getBytes()
  }
  override def getTargetTopic(element: (String, Int)): String = "topic01"
}
// Write results to Kafka
object KafkaSink {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(2)
    val text = env.socketTextStream("CentOS",9998)
    val counts = text.flatMap(_.split(" "))
      .map((_, 1))
      .keyBy(0)
      .sum(1)
    // counts.print()
    val prop = new Properties()
    prop.setProperty(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG,"CentOS:9092")
    prop.setProperty(ProducerConfig.LINGER_MS_CONFIG,"500")
    prop.setProperty(ProducerConfig.BATCH_SIZE_CONFIG,"100")
    // sink
    val kafkaSink = new FlinkKafkaProducer[(String,Int)]("topic01",new UserDefinedKeyedSerializationSchema(),prop,Semantic.AT_LEAST_ONCE)
    kafkaSink.setWriteTimestampToKafka(true)
    counts.addSink(kafkaSink)
    env.execute()
  }
}