import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.api.windowing.time.Time
object FlinkSource1 {
  /**
   * Word count over a socket text stream, aggregated in 5-second
   * tumbling processing-time windows.
   */
  def main(args: Array[String]): Unit = {
    // Obtain the streaming execution environment (program entry point).
    val streamExecution: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // Read raw lines from a socket source on node01:8000.
    val socketText: DataStream[String] = streamExecution.socketTextStream("node01",8000)
    // NOTE: this implicit import is required, otherwise the flatMap call
    // below fails to compile (missing TypeInformation evidence).
    import org.apache.flink.api.scala._
    val result: DataStream[(String, Int)] = socketText.flatMap(x => x.split(" "))
      .map(x => (x, 1))
      .keyBy(0)
      // Original used timeWindow(5s, 5s): a sliding window whose slide equals
      // its size, i.e. effectively tumbling. Use the single-argument tumbling
      // form, which matches the intent "aggregate the last 5 seconds of data".
      .timeWindow(Time.seconds(5))
      .sum(1)
    // Print results through a single sink task.
    result.print().setParallelism(1)
    // Trigger job execution.
    streamExecution.execute()
  }
}
<repositories>
<repository>
<id>cloudera</id>
<url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
</repository>
</repositories>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.6.0-mr1-cdh5.14.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.6.0-cdh5.14.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.6.0-cdh5.14.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>2.6.0-cdh5.14.2</version>
</dependency>
object FlinkSource2 {
  /** Word count over text files read from an HDFS directory. */
  def main(args: Array[String]): Unit = {
    val executionEnvironment: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    // Read every file under the given HDFS path as lines of text.
    val hdfStream: DataStream[String] = executionEnvironment.readTextFile("hdfs://node01:8020/flink_input/")
    // Split each line into words, pair each word with a count of 1,
    // then sum the counts per word.
    val words = hdfStream.flatMap(line => line.split(" "))
    val pairs = words.map(word => (word, 1))
    val result: DataStream[(String, Int)] = pairs.keyBy(0).sum(1)
    result.print().setParallelism(1)
    executionEnvironment.execute("hdfsSource")
  }
}
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
object FlinkSource3 {
  /** Word count over a small in-memory collection of strings. */
  def main(args: Array[String]): Unit = {
    val environment: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    // Build a bounded stream from literal elements.
    val value: DataStream[String] = environment.fromElements[String]("hadoop hive","spark flink")
    // Tokenize, map to (word, 1), and keep a running sum per word.
    val result2: DataStream[(String, Int)] =
      value
        .flatMap(line => line.split(" "))
        .map(word => (word, 1))
        .keyBy(0)
        .sum(1)
    result2.print().setParallelism(1)
    environment.execute()
  }
}
如果flink自带的一些数据源还不能满足工作需要的话,我们还可以自定义数据源。
flink提供了大量已经实现好的source方法,你也可以自定义source:
通过实现SourceFunction接口来自定义无并行度的source,
或者通过实现ParallelSourceFunction接口或继承RichParallelSourceFunction来自定义有并行度的source。
第一步:自定义class类实现SourceFunction接口
/** Custom non-parallel source that endlessly emits the string "hello world". */
class MySource extends SourceFunction[String]{
  // FIX: cancel() is invoked by the framework from a different thread than
  // run(); without @volatile the write to this flag may never become visible
  // to the run() loop, so the source could never stop.
  @volatile var isRunning:Boolean = true

  // Emit records until the source is cancelled.
  override def run(sourceContext: SourceFunction.SourceContext[String]): Unit = {
    while (isRunning){
      sourceContext.collect("hello world")
    }
  }

  // Framework callback: request the emitting loop to terminate.
  override def cancel(): Unit = {
    isRunning = false
  }
}
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.api.windowing.time.Time
object FlinkSource4 {
  /**
   * Word count over the custom MySource, aggregated in 1-second keyed
   * tumbling windows.
   */
  def main(args: Array[String]): Unit = {
    val environment: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    // Attach the custom (non-parallel) source.
    val result: DataStream[String] = environment.addSource(new MySource)
    val tupleResult: DataStream[(String, Int)] = result.flatMap(x => x.split(" ")).map(x => (x, 1))
      .keyBy(0)
      // BUG FIX: the original called timeWindowAll after keyBy, which discards
      // the keying and windows the whole stream globally; use the keyed
      // timeWindow so counts stay per word.
      // (Original comment said "every two seconds" but the code uses 1 second;
      // kept at 1 second and corrected the comment.)
      .timeWindow(Time.seconds(1)) // process the last 1 second of data every second
      .sum(1)
    tupleResult.print().setParallelism(1)
    environment.execute()
  }
}
import org.apache.flink.streaming.api.functions.source.{ParallelSourceFunction, SourceFunction}
/** Custom parallel source that endlessly emits the string "hello world". */
class MyParalleSource extends ParallelSourceFunction[String] {
  // @volatile: cancel() runs on a different thread than run(), so the flag
  // write must be visible across threads for cancellation to take effect.
  @volatile var isRunning:Boolean = true

  // BUG FIX: the original looped on `while (true)`, so cancel() set the flag
  // but the loop never observed it and the source could not be stopped.
  // Loop on the cancellation flag instead, matching MySource.
  override def run(sourceContext: SourceFunction.SourceContext[String]): Unit = {
    while (isRunning){
      sourceContext.collect("hello world")
    }
  }

  // Framework callback: request the emitting loop to terminate.
  override def cancel(): Unit = {
    isRunning = false
  }
}
// Step 2: use the custom data source.
object FlinkSource5 {
  /** Word count over the parallel custom source, printed with parallelism 2. */
  def main(args: Array[String]): Unit = {
    val environment: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    // Attach the parallel custom source.
    val sourceStream: DataStream[String] = environment.addSource(new MyParalleSource)
    // Tokenize, map each word to (word, 1), and keep a running per-word sum.
    val result: DataStream[(String, Int)] =
      sourceStream
        .flatMap(line => line.split(" "))
        .map(word => (word, 1))
        .keyBy(0)
        .sum(1)
    result.print().setParallelism(2)
    environment.execute("paralleSource")
  }
}