DataSource指定了流计算的输入,用户可以通过StreamExecutionEnvironment.addSource(sourceFunction)添加数据源。
Flink已经预先提供了一些DataSource的实现;如果用户需要自定义实现,可以实现SourceFunction
接口(非并行Source),或者实现ParallelSourceFunction
接口(并行Source),或者继承RichParallelSourceFunction。
readTextFile(path)
- 读取文本文件,底层通过TextInputFormat逐行读取文件数据,
返回一个DataStream[String];文件仅被处理一次。
// Step 1: obtain the streaming execution environment.
val fsEnv = StreamExecutionEnvironment.getExecutionEnvironment

// Step 2: build the source stream; readTextFile yields the file content as a DataStream[String].
val inputPath = "file:///D:\\data"
val lines: DataStream[String] = fsEnv.readTextFile(inputPath)

// Step 3: word count - split on whitespace, pair each word with 1,
// key by the word (tuple field 0) and keep a running sum of the counts (tuple field 1).
val words = lines.flatMap(line => line.split("\\s+"))
val pairs = words.map(word => (word, 1))
val counts = pairs.keyBy(0).sum(1)
counts.print()

// Submit the job; nothing runs until execute is called.
fsEnv.execute("FlinkWordCountsQuickStart")
readFile(fileInputFormat, path)
- 按照指定的输入格式(FileInputFormat)读取文件;文件仅被处理一次。
// Step 1: obtain the streaming execution environment.
val fsEnv = StreamExecutionEnvironment.getExecutionEnvironment

// Step 2: build the source stream with an explicit input format.
// The format is constructed without a path (null); the path is handed to readFile instead.
val textFormat = new TextInputFormat(null)
val inputPath = "file:///D:\\data"
val lines: DataStream[String] = fsEnv.readFile(textFormat, inputPath)

// Step 3: word count - split on whitespace, pair each word with 1,
// key by the word (tuple field 0) and keep a running sum of the counts (tuple field 1).
val words = lines.flatMap(line => line.split("\\s+"))
val pairs = words.map(word => (word, 1))
val counts = pairs.keyBy(0).sum(1)
counts.print()

// Submit the job; nothing runs until execute is called.
fsEnv.execute("FlinkWordCountsQuickStart")
readFile(fileInputFormat, path, watchType, interval, pathFilter)
- 以上两个方法底层调用都是该方法。
// Step 1: obtain the streaming execution environment.
val fsEnv = StreamExecutionEnvironment.getExecutionEnvironment

// Step 2: build the source stream with an input format, a path filter and a watch mode.
val textFormat = new TextInputFormat(null)
textFormat.setFilesFilter(new FilePathFilter {
  // Returns true for files whose name starts with "1"; per Flink's FilePathFilter
  // contract a true result means the path is filtered out (not read).
  override def filterPath(path: Path): Boolean = path.getName.startsWith("1")
})
val inputPath = "file:///D:\\data"
// PROCESS_CONTINUOUSLY keeps monitoring the path; 1000 is the scan interval
// (milliseconds according to the Flink readFile documentation).
val lines: DataStream[String] = fsEnv.readFile(
  textFormat,
  inputPath,
  FileProcessingMode.PROCESS_CONTINUOUSLY,
  1000)

// Step 3: word count - split on whitespace, pair each word with 1,
// key by the word (tuple field 0) and keep a running sum of the counts (tuple field 1).
val words = lines.flatMap(line => line.split("\\s+"))
val pairs = words.map(word => (word, 1))
val counts = pairs.keyBy(0).sum(1)
counts.print()

// Submit the job; nothing runs until execute is called.
fsEnv.execute("FlinkWordCountsQuickStart")
PROCESS_CONTINUOUSLY模式会定期地扫描文件;如果文件内容被修改了,该文件会被完整地重新读取,因此可能会产生重复计算。
// Step 1: obtain the streaming execution environment.
val fsEnv = StreamExecutionEnvironment.getExecutionEnvironment

// Step 2: build the source stream from an in-memory collection of sentences.
val sentences = List("this is a demo","hello world")
val lines: DataStream[String] = fsEnv.fromCollection(sentences)

// Step 3: word count - split on whitespace, pair each word with 1,
// key by the word (tuple field 0) and keep a running sum of the counts (tuple field 1).
val words = lines.flatMap(line => line.split("\\s+"))
val pairs = words.map(word => (word, 1))
val counts = pairs.keyBy(0).sum(1)
counts.print()

// Submit the job; nothing runs until execute is called.
fsEnv.execute("FlinkWordCountsQuickStart")
/**
 * Demo parallel source: roughly once per second it emits one sentence picked at random
 * from a fixed sample array, and keeps doing so until cancel() is called.
 */
class UserDefineParallelSourceFunction extends ParallelSourceFunction[String]{
  // Sample sentences the source chooses from.
  val lines=Array("this is a demo","hello world","hello flink")

  // Loop flag cleared by cancel(); @volatile so the write is visible to the loop in run()
  // when the two methods execute on different threads.
  @volatile
  var isRunning=true

  /**
   * Emits a random sample sentence every second while the source is running.
   *
   * @param sourceContext context used to emit the generated elements
   */
  override def run(sourceContext: SourceFunction.SourceContext[String]): Unit = {
    // One generator per run() call instead of a fresh allocation for every element.
    val random = new Random()
    while (isRunning){
      Thread.sleep(1000)
      sourceContext.collect(lines(random.nextInt(lines.length)))
    }
  }

  /** Asks the emit loop in run() to stop after its current iteration. */
  override def cancel(): Unit = {
    isRunning=false
  }
}
// Step 1: obtain the streaming execution environment.
val fsEnv = StreamExecutionEnvironment.getExecutionEnvironment

// Step 2: build the source stream from the user-defined parallel source
// and request a parallelism of 10 for it.
val customSource = new UserDefineParallelSourceFunction
val lines: DataStream[String] = fsEnv.addSource(customSource)
lines.setParallelism(10)

// Step 3: word count - split on whitespace, pair each word with 1,
// key by the word (tuple field 0) and keep a running sum of the counts (tuple field 1).
val words = lines.flatMap(line => line.split("\\s+"))
val pairs = words.map(word => (word, 1))
val counts = pairs.keyBy(0).sum(1)
counts.print()

// Submit the job; nothing runs until execute is called.
fsEnv.execute("FlinkWordCountsQuickStart")
<dependency>
  <groupId>org.apache.flink</groupId>
  <artifactId>flink-connector-kafka_2.11</artifactId>
  <version>1.8.1</version>
</dependency>
// Step 1: obtain the streaming execution environment.
val fsEnv = StreamExecutionEnvironment.getExecutionEnvironment

// Step 2: build the source stream from a Kafka topic.
// Consumer settings: broker address and consumer group id.
val kafkaProps = new Properties()
kafkaProps.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "Spark:9092")
kafkaProps.setProperty(ConsumerConfig.GROUP_ID_CONFIG, "g1")

// SimpleStringSchema turns each record into a plain String.
val valueSchema = new SimpleStringSchema()
val kafkaSource = new FlinkKafkaConsumer[String]("topic01", valueSchema, kafkaProps)
val lines: DataStream[String] = fsEnv.addSource(kafkaSource)
lines.setParallelism(10)

// Step 3: word count - split on whitespace, pair each word with 1,
// key by the word (tuple field 0) and keep a running sum of the counts (tuple field 1).
val words = lines.flatMap(line => line.split("\\s+"))
val pairs = words.map(word => (word, 1))
val counts = pairs.keyBy(0).sum(1)
counts.print()

// Submit the job; nothing runs until execute is called.
fsEnv.execute("FlinkWordCountsQuickStart")
SimpleStringSchema只能获取value信息;如果用户需要获取key/offset/partition信息,则需要自定义
KafkaDeserializationSchema。
获取Record元数据信息
/**
 * Kafka deserialization schema that exposes record metadata together with the payload.
 *
 * Every record is turned into the tuple (partition, offset, topic, key, value).
 * Key and value are decoded as UTF-8 text; a null key or a null value is mapped to
 * the empty string.
 */
class UserDefineKafkaDeserializationSchema
  extends KafkaDeserializationSchema[(Int,Long,String,String,String)]{

  /** Always false: no element is treated as the end of the stream. */
  override def isEndOfStream(t: (Int, Long, String, String, String)): Boolean = false

  /**
   * Converts one raw Kafka record into (partition, offset, topic, key, value).
   *
   * @param r the raw record with byte-array key and value
   * @return the record metadata plus the decoded key and value
   */
  override def deserialize(r: ConsumerRecord[Array[Byte], Array[Byte]]): (Int, Long, String, String, String) =
    (r.partition(), r.offset(), r.topic(), decode(r.key()), decode(r.value()))

  /** Tells Flink the type of the produced elements. */
  override def getProducedType: TypeInformation[(Int, Long, String, String, String)] = {
    createTypeInformation[(Int, Long, String, String, String)]
  }

  // Decodes bytes as UTF-8 text so the result is the actual string rather than a listing
  // of byte values, and does not depend on the JVM default charset; null becomes "".
  private def decode(bytes: Array[Byte]): String =
    Option(bytes).fold("")(b => new String(b, java.nio.charset.StandardCharsets.UTF_8))
}
// Step 1: obtain the streaming execution environment.
val fsEnv = StreamExecutionEnvironment.getExecutionEnvironment

// Step 2: build the source stream from a Kafka topic.
// Consumer settings: broker address and consumer group id.
val kafkaProps = new Properties()
kafkaProps.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "Spark:9092")
kafkaProps.setProperty(ConsumerConfig.GROUP_ID_CONFIG, "g1")

// The user-defined schema yields (Int, Long, String, String, String) tuples per record.
val metadataSchema = new UserDefineKafkaDeserializationSchema()
val kafkaSource = new FlinkKafkaConsumer[(Int,Long,String,String,String)](
  "topic01",
  metadataSchema,
  kafkaProps)
val records: DataStream[(Int,Long,String,String,String)] = fsEnv.addSource(kafkaSource)

// Print every record tuple as it arrives.
records.print()

// Submit the job; nothing runs until execute is called.
fsEnv.execute("FlinkWordCountsQuickStart")