Flink程序
步骤一:获取执行环境,获取的是程序入口
步骤二:获取数据源
步骤三:数据处理
Out.collect(new Tuple2<>(word,1))
Out.collect(Tuple2.of(word,1))这俩结果一样
步骤四:数据的输出
步骤五:启动应用程序
工作中算子参数,用面向对象的
抽离复杂的算子
实时需求分析
实时统计:每隔1秒统计最近2秒单词出现的次数
开发环境部署
<properties>
<flink.version>1.9.0</flink.version>
<scala.version>2.11.8</scala.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
</dependencies>
实时代码开发(java)
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;
public class WindowWordCountJava {
public static void main(String[] args) throws Exception {
//flink提供的工具类,获取传递的参数
ParameterTool parameterTool = ParameterTool.fromArgs(args);
String hostname = parameterTool.get("hostname");
int port = parameterTool.getInt("port");
//步骤一:获取执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//步骤二:获取数据源
DataStream<String> dataStream = env.socketTextStream(hostname, port);
//步骤三:执行逻辑操作
DataStream<WordCount> wordAndOneStream = dataStream.flatMap(new FlatMapFunction<String, WordCount>() {
public void flatMap(String line, Collector<WordCount> out) {
String[] fields = line.split(",");
for (String word : fields) {
out.collect(new WordCount(word, 1L));
}
}
});
DataStream<WordCount> resultStream = wordAndOneStream.keyBy("word")
.timeWindow(Time.seconds(2), Time.seconds(1))//每隔1秒计算最近2秒
.sum("count");
//步骤四:结果打印
resultStream.print();
//步骤五:任务启动
env.execute("WindowWordCountJava");
}
public static class WordCount{
public String word;
public long count;
//记得要有这个空构建
public WordCount(){
}
public WordCount(String word,long count){
this.word = word;
this.count = count;
}
@Override
public String toString() {
return "WordCount{" +
"word='" + word + '\'' +
", count=" + count +
'}';
}
}
}
实时代码开发(scala)
添加依赖
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
scala开发依赖和编译插件
<properties>
<flink.version>1.9.0</flink.version>
<scala.version>2.11.8</scala.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
</dependencies>
<build>
<pluginManagement>
<plugins>
<!-- scala插件 -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.2</version>
</plugin>
<!-- maven 插件 -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.5.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</pluginManagement>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>process-resources</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>compile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.3</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
代码实现
import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.windowing.time.Time
/**
 * Sliding-window word count: every 1 second, prints to the console the
 * counts of words seen during the last 2 seconds.
 *
 * Expects --hostname and --port arguments identifying a socket text source
 * that emits comma-separated words.
 */
object WindowWordCountScala {
  def main(args: Array[String]): Unit = {
    // Parse the command line once and reuse the result
    // (the original called ParameterTool.fromArgs(args) twice).
    val params = ParameterTool.fromArgs(args)
    val hostname = params.get("hostname")
    val port = params.getInt("port")

    // Implicit TypeInformation conversions required by the Scala DataStream API.
    import org.apache.flink.api.scala._

    // Step 1: obtain the streaming execution environment.
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    // Step 2: create the socket text source.
    val textStream = env.socketTextStream(hostname, port)

    // Step 3: split on commas, pair each word with 1, then sum per word
    // over a 2-second window that slides every 1 second.
    val wordCountStream = textStream
      .flatMap(line => line.split(","))
      .map((_, 1))
      .keyBy(0)
      .timeWindow(Time.seconds(2), Time.seconds(1))
      .sum(1)

    // Step 4: print the windowed counts.
    wordCountStream.print()

    // Step 5: launch the streaming job.
    env.execute("WindowWordCountScala")
  }
}
离线代码开发(java)
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.AggregateOperator;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.operators.FlatMapOperator;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
/**
 * Batch word count over a local text file using Flink's DataSet API.
 * Splits each line on commas, counts occurrences per word, and writes
 * the result as text with parallelism 1 (a single output file).
 */
public class WordCount {
    public static void main(String[] args) throws Exception {
        // Step 1: obtain the batch execution environment (program entry point).
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        String inputPath = "D:\\input\\hello.txt";

        // Step 2: read the input file as a data set of lines.
        DataSource<String> lines = env.readTextFile(inputPath);

        // Step 3: tokenize each line on commas and emit (word, 1) tuples.
        FlatMapOperator<String, Tuple2<String, Integer>> wordsWithOne =
                lines.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
                    @Override
                    public void flatMap(String line, Collector<Tuple2<String, Integer>> out) throws Exception {
                        for (String word : line.split(",")) {
                            out.collect(new Tuple2<String, Integer>(word, 1));
                        }
                    }
                });

        // Group by the word (tuple field 0) and sum the counts (field 1).
        AggregateOperator<Tuple2<String, Integer>> result = wordsWithOne.groupBy(0).sum(1);

        // Step 4: write the results; parallelism 1 avoids per-task output splits.
        result.writeAsText("D:\\kkb\\flinklesson\\src\\output\\result").setParallelism(1);

        // Step 5: launch the batch job.
        env.execute("word count");
    }
}
换一种写法,把flink的算子抽离出来,代码看起来会更清晰。
/**
 * Batch word count, same behavior as the previous version, but with the
 * flat-map logic extracted into a named operator class (MySplitWordsTask)
 * so the pipeline reads more clearly.
 */
public class WordCount {
    public static void main(String[] args) throws Exception {
        // Step 1: obtain the batch execution environment.
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        String inputPath = "D:\\input\\hello.txt";

        // Step 2: read the input file as a data set of lines.
        DataSource<String> lines = env.readTextFile(inputPath);

        // Step 3: apply the extracted splitting operator, then group by the
        // word (tuple field 0) and sum the counts (field 1).
        FlatMapOperator<String, Tuple2<String, Integer>> wordsWithOne = lines.flatMap(new MySplitWordsTask());
        AggregateOperator<Tuple2<String, Integer>> result = wordsWithOne.groupBy(0).sum(1);

        // Step 4: write the results with a single output file.
        result.writeAsText("D:\\kkb\\flinklesson\\src\\output\\result1").setParallelism(1);

        // Step 5: launch the batch job.
        env.execute("word count");
    }

    /** Splits a comma-separated line into (word, 1) tuples. */
    public static class MySplitWordsTask implements FlatMapFunction<String, Tuple2<String, Integer>> {
        @Override
        public void flatMap(String line, Collector<Tuple2<String, Integer>> out) throws Exception {
            for (String word : line.split(",")) {
                out.collect(new Tuple2<String, Integer>(word, 1));
            }
        }
    }
}