<dependencies>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-scala_2.11</artifactId>
<version>1.7.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-streaming-scala-->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_2.11</artifactId>
<version>1.7.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- This plugin compiles the Scala sources into class files -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.2</version>
<executions>
<execution>
<!-- Bind these goals to Maven's compile phase -->
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.0.0</version>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
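With the scala-maven-plugin handling compilation and the assembly plugin bound to the package phase, a single Maven invocation compiles the Scala sources and produces both the plain jar and a *-jar-with-dependencies.jar under target/; the plain jar should correspond to the HelloFlink-1.0-SNAPSHOT.jar used in the submission commands further down (project and jar names assumed from those commands):
mvn clean package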
import org.apache.flink.api.scala._

val environment: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
// Read the input file line by line
val lineDataSet: DataSet[String] = environment.readTextFile("D:\\Resourse\\18_Flink\\datas\\input")
// Split each line on spaces into individual words
val wordDataSet: DataSet[String] = lineDataSet.flatMap(_.split(" "))
val wordAndNumDataSet: DataSet[(String, Int)] = wordDataSet.map((_, 1))
// 0 means group by the first element of the tuple (the word)
val groupDataSet: GroupedDataSet[(String, Int)] = wordAndNumDataSet.groupBy(0)
// 1 means sum the second element of the tuple (the count)
val aggrDataSet: AggregateDataSet[(String, Int)] = groupDataSet.sum(1)
// In the DataSet API, print() itself triggers execution, so no explicit execute() is needed here
aggrDataSet.print()
(hadoop,1)
(flink,2)
(hello,3)
(spark,3)
import org.apache.flink.api.java.tuple.Tuple
import org.apache.flink.streaming.api.scala._

val environment: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
// Read a text stream from the socket opened by nc on node102
val textDataStream: DataStream[String] = environment.socketTextStream("node102", 54321)
val flatMapStream: DataStream[String] = textDataStream.flatMap(_.split(" "))
val mapDataStream: DataStream[(String, Int)] = flatMapStream.map((_, 1))
// keyBy(0) partitions the stream on the first tuple field (the word)
val keyByDataStream: KeyedStream[(String, Int), Tuple] = mapDataStream.keyBy(0)
val reduceDataStream: DataStream[(String, Int)] = keyByDataStream.reduce((item1, item2) => (item1._1, item1._2 + item2._2))
// print() prefixes each record with the index of the parallel subtask that emitted it ("n>" in the output below)
reduceDataStream.print()
// A streaming job only starts running once execute() is called
environment.execute()
[bduser@node102 ~]$ nc -lk 54321
123
a a a
a a a
aa
a a
a
a
a
a
a
a
aafdsf
sdf s
g
f
13224
3> (a,1)
1> (123,1)
3> (a,2)
1> (f,1)
2> (g,1)
3> (,1)
3> (a,3)
3> (,2)
1> (aafdsf,1)
3> (a,4)
3> (a,5)
1> (sdf,1)
3> (a,6)
3> (,3)
3> (a,7)
3> (a,8)
3> (,4)
3> (a,9)
3> (a,10)
3> (a,11)
3> (a,12)
3> (,5)
3> (a,13)
3> (s,1)
3> (13224,1)
3> (aa,1)
3> (,6)
3> (a,14)
3> (,7)
3> (,8)
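The (,n) records above really are counts of the empty string: split(" ") keeps the empty tokens produced by a leading space or by consecutive spaces in the input. If those are unwanted, a small filter before counting removes them; a sketch against the streaming snippet above:
// Drop the empty tokens that leading or repeated spaces produce
val flatMapStream: DataStream[String] = textDataStream.flatMap(_.split(" ")).filter(_.nonEmpty)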
[bduser@node102 softwares]$ tar -zxvf flink-1.7.0-bin-hadoop27-scala_2.11.tgz -C /opt/modules
#FLINK_HOME
export FLINK_HOME=/opt/modules/flink-1.7.0
export PATH=$PATH:$FLINK_HOME/bin
[bduser@node102 ~]$ xcall echo $FLINK_HOME
----------------node102-------------------
/opt/modules/flink-1.7.0
----------------node103-------------------
/opt/modules/flink-1.7.0
----------------node104-------------------
/opt/modules/flink-1.7.0
[bduser@node102 ~]$ start-cluster.sh
Starting cluster.
Starting standalonesession daemon on host node102.
Starting taskexecutor daemon on host node102.
Starting taskexecutor daemon on host node103.
Starting taskexecutor daemon on host node104.
[bduser@node102 ~]$ xcall jps
----------------node102-------------------
2353 Jps
1800 StandaloneSessionClusterEntrypoint
2267 TaskManagerRunner
----------------node103-------------------
1681 Jps
1656 TaskManagerRunner
----------------node104-------------------
1601 Jps
1576 TaskManagerRunner
import org.apache.flink.api.java.tuple.Tuple
import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.streaming.api.scala._

// Read --host, --port and --output from the program arguments
val paramTool: ParameterTool = ParameterTool.fromArgs(args)
val host = paramTool.get("host")
val port = paramTool.getInt("port")
val outputPath = paramTool.get("output")
val environment: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
val textDataStream: DataStream[String] = environment.socketTextStream(host, port)
val flatMapStream: DataStream[String] = textDataStream.flatMap(_.split(" "))
val mapDataStream: DataStream[(String, Int)] = flatMapStream.map((_, 1))
val keyByDataStream: KeyedStream[(String, Int), Tuple] = mapDataStream.keyBy(0)
val reduceDataStream: DataStream[(String, Int)] = keyByDataStream.reduce((item1, item2) => (item1._1, item1._2 + item2._2)).setParallelism(10)
// Use a single writer so the result lands in one output file
reduceDataStream.writeAsText(outputPath).setParallelism(1)
environment.execute()
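ParameterTool can also supply defaults, so the job does not fail with a null host or a missing port when a flag is omitted; a minimal sketch (the fallback values here are only illustrative):
val host = paramTool.get("host", "localhost")
val port = paramTool.getInt("port", 54321)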
import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.api.scala._

// Read --input and --output from the program arguments
val paramTool: ParameterTool = ParameterTool.fromArgs(args)
val inputPath = paramTool.get("input")
val outputPath = paramTool.get("output")
val environment: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
val lineDataSet: DataSet[String] = environment.readTextFile(inputPath)
val wordDataSet: DataSet[String] = lineDataSet.flatMap(_.split(" "))
val wordAndNumDataSet: DataSet[(String, Int)] = wordDataSet.map((_, 1))
val groupDataSet: GroupedDataSet[(String, Int)] = wordAndNumDataSet.groupBy(0)
val aggrDataSet: AggregateDataSet[(String, Int)] = groupDataSet.sum(1)
// Note: the second positional argument of writeAsCsv is the row delimiter, not the field delimiter (see the output below)
aggrDataSet.writeAsCsv(outputPath, ",").setParallelism(1)
environment.execute()
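The snippet above is just the body of a main method; for the flink run -c com.nefu.flink.test.DataSetWordCount submission below to resolve, it has to live inside an object with that fully qualified name. A minimal sketch of such a wrapper (only the class name comes from the submission command, the rest simply mirrors the snippet):
package com.nefu.flink.test

import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.api.scala._

object DataSetWordCount {
  def main(args: Array[String]): Unit = {
    val paramTool = ParameterTool.fromArgs(args)
    val environment = ExecutionEnvironment.getExecutionEnvironment
    // Same word-count pipeline as above, written as one chained expression
    environment.readTextFile(paramTool.get("input"))
      .flatMap(_.split(" "))
      .map((_, 1))
      .groupBy(0)
      .sum(1)
      .writeAsCsv(paramTool.get("output"), ",")
      .setParallelism(1)
    environment.execute("DataSetWordCount")
  }
}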
[bduser@node102 datas]$ flink run -c com.nefu.flink.test.DataSetWordCount /home/bduser/HelloFlink-1.0-SNAPSHOT.jar --input /home/bduser/datas/space.txt --output ~/finkoutput
Starting execution of program
[bduser@node102 ~]$ cat finkoutput
-,1,echo,2,flink,1,hello,2,spark,1,[
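All of the counts ended up on a single line because, in the Scala DataSet API, the second positional argument of writeAsCsv is the row delimiter (the field delimiter is the third), so writeAsCsv(outputPath, ",") separates whole records with commas instead of newlines; the stray [ at the end is most likely just the next shell prompt starting on the same line, since the file ends without a newline. A sketch of the call that writes one word,count pair per line:
// Pass the delimiter by name so it applies to fields, not rows
aggrDataSet.writeAsCsv(outputPath, fieldDelimiter = ",").setParallelism(1)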
[bduser@node102 ~]$ start-yarn.sh
starting yarn daemons
starting resourcemanager, logging to /opt/modules/hadoop-2.7.6/logs/yarn-bduser-resourcemanager-node102.out
node102: starting nodemanager, logging to /opt/modules/hadoop-2.7.6/logs/yarn-bduser-nodemanager-node102.out
node104: starting nodemanager, logging to /opt/modules/hadoop-2.7.6/logs/yarn-bduser-nodemanager-node104.out
node103: starting nodemanager, logging to /opt/modules/hadoop-2.7.6/logs/yarn-bduser-nodemanager-node103.out
[bduser@node102 ~]$ yarn-session.sh -n 2 -s 2 -jm 1024 -tm 1024 -nm flinkdemo -d
Where:
-n (--container): number of TaskManagers.
-s (--slots): number of slots per TaskManager. By default one slot corresponds to one core and each TaskManager gets a single slot; it can make sense to run a few extra TaskManagers for redundancy.
-jm: memory for the JobManager (in MB).
-tm: memory per TaskManager (in MB).
-nm: the YARN application name (what shows up in the YARN UI).
-d: run detached (in the background).
[bduser@node102 ~]$ flink run -c com.nefu.flink.test.DataSetWordCount /home/bduser/HelloFlink-1.0-SNAPSHOT.jar --input /home/bduser/datas/space.txt --output ~/finkoutput