从文件中读取单词,计算频次
import org.apache.flink.api.scala.ExecutionEnvironment
object BatchWordCountJob {
def main(args: Array[String]) {
val env = ExecutionEnvironment.getExecutionEnvironment
val input ="/Users/mac126/Desktop/hello.txt";
val text = env.readTextFile(input)
import org.apache.flink.api.scala._
text.flatMap(_.toLowerCase().split("\t")).filter(_.nonEmpty).map((_,1))
.groupBy(0).sum(1).print()
}
最后可在控制台看到打印的内容
mport org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
/**
* java开发wordcount
*/
public class BatchWordCountJob {
public static void main(String[] args) throws Exception{
final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
String file="/Users/mac126/Desktop/hello.txt";
DataSet text = env.readTextFile(file);
text.flatMap(new FlatMapFunction>() {
@Override
public void flatMap(String value, Collector> collector) throws Exception {
String [] tokens = value.toLowerCase().split("\t");
for (String token :tokens){
if (token.trim().length()>0){
collector.collect(new Tuple2(token,1));
}
}
}
}).groupBy(0).sum(1).print();
}
}
flink入门一般使用socket数据作为流式数据来源。在mac上命令行使用nc -lk 9000开启socket发送端,在flink中接收socket消息如下
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.windowing.time.Time
object StreamWordCountJob {
def main(args: Array[String]) {
// set up the streaming execution environment
val env = StreamExecutionEnvironment.getExecutionEnvironment
val text =env.socketTextStream("localhost",9000)
import org.apache.flink.api.scala._
text.flatMap(_.split(",")).map((_,1)).keyBy(0).timeWindow(Time.seconds(5)).sum(1).print().setParallelism(1)
env.execute("StreamWordCountJob")
}
}
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;
public class StreamWordCountJob {
public static void main(String[] args) throws Exception{
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
DataStreamSource text = env.socketTextStream("localhost",9000);
text.flatMap(new FlatMapFunction>() {
@Override
public void flatMap(String value, Collector> collector) throws Exception {
String [] tokens = value.toLowerCase().split(",");
for (String token:tokens){
if (token.length()>0){
collector.collect(new Tuple2(token,1));
}
}
}
}).keyBy(0).sum(1).print().setParallelism(1);
env.execute("StreamWordCountJob");
}
}
以上就是Flink入门的第一个小例子。大家可以在自己电脑上试下。
一般在IDEA上安装Scala插件进行开发
4.0.0
my.flink.quickstart
quickstart
0.1
jar
Flink Quickstart Job
http://www.myorganization.org
apache.snapshots
Apache Development Snapshot Repository
https://repository.apache.org/content/repositories/snapshots/
false
true
UTF-8
1.8.0
2.11
2.11.12
org.apache.flink
flink-scala_${scala.binary.version}
${flink.version}
provided
org.apache.flink
flink-streaming-scala_${scala.binary.version}
${flink.version}
provided
org.scala-lang
scala-library
${scala.version}
provided
org.slf4j
slf4j-log4j12
1.7.7
runtime
log4j
log4j
1.2.17
runtime
org.apache.maven.plugins
maven-shade-plugin
3.0.0
package
shade
org.apache.flink:force-shading
com.google.code.findbugs:jsr305
org.slf4j:*
log4j:*
*:*
META-INF/*.SF
META-INF/*.DSA
META-INF/*.RSA
my.flink.quickstart.StreamingJob
org.apache.maven.plugins
maven-compiler-plugin
3.1
1.8
net.alchim31.maven
scala-maven-plugin
3.2.2
compile
testCompile
org.apache.maven.plugins
maven-eclipse-plugin
2.8
true
org.scala-ide.sdt.core.scalanature
org.eclipse.jdt.core.javanature
org.scala-ide.sdt.core.scalabuilder
org.scala-ide.sdt.launching.SCALA_CONTAINER
org.eclipse.jdt.launching.JRE_CONTAINER
org.scala-lang:scala-library
org.scala-lang:scala-compiler
**/*.scala
**/*.java
org.codehaus.mojo
build-helper-maven-plugin
1.7
add-source
generate-sources
add-source
add-test-source
generate-test-sources
add-test-source
add-dependencies-for-IDEA
idea.version
org.apache.flink
flink-scala_${scala.binary.version}
${flink.version}
compile
org.apache.flink
flink-streaming-scala_${scala.binary.version}
${flink.version}
compile
org.scala-lang
scala-library
${scala.version}
compile
4.0.0
my-flink-project
my-flink-project
0.1
jar
Flink Quickstart Job
http://www.myorganization.org
UTF-8
1.6.2
1.8
2.11
${java.version}
${java.version}
apache.snapshots
Apache Development Snapshot Repository
https://repository.apache.org/content/repositories/snapshots/
false
true
org.apache.flink
flink-java
${flink.version}
provided
org.apache.flink
flink-streaming-java_${scala.binary.version}
${flink.version}
provided
org.slf4j
slf4j-log4j12
1.7.7
runtime
log4j
log4j
1.2.17
runtime
org.apache.maven.plugins
maven-compiler-plugin
3.1
${java.version}
org.apache.maven.plugins
maven-shade-plugin
3.0.0
package
shade
org.apache.flink:force-shading
com.google.code.findbugs:jsr305
org.slf4j:*
log4j:*
*:*
META-INF/*.SF
META-INF/*.DSA
META-INF/*.RSA
myflink.StreamingJob
org.eclipse.m2e
lifecycle-mapping
1.0.0
org.apache.maven.plugins
maven-shade-plugin
[3.0.0,)
shade
org.apache.maven.plugins
maven-compiler-plugin
[3.1,)
testCompile
compile
add-dependencies-for-IDEA
idea.version
org.apache.flink
flink-java
${flink.version}
compile
org.apache.flink
flink-streaming-java_${scala.binary.version}
${flink.version}
compile