Contents
1 Batch API development (in Scala; remember to add Scala support right after creating the Maven project)
2 Batch processing in Java
3 Scala streaming
4 Java streaming code
5 Source (basic version)
6 MySQL as a source
7 Kafka as a source
7.1 Old Kafka API
7.2 New Kafka API, basic version (no custom offsets)
7.3 New Kafka API, upgraded version with explicit offsets
Environment: three virtual machines (qianfeng01:8081, qianfeng02:8081, qianfeng03:8081)
First run the following on one of the machines:
start-cluster.sh
This starts the Flink cluster.
Development in IDEA
pom.xml (adjust the version numbers to match your own environment; different versions really do have compatibility problems):
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.qianfeng</groupId>
    <artifactId>qianfeng-flink-v9</artifactId>
    <version>1.0</version>

    <!-- property names reconstructed from the original values; adjust as needed -->
    <properties>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <java.version>1.8</java.version>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <scala.version>2.12.8</scala.version>
        <scala.binary.version>2.12</scala.binary.version>
        <hadoop.version>2.7.6</hadoop.version>
        <flink.version>1.14.3</flink.version>
        <kafka.version>2.4.1</kafka.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <!-- all Flink artifacts use the same Scala suffix (2.12) so they match scala-library -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-scala_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.27</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
    </dependencies>
</project>
package com.qf.bigdata
import org.apache.flink.api.scala.{DataSet, ExecutionEnvironment}
import org.apache.flink.api.scala._
import org.apache.flink.core.fs.FileSystem
// Batch API word count
object Demo01_BatchWCScala {
  def main(args: Array[String]): Unit = {
    // Create the Flink batch execution environment
    val env = ExecutionEnvironment.getExecutionEnvironment
    // Read the input file as a batch DataSet
    val ds: DataSet[String] = env.readTextFile("data/test.txt")
    // Process the data
    val rs: DataSet[(String, Int)] = ds.flatMap(_.split(" "))
      .map((_, 1))
      .groupBy(0)
      .sum(1)
    rs.print() // print() triggers execution by itself, so no explicit execute() is needed
  }
}
Create a data folder at the same level as the project's src folder and put a text file with some words in it.
My test.txt was shown here as a screenshot.
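As an illustrative stand-in (not my actual file), test.txt can be any plain-text file of space-separated words, for example:

hello flink hello scala
hello world
flink spark flink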
Variation
val rs: DataSet[(String, Int)] = ds.flatMap(_.split(" "))
  .map((_, 1))
  .groupBy(0)
  .sum(1)
  //.print() // print() would trigger execution by itself
  // set the parallelism of the aggregation
  .setParallelism(1) // with parallelism 1 the result is written as a single file instead of one file per parallel task
rs.writeAsText("data/wcOutput", FileSystem.WriteMode.OVERWRITE)
env.execute()
That is: drop the console print, set the parallelism, write the result to a file, and call env.execute() explicitly.
Variation (replace the hard-coded file paths with args):
val ds:DataSet[String]=env.readTextFile(args(0))
rs.writeAsText(args(1),FileSystem.WriteMode.OVERWRITE)
Set the program arguments in the IDEA run configuration (shown as a screenshot in the original).
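For example (paths are only an illustration, matching the earlier code), the Program arguments field of the run configuration could be set to:

data/test.txt data/wcOutput

so that args(0) is the input file and args(1) the output path.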
package com.qf.bigdata;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.AggregateOperator;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

public class Demo01 {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        DataSource<String> ds = env.readTextFile("data/test.txt");
        // Process the data
        AggregateOperator<Tuple2<String, Integer>> rs = ds.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws Exception {
                String[] s = value.split(" ");
                for (String word : s) {
                    out.collect(new Tuple2<>(word, 1));
                }
            }
        }).groupBy(0).sum(1);
        rs.print();
    }
}
Variation: write the result to a file instead of printing it.
package com.qf.bigdata;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.AggregateOperator;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

public class Demo01 {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        DataSource<String> ds = env.readTextFile("data/test.txt");
        // Process the data
        AggregateOperator<Tuple2<String, Integer>> rs = ds.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws Exception {
                String[] s = value.split(" ");
                for (String word : s) {
                    out.collect(new Tuple2<>(word, 1));
                }
            }
        }).groupBy(0).sum(1);
        //rs.print();
        rs.setParallelism(1).writeAsText("data/output");
        env.execute();
    }
}
package com.qf.bigdata
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.api.scala._
// Stream processing word count
object Demo_Stream {
  def main(args: Array[String]): Unit = {
    // Create the Flink streaming execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Create a socket source from the environment
    val ds: DataStream[String] = env.socketTextStream("qianfeng01", 8888)
    val rs: DataStream[(String, Int)] = ds.flatMap(_.split(" "))
      .map((_, 1))
      .keyBy(0) // group by the word
      .sum(1)
    rs.print()
    // run `nc -l 8888` on the VM before starting the job
    env.execute("DS_WC")
  }
}
Start `nc -l 8888` on the VM before running the program, then type words into it while the job runs.
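A typical session might look like this (words and counts are only an illustration):

[root@qianfeng01 ~]# nc -l 8888
hello flink
hello flink

The IDEA console then prints the running counts as tuples, e.g. (hello,1), (flink,1), (hello,2), (flink,2), each line prefixed with the number of the subtask that printed it.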
package com.qf.bigdata;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
public class Demo04_Stream {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<String> ds = env.socketTextStream("qianfeng01", 8888);
        // Process the data
        SingleOutputStreamOperator<Tuple2<String, Integer>> rs = ds.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws Exception {
                String[] s = value.split(" ");
                for (String word : s) {
                    out.collect(new Tuple2<>(word, 1));
                }
            }
        }).keyBy(0).sum(1);
        rs.print();
        //rs.setParallelism(1).writeAsText("data/output");
        env.execute();
    }
}
Package the project into a jar and upload it to the VM.
Then create a text file of words on the VM.
Run it on the VM with the command below (I hit an error at this step, so don't follow me blindly):
flink run -c com.qf.bigdata.Demo01_BatchWCScala /root/data/test.jar /root/words /root/a
package com.qf.bigdata
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.api.scala._
// Stream processing: built-in sources
object Demo_Stream {
  def main(args: Array[String]): Unit = {
    // Create the streaming execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    // File-based source
    val ds: DataStream[String] = env.readTextFile("data/test.txt")
    ds.print()
    // Collection-based sources
    val ds2: DataStream[String] = env.fromElements("zhangsan liming lisi", "zhangsan liming lisi")
    ds2.print()
    env.fromCollection(List(1, 2, 3, 4, 5)).print()
    //env.fromParallelCollection()
    env.fromSequence(0, 10).print()
    // Start the job
    env.execute()
  }
}
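The commented-out env.fromParallelCollection() call above takes a SplittableIterator, so the source can be split across parallel subtasks. A minimal sketch of how it could be used, assuming Flink's built-in NumberSequenceIterator (the object name here is mine):

package com.qf.bigdata

import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.util.NumberSequenceIterator

object Demo_FromParallelCollection {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // NumberSequenceIterator is a SplittableIterator, so unlike fromCollection
    // this source can run with parallelism greater than 1
    env.fromParallelCollection(new NumberSequenceIterator(1L, 10L))
      .setParallelism(2)
      .print()
    env.execute()
  }
}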
A custom source can be written in two ways, shown in the next example: a plain SourceFunction, or a RichParallelSourceFunction.
package com.qf.bigdata

import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment

import scala.util.Random

object Demo07_DataStreamMySource {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._

    // Custom source #1: a plain SourceFunction
    env.addSource(new SourceFunction[String] {
      @volatile private var running = true
      // emit one random number per second
      override def run(ctx: SourceFunction.SourceContext[String]): Unit = {
        val random = new Random()
        while (running) {
          val i = random.nextInt(10)
          ctx.collect(i.toString)
          Thread.sleep(1000)
        }
      }
      // stop the run() loop when the job is cancelled
      override def cancel(): Unit = running = false
    })
    //.print() // to actually run it, env.execute() must also be called
    // a plain SourceFunction cannot have its parallelism raised; it runs in a single slot
    //.setParallelism(10)

    // Custom source #2: a RichParallelSourceFunction, which can run in parallel
    env.addSource(new RichParallelSourceFunction[String] {
      @volatile private var running = true
      override def run(ctx: SourceFunction.SourceContext[String]): Unit = {
        val random = new Random()
        while (running) {
          val i = random.nextInt(10)
          ctx.collect(i.toString)
          Thread.sleep(1000)
        }
      }
      // stop the run() loop when the job is cancelled
      override def cancel(): Unit = running = false
      // a rich function can also override open(), which is meant for initialization
      // work such as setting up a database connection (see the MySQL source below)
    }).setParallelism(3).print() // with parallelism 3, three values are printed per second
    env.execute() // print() alone is not enough in streaming; execute() is required
  }
}
My MySQL is installed on qianfeng03.
package com.qf.bigdata

import java.sql.{Connection, DriverManager, PreparedStatement, ResultSet}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction}
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

object Demo08_mysql {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    // Read from the custom MySQL source
    val ds: DataStream[test] = env.addSource(new MySQLSource)
    ds.setParallelism(1).print()
    // Start the job
    env.execute()
  }
}

class MySQLSource extends RichParallelSourceFunction[test] {
  var conn: Connection = _
  var ps: PreparedStatement = _
  var rs: ResultSet = _

  // open the connection (called once per parallel instance before run())
  override def open(parameters: Configuration): Unit = {
    Class.forName("com.mysql.jdbc.Driver")
    conn = DriverManager.getConnection("jdbc:mysql://qianfeng03:3306/sz2103", "root", "@Mmforu45")
    ps = conn.prepareStatement("select * from test")
  }

  // release resources when the source is closed
  override def close(): Unit = {
    if (rs != null) {
      rs.close()
    }
    if (ps != null) {
      ps.close()
    }
    if (conn != null) {
      conn.close()
    }
  }

  // emit one record per row of the result set
  override def run(ctx: SourceFunction.SourceContext[test]): Unit = {
    rs = ps.executeQuery()
    while (rs.next()) {
      ctx.collect(test(rs.getInt(1), rs.getInt(2)))
    }
  }

  // nothing to interrupt: run() ends once the result set is exhausted
  override def cancel(): Unit = {}
}

case class test(id: Int, age: Int)
Create the table and insert some data in MySQL:
create table test (
id int(11),
age int(11)
);
insert into test values(1, 20),(2, 25);
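With these two rows, running Demo08_mysql should print something like the following in the IDEA console (the subtask prefix N may differ):

N> test(1,20)
N> test(2,25)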
package com.qf.bigdata

import java.util.Properties
import java.util.regex.Pattern
import java._

import org.apache.flink.api.common.eventtime.WatermarkStrategy
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.connector.kafka.source.KafkaSource
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.kafka.clients.consumer.OffsetResetStrategy
import org.apache.kafka.common.TopicPartition

object Demo09KafkaSrouce {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    //env.getCheckpointConfig // configure checkpointing here if needed
    // Old-style Kafka source: FlinkKafkaConsumer
    val prop = new Properties()
    prop.setProperty("bootstrap.servers", "qianfeng01:9092,qianfeng02:9092,qianfeng03:9092")
    prop.setProperty("group.id", "test")
    env.addSource(new FlinkKafkaConsumer[String]("food", new SimpleStringSchema, prop))
      .print()
    // Execute the job
    env.execute()
  }
}
Start the virtual machines. Since all three of my machines are running, the Kafka broker is available on all three hosts.
On the VMs run:
zkServer.sh start
kafka-server-start.sh -daemon /usr/local/kafka/config/server.properties
kafka-topics.sh \
--zookeeper qianfeng01:2181,qianfeng02:2181,qianfeng03:2181/kafka \
--list
kafka-console-producer.sh \
--broker-list qianfeng01:9092,qianfeng02:9092,qianfeng03:9092 \
--topic food
The last command above starts a Kafka console producer.
First run the program in IDEA,
then type messages at the producer prompt on the VM.
Note that this streaming job keeps running until you stop it.
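For example, typing a few lines at the producer prompt (values are only an illustration):

>apple
>banana
>apple

makes the running job print the raw records (apple, banana, apple) in the IDEA console; this version does no aggregation, it only prints whatever arrives on the topic.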
Now the new-style code (a first version: it only sets the Kafka brokers, group id, deserializer and a starting-offset strategy, and it already runs).
Note: env.fromSource always takes a WatermarkStrategy; since this project does not use event time, we disable watermarks with WatermarkStrategy.noWatermarks().
package com.qf.bigdata

import java.util.Properties
import java.util.regex.Pattern
import java._

import org.apache.flink.api.common.eventtime.WatermarkStrategy
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.connector.kafka.source.KafkaSource
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.kafka.clients.consumer.OffsetResetStrategy
import org.apache.kafka.common.TopicPartition

object Demo09KafkaSrouce {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    //env.getCheckpointConfig // configure checkpointing here if needed
    // Old-style Kafka source
    // val prop = new Properties()
    // prop.setProperty("bootstrap.servers","qianfeng01:9092,qianfeng02:9092,qianfeng03:9092")
    // prop.setProperty("group.id","test")
    // env.addSource(new FlinkKafkaConsumer[String]("food",new SimpleStringSchema,prop))
    //   .print()

    // New-style KafkaSource
    val partitionSet = new util.HashSet(util.Arrays.asList(
      new TopicPartition("food", 0)
      //new TopicPartition("flink-kafka", 2)
    ))
    // Offsets to start consuming from (not used yet in this basic version)
    val topicPartitionToLong = new util.HashMap[TopicPartition, Long]()
    topicPartitionToLong.put(new TopicPartition("food", 0), 0L)

    val source = KafkaSource
      .builder[String]()
      .setBootstrapServers("qianfeng01:9092,qianfeng02:9092,qianfeng03:9092") // broker list
      .setGroupId("MyGroup")                             // consumer group id, pick your own
      .setTopics("food")                                 // topic to consume
      .setValueOnlyDeserializer(new SimpleStringSchema)  // value deserializer
      // starting offsets; if there are no committed offsets, this strategy is used
      .setStartingOffsets(OffsetsInitializer.earliest())
      .build()

    env.fromSource(source, WatermarkStrategy.noWatermarks(), "kafkaSource")
      .print()
    // Execute the job
    env.execute()
  }
}
package com.qf.bigdata

import java.util.Properties
import java.util.regex.Pattern
import java._
import java.lang._

import org.apache.flink.api.common.eventtime.WatermarkStrategy
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.connector.kafka.source.KafkaSource
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.kafka.clients.consumer.OffsetResetStrategy
import org.apache.kafka.common.TopicPartition

object Demo09KafkaSrouce {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    //env.getCheckpointConfig // configure checkpointing here if needed
    // Old-style Kafka source
    // val prop = new Properties()
    // prop.setProperty("bootstrap.servers","qianfeng01:9092,qianfeng02:9092,qianfeng03:9092")
    // prop.setProperty("group.id","test")
    // env.addSource(new FlinkKafkaConsumer[String]("food",new SimpleStringSchema,prop))
    //   .print()

    // New-style KafkaSource
    val partitionSet = new util.HashSet(util.Arrays.asList(
      new TopicPartition("food", 0)
      //,
      //new TopicPartition("food", 2)
    ))
    // Offsets to start consuming from; Long is java.lang.Long because of the import above
    val topicPartitionToLong = new util.HashMap[TopicPartition, Long]()
    topicPartitionToLong.put(new TopicPartition("food", 0), 0L)

    val source = KafkaSource
      .builder[String]()
      .setBootstrapServers("qianfeng01:9092,qianfeng02:9092,qianfeng03:9092") // broker list
      .setGroupId("MyGroup")                             // consumer group id, pick your own
      .setTopics("food")                                 // topic to consume
      //.setTopicPattern(Pattern.compile("topic.*"))     // consume every topic matching the pattern
      .setValueOnlyDeserializer(new SimpleStringSchema)  // value deserializer
      // start from the hand-specified offsets; partitions without an offset fall back to EARLIEST
      .setStartingOffsets(
        OffsetsInitializer.offsets(topicPartitionToLong, OffsetResetStrategy.EARLIEST)
      )
      // note: this second call replaces the initializer above, so latest() is what actually applies
      .setStartingOffsets(OffsetsInitializer.latest())
      //.setPartitions(partitionSet)                     // restrict to specific partitions
      //.setUnbounded(OffsetsInitializer.timestamp(10000L))
      //.setBounded(OffsetsInitializer.timestamp(100000L))
      //.setProperty()                                   // extra Kafka properties
      .build()

    env.fromSource(source, WatermarkStrategy.noWatermarks(), "kafkaSource")
      .print()
    // Execute the job
    env.execute()
  }
}
Compared with 7.2, this version only adds `import java.lang._`, so that the `Long` used as the HashMap value type is `java.lang.Long`, which is what `OffsetsInitializer.offsets` expects (mixing it up with Scala's `Long` is a common source of errors here).
Specifying the Kafka offsets by hand still shows an error on my side, but the program runs anyway; note that the second `setStartingOffsets(OffsetsInitializer.latest())` call overrides the hand-specified offsets in any case.
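For reference, a minimal sketch that keeps only the hand-specified offsets might look like this (the object name is mine; brokers, topic and group id are copied from the code above, and JLong is just an alias for java.lang.Long):

package com.qf.bigdata

import java.util
import java.lang.{Long => JLong}

import org.apache.flink.api.common.eventtime.WatermarkStrategy
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.connector.kafka.source.KafkaSource
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.kafka.clients.consumer.OffsetResetStrategy
import org.apache.kafka.common.TopicPartition

object Demo09KafkaOffsetsOnly {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._

    // start partition 0 of "food" at offset 0; the map values must be java.lang.Long
    val offsets = new util.HashMap[TopicPartition, JLong]()
    offsets.put(new TopicPartition("food", 0), JLong.valueOf(0L))

    val source = KafkaSource
      .builder[String]()
      .setBootstrapServers("qianfeng01:9092,qianfeng02:9092,qianfeng03:9092")
      .setGroupId("MyGroup")
      .setTopics("food")
      .setValueOnlyDeserializer(new SimpleStringSchema)
      // only one setStartingOffsets call: partitions missing from the map fall back to EARLIEST
      .setStartingOffsets(OffsetsInitializer.offsets(offsets, OffsetResetStrategy.EARLIEST))
      .build()

    env.fromSource(source, WatermarkStrategy.noWatermarks(), "kafkaSource").print()
    env.execute()
  }
}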