Flink provides three layers of API; each offers a different trade-off between conciseness and expressiveness and targets different use cases:
SQL / Table API (dynamic tables)
DataStream API (streams, windows)
ProcessFunction (events, state, time)
Do not confuse ProcessFunction with ProcessWindowFunction.
ProcessFunction is a low-level stream processing operation with access to the basic building blocks of a streaming program:
events (stream elements)
state (fault-tolerant, consistent, only on keyed streams)
timers (event time and processing time, only on keyed streams)
A ProcessFunction can be thought of as a FlatMapFunction with access to keyed state and timers, where keyed state is accessed through the RuntimeContext.
Timers let an application react to changes in processing time and event time. Every call to processElement(...) receives a Context object, which gives access to the element's event-time timestamp and to a TimerService. The TimerService can be used to register callbacks for future event-time or processing-time instants. When a timer's time is reached, the onTimer(...) method is called; during that call, all state is again scoped to the key for which the timer was created, allowing timers to manipulate keyed state.
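To make this concrete, here is a minimal sketch of a keyed function that uses both keyed state and timers. It is illustrative only: CountTimeoutFunction and its 60-second timeout are hypothetical, it assumes Flink 1.5+ (where KeyedProcessFunction is available), and it assumes event-time timestamps have already been assigned to the stream.

import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;

// Hypothetical example: count events per key and, via an event-time timer,
// emit the current count 60 seconds after each element's timestamp.
public class CountTimeoutFunction
        extends KeyedProcessFunction<String, String, Tuple2<String, Long>> {

    private transient ValueState<Long> countState;

    @Override
    public void open(Configuration parameters) {
        // Keyed state is obtained through the runtime context
        countState = getRuntimeContext().getState(
                new ValueStateDescriptor<>("count", Long.class));
    }

    @Override
    public void processElement(String value, Context ctx,
                               Collector<Tuple2<String, Long>> out) throws Exception {
        Long count = countState.value();
        countState.update(count == null ? 1L : count + 1);
        // Register a callback for a future event-time instant
        // (ctx.timestamp() is only non-null if timestamps were assigned)
        ctx.timerService().registerEventTimeTimer(ctx.timestamp() + 60_000L);
    }

    @Override
    public void onTimer(long timestamp, OnTimerContext ctx,
                        Collector<Tuple2<String, Long>> out) throws Exception {
        // State here is again scoped to the key that registered the timer
        out.collect(Tuple2.of(ctx.getCurrentKey(), countState.value()));
    }
}

It would be attached with something like stream.keyBy(s -> s).process(new CountTimeoutFunction()), since timers and keyed state are only available on keyed streams.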
In short: a ProcessFunction can be seen as a FlatMapFunction with access to keyed state and timers, and it is a low-level stream processing operator. Note that ProcessFunction is not the same as WindowFunction or ProcessWindowFunction, both of which belong to the DataStream API's window functions, so take care when choosing between them. Also note that a ProcessFunction may override the open and close methods (as the sketch above does for open).
A complete ProcessFunction example:
package Flink_API;

import com.alibaba.fastjson.JSON;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010;
import org.apache.flink.streaming.util.serialization.SimpleStringSchema;
import org.apache.flink.util.Collector;

import java.io.Serializable;
import java.util.Properties;

public class TestProcessFunction {

    public static void main(String[] args) throws Exception {
        // Obtain the Flink execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        env.setParallelism(1);

        // Read the user-browsing events from Kafka
        Properties consumerProperties = new Properties();
        consumerProperties.setProperty("bootstrap.servers", "page01");
        consumerProperties.setProperty("group.id", "browsegroup");
        DataStreamSource<String> dataStreamSource = env.addSource(
                new FlinkKafkaConsumer010<>("topic", new SimpleStringSchema(), consumerProperties));

        // Parse each raw JSON record into a UserBrowseLog; invalid records are dropped
        DataStream<UserBrowseLog> processData = dataStreamSource.process(
                new ProcessFunction<String, UserBrowseLog>() {
                    @Override
                    public void processElement(String s, Context context,
                                               Collector<UserBrowseLog> collector) throws Exception {
                        try {
                            UserBrowseLog browseLog = JSON.parseObject(s, UserBrowseLog.class);
                            if (browseLog != null) {
                                collector.collect(browseLog);
                            }
                        } catch (Exception e) {
                            System.out.println("Failed to parse UserBrowseLog JSON: " + e.getMessage());
                        }
                    }
                });

        processData.print();
        env.execute("TestProcessFunction");
    }
    public static class UserBrowseLog implements Serializable {
        private String userID;
        private String eventTime;
        private String eventType;
        private String productID;
        private Integer productPrice;

        public String getUserID() {
            return userID;
        }

        public void setUserID(String userID) {
            this.userID = userID;
        }

        public String getEventTime() {
            return eventTime;
        }

        public void setEventTime(String eventTime) {
            this.eventTime = eventTime;
        }

        public String getEventType() {
            return eventType;
        }

        public void setEventType(String eventType) {
            this.eventType = eventType;
        }

        public String getProductID() {
            return productID;
        }

        public void setProductID(String productID) {
            this.productID = productID;
        }

        public Integer getProductPrice() {
            return productPrice;
        }

        public void setProductPrice(Integer productPrice) {
            this.productPrice = productPrice;
        }

        @Override
        public String toString() {
            return "UserBrowseLog{" +
                    "userID='" + userID + '\'' +
                    ", eventTime='" + eventTime + '\'' +
                    ", eventType='" + eventType + '\'' +
                    ", productID='" + productID + '\'' +
                    ", productPrice=" + productPrice +
                    '}';
        }
    }
}
The core of data processing is applying various Transformation operations to the data; in Flink this means transforming one or more DataStreams into a new DataStream.
The Map operator: takes one element as input and returns one element, optionally cleaning or converting it along the way. It is a strict one-to-one transformation, i.e. one record is turned into exactly one other record.
package Flink_API;

import com.alibaba.fastjson.JSON;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010;
import org.apache.flink.streaming.util.serialization.SimpleStringSchema;

import java.io.Serializable;
import java.util.Properties;

public class TestMap {

    public static void main(String[] args) throws Exception {
        // Obtain the Flink execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        env.setParallelism(1);

        // Read the user-browsing events from Kafka
        Properties consumerProperties = new Properties();
        consumerProperties.setProperty("bootstrap.servers", "page01");
        consumerProperties.setProperty("group.id", "browsegroup");
        DataStreamSource<String> dataStreamSource = env.addSource(
                new FlinkKafkaConsumer010<>("topic", new SimpleStringSchema(), consumerProperties));

        // Parse each raw JSON record into a UserBrowseLog
        DataStream<UserBrowseLog> processData = dataStreamSource.map(new MapFunction<String, UserBrowseLog>() {
            @Override
            public UserBrowseLog map(String s) throws Exception {
                // map is strictly one-to-one and may not return null, so a record
                // that fails to parse is rethrown here; use flatMap or process
                // instead if bad records should simply be dropped.
                try {
                    return JSON.parseObject(s, UserBrowseLog.class);
                } catch (Exception e) {
                    System.out.println("Failed to parse UserBrowseLog JSON: " + e.getMessage());
                    throw e;
                }
            }
        });

        processData.print();
        env.execute("TestMap");
    }
    public static class UserBrowseLog implements Serializable {
        private String userID;
        private String eventTime;
        private String eventType;
        private String productID;
        private Integer productPrice;

        public String getUserID() {
            return userID;
        }

        public void setUserID(String userID) {
            this.userID = userID;
        }

        public String getEventTime() {
            return eventTime;
        }

        public void setEventTime(String eventTime) {
            this.eventTime = eventTime;
        }

        public String getEventType() {
            return eventType;
        }

        public void setEventType(String eventType) {
            this.eventType = eventType;
        }

        public String getProductID() {
            return productID;
        }

        public void setProductID(String productID) {
            this.productID = productID;
        }

        public Integer getProductPrice() {
            return productPrice;
        }

        public void setProductPrice(Integer productPrice) {
            this.productPrice = productPrice;
        }

        @Override
        public String toString() {
            return "UserBrowseLog{" +
                    "userID='" + userID + '\'' +
                    ", eventTime='" + eventTime + '\'' +
                    ", eventType='" + eventType + '\'' +
                    ", productID='" + productID + '\'' +
                    ", productPrice=" + productPrice +
                    '}';
        }
    }
}
FlatMap: takes one element as input and can return zero, one, or many elements, i.e. one record becomes zero or more records; the TestFlatMap example further below splits each line into words.
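Because flatMap may emit zero elements, it is also a natural fit for the parse-and-drop pattern from the ProcessFunction example above. Here is a minimal sketch (reusing dataStreamSource, UserBrowseLog, and fastjson's JSON from the surrounding examples) that emits zero or one record per input line:

// Emits 0 or 1 elements per input record: malformed JSON is simply dropped
DataStream<UserBrowseLog> parsed = dataStreamSource.flatMap(
        new FlatMapFunction<String, UserBrowseLog>() {
            @Override
            public void flatMap(String s, Collector<UserBrowseLog> out) {
                try {
                    UserBrowseLog log = JSON.parseObject(s, UserBrowseLog.class);
                    if (log != null) {
                        out.collect(log); // exactly one output for a valid record
                    }
                } catch (Exception e) {
                    // zero outputs for a malformed record
                }
            }
        });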
Quick tips:
There are two ways to submit a Flink job: use the first when testing and the second when deploying.
First: start a long-running cluster, then submit jobs to it (yarn-session.sh starts a YARN session cluster; -n 2 starts two TaskManagers, -jm 1024 sets the JobManager memory, -tm 1024 sets the TaskManager memory, -d runs the session detached):
yarn-session.sh -n 2 -jm 1024 -tm 1024 -d
flink run -c <main-class> xxxx.jar
Second: submit the job directly, which first spins up a temporary per-job cluster:
flink run -m yarn-cluster -yn 2 -yjm 1024 -ytm 1024 -c <main-class> xxxxx.jar
package Flink_API;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010;
import org.apache.flink.streaming.util.serialization.SimpleStringSchema;
import org.apache.flink.util.Collector;

import java.util.Properties;

public class TestFlatMap {

    public static void main(String[] args) throws Exception {
        // Obtain the Flink execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        env.setParallelism(1);

        // Read the user-browsing events from Kafka
        Properties consumerProperties = new Properties();
        consumerProperties.setProperty("bootstrap.servers", "page01");
        consumerProperties.setProperty("group.id", "browsegroup");
        DataStreamSource<String> dataStreamSource = env.addSource(
                new FlinkKafkaConsumer010<>("topic", new SimpleStringSchema(), consumerProperties));

        // Split each line on non-word characters and emit one (word, 1) tuple per word
        DataStream<Tuple2<String, Integer>> processData = dataStreamSource.flatMap(
                new FlatMapFunction<String, Tuple2<String, Integer>>() {
                    @Override
                    public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
                        String[] split = s.split("\\W+");
                        for (String word : split) {
                            collector.collect(Tuple2.of(word, 1));
                        }
                    }
                }).setParallelism(3);

        processData.print().setParallelism(1);
        env.execute("TestFlatMap");
    }
}