Window Join将流中两个key相同的元素联结在一起。这种联结方式看起来非常像inner join,两个元素必须都存在,才会出现在结果中。
在Flink中,典型的窗口有三种类型:滚动窗口、滑动窗口、会话窗口。下面我们按窗口的类型分别讲解。
在执行窗口join时,会将所有key能够匹配上、且处在同一个滚动窗口内的事件成对地进行join,并把每一对元素传递给JoinFunction或者FlatJoinFunction。这种join的行为类似于INNER JOIN:只出现在其中一个流、而在另一个流中没有匹配的元素,不会被滚动窗口operator发送到下游。
上图表示两个流进行滚动窗口join。可以看到,只有两个流中都存在的元素,才会发生join操作。
来做个案例:
使用两个指定Source模拟数据,一个Source是订单明细,一个Source是商品数据。我们通过window join,将数据关联到一起。
输出结果如下:
1、先将Flink的依赖导入进来
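<repositories>
    <repository>
        <id>aliyunmaven</id>
        <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
    </repository>
</repositories>
<properties>
    <flink-version>1.12.0</flink-version>
    <scala-version>2.12</scala-version>
    <!-- 注意:原文中该属性的名称已丢失,仅保留了取值 5.1.47;此处的属性名为推测(可能是MySQL驱动版本),请以实际工程为准 -->
    <mysql-version>5.1.47</mysql-version>
</properties>
<dependencies>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-java</artifactId>
        <version>${flink-version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-java_${scala-version}</artifactId>
        <version>${flink-version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-clients_${scala-version}</artifactId>
        <version>${flink-version}</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.62</version>
    </dependency>
</dependencies>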
package com.istudy.bean;
import com.alibaba.fastjson.JSON;
import java.math.BigDecimal;
/**
 * Bean describing one joined order line: the goods id and name together with
 * the ordered count and the total amount.
 *
 * <p>All properties are exposed through conventional getters and setters, and
 * {@link #toString()} renders the instance as JSON via fastjson.
 *
 * @projectname: HaiStream
 * @author: Mr.Zhang
 * @create: 2021-03-13 17:06
 **/
public class FactOrderItem {

    private String goodsId;
    private String goodsName;
    private BigDecimal count;
    private BigDecimal totalMoney;

    /** @return the goods identifier */
    public String getGoodsId() {
        return this.goodsId;
    }

    /** @param goodsId the goods identifier */
    public void setGoodsId(String goodsId) {
        this.goodsId = goodsId;
    }

    /** @return the goods display name */
    public String getGoodsName() {
        return this.goodsName;
    }

    /** @param goodsName the goods display name */
    public void setGoodsName(String goodsName) {
        this.goodsName = goodsName;
    }

    /** @return the ordered count */
    public BigDecimal getCount() {
        return this.count;
    }

    /** @param count the ordered count */
    public void setCount(BigDecimal count) {
        this.count = count;
    }

    /** @return the total amount of this order line */
    public BigDecimal getTotalMoney() {
        return this.totalMoney;
    }

    /** @param totalMoney the total amount of this order line */
    public void setTotalMoney(BigDecimal totalMoney) {
        this.totalMoney = totalMoney;
    }

    /**
     * Serializes this bean to a JSON string.
     *
     * @return the JSON text produced by fastjson for this instance
     */
    @Override
    public String toString() {
        return JSON.toJSONString(this);
    }
}
package com.istudy.bean;
import com.alibaba.fastjson.JSON;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
/**
* @projectname: HaiStream
* @description:先为本次的测试构建两个实体类,一个是Goods(商品类)、另一个OrderItem(订单明细)
* @author: Mr.Zhang
* @create: 2021-03-13 17:03
**/
public class Goods {
private String goodsId;
private String goodsName;
private BigDecimal goodsPrice;
public static List GOODS_LIST;
public static Random r;
static {
r = new Random();
GOODS_LIST = new ArrayList<>();
GOODS_LIST.add(new Goods("1", "小米12", new BigDecimal(4890)));
GOODS_LIST.add(new Goods("2", "iphone12", new BigDecimal(12000)));
GOODS_LIST.add(new Goods("3", "MacBookPro", new BigDecimal(15000)));
GOODS_LIST.add(new Goods("4", "Thinkpad X1", new BigDecimal(9800)));
GOODS_LIST.add(new Goods("5", "MeiZu One", new BigDecimal(3200)));
GOODS_LIST.add(new Goods("6", "Mate 40", new BigDecimal(6500)));
}
public static Goods randomGoods() {
int rIndex = r.nextInt(GOODS_LIST.size());
return GOODS_LIST.get(rIndex);
}
public Goods() {
}
public Goods(String goodsId, String goodsName, BigDecimal goodsPrice) {
this.goodsId = goodsId;
this.goodsName = goodsName;
this.goodsPrice = goodsPrice;
}
public String getGoodsId() {
return goodsId;
}
public void setGoodsId(String goodsId) {
this.goodsId = goodsId;
}
public String getGoodsName() {
return goodsName;
}
public void setGoodsName(String goodsName) {
this.goodsName = goodsName;
}
public BigDecimal getGoodsPrice() {
return goodsPrice;
}
public void setGoodsPrice(BigDecimal goodsPrice) {
this.goodsPrice = goodsPrice;
}
@Override
public String toString() {
return JSON.toJSONString(this);
}
public static void main(String[] args) {
randomGoods();
}
}
package com.istudy.bean;
import com.alibaba.fastjson.JSON;
/**
 * Order-item entity used by the window-join example: an item id, the id of the
 * goods that were ordered, and the ordered count.
 *
 * <p>This is one of the two entity classes built for the test; the other one
 * is the goods class. {@link #toString()} renders the instance as JSON via
 * fastjson.
 *
 * @projectname: HaiStream
 * @author: Mr.Zhang
 * @create: 2021-03-13 17:05
 **/
public class OrderItem {

    private String itemId;
    private String goodsId;
    private Integer count;

    /** @return the order-item identifier */
    public String getItemId() {
        return this.itemId;
    }

    /** @param itemId the order-item identifier */
    public void setItemId(String itemId) {
        this.itemId = itemId;
    }

    /** @return the identifier of the ordered goods */
    public String getGoodsId() {
        return this.goodsId;
    }

    /** @param goodsId the identifier of the ordered goods */
    public void setGoodsId(String goodsId) {
        this.goodsId = goodsId;
    }

    /** @return the ordered count */
    public Integer getCount() {
        return this.count;
    }

    /** @param count the ordered count */
    public void setCount(Integer count) {
        this.count = count;
    }

    /**
     * Serializes this bean to a JSON string.
     *
     * @return the JSON text produced by fastjson for this instance
     */
    @Override
    public String toString() {
        return JSON.toJSONString(this);
    }
}
package com.istudy.streamsource;
import com.istudy.bean.Goods;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.source.RichSourceFunction;
import java.util.concurrent.TimeUnit;
/**
 * Streaming source that repeatedly emits the whole goods catalogue
 * ({@code Goods.GOODS_LIST}) once per second until it is cancelled. In the
 * example it plays the role of a dimension table.
 *
 * @projectname: HaiStream
 * @author: Mr.Zhang
 * @create: 2021-03-13 17:07
 **/
public class GoodsSource extends RichSourceFunction<Goods> {

    /**
     * Cancellation flag. It is written by {@link #cancel()} and read by the
     * emit loop in {@code run}, so it is volatile to make the write visible to
     * the loop. It starts as {@code false} through its initializer instead of
     * being assigned in {@code open()}, so a cancel request that arrives early
     * is never overwritten and the flag can never be {@code null}.
     */
    private volatile boolean isCancel = false;

    /**
     * Emits every catalogue entry, sleeps one second, and repeats until
     * {@link #cancel()} has been called.
     *
     * @param sourceContext the context used to emit records
     * @throws Exception if the sleep between rounds is interrupted
     */
    @Override
    public void run(SourceContext<Goods> sourceContext) throws Exception {
        while (!isCancel) {
            // The explicit cast keeps this loop valid whether or not
            // GOODS_LIST declares its element type.
            for (Object goods : Goods.GOODS_LIST) {
                sourceContext.collect((Goods) goods);
            }
            TimeUnit.SECONDS.sleep(1);
        }
    }

    /** Requests the emit loop to stop after the current round. */
    @Override
    public void cancel() {
        isCancel = true;
    }
}
package com.istudy.streamsource;
import com.istudy.bean.Goods;
import com.istudy.bean.OrderItem;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.source.RichSourceFunction;
import java.util.Random;
import java.util.UUID;
import java.util.concurrent.TimeUnit;
/**
 * Streaming source of order items for the window-join example.
 *
 * <p>Once per second it emits two records that share the same item id and
 * count: one that references a randomly chosen entry of the goods catalogue,
 * and one with the goods id {@code "111"}, which is not among the ids "1".."6"
 * registered in {@code Goods.GOODS_LIST}.
 *
 * @projectname: HaiStream
 * @author: Mr.Zhang
 * @create: 2021-03-13 17:19
 **/
public class OrderItemSource extends RichSourceFunction<OrderItem> {

    /**
     * Cancellation flag. It is written by {@link #cancel()} and read by the
     * emit loop in {@code run}, so it is volatile to make the write visible to
     * the loop. It starts as {@code false} through its initializer, so a cancel
     * request that arrives before {@code open} is not overwritten.
     */
    private volatile boolean isCancel = false;

    /** Random generator for the ordered count; created in {@code open}. */
    private Random r;

    /**
     * Creates the random generator used by the emit loop.
     *
     * @param parameters the configuration passed to this function (not used here)
     */
    @Override
    public void open(Configuration parameters) throws Exception {
        r = new Random();
    }

    /**
     * Emits one matching and one non-matching order item, sleeps one second,
     * and repeats until {@link #cancel()} has been called.
     *
     * @param sourceContext the context used to emit records
     * @throws Exception if the sleep between rounds is interrupted
     */
    @Override
    public void run(SourceContext<OrderItem> sourceContext) throws Exception {
        while (!isCancel) {
            Goods goods = Goods.randomGoods();
            int count = r.nextInt(10) + 1;
            String itemId = UUID.randomUUID().toString();

            sourceContext.collect(newOrderItem(itemId, goods.getGoodsId(), count));
            // Emit a distinct instance for the second record rather than
            // mutating the object that has already been handed downstream.
            sourceContext.collect(newOrderItem(itemId, "111", count));

            TimeUnit.SECONDS.sleep(1);
        }
    }

    /** Requests the emit loop to stop after the current round. */
    @Override
    public void cancel() {
        isCancel = true;
    }

    /** Builds a fresh order item from the given item id, goods id and count. */
    private static OrderItem newOrderItem(String itemId, String goodsId, int count) {
        OrderItem orderItem = new OrderItem();
        orderItem.setGoodsId(goodsId);
        orderItem.setCount(count);
        orderItem.setItemId(itemId);
        return orderItem;
    }
}
package com.istudy.watermark;
import com.istudy.bean.Goods;
import org.apache.flink.api.common.eventtime.*;
import org.apache.flink.streaming.api.functions.TimestampAssigner;
/**
 * Watermark strategy for the goods stream. For simplicity it uses the system
 * clock directly: every record is stamped with the current time, and every
 * emitted watermark carries the current time as well.
 *
 * @projectname: HaiStream
 * @author: Mr.Zhang
 * @create: 2021-03-13 17:18
 **/
public class GoodsWatermark implements WatermarkStrategy<Goods> {

    /**
     * Returns an assigner that ignores the record and its previous timestamp
     * and stamps it with {@code System.currentTimeMillis()}.
     *
     * @param context the supplier context (not used)
     * @return the timestamp assigner for goods records
     */
    @Override
    public TimestampAssigner<Goods> createTimestampAssigner(TimestampAssignerSupplier.Context context) {
        return (element, recordTimestamp) -> System.currentTimeMillis();
    }

    /**
     * Returns a generator that emits a watermark equal to the current system
     * time both on every record and on every periodic callback.
     *
     * @param context the supplier context (not used)
     * @return the watermark generator for goods records
     */
    @Override
    public WatermarkGenerator<Goods> createWatermarkGenerator(WatermarkGeneratorSupplier.Context context) {
        // The element type must be spelled out here: with a raw
        // WatermarkGenerator, onEvent(Goods, ...) would not override anything.
        return new WatermarkGenerator<Goods>() {
            @Override
            public void onEvent(Goods event, long eventTimestamp, WatermarkOutput output) {
                output.emitWatermark(new Watermark(System.currentTimeMillis()));
            }

            @Override
            public void onPeriodicEmit(WatermarkOutput output) {
                output.emitWatermark(new Watermark(System.currentTimeMillis()));
            }
        };
    }
}
package com.istudy.watermark;
import com.istudy.bean.OrderItem;
import org.apache.flink.api.common.eventtime.*;
import org.apache.flink.streaming.api.functions.TimestampAssigner;
/**
 * Watermark strategy for the order-item stream. It uses the system clock
 * directly: every record is stamped with the current time, and every emitted
 * watermark carries the current time as well.
 *
 * @projectname: HaiStream
 * @author: Mr.Zhang
 * @create: 2021-03-13 17:17
 **/
public class OrderItemWatermark implements WatermarkStrategy<OrderItem> {

    /**
     * Returns an assigner that ignores the record and its previous timestamp
     * and stamps it with {@code System.currentTimeMillis()}.
     *
     * @param context the supplier context (not used)
     * @return the timestamp assigner for order-item records
     */
    @Override
    public TimestampAssigner<OrderItem> createTimestampAssigner(TimestampAssignerSupplier.Context context) {
        return (element, recordTimestamp) -> System.currentTimeMillis();
    }

    /**
     * Returns a generator that emits a watermark equal to the current system
     * time both on every record and on every periodic callback.
     *
     * @param context the supplier context (not used)
     * @return the watermark generator for order-item records
     */
    @Override
    public WatermarkGenerator<OrderItem> createWatermarkGenerator(WatermarkGeneratorSupplier.Context context) {
        // The element type must be spelled out here: with a raw
        // WatermarkGenerator, onEvent(OrderItem, ...) would not override anything.
        return new WatermarkGenerator<OrderItem>() {
            @Override
            public void onEvent(OrderItem event, long eventTimestamp, WatermarkOutput output) {
                output.emitWatermark(new Watermark(System.currentTimeMillis()));
            }

            @Override
            public void onPeriodicEmit(WatermarkOutput output) {
                output.emitWatermark(new Watermark(System.currentTimeMillis()));
            }
        };
    }
}
package com.istudy.work;
import com.istudy.bean.FactOrderItem;
import com.istudy.bean.Goods;
import com.istudy.bean.OrderItem;
import com.istudy.streamsource.GoodsSource;
import com.istudy.streamsource.OrderItemSource;
import com.istudy.watermark.GoodsWatermark;
import com.istudy.watermark.OrderItemWatermark;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import java.math.BigDecimal;
/**
 * Example job: joins the order-item stream with the goods stream on goods id
 * inside 5-second tumbling event-time windows and prints the joined records.
 *
 * @projectname: HaiStream
 * @author: Mr.Zhang
 * @create: 2021-03-13 17:16
 **/
public class TumbleWindowJoin {

    /**
     * Builds and runs the tumbling-window join job.
     *
     * @param args unused
     * @throws Exception if job execution fails
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Goods stream. The element types are declared on the stream variables
        // so that the method reference and the typed lambda below type-check.
        SingleOutputStreamOperator<Goods> goodsDS = env
                .addSource(new GoodsSource(), TypeInformation.of(Goods.class))
                .assignTimestampsAndWatermarks(new GoodsWatermark());

        // Order-item stream.
        SingleOutputStreamOperator<OrderItem> orderItemDS = env
                .addSource(new OrderItemSource(), TypeInformation.of(OrderItem.class))
                .assignTimestampsAndWatermarks(new OrderItemWatermark());

        // Join the two streams.
        DataStream<FactOrderItem> factOrderItemDS = orderItemDS.join(goodsDS)
                // 1. A window join first names the join key of each side with
                //    where/equalTo; both sides are keyed by goods id here.
                // First stream: orderItemDS
                .where(OrderItem::getGoodsId)
                // Second stream: goodsDS
                .equalTo(Goods::getGoodsId)
                // 2. 5-second tumbling window; elements are only joined with
                //    elements that fall into the same window.
                .window(TumblingEventTimeWindows.of(Time.seconds(5)))
                // 3. apply combines one element of each type into a new
                //    element of the result type.
                .apply((OrderItem item, Goods goods) -> {
                    // Build the quantity once and reuse it for both fields.
                    BigDecimal quantity = new BigDecimal(item.getCount());
                    FactOrderItem factOrderItem = new FactOrderItem();
                    factOrderItem.setGoodsId(goods.getGoodsId());
                    factOrderItem.setGoodsName(goods.getGoodsName());
                    factOrderItem.setCount(quantity);
                    factOrderItem.setTotalMoney(goods.getGoodsPrice().multiply(quantity));
                    return factOrderItem;
                });

        factOrderItemDS.print();
        env.execute("滚动窗口JOIN");
    }
}