package com.im.flink.task.batch;
import org.apache.flink.api.common.functions.MapPartitionFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.util.Collector;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
 * describe: mapPartition demo, the function is invoked once per partition instead of once per record
 *
 * @author lm
 * @date 2019/11/3
 */
public class BatchDeamoMapPartition {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        List<String> list = new ArrayList<>();
        list.add("hello you");
        list.add("hello me");
        DataSource<String> text = env.fromCollection(list);
        DataSet<String> mapPartitionData = text.mapPartition(new MapPartitionFunction<String, String>() {
            @Override
            public void mapPartition(Iterable<String> iterable, Collector<String> collector) throws Exception {
                // Acquire the database connection here: the function runs once per
                // partition, so a connection is opened once per partition rather
                // than once per record (the advantage over map)
                // iterable holds all the records of one partition
                // process the data
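                // A minimal sketch (hypothetical, not part of this demo) of the
                // per-partition setup described above, assuming a JDBC URL held
                // in a variable `jdbcUrl`:
                //   java.sql.Connection conn = java.sql.DriverManager.getConnection(jdbcUrl);
                //   ... reuse conn for every record of this partition ...
                //   conn.close(); // at the "close the connection" step below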
                Iterator<String> it = iterable.iterator();
                while (it.hasNext()) {
                    String next = it.next();
                    String[] split = next.split(" ");
                    for (String word : split) {
                        System.out.println("BatchDeamoMapPartition.mapPartition:" + word);
                        collector.collect(word);
                    }
                }
                // close the connection
            }
        });
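        // With the sample data above, distinct() should emit the three words
        // hello, you and me (output order is not guaranteed)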
        mapPartitionData.distinct().print();
    }
}
package com.im.flink.task.batch;
import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import java.util.ArrayList;
/**
 * describe: inner join
 *
 * @author lm
 * @date 2019/11/4
 */
public class BatchDemoJoin {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // <user id, user name>
        ArrayList<Tuple2<Integer, String>> data = new ArrayList<>();
        data.add(new Tuple2<>(1, "zs"));
        data.add(new Tuple2<>(2, "ls"));
        data.add(new Tuple2<>(3, "ww"));
        // <user id, city>
        ArrayList<Tuple2<Integer, String>> data1 = new ArrayList<>();
        data1.add(new Tuple2<>(1, "beijing"));
        data1.add(new Tuple2<>(2, "shanghai"));
        data1.add(new Tuple2<>(3, "guangzhou"));
        DataSource<Tuple2<Integer, String>> text1 = env.fromCollection(data);
        DataSource<Tuple2<Integer, String>> text2 = env.fromCollection(data1);
        try {
            text1.join(text2).where(0) // key field index in the first dataset
                    .equalTo(0) // key field index in the second dataset
                    .with(new JoinFunction<Tuple2<Integer, String>, Tuple2<Integer, String>, Object>() {
                        @Override
                        public Object join(Tuple2<Integer, String> first, Tuple2<Integer, String> second) throws Exception {
                            return new Tuple3<>(first.f0, first.f1, second.f1);
                        }
                    }).print();
        } catch (Exception e) {
            e.printStackTrace();
        }
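        // With the sample data above, the join should print (1,zs,beijing),
        // (2,ls,shanghai) and (3,ww,guangzhou), in no guaranteed order.
        // Without with(), join yields default Tuple2<first, second> pairs,
        // which the map below flattens into the same Tuple3 shape.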
        text1.join(text2).where(0) // key field index in the first dataset
                .equalTo(0) // key field index in the second dataset
                .map(new MapFunction<Tuple2<Tuple2<Integer, String>, Tuple2<Integer, String>>, Object>() {
                    @Override
                    public Object map(Tuple2<Tuple2<Integer, String>, Tuple2<Integer, String>> pair) throws Exception {
                        return new Tuple3<>(pair.f0.f0, pair.f0.f1, pair.f1.f1);
                    }
                }).print();
    }
}
package com.im.flink.task.batch;
import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import java.util.ArrayList;
/**
 * describe: outer joins
 *
 * left outer join
 * right outer join
 * full outer join
 *
 * @author lm
 * @date 2019/11/4
 */
public class BatchDemoOutJoin {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // <user id, user name>
        ArrayList<Tuple2<Integer, String>> data = new ArrayList<>();
        data.add(new Tuple2<>(1, "zs"));
        data.add(new Tuple2<>(2, "ls"));
        data.add(new Tuple2<>(3, "ww"));
        // <user id, city>
        ArrayList<Tuple2<Integer, String>> data1 = new ArrayList<>();
        data1.add(new Tuple2<>(1, "beijing"));
        data1.add(new Tuple2<>(2, "shanghai"));
        data1.add(new Tuple2<>(4, "guangzhou"));
        DataSource<Tuple2<Integer, String>> text1 = env.fromCollection(data);
        DataSource<Tuple2<Integer, String>> text2 = env.fromCollection(data1);
        /**
         * Left outer join
         *
         * Note: the second tuple may be null for keys that exist only on the left side
         */
        text1.leftOuterJoin(text2).where(0)
                .equalTo(0)
                .with(new JoinFunction<Tuple2<Integer, String>, Tuple2<Integer, String>, Object>() {
                    @Override
                    public Object join(Tuple2<Integer, String> first, Tuple2<Integer, String> second) throws Exception {
                        if (second == null) {
                            return new Tuple3<>(first.f0, first.f1, "null");
                        } else {
                            return new Tuple3<>(first.f0, first.f1, second.f1);
                        }
                    }
                }).print();
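        // Expected output (order may vary): (1,zs,beijing), (2,ls,shanghai),
        // (3,ww,null), since key 3 has no match on the right side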
        /**
         * Right outer join
         *
         * Note: the first tuple may be null for keys that exist only on the right side
         */
        text1.rightOuterJoin(text2)
                .where(0)
                .equalTo(0)
                .with(new JoinFunction<Tuple2<Integer, String>, Tuple2<Integer, String>, Object>() {
                    @Override
                    public Object join(Tuple2<Integer, String> first, Tuple2<Integer, String> second) throws Exception {
                        if (first == null) {
                            return new Tuple3<>(second.f0, "null", second.f1);
                        } else {
                            return new Tuple3<>(first.f0, first.f1, second.f1);
                        }
                    }
                }).print();
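        // Expected output (order may vary): (1,zs,beijing), (2,ls,shanghai),
        // (4,null,guangzhou), since key 4 has no match on the left side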
        /**
         * Full outer join: either side of the pair may be null
         */
        text1.fullOuterJoin(text2)
                .where(0)
                .equalTo(0)
                .with(new JoinFunction<Tuple2<Integer, String>, Tuple2<Integer, String>, Object>() {
                    @Override
                    public Object join(Tuple2<Integer, String> first, Tuple2<Integer, String> second) throws Exception {
                        if (first == null) {
                            return new Tuple3<>(second.f0, "null", second.f1);
                        } else if (second == null) {
                            return new Tuple3<>(first.f0, first.f1, "null");
                        } else {
                            return new Tuple3<>(first.f0, first.f1, second.f1);
                        }
                    }
                }).print();
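        // Expected output (order may vary): (1,zs,beijing), (2,ls,shanghai),
        // (3,ww,null) and (4,null,guangzhou), keeping unmatched keys from both sides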
    }
}
package com.im.flink.task.batch;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import java.util.ArrayList;
/**
 * describe: Cartesian product (cross)
 *
 * @author lm
 * @date 2019/11/4
 */
public class BatchDemoCross {
    public static void main(String[] args) {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // user names
        ArrayList<String> data = new ArrayList<>();
        data.add("zs");
        data.add("ls");
        data.add("ww");
        // user ids
        ArrayList<Integer> data1 = new ArrayList<>();
        data1.add(1);
        data1.add(2);
        data1.add(4);
        DataSource<String> text1 = env.fromCollection(data);
        DataSource<Integer> text2 = env.fromCollection(data1);
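        // cross pairs every element of text1 with every element of text2,
        // so the sample data should yield 3 x 3 = 9 (name, id) tuples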
        try {
            text1.cross(text2).print();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
package com.im.flink.task.batch;
import org.apache.flink.api.common.operators.Order;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import java.util.ArrayList;
/**
 * describe: take the first N elements
 *
 * @author lm
 * @date 2019/11/4
 */
public class BatchDemoFirstn {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // <user id, user name>
        ArrayList<Tuple2<Integer, String>> data = new ArrayList<>();
        data.add(new Tuple2<>(6, "zs"));
        data.add(new Tuple2<>(5, "ww"));
        data.add(new Tuple2<>(4, "ww"));
        data.add(new Tuple2<>(1, "zs"));
        data.add(new Tuple2<>(2, "ls"));
        data.add(new Tuple2<>(4, "ww"));
        DataSource<Tuple2<Integer, String>> datas = env.fromCollection(data);
        // print the first 3 elements, in insertion order
        datas.first(3).print();
        System.out.println("==============================");
        // group by the first field, take the first 2 elements of each group
        datas.groupBy(0).first(2).print();
        System.out.println("==============================");
        // group by the first field, sort within each group by the second field,
        // then take the first 2 elements of each group
        datas.groupBy(0).sortGroup(1, Order.ASCENDING).first(2).print();
        System.out.println("==============================");
        // no grouping: sort each partition by field 0 ascending, then field 1
        // descending, and take the first 3 elements; note this is only a true
        // global sort when the operator runs with parallelism 1
        datas.sortPartition(0, Order.ASCENDING).sortPartition(1, Order.DESCENDING).first(3).print();
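        // A sketch (assumption, not in the original demo) of how to force the
        // global ordering the comment above aims for when the local environment
        // runs with multi-core parallelism: pin the sort to a single instance.
        //   datas.sortPartition(0, Order.ASCENDING)
        //        .sortPartition(1, Order.DESCENDING)
        //        .setParallelism(1)
        //        .first(3).print();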
    }
}