DataSet API Overview

Transformations

  • map: takes one element and returns one element; typically used for cleaning and conversion work (see the sketch after this list)
  • flatMap: takes one element and returns zero, one, or more elements
  • mapPartition: like map, but processes one whole partition per call [recommended when the map logic needs a third-party resource such as a database connection, so the resource is acquired once per partition]
  • filter: evaluates a predicate against each element; elements that satisfy it are kept
  • reduce: aggregates the data by combining the current element with the value returned by the previous reduce call, producing a new value
  • aggregations: sum(), min(), max(), etc.
  • distinct: returns the deduplicated elements of a dataset, e.g. data.distinct()
  • join: inner join
  • outerJoin: outer join
  • cross: builds the Cartesian product of two datasets
  • union: returns the union of two datasets; the element types must match
  • first-n: returns the first N elements of a dataset
  • sortPartition: sorts all partitions of a dataset locally; chain sortPartition() calls to sort on multiple fields
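map, flatMap, filter, reduce, and union have no dedicated demo class below, so here is a minimal runnable sketch of them. The class name BatchDemoBasicTransformations and the sample values are made up for this illustration:

package com.im.flink.task.batch;

import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.util.Collector;

import java.util.Arrays;

public class BatchDemoBasicTransformations {

    public static void main(String[] args) throws Exception {

        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        DataSet<Integer> nums = env.fromCollection(Arrays.asList(1, 2, 3, 4, 5));

        // map: one element in, one element out
        DataSet<Integer> doubled = nums.map(new MapFunction<Integer, Integer>() {
            @Override
            public Integer map(Integer value) throws Exception {
                return value * 2;
            }
        });

        // filter: keep only the elements that satisfy the predicate
        DataSet<Integer> large = doubled.filter(new FilterFunction<Integer>() {
            @Override
            public boolean filter(Integer value) throws Exception {
                return value > 4;
            }
        });

        // reduce: fold the remaining elements into one value (6 + 8 + 10 = 24)
        large.reduce(new ReduceFunction<Integer>() {
            @Override
            public Integer reduce(Integer a, Integer b) throws Exception {
                return a + b;
            }
        }).print();

        // flatMap: one element in, zero or more elements out
        DataSet<String> words = env.fromElements("hello you", "hello me")
                .flatMap(new FlatMapFunction<String, String>() {
                    @Override
                    public void flatMap(String line, Collector<String> out) throws Exception {
                        for (String word : line.split(" ")) {
                            out.collect(word);
                        }
                    }
                });

        // union: concatenate two datasets of the same element type
        words.union(env.fromElements("hello them")).print();
    }
}
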
package com.im.flink.task.batch;

import org.apache.flink.api.common.functions.MapPartitionFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.util.Collector;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * describe:
 *
 * @author lm
 * @date 2019/11/3
 */
public class BatchDemoMapPartition {

    public static void main(String[] args) throws Exception {

       ExecutionEnvironment env  = ExecutionEnvironment.getExecutionEnvironment();


       List<String> list = new ArrayList<>();

       list.add("hello you");
       list.add("helloe me");

        DataSource<String> text = env.fromCollection(list);

        DataSet<String> mapPartitionData = text.mapPartition(new MapPartitionFunction<String, String>() {

            @Override
            public void mapPartition(Iterable<String> iterable, Collector<String> collector) throws Exception {
                // Acquire the database connection here: with mapPartition it is
                // acquired once per partition rather than once per element.
                // iterable holds the data of a single partition.

                Iterator<String> it = iterable.iterator();
                while (it.hasNext()) {
                    String next = it.next();
                    String[] split = next.split(" ");
                    for (String word : split) {
                        System.out.println("BatchDemoMapPartition.mapPartition:" + word);
                        collector.collect(word);
                    }
                }
                // Close the connection here.
            }
        });

        mapPartitionData.distinct().print();

    }


}
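The comments in mapPartition above only mark where the resource handling would go. A minimal sketch of the pattern, in which the Connection type, createConnection(), and enrich() are hypothetical placeholders for whatever client library is actually used:

        text.mapPartition(new MapPartitionFunction<String, String>() {
            @Override
            public void mapPartition(Iterable<String> values, Collector<String> out) throws Exception {
                // One connection for the whole partition (createConnection() is a placeholder).
                Connection conn = createConnection();
                try {
                    for (String value : values) {
                        // enrich() is likewise a placeholder for the per-element lookup.
                        out.collect(enrich(conn, value));
                    }
                } finally {
                    // Always release the resource, even if processing fails.
                    conn.close();
                }
            }
        });
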

 

package com.im.flink.task.batch;

import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;


import java.util.ArrayList;

/**
 * describe: inner join
 *
 * @author lm
 * @date 2019/11/4
 */
public class BatchDemoJoin {


    public static void main(String[] args) throws Exception {

        ExecutionEnvironment env  = ExecutionEnvironment.getExecutionEnvironment();

        // <user id, user name>
        ArrayList<Tuple2<Integer, String>> data = new ArrayList<>();

        data.add(new Tuple2<>(1,"zs"));
        data.add(new Tuple2<>(2,"ls"));
        data.add(new Tuple2<>(3,"ww"));
        // <user id, city>
        ArrayList<Tuple2<Integer, String>> data1 = new ArrayList<>();

        data1.add(new Tuple2<>(1,"beijing"));
        data1.add(new Tuple2<>(2,"shanghai"));
        data1.add(new Tuple2<>(3,"guangzhou"));

        DataSource<Tuple2<Integer, String>> text1 = env.fromCollection(data);
        DataSource<Tuple2<Integer, String>> text2 = env.fromCollection(data1);

        try {
            text1.join(text2).where(0)  // key field index in the first dataset
                    .equalTo(0)         // key field index in the second dataset
                    .with(new JoinFunction<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple3<Integer, String, String>>() {
                        @Override
                        public Tuple3<Integer, String, String> join(Tuple2<Integer, String> first, Tuple2<Integer, String> second) throws Exception {
                            return new Tuple3<>(first.f0, first.f1, second.f1);
                        }
                    }).print();
        } catch (Exception e) {
            e.printStackTrace();
        }

        text1.join(text2).where(0)      // key field index in the first dataset
                .equalTo(0)             // key field index in the second dataset
                .map(new MapFunction<Tuple2<Tuple2<Integer, String>, Tuple2<Integer, String>>, Tuple3<Integer, String, String>>() {

                    @Override
                    public Tuple3<Integer, String, String> map(Tuple2<Tuple2<Integer, String>, Tuple2<Integer, String>> pair) throws Exception {
                        return new Tuple3<>(pair.f0.f0, pair.f0.f1, pair.f1.f1);
                    }
                }).print();


    }
}
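When the joined record is just a projection of input fields, the JoinFunction can be skipped: the DataSet API also offers projectFirst()/projectSecond() on a join. A minimal sketch, reusing text1 and text2 from the class above:

        // Fields 0 and 1 from the first input, field 1 from the second:
        // produces Tuple3<Integer, String, String> rows such as (1, zs, beijing).
        text1.join(text2).where(0).equalTo(0)
                .projectFirst(0, 1)
                .projectSecond(1)
                .print();
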

 

package com.im.flink.task.batch;

import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;

import java.util.ArrayList;

/**
 * describe: outer joins
 *
 * left outer join
 * right outer join
 * full outer join
 *
 * @author lm
 * @date 2019/11/4
 */
public class BatchDemoOutJoin {

    public static void main(String[] args) throws Exception {

        ExecutionEnvironment env  = ExecutionEnvironment.getExecutionEnvironment();

        // <user id, user name>
        ArrayList<Tuple2<Integer, String>> data = new ArrayList<>();

        data.add(new Tuple2<>(1,"zs"));
        data.add(new Tuple2<>(2,"ls"));
        data.add(new Tuple2<>(3,"ww"));
        // <user id, city>
        ArrayList<Tuple2<Integer, String>> data1 = new ArrayList<>();

        data1.add(new Tuple2<>(1,"beijing"));
        data1.add(new Tuple2<>(2,"shanghai"));
        data1.add(new Tuple2<>(4,"guangzhou"));

        DataSource<Tuple2<Integer, String>> text1 = env.fromCollection(data);
        DataSource<Tuple2<Integer, String>> text2 = env.fromCollection(data1);

        /**
         * Left outer join
         *
         * Note: second may be null for keys that exist only in the first dataset
         */
        text1.leftOuterJoin(text2).where(0)
                .equalTo(0)
                .with(new JoinFunction<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple3<Integer, String, String>>() {
                    @Override
                    public Tuple3<Integer, String, String> join(Tuple2<Integer, String> first, Tuple2<Integer, String> second) throws Exception {
                        if (second == null) {
                            return new Tuple3<>(first.f0, first.f1, "null");
                        } else {
                            return new Tuple3<>(first.f0, first.f1, second.f1);
                        }
                    }
                }).print();
        /**
         * Right outer join
         *
         * Note: first may be null for keys that exist only in the second dataset
         */
        text1.rightOuterJoin(text2)
                .where(0)
                .equalTo(0)
                .with(new JoinFunction<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple3<Integer, String, String>>() {
                    @Override
                    public Tuple3<Integer, String, String> join(Tuple2<Integer, String> first, Tuple2<Integer, String> second) throws Exception {
                        if (first == null) {
                            return new Tuple3<>(second.f0, "null", second.f1);
                        } else {
                            return new Tuple3<>(first.f0, first.f1, second.f1);
                        }
                    }
                }).print();

        /**
         * Full outer join
         *
         * Note: either side may be null
         */
        text1.fullOuterJoin(text2)
                .where(0)
                .equalTo(0)
                .with(new JoinFunction<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple3<Integer, String, String>>() {
                    @Override
                    public Tuple3<Integer, String, String> join(Tuple2<Integer, String> first, Tuple2<Integer, String> second) throws Exception {
                        if (first == null) {
                            return new Tuple3<>(second.f0, "null", second.f1);
                        } else if (second == null) {
                            return new Tuple3<>(first.f0, first.f1, "null");
                        } else {
                            return new Tuple3<>(first.f0, first.f1, second.f1);
                        }
                    }
                }).print();

    }
}

 

package com.im.flink.task.batch;

import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;

import java.util.ArrayList;

/**
 * describe: Cartesian product
 *
 * @author lm
 * @date 2019/11/4
 */
public class BatchDemoCross {

    public static void main(String[] args) {

        ExecutionEnvironment env  = ExecutionEnvironment.getExecutionEnvironment();

        // first dataset: names
        ArrayList<String> data = new ArrayList<>();

        data.add("zs");
        data.add("ls");
        data.add("ww");
        // second dataset: numbers
        ArrayList<Integer> data1 = new ArrayList<>();

        data1.add(1);
        data1.add(2);
        data1.add(4);

        DataSource<String> text1 = env.fromCollection(data);
        DataSource<Integer> text2 = env.fromCollection(data1);

        try {
            text1.cross(text2).print();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
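With three elements on each side, the cross emits all nine pairs: (zs,1), (zs,2), and so on. If the raw Tuple2 pair is not the desired output, a CrossFunction (org.apache.flink.api.common.functions.CrossFunction) can shape the result directly. A minimal sketch, reusing text1 and text2 from the class above:

        // Combine each pair into a readable string instead of a Tuple2.
        text1.cross(text2)
                .with(new CrossFunction<String, Integer, String>() {
                    @Override
                    public String cross(String name, Integer num) throws Exception {
                        return name + "->" + num;
                    }
                })
                .print();
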
package com.im.flink.task.batch;

import org.apache.flink.api.common.operators.Order;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;

import java.util.ArrayList;

/**
 * describe: take the first N elements
 *
 * @author lm
 * @date 2019/11/4
 */
public class BatchDemoFirstn {

    public static void main(String[] args) throws Exception {

        ExecutionEnvironment env  = ExecutionEnvironment.getExecutionEnvironment();

        // <user id, user name>
        ArrayList<Tuple2<Integer, String>> data = new ArrayList<>();

        data.add(new Tuple2<>(6,"zs"));
        data.add(new Tuple2<>(5,"ww"));
        data.add(new Tuple2<>(4,"ww"));
        data.add(new Tuple2<>(1,"zs"));
        data.add(new Tuple2<>(2,"ls"));
        data.add(new Tuple2<>(4,"ww"));

        DataSource<Tuple2<Integer, String>> datas = env.fromCollection(data);
        // print the first 3 elements, in insertion order
        datas.first(3).print();
        System.out.println("==============================");

        // group by the first field and take the first 2 elements of each group
        datas.groupBy(0).first(2).print();
        System.out.println("==============================");
        // group by the first field, sort each group by the second field, then take the first 2 elements of each group
        datas.groupBy(0).sortGroup(1, Order.ASCENDING).first(2).print();
        System.out.println("==============================");
        // no grouping: sort each partition on field 0 ascending, then field 1 descending, and take the first 3 elements
        datas.sortPartition(0, Order.ASCENDING).sortPartition(1, Order.DESCENDING).first(3).print();

    }
}
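One caveat on the last call: sortPartition sorts each partition locally, so with parallelism greater than one first(3) does not return a globally sorted top-3. Forcing a single partition makes the local sort global. A minimal sketch, assuming datas from the class above:

        // Parallelism 1 => one partition, so the local sort is effectively global.
        datas.sortPartition(0, Order.ASCENDING)
                .sortPartition(1, Order.DESCENDING)
                .setParallelism(1)
                .first(3)
                .print();
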

 
