Spark_3

Learning Spark, day 3
1. Common transformations
[Figure 1: overview table of common transformation operators]

package main;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

public class Transformation {
    public static void main(String[] args) {
//        myMap();
// Result:
//        2
//        4
//        6
//        8
//        10
//        12

//        myFilter();
// Result:
//        2
//        4
//        6
//        8
//        10

//        myFlatMap();
// Result:
//        hello
//        me
//        hello
//        you
//        hello
//        world

//        myGroupByKey();
// Result:
//        class:class1
//        85
//        49
//        92
//        =========================
//        class:class2
//        56
//        89
//        68
//        =========================

//        myReduceByKey();
// Result:
//        class1:71
//        class2:57

//        mySortByKey();
// Result:
//        45:xiaobao
//        60:daqiang
//        71:wangwu
//        92:xiaozhang
//        100:xiaoli

//        myJoinAndCogroup();
// Result:
//        1:zhansan:99
//        3:wangwu:56
//        2:lisi:89
//        ================================
//        1:[zhansan]:[45, 78]
//        3:[wangwu]:[38, 58]
//        2:[lisi]:[82, 76]
    }

//    map operator: multiply every element by 2
    private static void myMap(){
        SparkConf conf = new SparkConf().setMaster("local").setAppName("myMap");

        JavaSparkContext sc = new JavaSparkContext(conf);

        //build a local collection
        List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5, 6);
        //parallelize the collection into the initial RDD
        JavaRDD<Integer> numbersRDD = sc.parallelize(numbers);
        //use the map operator to multiply every element in the collection by 2
        // the map operator can be called on an RDD of any element type
        // in Java, map takes a Function object as its argument
        // when creating the Function, the second generic parameter is the type of the new elements it returns
        // the return type of call() must match that second generic type
        // inside call() you can transform the original element however you like; the returned values form a new RDD
        JavaRDD<Integer> map = numbersRDD.map(new Function<Integer, Integer>() {
            @Override
            public Integer call(Integer integer) throws Exception {
                return integer * 2;
            }
        });
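        // A minimal sketch (assumption: the project builds with Java 8+): Function has a
        // single abstract method, so the same map can also be written as a lambda, e.g.
        // JavaRDD<Integer> doubled = numbersRDD.map(i -> i * 2);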
        // print the contents of the new RDD
        map.foreach(new VoidFunction<Integer>() {
            @Override
            public void call(Integer integer) throws Exception {
                System.out.println(integer);
            }
        });
        sc.close();
    }


//    filter operator: keep only the even numbers in the collection
    private static void myFilter(){
        SparkConf conf = new SparkConf().setAppName("myFilter").setMaster("local");

        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);

        JavaRDD<Integer> numberRDD = sc.parallelize(numbers);
        //apply the filter operator to the initial RDD to keep only the even numbers
        //filter also takes a Function; the other points to note are the same as for map
        //every element of the initial RDD is passed into call(), where you can run any custom logic
        //to decide whether that element is one you want to keep
        //return true to keep the element in the new RDD; return false to drop it
        JavaRDD<Integer> filter = numberRDD.filter(new Function<Integer, Boolean>() {
            @Override
            public Boolean call(Integer integer) throws Exception {
                return integer % 2 == 0;
            }
        });
        filter.foreach(new VoidFunction<Integer>() {
            @Override
            public void call(Integer integer) throws Exception {
                System.out.println(integer);
            }
        });
        sc.close();
    }

    /*
    flatMap example: split lines of text into individual words
     */
    private static void myFlatMap(){
        SparkConf conf = new SparkConf().setMaster("local").setAppName("myFlatMap()");

        JavaSparkContext sc = new JavaSparkContext(conf);

        //build a mock collection
        List<String> lineList = Arrays.asList("hello me", "hello you", "hello world");

        JavaRDD<String> lines = sc.parallelize(lineList);
        //apply the flatMap operator to the RDD to split each line of text into multiple words
        //in Java, flatMap takes a FlatMapFunction as its argument
        //we define FlatMapFunction's second generic type ourselves; it is the type of the returned new elements
        //the return type of call() is not a single object but an Iterable whose element type matches that second generic type (the Spark 2.x Java API returns an Iterator instead)
        //flatMap takes each element of the original RDD, applies whatever processing is needed, and can return multiple elements
        //the returned elements are wrapped in the Iterable, e.g. an ArrayList or another collection
        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterable<String> call(String s) throws Exception {
                return Arrays.asList(s.split(" "));
            }
        });
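        // A hedged sketch of the Spark 2.x form of the same operator, where
        // FlatMapFunction.call returns an Iterator rather than an Iterable
        // ("words2" is a hypothetical name for illustration):
        // JavaRDD<String> words2 = lines.flatMap(
        //         (FlatMapFunction<String, String>) s -> Arrays.asList(s.split(" ")).iterator());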

        words.foreach(new VoidFunction<String>() {
            @Override
            public void call(String s) throws Exception {
                System.out.println(s);
            }
        });
        sc.close();
    }
    /*
    groupByKey example: group the scores by class
     */
    private static void myGroupByKey(){
        SparkConf conf = new SparkConf().setAppName("myGroupByKey").setMaster("local");

        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Tuple2<String, Integer>> scoreList = Arrays.asList(
                new Tuple2<String, Integer>("class1", 85),
                new Tuple2<String, Integer>("class2", 56),
                new Tuple2<String, Integer>("class1", 49),
                new Tuple2<String, Integer>("class2", 89),
                new Tuple2<String, Integer>("class1", 92),
                new Tuple2<String, Integer>("class2", 68)

        );
        JavaPairRDD<String, Integer> scores = sc.parallelizePairs(scoreList);

        JavaPairRDD<String, Iterable<Integer>> groupByKeyScore = scores.groupByKey();
        groupByKeyScore.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
            @Override
            public void call(Tuple2<String, Iterable<Integer>> stringIterableTuple2) throws Exception {
                System.out.println("班级"+":"+ stringIterableTuple2._1);
                Iterator<Integer> iterator = stringIterableTuple2._2.iterator();
                //iterator.hasNext()判断是否还有元素
                while(iterator.hasNext()){
                    System.out.println(iterator.next());
                }
                System.out.println("=========================");
            }
        });
        sc.close();
    }

    /*
    reduceByKey example: total up each class's score
     */
    private static void myReduceByKey(){
        SparkConf conf = new SparkConf().setMaster("local").setAppName("myReduceByKey");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Tuple2<String, Integer>> sumClass = Arrays.asList(
                new Tuple2<String, Integer>("class1", 26),
                new Tuple2<String, Integer>("class1", 45),
                new Tuple2<String, Integer>("class2", 35),
                new Tuple2<String, Integer>("class2", 22)
        );
        JavaPairRDD<String, Integer> sumClassRDD = sc.parallelizePairs(sumClass);

        //apply reduceByKey to the scores RDD
        //reduceByKey takes a Function2, which has three generic parameters representing three values
        //the first and second generic types are the value type of the original RDD
        //for each key, the reduce passes in the first and second values, then passes that result in with the next value, and so on
        //the first two generic types also determine the types of the two input parameters of call()
        //the third generic type is the return type of each reduce step, normally the same as the original RDD's value type
        //the RDD returned by reduceByKey is still a JavaPairRDD

        JavaPairRDD<String, Integer> sumReduce = sumClassRDD.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            //for each key, its values are passed into call() in turn
            //aggregating them into a single value for that key
            //each key and its aggregated value are then combined into a Tuple2, an element of the new RDD
            public Integer call(Integer integer, Integer integer2) throws Exception {
                return integer + integer2;
            }
        });
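        // Unlike groupByKey, reduceByKey can combine values locally on each partition
        // before the shuffle, so it is generally the better choice for aggregations.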
        sumReduce.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            @Override
            public void call(Tuple2<String, Integer> tuple2) throws Exception {
                System.out.println(tuple2._1+":"+tuple2._2);
            }
        });
        sc.close();
    }
    /*
    sortByKey example: sort students by score
     */
    private static void mySortByKey(){
        SparkConf conf = new SparkConf().setMaster("local").setAppName("mySortByKey");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Tuple2<Integer, String>> stuScore = Arrays.asList(
                new Tuple2<Integer, String>(100, "xiaoli"),
                new Tuple2<Integer, String>(92, "xiaozhang"),
                new Tuple2<Integer, String>(45, "xiaobao"),
                new Tuple2<Integer, String>(60, "daqiang"),
                new Tuple2<Integer, String>(71, "wangwu")
        );
        JavaPairRDD<Integer, String> stuScoreRDD = sc.parallelizePairs(stuScore);

        //apply the sortByKey operator to the scores RDD
        //passing false sorts in descending order; the default is ascending
        //sortByKey simply sorts by key; ascending or descending can be specified explicitly
        //the result is still a JavaPairRDD containing exactly the same elements as the original RDD
        //only the order of the elements is different
        JavaPairRDD<Integer, String> rdd = stuScoreRDD.sortByKey();
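        // A minimal sketch of the descending variant mentioned above ("descRDD" is a hypothetical name):
        // JavaPairRDD<Integer, String> descRDD = stuScoreRDD.sortByKey(false);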
        rdd.foreach(new VoidFunction<Tuple2<Integer, String>>() {
            @Override
            public void call(Tuple2<Integer, String> tuple2) throws Exception {
                System.out.println(tuple2._1+":"+tuple2._2);
            }
        });
        sc.close();
    }
    /*
    join and cogroup example: print each student's scores
     */
    private static void myJoinAndCogroup(){
        SparkConf conf = new SparkConf().setAppName("myJoinAndCogroup").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        List<Tuple2<Integer, String>> stuList = Arrays.asList(
                new Tuple2<Integer, String>(1, "zhansan"),
                new Tuple2<Integer, String>(2, "lisi"),
                new Tuple2<Integer, String>(3, "wangwu")
        );
        List<Tuple2<Integer, Integer>> scoreOneList = Arrays.asList(
                new Tuple2<Integer, Integer>(1, 99),
                new Tuple2<Integer, Integer>(2, 89),
                new Tuple2<Integer, Integer>(3, 56)
        );
        List<Tuple2<Integer, Integer>> scoreTwoList = Arrays.asList(
                new Tuple2<Integer, Integer>(1, 45),
                new Tuple2<Integer, Integer>(2, 82),
                new Tuple2<Integer, Integer>(3, 38),
                new Tuple2<Integer, Integer>(1, 78),
                new Tuple2<Integer, Integer>(2, 76),
                new Tuple2<Integer, Integer>(3, 58)
                );
        //parallelize the two RDDs
        JavaPairRDD<Integer, String> students = sc.parallelizePairs(stuList);
        final JavaPairRDD<Integer, Integer> scoreOne = sc.parallelizePairs(scoreOneList);
        JavaPairRDD<Integer, Integer> scoreTwo = sc.parallelizePairs(scoreTwoList);
        //use join to relate the two RDDs
        //join matches records by key and returns a JavaPairRDD
        //its first generic type is the key type of the two input JavaPairRDDs, since the join is performed on the key
        //its second generic type is a Tuple2 whose two generic types are the value types of the original RDDs
        //each element of the returned RDD is one pair matched up by key
        //for example: one RDD containing (1,1),(1,2),(1,3)
        //and another containing (1,4),(2,1),(3,1)
        //joining them yields (1,(1,4)),(1,(2,4)),(1,(3,4))
        JavaPairRDD<Integer, Tuple2<String, Integer>> studentsJoinScoreOne = students.join(scoreOne);
        studentsJoinScoreOne.foreach(new VoidFunction<Tuple2<Integer, Tuple2<String, Integer>>>() {
            @Override
            public void call(Tuple2<Integer, Tuple2<String, Integer>> tuple2) throws Exception {
                System.out.println(tuple2._1+":"+tuple2._2._1+":"+tuple2._2._2);
            }
        });
        System.out.println("================================");

        //cogroup is different from join
        //for each key, all the values matched on that key are gathered into one Iterable per input RDD
        JavaPairRDD<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> cogroups = students.cogroup(scoreTwo);
        cogroups.foreach(new VoidFunction<Tuple2<Integer, Tuple2<Iterable<String>, Iterable<Integer>>>>() {
            @Override
            public void call(Tuple2<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> tuple2) throws Exception {
                System.out.println(tuple2._1+":"+tuple2._2._1+":"+tuple2._2._2);
            }
        });
        sc.close();


    }


}
 
  

2. Common actions
[Figure 2: overview table of common action operators]

package main;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import scala.Tuple2;

import java.util.Arrays;
import java.util.List;
import java.util.Map;

public class Action {
    public static void main(String[] args) {

//        reduce();
//        55

//        collect();
//        2
//        4
//        6
//        8
//        10
//        12
//        14
//        16
//        18
//        20

//        count();
//        10

//        take();
//        1
//        2
//        3

//        saveAsTextFile();

//        countByKey();
//        class1:3
//        class2:3


    }
    private static void reduce(){
        //sum the elements of the collection
        SparkConf conf = new SparkConf().setMaster("local").setAppName("reduce");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> numList = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
        JavaRDD<Integer> numRDD = sc.parallelize(numList);
        Integer sum = numRDD.reduce(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer integer, Integer integer2) throws Exception {
                return integer + integer2;
            }
        });
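        // The function passed to reduce should be commutative and associative,
        // since Spark applies it in parallel across partitions.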
        System.out.println(sum);
        sc.close();
    }
    private static void collect(){
        //multiply every element of the collection by 2
        SparkConf conf = new SparkConf().setAppName("collect").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> numList = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);

        JavaRDD<Integer> numRDD = sc.parallelize(numList);
        JavaRDD<Integer> numc2 = numRDD.map(new Function<Integer, Integer>() {
            @Override
            public Integer call(Integer integer) throws Exception {
                return integer * 2;
            }
        });
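        // Note: collect() pulls the entire RDD back to the driver, so it is only
        // suitable when the result is small enough to fit in driver memory.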
        List<Integer> collect = numc2.collect();
        for (Integer a: collect){
            System.out.println(a);
        }
        sc.close();
    }
    private static void count(){
        //get the total number of elements in the RDD
        SparkConf conf = new SparkConf().setMaster("local").setAppName("count");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> numList = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
        JavaRDD<Integer> nums = sc.parallelize(numList);
        long count = nums.count();
        System.out.println(count);
        sc.close();
    }
    private static void take(){
        //get the first n elements of the RDD

        SparkConf conf = new SparkConf().setAppName("take").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
        JavaRDD<Integer> numb = sc.parallelize(numbers);
        //take fetches RDD data from the remote cluster to the driver, but only the first n elements
        List<Integer> take = numb.take(3);
        for (Integer a: take){
            System.out.println(a);
        }
        sc.close();
    }

    private static void saveAsTextFile(){
        SparkConf conf = new SparkConf().setMaster("local").setAppName("saveAsTextFile");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> numberList = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);

        JavaRDD<Integer> numbers = sc.parallelize(numberList);
        JavaRDD<Integer> numc3 = numbers.map(new Function<Integer, Integer>() {
            @Override
            public Integer call(Integer integer) throws Exception {
                return integer * 3;
            }
        });
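        // Note: saveAsTextFile writes a directory of part-xxxxx files (one per partition)
        // at the given path, not a single file; "hdfs://hadoop1:9000" is this example's
        // own cluster address and should be adjusted to the local environment.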
        numc3.saveAsTextFile("hdfs://hadoop1:9000/numc3.txt");
        sc.close();
    }

    private static void countByKey(){
        SparkConf conf = new SparkConf().setAppName("countByKey").setMaster("local");

        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Tuple2<String, String>> studentList = Arrays.asList(
                new Tuple2<String, String>("class1", "leo"),
                new Tuple2<String, String>("class2", "xiaoli"),
                new Tuple2<String, String>("class2", "xiaohe"),
                new Tuple2<String, String>("class1", "wangwu"),
                new Tuple2<String, String>("class2", "zhangli"),
                new Tuple2<String, String>("class1", "choutong")
        );
        JavaPairRDD<String, String> student = sc.parallelizePairs(studentList);
        //apply countByKey to the RDD to count the students in each class,
        //i.e. the number of elements for each key; that is what countByKey does
        Map<String, Object> countStudent = student.countByKey();
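        // Note: in this Spark 1.x-style Java API countByKey is declared as Map<K, Object>,
        // but the values are Long counts at runtime; newer releases declare Map<K, Long>.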
        for(Map.Entry<String,Object> a:countStudent.entrySet()){
            System.out.println(a.getKey()+":"+a.getValue());
        }
        sc.close();
    }

}
