概述
特性
//Keywords: parallelize, List
//By default Spark sets the number of partitions based on the cluster; it can also be specified via the second parameter of parallelize
val rdd:RDD[String] = sc.parallelize(List("hello world","hello java","hello scala"))
//Relative path: the hello.txt file under the data directory of the current project root
//NOTE(review): the three `val rdd` declarations below are alternative examples; they would conflict if compiled in the same scope
val rdd:RDD[String]=sc.textFile("data/hello.txt")
//Absolute path: the hello.txt file directly under drive D:
val rdd:RDD[String]=sc.textFile("D:/hello.txt")
//Read a file from HDFS
//By default Spark creates one partition per HDFS block; the count can also be given as the second argument of textFile(), but only a value larger than the number of blocks takes effect
val rdd:RDD[String]=sc.textFile("hdfs://IP地址:端口号/目录/文件")
使用Transformation算子实际只会记录RDD的转换过程但不会真正执行,只有遇到Action算子才会真正执行前面记录的转换算子
举个例子:用textFile读取本地文件来创建RDD,哪怕实际上该文件并不存在,也能成功创建RDD。当RDD遇到第一个行动算子(action)操作、需要对RDD进行计算时,才会报错。这也说明了转换操作的本质:仅仅是记录旧RDD如何转化成新RDD,但不会立即进行计算,以免浪费资源。
常用的转换算子
Scala版
//map: apply the given operation to every element, producing a new RDD
println("_____________map算子___________________")
val mapRdd:RDD[String]=sc.parallelize(List("hello","world"))
//Use map to turn each element into a (element, 1) tuple
mapRdd.map((_,1)).collect.foreach(println)
/*Output:
(hello,1)
(world,1)
*/
//filter: keep only the elements matching the predicate
println("____________filter算子_________________")
val filterRdd:RDD[Int]=sc.parallelize(List(1,2,3,4,5))
filterRdd.filter(_%2==0).collect.foreach(println)
/*Output:
2
4
*/
//mapValues: operate on the value part of each pair only; keys are left untouched
println("____________mapValue算子_________________")
val mapvalueRdd:RDD[(Int,String)] = sc.parallelize(List("dog","tiger","cat")).map(x=>(x.length,x))
mapvalueRdd.mapValues(x=>"*"+x+"*").collect.foreach(println)
/*Output:
(3,*dog*)
(5,*tiger*)
(3,*cat*)
*/
//distinct: remove duplicate elements
println("______________distinct算子________________")
val disRdd:RDD[Int]=sc.parallelize(List(1,2,2,2,3,3,4))
disRdd.distinct.collect.foreach(println)
/*Output (order may vary across partitions):
1
2
3
4*/
//reduceByKey: merge the values of all pairs sharing the same key, using the given binary function
println("______________reduceByKey算子________________")
val rbkRdd:RDD[(Int,String)] = sc.parallelize(List("dog","tiger","cat","lion","eagle")).map(x=>(x.length,x))
rbkRdd.reduceByKey((a,b)=>a+b).collect.foreach(println)
/*Output:
(3,dogcat)
(4,lion)
(5,tigereagle)
*/
println("______________groupBykey算子________________")
val gbkRdd:RDD[(Int,String)] = sc.parallelize(List("dog","tiger","cat","lion","eagle")).map(x=>(x.length,x))
//groupByKey groups all values sharing a key; the value of each result pair is an iterable
val gbkRdd2:RDD[(Int,Iterable[String])]=gbkRdd.groupByKey()
//Fixed: the original referenced the undefined `gbkRdd3`; the grouped RDD is bound to `gbkRdd2`
gbkRdd2.collect.foreach(println)
/*Output:
(3,CompactBuffer(dog,cat))
(4,CompactBuffer(lion))
(5,CompactBuffer(tiger,eagle))
*/
//union: merge two RDDs into one
println("______________union算子________________")
val unRdd1:RDD[Int]=sc.parallelize(List(1,2))
val unRdd2:RDD[Int]=sc.parallelize(List(3,4))
unRdd1.union(unRdd2).collect.foreach(println)
/*Output:
1
2
3
4
*/
//subtract: return all elements of the left RDD that do not appear in the right RDD
println("___________________subtract算子_______________________")
val rddSub:RDD[Int] = sc.parallelize(List(1,2,3,4))
val rddSub2:RDD[Int] = sc.parallelize(List(1,2,3))
rddSub.subtract(rddSub2).collect.foreach(x=>print(x+","))
/*Output:
4
*/
//join (inner join): pair up the elements of the two RDDs whose keys are equal
println("___________________join算子_______________________")
val rddJoin:RDD[(String,Int)] = sc.parallelize(List(new Tuple2[String,Int]("a",1),new Tuple2[String,Int]("b",1)))
val rddJoin2:RDD[(String,Int)] = sc.parallelize(List(new Tuple2[String,Int]("a",2),new Tuple2[String,Int]("a",2),new Tuple2[String,Int]("b",2),new Tuple2[String,Int]("c",2)))
rddJoin.join(rddJoin2).collect.foreach(println)
/*Output:
(a,(1,2))
(a,(1,2))
(b,(1,2))*/
//fullOuterJoin: matched keys behave like join; a key present on only one side gets None for the missing side
println("___________________fullOuterJoin算子_______________________")
val rddFoj:RDD[(String,Int)] = sc.parallelize(List(new Tuple2[String,Int]("a",1),new Tuple2[String,Int]("b",1)))
val rddFoj2:RDD[(String,Int)] = sc.parallelize(List(new Tuple2[String,Int]("a",2),new Tuple2[String,Int]("b",2),new Tuple2[String,Int]("c",2)))
rddFoj.fullOuterJoin(rddFoj2).collect.foreach(println)
/*Output:
(a,(Some(1),Some(2)))
(b,(Some(1),Some(2)))
(c,(None,Some(2)))*/
//leftOuterJoin: unmatched elements of the right RDD are dropped; unmatched elements of the left RDD are kept
println("___________________LeftOuterJoin算子_______________________")
val rddLoj:RDD[(String,Int)] = sc.parallelize(List(new Tuple2[String,Int]("a",1),new Tuple2[String,Int]("b",1),new Tuple2[String,Int]("c",1)))
val rddLoj2:RDD[(String,Int)] = sc.parallelize(List(new Tuple2[String,Int]("a",2),new Tuple2[String,Int]("b",2),new Tuple2[String,Int]("d",2)))
rddLoj.leftOuterJoin(rddLoj2).collect.foreach(println)
/*Output:
(a,(1,Some(2)))
(b,(1,Some(2)))
(c,(1,None))*/
//rightOuterJoin: unmatched elements of the left RDD are dropped; unmatched elements of the right RDD are kept
println("___________________RightOuterJoin算子_______________________")
val rddRoj:RDD[(String,Int)] = sc.parallelize(List(new Tuple2[String,Int]("a",1),new Tuple2[String,Int]("b",1),new Tuple2[String,Int]("c",1)))
val rddRoj2:RDD[(String,Int)] = sc.parallelize(List(new Tuple2[String,Int]("a",2),new Tuple2[String,Int]("b",2),new Tuple2[String,Int]("d",2)))
rddRoj.rightOuterJoin(rddRoj2).collect.foreach(println)
/*Output:
(d,(None,2))
(a,(Some(1),2))
(b,(Some(1),2))*/
}
}
JAVA版本
//过滤器,根据条件筛选元素
System.out.println("_________________filter算子________________________");
//创建ArrayList集合
ArrayList<Integer> arrFilter = new ArrayList<>();
arrFilter.add(1);
arrFilter.add(2);
arrFilter.add(3);
//创建RDD,将集合作为RDD数据源
JavaRDD<Integer> rddFilter = sc.parallelize(arrFilter);
//方法:def filter(f : org.apache.spark.api.java.function.Function[T, java.lang.Boolean]) : org.apache.spark.api.java.JavaRDD[T]
//解释:参数为Function对象,重写call方法,指定call方法的输入类型为Integer,返回值为Boolean类型,因为实现的是判断元素是否满足指定条件,算子会根据判断结果来决定是否返回元素
//实现:判断元素对2取余是否等于0
JavaRDD<Integer> rddFilter2 = rddFilter.filter(new Function<Integer, Boolean>() {
@Override
public Boolean call(Integer integer) throws Exception {
return integer % 2 == 0;
}
});
//使用collect算子,将元素转成list集合
List<Integer> collectFilter = rddFilter2.collect();
//遍历list集合
for (Integer integer : collectFilter) {
System.out.println(integer);
}
/*
* 输出结果:
* 2
* */
//对所有元素执行相同操作,返回新的RDD,其元素与原RDD元素一一对应
System.out.println("_________________map算子________________________");
//创建ArrayList集合
ArrayList<Integer> arrMap = new ArrayList<>();
arrMap.add(1);
arrMap.add(2);
arrMap.add(3);
//创建RDD,将集合作为RDD数据源
JavaRDD<Integer> rddMap = sc.parallelize(arrMap);
//方法:def map[R](f : org.apache.spark.api.java.function.Function[T, R]) : org.apache.spark.api.java.JavaRDD[R]
//解释:在map算子中创建Function对象,重写call方法(创建Function需要指定重写call方法的输入、输出类型)
//实现:将所有元素对2取余
JavaRDD<Integer> rddMap2 = rddMap.map(new Function<Integer, Integer>() {
@Override
public Integer call(Integer integer) throws Exception {
return integer % 2;
}
});
List<Integer> collect = rddMap2.collect();
for (Integer integer : collect) {
System.out.println(integer);
}
/*输出结果:
* 1
* 0
* 1*/
//一个元素,生成多个元素
System.out.println("_________________flatMap算子________________________");
ArrayList<String> arrFlatMap = new ArrayList<>();
arrFlatMap.add("hello world");
arrFlatMap.add("hello scala");
arrFlatMap.add("hello spark");
JavaRDD<String> rddFlatMap = sc.parallelize(arrFlatMap);
//方法:def flatMap[U](f : org.apache.spark.api.java.function.FlatMapFunction[T, U]) : org.apache.spark.api.java.JavaRDD[U]
//解释:flatMap的参数是FlatMapFunction对象,指定重写方法的输入类型为String,返回的类型是元组Tuple2
//实现:将字符串按空格切分为单个元素,在将单独的字符串以[String,1]的组合塞入tuple元组中
JavaRDD<Tuple2<String, Integer>> rddFlatMap2 = rddFlatMap.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
@Override
public Iterator<Tuple2<String, Integer>> call(String s) throws Exception {
ArrayList<Tuple2<String, Integer>> arr = new ArrayList<>();
String[] s1 = s.split(" ");
for (String s2 : s1) {
arr.add(new Tuple2<>(s2, 1));
}
return arr.iterator();
}
});
List<Tuple2<String, Integer>> collectFlatMap = rddFlatMap2.collect();
for (Tuple2<String, Integer> stringIntegerTuple2 : collectFlatMap) {
System.out.println(stringIntegerTuple2);
}
/*
* 输出结果:
* (hello,1)
(world,1)
(hello,1)
(scala,1)
(hello,1)
(spark,1)
*/
//将RDD元素去重后生成新的RDD
System.out.println("_________________distinct算子________________________");
ArrayList<String> arrDistinct = new ArrayList<>();
arrDistinct.add("a");
arrDistinct.add("a");
arrDistinct.add("b");
arrDistinct.add("c");
JavaRDD<String> rddDistinct = sc.parallelize(arrDistinct);
//这里直接调用了无参数的Distinct
JavaRDD<String> rddDistinct2 = rddDistinct.distinct();
List<String> collectDistinct = rddDistinct2.collect();
for (String s : collectDistinct) {
System.out.println(s);
}
/*
* 输出结果
* a
* b
* c
* */
//合并两个RDD
System.out.println("_________________union算子________________________");
JavaRDD<String> rddUnion= sc.parallelize(Arrays.asList("a","b"));
JavaRDD<String> rddUnion2= sc.parallelize(Arrays.asList("c","d"));
JavaRDD<String> rddUnion3 = rddUnion.union(rddUnion2);
List<String> collectUnion = rddUnion3.collect();
for (String s : collectUnion) {
System.out.print(s+",");
}
System.out.println();
/*输出结果:a,b,c,d,*/
//返回两个RDD的交集,并且去重,需要混洗数据,比较浪费性能
System.out.println("_________________intersection算子________________________");
JavaRDD<String> rddIntersection= sc.parallelize(Arrays.asList("a","b","c"));
JavaRDD<String> rddIntersection2= sc.parallelize(Arrays.asList("a","b","c","d","e","a"));
JavaRDD<String> rddIntersection3 = rddIntersection.intersection(rddIntersection2);
List<String> collectIntersection= rddIntersection3.collect();
for (String s : collectIntersection) {
System.out.println(s);
}
/*输出结果:
* a
* b
* c
* */
//RDD1.subtract(RDD2),返回在RDD1中出现,但是不在RDD2中出现的元素,不去重
System.out.println("_________________subtract算子________________________");
JavaRDD<String> rddSubtract= sc.parallelize(Arrays.asList("a","a","b","c","d","e"));
JavaRDD<String> rddSubtract2= sc.parallelize(Arrays.asList("c","d","e"));
JavaRDD<String> rddSubtract3 = rddSubtract.subtract(rddSubtract2);
List<String> collectSub = rddSubtract3.collect();
for (String s : collectSub) {
System.out.println(s);
}
/*
* 输出结果:
* a
* a
* b
* */
//RDD1.cartesian(RDD2) 返回RDD1和RDD2的笛卡儿积,这个开销非常大
System.out.println("_________________cartesian算子________________________");
JavaRDD<String> rddCartesian= sc.parallelize(Arrays.asList("1","2","3"));
JavaRDD<String> rddCartesian2= sc.parallelize(Arrays.asList("a","b","c"));
JavaPairRDD<String, String> rddCartesian3 = rddCartesian.cartesian(rddCartesian2);
List<Tuple2<String, String>> collectCartesion = rddCartesian3.collect();
for (Tuple2<String, String> stringStringTuple2 : collectCartesion) {
System.out.println(stringStringTuple2);
}
/*输出结果:
(1,a)
(1,b)
(1,c)
(2,a)
(2,b)
(2,c)
(3,a)
(3,b)
(3,c)*/
//对每个元素操作,最后返回元组,一个元素生成一个结果
System.out.println("_________________mapToPair算子________________________");
ArrayList<String> arrMapToPair = new ArrayList<>();
arrMapToPair.add("aa bb");
arrMapToPair.add("cc dd");
arrMapToPair.add("ee ff");
JavaRDD<String> rddMapToPair = sc.parallelize(arrMapToPair);
//方法:def mapToPair[K2, V2](f : org.apache.spark.api.java.function.PairFunction[T, K2, V2]) : org.apache.spark.api.java.JavaPairRDD[K2, V2]
//解释:mapToPairde的参数是PairFunction对象,对象需要传入3个参数,分别是传入参数的类型,与返回结果的两个输出类型
//实现:将元素按空格切割,取第一个作为key,1作为value,以元组的形式返回
JavaPairRDD<String, Integer> rddMapToPair2 = rddMapToPair.mapToPair(new PairFunction<String, String, Integer>() {
@Override
public Tuple2<String, Integer> call(String s) throws Exception {
return new Tuple2<>(s.split(" ")[0], 1);
}
});
List<Tuple2<String, Integer>> collectMapToPair = rddMapToPair2.collect();
for (Tuple2<String, Integer> stringIntegerTuple2 : collectMapToPair) {
System.out.println(stringIntegerTuple2);
}
/*
* 输出结果
* (aa,1)
* (cc,1)
* (ee,1)
* */
// mapToPair是一对一,一个元素返回一个元素,而flatMapToPair可以一个元素返回多个
System.out.println("_________________flatMapToPair算子________________________");
ArrayList<String> arrFlatMapToPair = new ArrayList<>();
arrFlatMapToPair.add("aa bb");
arrFlatMapToPair.add("cc dd");
arrFlatMapToPair.add("ee ff");
JavaRDD<String> rddFlatMapToPair2 = sc.parallelize(arrFlatMapToPair);
//方法:def flatMapToPair[K2, V2](f : org.apache.spark.api.java.function.PairFlatMapFunction[T, K2, V2]) : org.apache.spark.api.java.JavaPairRDD[K2, V2]
//解释:flatMapToPair的参数是PairFlatMapFunction对象,同样是重写call方法,但返回值是Iterator迭代器
//实现:将元素按空格拆分,以拆分后的字符串为key,1为value组成Tuple,放入提前建好的ArrayList集合中,最后通过集合.Iterator的方法,转成迭代器返回
JavaPairRDD<String, Integer> rddFlatMapToPair3 = rddFlatMapToPair2.flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
@Override
public Iterator<Tuple2<String, Integer>> call(String s) throws Exception {
ArrayList<Tuple2<String, Integer>> arr = new ArrayList<>();
String[] s1 = s.split(" ");
for (String s2 : s1) {
arr.add(new Tuple2<>(s2, 1));
}
return arr.iterator();
}
});
List<Tuple2<String, Integer>> collectFlatMapToPari = rddFlatMapToPair3.collect();
for (Tuple2<String, Integer> stringIntegerTuple2 : collectFlatMapToPari) {
System.out.println(stringIntegerTuple2);
}
/* 输出结果:
(aa,1)
(bb,1)
(cc,1)
(dd,1)
(ee,1)
(ff,1)
*/
//聚合运算
System.out.println("_________________combinByKey算子________________________");
ArrayList<Tuple2<String,Integer>> arrCombinByKey=new ArrayList<>();
//插入两名学生的成绩
arrCombinByKey.add(new Tuple2<>("zs",98));
arrCombinByKey.add(new Tuple2<>("zs",72));
arrCombinByKey.add(new Tuple2<>("zs",90));
arrCombinByKey.add(new Tuple2<>("ls",91));
arrCombinByKey.add(new Tuple2<>("ls",67));
arrCombinByKey.add(new Tuple2<>("ls",80));
JavaRDD<Tuple2<String, Integer>> rddCombinByKey = sc.parallelize(arrCombinByKey);
//将元素通过mapToPair转换成二元组,才能使用combinByKey
JavaPairRDD<String, Tuple2<String, Integer>> rddCombinByKey2 = rddCombinByKey.mapToPair(new PairFunction<Tuple2<String, Integer>, String, Tuple2<String, Integer>>() {
@Override
public Tuple2<String, Tuple2<String, Integer>> call(Tuple2<String, Integer> stringIntegerTuple2) throws Exception {
return new Tuple2<>(stringIntegerTuple2._1, stringIntegerTuple2);
}
});
//CombinByKey有三个参数,都是Function对象
//先实现第一个Function
//function拿到元组的value进行操作,此时的value依然是元组,第一个function将value中的成绩取出,作为key,1做为value形成新的元组传至第二个function
Function<Tuple2<String, Integer>, Tuple2<Integer, Integer>> function1 = new Function<Tuple2<String, Integer>, Tuple2<Integer, Integer>>() {
@Override
public Tuple2<Integer, Integer> call(Tuple2<String, Integer> stringIntegerTuple2) throws Exception {
return new Tuple2<>(stringIntegerTuple2._2, 1);
}
};
//实现第二个Funcition
//第二个function拿到第一个function的结果作为参数一,参数二由combineByKey根据key值(也就是同一个人)来传入这个人的其他value,接下来实现成绩相加、课程数+1,得出的数据形成新的元组传至第三个Function
Function2<Tuple2<Integer, Integer>, Tuple2<String, Integer>, Tuple2<Integer, Integer>> function2 = new Function2<Tuple2<Integer, Integer>, Tuple2<String, Integer>, Tuple2<Integer, Integer>>() {
@Override
public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> integerIntegerTuple2, Tuple2<String, Integer> stringIntegerTuple2) throws Exception {
return new Tuple2<>(integerIntegerTuple2._1 + stringIntegerTuple2._2, integerIntegerTuple2._2 + 1);
}
};
//实现第三个Function
//第三个function将拿到多个第二个function计算的结果,这种情况是因为不同的分区计算的结果,最后汇总在一起
Function2<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>> function3 = new Function2<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>>() {
@Override
public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> integerIntegerTuple2, Tuple2<Integer, Integer> integerIntegerTuple22) throws Exception {
return new Tuple2<>(integerIntegerTuple2._1 + integerIntegerTuple22._1, integerIntegerTuple2._2 + integerIntegerTuple22._2);
}
};
//调用combinBykey,将三个Function对象传入
JavaPairRDD<String, Tuple2<Integer, Integer>> rddCombinByKey4 = rddCombinByKey2.combineByKey(function1, function2, function3);
//将结果转成List
List<Tuple2<String, Tuple2<Integer, Integer>>> collectCombinByKey = rddCombinByKey4.collect();
//遍历输出
for (Tuple2<String, Tuple2<Integer, Integer>> stringTuple2Tuple2 : collectCombinByKey) {
//将汇总的总成绩除以总门数就得出了该学生的综合平均值
System.out.println(stringTuple2Tuple2._1+"\t"+stringTuple2Tuple2._2._1/stringTuple2Tuple2._2._2);
}
/*
输出结果:
zs 86
ls 79
*/
//聚合运算
System.out.println("_________________reduceByKey算子________________________");
JavaRDD<String> rddReduceByKey = sc.parallelize(Arrays.asList("hello world hello scala hello spark"));
//使用reduceByKey需要先对元素生成对应的元组,所以这里将使用flatMapToPair,而flatMapToPair的参数是PairFlatMapFunction对象
//实现flatMapToPair中需要的PairFlatMapFunction
//功能:根据空格对元素分隔,再将分隔后的字符串作为key,1作为value组成元组,放入ArrayList集合中最后return出去
PairFlatMapFunction<String, String, Integer> pfm = new PairFlatMapFunction<String, String, Integer>() {
@Override
public Iterator<Tuple2<String, Integer>> call(String s) throws Exception {
ArrayList<Tuple2<String, Integer>> list = new ArrayList<>();
String[] s1 = s.split(" ");
for (int i = 0; i < s1.length; i++) {
Tuple2<String, Integer> stringIntegerTuple2 = new Tuple2<>(s1[i], 1);
list.add(stringIntegerTuple2);
}
return list.iterator();
}
};
//实现reduceByKey的参数Function对象
//功能:将key值相同的Tuple中的value值进行相加
Function2<Integer, Integer, Integer> rby = new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer integer, Integer integer2) throws Exception {
return integer + integer2;
}
};
//用rdd调用flatMapToPair传入上面实现的PairFlatMapFunction对象,在调用reduceByKey传入上面实现的Function对象
//这里实现的是WordCount的功能
JavaPairRDD<String, Integer> stringIntegerJavaPairRDD = rddReduceByKey.flatMapToPair(pfm).reduceByKey(rby);
//将算子转成List集合
List<Tuple2<String, Integer>> collectReduceByKey = stringIntegerJavaPairRDD.collect();
//遍历输出
for (Tuple2<String, Integer> stringIntegerTuple2 : collectReduceByKey) {
System.out.println(stringIntegerTuple2);
}
/*输出结果:
(hello,3)
(world,1)
(spark,1)
(scala,1)*/
//foldByKey也是聚合运算,但是会多一个参数,这个参数会跟每个value进行相同操作后,再去执行聚合
System.out.println("_________________foldByKey算子________________________");
ArrayList<Tuple2<String,Integer>> arrayList = new ArrayList();
arrayList.add(new Tuple2<>("A",1));
arrayList.add(new Tuple2<>("A",2));
arrayList.add(new Tuple2<>("B",1));
arrayList.add(new Tuple2<>("c",1));
JavaRDD<Tuple2<String, Integer>> rddFoldByKey = sc.parallelize(arrayList);
//这里foldByKey无法直接对rdd直接操作,需要转换一下,把rdd放入JavaPairRDD.fromJavaRDD()中
//方法:def foldByKey(zeroValue : V, func : org.apache.spark.api.java.function.Function2[V, V, V]) : org.apache.spark.api.java.JavaPairRDD[K, V]
//解释:foldByKey有两个参数,第一个参数zeroValue为Int,第二个参数为Function2的对象,传入相同key的value,最后返回一个value
//关于zeroValue的作用:
//zeroValue是每个分区内对每个key做聚合时的初始值,每个分区只参与一次,并不是对每个value都操作一次
//例如zeroValue=1、Function2实现为相加时,某分区内key为"A"的聚合结果为 1+value1+value2+...
//本例中("A",1)和("A",2)若落在不同分区,结果为(1+1)+(1+2)=5;若落在同一分区,结果为1+1+2=4,因此输出与分区情况有关
JavaPairRDD<String, Integer> rddFoldByKey2 = JavaPairRDD.fromJavaRDD(rddFoldByKey).foldByKey(1, new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer integer,Integer integer2) throws Exception {
return integer + integer2;
}
});
List<Tuple2<String, Integer>> collectFoldByKey = rddFoldByKey2.collect();
for (Tuple2<String, Integer> stringIntegerTuple2 : collectFoldByKey) {
System.out.println(stringIntegerTuple2);
}
/*输出结果:
(A,5)
(B,2)
(c,2)*/
//按key排序,默认升序,传入参数false为降序
System.out.println("_________________SortByKey算子________________________");
ArrayList<Tuple2<Integer,String>> list = new ArrayList<>();
list.add(new Tuple2<>(98,"zs"));
list.add(new Tuple2<>(84,"ls"));
list.add(new Tuple2<>(99,"ww"));
list.add(new Tuple2<>(72,"ll"));
list.add(new Tuple2<>(79,"lq"));
JavaRDD<Tuple2<Integer, String>> rddSortByKey = sc.parallelize(list);
//一样需要转换一下,然后直接调用sortByKey就可以了。我这里直接collect了
List<Tuple2<Integer, String>> collectSortByKey = JavaPairRDD.fromJavaRDD(rddSortByKey).sortByKey().collect();
for (Tuple2<Integer, String> rddSortByKey2 : collectSortByKey) {
System.out.println(rddSortByKey2);
}
/*输出结果:
(72,ll)
(79,lq)
(84,ls)
(98,zs)
(99,ww)*/
//根据key分组
System.out.println("_________________groupByKey算子________________________");
JavaRDD<Tuple2<String,Integer>> rddGBK = sc.parallelize(Arrays.asList(new Tuple2("xiaoming", 90), new Tuple2("xiaoming", 80), new Tuple2("lihua", 60), new Tuple2("lihua", 98)));
JavaPairRDD<String, Integer> rddGBK2 = JavaPairRDD.fromJavaRDD(rddGBK);
JavaPairRDD<String, Iterable<Integer>> rdd3 = rddGBK2.groupByKey();
List<Tuple2<String, Iterable<Integer>>> collectGBK = rdd3.collect();
for (Tuple2<String, Iterable<Integer>> s : collectGBK) {
for (Integer integer : s._2) {
System.out.println(s._1+","+integer);
}
}
/*输出结果:
lihua,60
lihua,98
xiaoming,90
xiaoming,80*/
//将多个rdd中的元素分组在一起
System.out.println("_________________Cogroup算子________________________");
JavaRDD<Tuple2<String,Integer>> rddCg = sc.parallelize(Arrays.asList(new Tuple2("xiaoming", 1), new Tuple2("xiaoming", 1), new Tuple2("lihua", 1), new Tuple2("lihua", 1)));
JavaRDD<Tuple2<String,Integer>> rddCg2 = sc.parallelize(Arrays.asList(new Tuple2("xiaoming", 2), new Tuple2("xiaoming", 2), new Tuple2("lihua", 2), new Tuple2("lihua", 2)));
JavaRDD<Tuple2<String,Integer>> rddCg3 = sc.parallelize(Arrays.asList(new Tuple2("xiaoming", 3), new Tuple2("xiaoming", 3), new Tuple2("lihua", 3), new Tuple2("lihua", 3)));
JavaPairRDD<String, Integer> rddCgJava = JavaPairRDD.fromJavaRDD(rddCg);
JavaPairRDD<String, Integer> rddCgJava2 = JavaPairRDD.fromJavaRDD(rddCg2);
JavaPairRDD<String, Integer> rddCgJava3 = JavaPairRDD.fromJavaRDD(rddCg3);
//返回的元组中的value是每个集合中相同key的值组成的结果
JavaPairRDD<String, Tuple3<Iterable<Integer>, Iterable<Integer>, Iterable<Integer>>> rddCgAll = rddCgJava.cogroup(rddCgJava2, rddCgJava3);
List<Tuple2<String, Tuple3<Iterable<Integer>, Iterable<Integer>, Iterable<Integer>>>> collectCg = rddCgAll.collect();
for (Tuple2<String, Tuple3<Iterable<Integer>, Iterable<Integer>, Iterable<Integer>>> s : collectCg) {
System.out.println(s._1+","+s._2);
}
/*输出结果:
xiaoming,([1, 1],[2, 2],[3, 3])
lihua,([1, 1],[2, 2],[3, 3])*/
//返回左集合不存在于右集合的所有元素
System.out.println("_________________subtract算子________________________");
JavaRDD<Integer> rddSub1 = sc.parallelize(Arrays.asList(1, 2, 3, 4));
JavaRDD<Integer> rddSub2 = sc.parallelize(Arrays.asList(1, 2, 3));
JavaRDD<Integer> rddSub3 = rddSub1.subtract(rddSub2);
List<Integer> collectS = rddSub3.collect();
for (Integer integer : collectS) {
System.out.println(integer);
}
/*输出结果:
4
*/
//内连接:将两个集合中key值相同的元素连接
System.out.println("_________________join算子________________________");
JavaRDD<Tuple2<String,Integer>> rddJoin1 = sc.parallelize(Arrays.asList(new Tuple2<String,Integer>("a",1),new Tuple2<String,Integer>("b",1)));
JavaRDD<Tuple2<String,Integer>> rddJoin2 = sc.parallelize(Arrays.asList(new Tuple2<String,Integer>("a",2),new Tuple2<String,Integer>("b",2),new Tuple2<String,Integer>("c",2)));
JavaPairRDD<String, Integer> rddJavaJoin1 = JavaPairRDD.fromJavaRDD(rddJoin1);
JavaPairRDD<String, Integer> rddJavaJoin2 = JavaPairRDD.fromJavaRDD(rddJoin2);
JavaPairRDD<String, Tuple2<Integer, Integer>> join = rddJavaJoin1.join(rddJavaJoin2);
List<Tuple2<String, Tuple2<Integer, Integer>>> collectJoin = join.collect();
for (Tuple2<String, Tuple2<Integer, Integer>> s : collectJoin) {
System.out.println(s);
}
/*输出结果:
(a,(1,2))
(b,(1,2))*/
//全连接,元组的value类型为Optional,当存在无对应连接的元组,返回Optional.empty
System.out.println("_________________fullOuterJoin算子________________________");
JavaRDD<Tuple2<String,Integer>> rddfoj1 = sc.parallelize(Arrays.asList(new Tuple2<String,Integer>("a",1),new Tuple2<String,Integer>("b",1)));
JavaRDD<Tuple2<String,Integer>> rddfoj2 = sc.parallelize(Arrays.asList(new Tuple2<String,Integer>("a",2),new Tuple2<String,Integer>("b",2),new Tuple2<String,Integer>("c",2)));
JavaPairRDD<String, Integer> rddJavafoj1 = JavaPairRDD.fromJavaRDD(rddfoj1);
JavaPairRDD<String, Integer> rddJavafoj2 = JavaPairRDD.fromJavaRDD(rddfoj2);
JavaPairRDD<String, Tuple2<Optional<Integer>, Optional<Integer>>> rddfoj = rddJavafoj1.fullOuterJoin(rddJavafoj2);
List<Tuple2<String, Tuple2<Optional<Integer>, Optional<Integer>>>> collectfoj = rddfoj.collect();
for (Tuple2<String, Tuple2<Optional<Integer>, Optional<Integer>>> s : collectfoj) {
System.out.println(s);
}
/*输出结果:
(a,(Optional[1],Optional[2]))
(b,(Optional[1],Optional[2]))
(c,(Optional.empty,Optional[2]))*/
//左关联:右集合中有无关联的丢弃,左边有无关联的保留
System.out.println("_________________LeftOuterJoin算子________________________");
JavaRDD<Tuple2<String,Integer>> rddLoj1 = sc.parallelize(Arrays.asList(new Tuple2<String,Integer>("a",1),new Tuple2<String,Integer>("b",1),new Tuple2<String,Integer>("c",1)));
JavaRDD<Tuple2<String,Integer>> rddLoj2 = sc.parallelize(Arrays.asList(new Tuple2<String,Integer>("a",2),new Tuple2<String,Integer>("b",2),new Tuple2<String,Integer>("d",2)));
JavaPairRDD<String, Integer> rddJavaLoj1 = JavaPairRDD.fromJavaRDD(rddLoj1);
JavaPairRDD<String, Integer> rddJavaLoj2 = JavaPairRDD.fromJavaRDD(rddLoj2);
JavaPairRDD<String, Tuple2<Integer, Optional<Integer>>> rddLoj = rddJavaLoj1.leftOuterJoin(rddJavaLoj2);
List<Tuple2<String, Tuple2<Integer, Optional<Integer>>>> collectLoj = rddLoj.collect();
for (Tuple2<String, Tuple2<Integer, Optional<Integer>>> s : collectLoj) {
System.out.println(s);
}
/* 输出结果:
(a,(1,Optional[2]))
(b,(1,Optional[2]))
(c,(1,Optional.empty))*/
//右关联:左集合中有无关联的丢弃,右边有无关联的保留
System.out.println("________________RightOuterJoin算子________________________");
JavaRDD<Tuple2<String,Integer>> rddRoj1 = sc.parallelize(Arrays.asList(new Tuple2<String,Integer>("a",1),new Tuple2<String,Integer>("b",1),new Tuple2<String,Integer>("c",1)));
JavaRDD<Tuple2<String,Integer>> rddRoj2 = sc.parallelize(Arrays.asList(new Tuple2<String,Integer>("a",2),new Tuple2<String,Integer>("b",2),new Tuple2<String,Integer>("d",2)));
JavaPairRDD<String, Integer> rddJavaRoj1 = JavaPairRDD.fromJavaRDD(rddRoj1);
JavaPairRDD<String, Integer> rddJavaRoj2 = JavaPairRDD.fromJavaRDD(rddRoj2);
JavaPairRDD<String, Tuple2<Optional<Integer>, Integer>> rddRoj = rddJavaRoj1.rightOuterJoin(rddJavaRoj2);
List<Tuple2<String, Tuple2<Optional<Integer>, Integer>>> collectRoj = rddRoj.collect();
for (Tuple2<String, Tuple2<Optional<Integer>, Integer>> s : collectRoj) {
System.out.println(s);
}
/*输出结果:
(a,(Optional[1],2))
(b,(Optional[1],2))
(d,(Optional.empty,2))*/
}
}
常用的动作算子
scala版
//first: return the first element of the RDD
println("_____________first_________________")
val rddFirst:RDD[Int] = sc.parallelize(List(1,2,3,4))
println(rddFirst.first())
/*Output:
1
*/
//take: return the first n elements
println("_____________take_________________")
val rddTake:RDD[Int] = sc.parallelize(List(1,2,3,4))
val rddTakeArray = rddTake.take(2)
for (elem <- rddTakeArray) {
println(elem)
}
/*Output:
1
2
*/
//collect: return all elements of the RDD to the driver
println("_____________collect_________________")
val rddCollect:RDD[Int] = sc.parallelize(List(1,2,3,4))
val ints = rddCollect.collect()
for (elem <- ints) {
println(elem)
}
/*Output:
1
2
3
4*/
//count: return the number of elements in the RDD
println("_____________count_________________")
val rddCount:RDD[Int] = sc.parallelize(List(1,2,3,4))
println(rddCount.count())
/*Output:
4
*/
//countByValue: return how many times each distinct element occurs in the RDD
println("_____________countByValue_________________")
val rddCBV:RDD[Int] = sc.parallelize(List(1,1,2,3,3,4))
val rddCBVmap = rddCBV.countByValue()
for (elem <- rddCBVmap) {
println(elem._1+"出现了:"+elem._2+"次")
}
/*Output:
1出现了:2次
2出现了:1次
3出现了:2次
4出现了:1次*/
//reduce: aggregate all elements of the RDD in parallel with the given binary function
println("_____________reduce_________________")
val rddReduce:RDD[Int] = sc.parallelize(List(1,2,3,4))
println(rddReduce.reduce(_ + _))
/*Output:
10
*/
//fold: like reduce(), but with an initial value. The initial value is applied once per partition when folding
//that partition, and once more when merging the per-partition results — NOT once per element.
//E.g. with 2 partitions [1,2] and [3,4] and zero value 1: (1+1+2) + (1+3+4) merged with 1 gives 1+4+8 = 13.
println("_____________fold_________________")
val rddFold:RDD[Int] = sc.parallelize(List(1,2,3,4),2)
println(rddFold.fold(1)((x,y)=>{println(x,y);x+y}))
/*Output:
(with 1 partition) 12
(with 2 partitions) 13
...
*/
//top: return the n largest elements, i.e. the first n after sorting in DESCENDING order
println("_____________top_________________")
val rddTop:RDD[Int] = sc.parallelize(List(1,2,3,4))
val arrayTop = rddTop.top(2)
for (elem <- arrayTop) {
println(elem)
}
/*Output:
4
3
*/
//takeOrdered: opposite of top — return the n smallest elements, i.e. the first n in ASCENDING order
println("_____________takeOrdered_________________")
val rddTo:RDD[Int] = sc.parallelize(List(1,2,3,4))
val arrayTo = rddTo.takeOrdered(2)
for (elem <- arrayTo) {
println(elem)
}
/* Output:
1
2
*/
//foreach: apply the given function to every element (runs on the executors)
println("_____________foreach_____________________")
val rdd:RDD[Int] = sc.parallelize(List(1,2,3,4))
rdd.foreach(println)
/*Output (order varies between runs because elements are printed per partition):
1
2
3
4*/