/**
 * Return a new RDD by applying a function to each partition of this RDD.
 * `preservesPartitioning` indicates whether the input function preserves the partitioning.
 */
def mapPartitions[U: ClassTag](
    f: Iterator[T] => Iterator[U],
    preservesPartitioning: Boolean = false): RDD[U] = withScope {
  val cleanedF = sc.clean(f)
  new MapPartitionsRDD(
    this,
    (context: TaskContext, index: Int, iter: Iterator[T]) => cleanedF(iter),
    preservesPartitioning)
}
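Note the shape of the user function: it receives an Iterator over an entire partition and must return an Iterator, and it is invoked once per partition rather than once per element. A minimal, purely illustrative sketch (assuming some rdd: RDD[Int]):

rdd.mapPartitions(iter => iter.map(_ * 2)) // one call per partition; elements are doubled lazily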
/**
 * Return a new RDD by applying a function to each partition of this RDD, while tracking the index of the original partition.
 */
def mapPartitionsWithIndex[U: ClassTag](
    f: (Int, Iterator[T]) => Iterator[U],
    preservesPartitioning: Boolean = false): RDD[U] = withScope {
  val cleanedF = sc.clean(f)
  new MapPartitionsRDD(
    this,
    (context: TaskContext, index: Int, iter: Iterator[T]) => cleanedF(index, iter),
    preservesPartitioning)
}
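Compared with mapPartitions, the user function here also receives the partition's index as its first argument. Another illustrative sketch (again assuming rdd: RDD[Int]):

rdd.mapPartitionsWithIndex((index, iter) => iter.map(x => (index, x))) // tags each element with its partition index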
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.{ListBuffer, Map}

object Core005 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("core05")
    val sc = new SparkContext(conf)
    val rdd = sc.parallelize(List(1, 2, 3, 4, 5), 2)
    // The Scala analogue of Java's mapPartitionsToPair
    val rdd1 = rdd.mapPartitions(ite => {
      val list = new ListBuffer[Tuple2[Integer, Integer]]() // scala.collection.mutable.ListBuffer
      while (ite.hasNext) {
        val next = ite.next()
        list += Tuple2(next, next * 2)
      }
      list.iterator
    }, false)
    rdd1.foreach(x => print(x + " "))
    // Output: (1,2) (2,4) (3,6) (4,8) (5,10)

    // mapPartitions
    val rdd2 = rdd.mapPartitions(ite => {
      val list = new ListBuffer[Integer]() // scala.collection.mutable.ListBuffer
      while (ite.hasNext) {
        val next = ite.next()
        list += next
      }
      list.iterator
    }, false)
    rdd2.foreach(x => print(x + " "))
    // Output: 1 2 3 4 5

    // mapPartitionsWithIndex
    val rdd3 = rdd.mapPartitionsWithIndex((index, ite) => {
      val map = Map[String, List[Integer]]() // scala.collection.mutable.Map
      val indexStr = "part-" + index
      while (ite.hasNext) {
        if (map.contains(indexStr)) {
          var tmpList = map(indexStr)
          tmpList = ite.next() :: tmpList
          map(indexStr) = tmpList
        } else {
          map(indexStr) = List[Integer](ite.next())
        }
      }
      map.iterator
    }, false)
    rdd3.foreach(x => print(x + " "))
    // Output: (part-0,List(2, 1)) (part-1,List(5, 4, 3))
  }
}
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;

import scala.Tuple2;

public class Core05 {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("mapPartitions");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<Integer> rdd1 = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5), 2);
        //mapPartitionsToPair
        JavaPairRDD<Integer, Integer> rdd01 = rdd1.mapPartitionsToPair(new PairFlatMapFunction<Iterator<Integer>, Integer, Integer>() {
            @Override
            public Iterator<Tuple2<Integer, Integer>> call(Iterator<Integer> i) throws Exception {
                ArrayList<Tuple2<Integer, Integer>> list = new ArrayList<Tuple2<Integer, Integer>>();
                while (i.hasNext()) {
                    int num = i.next();
                    list.add(new Tuple2<Integer, Integer>(num, num * 2));
                }
                return list.iterator();
            }
        });
        rdd01.foreach(x -> System.out.println(x));
        // Output: (1,2) (2,4) (3,6) (4,8) (5,10)

        //mapPartitions (a pass-through, so the output matches the input elements)
        JavaRDD<Integer> rdd02 = rdd1.mapPartitions(new FlatMapFunction<Iterator<Integer>, Integer>() {
            @Override
            public Iterator<Integer> call(Iterator<Integer> i) throws Exception {
                ArrayList<Integer> list = new ArrayList<Integer>();
                while (i.hasNext()) {
                    list.add(i.next());
                }
                return list.iterator();
            }
        });
        rdd02.foreach(x -> System.out.println(x));
        // Output: 1 2 3 4 5
        //mapPartitionsWithIndex
        JavaRDD<Tuple2<Integer, ArrayList<Integer>>> rdd03 = rdd1.mapPartitionsWithIndex(new Function2<Integer, Iterator<Integer>, Iterator<Tuple2<Integer, ArrayList<Integer>>>>() {
            @Override
            public Iterator<Tuple2<Integer, ArrayList<Integer>>> call(Integer v1, Iterator<Integer> v2) throws Exception {
                HashMap<Integer, ArrayList<Integer>> map = new HashMap<Integer, ArrayList<Integer>>();
                while (v2.hasNext()) {
                    Integer next = v2.next();
                    if (map.containsKey(v1)) {
                        map.get(v1).add(next);
                    } else {
                        ArrayList<Integer> list = new ArrayList<Integer>();
                        list.add(next);
                        map.put(v1, list);
                    }
                }
                Iterator<Integer> iterator = map.keySet().iterator();
                HashSet<Tuple2<Integer, ArrayList<Integer>>> set = new HashSet<Tuple2<Integer, ArrayList<Integer>>>();
                while (iterator.hasNext()) {
                    int next = iterator.next();
                    set.add(new Tuple2<Integer, ArrayList<Integer>>(next, map.get(next)));
                }
                return set.iterator();
            }
        }, false);
        rdd03.foreach(x -> System.out.println(x));
        // Output: (0,[1, 2]) (1,[3, 4, 5])

        // To print each partition's contents directly, use the glom() method,
        // which collapses every partition into a single List.
        JavaRDD<List<Integer>> rdd04 = rdd1.glom();
        rdd04.foreach(x -> System.out.println(x));
        // Output: [1, 2] [3, 4, 5]
    }
}
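The Scala API exposes the same operator. For reference, a minimal illustrative sketch of the equivalent call (reusing the rdd value from the Scala example above):

rdd.glom().foreach(arr => println(arr.mkString("[", ", ", "]"))) // glom() yields one Array per partition
// Output: [1, 2] [3, 4, 5]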
(1) Prefer mapPartitions over the plain map
Operators in the mapPartitions family process all of a partition's data in a single function call, rather than one record per call, so they generally perform better. However, mapPartitions can sometimes trigger OOM (out-of-memory) errors: since a single call must handle an entire partition's data, if memory runs short the garbage collector cannot reclaim many objects, and an OOM exception becomes likely. Use these operators with care!
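The OOM risk comes from buffering an entire partition, not from mapPartitions itself. A hedged sketch of the difference (reusing the rdd value from the Scala example; the doubling logic is purely illustrative):

// Buffered: the ListBuffer holds the whole partition in memory at once.
val buffered = rdd.mapPartitions(ite => {
  val list = scala.collection.mutable.ListBuffer[Int]()
  while (ite.hasNext) list += ite.next() * 2
  list.iterator
})
// Streaming: wrap the input iterator instead; elements are computed one at a
// time as Spark pulls them, so no partition-sized buffer is ever allocated.
val streaming = rdd.mapPartitions(ite => ite.map(_ * 2))

When the per-partition work only needs element-at-a-time access, prefer the streaming form.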