The test below builds three pair RDDs. Each partition function bumps an accumulator, so the accumulator values at the end tell us how many times each RDD was actually computed. (The surrounding setup — context, logger, accumulators — was not shown in the original snippet; a plausible Spark 1.x version is included here for completeness.)

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.Accumulator;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.VoidFunction;

import com.google.common.base.Optional; // Spark 1.x uses Guava's Optional in the Java join API

import scala.Tuple2;

// Assumed setup (not in the original snippet):
// final JavaSparkContext javaSparkContext = new JavaSparkContext(conf);
// final Logger LOG = LoggerFactory.getLogger(SparkLoadDataTest.class);
final Accumulator<Integer> acc1 = javaSparkContext.accumulator(0);
final Accumulator<Integer> acc2 = javaSparkContext.accumulator(0);
final Accumulator<Integer> acc3 = javaSparkContext.accumulator(0);

JavaPairRDD<String, Integer> pairRDD1 = javaSparkContext
        .parallelize(Arrays.asList("2016-02-25", "2016-02-24", "2016-02-26"))
        .mapPartitionsToPair(new PairFlatMapFunction<Iterator<String>, String, Integer>() {
            @Override
            public Iterable<Tuple2<String, Integer>> call(Iterator<String> iterator) throws Exception {
                LOG.info("computing pairRDD1 partition..........");
                acc1.add(1); // +1 every time a partition of pairRDD1 is computed
                List<Tuple2<String, Integer>> list = new ArrayList<Tuple2<String, Integer>>();
                while (iterator.hasNext()) {
                    list.add(new Tuple2<String, Integer>(iterator.next(), 1));
                }
                return list;
            }
        });
long count1 = pairRDD1.count(); // first action on pairRDD1
LOG.info("pairRDD1 element count: " + count1);

JavaPairRDD<String, Integer> pairRDD2 = javaSparkContext
        .parallelize(Arrays.asList("2016-02-25", "2016-02-27", "2016-02-28"))
        .mapPartitionsToPair(new PairFlatMapFunction<Iterator<String>, String, Integer>() {
            @Override
            public Iterable<Tuple2<String, Integer>> call(Iterator<String> iterator) throws Exception {
                LOG.info("computing pairRDD2 partition..........");
                acc2.add(1); // +1 every time a partition of pairRDD2 is computed
                List<Tuple2<String, Integer>> list = new ArrayList<Tuple2<String, Integer>>();
                while (iterator.hasNext()) {
                    list.add(new Tuple2<String, Integer>(iterator.next(), 1));
                }
                return list;
            }
        });
long count2 = pairRDD2.count(); // first action on pairRDD2
LOG.info("pairRDD2 element count: " + count2);

// The join re-evaluates pairRDD1 and pairRDD2 unless they are cached.
JavaPairRDD<String, Integer> pairRDD3 = pairRDD1.leftOuterJoin(pairRDD2)
        .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Tuple2<String, Tuple2<Integer, Optional<Integer>>>>, String, Integer>() {
            @Override
            public Iterable<Tuple2<String, Integer>> call(
                    Iterator<Tuple2<String, Tuple2<Integer, Optional<Integer>>>> tuple2Iterator) throws Exception {
                LOG.info("computing pairRDD3 partition..........");
                acc3.add(1); // +1 every time a partition of pairRDD3 is computed
                List<Tuple2<String, Integer>> list = new ArrayList<Tuple2<String, Integer>>();
                while (tuple2Iterator.hasNext()) {
                    Tuple2<String, Tuple2<Integer, Optional<Integer>>> tuple2 = tuple2Iterator.next();
                    String dateKey = tuple2._1();
                    Tuple2<Integer, Optional<Integer>> integerOptionalTuple2 = tuple2._2();
                    if (integerOptionalTuple2._2().orNull() == null) {
                        continue; // keep only keys that matched on the right side
                    }
                    list.add(new Tuple2<String, Integer>(dateKey, 1));
                }
                return list;
            }
        });
long count3 = pairRDD3.count(); // first action on pairRDD3
LOG.info("pairRDD3 element count: " + count3);

// Two deliberately identical actions on pairRDD3: each one re-runs the whole
// lineage (join included) unless pairRDD3 is cached.
pairRDD3.foreachPartition(new VoidFunction<Iterator<Tuple2<String, Integer>>>() {
    @Override
    public void call(Iterator<Tuple2<String, Integer>> tuple2Iterator) throws Exception {
        while (tuple2Iterator.hasNext()) {
            LOG.info("list:::::: " + tuple2Iterator.next());
        }
    }
});
pairRDD3.foreachPartition(new VoidFunction<Iterator<Tuple2<String, Integer>>>() {
    @Override
    public void call(Iterator<Tuple2<String, Integer>> tuple2Iterator) throws Exception {
        while (tuple2Iterator.hasNext()) {
            LOG.info("list:::::: " + tuple2Iterator.next());
        }
    }
});

LOG.info("acc1 : " + acc1.localValue());
LOG.info("acc2 : " + acc2.localValue());
LOG.info("acc3 : " + acc3.localValue());
1. When pairRDD1 and pairRDD2 are not cached, the run produces:
03-23 14:22:50 [INFO] [job.SparkLoadDataTest(182)] acc1 : 4
03-23 14:22:50 [INFO] [job.SparkLoadDataTest(183)] acc2 : 4
03-23 14:22:50 [INFO] [job.SparkLoadDataTest(184)] acc3 : 6
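These numbers are consistent with each RDD having 2 partitions in this run, so one full computation adds 2 to the corresponding accumulator. A quick, hypothetical way to confirm the partition count (not part of the original test):

// Hypothetical sanity check: log how many partitions each RDD has.
// The accumulator grows by this number on every full computation.
LOG.info("pairRDD1 partitions: " + pairRDD1.partitions().size());
LOG.info("pairRDD3 partitions: " + pairRDD3.partitions().size());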
2. When pairRDD1 and pairRDD2 are cached, the run produces:
03-23 14:28:41 [INFO] [job.SparkLoadDataTest(177)] acc1 : 2
03-23 14:28:41 [INFO] [job.SparkLoadDataTest(178)] acc2 : 2
03-23 14:28:41 [INFO] [job.SparkLoadDataTest(179)] acc3 : 6
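Run 2 corresponds to calling cache() on pairRDD1 and pairRDD2 before their first action; the exact placement is not shown in the original, but it would look roughly like this:

// cache() marks the RDD persistent (MEMORY_ONLY) and returns it.
pairRDD1.cache();
pairRDD2.cache();
long count1 = pairRDD1.count(); // computes AND caches pairRDD1: acc1 becomes 2
// ... the later leftOuterJoin then reads the cached blocks, so acc1 stays at 2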
3. When pairRDD3 is not cached:
03-23 14:28:41 [INFO] [job.SparkLoadDataTest(179)] acc3 : 6
4. When pairRDD3 is cached:
03-23 14:31:49 [INFO] [job.SparkLoadDataTest(179)] acc3 : 2
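Run 4 likewise caches pairRDD3 before its first action, sketched below against the listing above:

// Mark pairRDD3 persistent before the first action on it.
pairRDD3.cache();
// count() computes the join once and fills the cache: acc3 becomes 2.
long count3 = pairRDD3.count();
// Both subsequent foreachPartition calls now read cached partitions,
// so the partition function (and acc3) is not touched again.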
Conclusion from runs 1 and 2: when pairRDD1 and pairRDD2 are cached, each is loaded from the external source only once (acc1 = acc2 = 2); when they are not cached, each is loaded twice (acc1 = acc2 = 4), once for its own count() and once again when the leftOuterJoin is evaluated. Caching is therefore more efficient here. Note also that dropping the count() calls on pairRDD1 and pairRDD2 achieves the same effect without caching, since the join then remains the only action that touches them.
Conclusion from runs 3 and 4: if an RDD is not cached but is used in several places, every action on it re-runs its whole lineage, re-aggregating the data or reloading it from the external source each time, which is very inefficient (acc3 = 6: three actions, each computing both partitions). Only when the RDD is cached does Spark skip the reload: the first action materializes it (acc3 = 2) and later actions read it from memory.
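As a closing note on the API (general Spark facts, not part of the original test): cache() is shorthand for persist(StorageLevel.MEMORY_ONLY); with MEMORY_ONLY, partitions that do not fit in memory are dropped and recomputed on demand, so when the data may not fit, MEMORY_AND_DISK spills to disk instead, and unpersist() frees the blocks once the RDD is no longer needed:

import org.apache.spark.storage.StorageLevel;

pairRDD3.persist(StorageLevel.MEMORY_AND_DISK()); // cache, spilling to disk if needed
// ... run the actions ...
pairRDD3.unpersist(); // release the cached blocks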