本次博客分为三大部分
一,Spark的RDD用JAVA的实现
二,Spark的RDD的说明
三,Spark的Scala的实现
1.1 Java代码实现
package com.lyl.it;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
public class MapOperator {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("MapOperator").setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);
List numbers = Arrays.asList(1,2,3,4,5);
JavaRDD numberRDD = sc.parallelize(numbers);
JavaRDD result = numberRDD.map(new Function() {
private static final long serialVersionUID = 1L;
public Integer call(Integer number) throws Exception {
return number*10;
}
});
result.foreach(new VoidFunction() {
private static final long serialVersionUID = 1L;
public void call(Integer result) throws Exception {
System.out.println(result);
}
});
sc.close();
}
}
1.2,说明:map算子对RDD中的每个元素执行一次传入的函数,下面是程序运行的结果
18/07/19 09:42:14 INFO SparkContext: Created broadcast 0 from broadcast at DAGScheduler.scala:1006
18/07/19 09:42:14 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 0 (MapPartitionsRDD[1] at map at MapOperator.java:22)
18/07/19 09:42:15 INFO TaskSchedulerImpl: Adding task set 0.0 with 1 tasks
18/07/19 09:42:15 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, partition 0,PROCESS_LOCAL, 2170 bytes)
18/07/19 09:42:15 INFO Executor: Running task 0.0 in stage 0.0 (TID 0)
10
20
30
40
50
18/07/19 09:42:15 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 915 bytes result sent to driver
18/07/19 09:42:15 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 387 ms on localhost (1/1)
18/07/19 09:42:15 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
18/07/19 09:42:15 INFO DAGScheduler: ResultStage 0 (foreach at MapOperator.java:31) finished in 0.516 s
18/07/19 09:42:15 INFO DAGScheduler: Job 0 finished: foreach at MapOperator.java:31, took 1.936862 s
18/07/19 09:42:15 INFO SparkUI: Stopped Spark web UI at http://192.168.21.1:4040
1.3,Scala的实现
package com.lyl.it
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/** Scala version of the map demo: multiplies every element by 10 and prints each result. */
object MapOperator {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("MapOperator").setMaster("local")
    val sc = new SparkContext(conf)
    val numbers = Array(1, 2, 3, 4, 5)
    val scaled = sc.parallelize(numbers).map(_ * 10)
    scaled.foreach(println)
  }
}
2.1,Java代码实现
package com.lyl.it;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.VoidFunction;
public class MapPartitonOperator {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("MapPartitonOperator").setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);
List names = Arrays.asList("xu","kk","pp","ii");
JavaRDD nameRDD = sc.parallelize(names);
final Map scoreMap = new HashMap();
scoreMap.put("xu", 150);
scoreMap.put("kk", 100);
scoreMap.put("pp", 200);
scoreMap.put("ii", 190);
JavaRDD sorceRDD = nameRDD.mapPartitions(new FlatMapFunction, Integer>() {
private static final long serialVersionUID = 1L;
public Iterable call(Iterator iterator)
throws Exception {
List list = new ArrayList();
while(iterator.hasNext()){
String name = iterator.next();
Integer sorce = scoreMap.get(name);
list.add(sorce);
}
return list;
}
});
sorceRDD.foreach(new VoidFunction() {
private static final long serialVersionUID = 1L;
public void call(Integer sorce) throws Exception {
System.out.println(sorce);
}
});
sc.close();
}
}
2.2,说明:mapPartitions算子以分区(Partition)为单位对元素进行操作,下面是程序运行的结果
18/07/19 10:55:50 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 2.5 KB, free 1121.6 MB)
18/07/19 10:55:50 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 1632.0 B, free 1121.6 MB)
18/07/19 10:55:50 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on localhost:54623 (size: 1632.0 B, free: 1121.6 MB)
18/07/19 10:55:50 INFO SparkContext: Created broadcast 0 from broadcast at DAGScheduler.scala:1006
18/07/19 10:55:50 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 0 (MapPartitionsRDD[1] at mapPartitions at MapPartitonOperator.java:32)
18/07/19 10:55:50 INFO TaskSchedulerImpl: Adding task set 0.0 with 1 tasks
18/07/19 10:55:50 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, partition 0,PROCESS_LOCAL, 2100 bytes)
18/07/19 10:55:50 INFO Executor: Running task 0.0 in stage 0.0 (TID 0)
150
100
200
190
18/07/19 10:55:50 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 915 bytes result sent to driver
18/07/19 10:55:50 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 212 ms on localhost (1/1)
18/07/19 10:55:50 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
2.3,Scala的实现
package com.lyl.it
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.api.java.function.FlatMapFunction
/**
 * Scala version of the mapPartitions demo: looks up each name's score,
 * one partition at a time.
 */
object MapPartitonOperator {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("MapPartitonOperator").setMaster("local")
    val sc = new SparkContext(conf)
    val names = Array("xu", "kk", "pp", "ii")
    val scoreMap: Map[String, Int] = Map("xu" -> 150, "kk" -> 100, "pp" -> 200, "ii" -> 190)
    // Receives the iterator over one partition's elements. Mapping the iterator
    // directly preserves element order; the original prepended to a List with
    // ::=, which reversed the order within each partition (unlike the Java
    // version, whose logged output is 150, 100, 200, 190).
    def keyFunc(iter: Iterator[String]): Iterator[Int] = iter.map(scoreMap)
    sc.parallelize(names).mapPartitions(keyFunc).foreach(println)
  }
}
3.1,Java代码实现
package com.lyl.it;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;
public class MapPartitonWithIndexOperator {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("MapPartitonWithIndexOperator").setMaster("local");
//conf.set("spark.default.paralleism", "3");
JavaSparkContext sc = new JavaSparkContext(conf);
List names = Arrays.asList("xu","kk","pp","ii");
JavaRDD nameRDD = sc.parallelize(names,2);
JavaRDD nameWithPartionIndex = nameRDD.mapPartitionsWithIndex(new Function2, Iterator>() {
private static final long serialVersionUID = 1L;
public Iterator call(Integer index, Iterator iterator)
throws Exception {
List list = new ArrayList();
while (iterator.hasNext()) {
String name = iterator.next();
String result = index +":"+name;
list.add(result);
}
return list.iterator();
}
}, true);
nameWithPartionIndex.foreach(new VoidFunction() {
private static final long serialVersionUID = 4092833416807456355L;
public void call(String result) throws Exception {
System.out.println(result);
}
});
sc.close();
}
}
3.2,说明:mapPartitionsWithIndex算子可以追踪每个元素属于哪一个分区(并行任务)执行,下面是程序运行的结果
18/07/19 11:13:17 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on localhost:54888 (size: 1437.0 B, free: 1121.6 MB)
18/07/19 11:13:17 INFO SparkContext: Created broadcast 0 from broadcast at DAGScheduler.scala:1006
18/07/19 11:13:17 INFO DAGScheduler: Submitting 2 missing tasks from ResultStage 0 (MapPartitionsRDD[1] at mapPartitionsWithIndex at MapPartitonWithIndexOperator.java:24)
18/07/19 11:13:17 INFO TaskSchedulerImpl: Adding task set 0.0 with 2 tasks
18/07/19 11:13:17 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, partition 0,PROCESS_LOCAL, 2090 bytes)
18/07/19 11:13:17 INFO Executor: Running task 0.0 in stage 0.0 (TID 0)
0:xu
0:kk
18/07/19 11:13:17 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 915 bytes result sent to driver
18/07/19 11:13:17 INFO TaskSetManager: Starting task 1.0 in stage 0.0 (TID 1, localhost, partition 1,PROCESS_LOCAL, 2090 bytes)
18/07/19 11:13:17 INFO Executor: Running task 1.0 in stage 0.0 (TID 1)
1:pp
1:ii
18/07/19 11:13:17 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 209 ms on localhost (1/2)
18/07/19 11:13:17 INFO Executor: Finished task 1.0 in stage 0.0 (TID 1). 915 bytes result sent to driver
3.3,Scala的实现
package com.lyl.it
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/**
 * Scala version of the mapPartitionsWithIndex demo: tags each element with
 * the index of the partition that processed it, e.g. "0:xu".
 */
object MapPartitonWithIndexOperator {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("MapPartitonWithIndexOperator").setMaster("local")
    val sc = new SparkContext(conf)
    val names = Array("xu", "kk", "pp", "ii")
    // Mapping the iterator directly preserves element order; the original
    // prepended to a List with ::=, which reversed the order within each
    // partition (the Java version's logged output is 0:xu, 0:kk, 1:pp, 1:ii).
    def indexFunc(index: Int, iterator: Iterator[String]): Iterator[String] =
      iterator.map(name => index + ":" + name)
    sc.parallelize(names, 2)
      .mapPartitionsWithIndex(indexFunc, true)
      .foreach(println)
  }
}
4.1,Java代码实现
package com.lyl.it;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
public class FilterOperator {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("FilterOperator").setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);
List numbers = Arrays.asList(1, 2, 3, 4, 5);
JavaRDD numberRDD = sc.parallelize(numbers);
JavaRDD results = numberRDD.filter(new Function() {
private static final long serialVersionUID = 1L;
public Boolean call(Integer number) throws Exception {
return number%2==0;
}
});
results.foreach(new VoidFunction() {
private static final long serialVersionUID = 1L;
public void call(Integer result) throws Exception {
System.out.println(result);
}
});
sc.close();
}
}
4.2,说明:filter算子按条件过滤元素,只保留使谓词返回true的元素,下面是程序运行的结果
18/07/19 21:27:03 INFO SparkContext: Created broadcast 0 from broadcast at DAGScheduler.scala:1006
18/07/19 21:27:03 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 0 (MapPartitionsRDD[1] at filter at FilterOperator.java:22)
18/07/19 21:27:03 INFO TaskSchedulerImpl: Adding task set 0.0 with 1 tasks
18/07/19 21:27:03 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, partition 0,PROCESS_LOCAL, 2170 bytes)
18/07/19 21:27:03 INFO Executor: Running task 0.0 in stage 0.0 (TID 0)
2
4
18/07/19 21:27:03 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 915 bytes result sent to driver
18/07/19 21:27:03 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 364 ms on localhost (1/1)
4.3 ,Scala的实现
package com.lyl.it
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/** Scala version of the filter demo: keeps only the even numbers and prints them. */
object FilterOperator {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("FilterOperator").setMaster("local")
    val sc = new SparkContext(conf)
    // val instead of var: the array is never reassigned.
    val numbers = Array(1, 2, 3, 4, 5)
    sc.parallelize(numbers).filter(_ % 2 == 0).foreach(println)
  }
}
5.1,Java代码实现
package com.lyl.it;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
public class CoalesceOperator {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("CoalesceOperator").setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);
List staffList = Arrays.asList("xx1", "xx2", "xx3", "xx4",
"xx5", "xx6", "xx7", "xx8", "xx9", "xx10", "xx11", "xx12");
JavaRDD staffRDD = sc.parallelize(staffList, 6);
JavaRDD staffRDD2 = staffRDD.mapPartitionsWithIndex(
new Function2, Iterator>() {
private static final long serialVersionUID = 1L;
public Iterator call(Integer index,Iterator iterator) throws Exception {
List list = new ArrayList();
while (iterator.hasNext()) {
String staff = iterator.next();
list.add("部门[" + (index) + "]" + staff);
}
return list.iterator();
}
}, true);
for (String staffInfo:staffRDD2.collect()) {
System.out.println(staffInfo);
}
sc.close();
}
}
5.2,说明:coalesce算子可以减少partition数量、让每个partition的数据更加紧凑(注意:本例代码实际并未调用coalesce,只展示了各元素所在的分区),下面是程序运行的结果
18/07/19 21:29:05 INFO TaskSetManager: Finished task 5.0 in stage 0.0 (TID 5) in 25 ms on localhost (6/6)
18/07/19 21:29:05 INFO DAGScheduler: ResultStage 0 (collect at CoalesceOperator.java:40) finished in 0.330 s
18/07/19 21:29:05 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
18/07/19 21:29:05 INFO DAGScheduler: Job 0 finished: collect at CoalesceOperator.java:40, took 0.816818 s
部门[0]xx1
部门[0]xx2
部门[1]xx3
部门[1]xx4
部门[2]xx5
部门[2]xx6
部门[3]xx7
部门[3]xx8
部门[4]xx9
部门[4]xx10
部门[5]xx11
部门[5]xx12
18/07/19 21:29:05 INFO BlockManagerInfo: Removed broadcast_0_piece0 on localhost:51230 in memory (size: 1407.0 B, free: 1121.6 MB)
18/07/19 21:29:05 INFO SparkUI: Stopped Spark web UI at http://192.168.21.1:4040
5.3,Scala的实现
package com.lyl.it
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/**
 * Scala version of the coalesce demo.
 * NOTE(review): as in the Java version, coalesce() is never actually called;
 * the code only tags each element with its partition index across 6 partitions.
 */
object CoalesceOperator {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CoalesceOperator").setMaster("local")
    val sc = new SparkContext(conf)
    val staffList = Array("xx1", "xx2", "xx3", "xx4", "xx5", "xx6",
      "xx7", "xx8", "xx9", "xx10", "xx11", "xx12")
    // Mapping the iterator directly preserves element order; the original
    // prepended to a List with ::=, which reversed the order within each
    // partition (the Java version's logged output keeps the input order).
    def indexFunc(index: Int, iterator: Iterator[String]): Iterator[String] =
      iterator.map(staff => "部门[" + index + "]" + staff)
    val staffRDD2 = sc.parallelize(staffList, 6).mapPartitionsWithIndex(indexFunc, true)
    staffRDD2.collect().foreach(println)
  }
}
6.1,Java代码实现
package com.lyl.it;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
public class RepartitonOperator {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("RepartitonOperator").setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);
List staffList = Arrays.asList("xx1", "xx2", "xx3", "xx4",
"xx5", "xx6", "xx7", "xx8", "xx9", "xx10", "xx11", "xx12");
JavaRDD staffRDD = sc.parallelize(staffList,3);
JavaRDD staffRDD2 = staffRDD.mapPartitionsWithIndex(new Function2, Iterator>() {
private static final long serialVersionUID = 1L;
public Iterator call(Integer index, Iterator iterator)
throws Exception {
List list = new ArrayList();
while (iterator.hasNext()) {
String staff = iterator.next();
list.add("部门["+(index)+"]"+staff);
}
return list.iterator();
}
}, true);
for (String staffInfo:staffRDD2.collect()) {
System.out.println(staffInfo);
}
JavaRDD staffRDD3 = staffRDD2.repartition(6);
JavaRDD staffRDD4 = staffRDD3.mapPartitionsWithIndex(new Function2, Iterator>() {
private static final long serialVersionUID = 1L;
public Iterator call(Integer index, Iterator iterator)
throws Exception {
List list = new ArrayList();
while (iterator.hasNext()) {
String staff = iterator.next();
list.add("部门["+(index)+"]"+staff);
}
return list.iterator();
}
}, true);
for(String staffInfo:staffRDD4.collect()){
System.out.println(staffInfo);
}
sc.close();
}
}
6.2,说明:repartition算子主要是把partition的数量变多,实际上也增加了并行度,下面是程序运行的结果
18/07/23 09:33:26 INFO Executor: Finished task 2.0 in stage 0.0 (TID 2). 978 bytes result sent to driver
18/07/23 09:33:26 INFO TaskSetManager: Finished task 2.0 in stage 0.0 (TID 2) in 48 ms on localhost (3/3)
18/07/23 09:33:26 INFO DAGScheduler: ResultStage 0 (collect at RepartitonOperator.java:38) finished in 0.462 s
18/07/23 09:33:26 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
18/07/23 09:33:26 INFO DAGScheduler: Job 0 finished: collect at RepartitonOperator.java:38, took 1.594547 s
部门[0]xx1
部门[0]xx2
部门[0]xx3
部门[0]xx4
部门[1]xx5
部门[1]xx6
部门[1]xx7
部门[1]xx8
部门[2]xx9
部门[2]xx10
部门[2]xx11
部门[2]xx12
18/07/23 09:33:27 INFO BlockManagerInfo: Removed broadcast_0_piece0 on localhost:51415 in memory (size: 1408.0 B, free: 1121.6 MB)
18/07/23 09:33:27 INFO ContextCleaner: Cleaned accumulator 1
18/07/23 09:33:27 INFO SparkContext: Starting job: collect at RepartitonOperator.java:60
18/07/23 09:33:27 INFO DAGScheduler: Registering RDD 2 (repartition at RepartitonOperator.java:43)
18/07/23 09:33:27 INFO DAGScheduler: Got job 1 (collect at RepartitonOperator.java:60) with 6 output partitions
18/07/23 09:33:27 INFO DAGScheduler: Final stage: ResultStage 2 (collect at RepartitonOperator.java:60)
18/07/23 09:33:27 INFO DAGScheduler: Parents of final stage: List(ShuffleMapStage 1)
18/07/23 09:33:27 INFO DAGScheduler: Missing parents: List(ShuffleMapStage 1)
18/07/23 09:33:27 INFO DAGScheduler: Submitting ShuffleMapStage 1 (MapPartitionsRDD[2] at repartition at RepartitonOperator.java:43), which has no missing parents
18/07/23 09:33:27 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 3.1 KB, free 1121.6 MB)
18/07/23 09:33:27 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 1908.0 B, free 1121.6 MB)
18/07/23 09:33:27 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on localhost:51415 (size: 1908.0 B, free: 1121.6 MB)
18/07/23 09:33:27 INFO SparkContext: Created broadcast 1 from broadcast at DAGScheduler.scala:1006
18/07/23 09:33:27 INFO DAGScheduler: Submitting 3 missing tasks from ShuffleMapStage 1 (MapPartitionsRDD[2] at repartition at RepartitonOperator.java:43)
18/07/23 09:33:27 INFO TaskSchedulerImpl: Adding task set 1.0 with 3 tasks
18/07/23 09:33:27 INFO TaskSetManager: Starting task 0.0 in stage 1.0 (TID 3, localhost, partition 0,PROCESS_LOCAL, 2093 bytes)
18/07/23 09:33:27 INFO Executor: Running task 0.0 in stage 1.0 (TID 3)
18/07/23 09:33:27 INFO Executor: Finished task 0.0 in stage 1.0 (TID 3). 1163 bytes result sent to driver
18/07/23 09:33:27 INFO TaskSetManager: Starting task 1.0 in stage 1.0 (TID 4, localhost, partition 1,PROCESS_LOCAL, 2093 bytes)
18/07/23 09:33:27 INFO Executor: Running task 1.0 in stage 1.0 (TID 4)
18/07/23 09:33:27 INFO TaskSetManager: Finished task 0.0 in stage 1.0 (TID 3) in 255 ms on localhost (1/3)
18/07/23 09:33:27 INFO Executor: Finished task 1.0 in stage 1.0 (TID 4). 1163 bytes result sent to driver
18/07/23 09:33:27 INFO TaskSetManager: Starting task 2.0 in stage 1.0 (TID 5, localhost, partition 2,PROCESS_LOCAL, 2096 bytes)
18/07/23 09:33:27 INFO Executor: Running task 2.0 in stage 1.0 (TID 5)
18/07/23 09:33:27 INFO TaskSetManager: Finished task 1.0 in stage 1.0 (TID 4) in 83 ms on localhost (2/3)
18/07/23 09:33:27 INFO Executor: Finished task 2.0 in stage 1.0 (TID 5). 1163 bytes result sent to driver
18/07/23 09:33:27 INFO TaskSetManager: Finished task 2.0 in stage 1.0 (TID 5) in 81 ms on localhost (3/3)
18/07/23 09:33:27 INFO DAGScheduler: ShuffleMapStage 1 (repartition at RepartitonOperator.java:43) finished in 0.411 s
18/07/23 09:33:27 INFO TaskSchedulerImpl: Removed TaskSet 1.0, whose tasks have all completed, from pool
18/07/23 09:33:27 INFO DAGScheduler: looking for newly runnable stages
18/07/23 09:33:27 INFO DAGScheduler: running: Set()
18/07/23 09:33:27 INFO DAGScheduler: waiting: Set(ResultStage 2)
18/07/23 09:33:27 INFO DAGScheduler: failed: Set()
18/07/23 09:33:27 INFO DAGScheduler: Submitting ResultStage 2 (MapPartitionsRDD[6] at mapPartitionsWithIndex at RepartitonOperator.java:45), which has no missing parents
18/07/23 09:33:27 INFO MemoryStore: Block broadcast_2 stored as values in memory (estimated size 3.8 KB, free 1121.6 MB)
18/07/23 09:33:27 INFO MemoryStore: Block broadcast_2_piece0 stored as bytes in memory (estimated size 2.2 KB, free 1121.6 MB)
18/07/23 09:33:27 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on localhost:51415 (size: 2.2 KB, free: 1121.6 MB)
18/07/23 09:33:27 INFO SparkContext: Created broadcast 2 from broadcast at DAGScheduler.scala:1006
18/07/23 09:33:27 INFO DAGScheduler: Submitting 6 missing tasks from ResultStage 2 (MapPartitionsRDD[6] at mapPartitionsWithIndex at RepartitonOperator.java:45)
18/07/23 09:33:27 INFO TaskSchedulerImpl: Adding task set 2.0 with 6 tasks
18/07/23 09:33:27 INFO TaskSetManager: Starting task 0.0 in stage 2.0 (TID 6, localhost, partition 0,NODE_LOCAL, 2163 bytes)
18/07/23 09:33:27 INFO Executor: Running task 0.0 in stage 2.0 (TID 6)
18/07/23 09:33:27 INFO ShuffleBlockFetcherIterator: Getting 3 non-empty blocks out of 3 blocks
18/07/23 09:33:27 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 16 ms
18/07/23 09:33:28 INFO Executor: Finished task 0.0 in stage 2.0 (TID 6). 1214 bytes result sent to driver
18/07/23 09:33:28 INFO TaskSetManager: Starting task 1.0 in stage 2.0 (TID 7, localhost, partition 1,NODE_LOCAL, 2163 bytes)
18/07/23 09:33:28 INFO Executor: Running task 1.0 in stage 2.0 (TID 7)
18/07/23 09:33:28 INFO TaskSetManager: Finished task 0.0 in stage 2.0 (TID 6) in 209 ms on localhost (1/6)
18/07/23 09:33:28 INFO ShuffleBlockFetcherIterator: Getting 3 non-empty blocks out of 3 blocks
18/07/23 09:33:28 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 1 ms
18/07/23 09:33:28 INFO Executor: Finished task 1.0 in stage 2.0 (TID 7). 1238 bytes result sent to driver
18/07/23 09:33:28 INFO TaskSetManager: Starting task 2.0 in stage 2.0 (TID 8, localhost, partition 2,NODE_LOCAL, 2163 bytes)
18/07/23 09:33:28 INFO Executor: Running task 2.0 in stage 2.0 (TID 8)
18/07/23 09:33:28 INFO TaskSetManager: Finished task 1.0 in stage 2.0 (TID 7) in 27 ms on localhost (2/6)
18/07/23 09:33:28 INFO ShuffleBlockFetcherIterator: Getting 3 non-empty blocks out of 3 blocks
18/07/23 09:33:28 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 1 ms
18/07/23 09:33:28 INFO Executor: Finished task 2.0 in stage 2.0 (TID 8). 1214 bytes result sent to driver
18/07/23 09:33:28 INFO TaskSetManager: Starting task 3.0 in stage 2.0 (TID 9, localhost, partition 3,NODE_LOCAL, 2163 bytes)
18/07/23 09:33:28 INFO Executor: Running task 3.0 in stage 2.0 (TID 9)
18/07/23 09:33:28 INFO TaskSetManager: Finished task 2.0 in stage 2.0 (TID 8) in 27 ms on localhost (3/6)
18/07/23 09:33:28 INFO ShuffleBlockFetcherIterator: Getting 3 non-empty blocks out of 3 blocks
18/07/23 09:33:28 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 1 ms
18/07/23 09:33:28 INFO Executor: Finished task 3.0 in stage 2.0 (TID 9). 1189 bytes result sent to driver
18/07/23 09:33:28 INFO TaskSetManager: Starting task 4.0 in stage 2.0 (TID 10, localhost, partition 4,NODE_LOCAL, 2163 bytes)
18/07/23 09:33:28 INFO Executor: Running task 4.0 in stage 2.0 (TID 10)
18/07/23 09:33:28 INFO TaskSetManager: Finished task 3.0 in stage 2.0 (TID 9) in 24 ms on localhost (4/6)
18/07/23 09:33:28 INFO ShuffleBlockFetcherIterator: Getting 3 non-empty blocks out of 3 blocks
18/07/23 09:33:28 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 1 ms
18/07/23 09:33:28 INFO Executor: Finished task 4.0 in stage 2.0 (TID 10). 1213 bytes result sent to driver
18/07/23 09:33:28 INFO TaskSetManager: Starting task 5.0 in stage 2.0 (TID 11, localhost, partition 5,NODE_LOCAL, 2163 bytes)
18/07/23 09:33:28 INFO Executor: Running task 5.0 in stage 2.0 (TID 11)
18/07/23 09:33:28 INFO TaskSetManager: Finished task 4.0 in stage 2.0 (TID 10) in 28 ms on localhost (5/6)
18/07/23 09:33:28 INFO ShuffleBlockFetcherIterator: Getting 3 non-empty blocks out of 3 blocks
18/07/23 09:33:28 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 1 ms
18/07/23 09:33:28 INFO Executor: Finished task 5.0 in stage 2.0 (TID 11). 1213 bytes result sent to driver
18/07/23 09:33:28 INFO TaskSetManager: Finished task 5.0 in stage 2.0 (TID 11) in 26 ms on localhost (6/6)
18/07/23 09:33:28 INFO DAGScheduler: ResultStage 2 (collect at RepartitonOperator.java:60) finished in 0.328 s
18/07/23 09:33:28 INFO TaskSchedulerImpl: Removed TaskSet 2.0, whose tasks have all completed, from pool
18/07/23 09:33:28 INFO DAGScheduler: Job 1 finished: collect at RepartitonOperator.java:60, took 0.885546 s
部门[0]部门[1]xx7
部门[0]部门[2]xx10
部门[1]部门[0]xx1
部门[1]部门[1]xx8
部门[1]部门[2]xx11
部门[2]部门[0]xx2
部门[2]部门[2]xx12
部门[3]部门[0]xx3
部门[4]部门[0]xx4
部门[4]部门[1]xx5
部门[5]部门[1]xx6
部门[5]部门[2]xx9
18/07/23 09:33:28 INFO SparkUI: Stopped Spark web UI at http://192.168.21.1:4040
18/07/23 09:33:28 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
6.3,Scala代码实现
package com.lyl.it
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/**
 * Scala version of the repartition demo: tags each element with its partition
 * index, repartitions from 3 to 6 partitions (a shuffle), and tags again so
 * the redistribution is visible in the output.
 */
object RepartitonOperator {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("RepartitonOperator").setMaster("local")
    val sc = new SparkContext(conf)
    val staffList = Array("xx1", "xx2", "xx3", "xx4", "xx5", "xx6",
      "xx7", "xx8", "xx9", "xx10", "xx11", "xx12")
    // Mapping the iterator directly preserves element order; the original
    // prepended to a List with ::=, which reversed the order within each
    // partition (the Java version's logged output keeps the input order).
    def indexFunc(index: Int, iterator: Iterator[String]): Iterator[String] =
      iterator.map(staff => "部门[" + index + "]" + staff)
    val staffRDD4 = sc.parallelize(staffList, 3)
      .mapPartitionsWithIndex(indexFunc, true)
      .repartition(6)
      .mapPartitionsWithIndex(indexFunc, true)
    staffRDD4.collect().foreach(println)
  }
}
7.1,Java代码实现
package com.lyl.it;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.VoidFunction;
public class FlatMapOperator {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("FlatMapOperator").setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);
List lineList = Arrays.asList("hello","hello","hello2","uuu");
JavaRDD lines = sc.parallelize(lineList);
JavaRDD words = lines.flatMap(new FlatMapFunction() {
private static final long serialVersionUID = 1L;
public Iterable call(String line) throws Exception {
return Arrays.asList(line.split(" "));
}
});
words.foreach(new VoidFunction() {
private static final long serialVersionUID = 1L;
public void call(String result) throws Exception {
System.out.println(result);
}
});
sc.close();
}
}
7.2,说明:flatMap算子对每个元素执行函数并把返回的集合展平成多个元素(本例按空格把每行切分成单词),下面是程序运行的结果
18/08/02 15:16:34 INFO TaskSchedulerImpl: Adding task set 0.0 with 1 tasks
18/08/02 15:16:34 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, partition 0,PROCESS_LOCAL, 2108 bytes)
18/08/02 15:16:34 INFO Executor: Running task 0.0 in stage 0.0 (TID 0)
hello
hello
hello2
uuu
18/08/02 15:16:34 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 915 bytes result sent to driver
7.3,Scala的实现
package com.lyl.it
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/** Scala version of the flatMap demo: splits each line on spaces and prints every word. */
object FlatMapOperator {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("FlatMapOperator").setMaster("local")
    val sc = new SparkContext(conf)
    val lineList = Array("hello", "hello", "hello2", "uuu")
    val words = sc.parallelize(lineList).flatMap(_.split(" "))
    words.foreach(println)
  }
}
8.1,Java代码实现
package com.lyl.it;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
public class AggregateByKeyOperator {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("AggregateByKeyOperator").setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD lines = sc.textFile("rc//ch.txt");
JavaRDD words = lines.flatMap(new FlatMapFunction() {
private static final long serialVersionUID = 1L;
public Iterable call(String line) throws Exception {
return Arrays.asList(line.split(" "));
}
});
JavaPairRDD pairs = words.mapToPair(new PairFunction() {
private static final long serialVersionUID = 1L;
public Tuple2 call(String word)
throws Exception {
return new Tuple2(word, 1);
}
});
JavaPairRDD wordCounts = pairs.aggregateByKey(0,new Function2() {
private static final long serialVersionUID = 1L;
public Integer call(Integer v1, Integer v2)
throws Exception {
return v1 + v2;
}
}, new Function2() {
private static final long serialVersionUID = 1L;
public Integer call(Integer v1, Integer v2)
throws Exception {
return v1 + v2;
}
});
List> list = wordCounts.collect();
for (Tuple2 wc : list) {
System.out.println(wc);
}
sc.close();
}
}
8.2,说明:aggregateByKey算子按key聚合,先在分区内用seqOp合并,再跨分区用combOp合并(本例两者都是加法,实现单词计数)
ch.txt的数据
ww cc rr cc ww
ddf dfd
下面是程序运行的结果
18/08/02 15:24:34 INFO DAGScheduler: ResultStage 1 (collect at AggregateByKeyOperator.java:61) finished in 0.033 s
18/08/02 15:24:34 INFO DAGScheduler: Job 0 finished: collect at AggregateByKeyOperator.java:61, took 0.227513 s
(dfd,1)
(ddf,1)
(ww,2)
(rr,1)
(cc,2)
18/08/02 15:24:34 INFO SparkUI: Stopped Spark web UI at http://192.168.158.1:4040
8.3,Scala的实现
package com.lyl.it
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/**
 * Scala version of the aggregateByKey word count: zero value 0, with both the
 * within-partition (seqOp) and cross-partition (combOp) functions being addition.
 */
object AggregateByKeyOperator {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("AggregateByKeyOperator").setMaster("local")
    val sc = new SparkContext(conf)
    // The same addition function serves as both seqOp and combOp.
    val add = (v1: Int, v2: Int) => v1 + v2
    val wordCounts = sc.textFile("rc//ch.txt")
      .flatMap(_.split(" "))
      .map(word => (word, 1))
      .aggregateByKey(0)(add, add)
    wordCounts.collect().foreach(println)
  }
}