Spark RDD Operations and Descriptions, Part 1

This post is divided into three parts for each operator:

1. The Java implementation of the Spark RDD operation

2. A description of the RDD operation

3. The Scala implementation

1. The map operator

1.1 Java implementation

package com.lyl.it;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;

public class MapOperator {
	
	public static void main(String[] args) {
		SparkConf conf = new SparkConf().setAppName("MapOperator").setMaster("local");
		JavaSparkContext sc = new JavaSparkContext(conf);
		
		List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5);

		JavaRDD<Integer> numberRDD = sc.parallelize(numbers);

		// map: apply the function to every element, producing a new RDD
		JavaRDD<Integer> result = numberRDD.map(new Function<Integer, Integer>() {

			private static final long serialVersionUID = 1L;

			public Integer call(Integer number) throws Exception {
				return number * 10;
			}
		});

		result.foreach(new VoidFunction<Integer>() {

			private static final long serialVersionUID = 1L;

			public void call(Integer result) throws Exception {
				System.out.println(result);
			}
		});
		
		sc.close();
	}
}

1.2 Description: the map operator applies a function to every element of the RDD and returns a new RDD of the results. The program output is shown below.

18/07/19 09:42:14 INFO SparkContext: Created broadcast 0 from broadcast at DAGScheduler.scala:1006
18/07/19 09:42:14 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 0 (MapPartitionsRDD[1] at map at MapOperator.java:22)
18/07/19 09:42:15 INFO TaskSchedulerImpl: Adding task set 0.0 with 1 tasks
18/07/19 09:42:15 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, partition 0,PROCESS_LOCAL, 2170 bytes)
18/07/19 09:42:15 INFO Executor: Running task 0.0 in stage 0.0 (TID 0)
10
20
30
40
50
18/07/19 09:42:15 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 915 bytes result sent to driver
18/07/19 09:42:15 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 387 ms on localhost (1/1)
18/07/19 09:42:15 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool 
18/07/19 09:42:15 INFO DAGScheduler: ResultStage 0 (foreach at MapOperator.java:31) finished in 0.516 s
18/07/19 09:42:15 INFO DAGScheduler: Job 0 finished: foreach at MapOperator.java:31, took 1.936862 s
18/07/19 09:42:15 INFO SparkUI: Stopped Spark web UI at http://192.168.21.1:4040

1.3 Scala implementation

package com.lyl.it

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object MapOperator {
  
  def main(args: Array[String]): Unit = {
       val conf = new SparkConf().setAppName("MapOperator").setMaster("local")
       val sc = new SparkContext(conf)
       val numbers = Array(1,2,3,4,5)
       
       sc.parallelize(numbers).map ( num => num*10).foreach {result => println(result)}
       
  }
   
}
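One general note (not specific to this example): foreach is an action that runs on the executors, so in local mode the println output shows up in the console, but on a cluster it would land in the executor logs. If you want the results back on the driver, a small sketch of the usual variation is to collect first:

// Collect the mapped values to the driver and print them there (fine for small data sets).
sc.parallelize(numbers).map(num => num * 10).collect().foreach(println)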

2. The mapPartitions operator

2.1 Java implementation

package com.lyl.it;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.VoidFunction;

public class MapPartitonOperator {
	
	public static void main(String[] args) {
		SparkConf conf = new SparkConf().setAppName("MapPartitonOperator").setMaster("local");
		JavaSparkContext sc = new JavaSparkContext(conf);
		
		List<String> names = Arrays.asList("xu", "kk", "pp", "ii");

		JavaRDD<String> nameRDD = sc.parallelize(names);

		final Map<String, Integer> scoreMap = new HashMap<String, Integer>();
		scoreMap.put("xu", 150);
		scoreMap.put("kk", 100);
		scoreMap.put("pp", 200);
		scoreMap.put("ii", 190);

		// mapPartitions: the function receives the iterator over a whole partition
		JavaRDD<Integer> scoreRDD = nameRDD.mapPartitions(new FlatMapFunction<Iterator<String>, Integer>() {

			private static final long serialVersionUID = 1L;

			public Iterable<Integer> call(Iterator<String> iterator)
					throws Exception {
				List<Integer> list = new ArrayList<Integer>();
				while (iterator.hasNext()) {
					String name = iterator.next();
					Integer score = scoreMap.get(name);
					list.add(score);
				}
				return list;
			}
		});

		scoreRDD.foreach(new VoidFunction<Integer>() {

			private static final long serialVersionUID = 1L;

			public void call(Integer score) throws Exception {
				System.out.println(score);
			}
		});
		sc.close();
	}
}

2.2 Description: the mapPartitions operator processes an entire partition at a time; the function receives an iterator over all of the partition's elements rather than one element at a time. The program output is shown below.

18/07/19 10:55:50 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 2.5 KB, free 1121.6 MB)
18/07/19 10:55:50 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 1632.0 B, free 1121.6 MB)
18/07/19 10:55:50 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on localhost:54623 (size: 1632.0 B, free: 1121.6 MB)
18/07/19 10:55:50 INFO SparkContext: Created broadcast 0 from broadcast at DAGScheduler.scala:1006
18/07/19 10:55:50 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 0 (MapPartitionsRDD[1] at mapPartitions at MapPartitonOperator.java:32)
18/07/19 10:55:50 INFO TaskSchedulerImpl: Adding task set 0.0 with 1 tasks
18/07/19 10:55:50 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, partition 0,PROCESS_LOCAL, 2100 bytes)
18/07/19 10:55:50 INFO Executor: Running task 0.0 in stage 0.0 (TID 0)
150
100
200
190
18/07/19 10:55:50 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 915 bytes result sent to driver
18/07/19 10:55:50 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 212 ms on localhost (1/1)
18/07/19 10:55:50 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool 

2.3 Scala implementation

package com.lyl.it

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object MapPartitonOperator {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("MapPartitonOperator").setMaster("local")
    val sc = new SparkContext(conf)

    val names = Array("xu", "kk", "pp", "ii")

    val scoreMap: Map[String, Int] = Map("xu" -> 150, "kk" -> 100, "pp" -> 200, "ii" -> 190)

    // Turn one partition's iterator of names into an iterator of scores
    def keyFunc(iter: Iterator[String]): Iterator[Int] = {
      var list = List[Int]()

      while (iter.hasNext) {
        val name = iter.next()
        val score = scoreMap(name)
        list ::= score
      }
      list.iterator
    }

    sc.parallelize(names).mapPartitions(keyFunc).foreach(score => println(score))
  }
}
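mapPartitions is usually preferred over map when there is expensive per-element setup that can be shared across a partition, because the setup then runs once per partition instead of once per element. A minimal sketch, where createConnection and lookupScore are hypothetical placeholders for any partition-scoped resource:

// Sketch only: createConnection() and lookupScore() are made-up helpers standing in
// for something expensive such as a database or HTTP client.
val scores = sc.parallelize(names).mapPartitions { iter =>
  val conn = createConnection()                               // opened once per partition
  val results = iter.map(name => lookupScore(conn, name)).toList
  conn.close()                                                // closed once per partition
  results.iterator
}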

3. The mapPartitionsWithIndex operator

3.1 Java implementation

package com.lyl.it;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;

public class MapPartitonWithIndexOperator {
	
	public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("MapPartitonWithIndexOperator").setMaster("local");
		//conf.set("spark.default.parallelism", "3");
        JavaSparkContext sc = new JavaSparkContext(conf);
		List<String> names = Arrays.asList("xu", "kk", "pp", "ii");

		JavaRDD<String> nameRDD = sc.parallelize(names, 2);

		// mapPartitionsWithIndex: the function also receives the index of the partition it is processing
		JavaRDD<String> nameWithPartionIndex = nameRDD.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {

			private static final long serialVersionUID = 1L;

			public Iterator<String> call(Integer index, Iterator<String> iterator)
					throws Exception {
				List<String> list = new ArrayList<String>();
				while (iterator.hasNext()) {
					String name = iterator.next();
					String result = index + ":" + name;
					list.add(result);
				}
				return list.iterator();
			}
		}, true);

		nameWithPartionIndex.foreach(new VoidFunction<String>() {

			private static final long serialVersionUID = 4092833416807456355L;

			public void call(String result) throws Exception {
				System.out.println(result);
			}
		});
		
		sc.close();
	}

}

3.2 Description: mapPartitionsWithIndex works like mapPartitions but also passes in the partition index, so you can track which partition (and therefore which parallel task) each element is processed in. The program output is shown below.

18/07/19 11:13:17 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on localhost:54888 (size: 1437.0 B, free: 1121.6 MB)
18/07/19 11:13:17 INFO SparkContext: Created broadcast 0 from broadcast at DAGScheduler.scala:1006
18/07/19 11:13:17 INFO DAGScheduler: Submitting 2 missing tasks from ResultStage 0 (MapPartitionsRDD[1] at mapPartitionsWithIndex at MapPartitonWithIndexOperator.java:24)
18/07/19 11:13:17 INFO TaskSchedulerImpl: Adding task set 0.0 with 2 tasks
18/07/19 11:13:17 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, partition 0,PROCESS_LOCAL, 2090 bytes)
18/07/19 11:13:17 INFO Executor: Running task 0.0 in stage 0.0 (TID 0)
0:xu
0:kk
18/07/19 11:13:17 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 915 bytes result sent to driver
18/07/19 11:13:17 INFO TaskSetManager: Starting task 1.0 in stage 0.0 (TID 1, localhost, partition 1,PROCESS_LOCAL, 2090 bytes)
18/07/19 11:13:17 INFO Executor: Running task 1.0 in stage 0.0 (TID 1)
1:pp
1:ii
18/07/19 11:13:17 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 209 ms on localhost (1/2)
18/07/19 11:13:17 INFO Executor: Finished task 1.0 in stage 0.0 (TID 1). 915 bytes result sent to driver

3.3 Scala implementation

package com.lyl.it

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext


object MapPartitonWithIndexOperator {
      
     def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setAppName("MapPartitonWithIndexOperator").setMaster("local")
        val sc = new SparkContext(conf)
        
        val names = Array("xu","kk","pp","ii")
        
        // Prefix each name with the index of the partition it is processed in
        def indexFunc(index: Int, iterator: Iterator[String]): Iterator[String] = {
          var list = List[String]()

          while (iterator.hasNext) {
            val name = iterator.next()
            val result = index + ":" + name
            list ::= result
          }

          list.iterator
        }
        
        sc.parallelize(names, 2)
          .mapPartitionsWithIndex(indexFunc, true)
          .foreach { result => println(result) }
        
     }
}
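A common everyday use of mapPartitionsWithIndex is simply to check how records are spread across partitions, for example (a small sketch, not part of the original code):

// Count the records in each partition; handy for spotting skew.
sc.parallelize(names, 2)
  .mapPartitionsWithIndex((index, iter) => Iterator((index, iter.size)))
  .collect()
  .foreach { case (index, count) => println(s"partition $index has $count records") }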

4. The filter operator

4.1 Java implementation

package com.lyl.it;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;

public class FilterOperator {

	public static void main(String[] args) {
		SparkConf conf = new SparkConf().setAppName("FilterOperator").setMaster("local");
		JavaSparkContext sc = new JavaSparkContext(conf);

		List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5);

		JavaRDD<Integer> numberRDD = sc.parallelize(numbers);

		// filter: keep only the elements for which the predicate returns true
		JavaRDD<Integer> results = numberRDD.filter(new Function<Integer, Boolean>() {

			private static final long serialVersionUID = 1L;

			public Boolean call(Integer number) throws Exception {
				return number % 2 == 0;
			}
		});

		results.foreach(new VoidFunction<Integer>() {

			private static final long serialVersionUID = 1L;

			public void call(Integer result) throws Exception {
				System.out.println(result);
			}
		});
		
		sc.close();
	}
}

4.2 Description: the filter operator keeps only the elements for which the predicate returns true (here, the even numbers). The program output is shown below.

18/07/19 21:27:03 INFO SparkContext: Created broadcast 0 from broadcast at DAGScheduler.scala:1006
18/07/19 21:27:03 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 0 (MapPartitionsRDD[1] at filter at FilterOperator.java:22)
18/07/19 21:27:03 INFO TaskSchedulerImpl: Adding task set 0.0 with 1 tasks
18/07/19 21:27:03 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, partition 0,PROCESS_LOCAL, 2170 bytes)
18/07/19 21:27:03 INFO Executor: Running task 0.0 in stage 0.0 (TID 0)
2
4
18/07/19 21:27:03 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 915 bytes result sent to driver
18/07/19 21:27:03 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 364 ms on localhost (1/1)

4.3 Scala implementation

package com.lyl.it

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object FilterOperator {
    
    def main(args: Array[String]): Unit = {
       val conf = new SparkConf().setAppName("FilterOperator").setMaster("local")
       val sc = new SparkContext(conf)
       
       val numbers = Array(1, 2, 3, 4, 5)

       sc.parallelize(numbers).filter(number => number % 2 == 0).foreach(result => println(result))
    }
}
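After an aggressive filter, many partitions can be left almost empty, which is the usual motivation for the coalesce operator covered next. A small sketch of the pattern:

// Filtering keeps the original 10 partitions; coalesce packs the survivors into 2
// without a shuffle (it simply merges existing partitions).
val evens = sc.parallelize(1 to 100, 10).filter(_ % 2 == 0).coalesce(2)
println(evens.partitions.length)  // 2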

5. The coalesce operator

5.1 Java implementation

package com.lyl.it;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

public class CoalesceOperator {

	public static void main(String[] args) {
		SparkConf conf = new SparkConf().setAppName("CoalesceOperator").setMaster("local");

		JavaSparkContext sc = new JavaSparkContext(conf);

		List<String> staffList = Arrays.asList("xx1", "xx2", "xx3", "xx4",
				"xx5", "xx6", "xx7", "xx8", "xx9", "xx10", "xx11", "xx12");

		JavaRDD<String> staffRDD = sc.parallelize(staffList, 6);

		// Label every element with the index of the partition it currently lives in
		JavaRDD<String> staffRDD2 = staffRDD.mapPartitionsWithIndex(
				new Function2<Integer, Iterator<String>, Iterator<String>>() {

					private static final long serialVersionUID = 1L;

					public Iterator<String> call(Integer index, Iterator<String> iterator) throws Exception {
						List<String> list = new ArrayList<String>();
						while (iterator.hasNext()) {
							String staff = iterator.next();
							list.add("部门[" + (index) + "]" + staff);
						}
						return list.iterator();
					}
				}, true);

		for (String staffInfo : staffRDD2.collect()) {
			System.out.println(staffInfo);
		}
		
		sc.close();
	}
}

5.2 Description: coalesce reduces the number of partitions so that the remaining data sits more compactly, typically without a shuffle. The code above only labels each element with the partition it lives in so the layout is visible (a sketch that actually calls coalesce follows the Scala implementation); the program output is shown below.

18/07/19 21:29:05 INFO TaskSetManager: Finished task 5.0 in stage 0.0 (TID 5) in 25 ms on localhost (6/6)
18/07/19 21:29:05 INFO DAGScheduler: ResultStage 0 (collect at CoalesceOperator.java:40) finished in 0.330 s
18/07/19 21:29:05 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool 
18/07/19 21:29:05 INFO DAGScheduler: Job 0 finished: collect at CoalesceOperator.java:40, took 0.816818 s
部门[0]xx1
部门[0]xx2
部门[1]xx3
部门[1]xx4
部门[2]xx5
部门[2]xx6
部门[3]xx7
部门[3]xx8
部门[4]xx9
部门[4]xx10
部门[5]xx11
部门[5]xx12
18/07/19 21:29:05 INFO BlockManagerInfo: Removed broadcast_0_piece0 on localhost:51230 in memory (size: 1407.0 B, free: 1121.6 MB)
18/07/19 21:29:05 INFO SparkUI: Stopped Spark web UI at http://192.168.21.1:4040

5.3 Scala implementation

package com.lyl.it

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object CoalesceOperator {
      
    def main(args: Array[String]): Unit = {
      val conf = new SparkConf().setAppName("CoalesceOperator").setMaster("local")
      val sc = new SparkContext(conf)
      
      val staffList = Array("xx1", "xx2", "xx3", "xx4","xx5", "xx6", 
                            "xx7", "xx8", "xx9", "xx10", "xx11", "xx12")
      
      // Label each element with the index of the partition it lives in
      def indexFunc(index: Int, iterator: Iterator[String]): Iterator[String] = {
        var list = List[String]()

        while (iterator.hasNext) {
          val staff = iterator.next()
          val result = "部门[" + index + "]" + staff
          list ::= result
        }
        list.iterator
      }

      val staffRDD2 = sc.parallelize(staffList, 6).mapPartitionsWithIndex(indexFunc, true)

      val resultList = staffRDD2.collect()

      for (result <- resultList) {
        println(result)
      }
    }
}
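Neither version above actually invokes coalesce; it only makes the partition layout visible. A minimal sketch of adding the call, reusing staffRDD2 and indexFunc from the Scala code above:

// Merge the 6 labelled partitions into 3; with shuffle = false this is a narrow
// dependency, so existing partitions are combined rather than redistributed.
val compacted = staffRDD2.coalesce(3, shuffle = false)
compacted.mapPartitionsWithIndex(indexFunc, true).collect().foreach(println)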

6. The repartition operator

6.1 Java implementation

package com.lyl.it;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

public class RepartitonOperator {
	
	public static void main(String[] args) {
		SparkConf conf = new SparkConf().setAppName("RepartitonOperator").setMaster("local");
		JavaSparkContext sc = new JavaSparkContext(conf);
		
		List<String> staffList = Arrays.asList("xx1", "xx2", "xx3", "xx4",
				"xx5", "xx6", "xx7", "xx8", "xx9", "xx10", "xx11", "xx12");

		JavaRDD<String> staffRDD = sc.parallelize(staffList, 3);

		// Label every element with its original partition index (3 partitions)
		JavaRDD<String> staffRDD2 = staffRDD.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {

			private static final long serialVersionUID = 1L;

			public Iterator<String> call(Integer index, Iterator<String> iterator)
					throws Exception {
				List<String> list = new ArrayList<String>();
				while (iterator.hasNext()) {
					String staff = iterator.next();
					list.add("部门[" + (index) + "]" + staff);
				}
				return list.iterator();
			}
		}, true);

		for (String staffInfo : staffRDD2.collect()) {
			System.out.println(staffInfo);
		}

		// Repartition into 6 partitions (always a shuffle), then label again with the new index
		JavaRDD<String> staffRDD3 = staffRDD2.repartition(6);

		JavaRDD<String> staffRDD4 = staffRDD3.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {

			private static final long serialVersionUID = 1L;

			public Iterator<String> call(Integer index, Iterator<String> iterator)
					throws Exception {
				List<String> list = new ArrayList<String>();
				while (iterator.hasNext()) {
					String staff = iterator.next();
					list.add("部门[" + (index) + "]" + staff);
				}
				return list.iterator();
			}
		}, true);

		for (String staffInfo : staffRDD4.collect()) {
			System.out.println(staffInfo);
		}
		
		sc.close();
	}
}

6.2 Description: repartition changes the number of partitions (here from 3 to 6), which also raises the degree of parallelism; it always performs a shuffle. In the second half of the output each element carries two labels: the outer one is the new partition index after repartition, the inner one is its original partition index. The program output is shown below.

18/07/23 09:33:26 INFO Executor: Finished task 2.0 in stage 0.0 (TID 2). 978 bytes result sent to driver
18/07/23 09:33:26 INFO TaskSetManager: Finished task 2.0 in stage 0.0 (TID 2) in 48 ms on localhost (3/3)
18/07/23 09:33:26 INFO DAGScheduler: ResultStage 0 (collect at RepartitonOperator.java:38) finished in 0.462 s
18/07/23 09:33:26 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool 
18/07/23 09:33:26 INFO DAGScheduler: Job 0 finished: collect at RepartitonOperator.java:38, took 1.594547 s
部门[0]xx1
部门[0]xx2
部门[0]xx3
部门[0]xx4
部门[1]xx5
部门[1]xx6
部门[1]xx7
部门[1]xx8
部门[2]xx9
部门[2]xx10
部门[2]xx11
部门[2]xx12
18/07/23 09:33:27 INFO BlockManagerInfo: Removed broadcast_0_piece0 on localhost:51415 in memory (size: 1408.0 B, free: 1121.6 MB)
18/07/23 09:33:27 INFO ContextCleaner: Cleaned accumulator 1
18/07/23 09:33:27 INFO SparkContext: Starting job: collect at RepartitonOperator.java:60
18/07/23 09:33:27 INFO DAGScheduler: Registering RDD 2 (repartition at RepartitonOperator.java:43)
18/07/23 09:33:27 INFO DAGScheduler: Got job 1 (collect at RepartitonOperator.java:60) with 6 output partitions
18/07/23 09:33:27 INFO DAGScheduler: Final stage: ResultStage 2 (collect at RepartitonOperator.java:60)
18/07/23 09:33:27 INFO DAGScheduler: Parents of final stage: List(ShuffleMapStage 1)
18/07/23 09:33:27 INFO DAGScheduler: Missing parents: List(ShuffleMapStage 1)
18/07/23 09:33:27 INFO DAGScheduler: Submitting ShuffleMapStage 1 (MapPartitionsRDD[2] at repartition at RepartitonOperator.java:43), which has no missing parents
18/07/23 09:33:27 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 3.1 KB, free 1121.6 MB)
18/07/23 09:33:27 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 1908.0 B, free 1121.6 MB)
18/07/23 09:33:27 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on localhost:51415 (size: 1908.0 B, free: 1121.6 MB)
18/07/23 09:33:27 INFO SparkContext: Created broadcast 1 from broadcast at DAGScheduler.scala:1006
18/07/23 09:33:27 INFO DAGScheduler: Submitting 3 missing tasks from ShuffleMapStage 1 (MapPartitionsRDD[2] at repartition at RepartitonOperator.java:43)
18/07/23 09:33:27 INFO TaskSchedulerImpl: Adding task set 1.0 with 3 tasks
18/07/23 09:33:27 INFO TaskSetManager: Starting task 0.0 in stage 1.0 (TID 3, localhost, partition 0,PROCESS_LOCAL, 2093 bytes)
18/07/23 09:33:27 INFO Executor: Running task 0.0 in stage 1.0 (TID 3)
18/07/23 09:33:27 INFO Executor: Finished task 0.0 in stage 1.0 (TID 3). 1163 bytes result sent to driver
18/07/23 09:33:27 INFO TaskSetManager: Starting task 1.0 in stage 1.0 (TID 4, localhost, partition 1,PROCESS_LOCAL, 2093 bytes)
18/07/23 09:33:27 INFO Executor: Running task 1.0 in stage 1.0 (TID 4)
18/07/23 09:33:27 INFO TaskSetManager: Finished task 0.0 in stage 1.0 (TID 3) in 255 ms on localhost (1/3)
18/07/23 09:33:27 INFO Executor: Finished task 1.0 in stage 1.0 (TID 4). 1163 bytes result sent to driver
18/07/23 09:33:27 INFO TaskSetManager: Starting task 2.0 in stage 1.0 (TID 5, localhost, partition 2,PROCESS_LOCAL, 2096 bytes)
18/07/23 09:33:27 INFO Executor: Running task 2.0 in stage 1.0 (TID 5)
18/07/23 09:33:27 INFO TaskSetManager: Finished task 1.0 in stage 1.0 (TID 4) in 83 ms on localhost (2/3)
18/07/23 09:33:27 INFO Executor: Finished task 2.0 in stage 1.0 (TID 5). 1163 bytes result sent to driver
18/07/23 09:33:27 INFO TaskSetManager: Finished task 2.0 in stage 1.0 (TID 5) in 81 ms on localhost (3/3)
18/07/23 09:33:27 INFO DAGScheduler: ShuffleMapStage 1 (repartition at RepartitonOperator.java:43) finished in 0.411 s
18/07/23 09:33:27 INFO TaskSchedulerImpl: Removed TaskSet 1.0, whose tasks have all completed, from pool 
18/07/23 09:33:27 INFO DAGScheduler: looking for newly runnable stages
18/07/23 09:33:27 INFO DAGScheduler: running: Set()
18/07/23 09:33:27 INFO DAGScheduler: waiting: Set(ResultStage 2)
18/07/23 09:33:27 INFO DAGScheduler: failed: Set()
18/07/23 09:33:27 INFO DAGScheduler: Submitting ResultStage 2 (MapPartitionsRDD[6] at mapPartitionsWithIndex at RepartitonOperator.java:45), which has no missing parents
18/07/23 09:33:27 INFO MemoryStore: Block broadcast_2 stored as values in memory (estimated size 3.8 KB, free 1121.6 MB)
18/07/23 09:33:27 INFO MemoryStore: Block broadcast_2_piece0 stored as bytes in memory (estimated size 2.2 KB, free 1121.6 MB)
18/07/23 09:33:27 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on localhost:51415 (size: 2.2 KB, free: 1121.6 MB)
18/07/23 09:33:27 INFO SparkContext: Created broadcast 2 from broadcast at DAGScheduler.scala:1006
18/07/23 09:33:27 INFO DAGScheduler: Submitting 6 missing tasks from ResultStage 2 (MapPartitionsRDD[6] at mapPartitionsWithIndex at RepartitonOperator.java:45)
18/07/23 09:33:27 INFO TaskSchedulerImpl: Adding task set 2.0 with 6 tasks
18/07/23 09:33:27 INFO TaskSetManager: Starting task 0.0 in stage 2.0 (TID 6, localhost, partition 0,NODE_LOCAL, 2163 bytes)
18/07/23 09:33:27 INFO Executor: Running task 0.0 in stage 2.0 (TID 6)
18/07/23 09:33:27 INFO ShuffleBlockFetcherIterator: Getting 3 non-empty blocks out of 3 blocks
18/07/23 09:33:27 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 16 ms
18/07/23 09:33:28 INFO Executor: Finished task 0.0 in stage 2.0 (TID 6). 1214 bytes result sent to driver
18/07/23 09:33:28 INFO TaskSetManager: Starting task 1.0 in stage 2.0 (TID 7, localhost, partition 1,NODE_LOCAL, 2163 bytes)
18/07/23 09:33:28 INFO Executor: Running task 1.0 in stage 2.0 (TID 7)
18/07/23 09:33:28 INFO TaskSetManager: Finished task 0.0 in stage 2.0 (TID 6) in 209 ms on localhost (1/6)
18/07/23 09:33:28 INFO ShuffleBlockFetcherIterator: Getting 3 non-empty blocks out of 3 blocks
18/07/23 09:33:28 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 1 ms
18/07/23 09:33:28 INFO Executor: Finished task 1.0 in stage 2.0 (TID 7). 1238 bytes result sent to driver
18/07/23 09:33:28 INFO TaskSetManager: Starting task 2.0 in stage 2.0 (TID 8, localhost, partition 2,NODE_LOCAL, 2163 bytes)
18/07/23 09:33:28 INFO Executor: Running task 2.0 in stage 2.0 (TID 8)
18/07/23 09:33:28 INFO TaskSetManager: Finished task 1.0 in stage 2.0 (TID 7) in 27 ms on localhost (2/6)
18/07/23 09:33:28 INFO ShuffleBlockFetcherIterator: Getting 3 non-empty blocks out of 3 blocks
18/07/23 09:33:28 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 1 ms
18/07/23 09:33:28 INFO Executor: Finished task 2.0 in stage 2.0 (TID 8). 1214 bytes result sent to driver
18/07/23 09:33:28 INFO TaskSetManager: Starting task 3.0 in stage 2.0 (TID 9, localhost, partition 3,NODE_LOCAL, 2163 bytes)
18/07/23 09:33:28 INFO Executor: Running task 3.0 in stage 2.0 (TID 9)
18/07/23 09:33:28 INFO TaskSetManager: Finished task 2.0 in stage 2.0 (TID 8) in 27 ms on localhost (3/6)
18/07/23 09:33:28 INFO ShuffleBlockFetcherIterator: Getting 3 non-empty blocks out of 3 blocks
18/07/23 09:33:28 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 1 ms
18/07/23 09:33:28 INFO Executor: Finished task 3.0 in stage 2.0 (TID 9). 1189 bytes result sent to driver
18/07/23 09:33:28 INFO TaskSetManager: Starting task 4.0 in stage 2.0 (TID 10, localhost, partition 4,NODE_LOCAL, 2163 bytes)
18/07/23 09:33:28 INFO Executor: Running task 4.0 in stage 2.0 (TID 10)
18/07/23 09:33:28 INFO TaskSetManager: Finished task 3.0 in stage 2.0 (TID 9) in 24 ms on localhost (4/6)
18/07/23 09:33:28 INFO ShuffleBlockFetcherIterator: Getting 3 non-empty blocks out of 3 blocks
18/07/23 09:33:28 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 1 ms
18/07/23 09:33:28 INFO Executor: Finished task 4.0 in stage 2.0 (TID 10). 1213 bytes result sent to driver
18/07/23 09:33:28 INFO TaskSetManager: Starting task 5.0 in stage 2.0 (TID 11, localhost, partition 5,NODE_LOCAL, 2163 bytes)
18/07/23 09:33:28 INFO Executor: Running task 5.0 in stage 2.0 (TID 11)
18/07/23 09:33:28 INFO TaskSetManager: Finished task 4.0 in stage 2.0 (TID 10) in 28 ms on localhost (5/6)
18/07/23 09:33:28 INFO ShuffleBlockFetcherIterator: Getting 3 non-empty blocks out of 3 blocks
18/07/23 09:33:28 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 1 ms
18/07/23 09:33:28 INFO Executor: Finished task 5.0 in stage 2.0 (TID 11). 1213 bytes result sent to driver
18/07/23 09:33:28 INFO TaskSetManager: Finished task 5.0 in stage 2.0 (TID 11) in 26 ms on localhost (6/6)
18/07/23 09:33:28 INFO DAGScheduler: ResultStage 2 (collect at RepartitonOperator.java:60) finished in 0.328 s
18/07/23 09:33:28 INFO TaskSchedulerImpl: Removed TaskSet 2.0, whose tasks have all completed, from pool 
18/07/23 09:33:28 INFO DAGScheduler: Job 1 finished: collect at RepartitonOperator.java:60, took 0.885546 s
部门[0]部门[1]xx7
部门[0]部门[2]xx10
部门[1]部门[0]xx1
部门[1]部门[1]xx8
部门[1]部门[2]xx11
部门[2]部门[0]xx2
部门[2]部门[2]xx12
部门[3]部门[0]xx3
部门[4]部门[0]xx4
部门[4]部门[1]xx5
部门[5]部门[1]xx6
部门[5]部门[2]xx9
18/07/23 09:33:28 INFO SparkUI: Stopped Spark web UI at http://192.168.21.1:4040
18/07/23 09:33:28 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!

6.3 Scala implementation

package com.lyl.it

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object RepartitonOperator {
    
     def main(args: Array[String]): Unit = {
      val conf = new SparkConf().setAppName("RepartitonOperator").setMaster("local")
      val sc = new SparkContext(conf)
      
      val staffList = Array("xx1", "xx2", "xx3", "xx4","xx5", "xx6", 
                            "xx7", "xx8", "xx9", "xx10", "xx11", "xx12")
                            
                            
      // Label each element with its current partition index; used both before and after repartition
      def indexFunc(index: Int, iterator: Iterator[String]): Iterator[String] = {
        var list = List[String]()

        while (iterator.hasNext) {
          val staff = iterator.next()
          val result = "部门[" + index + "]" + staff
          list ::= result
        }
        list.iterator
      }
                            
      val staffRDD4 = sc.parallelize(staffList, 3)
                        .mapPartitionsWithIndex(indexFunc, true)
                        .repartition(6)
                        .mapPartitionsWithIndex(indexFunc, true)
                        
      for(result <- staffRDD4.collect() ){
         println(result)
      }
      
     }
}
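For reference, repartition is simply coalesce with the shuffle forced on, so the two lines below are equivalent (a sketch, not part of the original code):

val viaRepartition = staffRDD2.repartition(6)
val viaCoalesce    = staffRDD2.coalesce(6, shuffle = true)  // same effect: a full shuffle into 6 partitions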

7. The flatMap operator

7.1 Java implementation

package com.lyl.it;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.VoidFunction;

public class FlatMapOperator {
	
	public static void main(String[] args) {
		SparkConf conf = new SparkConf().setAppName("FlatMapOperator").setMaster("local");

		JavaSparkContext sc = new JavaSparkContext(conf);
		
		List<String> lineList = Arrays.asList("hello", "hello", "hello2", "uuu");

		JavaRDD<String> lines = sc.parallelize(lineList);

		// flatMap: split each line into words, then flatten all the word lists into one RDD
		JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {

			private static final long serialVersionUID = 1L;

			public Iterable<String> call(String line) throws Exception {
				return Arrays.asList(line.split(" "));
			}
		});

		words.foreach(new VoidFunction<String>() {

			private static final long serialVersionUID = 1L;

			public void call(String result) throws Exception {
				System.out.println(result);
			}
		});
		
		sc.close();
	}
}

7.2 Description: flatMap maps each element to a collection (here each line is split on spaces into words) and then flattens all of the collections into a single RDD. The program output is shown below.

18/08/02 15:16:34 INFO TaskSchedulerImpl: Adding task set 0.0 with 1 tasks
18/08/02 15:16:34 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, partition 0,PROCESS_LOCAL, 2108 bytes)
18/08/02 15:16:34 INFO Executor: Running task 0.0 in stage 0.0 (TID 0)
hello
hello
hello2
uuu
18/08/02 15:16:34 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 915 bytes result sent to driver

7.3 Scala implementation

package com.lyl.it

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object FlatMapOperator {
      
     def main(args: Array[String]): Unit = {
       val conf = new SparkConf().setAppName("FlatMapOperator").setMaster("local")
       val sc = new SparkContext(conf)
       
       val lineList = Array("hello","hello","hello2","uuu")
       sc.parallelize(lineList)
         .flatMap(line => line.split(" "))
         .foreach(result => println(result))
     }
}
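The difference from map is the flattening step; a quick sketch of the contrast:

val lines = sc.parallelize(Array("hello world", "spark rdd"))
lines.map(line => line.split(" ")).collect()      // Array(Array(hello, world), Array(spark, rdd))
lines.flatMap(line => line.split(" ")).collect()  // Array(hello, world, spark, rdd)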

8. The aggregateByKey operator

8.1 Java implementation

package com.lyl.it;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

public class AggregateByKeyOperator {

	public static void main(String[] args) {

		SparkConf conf = new SparkConf().setAppName("AggregateByKeyOperator").setMaster("local");
		JavaSparkContext sc = new JavaSparkContext(conf);

		JavaRDD<String> lines = sc.textFile("rc//ch.txt");
		JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {

					private static final long serialVersionUID = 1L;

					public Iterable<String> call(String line) throws Exception {
						return Arrays.asList(line.split(" "));
					}
				});

		JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {

					private static final long serialVersionUID = 1L;

					public Tuple2<String, Integer> call(String word)
							throws Exception {
						return new Tuple2<String, Integer>(word, 1);
					}
				});

		// aggregateByKey(zeroValue, seqOp, combOp): seqOp folds values into the accumulator
		// within a partition, combOp merges the per-partition accumulators; both are addition
		// here, so this is a word count
		JavaPairRDD<String, Integer> wordCounts = pairs.aggregateByKey(0, new Function2<Integer, Integer, Integer>() {

					private static final long serialVersionUID = 1L;

					public Integer call(Integer v1, Integer v2)
							throws Exception {
						return v1 + v2;
					}
				}, new Function2<Integer, Integer, Integer>() {

					private static final long serialVersionUID = 1L;

					public Integer call(Integer v1, Integer v2)
							throws Exception {
						return v1 + v2;
					}
				});

		List<Tuple2<String, Integer>> list = wordCounts.collect();
		for (Tuple2<String, Integer> wc : list) {
			System.out.println(wc);
		}
		sc.close();
	}
}

8.2 Description: aggregateByKey aggregates the values of each key starting from a zero value (0 here); the first function (seqOp) folds values into the accumulator within a partition, and the second (combOp) merges the per-partition accumulators. With both functions being addition, this is a word count.

Contents of ch.txt:

ww cc rr cc ww
ddf dfd

The program output is shown below.

18/08/02 15:24:34 INFO DAGScheduler: ResultStage 1 (collect at AggregateByKeyOperator.java:61) finished in 0.033 s
18/08/02 15:24:34 INFO DAGScheduler: Job 0 finished: collect at AggregateByKeyOperator.java:61, took 0.227513 s
(dfd,1)
(ddf,1)
(ww,2)
(rr,1)
(cc,2)
18/08/02 15:24:34 INFO SparkUI: Stopped Spark web UI at http://192.168.158.1:4040

8.3 Scala implementation

package com.lyl.it

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object AggregateByKeyOperator {
    
    def main(args: Array[String]): Unit = {
      val conf = new SparkConf().setAppName("AggregateByKeyOperator").setMaster("local")
      val sc = new SparkContext(conf)
      
      
      // seqOp: fold a value into the per-partition accumulator
      def seq(v1: Int, v2: Int): Int = v1 + v2

      // combOp: merge accumulators from different partitions
      def combOp(v1: Int, v2: Int): Int = v1 + v2
      
      val list = sc.textFile("rc//ch.txt")
                   .flatMap(line => line.split(" "))
                   .map(word => (word,1))
                   .aggregateByKey(0)(seq, combOp)
                     
     for(wc <- list.collect() ){
         println(wc)
      }
      
    }
}
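The seqOp and combOp do not have to be identical; a sketch (not from the original post) that computes a per-key average shows why the two functions are kept separate:

// Accumulator is (runningSum, runningCount); the zero value is (0, 0).
val scores = sc.parallelize(Seq(("xu", 150), ("xu", 130), ("kk", 100)))
val sumCount = scores.aggregateByKey((0, 0))(
  (acc, v) => (acc._1 + v, acc._2 + 1),       // seqOp: fold one value into the accumulator
  (a, b)   => (a._1 + b._1, a._2 + b._2)      // combOp: merge two per-partition accumulators
)
sumCount.mapValues { case (sum, count) => sum.toDouble / count }
        .collect()
        .foreach(println)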

 
