Spark RDD Operations and Descriptions (Part 2)

This post is divided into three parts:

I. Java implementations of the Spark RDD operators

II. Descriptions of the RDD operators

III. Scala implementations of the RDD operators

1. The cartesian operator

1.1 Java implementation

package com.lyl.it;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class CartesianOperator {

	public static void main(String[] args) {
		SparkConf conf = new SparkConf().setAppName("CartesianOperator").setMaster("local");

		JavaSparkContext sc = new JavaSparkContext(conf);
		List<String> words = Arrays.asList("A","B","C","D","F");
		List<String> numbers = Arrays.asList("1","2","3","4","5");
		JavaRDD<String> wordsRDD = sc.parallelize(words);
		JavaRDD<String> numberRDD = sc.parallelize(numbers);
		
		// cartesian pairs every element of wordsRDD with every element of numberRDD
		JavaPairRDD<String, String> pairs = wordsRDD.cartesian(numberRDD);
		for (Tuple2<String, String> pair : pairs.collect()) {
			System.out.println(pair);
		}
		
		sc.close();
	}
}

1.2 Description: cartesian computes the Cartesian product of the two RDDs, pairing every element of the first RDD with every element of the second. Below is the program output:

18/08/02 15:37:35 INFO DAGScheduler: Job 0 finished: collect at CartesianOperator.java:25, took 0.178710 s
(A,1)
(A,2)
(A,3)
(A,4)
(A,5)
(B,1)
(B,2)
(B,3)
(B,4)
(B,5)
(C,1)
(C,2)
(C,3)
(C,4)
(C,5)
(D,1)
(D,2)
(D,3)
(D,4)
(D,5)
(F,1)
(F,2)
(F,3)
(F,4)
(F,5)
18/08/02 15:37:35 INFO BlockManagerInfo: Removed broadcast_0_piece0 on localhost:54275 in memory (size: 1350.0 B, free: 1115.3 MB)

1.3 Scala implementation

package com.lyl.it

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object CartesianOperator {
  
       def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setAppName("CartesianOperator").setMaster("local")
        val sc = new SparkContext(conf)
        
        val words = Array("A","B","C","D","F")
        val numbers = Array("1","2","3","4","5")
        
        val wordsRDD = sc.parallelize(words)
        val numberRDD = sc.parallelize(numbers)
        
        val pairs = wordsRDD.cartesian(numberRDD)
        
        for (pair <- pairs.collect()) {
          println(pair)
        }
      }     
}

2. The cogroup operator

2.1 Java implementation

package com.lyl.it;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

public class CogroupOperator {
    
	public static void main(String[] args) {
		SparkConf conf = new SparkConf().setAppName("CogroupOperator").setMaster("local");
		JavaSparkContext sc = new JavaSparkContext(conf);
		
		@SuppressWarnings("unchecked")
		List<Tuple2<String, String>> studentList = Arrays.asList(
				new Tuple2("1","A"),
				new Tuple2("2","B"),
				new Tuple2("1","C"),
				new Tuple2("3","A"),
				new Tuple2("1","F"),
				new Tuple2("2","A")
				);
		
		@SuppressWarnings("unchecked")
		List<Tuple2<String, String>> sourcetList = Arrays.asList(
				new Tuple2("1","100"),
				new Tuple2("2","90"),
				new Tuple2("1","80"),
				new Tuple2("3","60"),
				new Tuple2("1","50"),
				new Tuple2("2","40")
				);
		
		JavaPairRDD<String, String> students = sc.parallelizePairs(studentList);
		JavaPairRDD<String, String> scores = sc.parallelizePairs(sourcetList);
		
		// For each student id, cogroup collects all of its names and all of its scores
		JavaPairRDD<String, Tuple2<Iterable<String>, Iterable<String>>> studentScores =
				students.cogroup(scores);
		
		studentScores.foreach(new VoidFunction<Tuple2<String, Tuple2<Iterable<String>, Iterable<String>>>>() {
			
			private static final long serialVersionUID = 1L;

			public void call(
					Tuple2<String, Tuple2<Iterable<String>, Iterable<String>>> tuple)
					throws Exception {
				System.out.println("student id: "+tuple._1);
				System.out.println("student name: "+tuple._2._1);
				System.out.println("student score: "+tuple._2._2);
			}
		});
		
		sc.close();
	}
}

2.2 Description: cogroup groups the values of both RDDs by key; for each key it returns one iterable of values from the first RDD and one from the second. Below is the program output:

18/08/02 15:44:23 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
student id: 2
student name: [B, A]
student score: [90, 40]
student id: 3
student name: [A]
student score: [60]
student id: 1
student name: [A, C, F]
student score: [100, 80, 50]
18/08/02 15:44:23 INFO Executor: Finished task 0.0 in stage 2.0 (TID 2). 1165 bytes result sent to driver

2.3 Scala implementation

package com.lyl.it

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object CogroupOperator {
       
      def main(args: Array[String]): Unit = {  
      val conf = new SparkConf().setAppName("CogroupOperator").setMaster("local")
      val sc = new SparkContext(conf)
      
      val studentList = Seq(("1","A"),("2","B"),("1","C"),("3","A"),("1","F"),("2","A"))
      val sourcetList = Seq(("1","100"),("2","90"),("1","80"),("3","60"),("1","50"),("2","40"))
      
      val students = sc.parallelize(studentList)
      val scores = sc.parallelize(sourcetList)
      students.cogroup(scores).foreach { tuple =>
        println("student id: " + tuple._1)
        println("student name: " + tuple._2._1)
        println("student score: " + tuple._2._2)
      }
      
      }
}

3. The collect operator

3.1 Java implementation

package com.lyl.it;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

public class CollectOperator {
	
	public static void main(String[] args) {
		SparkConf conf = new SparkConf().setAppName("CollectOperator").setMaster("local");
		JavaSparkContext sc = new JavaSparkContext(conf);
		
		List<Integer> numberList = Arrays.asList(1,2,3,4,5);
		JavaRDD<Integer> numbers = sc.parallelize(numberList);
		
		JavaRDD<Integer> doubleNumbers = numbers.map(new Function<Integer, Integer>() {

			private static final long serialVersionUID = 1L;

			public Integer call(Integer v) throws Exception {
				return v*2;
			}
		});
		
		List<Integer> doubleNumberList = doubleNumbers.collect();
		for (Integer num:doubleNumberList) {
			System.out.println(num);
		}
		
		sc.close();
	}
}

3.2 Description: collect returns all elements of the RDD to the driver as a local collection, so it should only be used on result sets small enough to fit in driver memory. Below is the program output:

18/08/02 15:47:05 INFO DAGScheduler: Job 0 finished: collect at CollectOperator.java:29, took 0.181348 s
2
4
6
8
10
18/08/02 15:47:05 INFO SparkUI: Stopped Spark web UI at http://192.168.158.1:4040
18/08/02 15:47:05 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!

3.3 Scala implementation

package com.lyl.it

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object CollectOperator {
  
     def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setAppName("CollectOperator").setMaster("local")
        val sc = new SparkContext(conf)
        val numberList = Array(1,2,3,4,5)
        val doubleNumbers = sc.parallelize(numberList)
                           .map(v => v*2)
                           .collect()
        for(num <- doubleNumbers){
          println(num)
        }
     }
}

4. The countByKey operator

4.1 Java implementation

package com.lyl.it;

import java.util.Arrays;
import java.util.List;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class CountByKeyOperator {
	
	public static void main(String[] args) {
		SparkConf conf = new SparkConf().setAppName("CountByKeyOperator").setMaster("local");
		JavaSparkContext sc = new JavaSparkContext(conf);
		
		@SuppressWarnings("unchecked")
		List<Tuple2<String, String>> scoreList = Arrays.asList(
				new Tuple2("1","A"),
				new Tuple2("2","A"),
				new Tuple2("1","A"),
				new Tuple2("3","A"),
				new Tuple2("1","A"),
				new Tuple2("2","A")
				);
		
		JavaPairRDD<String, String> students = sc.parallelizePairs(scoreList);
		
		// countByKey counts how many elements exist for each key and returns a Map to the driver
		Map<String, Long> counts = students.countByKey();
		
		for (Map.Entry<String, Long> studentCount : counts.entrySet()) {
			System.out.println(studentCount.getKey()+":"+studentCount.getValue());
		}
		
		sc.close();
	}
}

4.2 Description: countByKey counts the number of elements for each key and returns the result to the driver as a Map from key to count. Below is the program output:

18/08/02 15:51:26 INFO DAGScheduler: Job 0 finished: countByKey at CountByKeyOperator.java:31, took 0.268164 s
2:2
3:1
1:3
18/08/02 15:51:26 INFO SparkUI: Stopped Spark web UI at http://192.168.158.1:4040

4.3 Scala implementation

package com.lyl.it

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object CountByKeyOperator {
     
     def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setAppName("CountByKeyOperator").setMaster("local")
        val sc = new SparkContext(conf)
        
        val scoreList = Seq(("1","A"),("2","A"),("1","A"),("3","A"),("1","A"),("2","A"))
        val counts = sc.parallelize(scoreList).countByKey()
        
        for(studentCount <- counts){
           println(studentCount)
        }
        
     }
}

5. The count operator

5.1 Java implementation

package com.lyl.it;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class CountOperator {

	public static void main(String[] args) {
		SparkConf conf = new SparkConf().setAppName("CountOperator").setMaster("local");
		JavaSparkContext sc = new JavaSparkContext(conf);
		
		List<Integer> numberList = Arrays.asList(1,2,3,4,5);
		JavaRDD<Integer> numbers = sc.parallelize(numberList);
		
		long count = numbers.count();
		System.out.println(count);
		
		sc.close();
	}
}

5.2 Description: count returns the total number of elements in the RDD. Below is the program output:

18/08/02 15:57:52 INFO DAGScheduler: Job 0 finished: count at CountOperator.java:19, took 0.209459 s
5
18/08/02 15:57:52 INFO SparkUI: Stopped Spark web UI at http://192.168.158.1:4040
18/08/02 15:57:52 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!

5.3 Scala implementation

package com.lyl.it

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object CountOperator {
      
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setAppName("CountOperator").setMaster("local")
        val sc = new SparkContext(conf)
        val numberList = Array(1,2,3,4,5)
        val count = sc.parallelize(numberList).count()
        println(count)
        
      }
}

 

6. The distinct operator

6.1 Java implementation
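
A minimal sketch of what a distinct example could look like, in the same style as the operators above; the class name DistinctOperator and the sample data are illustrative assumptions, not code from the original post.

package com.lyl.it;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class DistinctOperator {

	public static void main(String[] args) {
		SparkConf conf = new SparkConf().setAppName("DistinctOperator").setMaster("local");
		JavaSparkContext sc = new JavaSparkContext(conf);

		// Duplicates are included on purpose so distinct has something to remove
		List<Integer> numberList = Arrays.asList(1, 2, 2, 3, 3, 3, 4, 5);
		JavaRDD<Integer> numbers = sc.parallelize(numberList);

		// distinct shuffles the data and keeps a single copy of each value
		JavaRDD<Integer> uniqueNumbers = numbers.distinct();

		for (Integer num : uniqueNumbers.collect()) {
			System.out.println(num);
		}

		sc.close();
	}
}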

6.2 Description: distinct removes duplicate elements, returning a new RDD that contains each distinct value only once.

6.3 Scala implementation
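
A corresponding Scala sketch, under the same assumptions as the Java version above.

package com.lyl.it

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object DistinctOperator {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("DistinctOperator").setMaster("local")
    val sc = new SparkContext(conf)

    // Duplicates are included on purpose so distinct has something to remove
    val numberList = Array(1, 2, 2, 3, 3, 3, 4, 5)
    val uniqueNumbers = sc.parallelize(numberList)
                          .distinct()
                          .collect()

    for (num <- uniqueNumbers) {
      println(num)
    }
  }
}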

 
