This blog post covers each Spark RDD operator in three parts:
1. The Java implementation
2. An explanation of the operator
3. The Scala implementation
1.1 Java implementation
package com.lyl.it;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class CartesianOperator {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("CartesianOperator").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<String> words = Arrays.asList("A", "B", "C", "D", "F");
        List<String> numbers = Arrays.asList("1", "2", "3", "4", "5");

        JavaRDD<String> wordsRDD = sc.parallelize(words);
        JavaRDD<String> numberRDD = sc.parallelize(numbers);

        // cartesian pairs every element of wordsRDD with every element of numberRDD.
        JavaPairRDD<String, String> pairs = wordsRDD.cartesian(numberRDD);

        for (Tuple2<String, String> pair : pairs.collect()) {
            System.out.println(pair);
        }

        sc.close();
    }
}
1.2 Explanation: cartesian pairs every element of the first RDD with every element of the second, i.e. it computes the Cartesian product of the two RDDs (here 5 × 5 = 25 pairs). The program output is shown below.
18/08/02 15:37:35 INFO DAGScheduler: Job 0 finished: collect at CartesianOperator.java:25, took 0.178710 s
(A,1)
(A,2)
(A,3)
(A,4)
(A,5)
(B,1)
(B,2)
(B,3)
(B,4)
(B,5)
(C,1)
(C,2)
(C,3)
(C,4)
(C,5)
(D,1)
(D,2)
(D,3)
(D,4)
(D,5)
(F,1)
(F,2)
(F,3)
(F,4)
(F,5)
18/08/02 15:37:35 INFO BlockManagerInfo: Removed broadcast_0_piece0 on localhost:54275 in memory (size: 1350.0 B, free: 1115.3 MB)
1.3 Scala implementation
package com.lyl.it

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object CartesianOperator {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CartesianOperator").setMaster("local")
    val sc = new SparkContext(conf)

    val words = Array("A", "B", "C", "D", "F")
    val numbers = Array("1", "2", "3", "4", "5")

    val wordsRDD = sc.parallelize(words)
    val numberRDD = sc.parallelize(numbers)

    val pairs = wordsRDD.cartesian(numberRDD)

    for (pair <- pairs.collect()) {
      println(pair)
    }
  }
}
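As an aside that is not in the original post: because cartesian combines every element of one RDD with every element of the other, the result always contains |A| × |B| pairs, so it grows very quickly for large inputs. A minimal sketch, assuming the wordsRDD and numberRDD defined in the Scala example above:

// Hypothetical check, reusing wordsRDD and numberRDD from the example above:
// 5 words x 5 numbers = 25 pairs.
val pairCount = wordsRDD.cartesian(numberRDD).count()
println(pairCount)  // 25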
2.1 Java implementation
package com.lyl.it;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

public class CogroupOperator {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("CogroupOperator").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Tuple2<String, String>> studentList = Arrays.asList(
                new Tuple2<String, String>("1", "A"),
                new Tuple2<String, String>("2", "B"),
                new Tuple2<String, String>("1", "C"),
                new Tuple2<String, String>("3", "A"),
                new Tuple2<String, String>("1", "F"),
                new Tuple2<String, String>("2", "A")
        );

        List<Tuple2<String, String>> scoreList = Arrays.asList(
                new Tuple2<String, String>("1", "100"),
                new Tuple2<String, String>("2", "90"),
                new Tuple2<String, String>("1", "80"),
                new Tuple2<String, String>("3", "60"),
                new Tuple2<String, String>("1", "50"),
                new Tuple2<String, String>("2", "40")
        );

        JavaPairRDD<String, String> students = sc.parallelizePairs(studentList);
        JavaPairRDD<String, String> scores = sc.parallelizePairs(scoreList);

        // cogroup groups the values of both RDDs by key: for each key the result
        // holds an Iterable of names and an Iterable of scores.
        JavaPairRDD<String, Tuple2<Iterable<String>, Iterable<String>>> studentScores =
                students.cogroup(scores);

        studentScores.foreach(new VoidFunction<Tuple2<String, Tuple2<Iterable<String>, Iterable<String>>>>() {

            private static final long serialVersionUID = 1L;

            public void call(Tuple2<String, Tuple2<Iterable<String>, Iterable<String>>> tuple) throws Exception {
                System.out.println("student id: " + tuple._1);
                System.out.println("student name: " + tuple._2._1);
                System.out.println("student score: " + tuple._2._2);
            }
        });

        sc.close();
    }
}
2.2 Explanation: cogroup groups the values of both RDDs by key, so each student id maps to the collection of names from the first RDD and the collection of scores from the second. The program output is shown below.
18/08/02 15:44:23 INFO ShuffleBlockFetcherIterator: Started 0 remote fetches in 0 ms
student id: 2
student name: [B, A]
student score: [90, 40]
student id: 3
student name: [A]
student score: [60]
student id: 1
student name: [A, C, F]
student score: [100, 80, 50]
18/08/02 15:44:23 INFO Executor: Finished task 0.0 in stage 2.0 (TID 2). 1165 bytes result sent to driver
2.3 Scala implementation
package com.lyl.it

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object CogroupOperator {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CogroupOperator").setMaster("local")
    val sc = new SparkContext(conf)

    val studentList = Seq(("1", "A"), ("2", "B"), ("1", "C"), ("3", "A"), ("1", "F"), ("2", "A"))
    val scoreList = Seq(("1", "100"), ("2", "90"), ("1", "80"), ("3", "60"), ("1", "50"), ("2", "40"))

    val students = sc.parallelize(studentList)
    val scores = sc.parallelize(scoreList)

    students.cogroup(scores)
      .foreach { tuple =>
        println("student id: " + tuple._1)
        println("student name: " + tuple._2._1)
        println("student score: " + tuple._2._2)
      }
  }
}
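A hedged follow-up that is not part of the original post: the grouped values can be processed further. The sketch below, which assumes the students and scores RDDs from the Scala example above, averages the scores of each student id:

// Hypothetical follow-up: average the scores per student id.
students.cogroup(scores)
  .mapValues { case (_, scoreStrs) =>
    val nums = scoreStrs.map(_.toDouble)
    if (nums.isEmpty) 0.0 else nums.sum / nums.size
  }
  .foreach { case (id, avg) => println("student " + id + " average score: " + avg) }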
3.1 Java implementation
package com.lyl.it;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

public class CollectOperator {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("CollectOperator").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> numberList = Arrays.asList(1, 2, 3, 4, 5);
        JavaRDD<Integer> numbers = sc.parallelize(numberList);

        JavaRDD<Integer> doubleNumbers = numbers.map(new Function<Integer, Integer>() {

            private static final long serialVersionUID = 1L;

            public Integer call(Integer v) throws Exception {
                return v * 2;
            }
        });

        // collect brings all elements of the RDD back to the driver as a local List.
        List<Integer> doubleNumberList = doubleNumbers.collect();
        for (Integer num : doubleNumberList) {
            System.out.println(num);
        }

        sc.close();
    }
}
3.2 Explanation: collect gathers the RDD's data back to a single node, the driver, as a local collection. The program output is shown below.
18/08/02 15:47:05 INFO DAGScheduler: Job 0 finished: collect at CollectOperator.java:29, took 0.181348 s
2
4
6
8
10
18/08/02 15:47:05 INFO SparkUI: Stopped Spark web UI at http://192.168.158.1:4040
18/08/02 15:47:05 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
3.3 Scala implementation
package com.lyl.it

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object CollectOperator {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CollectOperator").setMaster("local")
    val sc = new SparkContext(conf)

    val numberList = Array(1, 2, 3, 4, 5)
    val doubleNumbers = sc.parallelize(numberList)
      .map(v => v * 2)
      .collect()

    for (num <- doubleNumbers) {
      println(num)
    }
  }
}
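One caveat worth noting (an addition, not from the original post): collect copies the entire RDD into the driver's memory, so for large data sets it is often safer to fetch only a few elements. A minimal sketch assuming the same sc as above:

// Hypothetical alternative to collect: bring back only the first three elements.
val firstThree = sc.parallelize(Array(1, 2, 3, 4, 5)).map(_ * 2).take(3)
firstThree.foreach(println)  // 2, 4, 6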
4.1 Java implementation
package com.lyl.it;

import java.util.Arrays;
import java.util.List;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class CountByKeyOperator {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("CountByKeyOperator").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Tuple2<String, String>> scoreList = Arrays.asList(
                new Tuple2<String, String>("1", "A"),
                new Tuple2<String, String>("2", "A"),
                new Tuple2<String, String>("1", "A"),
                new Tuple2<String, String>("3", "A"),
                new Tuple2<String, String>("1", "A"),
                new Tuple2<String, String>("2", "A")
        );
        JavaPairRDD<String, String> students = sc.parallelizePairs(scoreList);

        // countByKey counts the elements per key and returns the result to the driver as a Map.
        Map<String, Long> counts = students.countByKey();
        for (Map.Entry<String, Long> studentCount : counts.entrySet()) {
            System.out.println(studentCount.getKey() + ":" + studentCount.getValue());
        }

        sc.close();
    }
}
4.2 Explanation: countByKey counts how many elements there are for each key and returns the result to the driver as a Map. The program output is shown below.
18/08/02 15:51:26 INFO DAGScheduler: Job 0 finished: countByKey at CountByKeyOperator.java:31, took 0.268164 s
2:2
3:1
1:3
18/08/02 15:51:26 INFO SparkUI: Stopped Spark web UI at http://192.168.158.1:4040
4.3 Scala implementation
package com.lyl.it

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object CountByKeyOperator {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CountByKeyOperator").setMaster("local")
    val sc = new SparkContext(conf)

    val scoreList = Seq(("1", "A"), ("2", "A"), ("1", "A"), ("3", "A"), ("1", "A"), ("2", "A"))
    val counts = sc.parallelize(scoreList).countByKey()

    for (studentCount <- counts) {
      println(studentCount)
    }
  }
}
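A hedged side note, not in the original post: countByKey is an action and brings the whole result Map back to the driver; when the number of distinct keys is very large, an equivalent count can instead be kept distributed as an RDD, for example with reduceByKey. A minimal sketch assuming the scoreList defined above:

// Hypothetical distributed equivalent of countByKey.
val countsRDD = sc.parallelize(scoreList)
  .map { case (k, _) => (k, 1L) }
  .reduceByKey(_ + _)
countsRDD.collect().foreach(println)  // (2,2), (3,1), (1,3)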
5.1 Java implementation
package com.lyl.it;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class CountOperator {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("CountOperator").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> numberList = Arrays.asList(1, 2, 3, 4, 5);
        JavaRDD<Integer> numbers = sc.parallelize(numberList);

        // count is an action that returns the number of elements in the RDD.
        long count = numbers.count();
        System.out.println(count);

        sc.close();
    }
}
5.2 Explanation: count returns the total number of elements in the RDD. The program output is shown below.
18/08/02 15:57:52 INFO DAGScheduler: Job 0 finished: count at CountOperator.java:19, took 0.209459 s
5
18/08/02 15:57:52 INFO SparkUI: Stopped Spark web UI at http://192.168.158.1:4040
18/08/02 15:57:52 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
5.3 Scala implementation
package com.lyl.it

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object CountOperator {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CountOperator").setMaster("local")
    val sc = new SparkContext(conf)

    val numberList = Array(1, 2, 3, 4, 5)
    val count = sc.parallelize(numberList).count()
    println(count)
  }
}
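A small aside (not from the original post): count is an action, so it is what actually triggers the job, and transformations such as map do not change the number of elements. A minimal sketch assuming the same sc as above:

// Hypothetical check: mapping over the RDD does not change its size.
val nums = sc.parallelize(Array(1, 2, 3, 4, 5))
println(nums.count())            // 5
println(nums.map(_ * 2).count()) // still 5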
6.1 Java implementation
6.2 Explanation: ?? The program output is shown below.
6.3 Scala implementation