package rddDemo.transformation;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.List;
/**
* Cartesian product operator: cartesian
* Created by asus on 2018/7/15.
*/
public class CartesianDemoJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf() ;
conf.setAppName("CartesianDemoJava") ;
conf.setMaster("local[2]") ;
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0") ;
JavaSparkContext sc = new JavaSparkContext(conf) ;
// Class one
List<String> classOne = new ArrayList<>() ;
// Class two
List<String> classTwo = new ArrayList<>() ;
classOne.add("lao wang") ;
classOne.add("lao zhang") ;
classOne.add("lao zhao") ;
classOne.add("lao li") ;
classTwo.add("xiao wang") ;
classTwo.add("xiao zhang") ;
classTwo.add("xiao li") ;
classTwo.add("xiao zhao") ;
JavaRDD<String> classOneRDD = sc.parallelize(classOne , 2) ;
JavaRDD<String> classTwoRDD = sc.parallelize(classTwo , 2) ;
// cartesian pairs every element of classOneRDD with every element of classTwoRDD
JavaPairRDD<String, String> classOneCartisianTwoRDD = classOneRDD.cartesian(classTwoRDD) ;
classOneCartisianTwoRDD.foreach(new VoidFunction<Tuple2<String, String>>() {
@Override
public void call(Tuple2<String, String> s) throws Exception {
System.out.println("( " + s._1 + " , " + s._2 + " )");
}
});
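// Note: with 4 elements in each class, 4 x 4 = 16 pairs are printed; the resulting
// RDD's partition count is the product of the parents' partition counts (2 x 2 = 4 here).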
sc.stop();
}
}
package rddDemo.transformation;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
* Created by asus on 2018/6/16.
*/
public class CoalesceDemoJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf() ;
conf.setAppName("CoalesceDemoJava") ;
conf.setMaster("local[2]") ;
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0") ;
JavaSparkContext sc = new JavaSparkContext(conf) ;
List<Integer> numbers = new ArrayList<>() ;
for(int i = 0 ; i <= 100 ; i ++) {
numbers.add(i) ;
}
// Create the number RDD with 10 partitions
JavaRDD<Integer> numRdd = sc.parallelize(numbers , 10) ;
// Record the partition index that each number of the original RDD belongs to
JavaRDD<String> numRddWithPartitionIndex = numRdd.mapPartitionsWithIndex(new Function2<Integer, Iterator<Integer>, Iterator<String>>() {
@Override
public Iterator<String> call(Integer index, Iterator<Integer> numIter) throws Exception {
List<String> numString = new ArrayList<>() ;
while(numIter.hasNext()) {
Integer num = numIter.next() ;
numString.add("number " + num + " with partition index " + index) ;
}
return numString.iterator();
}
} , false) ;
for(String numInfo : numRddWithPartitionIndex.collect()) {
System.out.println(numInfo);
}
// Use coalesce to reduce the number of partitions; shuffle defaults to false
JavaRDD<Integer> coalescedNumRdd = numRdd.coalesce(5) ;
// Record the partition index that each number belongs to after coalesce
JavaRDD<String> coalescedNumRddWithPartitionIndex = coalescedNumRdd.mapPartitionsWithIndex(new Function2<Integer, Iterator<Integer>, Iterator<String>>() {
@Override
public Iterator<String> call(Integer index, Iterator<Integer> numIter) throws Exception {
List<String> numString = new ArrayList<>() ;
while(numIter.hasNext()) {
Integer num = numIter.next() ;
numString.add("number " + num + " with partition index " + index) ;
}
return numString.iterator();
}
} , false) ;
for(String numInfo : coalescedNumRddWithPartitionIndex.collect()) {
System.out.println(numInfo);
}
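// With shuffle = false, coalesce(5) only merges existing partitions locally, so no data
// crosses the network; to increase the partition count you must pass shuffle = true
// (or call repartition, which is coalesce with shuffle = true).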
sc.stop() ;
}
}
package rddDemo.transformation;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.List ;
/**
* Created by asus on 2018/7/18.
*/
public class CogroupDemoJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf() ;
conf.setAppName("CogroupDemoJava") ;
conf.setMaster("local[2]") ;
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0") ;
JavaSparkContext sc = new JavaSparkContext(conf) ;
List<Tuple2<String, Integer>> scores_1 = new ArrayList<>() ;
List<Tuple2<String, Integer>> scores_2 = new ArrayList<>() ;
scores_1.add(new Tuple2<>("lao wang" , 10)) ;
scores_1.add(new Tuple2<>("lao zhang" , 20)) ;
scores_1.add(new Tuple2<>("lao zhao" , 30)) ;
scores_1.add(new Tuple2<>("lao li" , 40)) ;
scores_2.add(new Tuple2<>("lao wang" , 10)) ;
scores_2.add(new Tuple2<>("xiao zhang" , 20)) ;
scores_2.add(new Tuple2<>("lao zhao" , 30)) ;
scores_2.add(new Tuple2<>("xiao li" , 40)) ;
JavaPairRDD<String, Integer> scoreOneRDD = sc.parallelizePairs(scores_1) ;
JavaPairRDD<String, Integer> scoreTwoRDD = sc.parallelizePairs(scores_2) ;
JavaPairRDD<String, Tuple2<Iterable<Integer>, Iterable<Integer>>> cogroupRDD = scoreOneRDD.cogroup(scoreTwoRDD) ;
cogroupRDD.foreach(new VoidFunction<Tuple2<String, Tuple2<Iterable<Integer>, Iterable<Integer>>>>() {
@Override
public void call(Tuple2<String, Tuple2<Iterable<Integer>, Iterable<Integer>>> t) throws Exception {
System.out.println("key -> " + t._1);
System.out.print("value1 -> ");
for(Integer s : t._2._1) {
System.out.print(s + " ");
}
System.out.println() ;
System.out.print("value2 -> ");
for(Integer s : t._2._2) {
System.out.print(s + " ");
}
System.out.println();
}
});
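// cogroup groups the values from both RDDs by key; a key present in only one RDD still
// appears in the result, paired with an empty Iterable on the other side.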
sc.stop();
}
}
package rddDemo.transformation;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext ;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List ;
/**
* distinct: RDD deduplication operator (involves a shuffle)
* Created by asus on 2018/7/11.
*/
public class DistinctDemoJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf() ;
conf.setAppName("DistinctDemoJava") ;
conf.setMaster("local[2]") ;
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0") ;
JavaSparkContext sc = new JavaSparkContext(conf) ;
List<String> names = new ArrayList<>() ;
names.add("lao wang") ;
names.add("lao zhang") ;
names.add("lao li") ;
names.add("lao zhao") ;
names.add("lao wang") ;
names.add("lao zhang") ;
names.add("lao li") ;
names.add("lao zhao") ;
JavaRDD<String> nameRDD = sc.parallelize(names , 4) ;
// Show which partition each name lives in before distinct
JavaRDD<String> nameWithPartitionIndexRDD = nameRDD.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {
@Override
public Iterator<String> call(Integer index, Iterator<String> names) throws Exception {
List<String> nameWithIndex = new ArrayList<>() ;
while(names.hasNext()) {
nameWithIndex.add("name : " + names.next() + " with index " + index) ;
}
return nameWithIndex.iterator();
}
} , false) ;
nameWithPartitionIndexRDD.foreach(new VoidFunction<String>() {
@Override
public void call(String s) throws Exception {
System.out.println(s);
}
}) ;
JavaRDD<String> distinctNameRDD = nameRDD.distinct() ;
// Show which partition each name lives in after distinct
JavaRDD<String> distinctNameRDDWithPartitionIndex = distinctNameRDD.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {
private static final long serialVersionUID = 4L ;
@Override
public Iterator<String> call(Integer index, Iterator<String> names) throws Exception {
List<String> nameWithIndex = new ArrayList<>() ;
while(names.hasNext()) {
nameWithIndex.add("name : " + names.next() + " with index " + index) ;
}
return nameWithIndex.iterator();
}
} , false) ;
distinctNameRDDWithPartitionIndex.foreach(new VoidFunction<String>() {
@Override
public void call(String s) throws Exception {
System.out.println(s);
}
});
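// distinct is implemented as map to (item, null) -> reduceByKey -> map back to the item,
// so it always triggers a shuffle and the partition each element ends up in changes.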
sc.stop() ;
}
}
package rddDemo.transformation;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import java.util.Arrays;
import java.util.List;
/**
* Created by asus on 2018/6/16.
* transformation operator filter, f : U => Boolean
*/
public class FilterDemoJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf() ;
conf.setAppName("FilterDemoJava") ;
conf.setMaster("local[2]") ;
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0") ;
JavaSparkContext sc = new JavaSparkContext(conf) ;
List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) ;
JavaRDD<Integer> numRdd = sc.parallelize(numbers) ;
// Keep only the numbers strictly greater than 5
JavaRDD<Integer> numLarger5 = numRdd.filter(new Function<Integer, Boolean>() {
@Override
public Boolean call(Integer v1) throws Exception {
return v1 > 5 ;
}
}) ;
numRdd.foreach(new VoidFunction<Integer>() {
@Override
public void call(Integer integer) throws Exception {
System.out.println("number : " + integer) ;
}
});
numLarger5.foreach(new VoidFunction<Integer>() {
@Override
public void call(Integer integer) throws Exception {
System.out.println("number larger 5 : " + integer) ;
}
});
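// filter keeps the elements for which the predicate returns true; it is evaluated lazily
// and does not change the number of partitions (some partitions may simply end up smaller).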
sc.stop() ;
}
}
package rddDemo.transformation;
import org.apache.spark.SparkConf ;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext ;
import org.apache.spark.api.java.function.FlatMapFunction;
import java.util.* ;
/**
* Created by asus on 2018/6/16.
*/
public class FlatMapDemoJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf() ;
conf.setAppName("FlatMapDemoJava") ;
conf.setMaster("local[2]") ;
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0") ;
JavaSparkContext sc = new JavaSparkContext(conf) ;
List<String> sentences = new ArrayList<>() ;
sentences.add("today is a nice day") ;
sentences.add("i love you") ;
sentences.add("who am i") ;
JavaRDD<String> sentenceRdd = sc.parallelize(sentences) ;
for(String s : sentenceRdd.collect()) {
System.out.println(s);
}
// Split each sentence into words and flatten the results into a single RDD of words
JavaRDD<String> wordRdd = sentenceRdd.flatMap(new FlatMapFunction<String, String>() {
@Override
public Iterator<String> call(String s) throws Exception {
String[] words = s.split("\\s+") ;
List<String> wordsList = new ArrayList<>() ;
for(int i = 0 ; i < words.length ; i ++) {
wordsList.add(words[i]) ;
}
return wordsList.iterator() ;
}
}) ;
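// flatMap maps each input element to zero or more output elements; here the three
// sentences become one flat RDD of words, with no nesting in the result.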
for(String word : wordRdd.collect()) {
System.out.println(word);
}
sc.stop();
}
}
package rddDemo.transformation;
import org.apache.spark.SparkConf ;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext ;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import java.util.* ;
/**
* Created by asus on 2018/6/17.
*/
public class GroupByKeyJava {
public static void main(String [] args) {
SparkConf conf = new SparkConf() ;
conf.setAppName("GroupByKeyJava") ;
conf.setMaster("local[3]") ;
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0") ;
JavaSparkContext sc = new JavaSparkContext(conf) ;
List<Tuple2<String, Integer>> userInfo = new ArrayList<>() ;
userInfo.add(new Tuple2<>("zhang" , 100)) ;
userInfo.add(new Tuple2<>("zhang" , 101)) ;
userInfo.add(new Tuple2<>("zhang" , 102)) ;
userInfo.add(new Tuple2<>("wang" , 90)) ;
userInfo.add(new Tuple2<>("wang" , 91)) ;
userInfo.add(new Tuple2<>("wang" , 92)) ;
userInfo.add(new Tuple2<>("li" , 80)) ;
userInfo.add(new Tuple2<>("li" , 81)) ;
userInfo.add(new Tuple2<>("li" , 82)) ;
System.out.println("############################ Build the key-value pair RDD ############################");
// Use sc.parallelizePairs to load a collection of key-value pairs; each pair is a Tuple2
JavaPairRDD<String, Integer> userRdd = sc.parallelizePairs(userInfo) ;
for(Tuple2<String, Integer> t : userRdd.collect()) {
System.out.println("name : " + t._1 + " score : " + t._2);
}
System.out.println("############################ 操作键值对RDD ############################");
// mapToPairs 方法操作键值对中的每个值
JavaPairRDD mappedUserRdd = userRdd.mapToPair(new PairFunction, String, Integer>() {
@Override
public Tuple2 call(Tuple2 t) throws Exception {
return new Tuple2<>(t._1 , t._2 + 10);
}
}) ;
for(Tuple2<String, Integer> t : mappedUserRdd.collect()) {
System.out.println("name " + t._1 + " score " + t._2);
}
System.out.println("############################ 键值对RDD groupByKey ############################");
// groupByKey算子进行分组,分组结果为>
JavaPairRDD> userRddGroupByKey = userRdd.groupByKey() ;
for(Tuple2> t : userRddGroupByKey.collect()) {
List scoreList = new ArrayList<>() ;
for(Integer score : t._2) {
scoreList.add(score) ;
}
System.out.println("name " + t._1 + " scoreList : " + scoreList.toString());
}
System.out.println("############################ ############################");
JavaPairRDD userTotalScore = userRddGroupByKey.mapToPair(new PairFunction>, String , Integer>() {
@Override
public Tuple2 call(Tuple2> t) throws Exception {
Integer sum = 0 ;
for(Integer s : t._2) {
sum += s ;
}
return new Tuple2<>(t._1 , sum) ;
}
}) ;
for(Tuple2<String, Integer> t : userTotalScore.collect()) {
System.out.println("name : " + t._1 + " total_score : " + t._2) ;
}
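// For comparison (a sketch, not part of the original demo): reduceByKey computes the same
// per-user totals while combining values map-side before the shuffle, which is usually
// preferred over groupByKey for simple aggregations.
JavaPairRDD<String, Integer> totalViaReduceByKey = userRdd.reduceByKey(new org.apache.spark.api.java.function.Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer a, Integer b) throws Exception {
return a + b ;
}
}) ;
for(Tuple2<String, Integer> t : totalViaReduceByKey.collect()) {
System.out.println("name : " + t._1 + " total_score (reduceByKey) : " + t._2) ;
}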
sc.stop();
}
}
package rddDemo.transformation;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;
import java.util.ArrayList;
import java.util.List ;
/**
* Intersection operator for RDDs
* Created by asus on 2018/7/15.
*/
public class IntersectionDemoJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf() ;
conf.setAppName("IntersectionDemoJava") ;
conf.setMaster("local[2]") ;
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0") ;
JavaSparkContext sc = new JavaSparkContext(conf) ;
// Class one
List<String> classOne = new ArrayList<>() ;
// Class two
List<String> classTwo = new ArrayList<>() ;
classOne.add("lao wang") ;
classOne.add("lao wang") ;
classOne.add("lao zhang") ;
classOne.add("lao li") ;
classOne.add("lao zhao") ;
classTwo.add("lao wang") ;
classTwo.add("lao wang") ;
classTwo.add("lao zhao") ;
classTwo.add("lao zhao") ;
classTwo.add("xiao wang") ;
classTwo.add("xiao zhao") ;
JavaRDD<String> classOneRDD = sc.parallelize(classOne , 2) ;
JavaRDD<String> classTwoRDD = sc.parallelize(classTwo , 2) ;
// Take the intersection of the two RDDs; duplicates are removed from the result
JavaRDD<String> classOneAndTwoRDD = classOneRDD.intersection(classTwoRDD) ;
classOneAndTwoRDD.foreach(new VoidFunction<String>() {
@Override
public void call(String s) throws Exception {
System.out.println(s);
}
});
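// Only "lao wang" and "lao zhao" appear in both classes, and each is printed once:
// intersection deduplicates and is implemented with cogroup, so it involves a shuffle.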
sc.stop();
}
}
package rddDemo.transformation;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.List;
/**
* Created by asus on 2018/7/18.
*/
public class JoinDemoJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf() ;
conf.setAppName("CogroupDemoJava") ;
conf.setMaster("local[2]") ;
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0") ;
JavaSparkContext sc = new JavaSparkContext(conf) ;
List<Tuple2<String, Integer>> scores_1 = new ArrayList<>() ;
List<Tuple2<String, Integer>> scores_2 = new ArrayList<>() ;
scores_1.add(new Tuple2<>("lao wang" , 10)) ;
scores_1.add(new Tuple2<>("lao zhang" , 20)) ;
scores_1.add(new Tuple2<>("lao zhao" , 30)) ;
scores_1.add(new Tuple2<>("lao li" , 40)) ;
scores_2.add(new Tuple2<>("lao wang" , 10)) ;
scores_2.add(new Tuple2<>("xiao zhang" , 20)) ;
scores_2.add(new Tuple2<>("lao zhao" , 30)) ;
scores_2.add(new Tuple2<>("xiao li" , 40)) ;
JavaPairRDD<String, Integer> scoreOneRDD = sc.parallelizePairs(scores_1) ;
JavaPairRDD<String, Integer> scoreTwoRDD = sc.parallelizePairs(scores_2) ;
// join pairs the values of matching keys from both RDDs
JavaPairRDD<String, Tuple2<Integer, Integer>> joinRDD = scoreOneRDD.join(scoreTwoRDD) ;
joinRDD.foreach(new VoidFunction<Tuple2<String, Tuple2<Integer, Integer>>>() {
@Override
public void call(Tuple2<String, Tuple2<Integer, Integer>> t) throws Exception {
System.out.println(t._1 + " -> " + t._2._1 + " , " + t._2._2);
}
}) ;
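// join is an inner join on the key: only "lao wang" and "lao zhao" appear in the output;
// use leftOuterJoin / rightOuterJoin / fullOuterJoin to keep unmatched keys as well.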
sc.stop() ;
}
}
package rddDemo.transformation;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import java.util.Arrays;
import java.util.List;
/**
* Created by asus on 2018/6/16.
*/
public class MapDemoJava {
public static void main(String [] args) {
System.out.println("MapDemoJava");
SparkConf conf = new SparkConf() ;
conf.setAppName("MapDemoJava") ;
conf.setMaster("local[2]") ;
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0") ;
JavaSparkContext sc = new JavaSparkContext(conf) ;
// Declare the source RDD
List<Integer> numbers = Arrays.asList(1 , 2 , 3 , 4 , 5 , 5 , 6 , 8) ;
JavaRDD<Integer> numRdd = sc.parallelize(numbers) ;
// Apply the map operator to the RDD (multiply every number by 10)
JavaRDD<Integer> newNumRdd = numRdd.map(new Function<Integer, Integer>() {
private static final long serialVersionUID = 1L ;
@Override
public Integer call(Integer number) throws Exception {
return number * 10 ;
}
}) ;
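// map transforms each element independently and is lazy: nothing executes
// until an action such as foreach or collect is called.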
// Apply the foreach action to print both RDDs
numRdd.foreach(new VoidFunction<Integer>() {
@Override
public void call(Integer integer) throws Exception {
System.out.println("number : " + integer) ;
}
});
newNumRdd.foreach(new VoidFunction<Integer>() {
@Override
public void call(Integer integer) throws Exception {
System.out.println("new number : " + integer);
}
});
sc.stop();
}
}
package rddDemo.transformation;
import org.apache.spark.SparkConf ;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.VoidFunction;
import java.util.*;
/**
* Created by asus on 2018/6/16.
*/
public class MapPartitionsDemoJava {
public static void main(String[] args) {
System.out.println("MapPartitionsDemoJava");
SparkConf conf = new SparkConf() ;
conf.setAppName("MapPartitionsDemoJava") ;
conf.setMaster("local[2]") ;
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0") ;
JavaSparkContext sc = new JavaSparkContext(conf) ;
List<String> names = new ArrayList<>() ;
names.add("xuruyun") ;
names.add("liangjingru") ;
names.add("wangfei") ;
final Map<String, Integer> scoreMap = new HashMap<>() ;
scoreMap.put("xuruyun" , 150) ;
scoreMap.put("liangjingru" , 100) ;
scoreMap.put("wangfei" , 90) ;
// Define the name RDD
JavaRDD<String> nameRdd = sc.parallelize(names) ;
// Derive the score RDD from the name RDD by looking each name up in scoreMap
JavaRDD<Integer> scoreRdd = nameRdd.mapPartitions(new FlatMapFunction<Iterator<String>, Integer>() {
@Override
public Iterator<Integer> call(Iterator<String> iterator) throws Exception {
List<Integer> scores = new ArrayList<>() ;
while(iterator.hasNext()) {
String name = iterator.next() ;
Integer score = scoreMap.get(name) ;
scores.add(score) ;
}
return scores.iterator();
}
}) ;
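// mapPartitions invokes the function once per partition with an iterator over that
// partition's elements, so any expensive per-call setup (e.g. opening a connection)
// happens once per partition instead of once per element.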
// Print the contents of the name RDD with foreach
nameRdd.foreach(new VoidFunction<String>() {
@Override
public void call(String s) throws Exception {
System.out.println("name : " + s);
}
});
scoreRdd.foreach(new VoidFunction<Integer>() {
@Override
public void call(Integer score) throws Exception {
System.out.println("score : " + score);
}
});
sc.stop();
}
}
package rddDemo.transformation;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;
import java.util.* ;
/**
* Created by asus on 2018/6/16.
*/
public class MapPartitionsWithIndexDemoJava {
public static void main(String [] args) {
SparkConf conf = new SparkConf() ;
conf.setAppName("MapPartitionsWithIndexDemoJava") ;
conf.setMaster("local[2]") ;
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0") ;
JavaSparkContext sc = new JavaSparkContext(conf) ;
List<String> names = new ArrayList<>() ;
names.add("xuruyun") ;
names.add("liangjingru") ;
names.add("wangfei") ;
JavaRDD<String> nameRdd = sc.parallelize(names , 3) ;
nameRdd.foreach(new VoidFunction<String>() {
@Override
public void call(String s) throws Exception {
System.out.println(s) ;
}
});
// Greet each name and tag it with the index of the partition it belongs to
JavaRDD<String> nameRddWithPartitionIndex = nameRdd.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {
private static final long serialVersionUID = 1L ;
@Override
public Iterator<String> call(Integer index, Iterator<String> iterator) throws Exception {
List<String> helloPeople = new ArrayList<>() ;
while(iterator.hasNext()) {
String name = iterator.next() ;
helloPeople.add("Hello " + name + " with index " + index) ;
}
return helloPeople.iterator();
}
} , false) ;
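// The second argument (preservesPartitioning = false) tells Spark not to assume the parent
// RDD's partitioner still applies; it only matters for pair RDDs that have a partitioner.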
nameRddWithPartitionIndex.foreach(new VoidFunction<String>() {
@Override
public void call(String s) throws Exception {
System.out.println(s);
}
});
sc.stop() ;
}
}
package rddDemo.transformation;
import org.apache.spark.SparkConf ;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import java.util.ArrayList;
import java.util.* ;
/**
* Created by asus on 2018/6/16.
*/
public class RepartitionDemoJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf() ;
conf.setAppName("RepartitionDemoJava") ;
conf.setMaster("local[2]") ;
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0") ;
JavaSparkContext sc = new JavaSparkContext(conf) ;
List<Integer> numbers = new ArrayList<>() ;
for(int i = 0 ; i <= 100 ; i ++) {
numbers.add(i) ;
}
JavaRDD<Integer> numRdd = sc.parallelize(numbers , 10) ;
// Record the partition index of each number before repartition
JavaRDD<String> numRddWithPartitionIndex = numRdd.mapPartitionsWithIndex(new Function2<Integer, Iterator<Integer>, Iterator<String>>() {
private static final long serialVersionUID = 1L ;
@Override
public Iterator<String> call(Integer index, Iterator<Integer> numIter) throws Exception {
List<String> numString = new ArrayList<>() ;
while(numIter.hasNext()) {
Integer num = numIter.next() ;
numString.add("number " + num + " with partition index " + index) ;
}
return numString.iterator();
}
} , false) ;
for(String numInfo : numRddWithPartitionIndex.collect()) {
System.out.println(numInfo);
}
// Repartition into 5 partitions (always shuffles)
JavaRDD<Integer> numRddRepartition = numRdd.repartition(5) ;
// Record the partition index of each number after repartition
JavaRDD<String> numRddRepartitionWithPartitionIndex = numRddRepartition.mapPartitionsWithIndex(new Function2<Integer, Iterator<Integer>, Iterator<String>>() {
private static final long serialVersionUID = 2L ;
@Override
public Iterator<String> call(Integer index, Iterator<Integer> numIter) throws Exception {
List<String> numString = new ArrayList<>() ;
while(numIter.hasNext()) {
Integer num = numIter.next() ;
numString.add("number " + num + " with partition index " + index) ;
}
return numString.iterator();
}
} , false) ;
for(String numInfo : numRddRepartitionWithPartitionIndex.collect()) {
System.out.println(numInfo);
}
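// repartition(n) is shorthand for coalesce(n, shuffle = true): it always shuffles, so it can
// either increase or decrease the partition count and spreads the data evenly across partitions.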
sc.stop();
}
}
package rddDemo.transformation;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;
import java.util.ArrayList;
import java.util.List;
/**
* sample(withReplacement : boolean , fraction : double , seed : long) sampling operator
* Created by asus on 2018/7/8.
*/
public class SampleDemoJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf() ;
conf.setAppName("SampleDemoJava") ;
conf.setMaster("local[2]") ;
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0") ;
JavaSparkContext sc = new JavaSparkContext(conf) ;
List<String> names = new ArrayList<>() ;
names.add("lao wang") ;
names.add("xiao wang") ;
names.add("lao zhang") ;
names.add("xiao zhang") ;
names.add("lao li") ;
names.add("xiao li") ;
JavaRDD<String> namesRDD = sc.parallelize(names , 3) ;
// Sample 50% without replacement
System.out.println(">>>>>>>>>>>>>>>>>> sample 50% without replacement <<<<<<<<<<<<<<<<<");
namesRDD.sample(false , 0.5).foreach(new VoidFunction<String>() {
@Override
public void call(String s) throws Exception {
System.out.println(s);
}
});
// Sample 50% with replacement
System.out.println(">>>>>>>>>>>>>>>>>> sample 50% with replacement <<<<<<<<<<<<<<<<<");
namesRDD.sample(true , 0.5).foreach(new VoidFunction<String>() {
@Override
public void call(String s) throws Exception {
System.out.println(s);
}
});
// Sample 50% without replacement with a fixed seed, so the sample is reproducible
System.out.println(">>>>>>>>>>>>>>>>>> sample 50% without replacement, fixed seed (reproducible) <<<<<<<<<<<<<<<<<");
namesRDD.sample(false , 0.5 , 100).foreach(new VoidFunction<String>() {
@Override
public void call(String s) throws Exception {
System.out.println(s);
}
});
// Sample 50% with replacement with a fixed seed, so the sample is reproducible
System.out.println(">>>>>>>>>>>>>>>>>> sample 50% with replacement, fixed seed (reproducible) <<<<<<<<<<<<<<<<<");
namesRDD.sample(true , 0.5 , 100).foreach(new VoidFunction<String>() {
@Override
public void call(String s) throws Exception {
System.out.println(s);
}
});
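// fraction is the expected probability of selecting each element, not an exact count,
// so the sample size varies from run to run; fixing the seed makes a run reproducible.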
sc.stop();
}
}
package rddDemo.transformation;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.List ;
/**
* Created by asus on 2018/7/11.
*/
public class SortByKeyDemoJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf() ;
conf.setAppName("SortByKeyDemoJava") ;
conf.setMaster("local[2]") ;
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0") ;
JavaSparkContext sc = new JavaSparkContext(conf) ;
List<Tuple2<String, Integer>> scores = new ArrayList<>() ;
scores.add(new Tuple2<>("lao wang" , 10)) ;
scores.add(new Tuple2<>("lao zhang" , 20)) ;
scores.add(new Tuple2<>("lao li" , 30)) ;
scores.add(new Tuple2<>("lao zhao" , 40)) ;
JavaPairRDD<String, Integer> scoreRDD = sc.parallelizePairs(scores , 2) ;
scoreRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {
@Override
public void call(Tuple2<String, Integer> s) throws Exception {
System.out.println("name -> " + s._1 + " , score -> " + s._2);
}
});
// Sort by key in ascending order
System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>>>> asc (sort by key, ascending) <<<<<<<<<<<<<<<<<<<<<<<<<<<<<");
JavaPairRDD<String, Integer> scoreSortByKeyAscRDD = scoreRDD.sortByKey(true) ;
scoreSortByKeyAscRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {
@Override
public void call(Tuple2<String, Integer> s) throws Exception {
System.out.println("name -> " + s._1 + " , score -> " + s._2);
}
});
// Sort by key in descending order
System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>>>> desc (sort by key, descending) <<<<<<<<<<<<<<<<<<<<<<<<<<<<<");
JavaPairRDD<String, Integer> scoreSortByKeyDescRDD = scoreRDD.sortByKey(false) ;
scoreSortByKeyDescRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {
@Override
public void call(Tuple2<String, Integer> s) throws Exception {
System.out.println("name -> " + s._1 + " , score -> " + s._2);
}
});
// Sort by score in ascending order: swap key and value, sortByKey, then swap back
System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>>>> asc (sort by score, ascending) <<<<<<<<<<<<<<<<<<<<<<<<<<<<<");
JavaPairRDD<String, Integer> scoreAscRDD = scoreRDD.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
@Override
public Tuple2<Integer, String> call(Tuple2<String, Integer> s) throws Exception {
return new Tuple2<>(s._2 , s._1);
}
}).sortByKey(true).mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {
@Override
public Tuple2<String, Integer> call(Tuple2<Integer, String> s) throws Exception {
return new Tuple2<>(s._2 , s._1);
}
}) ;
scoreAscRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {
@Override
public void call(Tuple2<String, Integer> s) throws Exception {
System.out.println("name -> " + s._1 + " , score -> " + s._2);
}
});
// Sort by score in descending order (foreach prints per partition, so the console order reflects each partition's ordering)
System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>>>> desc (sort by score, descending) <<<<<<<<<<<<<<<<<<<<<<<<<<<<<");
JavaPairRDD<String, Integer> scoreDescRDD = scoreRDD.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
@Override
public Tuple2<Integer, String> call(Tuple2<String, Integer> s) throws Exception {
return new Tuple2<>(s._2 , s._1);
}
}).sortByKey(false).mapToPair(new PairFunction<Tuple2<Integer, String>, String , Integer>() {
@Override
public Tuple2<String, Integer> call(Tuple2<Integer, String> s) throws Exception {
return new Tuple2<>(s._2 , s._1);
}
}) ;
scoreDescRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {
@Override
public void call(Tuple2<String, Integer> s) throws Exception {
System.out.println("name -> " + s._1 + " , score -> " + s._2);
}
});
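// sortByKey range-partitions the data and produces a globally ordered RDD, but foreach runs
// on each partition in parallel, so console output may appear interleaved; use collect() to
// observe the global order.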
sc.stop();
}
}
package rddDemo.transformation;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List ;
/**
* rdd_1.union(rdd_2)
* Directly concatenates the partitions of the two RDDs.
* If rdd_1 and rdd_2 each have two partitions, the union has four partitions, i.e. union does not shuffle.
* Created by asus on 2018/7/8.
*/
public class UnionDemoJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf() ;
conf.setAppName("UnionDemoJava") ;
conf.setMaster("local[2]") ;
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0") ;
JavaSparkContext sc = new JavaSparkContext(conf) ;
List<String> oldMan = new ArrayList<>() ;
oldMan.add("lao wang") ;
oldMan.add("lao zhang") ;
oldMan.add("lao li") ;
oldMan.add("lao zhao") ;
List<String> youngMan = new ArrayList<>() ;
youngMan.add("xiao wang") ;
youngMan.add("xiao zhang") ;
youngMan.add("xiao li") ;
youngMan.add("xiao zhao") ;
/**
* name : lao li with index 1
* name : lao zhao with index 1
* name : lao wang with index 0
* name : lao zhang with index 0
*/
// oldManRDD has two partitions
JavaRDD<String> oldManRDD = sc.parallelize(oldMan , 2) ;
// Get the partition index that each element of oldManRDD belongs to
JavaRDD<String> oldManRDDWithIndex = oldManRDD.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {
private static final long serialVersionUID = 1L ;
@Override
public Iterator<String> call(Integer index, Iterator<String> names) throws Exception {
List<String> nameWithIndex = new ArrayList<>() ;
while(names.hasNext()) {
nameWithIndex.add("name : " + names.next() + " with index " + index) ;
}
return nameWithIndex.iterator() ;
}
} , false) ;
oldManRDDWithIndex.foreach(new VoidFunction<String>() {
@Override
public void call(String s) throws Exception {
System.out.println(s);
}
});
/**
* name : xiao li with index 1
* name : xiao zhao with index 1
* name : xiao wang with index 0
* name : xiao zhang with index 0
*/
// youngManRDD has two partitions
JavaRDD<String> youngManRDD = sc.parallelize(youngMan , 2) ;
// Get the partition index that each element of youngManRDD belongs to
JavaRDD<String> youngManRDDWithIndex = youngManRDD.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {
private static final long serialVersionUID = 2L ;
@Override
public Iterator<String> call(Integer index, Iterator<String> names) throws Exception {
List<String> nameWithIndex = new ArrayList<>() ;
while(names.hasNext()) {
nameWithIndex.add("name : " + names.next() + " with index " + index) ;
}
return nameWithIndex.iterator() ;
}
} , false) ;
youngManRDDWithIndex.foreach(new VoidFunction<String>() {
@Override
public void call(String s) throws Exception {
System.out.println(s);
}
});
/**
* name : lao li with index 1
* name : lao zhao with index 1
* name : xiao wang with index 2
* name : xiao zhang with index 2
* name : xiao li with index 3
* name : xiao zhao with index 3
* name : lao wang with index 0
* name : lao zhang with index 0
*/
// Union oldManRDD and youngManRDD: the result has 4 partitions, and each partition keeps the same contents it had before the union
JavaRDD<String> unionOldAndYoung = oldManRDD.union(youngManRDD) ;
// Get the partition index that each element of unionOldAndYoung belongs to
JavaRDD<String> unionOldAndYoungWithIndex = unionOldAndYoung.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {
private static final long serialVersionUID = 3L ;
@Override
public Iterator<String> call(Integer index, Iterator<String> names) throws Exception {
List<String> nameWithIndex = new ArrayList<>() ;
while(names.hasNext()) {
nameWithIndex.add("name : " + names.next() + " with index " + index) ;
}
return nameWithIndex.iterator();
}
} , false) ;
unionOldAndYoungWithIndex.foreach(new VoidFunction<String>() {
@Override
public void call(String s) throws Exception {
System.out.println(s);
}
});
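// union keeps duplicates and simply concatenates partition lists without shuffling;
// call distinct() afterwards to deduplicate, or coalesce/repartition to reduce the partition count.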
sc.stop() ;
}
}
package rddDemo.transformation;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import java.util.ArrayList;
import java.util.List ;
/**
* Created by asus on 2018/7/15.
*/
public class SaveAsTextFileDemoJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf() ;
conf.setAppName("SaveAsTextFileDemoJava") ;
conf.setMaster("local[2]") ;
System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0") ;
JavaSparkContext sc = new JavaSparkContext(conf) ;
List<Integer> numbers = new ArrayList<>() ;
for(int i = 0 ; i < 10 ; i ++) {
numbers.add(i) ;
}
JavaRDD<Integer> numberRDD = sc.parallelize(numbers , 2) ;
// Save the result to the local filesystem (the output directory must not already exist)
numberRDD.saveAsTextFile("src/main/java/rddDemo/saveAsTextFilePath");
// Save the result to HDFS (the output directory must not already exist)
// numberRDD.saveAsTextFile("hdfs://ip:9000/saveAsTextFilePath");
sc.stop();
}
}