Learning Spark, Day 3
I. Common transformations
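Transformations are lazy: each one only defines a new RDD from an existing one, and no job actually runs until an action (such as foreach) is called. The examples below use the Spark 1.x Java API, hence the anonymous inner classes and the FlatMapFunction that returns an Iterable; on Spark 2.x the flatMap function returns an Iterator instead, and Java 8 lambdas are the usual style (see the sketch after this class).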
package main;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
public class Transformation {
public static void main(String[] args) {
// Call whichever demo you want to run, e.g.:
myMap();
}
private static void myMap(){
SparkConf conf = new SparkConf().setMaster("local").setAppName("myMap");
JavaSparkContext sc = new JavaSparkContext(conf);
List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5, 6);
JavaRDD<Integer> numbersRDD = sc.parallelize(numbers);
JavaRDD<Integer> map = numbersRDD.map(new Function<Integer, Integer>() {
@Override
public Integer call(Integer integer) throws Exception {
return integer * 2;
}
});
map.foreach(new VoidFunction<Integer>() {
@Override
public void call(Integer integer) throws Exception {
System.out.println(integer);
}
});
sc.close();
}
private static void myFilter(){
SparkConf conf = new SparkConf().setAppName("myFilter").setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);
List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
JavaRDD<Integer> numberRDD = sc.parallelize(numbers);
JavaRDD<Integer> filter = numberRDD.filter(new Function<Integer, Boolean>() {
@Override
public Boolean call(Integer integer) throws Exception {
return integer % 2 == 0;
}
});
filter.foreach(new VoidFunction<Integer>() {
@Override
public void call(Integer integer) throws Exception {
System.out.println(integer);
}
});
sc.close();
}
private static void myFlatMap(){
SparkConf conf = new SparkConf().setMaster("local").setAppName("myFlatMap()");
JavaSparkContext sc = new JavaSparkContext(conf);
List<String> lineList = Arrays.asList("hello me", "hello you", "hello world");
JavaRDD<String> lines = sc.parallelize(lineList);
JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
@Override
public Iterable<String> call(String s) throws Exception {
return Arrays.asList(s.split(" "));
}
});
words.foreach(new VoidFunction<String>() {
@Override
public void call(String s) throws Exception {
System.out.println(s);
}
});
sc.close();
}
private static void myGroupByKey(){
SparkConf conf = new SparkConf().setAppName("myGroupByKey").setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);
List<Tuple2<String, Integer>> scoreList = Arrays.asList(
new Tuple2<String, Integer>("class1", 85),
new Tuple2<String, Integer>("class2", 56),
new Tuple2<String, Integer>("class1", 49),
new Tuple2<String, Integer>("class2", 89),
new Tuple2<String, Integer>("class1", 92),
new Tuple2<String, Integer>("class2", 68)
);
JavaPairRDD<String, Integer> scores = sc.parallelizePairs(scoreList);
JavaPairRDD<String, Iterable<Integer>> groupByKeyScore = scores.groupByKey();
groupByKeyScore.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
@Override
public void call(Tuple2<String, Iterable<Integer>> stringIterableTuple2) throws Exception {
System.out.println("班级"+":"+ stringIterableTuple2._1);
Iterator<Integer> iterator = stringIterableTuple2._2.iterator();
while(iterator.hasNext()){
System.out.println(iterator.next());
}
System.out.println("=========================");
}
});
sc.close();
}
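// Unlike groupByKey, which shuffles every value, reduceByKey combines values
// map-side before the shuffle, so prefer it whenever the goal is a per-key aggregate.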
private static void myReduceByKey(){
SparkConf conf = new SparkConf().setMaster("local").setAppName("myReduceByKey");
JavaSparkContext sc = new JavaSparkContext(conf);
List<Tuple2<String, Integer>> sumClass = Arrays.asList(
new Tuple2<String, Integer>("class1", 26),
new Tuple2<String, Integer>("class1", 45),
new Tuple2<String, Integer>("class2", 35),
new Tuple2<String, Integer>("class2", 22)
);
JavaPairRDD<String, Integer> sumClassRDD = sc.parallelizePairs(sumClass);
JavaPairRDD<String, Integer> sumReduce = sumClassRDD.reduceByKey(new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer integer, Integer integer2) throws Exception {
return integer + integer2;
}
});
sumReduce.foreach(new VoidFunction<Tuple2<String, Integer>>() {
@Override
public void call(Tuple2<String, Integer> tuple2) throws Exception {
System.out.println(tuple2._1+":"+tuple2._2);
}
});
sc.close();
}
private static void mySortByKey(){
SparkConf conf = new SparkConf().setMaster("local").setAppName("mySortByKey");
JavaSparkContext sc = new JavaSparkContext(conf);
List<Tuple2<Integer, String>> stuScore = Arrays.asList(
new Tuple2<Integer, String>(100, "xiaoli"),
new Tuple2<Integer, String>(92, "xiaozhang"),
new Tuple2<Integer, String>(45, "xiaobao"),
new Tuple2<Integer, String>(60, "daqiang"),
new Tuple2<Integer, String>(71, "wangwu")
);
JavaPairRDD<Integer, String> stuScoreRDD = sc.parallelizePairs(stuScore);
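// sortByKey sorts ascending by default; pass false (sortByKey(false)) for descending order.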
JavaPairRDD<Integer, String> rdd = stuScoreRDD.sortByKey();
rdd.foreach(new VoidFunction<Tuple2<Integer, String>>() {
@Override
public void call(Tuple2<Integer, String> tuple2) throws Exception {
System.out.println(tuple2._1+":"+tuple2._2);
}
});
sc.close();
}
private static void myJoinAndCogroup(){
SparkConf conf = new SparkConf().setAppName("myJoinAndCogroup").setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);
List<Tuple2<Integer, String>> stuList = Arrays.asList(
new Tuple2<Integer, String>(1, "zhansan"),
new Tuple2<Integer, String>(2, "lisi"),
new Tuple2<Integer, String>(3, "wangwu")
);
List<Tuple2<Integer, Integer>> scoreOneList = Arrays.asList(
new Tuple2<Integer, Integer>(1, 99),
new Tuple2<Integer, Integer>(2, 89),
new Tuple2<Integer, Integer>(3, 56)
);
List<Tuple2<Integer, Integer>> scoreTwoList = Arrays.asList(
new Tuple2<Integer, Integer>(1, 45),
new Tuple2<Integer, Integer>(2, 82),
new Tuple2<Integer, Integer>(3, 38),
new Tuple2<Integer, Integer>(1, 78),
new Tuple2<Integer, Integer>(2, 76),
new Tuple2<Integer, Integer>(3, 58)
);
JavaPairRDD<Integer, String> students = sc.parallelizePairs(stuList);
final JavaPairRDD<Integer, Integer> scoreOne = sc.parallelizePairs(scoreOneList);
JavaPairRDD<Integer, Integer> scoreTwo = sc.parallelizePairs(scoreTwoList);
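// join performs an inner join by key: each student id is paired with (name, score).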
JavaPairRDD<Integer, Tuple2<String, Integer>> studentsJoinScoreOne = students.join(scoreOne);
studentsJoinScoreOne.foreach(new VoidFunction<Tuple2<Integer, Tuple2<String, Integer>>>() {
@Override
public void call(Tuple2<Integer, Tuple2<String, Integer>> tuple2) throws Exception {
System.out.println(tuple2._1+":"+tuple2._2._1+":"+tuple2._2._2);
}
});
System.out.println("================================");
JavaPairRDD<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> cogroups = students.cogroup(scoreTwo);
cogroups.foreach(new VoidFunction<Tuple2<Integer, Tuple2<Iterable<String>, Iterable<Integer>>>>() {
@Override
public void call(Tuple2<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> tuple2) throws Exception {
System.out.println(tuple2._1+":"+tuple2._2._1+":"+tuple2._2._2);
}
});
sc.close();
}
}
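For comparison, the same operations collapse to very little code with Java 8 lambdas, which the Spark Java API accepts because its function interfaces are functional interfaces. Below is a minimal word-count sketch combining flatMap, mapToPair (not used above, but the standard way to turn a JavaRDD into a JavaPairRDD) and reduceByKey; it assumes Spark 2.x, where the flatMap function returns an Iterator rather than an Iterable, and the same local master:

package main;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.Arrays;

public class TransformationLambda {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("TransformationLambda");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.parallelize(Arrays.asList("hello me", "hello you", "hello world"));
        JavaPairRDD<String, Integer> counts = lines
                // Spark 2.x: the flatMap function returns an Iterator (on 1.x, return the List itself)
                .flatMap(s -> Arrays.asList(s.split(" ")).iterator())
                // turn each word into a (word, 1) pair
                .mapToPair(w -> new Tuple2<>(w, 1))
                // sum the 1s per word
                .reduceByKey((a, b) -> a + b);
        counts.foreach(t -> System.out.println(t._1 + ":" + t._2));

        sc.close();
    }
}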
II. Common actions
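Unlike transformations, actions trigger actual job execution: they either bring a result back to the driver (reduce, collect, count, take, countByKey) or write the RDD out to storage (saveAsTextFile).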
package main;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import scala.Tuple2;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
public class Action {
public static void main(String[] args) {
// Call whichever demo you want to run, e.g.:
reduce();
}
private static void reduce(){
SparkConf conf = new SparkConf().setMaster("local").setAppName("reduce");
JavaSparkContext sc = new JavaSparkContext(conf);
List<Integer> numList = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
JavaRDD<Integer> numRDD = sc.parallelize(numList);
Integer sum = numRDD.reduce(new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer integer, Integer integer2) throws Exception {
return integer + integer2;
}
});
System.out.println(sum);
sc.close();
}
private static void collect(){
SparkConf conf = new SparkConf().setAppName("collect").setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);
List<Integer> numList = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
JavaRDD<Integer> numRDD = sc.parallelize(numList);
JavaRDD<Integer> numc2 = numRDD.map(new Function<Integer, Integer>() {
@Override
public Integer call(Integer integer) throws Exception {
return integer * 2;
}
});
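// collect pulls the entire RDD back to the driver, so it is only safe for small
// datasets; prefer take(n) for a peek at large ones.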
List<Integer> collect = numc2.collect();
for (Integer a: collect){
System.out.println(a);
}
sc.close();
}
private static void count(){
SparkConf conf = new SparkConf().setMaster("local").setAppName("count");
JavaSparkContext sc = new JavaSparkContext(conf);
List<Integer> numList = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
JavaRDD<Integer> nums = sc.parallelize(numList);
long count = nums.count();
System.out.println(count);
sc.close();
}
private static void take(){
SparkConf conf = new SparkConf().setAppName("take").setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);
List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
JavaRDD<Integer> numb = sc.parallelize(numbers);
List<Integer> take = numb.take(3);
for (Integer a: take){
System.out.println(a);
}
sc.close();
}
private static void saveAsTextFile(){
SparkConf conf = new SparkConf().setMaster("local").setAppName("saveAsTextFile");
JavaSparkContext sc = new JavaSparkContext(conf);
List<Integer> numberList = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
JavaRDD<Integer> numbers = sc.parallelize(numberList);
JavaRDD<Integer> numc3 = numbers.map(new Function<Integer, Integer>() {
@Override
public Integer call(Integer integer) throws Exception {
return integer * 3;
}
});
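// saveAsTextFile writes a directory of part-xxxxx files (not a single file),
// and the job fails if the target path already exists.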
numc3.saveAsTextFile("hdfs://hadoop1:9000/numc3.txt");
sc.close();
}
private static void countByKey(){
SparkConf conf = new SparkConf().setAppName("countByKey").setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);
List<Tuple2<String, String>> studentList = Arrays.asList(
new Tuple2<String, String>("class1", "leo"),
new Tuple2<String, String>("class2", "xiaoli"),
new Tuple2<String, String>("class2", "xiaohe"),
new Tuple2<String, String>("class1", "wangwu"),
new Tuple2<String, String>("class2", "zhangli"),
new Tuple2<String, String>("class1", "choutong")
);
JavaPairRDD<String, String> student = sc.parallelizePairs(studentList);
// The Spark 1.x Java API declares countByKey as Map<K, Object> (the values are actually Longs);
// from Spark 2.0 it is declared as Map<K, Long>.
Map<String, Object> countStudent = student.countByKey();
for(Map.Entry<String,Object> a:countStudent.entrySet()){
System.out.println(a.getKey()+":"+a.getValue());
}
sc.close();
}
}
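One caveat on countByKey: it is an action, so the whole map of counts comes back to the driver, which is only reasonable when the number of distinct keys is small. To keep the per-key counts distributed as an RDD instead, the usual pattern is mapValues plus reduceByKey. A minimal sketch (written with lambdas for brevity) that could replace the countByKey call above:

// Distributed equivalent of countByKey: the result stays an RDD instead of a driver-side Map.
JavaPairRDD<String, Integer> classSizes = student
        .mapValues(name -> 1)          // replace each student name with a 1
        .reduceByKey((a, b) -> a + b); // sum the 1s per class
classSizes.foreach(t -> System.out.println(t._1 + ":" + t._2));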