Article: http://www.haha174.top/article/details/257684
Project source code: https://github.com/haha174/spark.git
The main Spark actions covered here are: reduce(), collect(), count(), take(), saveAsTextFile(), countByKey(), and foreach().
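All of the Java snippets below come from a single class and share a SparkContext named sc that is created elsewhere (typically in main). The article does not show that setup; a minimal sketch consistent with the Scala examples (the app name and local master are assumptions) would be:
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

// create the shared JavaSparkContext used by the Java examples below
SparkConf conf = new SparkConf().setAppName("ActionOperation").setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);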
1.reduce
We have a collection of 10 numbers and want to add them up. reduce works like this: it first passes the first two elements into call() and computes a result (e.g. 1 + 2 = 3), then passes that result in together with the next element (e.g. 3 + 3), and so on until a single value remains.
Here is the Java example:
public static void reduce(){
    // a collection of 10 numbers; add them up
    // how reduce works: the first two elements are passed into call(), which returns a result, e.g. 1 + 2 = 3
    // that result is then passed in together with the next element, e.g. 3 + 3
    // and so on until one value remains
    List<Integer> list = Arrays.asList(1,2,3,4,5,6,7,8,9,10);
    JavaRDD<Integer> listRDD = sc.parallelize(list);
    int num = listRDD.reduce(new Function2<Integer, Integer, Integer>() {
        public Integer call(Integer integer, Integer integer2) throws Exception {
            return integer + integer2;
        }
    });
    sc.close();
    System.out.println(num);
}
Here is the Scala example:
def reduce(){
    val conf = new SparkConf().setAppName("ActionOperation").setMaster("local")
    val sc = new SparkContext(conf)
    val numberArray = Array(1,2,3,4,5,6,7,8,9,10)
    val numbers = sc.parallelize(numberArray)
    val sum = numbers.reduce(_ + _)
    println("sum " + sum)
}
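Because reduce only needs a function that combines two elements into one (and that function should be associative and commutative), the same pattern works for other aggregations. A small sketch, not part of the original article, that takes the maximum instead of the sum (it assumes the sc from the setup above is still open):
// the combine function can be any associative, commutative operation, e.g. max
int max = sc.parallelize(Arrays.asList(1,2,3,4,5,6,7,8,9,10))
        .reduce(new Function2<Integer, Integer, Integer>() {
            public Integer call(Integer a, Integer b) throws Exception {
                return Math.max(a, b);
            }
        });
System.out.println(max); // prints 10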
2.collect
collect pulls the data of the doubleNumber RDD, which is distributed across the remote cluster, back to the driver.
This is generally not recommended: when the RDD is large, moving all of its data to the driver performs poorly.
Here is the Java example:
public static void collect(){
    List<Integer> list = Arrays.asList(1,2,3,4,5,6,7,8,9,10);
    JavaRDD<Integer> listRdd = sc.parallelize(list);
    JavaRDD<Integer> doubleNumber = listRdd.map(new Function<Integer, Integer>() {
        public Integer call(Integer integer) throws Exception {
            return integer * 2;
        }
    });
    // instead of using the foreach action to process the RDD's elements on the cluster,
    // collect pulls the data of the doubleNumber RDD from the remote cluster to the driver
    // this is not recommended: when the RDD is large, performance is poor
    // normally foreach is used to process the elements of the final RDD
    List<Integer> list1 = doubleNumber.collect();
    for(int i : list1){
        System.out.println(i);
    }
    sc.close();
}
Here is the Scala example:
def collect(){
    val conf = new SparkConf().setAppName("ActionOperation").setMaster("local")
    val sc = new SparkContext(conf)
    val numberArray = Array(1,2,3,4,5,6,7,8,9,10)
    val numbers = sc.parallelize(numberArray)
    val doubleList = numbers.map(num => num * 2)
    val sumList = doubleList.collect()
    for (num <- sumList){
      println(num)
    }
}
3.count
count simply returns the number of elements in the RDD.
Here is the Java example:
public static void count(){
    List<Integer> list = Arrays.asList(1,2,3,4,5,6,7,8,9,10);
    JavaRDD<Integer> listRdd = sc.parallelize(list);
    long num = listRdd.count();
    System.out.println(num);
    sc.close();
}
Here is the Scala example:
def count(){
    val conf = new SparkConf().setAppName("ActionOperation").setMaster("local")
    val sc = new SparkContext(conf)
    val numberArray = Array(1,2,3,4,5,6,7,8,9,10)
    val numbers = sc.parallelize(numberArray)
    val num = numbers.count()
    println(num)
}
4.take
take is similar to collect, but it only retrieves the first n elements of the RDD (and, like collect, returns them to the driver).
Here is the Java example:
public static void take(){
    List<Integer> list = Arrays.asList(1,2,3,4,5,6,7,8,9,10);
    JavaRDD<Integer> listRdd = sc.parallelize(list);
    // similar to collect, but take only retrieves the first n elements
    List<Integer> listTakes = listRdd.take(3);
    for(int i : listTakes){
        System.out.println(i);
    }
    sc.close();
}
Here is the Scala example:
def take(){
    val conf = new SparkConf().setAppName("ActionOperation").setMaster("local")
    val sc = new SparkContext(conf)
    val numberArray = Array(1,2,3,4,5,6,7,8,9,10)
    val numbers = sc.parallelize(numberArray)
    val doubleList = numbers.map(num => num * 2)
    val sumList = doubleList.take(3)
    for (num <- sumList){
      println(num)
    }
}
5.saveAsTextFile
saveAsTextFile writes the RDD out as a text file, either on the local file system or in HDFS.
Note that although the path is written as text.txt, what actually gets created is a directory with that name; the data is written as part files inside it.
Here is the Java example:
public static void saveAsTextFile(){
    List<Integer> list = Arrays.asList(1,2,3,4,5,6,7,8,9,10);
    JavaRDD<Integer> listRdd = sc.parallelize(list);
    // save locally; "text.txt" becomes a directory containing part files
    listRdd.saveAsTextFile("C://cwq//data//text.txt");
    // listRdd.saveAsTextFile("hdfs://spark1:8020/text.txt");
    sc.close();
}
Here is the Scala example:
def saveAsTextFile(){
    val conf = new SparkConf().setAppName("ActionOperation").setMaster("local")
    val sc = new SparkContext(conf)
    val numberArray = Array(1,2,3,4,5,6,7,8,9,10)
    val numbers = sc.parallelize(numberArray)
    val doubleList = numbers.map(num => num * 2)
    doubleList.saveAsTextFile("C://cwq//data//text.txt")
    // doubleList.saveAsTextFile("hdfs://spark1:8020/text.txt")
}
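For reference, the directory written by saveAsTextFile can be read back with textFile, which treats the directory of part files as a single text source. A minimal sketch, not part of the original article (it reuses the local path from the example above and assumes an open JavaSparkContext sc):
// read the saved part files back; each line is one element of the original RDD, as a string
JavaRDD<String> saved = sc.textFile("C://cwq//data//text.txt");
for (String line : saved.collect()) {
    System.out.println(line);
}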
6.countByKey
As the name suggests, countByKey counts the number of elements for each key in a pair RDD and returns the result to the driver as a Map of key to count.
Here is the Java example:
public static void countByKey(){
    List<Tuple2<String, Integer>> list = Arrays.asList(
            new Tuple2<String, Integer>("calss1", 1)
            , new Tuple2<String, Integer>("calss2", 2)
            , new Tuple2<String, Integer>("calss3", 3)
            , new Tuple2<String, Integer>("calss3", 4)
            , new Tuple2<String, Integer>("calss4", 4)
    );
    JavaPairRDD<String, Integer> listRdd = sc.parallelizePairs(list, 1);
    // count how many elements there are for each key; the result comes back to the driver as a Map
    Map<String, Long> map = listRdd.countByKey();
    for (Map.Entry<String, Long> count : map.entrySet()){
        System.out.println(count.getKey() + " " + count.getValue());
    }
    sc.close();
}
Here is the Scala example:
def countByKey(){
    val conf = new SparkConf().setAppName("ActionOperation").setMaster("local")
    val sc = new SparkContext(conf)
    val numberArray = Array(new Tuple2("class1", 2), new Tuple2("class2", 2), new Tuple2("class1", 2), new Tuple2("class1", 2))
    val RDDList = sc.parallelize(numberArray, 1)
    val reduceResult = RDDList.countByKey()
    println(reduceResult)
}
The output of the Java example is:
calss1 1
calss3 2
calss4 1
calss2 1
7.foreach
foreach applies the given function to every element of the RDD. Unlike collect, it runs on the executors instead of pulling the data back to the driver, which is why it is the preferred way to process the elements of the final RDD. (When running on a cluster, the println output below therefore appears in the executor logs rather than on the driver console; in local mode it shows up in the console as expected.)
Here is the Java example:
public static void foreach(){
    List<Tuple2<String, Tuple2<String, Integer>>> list = Arrays.asList(
            new Tuple2<String, Tuple2<String, Integer>>("calss1", new Tuple2<>("calss2", 1))
            , new Tuple2<String, Tuple2<String, Integer>>("calss3", new Tuple2<>("calss4", 1))
            , new Tuple2<String, Tuple2<String, Integer>>("calss5", new Tuple2<>("calss6", 1))
            , new Tuple2<String, Tuple2<String, Integer>>("calss7", new Tuple2<>("calss8", 1))
            , new Tuple2<String, Tuple2<String, Integer>>("calss9", new Tuple2<>("calss10", 1))
    );
    JavaPairRDD<String, Tuple2<String, Integer>> listRdd = sc.parallelizePairs(list);
    // print each element on the executor that holds it
    listRdd.foreach(new VoidFunction<Tuple2<String, Tuple2<String, Integer>>>() {
        public void call(Tuple2<String, Tuple2<String, Integer>> stringIntegerTuple2) throws Exception {
            System.out.println(stringIntegerTuple2._1 + stringIntegerTuple2._2._1 + stringIntegerTuple2._2._2);
        }
    });
    sc.close();
}
Here is the Scala example:
def foreach(): Unit = {
    val conf = new SparkConf().setAppName("ActionOperation").setMaster("local")
    val sc = new SparkContext(conf)
    val numberArray = Array(
      new Tuple2[String, Tuple2[String, Integer]]("calss1", new Tuple2[String, Integer]("calss2", 1))
      , new Tuple2[String, Tuple2[String, Integer]]("calss3", new Tuple2[String, Integer]("calss4", 1))
      , new Tuple2[String, Tuple2[String, Integer]]("calss5", new Tuple2[String, Integer]("calss6", 1))
      , new Tuple2[String, Tuple2[String, Integer]]("calss7", new Tuple2[String, Integer]("calss8", 1))
      , new Tuple2[String, Tuple2[String, Integer]]("calss9", new Tuple2[String, Integer]("calss10", 1)))
    val RDDList = sc.parallelize(numberArray, 1)
    RDDList.foreach(num => {
      println(num._1 + num._2._1 + num._2._2)
    })
}