API
[SparkContext]
The entry point of a Spark program; it holds the information describing the whole Spark runtime environment.
It represents the connection to a Spark cluster and is used to create RDDs, accumulators, and broadcast variables.
Only one SparkContext can be active per JVM; the active SparkContext must be stop()ped before a new one is created.
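A minimal sketch of what that looks like (the app name and master value below are placeholders, not part of the original setup):
import org.apache.spark.{SparkConf, SparkContext}
val conf = new SparkConf().setAppName("demo").setMaster("local[2]")  // placeholder values
val sc = new SparkContext(conf)
// ... work with sc ...
sc.stop()                                // the active context must be stopped first
val sc2 = new SparkContext(conf)         // only then may a new SparkContext be created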
[RDD]
Resilient Distributed Dataset: a fault-tolerant collection of elements partitioned across the cluster. It is programmed against much like an ordinary collection.
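To illustrate the "like a collection" point, a small sketch as it could be run in the spark-shell (where sc is already defined):
val rdd = sc.parallelize(Seq(1, 2, 3, 4))   // build an RDD from a local collection
rdd.map(_ * 2).filter(_ > 4).collect()      // Array(6, 8), same style as collection operations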
[SparkConf]
Spark configuration object; the parameters of a Spark application are set on it as key-value pairs.
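A small sketch of the key-value style of configuration; the keys and values below are only examples:
import org.apache.spark.SparkConf
val conf = new SparkConf()
  .setAppName("wordCount")                 // shorthand for the spark.app.name key
  .setMaster("local")                      // shorthand for the spark.master key
  .set("spark.executor.memory", "1g")      // any parameter can be set as key -> value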
Word count in Spark
//Load the text file; textFile splits it on newlines, one element per line: Array(hello world2, hello world2, ...)
val rdd1 = sc.textFile("/home/centos/test.txt");
//Word count, version 1
$scala>val rdd1 = sc.textFile("/home/centos/test.txt")
$scala>val rdd2 = rdd1.flatMap(line=>line.split(" "))
$scala>val rdd3 = rdd2.map(word => (word,1))
$scala>val rdd4 = rdd3.reduceByKey(_ + _)
$scala>rdd4.collect
//Word count, version 2 (chained one-liner)
sc.textFile("/home/centos/test.txt").flatMap(_.split(" ")).map((_,1)).reduceByKey(_ + _).collect
//Count the words that contain "wor", using filter
//filter the words
sc.textFile("/home/centos/test.txt").flatMap(_.split(" ")).filter(_.contains("wor")).map((_,1)).reduceByKey(_ + _).collect
pom.xml (key parts):
    <modelVersion>4.0.0</modelVersion>
    <groupId>groupId</groupId>
    <artifactId>scalaDemo</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.4.0</version>
        </dependency>
        <dependency>
            <groupId>com.thoughtworks.paranamer</groupId>
            <artifactId>paranamer</artifactId>
            <version>2.8</version>
        </dependency>
    </dependencies>
Word count implemented in Scala:
package com.mao.scala.scala
import org.apache.spark.{SparkConf, SparkContext}

object WordCountScala {
  def main(args: Array[String]): Unit = {
    wordCount3(args)
  }

  def wordCount1(args: Array[String]): Unit = {
    //Create the Spark configuration object
    val conf = new SparkConf()
    // conf.setAppName("wordCount")
    // //Set the master property
    // conf.setMaster("local")
    //Create the SparkContext from the conf
    val sc = new SparkContext(conf)
    //Load the text file
    val rdd1 = sc.textFile(args(0))
    //Flatten each line into words
    val rdd2 = rdd1.flatMap(line => line.split(" "))
    //Map w => (w,1)
    val rdd3 = rdd2.map((_, 1))
    //Aggregate by key
    val rdd4 = rdd3.reduceByKey(_ + _)
    //Collect to the driver
    val r = rdd4.collect()
    r.foreach(println)
  }

  def wordCount2(args: Array[String]): Unit = {
    val conf = new SparkConf()
    // conf.setAppName("wc")
    // conf.setMaster("local")
    val sc = new SparkContext(conf)
    val r = sc.textFile(args(0)).flatMap(line => line.split(" ")).map((_, 1)).reduceByKey(_ + _).collect()
    r.foreach(println)
  }

  //Count the occurrences of "hello"
  def wordCount3(args: Array[String]): Unit = {
    val conf = new SparkConf()
    // conf.setAppName("wc")
    // conf.setMaster("local")
    val sc = new SparkContext(conf)
    val r = sc.textFile(args(0)).flatMap(line => line.split(" ")).filter(_.equals("hello")).map((_, 1)).reduceByKey(_ + _).collect()
    r.foreach(println)
  }
}
Word count implemented in Java:
package com.mao.scala.java;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

public class wordCountJava {
    public static void main(String[] args) {
        //Create the SparkConf object
        SparkConf conf = new SparkConf();
        // conf.setAppName("wcJava");
        // conf.setMaster("local");
        //Create the Java SparkContext
        JavaSparkContext sc = new JavaSparkContext(conf);
        //Load the text file
        JavaRDD<String> rdd1 = sc.textFile(args[0]);
        //Flatten each line into words
        // val rdd2 = rdd1.flatMap(line => line.split(" "));
        JavaRDD<String> rdd2 = rdd1.flatMap(new FlatMapFunction<String, String>() {
            public Iterator<String> call(String s) throws Exception {
                List<String> list = new ArrayList<String>();
                String[] arr = s.split(" ");
                for (String ss : arr) {
                    list.add(ss);
                }
                return list.iterator();
            }
        });
        //Map w => (w,1)
        // val rdd3 = rdd2.map((_,1));
        JavaPairRDD<String, Integer> rdd3 = rdd2.mapToPair(new PairFunction<String, String, Integer>() {
            public Tuple2<String, Integer> call(String s) throws Exception {
                return new Tuple2<String, Integer>(s, 1);
            }
        });
        //Aggregate by key
        // val rdd4 = rdd3.reduceByKey(_ + _);
        JavaPairRDD<String, Integer> rdd4 = rdd3.reduceByKey(new Function2<Integer, Integer, Integer>() {
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        //Collect and print
        // val r = rdd4.collect();
        // r.foreach(println);
        for (Tuple2<String, Integer> t : rdd4.collect()) {
            System.out.println(t);
        }
    }
}
Submitting the job to run on the Spark cluster
1. Export the jar.
2. Submit the job with spark-submit.
//Java version
spark-submit --master local --name wcJava --class com.mao.scala.java.wordCountJava scalaDemo-1.0-SNAPSHOT.jar /home/admin/wc.txt
//Scala version
spark-submit --master local --name wordCount --class com.mao.scala.scala.WordCountScala scalaDemo-1.0-SNAPSHOT.jar /home/admin/wc.txt
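To run on the standalone cluster itself rather than in local mode, --master would point at the cluster's master URL instead; a sketch, where master-host:7077 is an assumed placeholder and not part of this setup:
//hypothetical master URL; replace master-host with the actual master node
spark-submit --master spark://master-host:7077 --name wordCount --class com.mao.scala.scala.WordCountScala scalaDemo-1.0-SNAPSHOT.jar /home/admin/wc.txt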
Note:
Spark 2.1.0 is built against Scala 2.11.8, so install Scala 2.11.8;
compiling against Scala 2.12.x instead leads to "package not found" errors.
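A quick way to double-check the versions from the spark-shell (a sketch; sc is the SparkContext the shell provides):
println(sc.version)                                  // the Spark version, e.g. 2.1.0
println(scala.util.Properties.versionNumberString)   // the Scala version, should be 2.11.x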
----------------------------------------------
1. Uninstall the existing Scala.
2. Install Scala 2.11.8.
3. Configure IDEA's global library:
project settings -> global library -> remove the old Scala SDK
project settings -> global library -> add SDK -> browse -> locate the Scala installation directory -> select scala-compiler.jar + scala-library.jar + scala-reflect.jar
4. Add the Scala SDK 2.11.8 to the module.
5. Rebuild the project -> export the jar -> run it on the cluster.