Submit a Spark job from the command line:
./bin/spark-submit --class org.apache.spark.examples.SparkPi ./examples/jars/spark-examples_2.11-2.1.0.jar 10000
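Without a --master flag, spark-submit uses whatever master is configured in conf/spark-defaults.conf (falling back to local mode). A hedged sketch of the same submit against a YARN cluster, using standard spark-submit flags; the resource values are illustrative, not from the original run:
./bin/spark-submit \
  --class org.apache.spark.examples.SparkPi \
  --master yarn \
  --deploy-mode cluster \
  --executor-memory 1g \
  --num-executors 2 \
  ./examples/jars/spark-examples_2.11-2.1.0.jar 10000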
Run ./bin/spark-shell to enter the Spark shell.
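spark-shell accepts the same --master flag; for instance, to run with two local worker threads (illustrative value):
./bin/spark-shell --master local[2]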
RDD (Resilient Distributed Dataset): a collection of data; a handle in the driver that maps to data distributed across the cluster.
In spark-shell, create an RDD using Scala syntax:
scala> sc.textFile("/root/hello.txt")
res1: org.apache.spark.rdd.RDD[String] = /root/hello.txt MapPartitionsRDD[1] at textFile at <console>:25
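Besides textFile, an RDD can be built from an in-memory collection with sc.parallelize; a minimal sketch for experimenting without a file (the val name is made up here):
scala> val nums = sc.parallelize(1 to 5)   // distributes a local Scala collection as an RDD[Int]
scala> nums.collect                        // gathers it back: Array(1, 2, 3, 4, 5)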
Read each line of the file as a separate record:
scala> val lineRDD = sc.textFile("/root/hello.txt")
lineRDD: org.apache.spark.rdd.RDD[String] = /root/hello.txt MapPartitionsRDD[3] at textFile at <console>:24
scala> lineRDD.foreach(println)
hello java
hello scala
hello c
hello python
hello shell
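Note that foreach runs on the executors, so on a real cluster the println output ends up in executor logs rather than in the shell; it shows here because this shell runs locally. To print reliably on the driver (fine for small data), collect first:
scala> lineRDD.collect.foreach(println)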
scala> lineRDD.collect
res4: Array[String] = Array(hello java, hello scala, hello c, hello python, hello shell, "")
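The trailing "" is an empty last line in hello.txt. To drop it before counting, filter it out (a small addition, not part of the original run):
scala> val nonEmptyRDD = lineRDD.filter(line => line.nonEmpty)   // drops empty records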
Map each line (each record), splitting the string into an array of words:
scala> val wordRDD = lineRDD.map(line => line.split(" "))
wordRDD: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[6] at map at <console>:26
scala> wordRDD.collect
res5: Array[Array[String]] = Array(Array(hello, java), Array(hello, scala), Array(hello, c), Array(hello, python), Array(hello, shell), Array(""))
Split out every word in the arrays and flatten the result:
scala> val wordRDD = lineRDD.flatMap(line => line.split(" "))
wordRDD: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[7] at flatMap at <console>:26
scala> wordRDD.collect
res6: Array[String] = Array(hello, java, hello, scala, hello, c, hello, python, hello, shell, "")
Turn each word into a key-value pair of the form (word, 1):
scala> val wordCountRDD = wordRDD.map(word => (word,1))
wordCountRDD: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[8] at map at <console>:28
scala> wordCountRDD.collect
res7: Array[(String, Int)] = Array((hello,1), (java,1), (hello,1), (scala,1), (hello,1), (c,1), (hello,1), (python,1), (hello,1), (shell,1), ("",1))
Run the reduce, merging the values in wordCountRDD by key:
scala> val resultRDD = wordCountRDD.reduceByKey((x, y) => x + y)
resultRDD: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[9] at reduceByKey at <console>:30
scala> resultRDD.collect
res8: Array[(String, Int)] = Array((scala,1), (python,1), ("",1), (hello,5), (java,1), (shell,1), (c,1))
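reduceByKey pre-aggregates values inside each partition before the shuffle. The groupByKey formulation sketched below produces the same result but ships every single (word, 1) pair across the network, which is why reduceByKey is preferred:
scala> val resultRDD2 = wordCountRDD.groupByKey().mapValues(_.sum)   // equivalent result, more shuffle traffic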
Sort by key:
scala> val orderedRDD = resultRDD.sortByKey()
orderedRDD: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[10] at sortByKey at <console>:32
scala> orderedRDD.collect
res10: Array[(String, Int)] = Array(("",1), (c,1), (hello,5), (java,1), (python,1), (scala,1), (shell,1))
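sortByKey orders alphabetically by word; to order by count instead, sortBy on the value works (here descending):
scala> resultRDD.sortBy(_._2, ascending = false).collect

The whole word count can also be chained into a single expression; a recap sketch of the steps above, with the empty-string filter added:
scala> sc.textFile("/root/hello.txt").flatMap(_.split(" ")).filter(_.nonEmpty).map(word => (word, 1)).reduceByKey(_ + _).sortByKey().collect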