<dependencies>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>2.1.1</version>
    </dependency>
</dependencies>
<build>
    <finalName>SparkCoreTest</finalName>
    <plugins>
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <version>3.4.6</version>
            <executions>
                <execution>
                    <goals><goal>compile</goal><goal>testCompile</goal></goals>
                </execution>
            </executions>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-assembly-plugin</artifactId>
            <version>3.0.0</version>
            <configuration>
                <archive><manifest><mainClass>com.spark.day01.WordCount</mainClass></manifest></archive>
                <descriptorRefs><descriptorRef>jar-with-dependencies</descriptorRef></descriptorRefs>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <phase>package</phase>
                    <goals><goal>single</goal></goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
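Assuming the standard Maven lifecycle (the make-assembly execution above is bound to the package phase), running mvn package should produce both the regular SparkCoreTest jar and a jar-with-dependencies assembly whose manifest main class is com.spark.day01.WordCount.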
package com.spark.day02
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/*
* Create an RDD by reading data from an in-memory collection
*
* */
object Spark01_CreateRDD_mem {
def main(args: Array[String]): Unit = {
// Create the Spark configuration object
val conf = new SparkConf().setAppName("Spark01_CreateRDD_mem").setMaster("local[*]")
// Create the SparkContext object
val sc = new SparkContext(conf)
// Create a collection
val list: List[Int] = List(1, 2, 3, 4)
// Create an RDD from the collection, option 1
// val rdd: RDD[Int] = sc.parallelize(list)
// Create an RDD from the collection, option 2: makeRDD calls parallelize under the hood
val rdd: RDD[Int] = sc.makeRDD(list)
rdd.collect().foreach(println)
// Close the connection
sc.stop()
}
}
package com.spark.day02
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Spark02_Create_file {
def main(args: Array[String]): Unit = {
// Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
// Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
// Create an RDD by reading data from a local file
// val rdd: RDD[String] = sc.textFile("/Users/tiger/Desktop/JinWWProject/Spark_Project/input/1.txt")
// Create an RDD by reading data from an HDFS server
val rdd: RDD[String] = sc.textFile("hdfs://172.23.4.221:9000/input")
// Close the connection
sc.stop()
}
}
The third way works mainly by applying an operation to an existing RDD, which then produces a new RDD.
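As a minimal sketch (assuming a SparkContext named sc, as in the examples above): each transformation returns a new RDD, and nothing is computed until an action is called.
val source: RDD[Int] = sc.makeRDD(List(1, 2, 3, 4))
// map only records a new RDD; it is not executed yet
val doubled: RDD[Int] = source.map(_ * 2)
// filter derives yet another new RDD from doubled
val evens: RDD[Int] = doubled.filter(_ % 2 == 0)
// the action collect() triggers the whole chain
evens.collect().foreach(println)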
object partition01_default {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkCoreTest1")
val sc: SparkContext = new SparkContext(conf)
val rdd: RDD[Int] = sc.makeRDD(Array(1,2,3,4))
rdd.saveAsTextFile("output")
}
}
object partition02_Array {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkCoreTest1")
val sc: SparkContext = new SparkContext(conf)
//1) 4 elements, 4 partitions; output: partition 0 -> 1, partition 1 -> 2, partition 2 -> 3, partition 3 -> 4
//val rdd: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4), 4)
//2) 4 elements, 3 partitions; output: partition 0 -> 1, partition 1 -> 2, partition 2 -> 3, 4
//val rdd: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4), 3)
//3) 5 elements, 3 partitions; output: partition 0 -> 1, partition 1 -> 2, 3, partition 2 -> 4, 5
val rdd: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4, 5), 3)
rdd.saveAsTextFile("output")
sc.stop()
}
}
object partition03_file {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkCoreTest1")
val sc: SparkContext = new SparkContext(conf)
//1) Default number of partitions: the minimum of the current number of cores and 2
//val rdd: RDD[String] = sc.textFile("input")
//2) Input data 1-4, one number per line; output: 0=>{1, 2} 1=>{3} 2=>{4} 3=>{empty}
//val rdd: RDD[String] = sc.textFile("input/3.txt",3)
//3) Input data 1-4 on a single line; output: 0=>{1234} 1=>{empty} 2=>{empty} 3=>{empty}
val rdd: RDD[String] = sc.textFile("input/4.txt",3)
rdd.saveAsTextFile("output")
sc.stop()
}
}
def map[U: ClassTag](f: T => U):RDD[U]
object value01_map {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc = new SparkContext(conf)
//3. Business logic
// 3.1 Create an RDD
val rdd: RDD[Int] = sc.makeRDD(1 to 4,2)
// 3.2 Call map to multiply every element by 2
val mapRdd: RDD[Int] = rdd.map(_ * 2)
// 3.3 Print the data of the transformed RDD
mapRdd.collect().foreach(println)
//4. Close the connection
sc.stop()
}
}
def mapPartitions[U: ClassTag](
f: Iterator[T] => Iterator[U],
preservesPartitioning: Boolean = false):RDD[U]
object value02_mapPartitions {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc = new SparkContext(conf)
//3. Business logic
// 3.1 Create an RDD
val rdd: RDD[Int] = sc.makeRDD(1 to 4, 2)
// 3.2 Call mapPartitions to multiply every element by 2
val rdd1 = rdd.mapPartitions(x=>x.map(_*2))
// 3.3 Print the data of the transformed RDD
rdd1.collect().foreach(println)
//4. Close the connection
sc.stop()
}
}
def mapPartitionsWithIndex[U: ClassTag](
f: (Int, Iterator[T]) => Iterator[U], // the Int is the partition index
preservesPartitioning: Boolean = false): RDD[U]
object value03_mapPartitionsWithIndex {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc = new SparkContext(conf)
//3. Business logic
// 3.1 Create an RDD
val rdd: RDD[Int] = sc.makeRDD(1 to 4, 2)
// 3.2 Create a new RDD in which each element is paired with its partition index as a tuple
val indexRdd = rdd.mapPartitionsWithIndex( (index,items)=>{items.map( (index,_) )} )
// Extension exercise: multiply the elements of the second partition by 2 and leave the other partitions unchanged
// 3.3 Print the data of the transformed RDD
indexRdd.collect().foreach(println)
//4. Close the connection
sc.stop()
}
}
def flatMap[U: ClassTag](f: T => TraversableOnce[U]):RDD[U]
object value04_flatMap {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc = new SparkContext(conf)
//3. Business logic
// 3.1 Create an RDD
val listRDD=sc.makeRDD(List(List(1,2),List(3,4),List(5,6),List(7)), 2)
// 3.2 Take the data out of all the sub-collections and put it into one big collection
listRDD.flatMap(list=>list).collect.foreach(println)
//4. Close the connection
sc.stop()
}
}
def glom():RDD[Array[T]]
object value05_glom {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc = new SparkContext(conf)
//3. Business logic
// 3.1 Create an RDD
val rdd = sc.makeRDD(1 to 4, 2)
// 3.2 Find the maximum of each partition: 0 -> 1,2  1 -> 3,4
val maxRdd: RDD[Int] = rdd.glom().map(_.max)
// 3.3 Sum the per-partition maxima: 2 + 4
println(maxRdd.collect().sum)
//4. Close the connection
sc.stop()
}
}
def groupBy[K](f: T => K)(implicit kt: ClassTag[K]): RDD[(K,Iterable[T])]
object value06_groupby {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc = new SparkContext(conf)
//3. Business logic
// 3.1 Create an RDD
val rdd = sc.makeRDD(1 to 4, 2)
// 3.2 Group the elements by the remainder modulo 2, collect to the Driver and print
rdd.groupBy(_ % 2).collect().foreach(println)
// 3.3 Create another RDD
val rdd1: RDD[String] = sc.makeRDD(List("hello","hive","hadoop","spark","scala"))
// 3.4 Group the words by their first letter
rdd1.groupBy(str=>str.substring(0,1)).collect().foreach(println)
sc.stop()
}
}
object value06_groupby {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc = new SparkContext(conf)
//3. Business logic
// 3.1 Create an RDD
val strList: List[String] = List("Hello Scala", "Hello Spark", "Hello World")
val rdd = sc.makeRDD(strList)
// 3.2 Split the strings into individual words
val wordRdd: RDD[String] = rdd.flatMap(str=>str.split(" "))
// 3.3 Map each word to a pair: word => (word, 1)
val wordToOneRdd: RDD[(String, Int)] = wordRdd.map(word=>(word, 1))
// 3.4 Group the reshaped data by word
val groupRdd: RDD[(String, Iterable[(String, Int)])] = wordToOneRdd.groupBy(t=>t._1)
// 3.5 Convert the grouped data back into (word, count) pairs
val wordToSum: RDD[(String, Int)] = groupRdd.map {
case (word, list) => {
(word, list.size)
}
}
wordToSum.collect().foreach(println)
sc.stop()
}
}
def filter(f: T => Boolean):RDD[T]
object value07_filter {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Create an RDD
val rdd: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4),2)
//3.1 Keep only the elements that satisfy the condition (even numbers)
val filterRdd: RDD[Int] = rdd.filter(_ % 2 == 0)
//3.2 Collect and print the data
filterRdd.collect().foreach(println)
//4. Close the connection
sc.stop()
}
}
def sample(
withReplacement: Boolean, // whether sampled elements are put back: true for sampling with replacement, false for sampling without replacement
fraction: Double, // when withReplacement = false: the probability of selecting each element, which must lie in [0, 1]
seed: Long = Utils.random.nextLong): RDD[T] // the seed for the random number generator
object value08_sample {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
// 3.1 Create an RDD
val rdd: RDD[Int] = sc.makeRDD(1 to 10)
// 3.2 Print the result of sampling with replacement
rdd.sample(true, 0.4, 2).collect().foreach(println)
// 3.3 Print the result of sampling without replacement
rdd.sample(false, 0.2, 3).collect().foreach(println)
//4. Close the connection
sc.stop()
}
}
def distinct(): RDD[T] // by default, distinct keeps the same number of partitions as the source RDD
def distinct(numPartitions: Int)(implicit ord: Ordering[T] = null): RDD[T] // deduplicate and change the number of partitions at the same time
object value09_distinct {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
// 3.1 Create an RDD
val distinctRdd: RDD[Int] = sc.makeRDD(List(1,2,1,5,2,9,6,1))
// 3.2 Print the new deduplicated RDD
distinctRdd.distinct().collect().foreach(println)
// 3.3 Deduplicate with multiple tasks to increase parallelism
distinctRdd.distinct(2).collect().foreach(println)
//4. Close the connection
sc.stop()
}
}
def coalesce(numPartitions: Int, shuffle: Boolean = false, // with shuffle = false, this only converts an RDD with more partitions into one with fewer partitions
partitionCoalescer: Option[PartitionCoalescer] = Option.empty)
(implicit ord: Ordering[T] = null) : RDD[T]
object value10_coalesce {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Create an RDD
//val rdd: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4), 4)
//3.1 Shrink the number of partitions
//val coalesceRdd: RDD[Int] = rdd.coalesce(2)
//4. Create an RDD
val rdd: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4, 5, 6), 3)
//4.1 Shrink the number of partitions
val coalesceRdd: RDD[Int] = rdd.coalesce(2)
//5. Print the data with its partition index
val indexRdd: RDD[Int] = coalesceRdd.mapPartitionsWithIndex(
(index, datas) => {
// print each partition's data, prefixed with the partition index
datas.foreach(data => {
println(index + "=>" + data)
})
// return the partition's data
datas
}
)
indexRdd.collect()
//6. Close the connection
sc.stop()
}
}
//3. Create an RDD
val rdd: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4, 5, 6), 3)
//3.1 Coalesce with shuffle enabled
val coalesceRdd: RDD[Int] = rdd.coalesce(2, true)
def repartition(numPartitions: Int)(implicit ord: Ordering[T] = null):RDD[T]
object value11_repartition {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Create an RDD
val rdd: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4, 5, 6), 3)
//3.1 Shrink the number of partitions
//val coalesceRdd: RDD[Int] = rdd.coalesce(2,true)
//3.2 Repartition
val repartitionRdd: RDD[Int] = rdd.repartition(2)
//4. Print the data with its partition index
val indexRdd: RDD[Int] = repartitionRdd.mapPartitionsWithIndex(
(index, datas) => {
// print each partition's data, prefixed with the partition index
datas.foreach(data => {
println(index + "=>" + data)
})
// return the partition's data
datas
}
)
indexRdd.collect()
//5. Close the connection
sc.stop()
}
}
def repartition(numPartitions: Int)(implicit ord: Ordering[T] = null): RDD[T] = withScope {
coalesce(numPartitions, shuffle = true)
}
def sortBy[K]( f: (T) => K,
ascending: Boolean = true, // ascending order by default
numPartitions: Int = this.partitions.length)
(implicit ord: Ordering[K], ctag: ClassTag[K]): RDD[T]
object value12_sortBy {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
// 3.1 Create an RDD
val rdd: RDD[Int] = sc.makeRDD(List(2, 1, 3, 4, 6, 5))
// 3.2 Ascending order by default
val sortRdd: RDD[Int] = rdd.sortBy(num => num)
sortRdd.collect().foreach(println)
// 3.3 Configure descending order
val sortRdd2: RDD[Int] = rdd.sortBy(num => num, false)
sortRdd2.collect().foreach(println)
// 3.4 Create an RDD of strings
val strRdd: RDD[String] = sc.makeRDD(List("1", "22", "12", "2", "3"))
// 3.5 Sort by the numeric value of each string
strRdd.sortBy(num => num.toInt).collect().foreach(println)
// 3.6 Create an RDD of tuples
val rdd3: RDD[(Int, Int)] = sc.makeRDD(List((2, 1), (1, 2), (1, 1), (2, 2)))
// 3.7 Sort by the first element of the tuple, then by the second when the first ones are equal
rdd3.sortBy(t=>t).collect().foreach(println)
//4. Close the connection
sc.stop()
}
}
def pipe(command: String):RDD[T]
#!/bin/sh
echo "Start"
while read LINE; do
echo ">>>"${LINE}
done
Create an RDD with a single partition
scala> val rdd = sc.makeRDD (List("hi","Hello","how","are","you"),1)
scala> rdd.pipe("/opt/module/spark/pipe.sh").collect()
res18: Array[String] = Array(Start, >>>hi, >>>Hello, >>>how, >>>are, >>>you)
scala> val rdd = sc.makeRDD(List("hi","Hello","how","are","you"),2)
scala> rdd.pipe("/opt/module/spark/pipe.sh").collect()
res19: Array[String] = Array(Start, >>>hi, >>>Hello, Start, >>>how, >>>are, >>>you)
def union(other: RDD[T]): RDD[T]
object DoubleValue01_union {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
//3.1 Create the first RDD
val rdd1: RDD[Int] = sc.makeRDD(1 to 4)
//3.2 Create the second RDD
val rdd2: RDD[Int] = sc.makeRDD(4 to 8)
//3.3 Compute the union of the two RDDs and print it
rdd1.union(rdd2).collect().foreach(println)
//4. Close the connection
sc.stop()
}
}
def subtract(other: RDD[T]): RDD[T]
object DoubleValue02_subtract {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
//3.1 Create the first RDD
val rdd: RDD[Int] = sc.makeRDD(1 to 4)
//3.2 Create the second RDD
val rdd1: RDD[Int] = sc.makeRDD(4 to 8)
//3.3 Compute the difference of the first RDD minus the second and print it
rdd.subtract(rdd1).collect().foreach(println)
//4. Close the connection
sc.stop()
}
}
def intersection(other: RDD[T]): RDD[T]
object DoubleValue03_intersection {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
//3.1 Create the first RDD
val rdd1: RDD[Int] = sc.makeRDD(1 to 4)
//3.2 Create the second RDD
val rdd2: RDD[Int] = sc.makeRDD(4 to 8)
//3.3 Compute the intersection of the two RDDs and print it
rdd1.intersection(rdd2).collect().foreach(println)
//4. Close the connection
sc.stop()
}
}
def zip[U: ClassTag](other: RDD[U]): RDD[(T, U)]
object DoubleValue04_zip {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
//3.1 Create the first RDD
val rdd1: RDD[Int] = sc.makeRDD(Array(1,2,3),3)
//3.2 Create the second RDD
val rdd2: RDD[String] = sc.makeRDD(Array("a","b","c"),3)
//3.3 Zip the first RDD with the second and print
rdd1.zip(rdd2).collect().foreach(println)
//3.4 Zip the second RDD with the first and print
rdd2.zip(rdd1).collect().foreach(println)
//3.5 Create a third RDD (same number of partitions as 1 and 2, but fewer elements)
val rdd3: RDD[String] = sc.makeRDD(Array("a","b"),3)
//3.6 Different number of elements per partition, cannot be zipped
// Can only zip RDDs with same number of elements in each partition
rdd1.zip(rdd3).collect().foreach(println)
//3.7 Create a fourth RDD (different number of partitions from 1 and 2)
val rdd4: RDD[String] = sc.makeRDD(Array("a","b","c"),2)
//3.8 Different number of partitions, cannot be zipped
// Can't zip RDDs with unequal numbers of partitions: List(3, 2)
rdd1.zip(rdd4).collect().foreach(println)
//4. Close the connection
sc.stop()
}
}
def partitionBy(partitioner: Partitioner): RDD[(K, V)]
object KeyValue01_partitionBy {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
//3.1 Create the first RDD
val rdd: RDD[(Int, String)] = sc.makeRDD(Array((1,"aaa"),(2,"bbb"),(3,"ccc")),3)
//3.2 Repartition the RDD with a HashPartitioner
val rdd2: RDD[(Int, String)] = rdd.partitionBy(new org.apache.spark.HashPartitioner(2))
//3.3 Check the number of partitions of the new RDD
println(rdd2.partitions.size)
//4. Close the connection
sc.stop()
}
}
class HashPartitioner(partitions: Int) extends Partitioner {
require(partitions >= 0, s"Number of partitions ($partitions) cannot be negative.")
def numPartitions: Int = partitions
def getPartition(key: Any): Int = key match {
case null => 0
case _ => Utils.nonNegativeMod(key.hashCode, numPartitions)
}
override def equals(other: Any): Boolean = other match {
case h: HashPartitioner =>
h.numPartitions == numPartitions
case _ =>
false
}
override def hashCode: Int = numPartitions
}
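As a worked illustration of getPartition (an Int's hashCode is the value itself, so nonNegativeMod reduces to a plain remainder here), the keys from the partitionBy example above are placed like this:
val hp = new org.apache.spark.HashPartitioner(2)
println(hp.getPartition(1)) // 1 % 2 = 1 -> partition 1
println(hp.getPartition(2)) // 2 % 2 = 0 -> partition 0
println(hp.getPartition(3)) // 3 % 2 = 1 -> partition 1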
package com.spark.day04
import org.apache.spark.rdd.RDD
import org.apache.spark.{HashPartitioner, Partitioner, SparkConf, SparkContext}
/*
* Transformation operator: partitionBy
* Repartition a key-value RDD by key
*
* */
object Spark01_Transformation_partitionBy {
def main(args: Array[String]): Unit = {
// Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
// Create the SparkContext object, the entry point for submitting a Spark application
val sc = new SparkContext(conf)
// Create the RDD
// Note: RDD itself does not have a partitionBy operator; it is added to key-value RDDs dynamically through an implicit conversion
// val rdd: RDD[Int] = sc.makeRDD(List(1, 2, 3, 4))
val rdd: RDD[(Int, String)] = sc.makeRDD(List((1, "aaa"), (2, "bbb"), (3, "ccc")), 3)
rdd.mapPartitionsWithIndex(
(index, datas) => {
println(index + "--->" + datas.mkString(","))
datas
}
).collect()
// rdd.partitionBy(new HashPartitioner(2)).mapPartitionsWithIndex(
// (index, datas) => {
// println(index + "---------->" + datas.mkString(","))
// datas
// }
// ).collect()
rdd.partitionBy(new MyPartiioner(2)).mapPartitionsWithIndex(
(index, datas) => {
println(index + "---------->" + datas.mkString(","))
datas
}
).collect()
// Close the connection
sc.stop()
}
}
// Custom partitioner
class MyPartiioner(partitions: Int) extends Partitioner {
// Number of partitions
override def numPartitions: Int = partitions
// Partitioning rule: the returned Int is the partition index, starting from 0
override def getPartition(key: Any): Int = {
// 8
// val key: String = key.asInstanceOf[String]
// if (key.startsWith("135")) {
// 0
// } else {
// 1
// }
1
}
}
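The commented-out lines above hint at routing phone numbers by prefix. A hypothetical, self-contained version of that rule (the class name is illustrative; it reuses the Partitioner import from this file) could look like:
// Hypothetical sketch: keys starting with "135" go to partition 0, all other keys to partition 1
class PhonePrefixPartitioner extends Partitioner {
  override def numPartitions: Int = 2
  override def getPartition(key: Any): Int = {
    val phone = key.toString
    if (phone.startsWith("135")) 0 else 1
  }
}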
def reduceByKey(func: (V, V) => V): RDD[(K, V)]
def reduceByKey(func: (V, V) => V, numPartitions: Int): RDD[(K, V)]
object KeyValue02_reduceByKey {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
//3.1 Create the first RDD
val rdd = sc.makeRDD(List(("a",1),("b",5),("a",5),("b",2)))
//3.2 Add up the values that share the same key
val reduce: RDD[(String, Int)] = rdd.reduceByKey((x,y) => x+y)
//3.3 Print the result
reduce.collect().foreach(println)
//4. Close the connection
sc.stop()
}
}
def groupByKey(): RDD[(K,Iterable[V])]
object KeyValue03_groupByKey {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
//3.1 Create the first RDD
val rdd = sc.makeRDD(List(("a",1),("b",5),("a",5),("b",2)))
//3.2 Gather the values of each key into one Seq
val group: RDD[(String, Iterable[Int])] = rdd.groupByKey()
//3.3 Print the result
group.collect().foreach(println)
//3.4 Add up the values of each key
group.map(t=>(t._1,t._2.sum)).collect().foreach(println)
//4. Close the connection
sc.stop()
}
}
def aggregateByKey[U: ClassTag](zeroValue: U)(seqOp: (U, V) => U, combOp: (U, U) => U): RDD[(K, U)]
object KeyValue04_aggregateByKey {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
//3.1 Create the first RDD
val rdd: RDD[(String, Int)] = sc.makeRDD(List(("a", 3), ("a", 2), ("c", 4), ("b", 3), ("c", 6), ("c", 8)), 2)
//3.2 Within each partition take the maximum value per key, then add the per-partition maxima together
rdd.aggregateByKey(0)(math.max(_, _), _ + _).collect().foreach(println)
//4. Close the connection
sc.stop()
}
}
def foldByKey(zeroValue: V)(func: (V, V) => V):RDD[(K, V)]
object KeyValue05_foldByKey {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
//3.1 Create the first RDD
val list: List[(String, Int)] = List(("a",1),("a",1),("a",1),("b",1),("b",1),("b",1),("b",1),("a",1))
val rdd = sc.makeRDD(list,2)
//3.2 Compute the word count
//rdd.aggregateByKey(0)(_+_,_+_).collect().foreach(println)
rdd.foldByKey(0)(_+_).collect().foreach(println)
//4. Close the connection
sc.stop()
}
}
def combineByKey[C](
createCombiner: V => C,
mergeValue: (C, V) => C,
mergeCombiners: (C, C) => C): RDD[(K, C)]
object KeyValue06_combineByKey {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3.1 Create the first RDD
val list: List[(String, Int)] = List(("a", 88), ("b", 95), ("a", 91), ("b", 93), ("a", 95), ("b", 98))
val input: RDD[(String, Int)] = sc.makeRDD(list, 2)
//3.2 For each key, add up the values and count how many times the key occurs, keeping both in a tuple
val combineRdd: RDD[(String, (Int, Int))] = input.combineByKey(
(_, 1),
(acc: (Int, Int), v) => (acc._1 + v, acc._2 + 1),
(acc1: (Int, Int), acc2: (Int, Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2)
)
//3.3 Print the combined result
combineRdd.collect().foreach(println)
//3.4 Compute the average
combineRdd.map {
case (key, value) => {
(key, value._1 / value._2.toDouble)
}
}.collect().foreach(println)
//4. Close the connection
sc.stop()
}
}
def sortByKey(
ascending: Boolean = true, // ascending by default
numPartitions: Int = self.partitions.length) : RDD[(K, V)]
object KeyValue07_sortByKey {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
//3.1 Create the first RDD
val rdd: RDD[(Int, String)] = sc.makeRDD(Array((3,"aa"),(6,"cc"),(2,"bb"),(1,"dd")))
//3.2 Sort by key in ascending order (the default)
rdd.sortByKey(true).collect().foreach(println)
//3.3 Sort by key in descending order
rdd.sortByKey(false).collect().foreach(println)
//4. Close the connection
sc.stop()
}
}
def join[W](other: RDD[(K, W)]): RDD[(K, (V, W))]
def join[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (V, W))]
object KeyValue09_join {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
//3.1 Create the first RDD
val rdd: RDD[(Int, String)] = sc.makeRDD(Array((1, "a"), (2, "b"), (3, "c")))
//3.2 Create the second pair RDD
val rdd1: RDD[(Int, Int)] = sc.makeRDD(Array((1, 4), (2, 5), (4, 6)))
//3.3 Join the two RDDs and print the result
rdd.join(rdd1).collect().foreach(println)
//4. Close the connection
sc.stop()
}
}
def cogroup[W](other: RDD[(K, W)]): RDD[(K, (Iterable[V], Iterable[W]))]
object KeyValue10_cogroup {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
//3.1 Create the first RDD
val rdd: RDD[(Int, String)] = sc.makeRDD(Array((1,"a"),(2,"b"),(3,"c")))
//3.2 Create the second RDD
val rdd1: RDD[(Int, Int)] = sc.makeRDD(Array((1,4),(2,5),(3,6)))
//3.3 Cogroup the two RDDs and print the result
rdd.cogroup(rdd1).collect().foreach(println)
//4. Close the connection
sc.stop()
}
}
object Demo_top3 {
def main(args: Array[String]): Unit = {
//1. Initialize the Spark configuration and connect to Spark
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("Test")
val sc = new SparkContext(sparkConf)
//2. Read the log file to get the raw data
val dataRDD: RDD[String] = sc.textFile("input/agent.log")
//3. Reshape the raw data: string => (prv-adv, 1)
val prvAndAdvToOneRDD: RDD[(String, Int)] = dataRDD.map {
line => {
val datas: Array[String] = line.split(" ")
(datas(1) + "-" + datas(4), 1)
}
}
//4. Aggregate the reshaped data: (prv-adv, 1) => (prv-adv, sum)
val prvAndAdvToSumRDD: RDD[(String, Int)] = prvAndAdvToOneRDD.reduceByKey(_ + _)
//5. Reshape the aggregated result: (prv-adv, sum) => (prv, (adv, sum))
val prvToAdvAndSumRDD: RDD[(String, (String, Int))] = prvAndAdvToSumRDD.map {
case (prvAndAdv, sum) => {
val ks: Array[String] = prvAndAdv.split("-")
(ks(0), (ks(1), sum))
}
}
//6. Group the data by province: (prv, (adv, sum)) => (prv, Iterable[(adv, sum)])
val groupRDD: RDD[(String, Iterable[(String, Int)])] = prvToAdvAndSumRDD.groupByKey()
//7. For each province, sort the ads in descending order and take the top three
val mapValuesRDD: RDD[(String, List[(String, Int)])] = groupRDD.mapValues {
datas => {
datas.toList.sortWith(
(left, right) => {
left._2 > right._2
}
).take(3)
}
}
//8. Print the result
mapValuesRDD.collect().foreach(println)
//9. Close the connection to Spark
sc.stop()
}
}
def reduce(f: (T, T) => T): T
object action01_reduce {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
//3.1 Create the first RDD
val rdd: RDD[Int] = sc.makeRDD(List(1,2,3,4))
//3.2 Aggregate the data
val reduceResult: Int = rdd.reduce(_+_)
println(reduceResult)
//4. Close the connection
sc.stop()
}
}
Function signature
def collect(): Array[T]
object action02_collect {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
//3.1 Create the first RDD
val rdd: RDD[Int] = sc.makeRDD(List(1,2,3,4))
//3.2 Collect the data to the Driver
rdd.collect().foreach(println)
//4. Close the connection
sc.stop()
}
}
Function signature
def count(): Long
object action03_count {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
//3.1 Create the first RDD
val rdd: RDD[Int] = sc.makeRDD(List(1,2,3,4))
//3.2 Return the number of elements in the RDD
val countResult: Long = rdd.count()
println(countResult)
//4. Close the connection
sc.stop()
}
}
def first(): T
object action04_first {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
//3.1 Create the first RDD
val rdd: RDD[Int] = sc.makeRDD(List(1,2,3,4))
//3.2 Return the first element of the RDD
val firstResult: Int = rdd.first()
println(firstResult)
//4. Close the connection
sc.stop()
}
}
def take(num: Int): Array[T]
object action05_take {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
//3.1 Create the first RDD
val rdd: RDD[Int] = sc.makeRDD(List(1,2,3,4))
//3.2 Take the first 2 elements of the RDD
val takeResult: Array[Int] = rdd.take(2)
println(takeResult.mkString(", "))
//4. Close the connection
sc.stop()
}
}
def takeOrdered(num: Int)(implicit ord: Ordering[T]): Array[T]
object action06_takeOrdered{
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
//3.1 Create the first RDD
val rdd: RDD[Int] = sc.makeRDD(List(1,3,2,4))
//3.2 Return the 2 smallest elements of the RDD
val result: Array[Int] = rdd.takeOrdered(2)
println(result.mkString(", "))
//4. Close the connection
sc.stop()
}
}
def aggregate[U: ClassTag](zeroValue: U)(seqOp: (U, T) => U, combOp: (U, U) => U): U
object action07_aggregate {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
//3.1 Create the first RDD
val rdd: RDD[Int] = sc.makeRDD(List(1, 2, 3, 4),8)
//3.2 Add up all the elements of the RDD
//val result: Int = rdd.aggregate(0)(_ + _, _ + _)
val result: Int = rdd.aggregate(10)(_ + _, _ + _)
println(result)
//4. Close the connection
sc.stop()
}
}
def fold(zeroValue: T)(op: (T, T) => T): T
object action08_fold {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
//3.1 Create the first RDD
val rdd: RDD[Int] = sc.makeRDD(List(1, 2, 3, 4))
//3.2 Add up all the elements of the RDD
val foldResult: Int = rdd.fold(0)(_+_)
println(foldResult)
//4. Close the connection
sc.stop()
}
}
def countByKey(): Map[K, Long]
Example: create a pair RDD and count the occurrences of each key
object action09_countByKey {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
//3.1 Create the first RDD
val rdd: RDD[(Int, String)] = sc.makeRDD(List((1, "a"), (1, "a"), (1, "a"), (2, "b"), (3, "c"), (3, "c")))
//3.2 Count the occurrences of each key
val result: collection.Map[Int, Long] = rdd.countByKey()
println(result)
//4. Close the connection
sc.stop()
}
}
object action10_save {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
//3.1 Create the first RDD
val rdd: RDD[Int] = sc.makeRDD(List(1,2,3,4),2)
//3.2 Save as a text file
rdd.saveAsTextFile("output")
//3.3 Serialize the data and save it as an object file
rdd.saveAsObjectFile("output1")
//3.4 Save as a SequenceFile
rdd.map((_,1)).saveAsSequenceFile("output2")
//4. Close the connection
sc.stop()
}
}
def foreach(f: T => Unit): Unit
object action11_foreach {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Business logic
//3.1 Create the first RDD
// val rdd: RDD[Int] = sc.makeRDD(List(1,2,3,4),2)
val rdd: RDD[Int] = sc.makeRDD(List(1,2,3,4))
//3.2 Print after collecting to the Driver
rdd.map(num=>num).collect().foreach(println)
println("****************")
//3.3 Distributed printing on the Executors
rdd.foreach(println)
//4. Close the connection
sc.stop()
}
}
object serializable01_object {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Create two objects
val user1 = new User()
user1.name = "zhangsan"
val user2 = new User()
user2.name = "lisi"
val userRDD1: RDD[User] = sc.makeRDD(List(user1, user2))
//3.1 Print: ERROR, java.io.NotSerializableException (when User is not Serializable)
//userRDD1.foreach(user => println(user.name))
//3.2 Print: OK
val userRDD2: RDD[User] = sc.makeRDD(List())
//userRDD2.foreach(user => println(user.name))
//3.3 Print: ERROR Task not serializable (when User is not Serializable); note that it fails before the job even runs
userRDD2.foreach(user => println(user1.name))
//4. Close the connection
sc.stop()
}
}
//class User {
// var name: String = _
//}
class User extends Serializable {
var name: String = _
}
object serializable02_function {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Create an RDD
val rdd: RDD[String] = sc.makeRDD(Array("hello world", "hello spark", "hive", "atguigu"))
//3.1 Create a Search object
val search = new Search("hello")
// Driver: code outside the operators runs on the Driver
// Executor: code inside the operators runs on the Executors
//3.2 Passing a method: prints ERROR Task not serializable (when Search is not Serializable)
search.getMatch1(rdd).collect().foreach(println)
//3.3 Passing a field: prints ERROR Task not serializable (when Search is not Serializable)
search.getMatche2(rdd).collect().foreach(println)
//4. Close the connection
sc.stop()
}
}
class Search(query:String) extends Serializable {
def isMatch(s: String): Boolean = {
s.contains(query)
}
// Method-passing serialization case
def getMatch1 (rdd: RDD[String]): RDD[String] = {
//rdd.filter(this.isMatch)
rdd.filter(isMatch)
}
// Field-passing serialization case
def getMatche2(rdd: RDD[String]): RDD[String] = {
//rdd.filter(x => x.contains(this.query))
rdd.filter(x => x.contains(query))
//val q = query
//rdd.filter(x => x.contains(q))
}
}
class Search() extends Serializable{...}
// Filter for the strings that contain the query
def getMatche2(rdd: RDD[String]): RDD[String] = {
val q = this.query // assign the class field to a local variable
rdd.filter(x => x.contains(q))
}
case class Search(query:String) extends Serializable {...}
object serializable03_Kryo {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
.setAppName("SerDemo")
.setMaster("local[*]")
// Replace the default serialization mechanism
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
// Register the custom classes to be serialized with Kryo
.registerKryoClasses(Array(classOf[Searcher]))
val sc = new SparkContext(conf)
val rdd: RDD[String] = sc.makeRDD(Array("hello world", "hello atguigu", "atguigu", "hahah"), 2)
val searcher = new Searcher("hello")
val result: RDD[String] = searcher.getMatchedRDD1(rdd)
result.collect.foreach(println)
}
}
case class Searcher(val query: String) {
def isMatch(s: String) = {
s.contains(query)
}
def getMatchedRDD1(rdd: RDD[String]) = {
rdd.filter(isMatch)
}
def getMatchedRDD2(rdd: RDD[String]) = {
val q = query
rdd.filter(_.contains(q))
}
}
object Lineage01 {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
val fileRDD: RDD[String] = sc.textFile("input/1.txt")
println(fileRDD.toDebugString)
println("----------------------")
val wordRDD: RDD[String] = fileRDD.flatMap(_.split(" "))
println(wordRDD.toDebugString)
println("----------------------")
val mapRDD: RDD[(String, Int)] = wordRDD.map((_,1))
println(mapRDD.toDebugString)
println("----------------------")
val resultRDD: RDD[(String, Int)] = mapRDD.reduceByKey(_+_)
println(resultRDD.toDebugString)
resultRDD.collect()
//4. Close the connection
sc.stop()
}
}
(2) input/1.txt MapPartitionsRDD[1] at textFile at Lineage01.scala:15 []
| input/1.txt HadoopRDD[0] at textFile at Lineage01.scala:15 []
----------------------
(2) MapPartitionsRDD[2] at flatMap at Lineage01.scala:19 []
| input/1.txt MapPartitionsRDD[1] at textFile at Lineage01.scala:15 []
| input/1.txt HadoopRDD[0] at textFile at Lineage01.scala:15 []
----------------------
(2) MapPartitionsRDD[3] at map at Lineage01.scala:23 []
| MapPartitionsRDD[2] at flatMap at Lineage01.scala:19 []
| input/1.txt MapPartitionsRDD[1] at textFile at Lineage01.scala:15 []
| input/1.txt HadoopRDD[0] at textFile at Lineage01.scala:15 []
----------------------
(2) ShuffledRDD[4] at reduceByKey at Lineage01.scala:27 []
+-(2) MapPartitionsRDD[3] at map at Lineage01.scala:23 []
| MapPartitionsRDD[2] at flatMap at Lineage01.scala:19 []
| input/1.txt MapPartitionsRDD[1] at textFile at Lineage01.scala:15 []
| input/1.txt HadoopRDD[0] at textFile at Lineage01.scala:15 []
object Lineage01 {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
val fileRDD: RDD[String] = sc.textFile("input/1.txt")
println(fileRDD.dependencies)
println("----------------------")
val wordRDD: RDD[String] = fileRDD.flatMap(_.split(" "))
println(wordRDD.dependencies)
println("----------------------")
val mapRDD: RDD[(String, Int)] = wordRDD.map((_,1))
println(mapRDD.dependencies)
println("----------------------")
val resultRDD: RDD[(String, Int)] = mapRDD.reduceByKey(_+_)
println(resultRDD.dependencies)
resultRDD.collect()
//4. Close the connection
sc.stop()
}
}
List(org.apache.spark.OneToOneDependency@f2ce6b)
----------------------
List(org.apache.spark.OneToOneDependency@692fd26)
----------------------
List(org.apache.spark.OneToOneDependency@627d8516)
----------------------
List(org.apache.spark.ShuffleDependency@a518813)
class OneToOneDependency[T](rdd: RDD[T]) extends NarrowDependency[T](rdd) {
override def getParents(partitionId: Int): List[Int] = List(partitionId)
}
object Stage01 {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Application: initializing a SparkContext creates one Application
val sc: SparkContext = new SparkContext(conf)
//3. Create the RDD
val dataRDD: RDD[Int] = sc.makeRDD(List(1,2,3,4,1,2),2)
//3.1 Aggregate
val resultRDD: RDD[(Int, Int)] = dataRDD.map((_,1)).reduceByKey(_+_)
// Job: each action operator triggers one Job
//3.2 Job 1: print to the console
resultRDD.collect().foreach(println)
//3.3 Job 2: write to disk
resultRDD.saveAsTextFile("output")
Thread.sleep(1000000)
//4. Close the connection
sc.stop()
}
}
object cache01 {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Create an RDD by reading the file at the given location (contents: hello atguigu atguigu)
val lineRdd: RDD[String] = sc.textFile("input1")
//3.1 Business logic
val wordRdd: RDD[String] = lineRdd.flatMap(line => line.split(" "))
val wordToOneRdd: RDD[(String, Int)] = wordRdd.map {
word => {
println("************")
(word, 1)
}
}
//3.5 cache adds a step to the lineage; it does not change the existing lineage
println(wordToOneRdd.toDebugString)
//3.4 Cache the data
wordToOneRdd.cache()
//3.6 The storage level can also be changed
// wordToOneRdd.persist(StorageLevel.MEMORY_AND_DISK_2)
//3.2 Trigger the execution
wordToOneRdd.collect()
println("-----------------")
println(wordToOneRdd.toDebugString)
//3.3 Trigger the execution again
wordToOneRdd.collect()
//4. Close the connection
sc.stop()
}
}
mapRdd.cache()
def cache(): this.type = persist()
def persist(): this.type = persist(StorageLevel.MEMORY_ONLY)
object StorageLevel {
val NONE = new StorageLevel(false, false, false, false)
val DISK_ONLY = new StorageLevel(true, false, false, false)
val DISK_ONLY_2 = new StorageLevel(true, false, false, false, 2)
val MEMORY_ONLY = new StorageLevel(false, true, false, true)
val MEMORY_ONLY_2 = new StorageLevel(false, true, false, true, 2)
val MEMORY_ONLY_SER = new StorageLevel(false, true, false, false)
val MEMORY_ONLY_SER_2 = new StorageLevel(false, true, false, false, 2)
val MEMORY_AND_DISK = new StorageLevel(true, true, false, true)
val MEMORY_AND_DISK_2 = new StorageLevel(true, true, false, true, 2)
val MEMORY_AND_DISK_SER = new StorageLevel(true, true, false, false)
val MEMORY_AND_DISK_SER_2 = new StorageLevel(true, true, false, false, 2)
val OFF_HEAP = new StorageLevel(true, true, true, false, 1)
}
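For comparison with cache(), a minimal sketch (assuming an RDD such as wordToOneRdd from the example above) of choosing an explicit storage level via persist:
import org.apache.spark.storage.StorageLevel
// keep the data in memory, spill to disk when it does not fit, and replicate it twice
wordToOneRdd.persist(StorageLevel.MEMORY_AND_DISK_2)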
object cache02 {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Create an RDD by reading the file at the given location (contents: hello atguigu atguigu)
val lineRdd: RDD[String] = sc.textFile("input1")
//3.1 Business logic
val wordRdd: RDD[String] = lineRdd.flatMap(line => line.split(" "))
val wordToOneRdd: RDD[(String, Int)] = wordRdd.map {
word => {
println("************")
(word, 1)
}
}
// Use reduceByKey, which comes with built-in caching of its shuffle data
val wordByKeyRDD: RDD[(String, Int)] = wordToOneRdd.reduceByKey(_+_)
//3.5 cache adds a step to the lineage; it does not change the existing lineage
println(wordByKeyRDD.toDebugString)
//3.4 Cache the data
//wordByKeyRDD.cache()
//3.2 Trigger the execution
wordByKeyRDD.collect()
println("-----------------")
println(wordByKeyRDD.toDebugString)
//3.3 Trigger the execution again
wordByKeyRDD.collect()
//4. Close the connection
sc.stop()
}
}
object checkpoint01 {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
// A checkpoint directory must be set, otherwise: Checkpoint directory has not been set in the SparkContext
sc.setCheckpointDir("./checkpoint1")
//3. Create an RDD by reading the file at the given location (contents: hello atguigu atguigu)
val lineRdd: RDD[String] = sc.textFile("input1")
//3.1 Business logic
val wordRdd: RDD[String] = lineRdd.flatMap(line => line.split(" "))
val wordToOneRdd: RDD[(String, Long)] = wordRdd.map {
word => {
(word, System.currentTimeMillis())
}
}
//3.5 Add caching to avoid re-running a separate job just for the checkpoint
// wordToOneRdd.cache()
//3.4 Checkpoint: checkpoint the computation of wordToOneRdd
wordToOneRdd.checkpoint()
//3.2 Trigger the execution
wordToOneRdd.collect().foreach(println)
// A new job is launched immediately just to compute the checkpoint
//3.3 Trigger the execution again
wordToOneRdd.collect().foreach(println)
wordToOneRdd.collect().foreach(println)
Thread.sleep(10000000)
//4. Close the connection
sc.stop()
}
}
object checkpoint02 {
def main(args: Array[String]): Unit = {
// Set the user name for accessing the HDFS cluster
System.setProperty("HADOOP_USER_NAME","atguigu")
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
// The checkpoint path must be set; create the /checkpoint directory on the HDFS cluster in advance
sc.setCheckpointDir("hdfs://hadoop102:9000/checkpoint")
//3. Create an RDD by reading the file at the given location (contents: hello atguigu atguigu)
val lineRdd: RDD[String] = sc.textFile("input1")
//3.1 Business logic
val wordRdd: RDD[String] = lineRdd.flatMap(line => line.split(" "))
val wordToOneRdd: RDD[(String, Long)] = wordRdd.map {
word => {
(word, System.currentTimeMillis())
}
}
//3.4 Add caching to avoid re-running a separate job just for the checkpoint
wordToOneRdd.cache()
//3.3 Checkpoint: checkpoint the computation of wordToOneRdd
wordToOneRdd.checkpoint()
//3.2 Trigger the execution
wordToOneRdd.collect().foreach(println)
//4. Close the connection
sc.stop()
}
}
object partitioner01_get {
def main(args: Array[String]): Unit = {
//1. Create a SparkConf and set the application name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark application
val sc: SparkContext = new SparkContext(conf)
//3. Create the RDD
val pairRDD: RDD[(Int, Int)] = sc.makeRDD(List((1,1),(2,2),(3,3)))
//3.1 Print the partitioner
println(pairRDD.partitioner)
//3.2 Repartition the RDD with a HashPartitioner
val partitionRDD: RDD[(Int, Int)] = pairRDD.partitionBy(new HashPartitioner(2))
//3.3 Print the partitioner of the new RDD
println(partitionRDD.partitioner)
//4. Close the connection
sc.stop()
}