Table of Contents
Group and take the top N by sorting a Scala List in memory: performance bottleneck, risk of memory overflow
Call the RDD sortBy method to sort the data of each RDD, using memory plus disk
A custom partitioner that partitions by subject
Optimizing the custom partitioning to reduce shuffles
Group and take the top N by sorting a Scala List in memory: performance bottleneck, risk of memory overflow
package rdd

import java.net.URL
import org.apache.spark.{SparkConf, SparkContext}

object FavObjTeacher {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("FavObjTeacher")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("C:\\Users\\admin\\Desktop\\test\\teacher.log")
    // each line looks like: http://bigdata.edu360.cn/laozhang
    val subjectTeacherAndOne = lines.map(x => {
      val index = x.lastIndexOf("/")
      val teacher = x.substring(index + 1)
      val httpHost = x.substring(0, index)
      val subject = new URL(httpHost).getHost.split("[.]")(0)
      // emit the pair directly from map: key = (subject, teacher), value = 1
      ((subject, teacher), 1)
    })
    // aggregate, using the (subject, teacher) tuple as the combined key
    val reduced = subjectTeacherAndOne.reduceByKey(_ + _)
    // group by subject before sorting
    val grouped = reduced.groupBy(_._1._1)
    // convert each group to a Scala List and sort it with Scala collection operations (all in memory)
    val sorted = grouped.mapValues(_.toList.sortBy(_._2).reverse.take(3))
    val r = sorted.collect()
    println(r.toBuffer)
  }
}
Call the RDD sortBy method to sort the data of each RDD, using memory plus disk
package rdd

import java.net.URL
import org.apache.spark.{SparkConf, SparkContext}

object FavObjTeacher {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("FavObjTeacher")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("C:\\Users\\admin\\Desktop\\test\\teacher.log")
    // http://bigdata.edu360.cn/laozhang
    val subjectTeacherAndOne = lines.map(x => {
      val index = x.lastIndexOf("/")
      val teacher = x.substring(index + 1)
      val httpHost = x.substring(0, index)
      val subject = new URL(httpHost).getHost.split("[.]")(0)
      // emit the pair directly from map: key = (subject, teacher), value = 1
      ((subject, teacher), 1)
    })
    // aggregate, using the (subject, teacher) tuple as the combined key
    val reduced = subjectTeacherAndOne.reduceByKey(_ + _)
    val subjects = Array("bigdata", "javaee", "php")
    for (sb <- subjects) {
      // keep only the records of the current subject
      val filtered = reduced.filter(_._1._1 == sb)
      // sortBy sorts the whole RDD (spilling to disk when needed), then take the top 3
      val favTeacher = filtered.sortBy(_._2, false).take(3)
      println(favTeacher.toBuffer)
    }
  }
}
A custom partitioner that partitions by subject
Use a custom Partitioner with partitionBy, then process each partition with mapPartitions; note that mapPartitions takes an iterator over one partition as input and must return an iterator as output.
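Before the full example, here is a minimal, self-contained sketch of that iterator-in, iterator-out contract. The object name and the toy data are illustrative only and are not part of the original teacher example:

import org.apache.spark.{SparkConf, SparkContext}

object MapPartitionsDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[*]").setAppName("MapPartitionsDemo"))
    // 10 numbers spread over 2 partitions
    val nums = sc.parallelize(1 to 10, 2)
    // mapPartitions is called once per partition: it receives an Iterator[Int]
    // for that partition and must return another Iterator (here a one-element sum)
    val partitionSums = nums.mapPartitions(it => Iterator(it.sum))
    println(partitionSums.collect().toBuffer) // e.g. ArrayBuffer(15, 40)
    sc.stop()
  }
}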
import java.net.URL
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
import scala.collection.mutable

object FavTeacher2 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("FavObjTeacher")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("C:\\Users\\admin\\Desktop\\test\\teacher.log")
    // http://bigdata.edu360.cn/laozhang
    val subjectTeacherAndOne = lines.map(x => {
      val index = x.lastIndexOf("/")
      val teacher = x.substring(index + 1)
      val httpHost = x.substring(0, index)
      val subject = new URL(httpHost).getHost.split("[.]")(0)
      // emit the pair directly from map: key = (subject, teacher), value = 1
      ((subject, teacher), 1)
    })
    // aggregate, using the (subject, teacher) tuple as the combined key
    val reduced = subjectTeacherAndOne.reduceByKey(_ + _)
    // find out how many subjects there are
    val subjects = reduced.map(_._1._1).distinct().collect()
    val sbPartitioner = new SubjectPartitioner(subjects)
    // call partitionBy to repartition the data according to the custom partitioner
    val partitionedRDD = reduced.partitionBy(sbPartitioner)
    // take one partition at a time and operate on its data with the mapPartitions operator
    val sorted = partitionedRDD.mapPartitions(it => {
      it.toList.sortBy(_._2).reverse.take(3).iterator
    })
    val r = sorted.collect()
    println(r.toBuffer)
  }
}

// custom partitioner
class SubjectPartitioner(sbs: Array[String]) extends Partitioner {
  // return the number of partitions, i.e. how many partitions the next RDD will have
  override def numPartitions: Int = sbs.length

  // the statements below run in the primary constructor:
  // build a rule that maps each subject to a partition number
  val rules = new mutable.HashMap[String, Int]()
  var i = 0 // partition number
  for (elem <- sbs) {
    rules(elem) = i
    i += 1
  }

  // compute the partition number from the key; the key is a (subject, teacher) tuple
  override def getPartition(key: Any): Int = {
    val subject = key.asInstanceOf[(String, String)]._1
    rules(subject)
  }
}
Optimizing the custom partitioning to reduce shuffles
Both reduceByKey and partitionBy trigger a shuffle. By passing the custom partitioner to reduceByKey, the data is aggregated directly into the desired subject partitions during the reduce, so the separate partitionBy (and its extra shuffle) is no longer needed. A further idea is to keep only a fixed-length collection while taking the top N inside each partition; a sketch of that follows the code below.
package rdd

import java.net.URL
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
import scala.collection.mutable

object FavTeacher3 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("FavObjTeacher")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("C:\\Users\\admin\\Desktop\\test\\teacher.log")
    // http://bigdata.edu360.cn/laozhang
    val subjectTeacherAndOne = lines.map(x => {
      val index = x.lastIndexOf("/")
      val teacher = x.substring(index + 1)
      val httpHost = x.substring(0, index)
      val subject = new URL(httpHost).getHost.split("[.]")(0)
      // emit the pair directly from map: key = (subject, teacher), value = 1
      ((subject, teacher), 1)
    })
    val subjects = subjectTeacherAndOne.map(_._1._1).distinct().collect()
    val sbPartitioner = new SubjectPartitioner(subjects)
    // pass the partitioner to reduceByKey so aggregation and partitioning by
    // subject happen in a single shuffle
    val reduced2 = subjectTeacherAndOne.reduceByKey(sbPartitioner, _ + _)
    // take one partition at a time and operate on its data with the mapPartitions operator
    val sorted = reduced2.mapPartitions(it => {
      it.toList.sortBy(_._2).reverse.take(3).iterator
    })
    val r = sorted.collect()
    // sorted.saveAsTextFile("")
    println(r.toBuffer)
  }
}

// custom partitioner (same as in the previous example)
class SubjectPartitioner(sbs: Array[String]) extends Partitioner {
  // return the number of partitions, i.e. how many partitions the next RDD will have
  override def numPartitions: Int = sbs.length

  // the statements below run in the primary constructor:
  // build a rule that maps each subject to a partition number
  val rules = new mutable.HashMap[String, Int]()
  var i = 0 // partition number
  for (elem <- sbs) {
    rules(elem) = i
    i += 1
  }

  // compute the partition number from the key; the key is a (subject, teacher) tuple
  override def getPartition(key: Any): Int = {
    val subject = key.asInstanceOf[(String, String)]._1
    rules(subject)
  }
}
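As a final refinement of the fixed-length-collection idea mentioned above: the it.toList call still materializes an entire partition in memory before sorting. Below is a minimal sketch, not part of the original code, of a drop-in replacement for the mapPartitions block in FavTeacher3 (the name topN is illustrative; it assumes reduced2 and the scala.collection.mutable import from that example):

// Sketch: instead of it.toList (whole partition in memory), stream through the
// iterator and keep at most topN records in a bounded TreeSet.
val topN = 3
val sorted = reduced2.mapPartitions(it => {
  // order by count, breaking ties on teacher name so equal counts are not collapsed
  val ord: Ordering[((String, String), Int)] = Ordering.by(t => (t._2, t._1._2))
  val top = mutable.TreeSet.empty[((String, String), Int)](ord)
  it.foreach { record =>
    top += record
    if (top.size > topN) top -= top.head // evict the current minimum
  }
  top.toList.reverse.iterator // largest counts first
})

This keeps the per-partition memory footprint bounded by topN records instead of the full partition size.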