Grouped Top-N in Spark RDDs: A Worked Example and Its Optimizations

Contents

Grouped top-N with Scala's List sort: a performance bottleneck and a risk of memory overflow

Sorting each subject with RDD's sortBy method, which uses memory plus disk

A custom partitioner that partitions by subject

Optimizing the custom partitioning to reduce shuffles

Grouped top-N with Scala's List sort: a performance bottleneck and a risk of memory overflow

groupBy shuffles every record of a subject into a single in-memory Iterable, and toList.sortBy then materializes and sorts the whole group inside one task, so a heavily skewed subject can exhaust that executor's memory.

package rdd

import java.net.URL

import org.apache.spark.{SparkConf, SparkContext}

object FavObjTeacher {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("FavObjTeacher")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("C:\\Users\\admin\\Desktop\\test\\teacher.log")
    // sample line: http://bigdata.edu360.cn/laozhang
    val subjectTeacherAndOne = lines.map(x => {
      val index = x.lastIndexOf("/")
      val teacher = x.substring(index + 1)
      val httpHost = x.substring(0, index)
      val subject = new URL(httpHost).getHost.split("[.]")(0)
      // emit the classified record straight from map
      ((subject, teacher), 1)
    })
    // aggregate, using (subject, teacher) as the combined key
    val reduced = subjectTeacherAndOne.reduceByKey(_ + _)
    // group by subject for the per-group sort
    val grouped = reduced.groupBy(_._1._1)
    // convert each group to a List and sort it with Scala's collection API
    val sorted = grouped.mapValues(_.toList.sortBy(_._2).reverse.take(3))
    val r = sorted.collect()
    println(r.toBuffer)
  }
}
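groupBy still ships all of a subject's records to one task, but the memory used by the sort itself can be bounded. A minimal sketch (not part of the original post) that keeps at most three entries per group while folding over it once, assuming the grouped RDD from the listing above:

// fold over each group once, capping the accumulator at three entries
val topN = grouped.mapValues(it =>
  it.foldLeft(List.empty[((String, String), Int)]) { (acc, e) =>
    // sorting a list of at most four elements is cheap and bounded
    (e :: acc).sortBy(-_._2).take(3)
  }
)
println(topN.collect().toBuffer)

This only bounds the sort's working set; the shuffle skew of groupBy itself remains, which is what the later sections address.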

Sorting each subject with RDD's sortBy method, which uses memory plus disk

Instead of sorting a Scala List inside one task, filter out one subject at a time and sort it with the RDD's own sortBy, which runs distributed and can spill to disk, so a large subject no longer has to fit into a single task's memory.

package rdd

import java.net.URL

import org.apache.spark.{SparkConf, SparkContext}

object FavObjTeacher {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("FavObjTeacher")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("C:\\Users\\admin\\Desktop\\test\\teacher.log")
    // sample line: http://bigdata.edu360.cn/laozhang
    val subjectTeacherAndOne = lines.map(x => {
      val index = x.lastIndexOf("/")
      val teacher = x.substring(index + 1)
      val httpHost = x.substring(0, index)
      val subject = new URL(httpHost).getHost.split("[.]")(0)
      // emit the classified record straight from map
      ((subject, teacher), 1)
    })
    // aggregate, using (subject, teacher) as the combined key
    val reduced = subjectTeacherAndOne.reduceByKey(_ + _)
    val subjects = Array("bigdata", "javaee", "php")
    // filter out one subject at a time and let the RDD's distributed
    // sortBy (memory + disk) do the ordering
    for (sb <- subjects) {
      val filtered = reduced.filter(_._1._1 == sb)
      val favTeacher = filtered.sortBy(_._2, false).take(3)
      println(favTeacher.toBuffer)
    }
    sc.stop()
  }
}
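When only the top three per subject are needed, the full sort can be skipped entirely. A minimal alternative sketch (not part of the original post), using RDD's built-in top, which keeps a bounded number of elements per partition and merges them on the driver:

// assumes `reduced` and `subjects` from the listing above
for (sb <- subjects) {
  val favTeacher = reduced.filter(_._1._1 == sb)
    .top(3)(Ordering.by[((String, String), Int), Int](_._2))
  println(favTeacher.toBuffer)
}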

A custom partitioner that partitions by subject

This version combines a custom Partitioner with partitionBy and mapPartitions; mapPartitions hands the function an iterator over one whole partition and expects an iterator back.

package rdd

import java.net.URL

import org.apache.spark.{Partitioner, SparkConf, SparkContext}

import scala.collection.mutable

object FavTeacher2 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("FavObjTeacher")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("C:\\Users\\admin\\Desktop\\test\\teacher.log")
    // sample line: http://bigdata.edu360.cn/laozhang
    val subjectTeacherAndOne = lines.map(x => {
      val index = x.lastIndexOf("/")
      val teacher = x.substring(index + 1)
      val httpHost = x.substring(0, index)
      val subject = new URL(httpHost).getHost.split("[.]")(0)
      // emit the classified record straight from map
      ((subject, teacher), 1)
    })
    // aggregate, using (subject, teacher) as the combined key
    val reduced = subjectTeacherAndOne.reduceByKey(_ + _)
    // find out how many subjects there are
    val subjects = reduced.map(_._1._1).distinct().collect()
    val sbPartitioner = new SubjectPartitioner(subjects)
    // repartition according to the custom partitioning rule
    val partitionedRDD = reduced.partitionBy(sbPartitioner)
    // process one whole partition at a time with mapPartitions
    val sorted = partitionedRDD.mapPartitions(it => {
      it.toList.sortBy(_._2).reverse.take(3).iterator
    })
    val r = sorted.collect()
    println(r.toBuffer)
  }
}

// the custom partitioner
class SubjectPartitioner(sbs: Array[String]) extends Partitioner {
  // this runs in the primary constructor: build the subject -> partition-id rules
  val rules = new mutable.HashMap[String, Int]()
  var i = 0 // partition number
  for (elem <- sbs) {
    rules(elem) = i
    i += 1
  }

  // the number of partitions, i.e. how many partitions the next RDD will have
  override def numPartitions: Int = sbs.length

  // map a key (a (subject, teacher) tuple) to its partition id
  override def getPartition(key: Any): Int =
    rules(key.asInstanceOf[(String, String)]._1)
}
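A quick way to check that the partitioner behaves as intended (an illustrative snippet, not part of the original post) is to print which subject landed in which partition with mapPartitionsWithIndex, assuming the partitionedRDD from the listing above:

partitionedRDD.mapPartitionsWithIndex((idx, it) =>
  it.map(t => s"partition $idx -> subject ${t._1._1}")
).distinct().collect().foreach(println)

Every partition should report exactly one subject, which is what lets mapPartitions treat a partition as a complete per-subject group.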

Optimizing the custom partitioning to reduce shuffles

reduceByKey and partitionBy each trigger a shuffle, so chaining them as above moves the data across the network twice. Passing the custom partitioner directly into reduceByKey aggregates and partitions by subject in a single shuffle. The per-partition sort can then use a fixed-size collection instead of a full List (see the sketch after the listing below).

package rdd

import java.net.URL

import org.apache.spark.{Partitioner, SparkConf, SparkContext}

import scala.collection.mutable

object FavTeacher3 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("FavObjTeacher")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("C:\\Users\\admin\\Desktop\\test\\teacher.log")
    // sample line: http://bigdata.edu360.cn/laozhang
    val subjectTeacherAndOne = lines.map(x => {
      val index = x.lastIndexOf("/")
      val teacher = x.substring(index + 1)
      val httpHost = x.substring(0, index)
      val subject = new URL(httpHost).getHost.split("[.]")(0)
      // emit the classified record straight from map
      ((subject, teacher), 1)
    })
    val subjects = subjectTeacherAndOne.map(_._1._1).distinct().collect()
    val sbPartitioner = new SubjectPartitioner(subjects)
    // pass the partitioner into reduceByKey: aggregation and partitioning
    // now happen in a single shuffle
    val reduced2 = subjectTeacherAndOne.reduceByKey(sbPartitioner, _ + _)
    // process one whole partition at a time with mapPartitions
    val sorted = reduced2.mapPartitions(it => {
      it.toList.sortBy(_._2).reverse.take(3).iterator
    })
    val r = sorted.collect()
    // sorted.saveAsTextFile("")
    println(r.toBuffer)
  }
}

// the custom partitioner (same as in the previous section)
class SubjectPartitioner(sbs: Array[String]) extends Partitioner {
  // this runs in the primary constructor: build the subject -> partition-id rules
  val rules = new mutable.HashMap[String, Int]()
  var i = 0 // partition number
  for (elem <- sbs) {
    rules(elem) = i
    i += 1
  }

  // the number of partitions, i.e. how many partitions the next RDD will have
  override def numPartitions: Int = sbs.length

  // map a key (a (subject, teacher) tuple) to its partition id
  override def getPartition(key: Any): Int =
    rules(key.asInstanceOf[(String, String)]._1)
}
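The listing above still calls it.toList inside mapPartitions, so one large subject can still force a large in-memory List sort. A minimal sketch of the fixed-size collection idea mentioned earlier (my own illustration, assuming top 3, the reduced2 RDD from the listing above, and the mutable import already present there): a bounded mutable.TreeSet that never holds more than three entries per partition.

// drop-in replacement for the mapPartitions block above
val sorted = reduced2.mapPartitions(it => {
  // order by count, breaking ties by teacher name so entries with
  // equal counts are not collapsed by the set
  implicit val ord: Ordering[((String, String), Int)] =
    Ordering.by(t => (t._2, t._1._2))
  val set = new mutable.TreeSet[((String, String), Int)]()
  it.foreach(t => {
    set += t
    if (set.size > 3) set -= set.head // evict the current minimum
  })
  set.toList.reverse.iterator // largest counts first
})

This keeps the per-partition memory constant regardless of how many teachers a subject has, which is the point of building a fixed-length collection during the per-partition pass.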
