Here is the data:
http://bigdata.edu360.cn/laozhang
http://bigdata.edu360.cn/laozhang
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laozhao
http://bigdata.edu360.cn/laoduan
http://bigdata.edu360.cn/laoduan
http://javaee.edu360.cn/xiaoxu
http://javaee.edu360.cn/xiaoxu
http://javaee.edu360.cn/laoyang
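Each line is a URL: the subject is the first label of the host (e.g. bigdata in bigdata.edu360.cn) and the teacher is the last path segment. A minimal standalone sketch of the parsing used in the code below (plain Scala, no Spark needed):

import java.net.URL
val line = "http://bigdata.edu360.cn/laozhang"
val teacher = line.substring(line.lastIndexOf("/") + 1)  // "laozhang"
val host = new URL(line).getHost                         // "bigdata.edu360.cn"
val subject = host.substring(0, host.indexOf("."))       // "bigdata"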
Code:
package day02

import java.net.URL

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

/**
  * @author WangLeiKai
  * 2018/9/19 8:36
  * Get the top three most popular teachers per subject
  */
object FavSubTeacher {
  def main(args: Array[String]): Unit = {
    // program entry point
    val conf = new SparkConf().setAppName("FavSubTeacher").setMaster("local[*]")
    val sc = new SparkContext(conf)
    // read the local file
    val lines: RDD[String] = sc.textFile("F:\\上课画图\\spark 02\\课件与代码\\teacher(1).log")
    // parse each line into ((subject, teacher), 1)
    val subjectAndTeacher: RDD[((String, String), Int)] = lines.map(line => {
      val teacher: String = line.substring(line.lastIndexOf("/") + 1)
      // use Java's URL class to get the host
      val host = new URL(line).getHost
      // slice the host again to get the subject
      val subject = host.substring(0, host.indexOf("."))
      ((subject, teacher), 1)
    })
    // aggregate the counts (reduceByKey combines map-side before shuffling)
    val reduced: RDD[((String, String), Int)] = subjectAndTeacher.reduceByKey(_ + _)
    // group by subject
    val grouped: RDD[(String, Iterable[((String, String), Int)])] = reduced.groupBy(_._1._1)
    // an Iterable has no sort method, so convert it to a List first
    val result: RDD[(String, List[((String, String), Int)])] = grouped.mapValues(_.toList.sortBy(_._2).reverse.take(3))
    // print the result
    println(result.collect().toBuffer)
    // release resources
    sc.stop()
  }
}
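On the sample data above this should print something close to the following (the order of subjects, and of teachers with equal counts, is not guaranteed):

ArrayBuffer((javaee,List(((javaee,xiaoxu),2), ((javaee,laoyang),1))), (bigdata,List(((bigdata,laozhao),5), ((bigdata,laoduan),2), ((bigdata,laozhang),2))))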
The above is the simplest implementation, but there is still plenty to optimize:
First optimization:
Filter by subject so that each subject's data ends up in its own RDD. The groupBy version above pulls all of one subject's records into a single in-memory List, which can overflow memory; filtering and then sorting the RDD keeps that work distributed.
Implementation: put all the subjects in a collection, iterate over it, and for each subject filter, sort, and take the top N.
Code:
package day02

import java.net.URL

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

/**
  * @author WangLeiKai
  * 2018/9/19 9:47
  */
object FavSubTeacher2 {
  def main(args: Array[String]): Unit = {
    // program entry point
    val conf = new SparkConf().setAppName("FavSubTeacher").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val lines: RDD[String] = sc.textFile("F:\\上课画图\\spark 02\\课件与代码\\teacher(1).log")
    // hard-coded for simplicity; the subjects could also be extracted while parsing the data below
    val subjects = Array("bigdata", "javaee", "php")
    val subjectAndTeacher: RDD[((String, String), Int)] = lines.map(line => {
      val teacher: String = line.substring(line.lastIndexOf("/") + 1)
      val host = new URL(line).getHost
      val subject = host.substring(0, host.indexOf("."))
      ((subject, teacher), 1)
    })
    val reduced: RDD[((String, String), Int)] = subjectAndTeacher.reduceByKey(_ + _)
    for (sb <- subjects) {
      // filter by subject name,
      // so that each iteration works on a single subject's data
      val filtered: RDD[((String, String), Int)] = reduced.filter(_._1._1 == sb)
      val sorted: RDD[((String, String), Int)] = filtered.sortBy(_._2, false)
      val array = sorted.take(3)
      println(array.toBuffer)
    }
    sc.stop()
  }
}
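One caveat with this version: every loop iteration launches a separate Spark job against reduced, so the lineage up to reduced is recomputed once per subject. Caching it before the loop would avoid that recomputation; a one-line sketch (my addition, not in the original code):

reduced.cache()  // keep the aggregated RDD in memory across the per-subject jobs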
Second optimization: use a custom partitioner so that each subject's data lands in its own partition.
Both reduceByKey and repartitioning cause a shuffle. reduceByKey can take a partitioner as an argument; if none is passed, the default partitioner is used. Passing the custom partitioner into reduceByKey merges the aggregation and the repartitioning into a single shuffle.
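For reference, these are the relevant overloads on Spark's PairRDDFunctions (the third one is the one used below):

def reduceByKey(func: (V, V) => V): RDD[(K, V)]                           // default partitioner
def reduceByKey(func: (V, V) => V, numPartitions: Int): RDD[(K, V)]       // fixed number of partitions
def reduceByKey(partitioner: Partitioner, func: (V, V) => V): RDD[(K, V)] // custom partitioner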
Code:
package day02

import java.net.URL

import org.apache.spark.{Partitioner, SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

import scala.collection.mutable

/**
  * Reduce the number of shuffles
  * @author WangLeiKai
  * 2018/9/19 14:42
  */
object FavSubTeacher3 {
  def main(args: Array[String]): Unit = {
    // program entry point
    val conf = new SparkConf().setAppName("FavSubTeacher").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val lines: RDD[String] = sc.textFile("F:\\上课画图\\spark 02\\课件与代码\\teacher(1).log")
    // val subjects = Array("bigdata", "javaee", "php")
    val subjectAndTeacher: RDD[((String, String), Int)] = lines.map(line => {
      val teacher: String = line.substring(line.lastIndexOf("/") + 1)
      val host = new URL(line).getHost
      val subject = host.substring(0, host.indexOf("."))
      ((subject, teacher), 1)
    })
    // collect all the subjects
    val subjects: Array[String] = subjectAndTeacher.map(_._1._1).distinct().collect()
    val sbPartitioner: SubjectPartitioner = new SubjectPartitioner(subjects)
    // reduceByKey can take a partitioner; without one, the default is used
    // and a separate partitionBy would cost an extra shuffle
    val reduced: RDD[((String, String), Int)] = subjectAndTeacher.reduceByKey(sbPartitioner, _ + _)
    // val partitioned: RDD[((String, String), Int)] = reduced.partitionBy(sbPartitioner)
    // each partition now holds exactly one subject, so sort within the partition
    val sorted: RDD[((String, String), Int)] = reduced.mapPartitions(it => {
      it.toList.sortBy(_._2).reverse.take(3).iterator
    })
    val tuples = sorted.collect()
    tuples.foreach(println)
    sc.stop()
  }
}
class SubjectPartitioner(sbs: Array[String]) extends Partitioner {
  // the map holds each subject and its partition number: 0, 1, 2, ...
  private val rules: mutable.HashMap[String, Int] = new mutable.HashMap[String, Int]()
  var index = 0
  for (sb <- sbs) {
    rules.put(sb, index)
    index += 1
  }
  // the number of partitions of the downstream RDD
  override def numPartitions: Int = sbs.length
  // the key here is a (subject, teacher) tuple
  override def getPartition(key: Any): Int = {
    // extract the subject name
    val subject: String = key.asInstanceOf[(String, String)]._1
    // look up the partition number from the rules
    rules(subject)
  }
}
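Note that it.toList inside mapPartitions still materializes an entire partition in memory. If one subject had a very large number of teachers, keeping only the current top three in a bounded sorted set would avoid that. A sketch of that variant (my own addition, reusing the types above; not part of the original code):

val sorted: RDD[((String, String), Int)] = reduced.mapPartitions(it => {
  // order by count, with the teacher name as a tie-breaker so that
  // distinct teachers with equal counts are not collapsed by the set
  implicit val ord: Ordering[((String, String), Int)] = Ordering.by(t => (t._2, t._1._2))
  val topN = mutable.TreeSet.empty[((String, String), Int)]
  it.foreach { t =>
    topN += t
    if (topN.size > 3) topN -= topN.head // evict the current minimum
  }
  topN.toList.reverse.iterator // descending by count
})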