Requirements
Data format:
http://bigdata.edu360.cn/laozhang
Each line is a website access log; the last path segment is the teacher's name.
javaee
bigdata
php
These hostname prefixes represent the individual sub-subjects.
The task is to compute the global TopN and the grouped (per-subject) TopN.
Notes:
1. Treat the subject and the teacher together as the key.
2. Special characters used as split delimiters (such as ".") must be escaped in the regex.
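A minimal parsing sketch (assuming every line follows the format above; the sample line and variable names are illustrative) shows how a single log line is turned into the ((subject, teacher), 1) record used by the jobs below:

import java.net.URL

val line = "http://bigdata.edu360.cn/laozhang"
val index = line.lastIndexOf("/")
val teacher = line.substring(index + 1)                  // "laozhang"
val host = new URL(line.substring(0, index)).getHost     // "bigdata.edu360.cn"
val subject = host.split("\\.")(0)                       // "bigdata", note the escaped "."
val record = ((subject, teacher), 1)

Global TopN: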
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object DealDataExample6_1 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName(DealDataExample6_1.getClass.getSimpleName)
    val sc = new SparkContext(conf)
    // Read the data
    val data: RDD[String] = sc.textFile("ExampleData/DealDataExample6/input/groupTopN.txt")
    // Split each line into ((subject, teacher), line)
    val cutRes: RDD[((String, String), String)] = data.map({
      t =>
        val cut1 = t.split("\\//")        // "http:" | "bigdata.edu360.cn/laozhang"
        val str = cut1(1)
        val cut2 = str.split("\\.")       // "bigdata" | "edu360" | "cn/laozhang"
        val web: String = cut2(0)
        val cut3 = cut2(2).split("\\/")   // "cn" | "laozhang"
        val teacher: String = cut3(1)
        ((web, teacher), t)
    })
    //println(cutRes.collect().toBuffer)
    // Group by (subject, teacher)
    val groupRes: RDD[((String, String), Iterable[String])] = cutRes.groupByKey()
    val sumRes: RDD[((String, String), String, Int)] = groupRes.map({
      t =>
        val head: String = t._2.head      // one sample line for this key
        val glanceNum = t._2.size         // number of hits for this key
        (t._1, head, glanceNum)
    })
    // Sort globally by hit count, descending
    val sortRes: RDD[((String, String), String, Int)] = sumRes.sortBy(-_._3)
    //println(sortRes.collect().toBuffer)
    sortRes.saveAsTextFile("ExampleData/DealDataExample6/output6_1/")
    sc.stop()
  }
}
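The job above only sorts every (subject, teacher) count globally; it does not yet cut the list down to N entries. A global TopN could be taken on the driver before sc.stop(), for example (a minimal sketch, assuming N = 3):

// Take the first N elements of the globally sorted RDD (N is an assumed value)
val N = 3
val globalTopN: Array[((String, String), String, Int)] = sortRes.take(N)
globalTopN.foreach(println)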
Grouped TopN:
import java.net.URL
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object DealDataExample6_3 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName(DealDataExample6_3.getClass.getSimpleName)
    val sc = new SparkContext(conf)
    // Read the data
    val data: RDD[String] = sc.textFile("ExampleData/DealDataExample6/input/groupTopN.txt")
    // Preprocess: parse each line into ((subject, teacher), 1)
    val cutRes: RDD[((String, String), Int)] = data.map({
      t =>
        val index: Int = t.lastIndexOf("/")
        val teacher: String = t.substring(index + 1, t.length)
        //println(s"${teacher}")
        val url: URL = new URL(t.substring(0, index))
        val host: String = url.getHost
        //println(s"${host}") // bigdata.edu360.cn
        val web = host.split("\\.")(0)
        ((web, teacher), 1)
    })
    // Aggregate hit counts per (subject, teacher)
    val redRes: RDD[((String, String), Int)] = cutRes.reduceByKey(_ + _)
    // Group by subject
    val webRes: RDD[(String, Iterable[((String, String), Int)])] = redRes.groupBy({
      t =>
        t._1._1
    })
    // Sort each subject's teachers by count, descending, and keep the top 10
    val sortRes: RDD[(String, List[((String, String), Int)])] = webRes.mapValues({
      t =>
        t.toList.sortBy(-_._2).take(10)
    })
    // Reformat the values as "teacher,count" strings (only printed here; sortRes is what gets saved)
    val combyRes: RDD[(String, Iterable[String])] = sortRes.mapValues({
      list =>
        list.map({
          kv =>
            kv._1._2.concat(",").concat(kv._2.toString)
        })
    })
    //println(combyRes.collect().toBuffer)
    // Write out
    sortRes.saveAsTextFile("ExampleData/DealDataExample6/output6_3/")
    sc.stop()
  }
}
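For a quick local check, the grouped result can also be collected and printed before the save; with the sample data this prints the same lines as the "Grouped TopN results" shown at the end of this section (a minimal sketch, only suitable for small data):

// Bring the per-subject TopN lists to the driver and print them (small data only)
sortRes.collect().foreach(println)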
Grouped TopN with a custom partitioner:
The partitioner below assigns every (subject, teacher) key of the same subject to the same partition during reduceByKey, so each partition holds exactly one subject and the per-subject TopN can be taken with a single mapPartitions pass.
import java.net.URL
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
import scala.collection.mutable

object DealDataExample6_5_SelfPartitioner {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName(DealDataExample6_5_SelfPartitioner.getClass.getSimpleName)
    val sc = new SparkContext(conf)
    // Define TopN
    val TOPN = 3
    // Read the data
    val data: RDD[String] = sc.textFile("ExampleData/DealDataExample6/input/groupTopN.txt")
    // Preprocess: parse each line into (subject, teacher)
    val cutRes: RDD[(String, String)] = data.map({
      t =>
        val index: Int = t.lastIndexOf("/")
        val teacher: String = t.substring(index + 1, t.length)
        //println(s"${teacher}")
        val url: URL = new URL(t.substring(0, index))
        val host: String = url.getHost
        //println(s"${host}") // bigdata.edu360.cn
        val web = host.split("\\.")(0)
        (web, teacher)
    })
    // Collect the distinct subjects; they define the partitions
    val totalSub: Array[String] = cutRes.keys.distinct().collect()
    // Pair each (subject, teacher) with 1
    val combyRes: RDD[((String, String), Int)] = cutRes.map((_, 1))
    // Aggregate counts, partitioning by subject with the custom partitioner
    val reduceRes: RDD[((String, String), Int)] = combyRes.reduceByKey(new SelfPartitioner(totalSub), _ + _)
    // Each partition now holds a single subject: sort it and take the TopN
    val sortRes: RDD[((String, String), Int)] = reduceRes.mapPartitions({
      t =>
        t.toList.sortBy(-_._2).take(TOPN).iterator
    })
    // Write out to file
    sortRes.saveAsTextFile("ExampleData/DealDataExample6/output6_4_Self/")
    // Release resources
    sc.stop()
  }
}

/**
 * Custom partitioner, e.g.
 * php -> 0
 * javaee -> 1
 * bigdata -> 2
 * (the actual numbering follows the order of totalSub)
 * @param totalSub all distinct subject names
 */
class SelfPartitioner(val totalSub: Array[String]) extends Partitioner {
  // Map each subject name to a partition id
  private val map: mutable.HashMap[String, Int] = new mutable.HashMap[String, Int]()
  var index = 0
  for (sub <- totalSub) {
    map(sub) = index
    index += 1
  }
  /*for (i <- 0 until totalSub.length) {
    map(totalSub(i)) = i
  }*/
  override def numPartitions: Int = totalSub.length
  override def getPartition(key: Any): Int = {
    // Cast the key back to a (subject, teacher) tuple
    val keysRes: (String, String) = key.asInstanceOf[(String, String)]
    // The subject decides the partition
    val subName = keysRes._1
    // Look up the subject's partition id in the map
    map(subName)
  }
}
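A quick standalone check of SelfPartitioner (a minimal sketch; the subject array and its ordering are made up here, whereas in the job totalSub comes from distinct().collect() and may be ordered differently):

val partitioner = new SelfPartitioner(Array("php", "javaee", "bigdata"))
println(partitioner.numPartitions)                        // 3
println(partitioner.getPartition(("bigdata", "laozhao"))) // 2, since "bigdata" was registered third
println(partitioner.getPartition(("php", "laoli")))       // 0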
Grouped TopN results:
(javaee,List(((javaee,laoyang),9), ((javaee,xiaoxu),6)))
(php,List(((php,laoli),3), ((php,laoliu),1)))
(bigdata,List(((bigdata,laozhao),15), ((bigdata,laoduan),6), ((bigdata,laozhang),2)))
Custom-partitioner TopN results:
((javaee,laoyang),9)
((javaee,xiaoxu),6)
((php,laoli),3)
((php,laoliu),1)
((bigdata,laozhao),15)
((bigdata,laoduan),6)
((bigdata,laozhang),2)