Problem 1: the same RDD is reused by several filters, so the source data gets read repeatedly.
Problem: join may produce a Cartesian product. join is implemented on top of cogroup and then pairs every value from one side with every value from the other side for the same key, so a key that occurs several times in both RDDs yields a per-key Cartesian product. Source:
def join[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (V, W))] = self.withScope {
  this.cogroup(other, partitioner).flatMapValues( pair =>
    for (v <- pair._1.iterator; w <- pair._2.iterator) yield (v, w)
  )
}
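To make the Cartesian-product risk concrete, here is a minimal standalone sketch (the object name, RDD names and sample data are made up for illustration): the key "a" appears twice in both RDDs, so the join emits 2 x 2 = 4 records for that key.
import org.apache.spark.{SparkConf, SparkContext}

object JoinCartesianDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("JoinCartesianDemo"))
    val left  = sc.makeRDD(List(("a", 1), ("a", 2)))
    val right = sc.makeRDD(List(("a", 3), ("a", 4)))
    // prints (a,(1,3)), (a,(1,4)), (a,(2,3)), (a,(2,4)) in some order
    left.join(right).collect().foreach(println)
    sc.stop()
  }
}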
Problem 2: cogroup may introduce a shuffle (a performance bottleneck). Source:
def cogroup[W](other: RDD[(K, W)], partitioner: Partitioner)
    : RDD[(K, (Iterable[V], Iterable[W]))] = self.withScope {
  if (partitioner.isInstanceOf[HashPartitioner] && keyClass.isArray) {
    throw new SparkException("HashPartitioner cannot partition array keys.")
  }
  val cg = new CoGroupedRDD[K](Seq(self, other), partitioner)
  cg.mapValues { case Array(vs, w1s) =>
    (vs.asInstanceOf[Iterable[V]], w1s.asInstanceOf[Iterable[W]])
  }
}
class CoGroupedRDD[K: ClassTag](
    @transient var rdds: Seq[RDD[_ <: Product2[K, _]]],
    part: Partitioner)
  extends RDD[(K, Array[Iterable[_]])](rdds.head.context, Nil) {

  override def getDependencies: Seq[Dependency[_]] = {
    rdds.map { rdd: RDD[_] =>
      if (rdd.partitioner == Some(part)) {
        logDebug("Adding one-to-one dependency with " + rdd)
        // narrow dependency: no shuffle
        new OneToOneDependency(rdd)
      } else {
        logDebug("Adding shuffle dependency with " + rdd)
        new ShuffleDependency[K, Any, CoGroupCombiner](
          rdd.asInstanceOf[RDD[_ <: Product2[K, _]]], part, serializer)
      }
    }
  }
  // ... rest of the class omitted
}
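As getDependencies shows, the shuffle only appears when an input RDD's partitioner differs from the one the cogroup uses. Below is a minimal sketch of the narrow-dependency branch, assuming both inputs are pre-partitioned with the same HashPartitioner; this is only an illustration of that branch, not the fix adopted later in this section, which removes cogroup entirely. The object name and sample data are made up.
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

object PrePartitionedJoinDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("PrePartitionedJoinDemo"))
    val partitioner = new HashPartitioner(2)
    // partitionBy shuffles once here, but the subsequent join sees rdd.partitioner == Some(part)
    // for both inputs and builds OneToOneDependency instead of ShuffleDependency.
    val left  = sc.makeRDD(List(("a", 1), ("b", 2))).partitionBy(partitioner)
    val right = sc.makeRDD(List(("a", 3), ("b", 4))).partitionBy(partitioner)
    left.join(right, partitioner).collect().foreach(println)
    sc.stop()
  }
}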
Fix for problem 1: add the following line
fileDatas.persist(StorageLevel.MEMORY_AND_DISK)
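A minimal sketch of where the persist call sits, assuming fileDatas is the RDD read in step 1 of requirement one and that the three filters use the same field indices (6, 8, 10) as the code later in this section; the object and variable names are illustrative.
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.{SparkConf, SparkContext}

object PersistDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("PersistDemo"))
    // Without persist, each filtered RDD below would re-read the text file when an action runs on it.
    val fileDatas: RDD[String] = sc.textFile("data/user_visit_action.txt")
    fileDatas.persist(StorageLevel.MEMORY_AND_DISK)
    // the three reuses: clicks, orders, payments
    val clickDatas = fileDatas.filter(_.split("_")(6) != "-1")
    val orderDatas = fileDatas.filter(_.split("_")(8) != "null")
    val payDatas   = fileDatas.filter(_.split("_")(10) != "null")
    println((clickDatas.count(), orderDatas.count(), payDatas.count()))
    sc.stop()
  }
}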
Fix for problem 2:
Approach:
click data:   (categoryId, clickCount)
order data:   (categoryId, orderCount)
payment data: (categoryId, payCount)
which should finally be merged into (categoryId, (clickCount, orderCount, payCount)).
Transformation idea:
Merging the three data sets into one is essentially an aggregation, so an aggregation operator can do it.
Use reduceByKey (it still shuffles, but it cannot produce a Cartesian product).
However, its reduce function combines two values of the same type into one, (V, V) => V. If the values are reshaped as follows, reduceByKey can be applied directly:
((clickCount, orderCount, payCount), (clickCount, orderCount, payCount)) => (clickCount, orderCount, payCount)
How to convert:
(categoryId, clickCount) => (categoryId, (clickCount, 0, 0))
(categoryId, orderCount) => (categoryId, (0, orderCount, 0))
(categoryId, payCount)   => (categoryId, (0, 0, payCount))
Final merged form:
(categoryId, (clickCount, orderCount, payCount))
Modify step 5 of requirement one as follows:
// TODO 5: sort the aggregated results by clicks first, then orders, then payments
// (categoryId, clickCount) => (categoryId, (clickCount, 0, 0))
// (categoryId, orderCount) => (categoryId, (0, orderCount, 0))
// (categoryId, payCount)   => (categoryId, (0, 0, payCount))
val clickMapDatas: RDD[(String, (Int, Int, Int))] = clickCntDatas.map {
  case (cid, cnt) => (cid, (cnt, 0, 0))
}
val orderMapDatas: RDD[(String, (Int, Int, Int))] = orderCntDatas.map {
  case (cid, cnt) => (cid, (0, cnt, 0))
}
val payMapDatas: RDD[(String, (Int, Int, Int))] = payCntDatas.map {
  case (cid, cnt) => (cid, (0, 0, cnt))
}
// union the three independent data sets so that reduceByKey can aggregate them
val unionRDD: RDD[(String, (Int, Int, Int))] = clickMapDatas.union(orderMapDatas).union(payMapDatas)
val reduceRDD: RDD[(String, (Int, Int, Int))] = unionRDD.reduceByKey(
  (t1, t2) => (t1._1 + t2._1, t1._2 + t2._2, t1._3 + t2._3)
)
val top10: Array[(String, (Int, Int, Int))] = reduceRDD.sortBy(_._2, false).take(10)
The program above involves four reduceByKey calls in total (clickCntDatas, orderCntDatas and payCntDatas each come from a reduceByKey in the earlier steps, plus the one here), which means four shuffles. Note that sortBy(_._2, false) sorts descending by the (click, order, pay) tuple; tuple ordering is lexicographic, which matches the required priority of clicks, then orders, then payments.
The final form of the data:
(categoryId, (clickCount, orderCount, payCount))
is built from:
(categoryId, clickCount) => (categoryId, (clickCount, 0, 0))
(categoryId, orderCount) => (categoryId, (0, orderCount, 0))
(categoryId, payCount)   => (categoryId, (0, 0, payCount))
where clickCount, orderCount and payCount in each record are already sums.
If we skip those per-action sums, three reduceByKey calls, and therefore three shuffles, can be removed. Each raw record is emitted directly as a tuple of ones and zeros:
(categoryId, click)   => (categoryId, (1, 0, 0))
(categoryId, click)   => (categoryId, (1, 0, 0))
(categoryId, click)   => (categoryId, (1, 0, 0))
(categoryId, order)   => (categoryId, (0, 1, 0))
(categoryId, order)   => (categoryId, (0, 1, 0))
(categoryId, order)   => (categoryId, (0, 1, 0))
(categoryId, payment) => (categoryId, (0, 0, 1))
(categoryId, payment) => (categoryId, (0, 0, 1))
(categoryId, payment) => (categoryId, (0, 0, 1))
Every input line only needs to be converted into one of the three shapes above, which also eliminates the separate filter passes.
Code:
def main(args: Array[String]): Unit = {
  val conf = new SparkConf().setMaster("local").setAppName("HotCategoryTop10")
  val sc = new SparkContext(conf)
  // TODO 1: read the file to get the raw data
  val fileDatas: RDD[String] = sc.textFile("data/user_visit_action.txt")
  // flatMap requires the function to return an iterable collection
  val flatMap: RDD[(String, (Int, Int, Int))] = fileDatas.flatMap(
    data => {
      val datas: Array[String] = data.split("_")
      if (datas(6) != "-1") {
        // click record
        List((datas(6), (1, 0, 0)))
      } else if (datas(8) != "null") {
        // order record: field 8 may contain several category ids
        val ids: Array[String] = datas(8).split(",")
        ids.map(id => (id, (0, 1, 0)))
      } else if (datas(10) != "null") {
        // payment record: field 10 may contain several category ids
        val ids: Array[String] = datas(10).split(",")
        ids.map(id => (id, (0, 0, 1)))
      } else {
        Nil
      }
    }
  )
  val top10: Array[(String, (Int, Int, Int))] = flatMap.reduceByKey(
    (t1, t2) => (t1._1 + t2._1, t1._2 + t2._2, t1._3 + t2._3)
  ).sortBy(_._2, false).take(10)
  top10.foreach(println)
  sc.stop()
}
Implementing requirement one with a custom accumulator
def main(args: Array[String]): Unit = {
  val conf = new SparkConf().setMaster("local").setAppName("HotCategoryTop10")
  val sc = new SparkContext(conf)
  // TODO 1: read the file to get the raw data
  val fileDatas: RDD[String] = sc.textFile("data/user_visit_action.txt")
  // create the accumulator instance
  val acc = new HotCategoryAccumulator()
  // register the accumulator
  sc.register(acc, "HotCategory")
  fileDatas.foreach(
    data => {
      val datas: Array[String] = data.split("_")
      if (datas(6) != "-1") {
        // click record
        acc.add((datas(6), "click"))
      } else if (datas(8) != "null") {
        // order record
        val ids: Array[String] = datas(8).split(",")
        ids.foreach(id => acc.add((id, "order")))
      } else if (datas(10) != "null") {
        // payment record
        val ids: Array[String] = datas(10).split(",")
        ids.foreach(id => acc.add((id, "pay")))
      }
    }
  )
  // TODO fetch the accumulator's result
  val resultMap: mutable.Map[String, HotCategoryCount] = acc.value
  val top10: List[HotCategoryCount] = resultMap.map(_._2).toList.sortWith(
    (left, right) => {
      if (left.clickCnt > right.clickCnt) {
        true
      } else if (left.clickCnt == right.clickCnt) {
        if (left.orderCnt > right.orderCnt) {
          true
        } else if (left.orderCnt == right.orderCnt) {
          left.payCnt > right.payCnt
        } else {
          false
        }
      } else {
        false
      }
    }
  ).take(10)
  top10.foreach(println)
  sc.stop()
}
case class HotCategoryCount(cid: String, var clickCnt: Int, var orderCnt: Int, var payCnt: Int)
// TODO custom hot-category accumulator
// 1. extend AccumulatorV2
// 2. declare the type parameters
//    IN:  (categoryId, actionType)
//    OUT: Map[categoryId, HotCategoryCount]
// 3. override the methods (3 + 3: isZero/copy/reset and add/merge/value)
class HotCategoryAccumulator extends AccumulatorV2[(String, String), mutable.Map[String, HotCategoryCount]] {

  private val map = mutable.Map[String, HotCategoryCount]()

  override def isZero: Boolean = map.isEmpty

  override def copy(): AccumulatorV2[(String, String), mutable.Map[String, HotCategoryCount]] = {
    new HotCategoryAccumulator()
  }

  override def reset(): Unit = map.clear()

  override def add(v: (String, String)): Unit = {
    val (cid, actionType) = v
    val hcc: HotCategoryCount = map.getOrElse(cid, HotCategoryCount(cid, 0, 0, 0))
    actionType match {
      case "click" => hcc.clickCnt += 1
      case "order" => hcc.orderCnt += 1
      case "pay"   => hcc.payCnt += 1
    }
    map.update(cid, hcc)
  }

  override def merge(other: AccumulatorV2[(String, String), mutable.Map[String, HotCategoryCount]]): Unit = {
    other.value.foreach {
      case (cid, otherHcc) =>
        val thisHcc: HotCategoryCount = map.getOrElse(cid, HotCategoryCount(cid, 0, 0, 0))
        thisHcc.clickCnt += otherHcc.clickCnt
        thisHcc.orderCnt += otherHcc.orderCnt
        thisHcc.payCnt += otherHcc.payCnt
        map.update(cid, thisHcc)
    }
  }

  override def value: mutable.Map[String, HotCategoryCount] = map
}
Reworking requirement one into a layered, framework-style structure so that its result can be handed to other application code
object HotCategoryTop10Application extends App with TApplication {
  execute(appName = "HotCategoryTop10") {
    // hand the user request over to the Controller layer
    val controller = new HotCategoryTop10Controller
    // dispatch the request; the Controller prints the result itself
    controller.dispatch()
  }
}

class HotCategoryTop10Controller extends TController {
  private val hotCategoryTop10Service = new HotCategoryTop10Service

  override def dispatch(): Unit = {
    val result: Array[(String, (Int, Int, Int))] = hotCategoryTop10Service.analysis()
    result.foreach(println)
  }
}
class HotCategoryTop10Service extends TService {
  private val hotCategoryTop10Dao = new HotCategoryTop10Dao

  override def analysis() = {
    val fileDatas: RDD[String] = hotCategoryTop10Dao.readFileBySpark("data/user_visit_action.txt")
    // flatMap requires the function to return an iterable collection
    val flatMap: RDD[(String, (Int, Int, Int))] = fileDatas.flatMap(
      data => {
        val datas: Array[String] = data.split("_")
        if (datas(6) != "-1") {
          // click record
          List((datas(6), (1, 0, 0)))
        } else if (datas(8) != "null") {
          // order record
          val ids: Array[String] = datas(8).split(",")
          ids.map(id => (id, (0, 1, 0)))
        } else if (datas(10) != "null") {
          // payment record
          val ids: Array[String] = datas(10).split(",")
          ids.map(id => (id, (0, 0, 1)))
        } else {
          Nil
        }
      }
    )
    val top10: Array[(String, (Int, Int, Int))] = flatMap.reduceByKey(
      (t1, t2) => (t1._1 + t2._1, t1._2 + t2._2, t1._3 + t2._3)
    ).sortBy(_._2, false).take(10)
    top10
  }
}
class HotCategoryTop10Dao extends TDao{
}
// common logic extracted into the Application layer
trait TApplication {
  def execute(master: String = "local[*]", appName: String)(op: => Unit): Unit = {
    val conf = new SparkConf().setMaster(master).setAppName(appName)
    val sc = new SparkContext(conf)
    EnvCache.put(sc)
    try {
      op
    } catch {
      case e: Exception => e.printStackTrace()
    }
    sc.stop()
    EnvCache.clear()
  }
}
// entry point of the Controller layer, implemented by each concrete controller
trait TController {
  def dispatch(): Unit
}
trait TDao {
  // read a plain file and return its lines
  def readFile(path: String) = {
    // here EnvCache.get() is expected to hold the file root directory
    val source: BufferedSource = Source.fromFile(EnvCache.get() + path)
    val lines = source.getLines().toList
    source.close()
    lines
  }

  def readFileBySpark(path: String) = {
    // here EnvCache.get() holds the SparkContext put there by TApplication.execute
    EnvCache.get().asInstanceOf[SparkContext].textFile(path)
  }
}
// analysis entry points of the Service layer, implemented by each concrete service
trait TService {
  def analysis(): Any = {}
  def analysis(data: Any): Any = {}
}
object EnvCache {
  // a per-thread slot: data put here can be read back from any layer running on the same thread
  // ThreadLocal does not make anything thread-safe; it only shares data within one thread
  private val envCache: ThreadLocal[Object] = new ThreadLocal[Object]

  def put(data: Object): Unit = {
    envCache.set(data)
  }

  def get() = {
    envCache.get()
  }

  def clear(): Unit = {
    envCache.remove()
  }
}