/**
 * Table-row case class for requirement 4:
 * the top-10 most active sessions of each top-10 popular category.
 *
 * @param taskid     UUID of the analysis task this row belongs to
 * @param categoryid id of the top-10 category
 * @param sessionid  id of an active session that clicked the category
 * @param clickCount how many times this session clicked the category
 */
case class Top10Session(
taskid:String,
categoryid:Long,
sessionid:String,
clickCount:Long
)
/**
 * (Step 9, requirement 4)
 * For each of the top-10 popular categories, find its top-10 most active
 * sessions (ranked by click count) and write them to MySQL.
 *
 * @param sparkSession              active SparkSession (implicits + JDBC write)
 * @param taskUUID                  unique id of this analysis task
 * @param sessionId2FilterActionRDD filtered user actions keyed by session id
 * @param top10CategoryArray        (sortKey, countInfo) pairs of the top-10 categories
 */
def top10ActiveSession(sparkSession: SparkSession,
taskUUID: String,
sessionId2FilterActionRDD: RDD[(String, UserVisitAction)],
top10CategoryArray:Array[(SortKey,String)])={
//todo Step 1: keep only the actions that clicked one of the top-10 categories
// Alternative 1: join (kept commented out for reference)
/*val cid2CountInfoRDD = sparkSession.sparkContext.makeRDD(top10CategoryArray).map{
case (sortKey,countInfo) =>{
val cid = StringUtils.getFieldFromConcatString(countInfo,"\\|",Constants.FIELD_CATEGORY_ID).toLong
(cid,countInfo)
}
}
val cid2ActionRDD = sessionId2FilterActionRDD.map{
case (sessionId,action) =>{
val cid = action.click_category_id
(cid,action)
}
}
val sessionId2ActionRDD = cid2CountInfoRDD.join(cid2ActionRDD).map{
case (cid,(countInfo,action)) =>{
val sid = action.session_id
(sid,action)
}
}*/
// Alternative 2: filter
// cidArray: Array[Long] holds the ids of the top-10 popular categories
val cidArray = top10CategoryArray.map{
case (sortKey,countInfo) =>{
val cid = StringUtils.getFieldFromConcatString(countInfo,"\\|",Constants.FIELD_CATEGORY_ID).toLong
cid
}
}
// actions that passed the session filter AND clicked a top-10 category
val sessionId2ActionRDD = sessionId2FilterActionRDD.filter{
case (sessionId,action) =>{
cidArray.contains(action.click_category_id)
}
}
// todo Step 2: group the surviving actions by session id
val sessionId2GroupRDD: RDD[(String, Iterable[UserVisitAction])] = sessionId2ActionRDD.groupByKey()
// todo Step 3: prefix every count with its session id
// cid2SessionCountRDD:RDD[(cid,sessionCount)]
val cid2SessionCountRDD= sessionId2GroupRDD.flatMap{
case (sessionId,iterableAction) =>{
val categoryCountMap = new mutable.HashMap[Long,Long]()
for(action <- iterableAction){
val cid = action.click_category_id
if(!categoryCountMap.contains(cid)){ // common "initialize key on first sight" pattern
categoryCountMap += (cid->0)
}
categoryCountMap.update(cid,categoryCountMap(cid)+1)
}
// categoryCountMap now holds, for this session, its click count for every category it clicked
for((cid,count) <- categoryCountMap)
yield (cid,sessionId + "=" +count) // yield collects one (cid, "sessionId=count") pair per map entry
}
}
// todo Step 4: group into (cid, all its "sessionId=count" strings), e.g. (8888, [s1=66, s2=99, s8=21])
// cid2GroupRDD: RDD[(cid, IterableSessionCount)]
// each record pairs one categoryId with the click counts of every session that clicked it
val cid2GroupRDD: RDD[(Long, Iterable[String])] = cid2SessionCountRDD.groupByKey()
// todo Step 5: sort, take the top 10, write to the database
// sortWith sorts this category's "sessionId=count" strings by count descending, then take(10)
val top10SessionRDD = cid2GroupRDD.flatMap{
case (cid,iterableSessionCount) =>{
// true: item1 sorts first
// false: item2 sorts first
// item: sessionCount string "sessionId=count"
val sortList = iterableSessionCount.toList.sortWith((item1,item2)=>{
item1.split("=")(1).toLong > item2.split("=")(1).toLong
}).take(10)
// map each kept string to a database-row case class
val top10Session = sortList.map{
// item: sessionCount string "sessionId=count"
case item =>{
val sessionId = item.split("=")(0)
val count = item.split("=")(1).toLong
Top10Session(taskUUID,cid,sessionId,count)
}
}
top10Session
}
}
import sparkSession.implicits._
top10SessionRDD.toDF().write
.format("jdbc")
.option("url",ConfigurationManager.config.getString(Constants.JDBC_URL))
.option("user",ConfigurationManager.config.getString(Constants.JDBC_USER))
.option("password",ConfigurationManager.config.getString(Constants.JDBC_PASSWORD))
.option("dbtable","top10_session")
.mode(SaveMode.Append)
.save
}
def main(args: Array[String]): Unit = {
// read the filter conditions (a JSON string) from configuration
val jsonStr = ConfigurationManager.config.getString(Constants.TASK_PARAMS)
// parse the filter conditions into a JsonObject
val taskParam = JSONObject.fromObject(jsonStr)
// globally unique primary key for this task run
val taskUUID = UUID.randomUUID().toString
// create the SparkConf
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("session")
// create the SparkSession (wraps a SparkContext)
val sparkSession = SparkSession.builder().config(sparkConf).enableHiveSupport().getOrCreate()
// load the raw action table
// actionRDD:RDD[UserVisitAction]
val actionRDD = getOriActionRDD(sparkSession,taskParam)
// debug print 1: confirm the data loaded
// actionRDD.foreach(println(_))
// map-----sessionID2ActionRDD:RDD[(sessionID,UserVisitAction)]
val sessionID2ActionRDD = actionRDD.map(item => (item.session_id, item)) // item plays the role of the usual lambda parameter x
// groupByKey-----sessionID2GroupActionRDD: RDD[(sessionID, Iterable[UserVisitAction])]
val session2GroupActionRDD = sessionID2ActionRDD.groupByKey()
session2GroupActionRDD.cache()
//todo: aggregate the data
// debug print 2
// session2GroupActionRDD.foreach(println(_))
// debug print 3
// val userId2AggrInfoRDD = getSessionFullInfo(sparkSession, session2GroupActionRDD)
// userId2AggrInfoRDD.foreach(println(_))
// debug print 4
val sessionId2FullInfoRDD = getSessionFullInfo(sparkSession, session2GroupActionRDD)
sessionId2FullInfoRDD .foreach(println(_))
// aggregation done; filtering starts here
//5 filter
//todo: filter
// register the custom accumulator
val sessionAccumulator =new SessionAccumulator
sparkSession.sparkContext.register(sessionAccumulator)
// the accumulator is updated as a side effect of the filtering
//sessionId2FilterRDD:RDD[(sessionId,fullInfo)] holds every session that satisfies the filter conditions
//getSessionFilteredRDD: filters sessions by the task limits and updates the accumulator
val sessionId2FilteredRDD =getSessionFilteredRDD(taskParam,sessionId2FullInfoRDD,sessionAccumulator) // sessionAccumulator is passed in
// debug output
sessionId2FilteredRDD.foreach(println(_)) // an action is needed to materialize the RDD
//6 compute ratios and save them to MySQL
//todo: compute ratios, save to MySQL
getSessionRatio(sparkSession,taskUUID,sessionAccumulator.value)
// requirement 2: random session extraction
// 7
// sessionId2FilteredRDD:RDD[(sid,fullInfo)]
//todo: random session extraction
sessionRandomExtract(sparkSession,taskUUID,sessionId2FilteredRDD)
// requirement 3: top-10 popular categories
// sessionId2ActionRDD:RDD[(sessionId,action)]
// sessionId2FilteredRDD:RDD[(sessionId,fullInfo)] sessions that passed the filter
// join to keep only the actions belonging to filtered sessions
val sessionId2FilterActionRDD = sessionID2ActionRDD.join(sessionId2FilteredRDD).map{
case (sessionId,(action,fullInfo)) =>
(sessionId,action)
}
// 8
// top10CategoryArray:Array[(sortKey,countInfo)]
//todo: top-10 categories by click, order and pay counts
val top10CategoryArray = top10PopularCategories(sparkSession,taskUUID,sessionId2FilterActionRDD)
// requirement 4: top-10 active sessions of the top-10 categories
// 9
// sessionId2FilterActionRDD:RDD[(sessionId,action)]
// top10CategoryArray:Array[(sortKey,countInfo)]
// todo: top-10 active sessions per top-10 category
top10ActiveSession(sparkSession,taskUUID,sessionId2FilterActionRDD,top10CategoryArray) // top10PopularCategories must return top10CategoryArray for this call
}
// ==== Requirements 1 through 4 (需求一到四部分) ====
import java.util.{Date, UUID}
import commons.conf.ConfigurationManager
import commons.constant.Constants
import commons.model.{UserInfo, UserVisitAction}
import commons.utils.{DateUtils, NumberUtils, ParamUtils, StringUtils, ValidUtils}
import net.sf.json.JSONObject
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{SaveMode, SparkSession}
import scala.collection.mutable
import scala.collection.mutable.{ArrayBuffer, ListBuffer}
import scala.util.Random
/**
* SessionStat.scala
* 需求一到四部分
*
*/
/**
* 新建一个session module ;
* 然后 scala添加进去session module ;project structure -- global lib --scala 2.11.8 右键 addToModules
*/
object SessionStat {
def main(args: Array[String]): Unit = {
// read the filter conditions (a JSON string) from configuration
val jsonStr = ConfigurationManager.config.getString(Constants.TASK_PARAMS) //task.params.json
// parse the filter conditions into a JsonObject
val taskParam = JSONObject.fromObject(jsonStr)
// globally unique primary key for this task run
val taskUUID = UUID.randomUUID().toString
// create the SparkConf
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("session")
// create the SparkSession (wraps a SparkContext)
val sparkSession = SparkSession.builder().config(sparkConf).enableHiveSupport().getOrCreate()
// load the raw action table
// actionRDD:RDD[UserVisitAction]
val actionRDD = getOriActionRDD(sparkSession,taskParam)
// debug print 1: confirm the data loaded
// actionRDD.foreach(println(_))
// map-----sessionID2ActionRDD:RDD[(sessionID,UserVisitAction)]
val sessionID2ActionRDD = actionRDD.map(item => (item.session_id, item)) // item plays the role of the usual lambda parameter x
// groupByKey-----sessionID2GroupActionRDD: RDD[(sessionID, Iterable[UserVisitAction])]
val session2GroupActionRDD = sessionID2ActionRDD.groupByKey()
session2GroupActionRDD.cache()
//todo: aggregate the data
// debug print 2
// session2GroupActionRDD.foreach(println(_))
// debug print 3
// val userId2AggrInfoRDD = getSessionFullInfo(sparkSession, session2GroupActionRDD)
// userId2AggrInfoRDD.foreach(println(_))
// debug print 4
val sessionId2FullInfoRDD = getSessionFullInfo(sparkSession, session2GroupActionRDD)
sessionId2FullInfoRDD .foreach(println(_))
// aggregation done; filtering starts here
//5 filter
//todo: filter
// register the custom accumulator
val sessionAccumulator =new SessionAccumulator
sparkSession.sparkContext.register(sessionAccumulator)
// the accumulator is updated as a side effect of the filtering
//sessionId2FilterRDD:RDD[(sessionId,fullInfo)] holds every session that satisfies the filter conditions
//getSessionFilteredRDD: filters sessions by the task limits and updates the accumulator
val sessionId2FilteredRDD =getSessionFilteredRDD(taskParam,sessionId2FullInfoRDD,sessionAccumulator) // sessionAccumulator is passed in
// debug output
sessionId2FilteredRDD.foreach(println(_)) // an action is needed to materialize the RDD
//6 compute ratios and save them to MySQL
//todo: compute ratios, save to MySQL
getSessionRatio(sparkSession,taskUUID,sessionAccumulator.value)
// requirement 2: random session extraction
// 7
// sessionId2FilteredRDD:RDD[(sid,fullInfo)]
//todo: random session extraction
sessionRandomExtract(sparkSession,taskUUID,sessionId2FilteredRDD)
// requirement 3: top-10 popular categories
// sessionId2ActionRDD:RDD[(sessionId,action)]
// sessionId2FilteredRDD:RDD[(sessionId,fullInfo)] sessions that passed the filter
// join to keep only the actions belonging to filtered sessions
val sessionId2FilterActionRDD = sessionID2ActionRDD.join(sessionId2FilteredRDD).map{
case (sessionId,(action,fullInfo)) =>
(sessionId,action)
}
// 8
// top10CategoryArray:Array[(sortKey,countInfo)]
//todo: top-10 categories by click, order and pay counts
val top10CategoryArray = top10PopularCategories(sparkSession,taskUUID,sessionId2FilterActionRDD)
// requirement 4: top-10 active sessions of the top-10 categories
// 9
// sessionId2FilterActionRDD:RDD[(sessionId,action)]
// top10CategoryArray:Array[(sortKey,countInfo)]
// todo: top-10 active sessions per top-10 category
top10ActiveSession(sparkSession,taskUUID,sessionId2FilterActionRDD,top10CategoryArray) // top10PopularCategories must return top10CategoryArray for this call
}
/**
 * (Step 9, requirement 4)
 * For each of the top-10 popular categories, compute its top-10 most active
 * sessions (ranked by how many times the session clicked that category) and
 * append them to the MySQL table `top10_session`.
 *
 * @param sparkSession              active SparkSession (implicits + JDBC write)
 * @param taskUUID                  unique id of this analysis task
 * @param sessionId2FilterActionRDD filtered user actions keyed by session id
 * @param top10CategoryArray        (sortKey, countInfo) pairs of the top-10 categories
 */
def top10ActiveSession(sparkSession: SparkSession,
                       taskUUID: String,
                       sessionId2FilterActionRDD: RDD[(String, UserVisitAction)],
                       top10CategoryArray: Array[(SortKey, String)]) = {
  // Step 1: keep only the actions that clicked one of the top-10 categories.
  // cidArray holds the ids of the top-10 popular categories.
  val cidArray = top10CategoryArray.map {
    case (sortKey, countInfo) =>
      StringUtils.getFieldFromConcatString(countInfo, "\\|", Constants.FIELD_CATEGORY_ID).toLong
  }
  // Actions that passed the session filter AND clicked a top-10 category.
  val sessionId2ActionRDD = sessionId2FilterActionRDD.filter {
    case (sessionId, action) => cidArray.contains(action.click_category_id)
  }
  // Step 2: group the surviving actions by session id.
  val sessionId2GroupRDD: RDD[(String, Iterable[UserVisitAction])] = sessionId2ActionRDD.groupByKey()
  // Step 3: per session, count its clicks per category and emit (cid, "sessionId=count").
  val cid2SessionCountRDD = sessionId2GroupRDD.flatMap {
    case (sessionId, iterableAction) =>
      val categoryCountMap = new mutable.HashMap[Long, Long]()
      for (action <- iterableAction) {
        val cid = action.click_category_id
        // getOrElse replaces the original two-step "insert zero, then update" dance
        categoryCountMap(cid) = categoryCountMap.getOrElse(cid, 0L) + 1L
      }
      // categoryCountMap now holds, for this session, its click count for every category it clicked
      for ((cid, count) <- categoryCountMap)
        yield (cid, sessionId + "=" + count)
  }
  // Step 4: group into (cid, all its "sessionId=count" strings), e.g. (8888, [s1=66, s2=99, s8=21])
  val cid2GroupRDD: RDD[(Long, Iterable[String])] = cid2SessionCountRDD.groupByKey()
  // Step 5: per category, sort sessions by click count (descending), take 10, build table rows.
  val top10SessionRDD = cid2GroupRDD.flatMap {
    case (cid, iterableSessionCount) =>
      // sortBy parses each "sessionId=count" string once per element instead of
      // twice per comparison as the original sortWith did
      val sortList = iterableSessionCount.toList
        .sortBy(item => -item.split("=")(1).toLong)
        .take(10)
      sortList.map { item =>
        val sessionId = item.split("=")(0)
        val count = item.split("=")(1).toLong
        Top10Session(taskUUID, cid, sessionId, count)
      }
  }
  // Append the rows to MySQL.
  import sparkSession.implicits._
  top10SessionRDD.toDF().write
    .format("jdbc")
    .option("url", ConfigurationManager.config.getString(Constants.JDBC_URL))
    .option("user", ConfigurationManager.config.getString(Constants.JDBC_USER))
    .option("password", ConfigurationManager.config.getString(Constants.JDBC_PASSWORD))
    .option("dbtable", "top10_session")
    .mode(SaveMode.Append)
    .save()
}
/**
 * (Step 8, helper 1 of requirement 3)
 * Count how many times each category was clicked.
 *
 * @param sessionId2FilterActionRDD filtered actions keyed by session id
 * @return RDD of (categoryId, clickCount)
 */
def getClickCount(sessionId2FilterActionRDD: RDD[(String, UserVisitAction)]) = {
  // Keep click actions only (click_category_id == -1 marks "not a click"),
  // turn each into (categoryId, 1) and sum per category.
  sessionId2FilterActionRDD
    .filter { case (_, action) => action.click_category_id != -1L }
    .map { case (_, action) => (action.click_category_id, 1L) }
    .reduceByKey(_ + _)
}
/**
 * (Step 8, helper 2 of requirement 3)
 * Count how many times each category was ordered.
 *
 * @param sessionId2FilterActionRDD filtered actions keyed by session id
 * @return RDD of (categoryId, orderCount)
 */
def getOrderCount(sessionId2FilterActionRDD: RDD[(String, UserVisitAction)]) = {
  // An order action carries a comma-separated list of category ids,
  // so a single action may contribute to several categories.
  sessionId2FilterActionRDD
    .filter { case (_, action) => action.order_category_ids != null }
    .flatMap { case (_, action) =>
      action.order_category_ids.split(",").map(cid => (cid.toLong, 1L))
    }
    .reduceByKey(_ + _)
}
/**
 * (Step 8, helper 3 of requirement 3)
 * Count how many times each category was paid for.
 *
 * @param sessionId2FilterActionRDD filtered actions keyed by session id
 * @return RDD of (categoryId, payCount)
 */
def getPayCount(sessionId2FilterActionRDD: RDD[(String, UserVisitAction)]) = {
  // A pay action carries a comma-separated list of category ids,
  // so a single action may contribute to several categories.
  sessionId2FilterActionRDD
    .filter { case (_, action) => action.pay_category_ids != null }
    .flatMap { case (_, action) =>
      action.pay_category_ids.split(",").map(cid => (cid.toLong, 1L))
    }
    .reduceByKey(_ + _)
}
/**
 * (Step 8, helper 4 of requirement 3)
 * Join every distinct category id with its click / order / pay counts and
 * fold them into a single "field=value|..." info string per category.
 * leftOuterJoin keeps categories that lack a count for some metric; those
 * metrics default to 0.
 *
 * @param distinctCid2CidRDD distinct (cid, cid) pairs of every seen category
 * @param cid2ClickCountRDD  (cid, clickCount)
 * @param cid2OrderCountRDD  (cid, orderCount)
 * @param cid2PayCountRDD    (cid, payCount)
 * @return RDD of (cid, "categoryid=..|clickCount=..|orderCount=..|payCount=..")
 */
def getFullCount(distinctCid2CidRDD: RDD[(Long, Long)],
                 cid2ClickCountRDD: RDD[(Long, Long)],
                 cid2OrderCountRDD: RDD[(Long, Long)],
                 cid2PayCountRDD: RDD[(Long, Long)]) = {
  val cid2ClickInfoRDD = distinctCid2CidRDD.leftOuterJoin(cid2ClickCountRDD).map {
    case (cid, (categoryId, option)) =>
      // getOrElse(0L) replaces the verbose isDefined/get pattern of the original
      val clickCount = option.getOrElse(0L)
      val aggCount = Constants.FIELD_CATEGORY_ID + "=" + categoryId + "|" +
        Constants.FIELD_CLICK_COUNT + "=" + clickCount
      (cid, aggCount)
  }
  val cid2OrderInfoRDD = cid2ClickInfoRDD.leftOuterJoin(cid2OrderCountRDD).map {
    case (cid, (clickInfo, option)) =>
      val orderCount = option.getOrElse(0L)
      (cid, clickInfo + "|" + Constants.FIELD_ORDER_COUNT + "=" + orderCount)
  }
  val cid2PayInfoRDD = cid2OrderInfoRDD.leftOuterJoin(cid2PayCountRDD).map {
    case (cid, (orderInfo, option)) =>
      val payCount = option.getOrElse(0L)
      (cid, orderInfo + "|" + Constants.FIELD_PAY_COUNT + "=" + payCount)
  }
  // Despite the name, cid2PayInfoRDD now carries the complete count-info string.
  cid2PayInfoRDD
}
/**
 * (Step 8, requirement 3)
 * Compute the top-10 popular categories (by click, order and pay counts),
 * write them to MySQL and return the top-10 array for requirement 4.
 *
 * @param sparkSession              active SparkSession
 * @param taskUUID                  unique id of this analysis task
 * @param sessionId2FilterActionRDD filtered user action data
 */
def top10PopularCategories(sparkSession: SparkSession, taskUUID: String, sessionId2FilterActionRDD: RDD[(String, UserVisitAction)])= {
// todo Step 1: collect every category that was clicked, ordered or paid for
val cid2CidRDD = sessionId2FilterActionRDD.flatMap{
case (sid,action)=>{
val categoryBuffer = new ArrayBuffer[(Long,Long)]()
if(action.click_category_id != -1){ // click action
categoryBuffer += ((action.click_category_id,action.click_category_id))
}else if(action.order_category_ids != null){ // order action
for(orderCid <- action.order_category_ids.split(",")){
categoryBuffer += ((orderCid.toLong,orderCid.toLong))
}
}else if(action.pay_category_ids != null){ // pay action
for(payCid <- action.pay_category_ids.split(",")){
categoryBuffer += ((payCid.toLong,payCid.toLong))
}
}
categoryBuffer
}
}
// de-duplicate the category ids
val distinctCid2CidRDD = cid2CidRDD.distinct()
// todo Step 2: count clicks / orders / payments per category
val cid2ClickCountRDD = getClickCount(sessionId2FilterActionRDD) // note: pass the NON-deduplicated actions
val cid2OrderCountRDD = getOrderCount(sessionId2FilterActionRDD)
val cid2PayCountRDD = getPayCount(sessionId2FilterActionRDD)
// debug print (getClickCount must not be declared Unit, otherwise foreach fails)
// cid2ClickCountRDD.foreach(println(_))
/* sample output (cid,count)
(93,75)
(37,67)...
*/
// todo Step 3: merge the per-category counts, e.g. (category 888, 80 clicks, 30 orders, 2 payments)
val cid2FullCountRDD = getFullCount(distinctCid2CidRDD,cid2ClickCountRDD,cid2OrderCountRDD,cid2PayCountRDD)
// debug print: verify the concatenated info strings
// cid2FullCountRDD.foreach(println(_))
/* sample output
(80,categoryid=80|clickCount=79|orderCount=80|payCount=87)
(22,categoryid=22|clickCount=76|orderCount=68|payCount=89)
(54,categoryid=54|clickCount=78|orderCount=88|payCount=67) ...
*/
// todo Step 4: build the custom secondary-sort key
val sortkey2FullCountRDD = cid2FullCountRDD.map{
case (cid,countInfo) =>{
val clickCount = StringUtils.getFieldFromConcatString(countInfo,"\\|",Constants.FIELD_CLICK_COUNT).toLong
val orderCount = StringUtils.getFieldFromConcatString(countInfo,"\\|",Constants.FIELD_ORDER_COUNT).toLong
val payCount = StringUtils.getFieldFromConcatString(countInfo,"\\|",Constants.FIELD_PAY_COUNT).toLong
// build the custom secondary-sort key
val sortKey = SortKey(clickCount,orderCount,payCount) // sorts ascending on its own
(sortKey,countInfo)
}
}
// sortByKey(false) = descending; note that take returns an Array
val top10CategoryArray: Array[(SortKey, String)] = sortkey2FullCountRDD.sortByKey(false).take(10)
// Array ----> RDD
val top10CategoryRDD =sparkSession.sparkContext.makeRDD(top10CategoryArray).map{
case(sortKey,countInfo) =>{
val cid = StringUtils.getFieldFromConcatString(countInfo,"\\|",Constants.FIELD_CATEGORY_ID).toLong
val clickCount = sortKey.clictCount // field-name typo ("clictCount") comes from the SortKey class itself
val orderCount = sortKey.orderCount
val payCount = sortKey.payCount
Top10Category(taskUUID,cid,clickCount,orderCount,payCount)
}
}
// todo Step 5: write to the database
import sparkSession.implicits._
top10CategoryRDD.toDF().write
.format("jdbc")
.option("url",ConfigurationManager.config.getString(Constants.JDBC_URL))
.option("user",ConfigurationManager.config.getString(Constants.JDBC_USER))
.option("password",ConfigurationManager.config.getString(Constants.JDBC_PASSWORD))
.option("dbtable","top10_category")
.mode(SaveMode.Append)
.save
// return the array so requirement 4 can reuse it
top10CategoryArray
}
/**
 * (Step 7, helper of requirement 2)
 * For every hour of one day, draw that hour's proportional share of random,
 * distinct session indexes and record them in dateHourExtractIndexListMap.
 *
 * @param extractNumberPerDay         how many sessions to extract for the whole day
 * @param dateSessionCount            total number of sessions on that day
 * @param hourCountMap                (hour -> session count) for that day
 * @param dateHourExtractIndexListMap output map (hour -> chosen indexes), filled in place
 */
def generateRandomIndexList(extractNumberPerDay: Int,
                            dateSessionCount: Long,
                            hourCountMap: mutable.HashMap[String, Long],
                            dateHourExtractIndexListMap: mutable.HashMap[String, ListBuffer[Int]]) = {
  val random = new Random()
  for ((hour, count) <- hourCountMap) {
    // this hour's quota = (sessions in this hour / sessions that day) * daily quota
    var hourExrCount = ((count / dateSessionCount.toDouble) * extractNumberPerDay).toInt
    // never draw more indexes than the hour has sessions, otherwise the
    // distinct-index loop below could never terminate
    if (hourExrCount > count) {
      hourExrCount = count.toInt
    }
    // getOrElseUpdate replaces the original's duplicated None/Some branches,
    // which contained two identical copies of the sampling loop
    val indexList = dateHourExtractIndexListMap.getOrElseUpdate(hour, new ListBuffer[Int])
    for (_ <- 0 until hourExrCount) {
      // rejection sampling: re-draw until the index is not already taken
      var index = random.nextInt(count.toInt)
      while (indexList.contains(index)) {
        index = random.nextInt(count.toInt)
      }
      indexList.append(index)
    }
  }
}
/**
 * (Step 7, requirement 2)
 * Randomly extract sessions proportionally to the per-day / per-hour session
 * volume and append them to the MySQL table `session_random_extract`.
 *
 * @param sparkSession          active SparkSession
 * @param taskUUID              unique id of this analysis task
 * @param sessionId2FilteredRDD filtered sessions as (sessionId, fullInfo)
 */
def sessionRandomExtract(sparkSession: SparkSession,
                         taskUUID: String,
                         sessionId2FilteredRDD: RDD[(String, String)]): Unit = {
  // Re-key every session by "yyyy-MM-dd_HH" derived from its start time.
  val dataHourFullInfoRDD = sessionId2FilteredRDD.map {
    case (sid, fullInfo) =>
      val startTime = StringUtils.getFieldFromConcatString(fullInfo, "\\|", Constants.FIELD_START_TIME)
      // (yyyy-MM-dd HH:mm:ss) => (yyyy-MM-dd_HH)
      val dataHour = DateUtils.getDateHour(startTime)
      (dataHour, fullInfo)
  }
  // Session count per (day, hour) key.
  val countMap: collection.Map[String, Long] = dataHourFullInfoRDD.countByKey()
  // Re-shape Map[dateHour -> count] into Map[date -> Map[hour -> count]].
  val dateHourCountMap: mutable.HashMap[String, mutable.HashMap[String, Long]] =
    mutable.HashMap[String, mutable.HashMap[String, Long]]()
  for ((dateHour, count) <- countMap) {
    val date = dateHour.split("_")(0)
    val hour = dateHour.split("_")(1)
    dateHourCountMap.getOrElseUpdate(date, new mutable.HashMap[String, Long]()) += (hour -> count)
  }
  // Proportional sampling:
  //   per-day quota  = total quota (100 here) / number of days
  //   per-hour quota = (sessions in hour / sessions that day) * per-day quota
  // NOTE(review): divides by zero when there is no input data at all — confirm that is acceptable.
  val extractNumberPerDay = 100 / dateHourCountMap.size
  // date -> (hour -> random indexes chosen for that hour)
  val dateHourExtractIndexListMap = new mutable.HashMap[String, mutable.HashMap[String, ListBuffer[Int]]]()
  for ((date, hourCountMap) <- dateHourCountMap) {
    // how many sessions this day has in total
    val dateSessionCount = hourCountMap.values.sum
    val hourIndexMap = dateHourExtractIndexListMap.getOrElseUpdate(date, new mutable.HashMap[String, ListBuffer[Int]]())
    generateRandomIndexList(extractNumberPerDay, dateSessionCount, hourCountMap, hourIndexMap)
  }
  /* At this point every hour has its list of session indexes to extract. */
  // Broadcast the (small) index map so every executor gets a single copy.
  val dateHourExtractIndexListMapBroadcast = sparkSession.sparkContext.broadcast(dateHourExtractIndexListMap)
  // dataHourFullInfoRDD: RDD[(dataHour, fullInfo)]
  val dateHour2GroupRDD = dataHourFullInfoRDD.groupByKey()
  // Walk each (day, hour) bucket and keep the sessions whose position matches a drawn index.
  val extractSessionRDD = dateHour2GroupRDD.flatMap {
    case (dateHour, iterableFullInfo) =>
      val date = dateHour.split("_")(0)
      val hour = dateHour.split("_")(1)
      val extractList = dateHourExtractIndexListMapBroadcast.value(date)(hour)
      val extractSessionArrayBuffer = new ArrayBuffer[SessionRandomExtract]()
      var index = 0
      for (fullInfo <- iterableFullInfo) {
        if (extractList.contains(index)) {
          val sessionId = StringUtils.getFieldFromConcatString(fullInfo, "\\|", Constants.FIELD_SESSION_ID)
          val startTime = StringUtils.getFieldFromConcatString(fullInfo, "\\|", Constants.FIELD_START_TIME)
          // BUGFIX: the original read FIELD_CLICK_CATEGORY_IDS here, so searchKeywords
          // silently duplicated clickCategories; read FIELD_SEARCH_KEYWORDS instead.
          val searchKeywords = StringUtils.getFieldFromConcatString(fullInfo, "\\|", Constants.FIELD_SEARCH_KEYWORDS)
          val clickCategories = StringUtils.getFieldFromConcatString(fullInfo, "\\|", Constants.FIELD_CLICK_CATEGORY_IDS)
          extractSessionArrayBuffer += SessionRandomExtract(taskUUID, sessionId, startTime, searchKeywords, clickCategories)
        }
        index += 1
      }
      extractSessionArrayBuffer
  }
  // Persist the extracted sessions to MySQL.
  import sparkSession.implicits._
  extractSessionRDD.toDF().write
    .format("jdbc")
    .option("url", ConfigurationManager.config.getString(Constants.JDBC_URL))
    .option("user", ConfigurationManager.config.getString(Constants.JDBC_USER))
    .option("password", ConfigurationManager.config.getString(Constants.JDBC_PASSWORD))
    .option("dbtable", "session_random_extract")
    .mode(SaveMode.Append)
    .save()
}
// (Step 6) Compute the ratio of every visit-length / step-length bucket and write the result to MySQL
def getSessionRatio(sparkSession: SparkSession, taskUUID: String, value: mutable.HashMap[String, Int]): Unit = {
// total session count from the accumulator stats; default 1 guards the divisions below
val session_count: Double = value.getOrElse(Constants.SESSION_COUNT,1).toDouble
// session counts per visit-LENGTH bucket
val visit_length_1s_3s = value.getOrElse(Constants.TIME_PERIOD_1s_3s,0) // Spark accumulator key constants
val visit_length_4s_6s = value.getOrElse(Constants.TIME_PERIOD_4s_6s,0)
val visit_length_7s_9s = value.getOrElse(Constants.TIME_PERIOD_7s_9s, 0)
val visit_length_10s_30s = value.getOrElse(Constants.TIME_PERIOD_10s_30s, 0)
val visit_length_30s_60s = value.getOrElse(Constants.TIME_PERIOD_30s_60s, 0)
val visit_length_1m_3m = value.getOrElse(Constants.TIME_PERIOD_1m_3m, 0)
val visit_length_3m_10m = value.getOrElse(Constants.TIME_PERIOD_3m_10m, 0)
val visit_length_10m_30m = value.getOrElse(Constants.TIME_PERIOD_10m_30m, 0)
val visit_length_30m = value.getOrElse(Constants.TIME_PERIOD_30m, 0)
// session counts per visit-STEP bucket
val step_length_1_3 = value.getOrElse(Constants.STEP_PERIOD_1_3,0)
val step_length_4_6 = value.getOrElse(Constants.STEP_PERIOD_4_6, 0)
val step_length_7_9 = value.getOrElse(Constants.STEP_PERIOD_7_9, 0)
val step_length_10_30 = value.getOrElse(Constants.STEP_PERIOD_10_30, 0)
val step_length_30_60 = value.getOrElse(Constants.STEP_PERIOD_30_60, 0)
val step_length_60 = value.getOrElse(Constants.STEP_PERIOD_60, 0)
// ratio of every visit-length and step-length bucket over the total session count
val visit_length_1s_3s_ratio = NumberUtils.formatDouble(visit_length_1s_3s/session_count,2) // ratio kept to 2 decimal places
val visit_length_4s_6s_ratio = NumberUtils.formatDouble(visit_length_4s_6s/session_count,2)
val visit_length_7s_9s_ratio = NumberUtils.formatDouble(visit_length_7s_9s/session_count,2)
val visit_length_10s_30s_ratio = NumberUtils.formatDouble(visit_length_10s_30s/session_count,2)
val visit_length_30s_60s_ratio = NumberUtils.formatDouble(visit_length_30s_60s/session_count,2)
val visit_length_1m_3m_ratio = NumberUtils.formatDouble(visit_length_1m_3m/session_count,2)
val visit_length_3m_10m_ratio = NumberUtils.formatDouble(visit_length_3m_10m/session_count,2)
val visit_length_10m_30m_ratio = NumberUtils.formatDouble(visit_length_10m_30m/session_count,2)
val visit_length_30m_ratio = NumberUtils.formatDouble(visit_length_30m/session_count,2)
val step_length_1_3_ratio = NumberUtils.formatDouble(step_length_1_3 / session_count, 2)
val step_length_4_6_ratio = NumberUtils.formatDouble(step_length_4_6 / session_count, 2)
val step_length_7_9_ratio = NumberUtils.formatDouble(step_length_7_9 / session_count, 2)
val step_length_10_30_ratio = NumberUtils.formatDouble(step_length_10_30 / session_count, 2)
val step_length_30_60_ratio = NumberUtils.formatDouble(step_length_30_60 / session_count, 2)
val step_legth_60_ratio = NumberUtils.formatDouble(step_length_60 / session_count, 2)
// wrap the statistics into the domain object
val stat = SessionAggrStat(
taskUUID,
session_count.toInt,
visit_length_1s_3s_ratio,
visit_length_4s_6s_ratio,
visit_length_7s_9s_ratio,
visit_length_10s_30s_ratio,
visit_length_30s_60s_ratio,
visit_length_1m_3m_ratio,
visit_length_3m_10m_ratio,
visit_length_10m_30m_ratio,
visit_length_30m_ratio,
step_length_1_3_ratio,
step_length_4_6_ratio,
step_length_7_9_ratio,
step_length_10_30_ratio,
step_length_30_60_ratio,
step_legth_60_ratio
)
val sessionRatioRDD = sparkSession.sparkContext.makeRDD(Array(stat))
import sparkSession.implicits._
sessionRatioRDD.toDF().write
.format("jdbc")
.option("url",ConfigurationManager.config.getString(Constants.JDBC_URL)) // configuration helper, reads keys like "jdbc.url"
.option("user",ConfigurationManager.config.getString(Constants.JDBC_USER))
.option("password",ConfigurationManager.config.getString(Constants.JDBC_PASSWORD))
.option("dbtable","session_stat_ratio_0416")
.mode(SaveMode.Append)
.save()
}
// (Helper of step 5) Bump the accumulator bucket that matches the session's
// visit length in seconds. Lengths below 1 second fall into no bucket,
// exactly as in the original if-chain.
def calculateVisitLength(visitLength:Long,sessionAggrStatAccumulator:SessionAccumulator)={
  visitLength match {
    case v if v >= 1 && v <= 3     => sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_1s_3s)
    case v if v >= 4 && v <= 6     => sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_4s_6s)
    case v if v >= 7 && v <= 9     => sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_7s_9s)
    case v if v >= 10 && v <= 30   => sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_10s_30s)
    case v if v > 30 && v <= 60    => sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_30s_60s)
    case v if v > 60 && v <= 180   => sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_1m_3m)
    case v if v > 180 && v <= 600  => sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_3m_10m)
    case v if v > 600 && v <= 1800 => sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_10m_30m)
    case v if v > 1800             => sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_30m)
    case _                         => // shorter than 1s: not counted
  }
}
// (Helper of step 5) Bump the accumulator bucket that matches the session's
// step length (number of actions). Lengths below 1 fall into no bucket,
// exactly as in the original if-chain.
def calculateStepLength(stepLength: Long, sessionAggrStatAccumulator: SessionAccumulator) = {
  stepLength match {
    case s if s >= 1 && s <= 3   => sessionAggrStatAccumulator.add(Constants.STEP_PERIOD_1_3)
    case s if s >= 4 && s <= 6   => sessionAggrStatAccumulator.add(Constants.STEP_PERIOD_4_6)
    case s if s >= 7 && s <= 9   => sessionAggrStatAccumulator.add(Constants.STEP_PERIOD_7_9)
    case s if s >= 10 && s <= 30 => sessionAggrStatAccumulator.add(Constants.STEP_PERIOD_10_30)
    case s if s > 30 && s <= 60  => sessionAggrStatAccumulator.add(Constants.STEP_PERIOD_30_60)
    case s if s > 60             => sessionAggrStatAccumulator.add(Constants.STEP_PERIOD_60)
    case _                       => // fewer than 1 step: not counted
  }
}
/**
 * (Step 5) Filter the aggregated sessions against the task parameters and,
 * for every session that passes, update the session accumulator
 * (total count plus visit-length / step-length buckets).
 *
 * @param taskParam             task parameters parsed from task.params.json
 * @param sessionId2FullInfoRDD (sessionId, fullInfo) aggregated session data
 * @param sessionAccumulator    accumulator updated as a side effect of the filter
 * @return RDD containing only the sessions that satisfy every filter condition
 */
def getSessionFilteredRDD(taskParam: JSONObject, sessionId2FullInfoRDD: RDD[(String, String)],sessionAccumulator:SessionAccumulator) ={
  // Pull the individual limits out of the JSON task parameters, e.g. {startDate:"2018-08-01",...}
  val startAge = ParamUtils.getParam(taskParam, Constants.PARAM_START_AGE)
  val endAge = ParamUtils.getParam(taskParam, Constants.PARAM_END_AGE)
  val professinals = ParamUtils.getParam(taskParam, Constants.PARAM_PROFESSIONALS)
  val cities = ParamUtils.getParam(taskParam, Constants.PARAM_CITIES)
  val sex = ParamUtils.getParam(taskParam, Constants.PARAM_SEX)
  val keywords = ParamUtils.getParam(taskParam, Constants.PARAM_KEYWORDS)
  val categoryIds = ParamUtils.getParam(taskParam, Constants.PARAM_CATEGORY_IDS)
  // Concatenate only the limits that are present into "key=value|key=value|...".
  // BUGFIX: the cities fragment was missing its trailing "|", which glued the
  // following sex field onto the cities value.
  var filterInfo =
    (if (startAge != null) Constants.PARAM_START_AGE + "=" + startAge + "|" else "") +
    (if (endAge != null) Constants.PARAM_END_AGE + "=" + endAge + "|" else "") +
    (if (professinals != null) Constants.PARAM_PROFESSIONALS + "=" + professinals + "|" else "") +
    (if (cities != null) Constants.PARAM_CITIES + "=" + cities + "|" else "") +
    (if (sex != null) Constants.PARAM_SEX + "=" + sex + "|" else "") +
    (if (keywords != null) Constants.PARAM_KEYWORDS + "=" + keywords + "|" else "") +
    (if (categoryIds != null) Constants.PARAM_CATEGORY_IDS + "=" + categoryIds + "|" else "")
  // BUGFIX: endsWith takes a literal string, not a regex; the original "\\|"
  // checked for backslash-pipe and never matched, so the trailing "|" survived.
  if (filterInfo.endsWith("|")) {
    filterInfo = filterInfo.substring(0, filterInfo.length - 1)
  }
  sessionId2FullInfoRDD.filter {
    case (sessionId, fullInfo) =>
      var success = true
      // Each check compares a FIELD_* key of fullInfo against a PARAM_* key of filterInfo.
      // BUGFIX: the professional and sex checks passed FIELD_* constants as the
      // parameter key, but filterInfo is keyed by PARAM_* constants (cf. the age check).
      if (!ValidUtils.between(fullInfo, Constants.FIELD_AGE, filterInfo, Constants.PARAM_START_AGE, Constants.PARAM_END_AGE)) {
        success = false
      } else if (!ValidUtils.in(fullInfo, Constants.FIELD_PROFESSIONAL, filterInfo, Constants.PARAM_PROFESSIONALS)) {
        success = false
      } else if (!ValidUtils.equal(fullInfo, Constants.FIELD_SEX, filterInfo, Constants.PARAM_SEX)) {
        success = false
      } else if (!ValidUtils.in(fullInfo, Constants.FIELD_SEARCH_KEYWORDS, filterInfo, Constants.PARAM_KEYWORDS)) {
        success = false
      } else if (!ValidUtils.in(fullInfo, Constants.FIELD_CLICK_CATEGORY_IDS, filterInfo, Constants.PARAM_CATEGORY_IDS)) {
        success = false
      }
      // NOTE(review): the PARAM_CITIES limit is never checked against any city field — confirm whether that is intentional.
      if (success) {
        // Every field of this session satisfied the limits: update the accumulator.
        // The accumulator maintains a per-key counter and increments it on add().
        sessionAccumulator.add(Constants.SESSION_COUNT)
        val visitLength = StringUtils.getFieldFromConcatString(fullInfo, "\\|", Constants.FIELD_VISIT_LENGTH).toLong
        val stepLength = StringUtils.getFieldFromConcatString(fullInfo, "\\|", Constants.FIELD_STEP_LENGTH).toLong
        calculateVisitLength(visitLength, sessionAccumulator)
        calculateStepLength(stepLength, sessionAccumulator)
      }
      success // filter requires a Boolean result
  }
}
/**
* step2+3+4
* 原数据
* sparkSession,(4bc33302668f4331aba52c8328a781c7,CompactBuffer(UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,6,0000-00-00 12:45:35,联想笔记本,-1,-1,null,null,null,null,0), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,5,0000-00-00 12:46:24,吸尘器,-1,-1,null,null,null,null,5), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,2,0000-00-00 12:18:10,保温杯,-1,-1,null,null,null,null,5), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,8,0000-00-00 12:58:49,null,-1,-1,59,20,null,null,3), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,3,0000-00-00 12:03:04,null,-1,-1,59,78,null,null,5), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,4,0000-00-00 12:02:11,null,-1,-1,22,56,null,null,1), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,2,0000-00-00 12:31:47,null,-1,-1,98,64,null,null,6), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,3,0000-00-00 12:40:32,null,-1,-1,14,89,null,null,8), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,5,0000-00-00 12:47:58,null,-1,-1,null,null,36,75,9), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,3,0000-00-00 12:28:19,null,-1,-1,96,79,null,null,7), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,8,0000-00-00 12:31:50,null,83,3,null,null,null,null,3), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,7,0000-00-00 12:11:05,机器学习,-1,-1,null,null,null,null,5), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,6,0000-00-00 12:58:18,null,-1,-1,null,null,66,25,0), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,9,0000-00-00 12:14:06,null,24,97,null,null,null,null,9), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,4,0000-00-00 12:56:48,null,-1,-1,32,46,null,null,9), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,1,0000-00-00 12:50:12,null,33,84,null,null,null,null,2), 
UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,7,0000-00-00 12:32:32,null,-1,-1,4,15,null,null,5), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,7,0000-00-00 12:02:53,吸尘器,-1,-1,null,null,null,null,5), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,0,0000-00-00 12:04:02,null,-1,-1,44,80,null,null,4), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,1,0000-00-00 12:36:43,null,-1,-1,null,null,60,54,3), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,8,0000-00-00 12:58:52,华为手机,-1,-1,null,null,null,null,5), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,6,0000-00-00 12:54:11,洗面奶,-1,-1,null,null,null,null,5), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,6,0000-00-00 12:09:19,null,42,67,null,null,null,null,2), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,8,0000-00-00 12:44:23,null,44,17,null,null,null,null,7), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,6,0000-00-00 12:09:38,卫生纸,-1,-1,null,null,null,null,8), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,1,0000-00-00 12:05:18,null,18,74,null,null,null,null,1), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,2,0000-00-00 12:24:06,卫生纸,-1,-1,null,null,null,null,4), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,6,0000-00-00 12:32:27,null,74,85,null,null,null,null,8), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,9,0000-00-00 12:18:19,null,-1,-1,null,null,12,89,4), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,3,0000-00-00 12:26:21,保温杯,-1,-1,null,null,null,null,6), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,5,0000-00-00 12:41:48,联想笔记本,-1,-1,null,null,null,null,2), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,7,0000-00-00 12:04:17,null,12,79,null,null,null,null,5), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,0,0000-00-00 
12:17:24,null,-1,-1,93,98,null,null,1), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,7,0000-00-00 12:27:52,null,-1,-1,null,null,58,44,8), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,9,0000-00-00 12:34:27,null,-1,-1,null,null,52,69,7), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,3,0000-00-00 12:16:49,null,-1,-1,null,null,13,90,1), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,4,0000-00-00 12:37:47,null,-1,-1,86,67,null,null,7), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,8,0000-00-00 12:10:56,null,-1,-1,null,null,62,31,7), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,0,0000-00-00 12:01:41,null,4,0,null,null,null,null,7), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,3,0000-00-00 12:48:00,null,-1,-1,87,86,null,null,6)))
*
* step3 我想要得到的聚合信息数据
* Session_Id|Search_Keywords|Click_Category_Id|Visit_Length|Step_Length|Start_Time
*
* step4 我想要得到的聚合信息
* Session_Id|Search_Keywords|Click_Category_Id|Visit_Length|Step_Length|Start_Time|Age|Professional|Sex|City
*
* @param sparkSession
* @param session2GroupActionRDD
*/
def getSessionFullInfo(sparkSession: SparkSession, session2GroupActionRDD: RDD[(String, Iterable[UserVisitAction])]) = { // no explicit Unit: the RDD below is the return value
  // step3: collapse each session's actions into one "k=v|k=v|..." aggregate string, keyed by userId.
  // userId2AggrInfoRDD: RDD[(userId, aggrInfo)]
  val userId2AggrInfoRDD = session2GroupActionRDD.map {
    case (sessionId, iterableAction) =>
      var userId = -1L
      var startTime: Date = null
      var endTime: Date = null
      var stepLength = 0
      val searchKeywords = new StringBuffer("")
      val clickCategories = new StringBuffer("")
      // BUGFIX: dedupe used to rely on StringBuffer substring checks.
      // For categories, `toString.contains(clickCategoryId)` passed a Long to the
      // element-wise SeqOps.contains (comparing against Chars) — always false, so
      // duplicates were appended (visible as repeated ids in the logged output).
      // For keywords, a substring match wrongly dropped a keyword contained in an
      // already-seen one. Track seen values in sets instead; append order is kept.
      val seenKeywords = new mutable.HashSet[String]()
      val seenCategories = new mutable.HashSet[Long]()
      for (action <- iterableAction) {
        // First action in the group fixes the userId (all actions share one session/user).
        if (userId == -1L) {
          userId = action.user_id
        }
        // Grow the [startTime, endTime] window to cover every action's timestamp.
        val actionTime = DateUtils.parseTime(action.action_time)
        if (startTime == null || startTime.after(actionTime)) {
          startTime = actionTime
        }
        if (endTime == null || endTime.before(actionTime)) {
          endTime = actionTime
        }
        // Collect distinct non-empty search keywords (HashSet.add returns true on first insert).
        val searchKeyword = action.search_keyword
        if (StringUtils.isNotEmpty(searchKeyword) && seenKeywords.add(searchKeyword)) {
          searchKeywords.append(searchKeyword + ",")
        }
        // Collect distinct clicked category ids; -1 marks "no click" in this dataset.
        val clickCategoryId = action.click_category_id
        if (clickCategoryId != -1 && seenCategories.add(clickCategoryId)) {
          clickCategories.append(clickCategoryId + ",")
        }
        // One action == one step.
        stepLength += 1
      }
      // Trim the trailing comma left by the append loop.
      val searchKw = StringUtils.trimComma(searchKeywords.toString)
      val clickCg = StringUtils.trimComma(clickCategories.toString)
      // Session duration in seconds.
      val visitLength = (endTime.getTime - startTime.getTime) / 1000
      val aggrInfo = Constants.FIELD_SESSION_ID + "=" + sessionId + "|" +
        Constants.FIELD_SEARCH_KEYWORDS + "=" + searchKw + "|" +
        Constants.FIELD_CLICK_CATEGORY_IDS + "=" + clickCg + "|" +
        Constants.FIELD_VISIT_LENGTH + "=" + visitLength + "|" +
        Constants.FIELD_STEP_LENGTH + "=" + stepLength + "|" +
        Constants.FIELD_START_TIME + "=" + DateUtils.formatTime(startTime) // formatted as yyyy-MM-dd ...
      // Key by userId (not sessionId): the next step joins against user_info,
      // which has no sessionId column — userId is the shared key.
      (userId, aggrInfo)
  }
  // step4: join with the user_info table to enrich each aggregate with user attributes.
  val sql = "select * from user_info"
  import sparkSession.implicits._
  // userId2InfoIdRDD: RDD[(userId, UserInfo)]
  val userId2InfoIdRDD: RDD[(Long, UserInfo)] = sparkSession.sql(sql).as[UserInfo].rdd.map(item => (item.user_id, item))
  val sessionId2FullInfoRDD = userId2AggrInfoRDD.join(userId2InfoIdRDD).map {
    case (userId, (aggrInfo, userInfo)) =>
      val age = userInfo.age
      val professional = userInfo.professional
      val sex = userInfo.sex
      val city = userInfo.city
      val fullInfo = aggrInfo + "|" +
        Constants.FIELD_AGE + "=" + age + "|" +
        Constants.FIELD_PROFESSIONAL + "=" + professional + "|" +
        Constants.FIELD_SEX + "=" + sex + "|" +
        Constants.FIELD_CITY + "=" + city
      // The join only needed userId; switch back to sessionId as the key,
      // recovered from the concatenated aggrInfo string.
      val sessionId = StringUtils.getFieldFromConcatString(aggrInfo, "\\|", Constants.FIELD_SESSION_ID)
      (sessionId, fullInfo)
  }
  // Return the fully enriched per-session info.
  sessionId2FullInfoRDD
}
def getOriActionRDD(sparkSession: SparkSession, taskParam: JSONObject) = {
  // Read the task's date range from its JSON parameters.
  val startDate = ParamUtils.getParam(taskParam, Constants.PARAM_START_DATE)
  val endDate = ParamUtils.getParam(taskParam, Constants.PARAM_END_DATE)
  // NOTE(review): the dates are interpolated straight into the SQL; they come from
  // task configuration here, but parameterize if they can ever be user-supplied.
  val sql = s"select * from user_visit_action where date>='$startDate' and date<='$endDate'"
  // Implicit encoders are required for .as[UserVisitAction].
  import sparkSession.implicits._
  sparkSession.sql(sql).as[UserVisitAction].rdd
}
}
/*
step1 确认数据获取成功
actionRDD.foreach(println(_))
UserVisitAction(0000-00-00,21,d95c6c3cd7164e45ad483525b2132577,3,0000-00-00 16:13:29,null,-1,-1,55,62,null,null,2)
UserVisitAction(0000-00-00,21,d95c6c3cd7164e45ad483525b2132577,5,0000-00-00 16:57:26,null,3,93,null,null,null,null,8)
UserVisitAction(0000-00-00,21,d95c6c3cd7164e45ad483525b2132577,2,0000-00-00 16:34:28,null,-1,-1,null,null,48,35,1)
step2 斧子形的 以session为key 以CompactBuffer(一条条action)为value
session2GroupActionRDD.foreach(println(_))
(4bc33302668f4331aba52c8328a781c7,CompactBuffer(UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,6,0000-00-00 12:45:35,联想笔记本,-1,-1,null,null,null,null,0), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,5,0000-00-00 12:46:24,吸尘器,-1,-1,null,null,null,null,5), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,2,0000-00-00 12:18:10,保温杯,-1,-1,null,null,null,null,5), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,8,0000-00-00 12:58:49,null,-1,-1,59,20,null,null,3), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,3,0000-00-00 12:03:04,null,-1,-1,59,78,null,null,5), UserVisitAction(0000-00-
step3
* Session_Id|Search_Keywords|Click_Category_Id|Visit_Length|Step_Length|Start_Time
(13,sessionid=a300de33934d49c4b02c1525879454bd|searchKeywords=吸尘器,保温杯,Lamer,华为手机,机器学习,洗面奶,小龙虾|clickCategoryIds=26,38,30,17,6,83,40,65,41,4,34,97,41,34,60,82,38,11,79,42,99,35,47,80,1,80,99,64,2|visitLength=3491|stepLength=94|startTime=0000-00-00)
(25,sessionid=5ddf7f6d6c9d485db9287b5ef34e077e|searchKeywords=苹果,吸尘器,保温杯|clickCategoryIds=28,59,96,29,91,21|visitLength=3322|stepLength=21|startTime=0000-00-00)
(41,sessionid=838220c6e46445d7bad52f18d3171bd8|searchKeywords=吸尘器,苹果,保温杯,洗面奶,机器学习,Lamer,华为手机|clickCategoryIds=29,80,46,79,6,98,80,34,41,30,41,1,34,64,13,98,1|visitLength=3458|stepLength=52|startTime=0000-00-00)
step4 聚合信息全部完成
* Session_Id|Search_Keywords|Click_Category_Id|Visit_Length|Step_Length|Start_Time|Age|Professional|Sex|City
(79bfe53461f146cc836f7219351588f0,sessionid=79bfe53461f146cc836f7219351588f0|searchKeywords=卫生纸,联想笔记本,华为手机,小龙虾,洗面奶,吸尘器,苹果,Lamer|clickCategoryIds=8,49,14,13,36,41,27,82,45,47,7,41,32|visitLength=3456|stepLength=67|startTime=0000-00-00|age=22|professional=professional2|sex=male|city=city57)
(228cf3d0777749799436de675ff441a4,sessionid=228cf3d0777749799436de675ff441a4|searchKeywords=洗面奶,卫生纸,联想笔记本,苹果,Lamer,机器学习,小龙虾,吸尘器,保温杯,华为手机|clickCategoryIds=15,90,38,45,66,28,12,15,81,58,62,90,94,90,82,99,11,69|visitLength=3432|stepLength=8
step5 过滤及累加器更新之后
0000-00-00 00:00:00,740 INFO --- [ Executor task launch worker for task 25] org.apache.spark.storage.ShuffleBlockFetcherIterator (line: 54) : Started 0 remote fetches in 6 ms
(4fe6f3eaf9914578bc03308618bdd7a5,sessionid=4fe6f3eaf9914578bc03308618bdd7a5|searchKeywords=苹果,洗面奶,保温杯,机器学习,Lamer,联想笔记本,华为手机,小龙虾|clickCategoryIds=12,92,10,25,75,28,28,18,51,92,39,63,82,70,60,86|visitLength=3391|stepLength=54|startTime=0000-00-00|age=47|professional=professional40|sex=male|city=city28)
(fa00c11085ec4e8eb409ddc35e351902,sessionid=fa00c11085ec4e8eb409ddc35e351902|searchKeywords=小龙虾,洗面奶,华为手机,苹果,联想笔记本,保温杯,机器学习|clickCategoryIds=51,2,84,89,97,82,79|visitLength=3400|stepLength=37|startTime=0000-00-00|age=29|professional=professional0|sex=male|city=city92)
step6 计算比率存入数据库
然后commerce数据库中就会有一张表session_stat_ratio_0416
*/