Scala_Spark E-commerce Platform Offline Analysis Project - Requirement 4: Top 10 Active Sessions for Each Top 10 Popular Category

Top10Session.scala - case class mirroring the output database table

/**
 * Case class mirroring the database table for requirement 4:
 * top 10 active sessions for each of the top 10 popular categories.
 *
 * @param taskid     UUID of the current task
 * @param categoryid category id
 * @param sessionid  session id
 * @param clickCount number of clicks this session made on the category
 */

case class Top10Session(
                         taskid:String,
                         categoryid:Long,
                         sessionid:String,
                         clickCount:Long
                       )
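
For reference, one row of the top10_session table written by requirement 4 corresponds to an instance like the following (the values are made up for illustration):

// one record = (taskid, categoryid, sessionid, clickCount)
val sampleRow = Top10Session(
  "9f2b3c1d-example-task-uuid",           // taskid: the task UUID
  88L,                                    // categoryid
  "4bc33302668f4331aba52c8328a781c7",     // sessionid
  66L                                     // clickCount: clicks this session made on category 88
)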

Implementation in SessionStat.scala

 /**
   * (Step 9: requirement 4)
   * Top 10 active sessions for each of the top 10 popular categories
   *
   * @param sparkSession              the SparkSession
   * @param taskUUID                  UUID of the current task
   * @param sessionId2FilterActionRDD actions that passed the session filter, keyed by sessionId
   * @param top10CategoryArray        the (sortKey, countInfo) array returned by requirement 3
   */
  def top10ActiveSession(sparkSession: SparkSession,
                         taskUUID: String,
                         sessionId2FilterActionRDD: RDD[(String, UserVisitAction)],
                         top10CategoryArray:Array[(SortKey,String)])={
    //todo Step 1: keep only the actions whose clicked category is one of the top10 categories
    // Approach 1: join
    /*val cid2CountInfoRDD = sparkSession.sparkContext.makeRDD(top10CategoryArray).map{
      case (sortKey,countInfo) =>{
        val cid = StringUtils.getFieldFromConcatString(countInfo,"\\|",Constants.FIELD_CATEGORY_ID).toLong
        (cid,countInfo)
      }
    }
    val cid2ActionRDD = sessionId2FilterActionRDD.map{
      case (sessionId,action) =>{
        val cid = action.click_category_id
        (cid,action)
      }
    }
    val sessionId2ActionRDD = cid2CountInfoRDD.join(cid2ActionRDD).map{
      case (cid,(countInfo,action)) =>{
        val sid = action.session_id
        (sid,action)
      }
    }*/
    // Approach 2: filter
    // cidArray: Array[Long] holds the ids of the top10 popular categories
    val cidArray = top10CategoryArray.map{
      case (sortKey,countInfo) =>{
        val cid = StringUtils.getFieldFromConcatString(countInfo,"\\|",Constants.FIELD_CATEGORY_ID).toLong
        cid
      }
    }
    // actions that passed the session filter AND clicked one of the top10 categories
    val sessionId2ActionRDD = sessionId2FilterActionRDD.filter{
      case (sessionId,action) =>{
        cidArray.contains(action.click_category_id)
      }
    }

    // todo Step 2: group by sessionId
    val sessionId2GroupRDD: RDD[(String, Iterable[UserVisitAction])] = sessionId2ActionRDD.groupByKey()

    // todo Step 3: prefix each count with its sessionId
    // cid2SessionCountRDD: RDD[(cid, "sessionId=count")]
    val cid2SessionCountRDD= sessionId2GroupRDD.flatMap{
      case (sessionId,iterableAction) =>{
        val categoryCountMap = new mutable.HashMap[Long,Long]()

        for(action <- iterableAction){
          val cid = action.click_category_id
          if(!categoryCountMap.contains(cid)){     // common pattern: initialise the key before incrementing
            categoryCountMap += (cid->0)
          }
          categoryCountMap.update(cid,categoryCountMap(cid)+1)
        }
        // categoryCountMap now holds, for this one session, its click count for every category it clicked
        for((cid,count) <- categoryCountMap)
          yield (cid,sessionId + "=" +count)  // yield collects all the (cid, "sessionId=count") pairs; braces must stay attached to yield (for(...) yield {...}) - a for(...){...} without yield is a plain loop returning Unit, hence the "no return value" error
      }
    }

    // todo Step 4: turn the data into (categoryId, all of its "sessionId=count" strings), e.g. (8888, [session1=66, session2=99, session8=21])
    // cid2GroupRDD: RDD[(cid, Iterable[sessionCount])]
    // each record of cid2GroupRDD is a categoryId together with the click counts of every session that clicked it
    val cid2GroupRDD: RDD[(Long, Iterable[String])] = cid2SessionCountRDD.groupByKey()

    // todo Step 5: sort, take the top 10, write to the database
    // sortWith: for each cid, turn its "sessionId=count" strings into a list sorted by click count and take the first 10
    val top10SessionRDD = cid2GroupRDD.flatMap{
      case (cid,iterableSessionCount) =>{
        // true: item1 comes first
        // false: item2 comes first
        // item: sessionCount String "sessionId=count"
        val sortList = iterableSessionCount.toList.sortWith((item1,item2)=>{
          item1.split("=")(1).toLong > item2.split("=")(1).toLong
        }).take(10)

        // map to the shape of the database table
        val top10Session = sortList.map{
          // item: sessionCount String "sessionId=count"
          case item =>{
            val sessionId = item.split("=")(0)
            val count = item.split("=")(1).toLong
            Top10Session(taskUUID,cid,sessionId,count)
          }
        }
        top10Session
      }
    }

    import sparkSession.implicits._
    top10SessionRDD.toDF().write
      .format("jdbc")
      .option("url",ConfigurationManager.config.getString(Constants.JDBC_URL))
      .option("user",ConfigurationManager.config.getString(Constants.JDBC_USER))
      .option("password",ConfigurationManager.config.getString(Constants.JDBC_PASSWORD))
      .option("dbtable","top10_session")
      .mode(SaveMode.Append)
      .save
  }
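
To make steps 3 to 5 concrete, here is a small plain-Scala sketch (no Spark needed) of the same counting and sorting logic on hand-written sample data; the names sampleClicks, s1, s2 and the category ids are made up for illustration:

import scala.collection.mutable

object Top10SessionLogicDemo {
  def main(args: Array[String]): Unit = {
    // (sessionId, clickedCategoryId) pairs, the shape step 3 sees after grouping by session
    val sampleClicks = Seq(("s1", 88L), ("s1", 88L), ("s1", 42L), ("s2", 88L))

    // Step 3: per session, count clicks per category, then emit (cid, "sessionId=count")
    val cid2SessionCount = sampleClicks.groupBy(_._1).toSeq.flatMap { case (sid, clicks) =>
      val categoryCountMap = new mutable.HashMap[Long, Long]()
      for ((_, cid) <- clicks) {
        if (!categoryCountMap.contains(cid)) categoryCountMap += (cid -> 0)
        categoryCountMap.update(cid, categoryCountMap(cid) + 1)
      }
      for ((cid, count) <- categoryCountMap) yield (cid, sid + "=" + count)
    }

    // Steps 4+5: group by category, sort each group's "sessionId=count" strings by count descending, keep the top 10
    val top10PerCategory = cid2SessionCount.groupBy(_._1).map { case (cid, pairs) =>
      val sorted = pairs.map(_._2).toList.sortWith(_.split("=")(1).toLong > _.split("=")(1).toLong).take(10)
      (cid, sorted)
    }

    top10PerCategory.foreach(println) // e.g. (88,List(s1=2, s2=1)) and (42,List(s1=1))
  }
}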



The main method

 def main(args: Array[String]): Unit = {

    // Read the filter conditions
    val jsonStr = ConfigurationManager.config.getString(Constants.TASK_PARAMS)

    // Parse the filter conditions into a JsonObject
    val taskParam = JSONObject.fromObject(jsonStr)

    // Create a globally unique primary key for this run
    val taskUUID = UUID.randomUUID().toString

    // Create the SparkConf
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("session")

    // Create the SparkSession (it wraps a SparkContext)
    val sparkSession = SparkSession.builder().config(sparkConf).enableHiveSupport().getOrCreate()

    // Load the raw action table
    // actionRDD: RDD[UserVisitAction]
    val actionRDD = getOriActionRDD(sparkSession,taskParam)

    // Test print 1: confirm the data was loaded
//    actionRDD.foreach(println(_))

    // map ----- sessionID2ActionRDD: RDD[(sessionID, UserVisitAction)]
    val sessionID2ActionRDD = actionRDD.map(item => (item.session_id, item)) // item plays the role of the usual x

    // groupByKey ----- session2GroupActionRDD: RDD[(sessionID, Iterable[UserVisitAction])]
    val session2GroupActionRDD = sessionID2ActionRDD.groupByKey()

    session2GroupActionRDD.cache()


    //todo: aggregate the data
    // Test print 2
//    session2GroupActionRDD.foreach(println(_))

    // Test print 3
//    val userId2AggrInfoRDD = getSessionFullInfo(sparkSession, session2GroupActionRDD)
//    userId2AggrInfoRDD.foreach(println(_))

    // 4: full aggregated info
    val sessionId2FullInfoRDD = getSessionFullInfo(sparkSession, session2GroupActionRDD)
    sessionId2FullInfoRDD.foreach(println(_))

    // Aggregation is done; start filtering

    // 5 filter
    //todo: filter

    // Register the custom accumulator
    val sessionAccumulator = new SessionAccumulator
    sparkSession.sparkContext.register(sessionAccumulator)

    // The accumulator is updated while filtering
    // sessionId2FilteredRDD: RDD[(sessionId, fullInfo)] holds every record that passes the filter conditions
    // getSessionFilteredRDD: filters the session data by the task conditions and updates the accumulator
    val sessionId2FilteredRDD = getSessionFilteredRDD(taskParam,sessionId2FullInfoRDD,sessionAccumulator) // the accumulator is passed in as a parameter
    // print the result
    sessionId2FilteredRDD.foreach(println(_)) // an action is needed to trigger the computation and the accumulator updates

    //6 compute the ratios and store them in MySQL
    //todo: compute the ratios and store them in MySQL
    getSessionRatio(sparkSession,taskUUID,sessionAccumulator.value)


    // Requirement 2: random session extraction
    // 7
    // sessionId2FilteredRDD: RDD[(sid, fullInfo)]
    //todo: random session extraction
    sessionRandomExtract(sparkSession,taskUUID,sessionId2FilteredRDD)

    // Requirement 3: top10 popular categories
    // sessionID2ActionRDD: RDD[(sessionId, action)]
    // sessionId2FilteredRDD: RDD[(sessionId, fullInfo)], the records that pass the filter
    // join them to get every action whose session passes the filter conditions
    val sessionId2FilterActionRDD = sessionID2ActionRDD.join(sessionId2FilteredRDD).map{
      case (sessionId,(action,fullInfo)) =>
        (sessionId,action)
    }
    // 8
    // top10CategoryArray: Array[(sortKey, countInfo)]
    //todo: get the top10 categories by click, order and payment counts
    val top10CategoryArray = top10PopularCategories(sparkSession,taskUUID,sessionId2FilterActionRDD)

    // Requirement 4: top10 active sessions for the top10 popular categories
    // 9
    // sessionId2FilterActionRDD: RDD[(sessionId, action)]
    // top10CategoryArray: Array[(sortKey, countInfo)]
    // todo: top10 active sessions of each top10 category
    top10ActiveSession(sparkSession,taskUUID,sessionId2FilterActionRDD,top10CategoryArray)  // top10PopularCategories has to return top10CategoryArray for this call

  }
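
The main method above registers a custom SessionAccumulator that is not listed in this post. A minimal sketch of such an accumulator, assuming it extends AccumulatorV2[String, mutable.HashMap[String, Int]] and simply counts how often each key is added (which is how getSessionFilteredRDD and getSessionRatio use it), could look like this; the real class may differ in details:

import org.apache.spark.util.AccumulatorV2
import scala.collection.mutable

// Sketch only: counts how many times each key (Constants.SESSION_COUNT,
// Constants.TIME_PERIOD_1s_3s, ...) has been added.
class SessionAccumulator extends AccumulatorV2[String, mutable.HashMap[String, Int]] {

  private val countMap = new mutable.HashMap[String, Int]()

  override def isZero: Boolean = countMap.isEmpty

  override def copy(): AccumulatorV2[String, mutable.HashMap[String, Int]] = {
    val acc = new SessionAccumulator
    acc.countMap ++= this.countMap
    acc
  }

  override def reset(): Unit = countMap.clear()

  // add(key): increment the counter stored under that key
  override def add(v: String): Unit =
    countMap += (v -> (countMap.getOrElse(v, 0) + 1))

  // merge partial results coming back from the executors
  override def merge(other: AccumulatorV2[String, mutable.HashMap[String, Int]]): Unit = other match {
    case o: SessionAccumulator =>
      for ((k, cnt) <- o.countMap)
        countMap += (k -> (countMap.getOrElse(k, 0) + cnt))
    case _ =>
  }

  override def value: mutable.HashMap[String, Int] = countMap
}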

Appendix: the complete SessionStat.scala

Requirements 1 through 4

import java.util.{Date, UUID}

import commons.conf.ConfigurationManager
import commons.constant.Constants
import commons.model.{UserInfo, UserVisitAction}
import commons.utils.{DateUtils, NumberUtils, ParamUtils, StringUtils, ValidUtils}
import net.sf.json.JSONObject
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{SaveMode, SparkSession}

import scala.collection.mutable
import scala.collection.mutable.{ArrayBuffer, ListBuffer}
import scala.util.Random

/**
 * SessionStat.scala
 * Requirements 1 through 4
 *
 */


/**
 * Create a new "session" module,
 * then add the Scala SDK to it: Project Structure -- Global Libraries -- scala 2.11.8, right click, Add to Modules
 */
object SessionStat {



  def main(args: Array[String]): Unit = {

    // Read the filter conditions
    val jsonStr = ConfigurationManager.config.getString(Constants.TASK_PARAMS) //task.params.json

    // Parse the filter conditions into a JsonObject
    val taskParam = JSONObject.fromObject(jsonStr)

    // Create a globally unique primary key for this run
    val taskUUID = UUID.randomUUID().toString

    // Create the SparkConf
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("session")

    // Create the SparkSession (it wraps a SparkContext)
    val sparkSession = SparkSession.builder().config(sparkConf).enableHiveSupport().getOrCreate()

    // Load the raw action table
    // actionRDD: RDD[UserVisitAction]
    val actionRDD = getOriActionRDD(sparkSession,taskParam)

    // Test print 1: confirm the data was loaded
//    actionRDD.foreach(println(_))

    // map ----- sessionID2ActionRDD: RDD[(sessionID, UserVisitAction)]
    val sessionID2ActionRDD = actionRDD.map(item => (item.session_id, item)) // item plays the role of the usual x

    // groupByKey ----- session2GroupActionRDD: RDD[(sessionID, Iterable[UserVisitAction])]
    val session2GroupActionRDD = sessionID2ActionRDD.groupByKey()

    session2GroupActionRDD.cache()


    //todo: aggregate the data
    // Test print 2
//    session2GroupActionRDD.foreach(println(_))

    // Test print 3
//    val userId2AggrInfoRDD = getSessionFullInfo(sparkSession, session2GroupActionRDD)
//    userId2AggrInfoRDD.foreach(println(_))

    // 4: full aggregated info
    val sessionId2FullInfoRDD = getSessionFullInfo(sparkSession, session2GroupActionRDD)
    sessionId2FullInfoRDD.foreach(println(_))

    // Aggregation is done; start filtering

    // 5 filter
    //todo: filter

    // Register the custom accumulator
    val sessionAccumulator = new SessionAccumulator
    sparkSession.sparkContext.register(sessionAccumulator)

    // The accumulator is updated while filtering
    // sessionId2FilteredRDD: RDD[(sessionId, fullInfo)] holds every record that passes the filter conditions
    // getSessionFilteredRDD: filters the session data by the task conditions and updates the accumulator
    val sessionId2FilteredRDD = getSessionFilteredRDD(taskParam,sessionId2FullInfoRDD,sessionAccumulator) // the accumulator is passed in as a parameter
    // print the result
    sessionId2FilteredRDD.foreach(println(_)) // an action is needed to trigger the computation and the accumulator updates

    //6 compute the ratios and store them in MySQL
    //todo: compute the ratios and store them in MySQL
    getSessionRatio(sparkSession,taskUUID,sessionAccumulator.value)


    // Requirement 2: random session extraction
    // 7
    // sessionId2FilteredRDD: RDD[(sid, fullInfo)]
    //todo: random session extraction
    sessionRandomExtract(sparkSession,taskUUID,sessionId2FilteredRDD)

    // Requirement 3: top10 popular categories
    // sessionID2ActionRDD: RDD[(sessionId, action)]
    // sessionId2FilteredRDD: RDD[(sessionId, fullInfo)], the records that pass the filter
    // join them to get every action whose session passes the filter conditions
    val sessionId2FilterActionRDD = sessionID2ActionRDD.join(sessionId2FilteredRDD).map{
      case (sessionId,(action,fullInfo)) =>
        (sessionId,action)
    }
    // 8
    // top10CategoryArray: Array[(sortKey, countInfo)]
    //todo: get the top10 categories by click, order and payment counts
    val top10CategoryArray = top10PopularCategories(sparkSession,taskUUID,sessionId2FilterActionRDD)

    // Requirement 4: top10 active sessions for the top10 popular categories
    // 9
    // sessionId2FilterActionRDD: RDD[(sessionId, action)]
    // top10CategoryArray: Array[(sortKey, countInfo)]
    // todo: top10 active sessions of each top10 category
    top10ActiveSession(sparkSession,taskUUID,sessionId2FilterActionRDD,top10CategoryArray)  // top10PopularCategories has to return top10CategoryArray for this call

  }

  /**
   * (Step 9: requirement 4)
   * Top 10 active sessions for each of the top 10 popular categories
   *
   * @param sparkSession              the SparkSession
   * @param taskUUID                  UUID of the current task
   * @param sessionId2FilterActionRDD actions that passed the session filter, keyed by sessionId
   * @param top10CategoryArray        the (sortKey, countInfo) array returned by requirement 3
   */
  def top10ActiveSession(sparkSession: SparkSession,
                         taskUUID: String,
                         sessionId2FilterActionRDD: RDD[(String, UserVisitAction)],
                         top10CategoryArray:Array[(SortKey,String)])={
    //todo Step 1: keep only the actions whose clicked category is one of the top10 categories
    // Approach 1: join
    /*val cid2CountInfoRDD = sparkSession.sparkContext.makeRDD(top10CategoryArray).map{
      case (sortKey,countInfo) =>{
        val cid = StringUtils.getFieldFromConcatString(countInfo,"\\|",Constants.FIELD_CATEGORY_ID).toLong
        (cid,countInfo)
      }
    }
    val cid2ActionRDD = sessionId2FilterActionRDD.map{
      case (sessionId,action) =>{
        val cid = action.click_category_id
        (cid,action)
      }
    }
    val sessionId2ActionRDD = cid2CountInfoRDD.join(cid2ActionRDD).map{
      case (cid,(countInfo,action)) =>{
        val sid = action.session_id
        (sid,action)
      }
    }*/
    // Approach 2: filter
    // cidArray: Array[Long] holds the ids of the top10 popular categories
    val cidArray = top10CategoryArray.map{
      case (sortKey,countInfo) =>{
        val cid = StringUtils.getFieldFromConcatString(countInfo,"\\|",Constants.FIELD_CATEGORY_ID).toLong
        cid
      }
    }
    // actions that passed the session filter AND clicked one of the top10 categories
    val sessionId2ActionRDD = sessionId2FilterActionRDD.filter{
      case (sessionId,action) =>{
        cidArray.contains(action.click_category_id)
      }
    }

    // todo Step 2: group by sessionId
    val sessionId2GroupRDD: RDD[(String, Iterable[UserVisitAction])] = sessionId2ActionRDD.groupByKey()

    // todo Step 3: prefix each count with its sessionId
    // cid2SessionCountRDD: RDD[(cid, "sessionId=count")]
    val cid2SessionCountRDD= sessionId2GroupRDD.flatMap{
      case (sessionId,iterableAction) =>{
        val categoryCountMap = new mutable.HashMap[Long,Long]()

        for(action <- iterableAction){
          val cid = action.click_category_id
          if(!categoryCountMap.contains(cid)){     // common pattern: initialise the key before incrementing
            categoryCountMap += (cid->0)
          }
          categoryCountMap.update(cid,categoryCountMap(cid)+1)
        }
        // categoryCountMap now holds, for this one session, its click count for every category it clicked
        for((cid,count) <- categoryCountMap)
          yield (cid,sessionId + "=" +count)  // yield collects all the (cid, "sessionId=count") pairs; braces must stay attached to yield (for(...) yield {...}) - a for(...){...} without yield is a plain loop returning Unit, hence the "no return value" error
      }
    }

    // todo Step 4: turn the data into (categoryId, all of its "sessionId=count" strings), e.g. (8888, [session1=66, session2=99, session8=21])
    // cid2GroupRDD: RDD[(cid, Iterable[sessionCount])]
    // each record of cid2GroupRDD is a categoryId together with the click counts of every session that clicked it
    val cid2GroupRDD: RDD[(Long, Iterable[String])] = cid2SessionCountRDD.groupByKey()

    // todo Step 5: sort, take the top 10, write to the database
    // sortWith: for each cid, turn its "sessionId=count" strings into a list sorted by click count and take the first 10
    val top10SessionRDD = cid2GroupRDD.flatMap{
      case (cid,iterableSessionCount) =>{
        // true: item1 comes first
        // false: item2 comes first
        // item: sessionCount String "sessionId=count"
        val sortList = iterableSessionCount.toList.sortWith((item1,item2)=>{
          item1.split("=")(1).toLong > item2.split("=")(1).toLong
        }).take(10)

        // map to the shape of the database table
        val top10Session = sortList.map{
          // item: sessionCount String "sessionId=count"
          case item =>{
            val sessionId = item.split("=")(0)
            val count = item.split("=")(1).toLong
            Top10Session(taskUUID,cid,sessionId,count)
          }
        }
        top10Session
      }
    }

    import sparkSession.implicits._
    top10SessionRDD.toDF().write
      .format("jdbc")
      .option("url",ConfigurationManager.config.getString(Constants.JDBC_URL))
      .option("user",ConfigurationManager.config.getString(Constants.JDBC_USER))
      .option("password",ConfigurationManager.config.getString(Constants.JDBC_PASSWORD))
      .option("dbtable","top10_session")
      .mode(SaveMode.Append)
      .save
  }
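
  // Both requirement 3 and requirement 4 lean on StringUtils.getFieldFromConcatString
  // to pull one field out of strings like "categoryid=80|clickCount=79|orderCount=80|payCount=87".
  // The real helper lives in commons.utils; the sketch below only reconstructs its
  // presumable behaviour (null handling and edge cases may differ). The delimiter is
  // a regex, hence "\\|", because String.split takes a regex.
  def getFieldFromConcatStringSketch(str: String, delimiter: String, field: String): String = {
    for (concatField <- str.split(delimiter)) {
      if (concatField.contains("=")) {
        val Array(name, value) = concatField.split("=", 2)
        if (name == field) return value
      }
    }
    null // callers of the real helper tolerate a null result
  }
  // e.g. getFieldFromConcatStringSketch("categoryid=80|clickCount=79", "\\|", "clickCount") == "79"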




  /**
   * (Step 8: helper 1 of the requirement 3 method)
   * Click count per category
   *
   * @param sessionId2FilterActionRDD actions that passed the session filter
   */
  def getClickCount(sessionId2FilterActionRDD: RDD[(String, UserVisitAction)]) = {
    // keep only the actions that are clicks
    val clickFilterRDD = sessionId2FilterActionRDD.filter(item => item._2.click_category_id != -1L)

    // map into the shape needed by reduceByKey: (categoryId 5555, 1)
    val clickNumRDD = clickFilterRDD.map{
      case(sessionId,action) => (action.click_category_id,1L)
    }

    // aggregate and return
    clickNumRDD.reduceByKey(_+_)

  }

  /**
   * (Step 8: helper 2 of the requirement 3 method)
   * Order count per category
   *
   * @param sessionId2FilterActionRDD actions that passed the session filter
   */
  def getOrderCount(sessionId2FilterActionRDD: RDD[(String, UserVisitAction)])={
    // keep only the actions that are orders
    val orderFilterRDD = sessionId2FilterActionRDD.filter(item => item._2.order_category_ids != null)

    // map into the aggregation key: (categoryId 5555, 1)
    val orderNumRDD = orderFilterRDD.flatMap{
      case (sid,action) =>
        action.order_category_ids.split(",").map(item => (item.toLong,1L))
    }

    // aggregate and return
    orderNumRDD.reduceByKey(_+_)
  }

  /**
   * (Step 8: helper 3 of the requirement 3 method)
   * Payment count per category
   *
   * @param sessionId2FilterActionRDD actions that passed the session filter
   */
  def getPayCount(sessionId2FilterActionRDD: RDD[(String, UserVisitAction)]) = {
    // keep only the actions that are payments
    val payFilterRDD = sessionId2FilterActionRDD.filter(item => item._2.pay_category_ids != null)

    // map into the aggregation key: (categoryId 5555, 1)
    val payNumRDD = payFilterRDD.flatMap{
      case (sid,action) =>
        action.pay_category_ids.split(",").map(item => (item.toLong,1L))
    }

    // aggregate and return
    payNumRDD.reduceByKey(_+_)
  }
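
  // getClickCount / getOrderCount / getPayCount all follow the same pattern:
  // filter the relevant actions, explode the comma-separated id list into
  // (categoryId, 1L) pairs, then reduceByKey. Below is the same idea on plain
  // Scala collections with made-up ids (groupBy + sum plays the role of reduceByKey):
  def countPerCategoryDemo(): Map[Long, Long] = {
    val orderCategoryIdLists = Seq("12,34", "34", "12,99")
    orderCategoryIdLists
      .flatMap(ids => ids.split(",").map(id => (id.toLong, 1L))) // explode to (cid, 1)
      .groupBy(_._1)                                             // group, like reduceByKey
      .map { case (cid, ones) => (cid, ones.map(_._2).sum) }     // Map(12 -> 2, 34 -> 2, 99 -> 1)
  }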


  /**
   * (Step 8: helper 4 of the requirement 3 method)
   * Stitch the click, order and payment counts of every category into one countInfo string
   *
   * @param distinctCid2CidRDD distinct (categoryId, categoryId) pairs
   * @param cid2ClickCountRDD  click count per category
   * @param cid2OrderCountRDD  order count per category
   * @param cid2PayCountRDD    payment count per category
   */
  def getFullCount(distinctCid2CidRDD: RDD[(Long, Long)],
                   cid2ClickCountRDD: RDD[(Long, Long)],
                   cid2OrderCountRDD: RDD[(Long, Long)],
                   cid2PayCountRDD: RDD[(Long, Long)])={
    val cid2ClickInfoRDD = distinctCid2CidRDD.leftOuterJoin(cid2ClickCountRDD).map{
      case (cid,(categoryId,option)) =>{ // leftOuterJoin puts an Option on the right side, so check option.isDefined before reading it
        val clickCount = if(option.isDefined) option.get else 0
        val aggCount = Constants.FIELD_CATEGORY_ID + "=" +categoryId+ "|" +
        Constants.FIELD_CLICK_COUNT + "=" + clickCount

        (cid,aggCount)
      }
    }

   val cid2OrderInfoRDD = cid2ClickInfoRDD.leftOuterJoin(cid2OrderCountRDD).map{
      case(cid,(clickInfo,option))=>{
        val orderCount = if(option.isDefined) option.get else 0
        val aggCount = clickInfo + "|" + Constants.FIELD_ORDER_COUNT + "=" + orderCount
        (cid, aggCount)
      }
    }

    val cid2PayInfoRDD = cid2OrderInfoRDD.leftOuterJoin(cid2PayCountRDD).map{
      case(cid,(orderInfo,option)) =>{
        val payCount = if(option.isDefined) option.get else 0
        val aggCount = orderInfo +"|"+Constants.FIELD_PAY_COUNT+"="+payCount
        (cid,aggCount)
      }
    }

    cid2PayInfoRDD // despite the name, this already carries the full countInfo string
  }
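
  // getFullCount uses leftOuterJoin, so the right-hand value arrives wrapped in an
  // Option; the option.isDefined checks above are equivalent to option.getOrElse(0L).
  // A tiny illustration on plain Scala pairs (values are made up):
  def leftOuterJoinOptionDemo(): Seq[(Long, Long)] = {
    // shape of one record after cid2CidRDD.leftOuterJoin(cid2ClickCountRDD):
    // (cid, (cid, Option[clickCount])); categories that were never clicked carry None
    val joined: Seq[(Long, (Long, Option[Long]))] =
      Seq((80L, (80L, Some(79L))), (55L, (55L, None)))
    joined.map { case (cid, (_, opt)) =>
      (cid, opt.getOrElse(0L)) // same effect as: if (opt.isDefined) opt.get else 0
    } // Seq((80,79), (55,0))
  }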



  /**
   * (Step 8: the requirement 3 method)
   * Top 10 popular categories
   *
   * @param sparkSession              the SparkSession
   * @param taskUUID                  UUID of the current task
   * @param sessionId2FilterActionRDD user action data that passed the filter conditions
   */
  def top10PopularCategories(sparkSession: SparkSession, taskUUID: String, sessionId2FilterActionRDD: RDD[(String, UserVisitAction)])= {

    // todo Step 1: collect every category that was clicked, ordered or paid for into one container
    val cid2CidRDD = sessionId2FilterActionRDD.flatMap{
      case (sid,action)=>{
        val categoryBuffer = new ArrayBuffer[(Long,Long)]()

        if(action.click_category_id != -1){ // click action
          categoryBuffer += ((action.click_category_id,action.click_category_id))

        }else if(action.order_category_ids != null){ // order action
          for(orderCid <- action.order_category_ids.split(",")){
            categoryBuffer += ((orderCid.toLong,orderCid.toLong))
          }

        }else if(action.pay_category_ids != null){ // payment action
          for(payCid <- action.pay_category_ids.split(",")){
            categoryBuffer += ((payCid.toLong,payCid.toLong))
          }
        }
        categoryBuffer
      }
    }

    // remove duplicate categoryIds
    val distinctCid2CidRDD = cid2CidRDD.distinct()

    // todo Step 2: count clicks, orders and payments per category
    val cid2ClickCountRDD = getClickCount(sessionId2FilterActionRDD) // note: pass the RDD before deduplication
    val cid2OrderCountRDD = getOrderCount(sessionId2FilterActionRDD)
    val cid2PayCountRDD = getPayCount(sessionId2FilterActionRDD)
      // test print (getClickCount must not declare a Unit return type, otherwise foreach won't compile)
      //    cid2ClickCountRDD.foreach(println(_))
      /* sample output (cid, count)
      (93,75)
      (37,67)...
       */

    // todo Step 3: stitch the click, order and payment counts together, e.g. (category 888, 80 clicks, 30 orders, 2 payments)
    val cid2FullCountRDD  = getFullCount(distinctCid2CidRDD,cid2ClickCountRDD,cid2OrderCountRDD,cid2PayCountRDD)
      // test print to check that the string concatenation is correct
//    cid2FullCountRDD.foreach(println(_))
      /* sample output
      (80,categoryid=80|clickCount=79|orderCount=80|payCount=87)
      (22,categoryid=22|clickCount=76|orderCount=68|payCount=89)
      (54,categoryid=54|clickCount=78|orderCount=88|payCount=67) ...
      */

    // todo Step 4: build the custom secondary-sort key
    val sortkey2FullCountRDD = cid2FullCountRDD.map{
      case (cid,countInfo) =>{
        val clickCount = StringUtils.getFieldFromConcatString(countInfo,"\\|",Constants.FIELD_CLICK_COUNT).toLong
        val orderCount = StringUtils.getFieldFromConcatString(countInfo,"\\|",Constants.FIELD_ORDER_COUNT).toLong
        val payCount = StringUtils.getFieldFromConcatString(countInfo,"\\|",Constants.FIELD_PAY_COUNT).toLong

          // build the custom secondary-sort key
        val sortKey = SortKey(clickCount,orderCount,payCount) // its natural ordering is ascending

        (sortKey,countInfo)
      }
    }
      // sortByKey(false) sorts descending; note that take returns an Array
    val top10CategoryArray: Array[(SortKey, String)] = sortkey2FullCountRDD.sortByKey(false).take(10)

      // Array ----> RDD
    val top10CategoryRDD =sparkSession.sparkContext.makeRDD(top10CategoryArray).map{
      case(sortKey,countInfo) =>{
        val cid = StringUtils.getFieldFromConcatString(countInfo,"\\|",Constants.FIELD_CATEGORY_ID).toLong
        val clickCount = sortKey.clictCount
        val orderCount = sortKey.orderCount
        val payCount = sortKey.payCount

        Top10Category(taskUUID,cid,clickCount,orderCount,payCount)
      }
    }

    // todo Step 5: write to the database
    import sparkSession.implicits._
    top10CategoryRDD.toDF().write
      .format("jdbc")
      .option("url",ConfigurationManager.config.getString(Constants.JDBC_URL))
      .option("user",ConfigurationManager.config.getString(Constants.JDBC_USER))
      .option("password",ConfigurationManager.config.getString(Constants.JDBC_PASSWORD))
      .option("dbtable","top10_category")
      .mode(SaveMode.Append)
      .save


    // return this array so requirement 4 can use it
    top10CategoryArray
  }
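
  // The SortKey used above is a custom secondary-sort key whose source is not listed
  // in this post. A minimal sketch that is consistent with how it is used here
  // (three Long fields compared by click count, then order count, then pay count;
  // the "clictCount" spelling follows the listing above) could be the following.
  // sortByKey only needs an implicit Ordering, which extending Ordered provides.
  case class SortKeySketch(clictCount: Long, orderCount: Long, payCount: Long) extends Ordered[SortKeySketch] {
    override def compare(that: SortKeySketch): Int = {
      val byClick = this.clictCount.compareTo(that.clictCount)
      if (byClick != 0) byClick
      else {
        val byOrder = this.orderCount.compareTo(that.orderCount)
        if (byOrder != 0) byOrder else this.payCount.compareTo(that.payCount)
      }
    }
  }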




  /**
   * (Step 7: helper used inside the requirement 2 method)
   * Generate the random indices to extract, per hour
   *
   * @param extractNumberPerDay         number of sessions to extract for the day
   * @param dateSessionCount            total number of sessions of the day
   * @param hourCountMap                session count per hour of the day
   * @param dateHourExtractIndexListMap hour -> list of random indices to extract
   * @return
   */
  def generateRandomIndexList(extractNumberPerDay: Int,
                              dateSessionCount: Long,
                              hourCountMap: mutable.HashMap[String, Long],
                              dateHourExtractIndexListMap: mutable.HashMap[String, ListBuffer[Int]]) = {
    for((hour,count) <- hourCountMap){
      // sessions to extract in this hour = (sessions in this hour / sessions of the whole day) * sessions to extract for the day
      var hourExrCount = ((count / dateSessionCount.toDouble)*extractNumberPerDay).toInt
      // make sure an hour's quota never exceeds the number of sessions in that hour
      if(hourExrCount > count){
        hourExrCount = count.toInt
      }

      val random = new Random()

      dateHourExtractIndexListMap.get(hour) match{
        case None => dateHourExtractIndexListMap(hour) = new ListBuffer[Int]
          for(i <- 0 until hourExrCount){ //(0,hourExrCount)
            var index = random.nextInt(count.toInt)
            while (dateHourExtractIndexListMap(hour).contains(index)){
              index = random.nextInt(count.toInt)
            }

            dateHourExtractIndexListMap(hour).append(index) // += would work here as well
          }

        case Some(list) =>
          for(i <- 0 until hourExrCount){
            var index = random.nextInt(count.toInt)
            while(dateHourExtractIndexListMap(hour).contains(index)){
              index = random.nextInt(count.toInt)
            }
            dateHourExtractIndexListMap(hour).append(index)
          }
      }

    }


  }
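
  // Quick sanity check of the per-hour quota formula used above (numbers made up):
  // with 40 sessions to extract on a day whose traffic splits 25/75 across two
  // hours, the quotas come out as 10 and 30.
  def hourQuotaDemo(): Map[String, Int] = {
    val extractNumberPerDay = 40
    val hourCountMap = Map("08" -> 25L, "09" -> 75L)
    val dateSessionCount = hourCountMap.values.sum // 100
    hourCountMap.map { case (hour, count) =>
      (hour, ((count / dateSessionCount.toDouble) * extractNumberPerDay).toInt)
    } // Map("08" -> 10, "09" -> 30)
  }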



  /**
   * (Step 7: requirement 2)
   * Randomly extract sessions
   *
   * @param sparkSession          the SparkSession
   * @param taskUUID              UUID of the current task
   * @param sessionId2FilteredRDD sessions that passed the filter conditions
   */
  def sessionRandomExtract(sparkSession: SparkSession,
                           taskUUID: String,
                           sessionId2FilteredRDD: RDD[(String, String)]): Unit = {
    // dataHourFullInfoRDD:RDD[(dataHour,fullInfo)]
    val dataHourFullInfoRDD = sessionId2FilteredRDD.map{
      case(sid,fullInfo) =>
//        println("========================fullInfo==="+fullInfo)

        val startTime = StringUtils.getFieldFromConcatString(fullInfo,"\\|",Constants.FIELD_START_TIME)
//        println("========================startTime==="+startTime)
        val dataHour = DateUtils.getDateHour(startTime) // (yyyy-MM-dd HH:mm:ss) => (yyyy-MM-dd_HH)
        (dataHour,fullInfo)
    }
    // count the sessions of each (day, hour) by key
    val countMap: collection.Map[String, Long] = dataHourFullInfoRDD.countByKey()

    // time-proportional random extraction: work out, per day and hour, which session indices to extract
    // reshape the Map[dateHour, count] into Map[date, Map[hour, count]]
    val dateHourCountMap: mutable.HashMap[String, mutable.HashMap[String, Long]] = mutable.HashMap[String,mutable.HashMap[String,Long]]()
    for((dateHour,count) <- countMap){
      val date = dateHour.split("_")(0)
      val hour = dateHour.split("_")(1)
      // pattern match: handle the cases where the date already exists in dateHourCountMap and where it doesn't
      dateHourCountMap.get(date) match {
          // date not present yet: create its hour map and add the first entry
        case None => dateHourCountMap(date)=new mutable.HashMap[String,Long]();
          dateHourCountMap(date) += (hour-> count) // add the first (hour -> count) entry
          // date already exists: accumulate into its hour map
        case Some(hourCountMap) => hourCountMap += (hour -> count)
      }
    }

    // time-proportional random extraction
    // sessions to extract per day = 100 / number of days
    // sessions to extract per hour = (sessions in that hour / sessions of the day) * sessions to extract for the day
    //todo: question 1: how many sessions to extract per day
    val extractNumberPerDay = 100 / dateHourCountMap.size
    //todo: question 2: how many sessions a day has: dateHourCountMap(date).values.sum
    //todo: question 3: how many sessions one hour has: dateHourCountMap(date)(hour)
    val dateHourExtractIndexListMap = new mutable.HashMap[String,mutable.HashMap[String,ListBuffer[Int]]]()

    //dateHourCountMap:Map[(date,Map[(hour,count)])]
    for((date,hourCountMap) <- dateHourCountMap) {
      val dateSessionCount = hourCountMap.values.sum

      dateHourExtractIndexListMap.get(date) match {
        case None => dateHourExtractIndexListMap(date) = new mutable.HashMap[String,ListBuffer[Int]]()
          generateRandomIndexList(extractNumberPerDay,dateSessionCount,hourCountMap,dateHourExtractIndexListMap(date))
        case Some(map) =>
          generateRandomIndexList(extractNumberPerDay,dateSessionCount,hourCountMap,dateHourExtractIndexListMap(date))
      }
    }
    /* at this point we know, for every hour, the indices of the sessions to extract */


    //todo: broadcast the map to improve task performance
    val dateHourExtractIndexListMapBroadcast = sparkSession.sparkContext.broadcast(dateHourExtractIndexListMap)

    // ---- 
    val dateHour2GroupRDD = dataHourFullInfoRDD.groupByKey()  //dataHourFullInfoRDD:RDD[(dataHour,fullInfo)]

    //extractSessionRDD[SessionRandomExtract]
    val extractSessionRDD = dateHour2GroupRDD.flatMap{
      case(dateHour,iterableFullInfo) =>
        val date = dateHour.split("_")(0)
        val hour = dateHour.split("_")(1)

        val extractList = dateHourExtractIndexListMapBroadcast.value.get(date).get(hour)

        val extractSessionArrayBuffer = new ArrayBuffer[SessionRandomExtract]()

        var index=0

        for(fullInfo <- iterableFullInfo){
          if(extractList.contains(index)){
            val sessionId = StringUtils.getFieldFromConcatString(fullInfo,"\\|",Constants.FIELD_SESSION_ID)
            val startTime = StringUtils.getFieldFromConcatString(fullInfo,"\\|",Constants.FIELD_START_TIME)

            val searchKeywords = StringUtils.getFieldFromConcatString(fullInfo,"\\|",Constants.FIELD_SEARCH_KEYWORDS)
            val clickCategories = StringUtils.getFieldFromConcatString(fullInfo,"\\|",Constants.FIELD_CLICK_CATEGORY_IDS)

            val extractSession = SessionRandomExtract(taskUUID,sessionId,startTime,searchKeywords,clickCategories) // SessionRandomExtract (capital S) is the case class, extractSession (lowercase) is the instance

            extractSessionArrayBuffer += extractSession

          }
          // advance the index
          index += 1
        }
        extractSessionArrayBuffer

    }

    // todo: write to the database
    import sparkSession.implicits._
    extractSessionRDD.toDF().write
      .format("jdbc")
      .option("url",ConfigurationManager.config.getString(Constants.JDBC_URL))
      .option("user",ConfigurationManager.config.getString(Constants.JDBC_USER))
      .option("password",ConfigurationManager.config.getString(Constants.JDBC_PASSWORD))
      .option("dbtable","session_random_extract")
      .mode(SaveMode.Append)
      .save()

    /* the session_random_extract table now exists in the local MySQL database */

  }
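
  // The rows written above are SessionRandomExtract instances, another case class
  // that is not listed in this post. Judging from the call
  // SessionRandomExtract(taskUUID, sessionId, startTime, searchKeywords, clickCategories),
  // it presumably looks roughly like this (the field names are guesses):
  case class SessionRandomExtractSketch(
                                         taskid: String,
                                         sessionid: String,
                                         startTime: String,
                                         searchKeywords: String,
                                         clickCategoryIds: String
                                       )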




  // (Step 6) compute the ratios and store them in the database
  def getSessionRatio(sparkSession: SparkSession, taskUUID: String, value: mutable.HashMap[String, Int]): Unit = {

    // total session count, read from the accumulator map; default to 1 to avoid dividing by zero
    val session_count: Double = value.getOrElse(Constants.SESSION_COUNT,1).toDouble

    // number of sessions in each visit-length bucket
    val visit_length_1s_3s = value.getOrElse(Constants.TIME_PERIOD_1s_3s,0)  // accumulator key-name constants
    val visit_length_4s_6s = value.getOrElse(Constants.TIME_PERIOD_4s_6s,0)
    val visit_length_7s_9s = value.getOrElse(Constants.TIME_PERIOD_7s_9s, 0)
    val visit_length_10s_30s = value.getOrElse(Constants.TIME_PERIOD_10s_30s, 0)
    val visit_length_30s_60s = value.getOrElse(Constants.TIME_PERIOD_30s_60s, 0)
    val visit_length_1m_3m = value.getOrElse(Constants.TIME_PERIOD_1m_3m, 0)
    val visit_length_3m_10m = value.getOrElse(Constants.TIME_PERIOD_3m_10m, 0)
    val visit_length_10m_30m = value.getOrElse(Constants.TIME_PERIOD_10m_30m, 0)
    val visit_length_30m = value.getOrElse(Constants.TIME_PERIOD_30m, 0)

    // number of sessions in each step-length bucket
    val step_length_1_3 = value.getOrElse(Constants.STEP_PERIOD_1_3,0)
    val step_length_4_6 = value.getOrElse(Constants.STEP_PERIOD_4_6, 0)
    val step_length_7_9 = value.getOrElse(Constants.STEP_PERIOD_7_9, 0)
    val step_length_10_30 = value.getOrElse(Constants.STEP_PERIOD_10_30, 0)
    val step_length_30_60 = value.getOrElse(Constants.STEP_PERIOD_30_60, 0)
    val step_length_60 = value.getOrElse(Constants.STEP_PERIOD_60, 0)

    // compute the share of each visit-length and step-length bucket
    val visit_length_1s_3s_ratio = NumberUtils.formatDouble(visit_length_1s_3s/session_count,2) // keep two decimal places
    val visit_length_4s_6s_ratio = NumberUtils.formatDouble(visit_length_4s_6s/session_count,2)
    val visit_length_7s_9s_ratio = NumberUtils.formatDouble(visit_length_7s_9s/session_count,2)
    val visit_length_10s_30s_ratio = NumberUtils.formatDouble(visit_length_10s_30s/session_count,2)
    val visit_length_30s_60s_ratio = NumberUtils.formatDouble(visit_length_30s_60s/session_count,2)
    val visit_length_1m_3m_ratio = NumberUtils.formatDouble(visit_length_1m_3m/session_count,2)
    val visit_length_3m_10m_ratio = NumberUtils.formatDouble(visit_length_3m_10m/session_count,2)
    val visit_length_10m_30m_ratio = NumberUtils.formatDouble(visit_length_10m_30m/session_count,2)
    val visit_length_30m_ratio = NumberUtils.formatDouble(visit_length_30m/session_count,2)

    val step_length_1_3_ratio = NumberUtils.formatDouble(step_length_1_3 / session_count, 2)
    val step_length_4_6_ratio = NumberUtils.formatDouble(step_length_4_6 / session_count, 2)
    val step_length_7_9_ratio = NumberUtils.formatDouble(step_length_7_9 / session_count, 2)
    val step_length_10_30_ratio = NumberUtils.formatDouble(step_length_10_30 / session_count, 2)
    val step_length_30_60_ratio = NumberUtils.formatDouble(step_length_30_60 / session_count, 2)
    val step_legth_60_ratio = NumberUtils.formatDouble(step_length_60 / session_count, 2)

    // wrap the statistics in a domain object
    val stat = SessionAggrStat(
      taskUUID,
      session_count.toInt,
      visit_length_1s_3s_ratio,
      visit_length_4s_6s_ratio,
      visit_length_7s_9s_ratio,
      visit_length_10s_30s_ratio,
      visit_length_30s_60s_ratio,
      visit_length_1m_3m_ratio,
      visit_length_3m_10m_ratio,
      visit_length_10m_30m_ratio,
      visit_length_30m_ratio,
      step_length_1_3_ratio,
      step_length_4_6_ratio,
      step_length_7_9_ratio,
      step_length_10_30_ratio,
      step_length_30_60_ratio,
      step_legth_60_ratio
    )

    val sessionRatioRDD = sparkSession.sparkContext.makeRDD(Array(stat))

    import sparkSession.implicits._
    sessionRatioRDD.toDF().write
      .format("jdbc")
      .option("url",ConfigurationManager.config.getString(Constants.JDBC_URL))  //配置类工具 "jdbc.user"
      .option("user",ConfigurationManager.config.getString(Constants.JDBC_USER))
      .option("password",ConfigurationManager.config.getString(Constants.JDBC_PASSWORD))
      .option("dbtable","session_stat_ratio_0416")
      .mode(SaveMode.Append)
      .save()


  }
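
  // NumberUtils.formatDouble(x, 2) keeps two decimal places. The real helper lives
  // in commons.utils; a plausible sketch of its behaviour (the rounding mode may differ):
  def formatDoubleSketch(num: Double, scale: Int): Double =
    BigDecimal(num).setScale(scale, BigDecimal.RoundingMode.HALF_UP).toDouble
  // e.g. formatDoubleSketch(0.6666666, 2) == 0.67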



  // (Step 5) helper: bucket the visit length and update the accumulator
  def calculateVisitLength(visitLength:Long,sessionAggrStatAccumulator:SessionAccumulator)={
    if(visitLength >= 1 && visitLength <= 3){
      sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_1s_3s)
    }else if (visitLength >= 4 && visitLength <= 6) {
      sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_4s_6s);
    } else if (visitLength >= 7 && visitLength <= 9) {
      sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_7s_9s);
    } else if (visitLength >= 10 && visitLength <= 30) {
      sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_10s_30s);
    } else if (visitLength > 30 && visitLength <= 60) {
      sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_30s_60s);
    } else if (visitLength > 60 && visitLength <= 180) {
      sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_1m_3m);
    } else if (visitLength > 180 && visitLength <= 600) {
      sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_3m_10m);
    } else if (visitLength > 600 && visitLength <= 1800) {
      sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_10m_30m);
    } else if (visitLength > 1800) {
      sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_30m);
    }

  }

  // (Step 5) helper: bucket the step length and update the accumulator
  def calculateStepLength(stepLength: Long, sessionAggrStatAccumulator: SessionAccumulator) = {
    if (stepLength >= 1 && stepLength <= 3) {
      sessionAggrStatAccumulator.add(Constants.STEP_PERIOD_1_3);
    } else if (stepLength >= 4 && stepLength <= 6) {
      sessionAggrStatAccumulator.add(Constants.STEP_PERIOD_4_6);
    } else if (stepLength >= 7 && stepLength <= 9) {
      sessionAggrStatAccumulator.add(Constants.STEP_PERIOD_7_9);
    } else if (stepLength >= 10 && stepLength <= 30) {
      sessionAggrStatAccumulator.add(Constants.STEP_PERIOD_10_30);
    } else if (stepLength > 30 && stepLength <= 60) {
      sessionAggrStatAccumulator.add(Constants.STEP_PERIOD_30_60);
    } else if (stepLength > 60) {
      sessionAggrStatAccumulator.add(Constants.STEP_PERIOD_60);
    }
  }
  /**
   * Step 5: filtering
   *
   * @param taskParam             the task parameters
   * @param sessionId2FullInfoRDD the aggregated session data
   * @param sessionAccumulator    the custom accumulator updated while filtering
   * @return
   */
  def getSessionFilteredRDD(taskParam: JSONObject, sessionId2FullInfoRDD: RDD[(String, String)],sessionAccumulator:SessionAccumulator) ={
    // read the limits from task.params.json, e.g. {startDate:"2018-08-01", ...}
    val startAge = ParamUtils.getParam(taskParam,Constants.PARAM_START_AGE)
    val endAge = ParamUtils.getParam(taskParam,Constants.PARAM_END_AGE)
    val professinals = ParamUtils.getParam(taskParam,Constants.PARAM_PROFESSIONALS)
    val cities = ParamUtils.getParam(taskParam,Constants.PARAM_CITIES)
    val sex = ParamUtils.getParam(taskParam,Constants.PARAM_SEX)
    val keywords = ParamUtils.getParam(taskParam,Constants.PARAM_KEYWORDS)
    val categoryIds = ParamUtils.getParam(taskParam,Constants.PARAM_CATEGORY_IDS)

    // check each parameter, then concatenate
    var filterInfo =
        (if(startAge != null) Constants.PARAM_START_AGE + "=" + startAge +"|" else "") + // Constants is a constant interface, so it can be used like this
        (if(endAge != null) Constants.PARAM_END_AGE + "=" + endAge + "|" else "")+
        (if(professinals != null) Constants.PARAM_PROFESSIONALS+"="+professinals+"|" else "")+
        (if(cities != null) Constants.PARAM_CITIES + "=" +cities+"|" else "") +
        (if(sex != null) Constants.PARAM_SEX + "=" + sex +"|" else "")+
        (if(keywords != null) Constants.PARAM_KEYWORDS +"="+ keywords +"|" else "")+
        (if(categoryIds != null) Constants.PARAM_CATEGORY_IDS + "=" +categoryIds+"|" else "")

    if(filterInfo.endsWith("|")){ // some trailing parameters may be null, leaving a dangling "|"; endsWith takes a literal string, not a regex
      filterInfo=filterInfo.substring(0,filterInfo.length-1)
    }

    sessionId2FullInfoRDD.filter{
      case(sessionId,fullInfo) =>
        var success = true

        if(!ValidUtils.between(fullInfo,Constants.FIELD_AGE,filterInfo,Constants.PARAM_START_AGE,Constants.PARAM_END_AGE)){ // args: data, data field, params, param field(s)
          success = false
        }else if(!ValidUtils.in(fullInfo,Constants.FIELD_PROFESSIONAL,filterInfo,Constants.PARAM_PROFESSIONALS)){
          success = false
        }else if(!ValidUtils.equal(fullInfo,Constants.FIELD_SEX,filterInfo,Constants.PARAM_SEX)){
          success = false
        }else if(!ValidUtils.in(fullInfo,Constants.FIELD_SEARCH_KEYWORDS,filterInfo,Constants.PARAM_KEYWORDS)){
          success = false
        }else if(!ValidUtils.in(fullInfo,Constants.FIELD_CLICK_CATEGORY_IDS,filterInfo,Constants.PARAM_CATEGORY_IDS)){
          success = false
        }

        // beyond pure filtering, also update the accumulator here
        if(success){  // every field of this session satisfies the filter conditions
         // the accumulator maintains the key itself and adds one to its value
          sessionAccumulator.add(Constants.SESSION_COUNT)

          val visitLength = StringUtils.getFieldFromConcatString(fullInfo,"\\|",Constants.FIELD_VISIT_LENGTH).toLong
          val stepLength = StringUtils.getFieldFromConcatString(fullInfo,"\\|",Constants.FIELD_STEP_LENGTH).toLong

//          if(visitLength>=1 && visitLength <= 3){ // refactored into the helper methods below
//            sessionAccumulator.add(Constants.TIME_PERIOD_1s_3s)
//          }else if(visitLength>=4 && visitLength <= 6)
//            sessionAccumulator.add(Constants.TIME_PERIOD_4s_6s)
          calculateVisitLength(visitLength,sessionAccumulator)
          calculateStepLength(stepLength,sessionAccumulator)

        }



        success // filter needs a Boolean result


    }






  }
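
  // ValidUtils (between / in / equal) is a project utility that compares one field of
  // the aggregated session string against the task parameters, returning true when the
  // parameter is absent. A rough sketch of the "between" check, reusing
  // getFieldFromConcatStringSketch from above (the real implementation may differ):
  def betweenSketch(data: String, dataField: String,
                    params: String, startParamField: String, endParamField: String): Boolean = {
    val startStr = getFieldFromConcatStringSketch(params, "\\|", startParamField)
    val endStr   = getFieldFromConcatStringSketch(params, "\\|", endParamField)
    if (startStr == null || endStr == null) return true // no such constraint was configured

    val fieldStr = getFieldFromConcatStringSketch(data, "\\|", dataField)
    if (fieldStr == null) return false

    fieldStr.toInt >= startStr.toInt && fieldStr.toInt <= endStr.toInt
  }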





  /**
   * Steps 2+3+4
   * Raw data looks like this (one session grouped with all of its actions):
   * sparkSession,(4bc33302668f4331aba52c8328a781c7,CompactBuffer(UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,6,0000-00-00 12:45:35,联想笔记本,-1,-1,null,null,null,null,0), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,5,0000-00-00 12:46:24,吸尘器,-1,-1,null,null,null,null,5), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,2,0000-00-00 12:18:10,保温杯,-1,-1,null,null,null,null,5), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,8,0000-00-00 12:58:49,null,-1,-1,59,20,null,null,3), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,3,0000-00-00 12:03:04,null,-1,-1,59,78,null,null,5), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,4,0000-00-00 12:02:11,null,-1,-1,22,56,null,null,1), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,2,0000-00-00 12:31:47,null,-1,-1,98,64,null,null,6), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,3,0000-00-00 12:40:32,null,-1,-1,14,89,null,null,8), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,5,0000-00-00 12:47:58,null,-1,-1,null,null,36,75,9), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,3,0000-00-00 12:28:19,null,-1,-1,96,79,null,null,7), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,8,0000-00-00 12:31:50,null,83,3,null,null,null,null,3), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,7,0000-00-00 12:11:05,机器学习,-1,-1,null,null,null,null,5), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,6,0000-00-00 12:58:18,null,-1,-1,null,null,66,25,0), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,9,0000-00-00 12:14:06,null,24,97,null,null,null,null,9), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,4,0000-00-00 12:56:48,null,-1,-1,32,46,null,null,9), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,1,0000-00-00 12:50:12,null,33,84,null,null,null,null,2), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,7,0000-00-00 12:32:32,null,-1,-1,4,15,null,null,5), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,7,0000-00-00 12:02:53,吸尘器,-1,-1,null,null,null,null,5), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,0,0000-00-00 12:04:02,null,-1,-1,44,80,null,null,4), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,1,0000-00-00 12:36:43,null,-1,-1,null,null,60,54,3), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,8,0000-00-00 12:58:52,华为手机,-1,-1,null,null,null,null,5), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,6,0000-00-00 12:54:11,洗面奶,-1,-1,null,null,null,null,5), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,6,0000-00-00 12:09:19,null,42,67,null,null,null,null,2), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,8,0000-00-00 12:44:23,null,44,17,null,null,null,null,7), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,6,0000-00-00 12:09:38,卫生纸,-1,-1,null,null,null,null,8), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,1,0000-00-00 12:05:18,null,18,74,null,null,null,null,1), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,2,0000-00-00 12:24:06,卫生纸,-1,-1,null,null,null,null,4), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,6,0000-00-00 12:32:27,null,74,85,null,null,null,null,8), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,9,0000-00-00 12:18:19,null,-1,-1,null,null,12,89,4), 
UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,3,0000-00-00 12:26:21,保温杯,-1,-1,null,null,null,null,6), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,5,0000-00-00 12:41:48,联想笔记本,-1,-1,null,null,null,null,2), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,7,0000-00-00 12:04:17,null,12,79,null,null,null,null,5), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,0,0000-00-00 12:17:24,null,-1,-1,93,98,null,null,1), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,7,0000-00-00 12:27:52,null,-1,-1,null,null,58,44,8), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,9,0000-00-00 12:34:27,null,-1,-1,null,null,52,69,7), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,3,0000-00-00 12:16:49,null,-1,-1,null,null,13,90,1), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,4,0000-00-00 12:37:47,null,-1,-1,86,67,null,null,7), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,8,0000-00-00 12:10:56,null,-1,-1,null,null,62,31,7), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,0,0000-00-00 12:01:41,null,4,0,null,null,null,null,7), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,3,0000-00-00 12:48:00,null,-1,-1,87,86,null,null,6)))
   *
   * step3: the aggregated info string I want
   * Session_Id|Search_Keywords|Click_Category_Id|Visit_Length|Step_Length|Start_Time
   *
   * step4: the aggregated info string after joining the user table
   * Session_Id|Search_Keywords|Click_Category_Id|Visit_Length|Step_Length|Start_Time|Age|Professional|Sex|City
   *
   * @param sparkSession           the SparkSession
   * @param session2GroupActionRDD actions grouped by sessionId
   */
  def getSessionFullInfo(sparkSession: SparkSession, session2GroupActionRDD: RDD[(String, Iterable[UserVisitAction])]) = {  // do not declare a Unit return type here, otherwise the value returned below would be discarded

    //step3
    //userId2AggrInfoRDD: RDD[(userId, aggrInfo)]   key: userId  value: aggrInfo
    val userId2AggrInfoRDD = session2GroupActionRDD.map{
      case (sessionId,iterableAction)=>
        var userId = -1L

        var startTime:Date = null
        var endTime:Date = null

        var stepLength = 0

        val searchKeywords = new StringBuffer("")
        val clickCategories = new StringBuffer("")

        for(action <- iterableAction){

          //userId
          if(userId == -1L){
            userId = action.user_id
          }

          //actionTime: gradually widen the time window, e.g. [null,null] ---> [14:22:22, 14:26:30]
          val actionTime = DateUtils.parseTime(action.action_time) // time of this click action
          if(startTime == null || startTime.after(actionTime)){
            startTime = actionTime
          }
          if(endTime == null || endTime.before(actionTime)){
            endTime = actionTime
          }

          //searchKeyword
          val searchKeyword = action.search_keyword
          if(StringUtils.isNotEmpty(searchKeyword) && !searchKeywords.toString.contains(searchKeyword)){
            searchKeywords.append(searchKeyword+",")
          }

          //clickCategoryId: the id of a clicked product category
          val clickCategoryId = action.click_category_id
          if(clickCategoryId != -1 && !clickCategories.toString.contains(clickCategoryId)){
            clickCategories.append(clickCategoryId+",")
          }

          //stepLength
          stepLength += 1
        }

        // strip the trailing comma
        //searchKeywords.toString.substring(0,searchKeywords.toString.length)
        val searchKw = StringUtils.trimComma(searchKeywords.toString) // trims commas from both ends of the string
        val clickCg = StringUtils.trimComma(clickCategories.toString)

        val visitLength = (endTime.getTime - startTime.getTime) /1000

        val aggrInfo = Constants.FIELD_SESSION_ID+"="+sessionId+"|"+
        Constants.FIELD_SEARCH_KEYWORDS+"="+searchKw+"|"+
        Constants.FIELD_CLICK_CATEGORY_IDS+"="+clickCg+"|"+
        Constants.FIELD_VISIT_LENGTH+"="+visitLength+"|"+
        Constants.FIELD_STEP_LENGTH+"="+stepLength+"|"+
        Constants.FIELD_START_TIME+"="+DateUtils.formatTime(startTime) // format the start time (yyyy-MM-dd ...)

        //(sessionId,aggrInfo)
        // return (userId, aggrInfo): the next step joins the shared user_info table,
        // which has no sessionId column, so the common field userId is used as the key
        (userId,aggrInfo)
    }
    // (test print 3 would return userId2AggrInfoRDD here)
//    userId2AggrInfoRDD

    //step4: join userId2AggrInfoRDD with the user_info table to get the complete info RDD
    val sql = "select * from user_info"
    import sparkSession.implicits._
    //userId2InfoIdRDD: RDD[(userId, UserInfo)]
    val userId2InfoIdRDD: RDD[(Long, UserInfo)] = sparkSession.sql(sql).as[UserInfo].rdd.map(item => (item.user_id, item)) // UserInfo is the case class for the user_info table

   val sessionId2FullInfoRDD = userId2AggrInfoRDD.join(userId2InfoIdRDD).map{
     case (userId,(aggrInfo,userInfo)) =>
       val age = userInfo.age
       val professional = userInfo.professional
       val sex = userInfo.sex
       val city = userInfo.city

       val fullInfo = aggrInfo +"|"+
       Constants.FIELD_AGE + "=" + age +"|"+
       Constants.FIELD_PROFESSIONAL +"="+professional+"|"+
       Constants.FIELD_SEX + "=" +sex+"|"+
       Constants.FIELD_CITY+"="+city

       // userId was only needed for the join; now that aggregation is done, switch back to sessionId as the key
     val sessionId = StringUtils.getFieldFromConcatString(aggrInfo,"\\|",Constants.FIELD_SESSION_ID) // extract one field from the concatenated string


     // return
       (sessionId,fullInfo)
   }

    // 4: return the complete data
    sessionId2FullInfoRDD

  }



  def getOriActionRDD(sparkSession: SparkSession, taskParam: JSONObject) = {
    // extract the parameters from the JSON object via ParamUtils.getParam
    val startDate = ParamUtils.getParam(taskParam,Constants.PARAM_START_DATE)
    val endDate = ParamUtils.getParam(taskParam,Constants.PARAM_END_DATE)

    val sql = "select * from user_visit_action where date>='" + startDate + "' and date<='" + endDate + "'"

    // !!! implicit conversions needed for .as[UserVisitAction]
    import sparkSession.implicits._
    sparkSession.sql(sql).as[UserVisitAction].rdd
  }


}
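
The rows read by getOriActionRDD are instances of UserVisitAction from commons.model, whose definition is not shown in this post. From the fields referenced in this file (session_id, user_id, action_time, search_keyword, click_category_id, order_category_ids, pay_category_ids) and the 13-column sample rows in the comment below, it presumably looks roughly like the sketch here; the names marked as guesses are not confirmed by the listing:

// Rough sketch of commons.model.UserVisitAction, inferred from usage and sample data.
case class UserVisitAction(
                            date: String,               // e.g. "0000-00-00" in the sample rows
                            user_id: Long,
                            session_id: String,
                            page_id: Long,              // guessed name
                            action_time: String,
                            search_keyword: String,
                            click_category_id: Long,
                            click_product_id: Long,     // guessed name
                            order_category_ids: String,
                            order_product_ids: String,  // guessed name
                            pay_category_ids: String,
                            pay_product_ids: String,    // guessed name
                            city_id: Long               // guessed name
                          )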

/*

    step1: confirm the data was loaded
    actionRDD.foreach(println(_))
UserVisitAction(0000-00-00,21,d95c6c3cd7164e45ad483525b2132577,3,0000-00-00 16:13:29,null,-1,-1,55,62,null,null,2)
UserVisitAction(0000-00-00,21,d95c6c3cd7164e45ad483525b2132577,5,0000-00-00 16:57:26,null,3,93,null,null,null,null,8)
UserVisitAction(0000-00-00,21,d95c6c3cd7164e45ad483525b2132577,2,0000-00-00 16:34:28,null,-1,-1,null,null,48,35,1)


    step2: grouped output - the key is the sessionId, the value is a CompactBuffer of all of its actions
    session2GroupActionRDD.foreach(println(_))
(4bc33302668f4331aba52c8328a781c7,CompactBuffer(UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,6,0000-00-00 12:45:35,联想笔记本,-1,-1,null,null,null,null,0), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,5,0000-00-00 12:46:24,吸尘器,-1,-1,null,null,null,null,5), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,2,0000-00-00 12:18:10,保温杯,-1,-1,null,null,null,null,5), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,8,0000-00-00 12:58:49,null,-1,-1,59,20,null,null,3), UserVisitAction(0000-00-00,95,4bc33302668f4331aba52c8328a781c7,3,0000-00-00 12:03:04,null,-1,-1,59,78,null,null,5), UserVisitAction(0000-00-


    step3
   * Session_Id|Search_Keywords|Click_Category_Id|Visit_Length|Step_Length|Start_Time
(13,sessionid=a300de33934d49c4b02c1525879454bd|searchKeywords=吸尘器,保温杯,Lamer,华为手机,机器学习,洗面奶,小龙虾|clickCategoryIds=26,38,30,17,6,83,40,65,41,4,34,97,41,34,60,82,38,11,79,42,99,35,47,80,1,80,99,64,2|visitLength=3491|stepLength=94|startTime=0000-00-00)
(25,sessionid=5ddf7f6d6c9d485db9287b5ef34e077e|searchKeywords=苹果,吸尘器,保温杯|clickCategoryIds=28,59,96,29,91,21|visitLength=3322|stepLength=21|startTime=0000-00-00)
(41,sessionid=838220c6e46445d7bad52f18d3171bd8|searchKeywords=吸尘器,苹果,保温杯,洗面奶,机器学习,Lamer,华为手机|clickCategoryIds=29,80,46,79,6,98,80,34,41,30,41,1,34,64,13,98,1|visitLength=3458|stepLength=52|startTime=0000-00-00)


    step4: aggregation fully complete
   * Session_Id|Search_Keywords|Click_Category_Id|Visit_Length|Step_Length|Start_Time|Age|Professional|Sex|City
(79bfe53461f146cc836f7219351588f0,sessionid=79bfe53461f146cc836f7219351588f0|searchKeywords=卫生纸,联想笔记本,华为手机,小龙虾,洗面奶,吸尘器,苹果,Lamer|clickCategoryIds=8,49,14,13,36,41,27,82,45,47,7,41,32|visitLength=3456|stepLength=67|startTime=0000-00-00|age=22|professional=professional2|sex=male|city=city57)
(228cf3d0777749799436de675ff441a4,sessionid=228cf3d0777749799436de675ff441a4|searchKeywords=洗面奶,卫生纸,联想笔记本,苹果,Lamer,机器学习,小龙虾,吸尘器,保温杯,华为手机|clickCategoryIds=15,90,38,45,66,28,12,15,81,58,62,90,94,90,82,99,11,69|visitLength=3432|stepLength=8


   step5: after filtering and updating the accumulator
   0000-00-00 00:00:00,740   INFO --- [           Executor task launch worker for task 25]  org.apache.spark.storage.ShuffleBlockFetcherIterator                            (line:   54)  :  Started 0 remote fetches in 6 ms
(4fe6f3eaf9914578bc03308618bdd7a5,sessionid=4fe6f3eaf9914578bc03308618bdd7a5|searchKeywords=苹果,洗面奶,保温杯,机器学习,Lamer,联想笔记本,华为手机,小龙虾|clickCategoryIds=12,92,10,25,75,28,28,18,51,92,39,63,82,70,60,86|visitLength=3391|stepLength=54|startTime=0000-00-00|age=47|professional=professional40|sex=male|city=city28)
(fa00c11085ec4e8eb409ddc35e351902,sessionid=fa00c11085ec4e8eb409ddc35e351902|searchKeywords=小龙虾,洗面奶,华为手机,苹果,联想笔记本,保温杯,机器学习|clickCategoryIds=51,2,84,89,97,82,79|visitLength=3400|stepLength=37|startTime=0000-00-00|age=29|professional=professional0|sex=male|city=city92)



    step6: compute the ratios and store them in the database;
    the commerce database then contains a table called session_stat_ratio_0416

 */
