Spark: common code snippets

1. Sorting within each group after groupBy


  import scala.collection.mutable
  import org.apache.spark.sql.Row
  import org.apache.spark.sql.functions.udf

  // Remove adjacent duplicate elements within each group, after sorting by timestamp
  val deleteRepeatAndSort = udf { (list: mutable.WrappedArray[Row]) =>
    // Sort the collected (poi_query, ts) structs by timestamp
    val sortList = list.map { case Row(poiId: String, ts: Long) => (poiId, ts) }
      .sortWith((a, b) => a._2 < b._2)
      .toList

    // sliding(2) emits adjacent pairs; keep b only when its id differs from a's
    val deleteRepeat = sortList.sliding(2)
      .collect { case Seq(a, b) if a._1 != b._1 => b }
      .toList

    // the first element is never emitted by the pairwise pass, so prepend it back
    (sortList.head :: deleteRepeat).map(_._1).toArray
  }
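
A quick sanity check of the sliding/dedupe pattern on plain tuples (sample values invented): sliding(2) only ever emits the second element of each pair, which is why the head must be prepended back.

  val sorted = List(("A", 1L), ("A", 2L), ("B", 3L), ("B", 4L), ("A", 5L))
  val dedup = sorted.head ::
    sorted.sliding(2).collect { case Seq(a, b) if a._1 != b._1 => b }.toList
  // dedup == List(("A",1L), ("B",3L), ("A",5L))
  // adjacent duplicates are removed; non-adjacent repeats ("A" again at ts=5) are kept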

// ...
 val df = spark.sql(sql).na.drop()
   .filter($"tag".startsWith("\"keyword\""))
   .groupBy("userid")
   .agg(collect_list(struct($"poi_query", $"ts")).alias("item_list"))
   .withColumn("session", deleteRepeatAndSort($"item_list"))
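
A self-contained sketch of the pipeline on toy data (the SparkSession setup and sample rows are assumptions for illustration, not from the original job):

  import org.apache.spark.sql.SparkSession
  import org.apache.spark.sql.functions.{collect_list, struct}

  val spark = SparkSession.builder().master("local[*]").appName("demo").getOrCreate()
  import spark.implicits._

  val events = Seq(
    ("u1", "poiA", 1L), ("u1", "poiA", 2L), ("u1", "poiB", 3L),
    ("u2", "poiC", 4L)
  ).toDF("userid", "poi_query", "ts")

  val sessions = events
    .groupBy("userid")
    .agg(collect_list(struct($"poi_query", $"ts")).alias("item_list"))
    .withColumn("session", deleteRepeatAndSort($"item_list"))
  // u1 -> [poiA, poiB]  (adjacent poiA repeat collapsed), u2 -> [poiC]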
2. Handling the map after an RDD join
val allDaodianSession: RDD[(String, Seq[OfflineItemWithTs])] = ...
val originFeatures: RDD[(String, String)] = ...

// Note: because this is a leftOuterJoin, the right-side value arrives
// wrapped in an Option; use isDefined to check whether the join matched.
val finalRDD = originFeatures.leftOuterJoin(allDaodianSession).map {
  case (userId: String, (data: String, daodianSessionOpt: Option[Seq[OfflineItemWithTs]])) =>
    if (daodianSessionOpt.isDefined) { // the join found a match
      val daodianSession = daodianSessionOpt.get
      ...
    }
    ...
}
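
Equivalently, the Option can be unwrapped by pattern matching instead of isDefined/get, which avoids an unchecked .get; a sketch, assuming the result is a (userId, data, session) tuple with an empty session as the fallback:

  val finalRDD2 = originFeatures.leftOuterJoin(allDaodianSession).map {
    case (userId, (data, Some(daodianSession))) =>
      // the join found a matching session on the right side
      (userId, data, daodianSession)
    case (userId, (data, None)) =>
      // no match: fall back to an empty session
      (userId, data, Seq.empty[OfflineItemWithTs])
  }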
