1. group后在内部sort
// Remove adjacent duplicate elements within each user's session:
// sort the collected (poiId, ts) pairs by timestamp, then drop every element
// whose poiId equals the immediately preceding one. Returns the surviving
// poiIds in chronological order; an empty input yields an empty array
// (the original `.head` would have thrown on an empty list).
val deleteRepeatAndSort = udf(f = (list: mutable.WrappedArray[Row]) => {
  // Extract (poiId, ts) from each struct row and order chronologically.
  val sortList = list.map { case Row(poiId: String, ts: Long) => (poiId, ts) }.
    sortBy(_._2).toList
  sortList match {
    case Nil => Array.empty[String] // guard: collect_list may be empty after upstream filters
    case head :: _ =>
      // sliding(2) pairs each element with its successor; keep the successor
      // only when its poiId differs — i.e. drop consecutive repeats.
      val deleteRepeat = sortList.sliding(2).collect { case Seq(a, b) if a._1 != b._1 => b }.toList
      // The first (earliest) element is always kept.
      (head :: deleteRepeat).map(_._1).toArray
  }
})
// ...
// Build per-user session candidates: keep only rows whose tag starts with the
// literal "keyword" marker, gather each user's (poi_query, ts) structs into a
// list, then sort + de-duplicate adjacent repeats via the UDF.
val keywordRows = spark.sql(sql)
  .na.drop()
  .filter($"tag".startsWith("\"keyword\""))
val df = keywordRows
  .groupBy("userid")
  .agg(collect_list(struct($"poi_query", $"ts")).alias("item_list"))
  .withColumn("session", deleteRepeatAndSort($"item_list"))
- rdd join 后的map处理方法
val allDaodianSession: RDD[(String, Seq[OfflineItemWithTs])] = ...
val originFeatures: RDD[(String, String)] = ...
// NOTE: because this is a leftOuterJoin, the right-side value arrives wrapped
// in an Option inside the map step;
// isDefined is used here to test whether a match was found.
// NOTE(review): isDefined + .get works but pattern matching on
// Some/None (or .fold/.map + getOrElse) would be the idiomatic form — confirm
// before changing, since the elided (...) branches are not visible here.
val finalRDD = originFeatures.leftOuterJoin(allDaodianSession).map {
  case (userId: String, (data: String, daodianSessionOpt: Option[Seq[OfflineItemWithTs]])) => {
    if (daodianSessionOpt.isDefined) { // matched: the right side produced a value
      val daodianSession = daodianSessionOpt.get
      ...
    }
    ...
  }