Implementation using Spark SQL:
def sparkSqlFunc(testDf: DataFrame): DataFrame = {
  testDf.createOrReplaceTempView("test_view")
  // rank each record within its logid group by log_timestamp desc,
  // then collect "logid|_^rank" strings per uid
  val sql =
    """
      | SELECT uid, collect_list(concat_ws("|_^", logid, rank)) logids
      | FROM (
      |   SELECT uid, logid,
      |          row_number() over(partition by logid order by log_timestamp desc) rank
      |   FROM test_view
      | ) s
      | GROUP BY uid
      |""".stripMargin
  testDf.sparkSession.sql(sql)
}
Implementation using DataFrame repartition + mapPartitions:
// requires `import spark.implicits._` for the tuple encoder used by mapPartitions
testDf
  .repartition(context.offlineThemisConf.getNumPartitions, testDf("logid"))
  .sortWithinPartitions("log_timestamp")
  .mapPartitions(rows => {
    // track, within this partition, how many times each logid has been seen so far (its rank)
    val logidSeqMap = new mutable.HashMap[String, Long]()
    val logWithUidSeq = new mutable.ListBuffer[(String, String, Long)]()
    rows.filter(row => row != null && row.length > 0)
      .foreach(row => {
        val logid: String = row.getAs("logid")
        val uid: String = row.getAs("uid")
        val frequency = logidSeqMap.getOrElse(logid, 0L) + 1
        logidSeqMap.put(logid, frequency)
        logWithUidSeq.append((uid, logid, frequency))
      })
    logWithUidSeq.iterator
  }).filter(_ != null)
  // the tuple columns are (_1 = uid, _2 = logid, _3 = rank)
  .withColumnRenamed("_1", "uid")
  .withColumn("logids", map(
    lit("logid"), col("_2"),
    lit("logid_rank"), col("_3")
  ))
  .groupBy("uid")
  .agg(collect_list("logids").alias("logid_map"))
Implementation using RDD repartitionAndSortWithinPartitions:
testDf
  .rdd
  .map(line => {
    // key: (logid, log_timestamp) used for partitioning and in-partition sorting; value: uid
    (FeaturePartitionKey(
      logid = line.getAs[String]("logid"),
      logTimestamp = line.getAs[String]("log_timestamp").toLong),
      line.getAs[String]("uid"))
  })
  .repartitionAndSortWithinPartitions(
    new FeaturePartitioner(1000))
  .mapPartitions(iter => {
    val viewSeqMap = new mutable.HashMap[String, Long]()
    val logWithViewSeqBuf = new mutable.ArrayBuffer[(String, String)]()
    iter.filter(row => row != null)
      .foreach(row => {
        val features: FeaturePartitionKey = row._1
        val uid: String = row._2
        val frequency: Long = viewSeqMap.getOrElse(features.logid, 0L) + 1
        viewSeqMap.put(features.logid, frequency)
        logWithViewSeqBuf.append((uid, features.logid + "|_^" + frequency))
      })
    logWithViewSeqBuf.iterator
  })
  .aggregateByKey(Array[String](), context.offlineThemisConf.getNumPartitions)(
    (aggArray: Array[String], feaStr: String) => aggArray :+ feaStr,
    (firArray: Array[String], secArray: Array[String]) => firArray ++ secArray)
  .toDF("uid", "logids")
case class FeaturePartitionKey(logid: String,
                               logTimestamp: Long)

// Order keys by logTimestamp (used by repartitionAndSortWithinPartitions)
object FeaturePartitionKey {
  implicit def orderingByTimestamp[A <: FeaturePartitionKey]: Ordering[A] = {
    Ordering.by((fk: A) => fk.logTimestamp)
  }
}
import org.apache.spark.Partitioner

class FeaturePartitioner(val partitions: Int) extends Partitioner {
  require(partitions >= 0, s"Number of partitions ($partitions) cannot be negative.")
  override def numPartitions: Int = partitions
  // partition solely by the hash of logid, so all records with the same logid land in the same partition
  override def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[FeaturePartitionKey]
    Math.abs(k.logid.hashCode % numPartitions)
  }
}
As the code shows, the partitioner takes the hashCode of logid and then applies % to pick a partition. Once the data volume gets large, hash collisions become a problem: different logids end up mapped to the same partition.
We mentioned earlier that repartition internally hashes with Murmur3Hash, which cannot avoid this problem either. So neither repartition nor repartitionAndSortWithinPartitions can completely avoid hashCode collisions. What can we do?
In that case, we can reduce the hashCode collision rate with the following approach:
When partitioning on several fields, compute the hashCode of each field separately, sum the hash values, and only then apply the % operation. This greatly reduces hashCode collisions:
val strHashcode = uid.hashCode + uname.hashCode + age.hashCode
Math.abs(strHashcode % numPartitions)
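Wired into a custom partitioner, following the FeaturePartitioner pattern above (and reusing the org.apache.spark.Partitioner import), this could look like the sketch below; MultiFieldPartitionKey and its uid/uname/age fields are hypothetical placeholders for whatever fields you actually partition on:
// hypothetical key carrying the fields to partition on
case class MultiFieldPartitionKey(uid: String, uname: String, age: Int)

class MultiFieldPartitioner(val partitions: Int) extends Partitioner {
  require(partitions > 0, s"Number of partitions ($partitions) must be positive.")
  override def numPartitions: Int = partitions
  override def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[MultiFieldPartitionKey]
    // hash each field separately, sum the hashes, then take the modulo
    val strHashcode = k.uid.hashCode + k.uname.hashCode + k.age.hashCode
    Math.abs(strHashcode % numPartitions)
  }
}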
In our tests, this approach reduced the probability of hashCode collisions by more than 20x, greatly lowering the duplication rate.
Of course, you can also try taking an MD5 of the key first and then the hashCode of the digest; the effect is similar to the result above.
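A minimal sketch of that MD5-then-hashCode variant, assuming the partition key can be rendered as a single string (MessageDigest is from the JDK; the partitioner again reuses org.apache.spark.Partitioner imported above):
import java.security.MessageDigest

class Md5Partitioner(val partitions: Int) extends Partitioner {
  require(partitions > 0, s"Number of partitions ($partitions) must be positive.")
  override def numPartitions: Int = partitions
  override def getPartition(key: Any): Int = {
    // MD5 the key's string form, then hash the hex digest and take the modulo
    val md5Hex = MessageDigest.getInstance("MD5")
      .digest(key.toString.getBytes("UTF-8"))
      .map("%02x".format(_))
      .mkString
    Math.abs(md5Hex.hashCode % numPartitions)
  }
}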