Sample input (user,item,score):
u1,item1,0.0913375062480507
u2,item1,0.4061823571029518
u3,item1,0.021727289053235843
u4,item1,0.24172510761164112
u5,item1,0.7898802150245668
u6,item1,0.2166110282064876
Sample output (user, list of top-10 (item,score) pairs):
(u9,List((item74540,0.9999953894581668), (item76768,0.9999930103445652), (item21169,0.9999889286058848), (item4820,0.9999782306748293), (item85543,0.9999663834573093), (item8372,0.9999487766494871), (item99252,0.9999365275502845), (item23653,0.9999347307884792), (item19615,0.9999236599402785), (item30399,0.999918672799968)))
(u18,List((item48113,0.9999984432903763), (item44728,0.9999823700583934), (item65298,0.9999721951269472), (item11426,0.9999686624512639), (item72669,0.9999525503292274), (item36801,0.9999334853565013), (item49233,0.9999283335977657), (item67481,0.9999041428222344), (item47549,0.9998546064810947), (item66968,0.999842604478957)))
/**
 * First collect all of each user's items, sort each user's list by score, then take the top k
 * @param sc
 * @param input
 * @param output
 * @param k
 * @param num number of input partitions
 */
def fun1(sc: SparkContext, input: String, output: String, k: Int, num: Int) = {
  val start = System.currentTimeMillis
  sc.textFile(input, num).map { x => val f = x.split(","); (f(0), (f(1), f(2).toDouble)) }.
    combineByKey((x: (String, Double)) => List(x),
      (c: List[(String, Double)], x: (String, Double)) => c :+ x,
      (c1: List[(String, Double)], c2: List[(String, Double)]) => c1 ::: c2).
    map(x => (x._1, x._2.sortWith((x, y) => x._2 > y._2).take(k))). // sort descending by score, keep top k
    saveAsTextFile(output)
  println("fun1,k: " + k + ",input:" + input + ",num:" + num + "---> time:" +
    (System.currentTimeMillis - start) * 1.0 / 1000 + " s")
}
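To make the combineByKey step in fun1 concrete, here is a minimal sketch on a tiny in-memory dataset (the toy triples and the local sc are assumptions for illustration only):

// Minimal sketch: group (item, score) pairs per user with combineByKey,
// then sort each user's list by score descending and keep the top 2.
val toy = sc.parallelize(Seq(
  ("u1", ("item1", 0.9)), ("u1", ("item2", 0.3)), ("u1", ("item3", 0.7)),
  ("u2", ("item1", 0.1)), ("u2", ("item2", 0.8))))
val top2 = toy.combineByKey(
    (x: (String, Double)) => List(x),                                       // start a list for a new user
    (c: List[(String, Double)], x: (String, Double)) => c :+ x,             // add a pair to an existing list
    (c1: List[(String, Double)], c2: List[(String, Double)]) => c1 ::: c2). // merge lists across partitions
  map { case (user, items) => (user, items.sortWith(_._2 > _._2).take(2)) }
top2.collect.foreach(println)
// e.g. (u1,List((item1,0.9), (item3,0.7)))
//      (u2,List((item2,0.8), (item1,0.1)))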
/**
 * First collect all of each user's items, then use a bounded priority queue (heap) to take the top k
 * @param sc
 * @param input
 * @param output
 * @param k
 * @param num number of input partitions
 */
def fun2(sc: SparkContext, input: String, output: String, k: Int, num: Int) = {
  val start = System.currentTimeMillis
  sc.textFile(input, num).map { x => val f = x.split(","); (f(0), (f(1), f(2).toDouble)) }.
    combineByKey((x: (String, Double)) => List(x),
      (c: List[(String, Double)], x: (String, Double)) => c :+ x,
      (c1: List[(String, Double)], c2: List[(String, Double)]) => c1 ::: c2).
    map(x => (x._1, MyUtils.top10(x._2, k))).saveAsTextFile(output)
  println("fun2,k: " + k + ",input:" + input + ",num:" + num + "---> time:" +
    (System.currentTimeMillis - start) * 1.0 / 1000 + " s")
}
This uses MyUtils, a helper class modeled on Spark's own internal implementation; it takes the top-k values with a bounded priority queue (heap), as follows:
package org.apache.spark.util

// placed inside the org.apache.spark.util package because BoundedPriorityQueue
// and collection.Utils are private[spark]
import org.apache.spark.util

/**
 * Created by fansy on 2016/12/13.
 */
object MyUtils {
  implicit val sortStringDoubleByDouble = new Ordering[(String, Double)] {
    // compare by score descending; Double.compare avoids the precision loss of
    // (b._2 - a._2).toInt, which returns 0 for any difference smaller than 1
    override def compare(a: (String, Double), b: (String, Double)) = b._2.compare(a._2)
  }

  /**
   * Take the k largest values with a bounded priority queue (heap)
   * @param items
   * @param k
   * @return
   */
  def top10(items: List[(String, Double)], k: Int): List[(String, Double)] = {
    val queue = new BoundedPriorityQueue[(String, Double)](k)(sortStringDoubleByDouble)
    queue ++= util.collection.Utils.takeOrdered(items.iterator, k)(sortStringDoubleByDouble)
    queue.toList
  }
}
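A quick usage check of MyUtils.top10 (a minimal sketch; the values are made up, and the result is re-sorted for display because the queue's iteration order is not sorted):

import org.apache.spark.util.MyUtils

// Take the 3 highest-scored items from a small list.
val items = List(("item1", 0.91), ("item2", 0.13), ("item3", 0.75),
                 ("item4", 0.42), ("item5", 0.88))
val top3 = MyUtils.top10(items, 3).sortWith(_._2 > _._2)
println(top3) // List((item1,0.91), (item5,0.88), (item3,0.75))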
Note the following when defining the key class:
/**
 * Composite key sorted by d only: within a partition every record has the
 * same s, so comparing d is sufficient
 * @param s user
 * @param d score
 */
case class StringDoubleKey(s: String, d: Double)

object StringDoubleKey {
  // order keys by descending score, so repartitionAndSortWithinPartitions
  // (which sorts ascending by this Ordering) puts the highest score first
  implicit def orderingByDouble[A <: StringDoubleKey]: Ordering[A] = {
    Ordering.by(x => -x.d)
  }
}
The key is a composite key holding the user and the score. The comparison defined here only looks at the score, since records must be ordered from highest to lowest score. Note that the user does not need to be part of the comparison: the sort is performed within each partition, and each partition holds exactly one user, so only the score matters (which is why the design of the partitioner below is critical).
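As a quick check, the implicit ordering from the companion object sorts keys from high to low score (a minimal sketch with toy values):

// sorted picks up StringDoubleKey's implicit Ordering (descending by d)
val keys = List(StringDoubleKey("u1", 0.2), StringDoubleKey("u1", 0.9), StringDoubleKey("u1", 0.5))
println(keys.sorted)
// List(StringDoubleKey(u1,0.9), StringDoubleKey(u1,0.5), StringDoubleKey(u1,0.2))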
/**
 * Partitioner that routes records by the s (user) field of the key;
 * every user must end up in exactly one partition
 * @param userNum mapping from user to partition id
 */
case class partition(userNum: Map[String, Int]) extends Partitioner {
  override def getPartition(key: Any): Int = {
    userNum(key.asInstanceOf[StringDoubleKey].s)
  }
  override def numPartitions: Int = userNum.size
}
Because each user has to map to its own partition, the constructor is given a map from user to partition id (the users in this map are distinct). getPartition casts the key to StringDoubleKey and uses only its s (user) field for the lookup.
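For example, the user list that fun3 collects on the driver is zipped with partition ids to build this map (a minimal sketch with toy users):

// Build the user -> partition-id map the same way fun3 does, then route a key.
val users = Array("u1", "u2", "u3")
val byUser = partition(users.zip(0 until users.size).toMap) // Map(u1 -> 0, u2 -> 1, u3 -> 2)
println(byUser.numPartitions)                               // 3
println(byUser.getPartition(StringDoubleKey("u2", 0.7)))    // 1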
/**
 * Put each user's data into its own partition, then take the top k within each partition
 * @param sc
 * @param input
 * @param output
 * @param k
 * @param num unused: the number of partitions is determined by the number of users
 */
def fun3(sc: SparkContext, input: String, output: String, k: Int, num: Int) = {
  var start = System.currentTimeMillis
  val data = sc.textFile(input).map { x => val f = x.split(","); (StringDoubleKey(f(0), f(2).toDouble), f(1)) }
  val userNum = data.map(x => x._1.s).distinct.collect // collect the distinct users on the driver
  println("fun3,k: " + k + ",input:" + input + ",num:" + num + "---> time:" +
    (System.currentTimeMillis - start) * 1.0 / 1000 + " s")
  start = System.currentTimeMillis()
  data.repartitionAndSortWithinPartitions(partition(userNum.zip(0 until userNum.size).toMap)).
    mapPartitions(topK(k)).saveAsTextFile(output)
  println("fun3,k: " + k + ",input:" + input + ",num:" + num + "---> time:" +
    (System.currentTimeMillis - start) * 1.0 / 1000 + " s")
}
/**
 * Each partition emits a single record:
 * (user, List((item,pref), (item,pref), ...)) with the user's top k items
 * @param k
 * @param iter records of one partition, already sorted by score descending
 * @return
 */
def topK(k: Int)(iter: Iterator[(StringDoubleKey, String)]): Iterator[(String, List[(String, Double)])] = {
  val pre = iter.next
  val user = pre._1.s
  var items_pref = List[(String, Double)]((pre._2, pre._1.d))
  // the records arrive sorted by score descending, so the first k are the top k;
  // append to keep the output list in descending order
  for (cur <- iter if items_pref.length < k) {
    items_pref = items_pref :+ ((cur._2, cur._1.d))
  }
  Array((user, items_pref)).iterator
}
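Because topK only needs an iterator, it can be checked locally without Spark (a minimal sketch feeding it one already-sorted single-user partition of toy data):

// One partition of records for u2, already sorted by score descending.
val recs = Iterator((StringDoubleKey("u2", 0.9), "item7"),
                    (StringDoubleKey("u2", 0.7), "item3"),
                    (StringDoubleKey("u2", 0.1), "item5"))
println(topK(2)(recs).toList)
// List((u2,List((item7,0.9), (item3,0.7))))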
The test data sets were generated with the snippets below:
sc.parallelize(for( i <- 1 to 100000; j <- 1 to 50) yield ("u"+j,"item"+i,Math.random),4).map(x => x._1+","+x._2+","+x._3).saveAsTextFile("/tmp/user1000_item3w")
sc.parallelize(for( i <- 1 to 30000; j <- 1 to 100) yield ("u"+j,"item"+i,Math.random),4).map(x => x._1+","+x._2+","+x._3).saveAsTextFile("/tmp/user300_item3w")
sc.parallelize(for( i <- 1 to 300000; j <- 1 to 10) yield ("u"+j,"item"+i,Math.random),4).map(x => x._1+","+x._2+","+x._3).saveAsTextFile("/tmp/user10_item30w")
sc.parallelize(for( i <- 1 to 500000; j <- 1 to 10) yield ("u"+j,"item"+i,Math.random),4).map(x => x._1+","+x._2+","+x._3).saveAsTextFile("/tmp/user10_item50w")
sc.parallelize(for( i <- 1 to 1000000; j <- 1 to 10) yield ("u"+j,"item"+i,Math.random),4).map(x => x._1+","+x._2+","+x._3).saveAsTextFile("/tmp/user10_item100w")
The approach below generates data faster: it parallelizes only the user list and builds the item rows inside flatMap on the executors, instead of materializing the whole user × item cross product on the driver before calling parallelize:
sc.parallelize(for( j <- 1 to 100000) yield "u"+j,32).flatMap(x => for(j <- 1 to 1000) yield (x,"item"+j,Math.random)).map(x => x._1+","+x._2+","+x._3).saveAsTextFile("/tmp/user10w_item1k")
The runs were driven by the following shell scripts:
#!/bin/bash
echo "Start..."
input=("user_100_item_3w" "user_20_item_10w" "user_50_item_10w")
method=(1 2)
partition=(4 8 12 16 20 24 28 32)
for i in ${input[@]}
do
  for m in ${method[@]}
  do
    for p in ${partition[@]}
    do
      echo "input:_$i ,method: $m ,par: _$p,"
      spark-submit --name "input :$i,method:fun$m,partition:$p" --class topk.TopK --master yarn --deploy-mode cluster --driver-memory 3G --executor-memory 3G --num-executors 8 top.jar "/tmp/$i" "/tmp/fun${m}_${i}_par${p}" $m 10 $p 1>/dev/null 2>/dev/null ;
    done
  done
done
#!/bin/bash
echo "Start..."
input=("user10_item30w" "user10_item50w" "user10_item100w")
method=(1 2)
partition=(16 20 24 28 32)
for i in ${input[@]}
do
  for m in ${method[@]}
  do
    for p in ${partition[@]}
    do
      echo "input:_$i ,method: $m ,par: _$p,"
      spark-submit --name "input :$i,method:fun$m,partition:$p" --class topk.TopK --master yarn --deploy-mode cluster --driver-memory 3G --executor-memory 3G --num-executors 8 top.jar "/tmp/$i" "/tmp/fun${m}_${i}_par${p}" $m 10 $p 1>/dev/null 2>/dev/null ;
    done
  done
done
#!/bin/bash
echo "Start..."
input=("user_100_item_3w" "user_20_item_10w" "user_50_item_10w" "user10_item30w" "user10_item50w" "user10_item100w")
method=(3)
partition=(4) # the partition argument has no effect for fun3
for i in ${input[@]}
do
  for m in ${method[@]}
  do
    for p in ${partition[@]}
    do
      echo "input:_$i ,method: $m ,par: _$p,"
      spark-submit --name "input :$i,method:fun$m,partition:$p" --class topk.TopK --master yarn --deploy-mode cluster --driver-memory 3G --executor-memory 3G --num-executors 8 top.jar "/tmp/$i" "/tmp/fun${m}_${i}_par${p}" $m 10 $p 1>/dev/null 2>/dev/null ;
    done
  done
done
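The submitted class is topk.TopK; its main method is not shown in this post, but from the argument order in the spark-submit lines it presumably looks roughly like the sketch below (only the argument order comes from the scripts; the SparkConf setup and the dispatch to fun1/fun2/fun3 are assumptions):

import org.apache.spark.{SparkConf, SparkContext}

object TopK {
  def main(args: Array[String]): Unit = {
    // assumed argument order, taken from the spark-submit calls above:
    // <input> <output> <method> <k> <numPartitions>
    val Array(input, output, method, k, num) = args
    val sc = new SparkContext(new SparkConf().setAppName("TopK"))
    method match {
      case "1" => fun1(sc, input, output, k.toInt, num.toInt)
      case "2" => fun2(sc, input, output, k.toInt, num.toInt)
      case "3" => fun3(sc, input, output, k.toInt, num.toInt)
    }
    sc.stop()
  }
}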
If you found this article useful and are willing to spare a moment, please cast your vote for me. Thanks!
http://blog.csdn.net/vote/candidate.html?username=fansy1990