import breeze.linalg.max
import breeze.numerics.{pow, sqrt}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.sql.functions.{rank, desc}

/**
 * Created by gavinzjchao on 2016/1/8.
 *
 * Computes user-to-user cosine similarity from a small in-memory list of
 * "user,video,score" records and prints, for every user, the most similar
 * other users as (user, otherUser, similarity) triples.
 *
 * Pipeline:
 *  1. parse each record into (user, video, score);
 *  2. compute each user's vector norm sqrt(s1^2 + s2^2 + ... + sn^2);
 *  3. self-join on video to pair up users who rated the same video;
 *  4. sum score1 * score2 per (user1, user2) pair (the dot product);
 *  5. divide the dot product by the product of both norms;
 *  6. keep the highest-scoring neighbours per user.
 */
object SparkTest008 {

  /**
   * Runs the similarity job on a local Spark master and prints the result.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("UserSimilarity")
    val sc = new SparkContext(conf)

    // Number of most-similar users kept for each user.
    val topN = 2

    // Stop the context even when a stage fails, so it is never leaked.
    try {
      val data = sc.parallelize(List("u1,v1,2", "u1,v2,1", "u1,v3,2", "u2,v1,3", "u2,v3,4", "u2,v4,1", "u2,v2,9", "u3,v2,9"))

      // get (user, video, score); the score is parsed once here instead of at every use
      val rddUserRating = data.map { line =>
        line.trim().split(",") match {
          case Array(user, video, score) => (user, video, score.toDouble)
          case _ =>
            // Fail with the offending record instead of an anonymous MatchError.
            throw new IllegalArgumentException(s"Malformed rating record (expected user,video,score): $line")
        }
      }

      // get user's score square sum: sqrt(s1^2 + s2^2 + ... + sn^2)
      val rddUserScoreSum = rddUserRating
        .map { case (user, _, score) => (user, pow(score, 2)) }
        .reduceByKey(_ + _)
        .map { case (user, squareSum) => (user, sqrt(squareSum)) }

      // get <video, (user, score)>
      val rddVideoInfo = rddUserRating.map { case (user, video, score) => video -> (user, score) }

      // get <video, ((user1, score1), (user2, score2))>, dropping pairs of a user with itself.
      // Both (a, b) and (b, a) are kept on purpose: each user needs its own neighbour list.
      val rddUserPairs = rddVideoInfo.join(rddVideoInfo)
        .filter { case (_, ((user1, _), (user2, _))) => user1 != user2 }

      // get score1 * score2 and reduce by key (user1, user2) and get sum
      val rddUserPairScore = rddUserPairs
        .map { case (_, ((user1, score1), (user2, score2))) => (user1, user2) -> score1 * score2 }
        .reduceByKey(_ + _)

      // attach user1's norm (keyed by user1 for the join)
      val rddSimilarityTmp = rddUserPairScore
        .map { case ((user1, user2), productScore) => user1 -> (user2, productScore) }
        .join(rddUserScoreSum)

      // attach user2's norm (re-keyed by user2 for the join)
      val rddSimilarity = rddSimilarityTmp
        .map { case (user1, ((user2, productScore), norm1)) => user2 -> ((user1, norm1), productScore) }
        .join(rddUserScoreSum)

      // get cos similarity
      val userSimilarity = rddSimilarity.map {
        case (user2, (((user1, norm1), productScore), norm2)) =>
          val denominator = norm1 * norm2
          // A user whose scores are all zero has a zero norm; report 0.0 instead of NaN,
          // which would also make the ordering below undefined.
          val similarity = if (denominator == 0.0) 0.0 else productScore / denominator
          (user1, (user2, similarity))
      }

      // Top N within each group algorithm
      val topSimilarUsers = userSimilarity.groupByKey()
        .map { case (user, neighbours) => (user, neighbours.toList.sortBy(_._2).reverse.take(topN)) }
        .flatMap { case (user, best) => best.map { case (other, similarity) => (user, other, similarity) } }

      // Bring the (small) result to the driver before printing: RDD.foreach runs on the
      // executors, so its println output is not guaranteed to reach the driver's stdout.
      topSimilarUsers.collect().foreach(println)
    } finally {
      sc.stop()
    }
  }
}