图2.1 用户订阅矩阵
图2.2 用户-特征矩阵
图2.3 特征-电影矩阵
首先将样例数据上传到 HDFS。如果想在本地测试这些功能,至少需要 6 GB 内存;当然,也可以通过减少数据量来完成同样的测试。
/**
 * Command-line entry point for the recommender walkthrough.
 *
 * Loads the three raw text files (user/artist data, artist names, artist aliases)
 * and runs the preparation, model, evaluate and recommend steps in that order.
 *
 * Optional arguments; when omitted, the previously hard-coded values are used:
 *   - args(0): directory containing the three data files
 *              (for a cluster run, e.g. "hdfs:///user/ds/")
 *   - args(1): Spark checkpoint directory (for a cluster run, e.g. "hdfs:///tmp/")
 */
object RunRecommender {

  def main(args: Array[String]): Unit = {
    // File names are appended directly to the base, so make sure it ends with a slash.
    val base = args.headOption
      .map(dir => if (dir.endsWith("/")) dir else dir + "/")
      .getOrElse("E:/newcode/spark/aas/data/")
    val checkpointDir = args.lift(1).getOrElse("d:///tmp/")

    val conf = new SparkConf()
    conf.setMaster("local[*]")
    val spark = SparkSession.builder().config(conf).getOrCreate()

    try {
      // Optional, but may help avoid errors due to long lineage
      spark.sparkContext.setCheckpointDir(checkpointDir)

      val rawUserArtistData = spark.read.textFile(base + "user_artist_data.txt")
      val rawArtistData = spark.read.textFile(base + "artist_data.txt")
      val rawArtistAlias = spark.read.textFile(base + "artist_alias.txt")

      val runRecommender = new RunRecommender(spark)
      runRecommender.preparation(rawUserArtistData, rawArtistData, rawArtistAlias)
      runRecommender.model(rawUserArtistData, rawArtistData, rawArtistAlias)
      runRecommender.evaluate(rawUserArtistData, rawArtistAlias)
      runRecommender.recommend(rawUserArtistData, rawArtistData, rawArtistAlias)
    } finally {
      // Release the session even when one of the steps above fails.
      spark.stop()
    }
  }
}
/**
 * Prints a few sanity checks on the raw input data.
 *
 * Shows the first five raw user/artist lines, the min/max of the parsed user and
 * artist IDs, and the artist rows matching the first entry of the alias map.
 *
 * @param rawUserArtistData space-separated lines; the first two fields are parsed
 *                          as integer user and artist IDs
 * @param rawArtistData     raw lines handed to `buildArtistByID`
 * @param rawArtistAlias    raw lines handed to `buildArtistAlias`
 */
def preparation(
    rawUserArtistData: Dataset[String],
    rawArtistData: Dataset[String],
    rawArtistAlias: Dataset[String]): Unit = {

  rawUserArtistData.take(5).foreach(println)

  // Any fields after the first two are ignored.
  val userArtistDF = rawUserArtistData.map { line =>
    val Array(user, artist, _*) = line.split(' ')
    (user.toInt, artist.toInt)
  }.toDF("user", "artist")

  userArtistDF.agg(min("user"), max("user"), min("artist"), max("artist")).show()

  val artistByID = buildArtistByID(rawArtistData)
  val artistAlias = buildArtistAlias(rawArtistAlias)

  // The alias map may be empty (for example with a reduced data set), so do not
  // call `.head` on it; show the example pair only when there is one.
  artistAlias.headOption.foreach { case (badID, goodID) =>
    artistByID.filter($"id" isin (badID, goodID)).show()
  }
}
/**
 * Parses raw artist lines into an (id, name) DataFrame, dropping malformed lines.
 *
 * Each line is split at its first tab. Lines without a tab, and lines whose
 * ID part is not an integer, are skipped. The name is trimmed.
 *
 * @param rawArtistData raw lines, expected as "id<TAB>name"
 * @return a DataFrame with columns "id" and "name"
 */
def buildArtistByID(rawArtistData: Dataset[String]): DataFrame = {
  rawArtistData.flatMap { line =>
    val (id, name) = line.span(_ != '\t')
    if (name.isEmpty) {
      None
    } else {
      try {
        Some((id.toInt, name.trim))
      } catch {
        case _: NumberFormatException => None
      }
    }
  }.toDF("id", "name")
}

/**
 * Parses raw alias lines into a map from artist ID to alias ID, dropping
 * malformed lines.
 *
 * A line is kept only when it splits into exactly two tab-separated fields,
 * the first field is non-empty, and both fields are integers. Every other
 * line is skipped rather than failing the whole job.
 *
 * @param rawArtistAlias raw lines, expected as "artistId<TAB>aliasId"
 * @return the collected (artist ID -> alias ID) map
 */
def buildArtistAlias(rawArtistAlias: Dataset[String]): Map[Int,Int] = {
  rawArtistAlias.flatMap { line =>
    // Parsing is kept inline so the closure does not reference the enclosing class.
    val parsed: Option[(Int, Int)] = line.split('\t') match {
      case Array(artist, alias) if artist.nonEmpty =>
        try {
          Some((artist.toInt, alias.toInt))
        } catch {
          case _: NumberFormatException => None
        }
      // Wrong number of fields (String.split drops trailing empty fields, so
      // "123<TAB>" yields a single field) or an empty artist ID.
      case _ => None
    }
    parsed
  }.collect().toMap
}
模型训练好之后,代码为用户 2093760 生成了推荐结果。我的测试结果如下;由于代码使用随机生成的初始矩阵,每个人得到的结果都可能不一样。
Some((2814,50 Cent))
Some((1300642,The Game))
Some((1180,David Gray))
Some((813,Jurassic 5))
Some((1255340,The Saw Doctors))
/**
 * Computes the mean, over users, of a per-user AUC estimate.
 *
 * For each user, the held-out (user, artist) pairs are treated as positives and
 * an equal-sized (at most) sample of other artists as negatives. The per-user
 * value is the fraction of positive/negative pairs in which the positive
 * received the higher prediction.
 *
 * @param positiveData    held-out data; must have "user" and "artist" columns
 * @param bAllArtistIDs   broadcast array of artist IDs to sample negatives from
 * @param predictFunction scores a DataFrame of ("user", "artist") rows; its result
 *                        must contain "user" and a numeric "prediction" column
 * @return the average per-user AUC
 */
def areaUnderCurve(
    positiveData: DataFrame,
    bAllArtistIDs: Broadcast[Array[Int]],
    predictFunction: (DataFrame => DataFrame)): Double = {

  // What this actually computes is AUC, per user. The result is actually something
  // that might be called "mean AUC".

  // Take held-out data as the "positive".
  // Make predictions for each of them, including a numeric score
  val positivePredictions = predictFunction(positiveData.select("user", "artist")).
    withColumnRenamed("prediction", "positivePrediction")

  // BinaryClassificationMetrics.areaUnderROC is not used here since there are really lots of
  // small AUC problems, and it would be inefficient, when a direct computation is available.

  // Create a set of "negative" products for each user. These are randomly chosen
  // from among all of the other artists, excluding those that are "positive" for the user.
  val negativeData = positiveData.select("user", "artist").as[(Int,Int)].
    groupByKey { case (user, _) => user }.
    flatMapGroups { case (userID, userIDAndPosArtistIDs) =>
      val random = new Random()
      val posItemIDSet = userIDAndPosArtistIDs.map { case (_, artist) => artist }.toSet
      val negative = new ArrayBuffer[Int]()
      // Tracks IDs already sampled so that the same negative is not added twice.
      val alreadyChosen = scala.collection.mutable.HashSet.empty[Int]
      val allArtistIDs = bAllArtistIDs.value
      var i = 0
      // Make at most one pass over all artists to avoid an infinite loop.
      // Also stop when number of negative equals positive set size
      while (i < allArtistIDs.length && negative.size < posItemIDSet.size) {
        val artistID = allArtistIDs(random.nextInt(allArtistIDs.length))
        // Only add new distinct IDs: not a positive, and not sampled before
        // (HashSet.add returns false when the element was already present).
        if (!posItemIDSet.contains(artistID) && alreadyChosen.add(artistID)) {
          negative += artistID
        }
        i += 1
      }
      // Return the set with user ID added back
      negative.map(artistID => (userID, artistID))
    }.toDF("user", "artist")

  // Make predictions on the rest:
  val negativePredictions = predictFunction(negativeData).
    withColumnRenamed("prediction", "negativePrediction")

  // Join positive predictions to negative predictions by user, only.
  // This will result in a row for every possible pairing of positive and negative
  // predictions within each user.
  val joinedPredictions = positivePredictions.join(negativePredictions, "user").
    select("user", "positivePrediction", "negativePrediction").cache()

  // Count the number of pairs per user
  val allCounts = joinedPredictions.
    groupBy("user").agg(count(lit("1")).as("total")).
    select("user", "total")
  // Count the number of correctly ordered pairs per user
  val correctCounts = joinedPredictions.
    filter($"positivePrediction" > $"negativePrediction").
    groupBy("user").agg(count("user").as("correct")).
    select("user", "correct")

  // Combine these, compute their ratio, and average over all users.
  // Users with no correctly ordered pair are missing from correctCounts, hence the
  // left outer join and the coalesce to 0.
  val meanAUC = allCounts.join(correctCounts, Seq("user"), "left_outer").
    select($"user", (coalesce($"correct", lit(0)) / $"total").as("auc")).
    agg(mean("auc")).
    as[Double].first()

  joinedPredictions.unpersist()

  meanAUC
}
完整代码下载:RunRecommender.scala