Spark 2.x 中,当 DataFrame 的某列保存的是结构体数组(array<struct>)时,如何取出其中的数据?

Spark 2.x 中,当某列保存的数据是一个结构体数组(array<struct>)时,没有特别简洁的取出方式。本文的做法是先把 DataFrame 转成 RDD,再从每一行中取出数组里的各个结构体字段(另一种做法是用 explode 函数把数组展开成多行,再按字段名选取)。

以下是用 Spark 的 ALS 协同过滤实现推荐引擎的一个例子(参考官方文档:https://spark.apache.org/docs/2.4.5/ml-collaborative-filtering.html),其中推荐结果的 recommendations 列就是一个结构体数组,可以通过如下方式取出。

/**
 * Trains an ALS collaborative-filtering model on movie ratings and prints the
 * top recommendations for a single user.
 *
 * The point of the example is the extraction step at the end: the
 * `recommendations` column holds an array of structs, which is read here by
 * going through the underlying RDD and pulling each struct out as a `Row`.
 *
 * Requires `SparkSession`, `Row`, `ALS` and `RegressionEvaluator` to be imported,
 * plus the project-local `FeatureExtraction` that supplies the ratings.
 */
object RecommendationProducts {

  /** Id of the user whose recommendations are printed. */
  private val TargetUserId = 789

  /** Number of items recommended per user. */
  private val NumRecommendations = 10

  /**
   * Entry point: fits the model, reports the RMSE on the held-out split and
   * prints one line per recommended movie for [[TargetUserId]].
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[2]").appName("UserRecommendation").getOrCreate()
    // Brings the $"column" interpolator into scope.
    import spark.implicits._

    try {
      val ratings = FeatureExtraction.getFeatures()
      val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2))

      // Build the recommendation model using ALS on the training data
      val als = new ALS()
        .setMaxIter(5)
        .setRegParam(0.01)
        .setUserCol("userId")
        .setItemCol("movieId")
        .setRatingCol("rating")

      val model = als.fit(training)

      // Evaluate the model by computing the RMSE on the test data.
      // Cold start strategy is set to 'drop' so the metric is not NaN.
      model.setColdStartStrategy("drop")
      val predictions = model.transform(test)

      val evaluator = new RegressionEvaluator()
        .setMetricName("rmse")
        .setLabelCol("rating")
        .setPredictionCol("prediction")
      val rmse = evaluator.evaluate(predictions)
      println(s"Root-mean-square error = $rmse")

      // Restrict the recommendation run to the single user of interest.
      val users = ratings
        .select(als.getUserCol)
        .filter(s"${als.getUserCol} = $TargetUserId")
        .distinct()
      val userSubsetRecs = model.recommendForUserSubset(users, NumRecommendations)
      userSubsetRecs.printSchema()

      // Each row carries one array of (movieId, rating) structs. Convert the
      // structs to plain tuples on the executors, then collect to the driver so
      // the output is printed by this process rather than on executor stdout.
      val recs = userSubsetRecs
        .select($"recommendations")
        .rdd
        .map { row =>
          row.getSeq[Row](0).map { rec =>
            (rec.getAs[Int]("movieId"), rec.getAs[Float]("rating"))
          }
        }
        .collect()

      for {
        userRecs <- recs
        (movieId, rating) <- userRecs
      } println(s"movieId: $movieId rating: $rating")

      println("done.")
    } finally {
      // Release the session even when training or evaluation fails.
      spark.stop()
    }
  }
}

 

 

你可能感兴趣的:(Spark)