Collaborative Filtering Recommendation with a Latent Factor Model (LFM) via ALS

In Python you can implement an LFM directly; in Scala you can lean on Spark MLlib's ALS model, which is essentially Spark's realization of the LFM idea. I tried the Python route first — Python really is an excellent machine learning language that needs little code — but Spark is where the ecosystem is heading, and developing in Scala and persisting the results to MongoDB is a natural fit. The Scala code is below; I'll keep optimizing it as I find better-suited functions.
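For context, the latent factor model factorizes the sparse user-movie rating matrix into low-rank user and item factor matrices, and ALS fits them by alternately solving regularized least-squares problems with one side held fixed. In standard notation (my summary, not spelled out in the original post):

\min_{U,V} \sum_{(u,i) \in K} \left( r_{ui} - u_u^{\top} v_i \right)^2 + \lambda \left( \lVert u_u \rVert^2 + \lVert v_i \rVert^2 \right)

where K is the set of observed ratings; the dimension of the latent vectors u_u and v_i corresponds to the rank argument of ALS.train below, and lambda is the regularization weight.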
import org.apache.spark.SparkConf
import org.apache.spark.mllib.recommendation.{ALS, Rating}
import org.apache.spark.sql.SparkSession
import org.jblas.DoubleMatrix


/**
  * Movie dataset; fields are separated by ^
  *
  * 151^                          movie ID
  * Rob Roy (1995)^               movie title
  * In the highlands ....^        movie description
  * 139 minutes^                  movie runtime
  * August 26, 1997^              movie release date
  * 1995^                         movie shooting year
  * English ^                     movie language
  * Action|Drama|Romance|War ^    movie genres
  * Liam Neeson|Jessica Lange...  movie cast
  * Michael Caton-Jones           movie director
  *
  * tag1|tag2|tag3|....           movie tags
  **/

case class Movie(mid: Int, name: String, descri: String, timelong: String, issue: String,
                 shoot: String, language: String, genres: String, actors: String, directors: String)
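
// For reference, a raw Movie line split on the ^ delimiter could be parsed
// roughly like this (an illustrative sketch only; this program loads
// already-structured documents from MongoDB instead):
//   val f = line.split("\\^")
//   Movie(f(0).trim.toInt, f(1).trim, f(2).trim, f(3).trim, f(4).trim,
//         f(5).trim, f(6).trim, f(7).trim, f(8).trim, f(9).trim)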

/**
  * Rating dataset: a user's rating of a movie, comma-separated
  *
  * 1,           user ID
  * 31,          movie ID
  * 2.5,         the user's rating of the movie
  * 1260759144   timestamp of the rating
  */
case class MovieRating(uid: Int, mid: Int, score: Double, timestamp: Int)

/**
  * MongoDB connection configuration
  *
  * @param uri MongoDB connection URI
  * @param db  MongoDB database to operate on
  */
case class MongoConfig(uri: String, db: String)

// A single recommendation: item ID and predicted score
case class Recommendation(rid: Int, r: Double)

// Recommendations for a user
case class UserRecs(uid: Int, recs: Seq[Recommendation])

// Movies most similar to a given movie
case class MovieRecs(mid: Int, recs: Seq[Recommendation])


object OfflineRecommender {

  val MONGODB_RATING_COLLECTION = "Rating"
  val MONGODB_MOVIE_COLLECTION = "Movie"

  val USER_MAX_RECOMMENDATION = 10
  val USER_RECS = "UserRecs"
  val MOVIE_RECS = "MovieRecs"


  // Entry point
  def main(args: Array[String]): Unit = {

    val config = Map(
      "spark.cores" -> "local[*]",
      "mongo.uri" -> "mongodb://172.16.104.13:27017/recommender",
      "mongo.db" -> "reommender"
    )

    // Create the Spark configuration
    val sparkConf = new SparkConf()
      .setAppName("OfflineRecommender")
      .setMaster(config("spark.cores"))
      .set("spark.executor.memory", "2G")
      .set("spark.driver.memory", "2G")

    // Create the SparkSession
    val spark = SparkSession.builder().config(sparkConf).getOrCreate()

    // Read the data from MongoDB
    val mongoConfig = MongoConfig(config("mongo.uri"), config("mongo.db"))
    import spark.implicits._

    // Rating data
    val ratingRDD = spark
      .read
      .option("uri", mongoConfig.uri)
      .option("collection", MONGODB_RATING_COLLECTION)
      .format("com.mongodb.spark.sql")
      .load()
      .as[MovieRating]
      .rdd
      .map(rating => (rating.uid, rating.mid, rating.score)).cache()

    // User IDs (distinct)
    val userRDD = ratingRDD.map(_._1).distinct()

    // Movie IDs
    val movieRDD = spark
      .read
      .option("uri", mongoConfig.uri)
      .option("collection", MONGODB_MOVIE_COLLECTION)
      .format("com.mongodb.spark.sql")
      .load()
      .as[Movie]
      .rdd
      .map(movie => movie.mid).cache()

    // Build the training set
    val trainData = ratingRDD.map(x => Rating(x._1, x._2, x._3))

    // rank = number of latent factors, iterations = ALS sweeps, lambda = regularization weight
    val (rank, iterations, lambda) = (50, 5, 0.01)

    // Train the ALS model
    val model = ALS.train(trainData, rank, iterations, lambda)
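    // ALS.train returns a MatrixFactorizationModel; its userFeatures and
    // productFeatures RDDs hold the learned latent vectors, and productFeatures
    // is reused below to compute movie-to-movie similarities.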

    // Compute the user recommendation matrix:
    // score every (user, movie) pair; note this cartesian product has |users| x |movies| entries
    val userMovies = userRDD.cartesian(movieRDD)
    val preRatings = model.predict(userMovies)
    val userRecs = preRatings
      .filter(_.rating > 0)
      .map(rating => (rating.user, (rating.product, rating.rating)))
      .groupByKey()
      .map {
        case (uid, recs) =>
          UserRecs(uid, recs.toList.sortWith(_._2 > _._2)
            .take(USER_MAX_RECOMMENDATION)
            .map(x => Recommendation(x._1, x._2)))
      }.toDF()

    userRecs
      .write
      .option("uri", mongoConfig.uri)
      .option("collection", USER_RECS)
      .mode("overwrite")
      .format("com.mongodb.spark.sql")
      .save()

    // Compute the movie similarity matrix:
    // extract each movie's latent feature vector from the trained model
    val movieFeatures = model.productFeatures.map {
      case (mid, features) => (mid, new DoubleMatrix(features))
    }
    val movieRecs = movieFeatures.cartesian(movieFeatures)
      .filter {
        // exclude a movie paired with itself
        case (a, b) => a._1 != b._1
      }
      .map {
        case (a, b) =>
          val simScore = this.cosineSim(a._2, b._2)
          (a._1, (b._1, simScore))
      }
      .filter(_._2._2 > 0.6) // keep only sufficiently similar pairs
      .groupByKey()
      .map {
        case (mid, items) => MovieRecs(mid, items.toList.map(x => Recommendation(x._1, x._2)))
      }.toDF()

    movieRecs
      .write
      .option("uri", mongoConfig.uri)
      .option("collection", MOVIE_RECS)
      .mode("overwrite")
      .format("com.mongodb.spark.sql")
      .save()



    // Stop Spark
    spark.close()
  }

  // Cosine similarity between two feature vectors: dot product divided by the
  // product of the L2 norms (L1 norm: sum of absolute values of the elements;
  // L2 norm: the vector's length, i.e. the square root of the sum of squares)
  def cosineSim(movie1: DoubleMatrix, movie2: DoubleMatrix): Double = {
    movie1.dot(movie2) / (movie1.norm2() * movie2.norm2())
  }


}
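
The hyperparameters (50, 5, 0.01) above are hardcoded, and the post does not show how they were chosen. As a minimal sketch of one common approach (the 0.8/0.2 split, the candidate grids, and the computeRmse helper below are my own illustration, not part of the original program), hold out part of the ratings and pick the (rank, lambda) pair with the lowest RMSE:

import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD

object ALSTuning {

  // Root-mean-square error of the model's predictions on held-out ratings
  def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating]): Double = {
    val predictions = model
      .predict(data.map(r => (r.user, r.product)))
      .map(p => ((p.user, p.product), p.rating))
    val ratesAndPreds = data
      .map(r => ((r.user, r.product), r.rating))
      .join(predictions)
    math.sqrt(ratesAndPreds.map { case (_, (actual, predicted)) =>
      val err = actual - predicted
      err * err
    }.mean())
  }

  // Grid-search rank and lambda on a random 80/20 train/test split
  def selectParams(ratings: RDD[Rating]): (Int, Double) = {
    val Array(train, test) = ratings.randomSplit(Array(0.8, 0.2))
    train.cache(); test.cache()
    val candidates = for (rank <- Seq(20, 50, 100); lambda <- Seq(0.001, 0.01, 0.1))
      yield (rank, lambda, computeRmse(ALS.train(train, rank, 5, lambda), test))
    candidates.minBy(_._3) match { case (rank, lambda, _) => (rank, lambda) }
  }
}

Whichever pair wins can then be fed back into ALS.train on the full ratingRDD in main above.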
