An Updated Movie Recommendation System (Based on MLlib, Spark SQL, MySQL, and HDFS)

My earlier version was too simple, so this time I add storage and rework parts of the code to build a reasonably complete recommendation pipeline. It is still purely offline; I have not written the real-time module yet, but stay tuned, I will get to it when I have time.

The code is as follows:

package mllib

import java.text.SimpleDateFormat
import java.util.Date

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkContext, SparkConf}
import util.HbaseUtils
import scala.io.Source
import java.util.ArrayList
import scopt.OptionParser


/**
  * Created by 汪本成 on 2016/6/23.
  */
object movieALS {

  val numRecommender = 10

  def main(args: Array[String]) {

    // Suppress unnecessary log output on the console
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF)

    // Create the Spark entry objects
    val conf = new SparkConf().setAppName("moviesALS").setMaster("local[5]")
    val sc = new SparkContext(conf)


    // Full ratings dataset used for training, as tuples
    val ratingList_Tuple = sc.textFile("hdfs://master:9000/user/spark/test/ratings.dat").map{
      lines =>
        val fields = lines.split("::")
        // Take the timestamp column modulo 10 to get a digit 0-9, which is later used to split the data
        (fields(0).toInt, fields(1).toInt, fields(2).toDouble, fields(3).toLong % 10)
    }

    // The full training dataset as pseudo key-value pairs: key is a digit 0-9, value is a Rating
    val ratingsTrain_KV = ratingList_Tuple.map(x => (x._4, Rating(x._1, x._2, x._3)))
    // Print how many ratings we have, and from how many users and movies (from ratings.dat)
    println("Got " + ratingsTrain_KV.count() + " ratings from "
      + ratingsTrain_KV.map(_._2.user).distinct().count() + " users on "
      + ratingsTrain_KV.map(_._2.product).distinct().count() + " movies")

    // My own ratings, as RDD[Rating]
    val myRatedData_Rating = sc.textFile("hdfs://master:9000/user/spark/test/test.dat").map{
      lines =>
        val fields = lines.split("::")
        Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble)
    }

    // Split the full dataset into roughly 60% training, 20% validation and 20% test.
    // Taking the timestamp modulo 10 above was done precisely so the data can be split on that digit.
    // Set the number of partitions
    val numPartitions = 3
    // Records whose key is less than 6 become the training data
    val traingData_Rating = ratingsTrain_KV.filter(_._1 < 6).values // the dataset is in pseudo key-value form; training only needs RDD[Rating], i.e. the values
      .union(myRatedData_Rating)   // union in my own ratings
      .repartition(numPartitions)
      .persist()

    // Validation and test data; my own ratings are not needed here, so no union
    val validateData_Rating = ratingsTrain_KV.filter(x => x._1 >= 6 && x._1 < 8).values.repartition(numPartitions).persist()
    val testData_Rating = ratingsTrain_KV.filter(x => x._1 >= 8).values.repartition(numPartitions).persist()

    // Print how many records are in the training, validation and test sets
    println("\n")
    println("Number of training records: " + traingData_Rating.count())
    println("Number of validation records: " + validateData_Rating.count())
    println("Number of test records: " + testData_Rating.count())

    // Train models over a small parameter grid and pick the best one by validation RMSE
    val ranks = List(8,22)
    val lambdas = List(0.1,10.0)
    val iters = List(3,5)
    var bestModel: MatrixFactorizationModel = null
    var bestValidateRnse = Double.MaxValue
    var bestRank = 0
    var bestLambds = -1.0
    var bestIter = -1

    // The triple loop produces 2 x 2 x 2 = 8 combinations of rank/lambda/iterations; each combination yields a model, and the one with the smallest validation RMSE is the best model
    for(rank <- ranks; lam <- lambdas; iter <- iters) {
      val model = ALS.train(traingData_Rating, rank, iter, lam)
      val validateRnse = rnse(model, validateData_Rating, validateData_Rating.count())

      println("RMSE on the validation set = " + validateRnse
        + " (trained with rank = " + rank
        + ", lambda = " + lam
        + ", numIter = " + iter
        + ")")

      if (validateRnse < bestValidateRnse) {
        bestModel = model
        bestRank = rank
        bestIter = iter
        bestValidateRnse = validateRnse
        bestLambds = lam
      }
    }

//    // Save the training result to HBase
//    predictMoive(sc, bestModel)

    // Apply the best model to the test set
    val testDataRnse = rnse(bestModel, testData_Rating, testData_Rating.count())
    println("The best model was trained with rank=" + bestRank + ", lambda=" + bestLambds
      + ", numIter=" + bestIter + ", and its RMSE on the test set is " + testDataRnse)

    // Compute how much the best model improves on the naive baseline (always predicting the mean rating)
    val meanRating = traingData_Rating.union(validateData_Rating).map(x => x.rating).mean()
    val baseLineRnse = math.sqrt(testData_Rating.map(x => (meanRating - x.rating) * (meanRating - x.rating)).mean())
    val improvement = (baseLineRnse - testDataRnse) / baseLineRnse * 100
    println("The best model improves on the baseline by " + "%1.2f".format(improvement) + "%")

    // Full movie list, as tuples (id, name, type)
    val movieList_Tuple = sc.textFile("hdfs://master:9000/user/spark/test/movies.dat").map{
      lines =>
        val fields = lines.split("::")
        (fields(0).toInt, fields(1), fields(2))
    }

    // Movie names as a Map: key is the movie id, value is the name
    val movies_Map = movieList_Tuple.map(x => (x._1, x._2)).collect().toMap

    // Movie types as a Map: key is the movie id, value is the type
    val moviesType_Map = movieList_Tuple.map(x => (x._1, x._3)).collect().toMap

    var i = 1
    println("Movies recommended for you:")
    // Ids of the movies I have already rated
    val myRatedMovieIds = myRatedData_Rating.map(_.product).collect().toSet
    // Filter those movies out of the full list; the remaining movies are fed to the model to predict the rating I would give each of them
    val recommondList = sc.parallelize(movies_Map.keys.filterNot(myRatedMovieIds.contains(_)).toSeq)
    // Sort the predictions by rating in descending order and keep the top ten
    // (user id 0 is assumed to be the id used for my own ratings in test.dat)
    val result_movie = bestModel.predict(recommondList.map((0, _))).collect().sortBy(-_.rating).take(10)

    result_movie.foreach{
      r =>
        println("%2d".format(i) + "---------->: \nMovie name --> " + movies_Map(r.product) + " \nMovie type --> " + moviesType_Map(r.product))
        i += 1
    }
    sc.parallelize(result_movie).saveAsTextFile("hdfs://master:9000/user/spark/movie")

    // Find users you might be interested in
    println("You might be interested in the users with the following ids:")
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._
    // Convert the movie, user and rating data into DataFrames for Spark SQL operations
    val movies = movieList_Tuple.map(m => Movies(m._1.toInt, m._2, m._3)).toDF()
    val ratings = ratingList_Tuple.map(r => Ratings(r._1.toInt, r._2.toInt, r._3.toInt)).toDF()
    val users = sc.textFile("hdfs://master:9000/user/spark/test/users.dat").map{
      lines =>
        val fields = lines.split("::")
        Users(fields(0).toInt, fields(2).toInt, fields(3).toInt)
    }.toDF()

    val user = ratings.filter('rating >= 5)   // keep only 5-star ratings
      .join(movies, ratings("movieId") === movies("id"))  // join with the movies DataFrame
      .filter(movies("mType") === "Drama") // keep only records whose movie type is Drama
      .join(users, ratings("userId") === users("id"))  // join with the users DataFrame
      .filter(users("age") === 18)   // keep only users aged 18
      .filter(users("occupation") === 15)  // keep only users with occupation code 15
      .select(users("id"))    // keep only the user id
      .take(10)
    user.foreach(println)
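
    // For readers who prefer SQL, the DataFrame query above can also be written against
    // temporary tables. This is only an illustrative sketch (the temp-table names below
    // are made up for this example); it assumes the Spark 1.x SQLContext API.
    movies.registerTempTable("movies_t")
    ratings.registerTempTable("ratings_t")
    users.registerTempTable("users_t")
    val userBySql = sqlContext.sql(
      """SELECT u.id
        |FROM ratings_t r
        |JOIN movies_t m ON r.movieId = m.id
        |JOIN users_t u ON r.userId = u.id
        |WHERE r.rating >= 5 AND m.mType = 'Drama' AND u.age = 18 AND u.occupation = 15
        |LIMIT 10""".stripMargin)
    userBySql.collect().foreach(println)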

//    val show = sc.parallelize(List((1, 10), (2, 20), (3, 30)))
//    val show = sc.parallelize(s)
//    show.foreachPartition(sparkToMysql.connectToMysql)


    // Process a batch every 60 seconds
//    val ssc = new StreamingContext(conf, Seconds(60))
    // Read data from Kafka
//    val kafkaStream = KafkaUtils.createStream(ssc,
//      "master:2181",    // zookeeper used by the Kafka cluster
//      "group_spark_streaming",  // group.id used by this consumer
//      Map[String, Int]("topic" -> 1),    // the Kafka topic holding the data and its number of partitions
//      StorageLevel.MEMORY_AND_DISK_SER
//    ).map(x => x._2.split("::", -1))
//
//    kafkaStream.foreachRDD((rdd: RDD[Array[String]], time: Time) =>{
//      val sqlContext = SQLContextSingleton.getInstance(rdd.sparkContext)
//      import sqlContext.implicits._
//
//
//    })


    // Save and reload the model
    val iString = new SimpleDateFormat("yyyyMMddHHmmssSSS").format(new Date())
    bestModel.save(sc, "hdfs://master:9000/target/tmp/myCollaborativeFilter" + iString)
    val sameModel = MatrixFactorizationModel.load(sc, "hdfs://master:9000/target/tmp/myCollaborativeFilter" + iString)


    // Prediction examples
//    println("user 2 ,top 1")
//    var rs = sameModel.recommendProducts(2, 1)
//
//    rs.foreach(println)
//    println("user 2 ,top 2")
//    rs = sameModel.recommendProducts(2, 2)
//
//    rs.foreach(println)
//    println("user 2 ,top 3")
//    rs = sameModel.recommendProducts(2, 3)
//
//    rs.foreach(println)
//    println("user 2 ,top 4")
//    rs = sameModel.recommendProducts(2, 4)
//
//    println(sameModel.predict(2, 1))
//    println(sameModel.predict(2, 2))
//    println(sameModel.predict(2, 3))
//    println(sameModel.predict(2, 4))


    // Save the data to MySQL
    val url = "jdbc:mysql://192.168.43.130:3306/hdfs?user=hive&password=mysql"

//    movies.createJDBCTable(url, "Movies", true)
//    ratings.createJDBCTable(url, "Ratings", true)
//    users.createJDBCTable(url, "Users", true)


//    movies.insertIntoJDBC(url, "Movies", true)
//    ratings.insertIntoJDBC(url, "Ratings", true)
//    users.insertIntoJDBC(url, "Users", true)
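
//    With Spark 1.4+ the DataFrameWriter API can be used instead of the
//    createJDBCTable/insertIntoJDBC calls above. A minimal sketch, assuming the url
//    defined above and that the MySQL JDBC driver is on the classpath (uncomment to run):
//    import java.util.Properties
//    val props = new Properties()
//    movies.write.jdbc(url, "Movies", props)
//    ratings.write.jdbc(url, "Ratings", props)
//    users.write.jdbc(url, "Users", props)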
    println("数据保存到mysql完成,请查看")

  }


  /** Load the personal ratings file personalRatings.txt **/
  def loadRatings(path:String):Seq[Rating] = {
    val lines = Source.fromFile(path).getLines()
    val ratings = lines.map {
      line =>
        val fields = line.split("::")
        Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble)
    }.filter(_.rating > 0.0)
    if (ratings.isEmpty) {
      sys.error("No ratings provided.")
    } else {
      ratings.toSeq
    }
  }


  /**
    * function feature: compute the root-mean-square error (RMSE) of a model on a dataset,
    * i.e. sqrt( sum((prediction - actualRating)^2) / n )
    * @param model          the trained matrix factorization model
    * @param predictionData the ratings to evaluate against
    * @param n              the number of ratings in predictionData
    * @return the RMSE
    */
  def rnse(model: MatrixFactorizationModel, predictionData: RDD[Rating], n: Long): Double = {
    // Use the model to predict ratings for the evaluation dataset
    val prediction = model.predict(predictionData.map(x => (x.user, x.product)))
    // Join the predictions with the actual ratings, then compute and return the RMSE
    val predictionAndOldRatings = prediction.map(x => ((x.user, x.product), x.rating))
      .join(predictionData.map(x => ((x.user, x.product), x.rating))).values
    math.sqrt(predictionAndOldRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).reduce(_ + _) / n)
  }

  /**
    * Case classes used for the Spark SQL implicit conversion to DataFrames
    */
  case class Ratings(userId: Int, movieId: Int, rating: Int)
  case class Movies(id: Int, name: String, mType: String)
  case class Users(id: Int, age: Int, occupation: Int)

  /**
    * Generate recommendations for every user and stage them for saving to HBase
    * @param context the SparkContext
    * @param model   the trained matrix factorization model
    */
  private def predictMoive(context: SparkContext, model: MatrixFactorizationModel) {

    val recommenders = new ArrayList[java.util.Map[String, String]]()

    // Read the users for whom recommendations will be generated
    val userData = context.textFile("hdfs://master:9000/user/spark/test/users.dat")

    userData.map(_.split("::") match {
      case Array(id, _, _, _, _) => id
    }).collect().foreach(id => {
      // Recommend movies for this user
      var rs = model.recommendProducts(id.toInt, numRecommender)
      var value = ""
      var key = 0

      // Collect this user's recommendations into a single string for HBase
      rs.foreach(r => {
        key = r.user
        value = value + r.product + ":" + r.rating + ","
      })

      // If any recommendations were produced, wrap them in a put map and queue it for insertion into HBase
      if (!value.equals("")) {
        var put = new java.util.HashMap[String, String]()
        put.put("rowKey", key.toString)
        put.put("t:info", value)
        recommenders.add(put)
      }
    })

    // Save to the HBase table "recommender"
    // recommenders is a java ArrayList of maps; you can write your own Java or Scala HBase utility class to persist it (an illustrative sketch follows after this method)
//    HbaseUtils.saveListMap("recommender", recommenders)
  }
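
  /**
    * An illustrative sketch of the HBase helper referenced above -- the real
    * HbaseUtils.saveListMap used in the commented-out call is not included in this post.
    * The sketch assumes the HBase 1.x client API and an existing "recommender" table with
    * a "t" column family; the method name and details here are only an example.
    */
  private def saveRecommendersToHBase(tableName: String, recommenders: ArrayList[java.util.Map[String, String]]): Unit = {
    import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
    import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
    import org.apache.hadoop.hbase.util.Bytes

    val connection = ConnectionFactory.createConnection(HBaseConfiguration.create())
    val table = connection.getTable(TableName.valueOf(tableName))
    try {
      for (i <- 0 until recommenders.size()) {
        val row = recommenders.get(i)
        // The row key is the user id; the value "movieId:rating,..." goes into column t:info
        val put = new Put(Bytes.toBytes(row.get("rowKey")))
        put.addColumn(Bytes.toBytes("t"), Bytes.toBytes("info"), Bytes.toBytes(row.get("t:info")))
        table.put(put)
      }
    } finally {
      table.close()
      connection.close()
    }
  }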


}
