The previous post was too simple, so this time I add some storage and tweak the code to build a more complete recommendation pipeline. It is still purely offline; the real-time module is not written yet. Stay tuned, I will cover it when I have time.
The code is as follows:
package mllib

import java.text.SimpleDateFormat
import java.util.{ArrayList, Date}

import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
import util.HbaseUtils

import scala.io.Source

/**
  * Created by 汪本成 on 2016/6/23.
  */
object movieALS {

  val numRecommender = 10

  def main(args: Array[String]) {
    // Suppress unnecessary log output on the console
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF)

    // Create the entry-point objects
    val conf = new SparkConf().setAppName("moviesALS").setMaster("local[5]")
    val sc = new SparkContext(conf)

    // Full ratings data set used for training, as tuples
    val ratingList_Tuple = sc.textFile("hdfs://master:9000/user/spark/test/ratings.dat").map { lines =>
      val fields = lines.split("::")
      // Take the timestamp column mod 10 so that the last element is a digit 0~9,
      // which is used later to split the data into training/validation/test sets
      (fields(0).toInt, fields(1).toInt, fields(2).toDouble, fields(3).toLong % 10)
    }

    // Full ratings data as simulated key/value pairs: the key is a digit 0-9, the value is a Rating
    val ratingsTrain_KV = ratingList_Tuple.map(x => (x._4, Rating(x._1, x._2, x._3)))

    // Print how many ratings and how many users we got from ratings.dat
    println("Parsed " + ratingsTrain_KV.count() + " ratings from " +
      ratingsTrain_KV.map(_._2.user).distinct().count() + " users")

    // My own ratings, as RDD[Rating]
    val myRatedData_Rating = sc.textFile("hdfs://master:9000/user/spark/test/test.dat").map { lines =>
      val fields = lines.split("::")
      Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble)
    }

    // Split the full data set using the key (timestamp mod 10):
    // keys 0-5 (~60%) for training, 6-7 (~20%) for validation, 8-9 (~20%) for test.
    // This is why the timestamp was reduced mod 10 above.
    val numPartitions = 3
    // Training set: only the Rating values are needed (the key/value form was just for splitting),
    // and my own ratings are merged in with union
    val traingData_Rating = ratingsTrain_KV.filter(_._1 < 6).values
      .union(myRatedData_Rating)
      .repartition(numPartitions)
      .persist()
    // Validation and test sets do not need my ratings, so no union here
    val validateData_Rating = ratingsTrain_KV.filter(x => x._1 >= 6 && x._1 < 8).values.repartition(numPartitions).persist()
    val testData_Rating = ratingsTrain_KV.filter(x => x._1 >= 8).values.repartition(numPartitions).persist()

    // Print the size of the training, validation and test sets
    println("\n")
    println("Training set size:   " + traingData_Rating.count())
    println("Validation set size: " + validateData_Rating.count())
    println("Test set size:       " + testData_Rating.count())

    // Train models over a grid of parameters and pick the best one by validation RMSE
    val ranks = List(8, 22)
    val lambdas = List(0.1, 10.0)
    val iters = List(3, 5)
    var bestModel: MatrixFactorizationModel = null
    var bestValidateRnse = Double.MaxValue
    var bestRank = 0
    var bestLambds = -1.0
    var bestIter = -1
    // The nested loop produces 8 (rank, lambda, numIter) combinations; each combination trains a model,
    // and the model with the smallest RMSE on the validation set wins
    for (rank <- ranks; lam <- lambdas; iter <- iters) {
      val model = ALS.train(traingData_Rating, rank, iter, lam)
      val validateRnse = rnse(model, validateData_Rating, validateData_Rating.count())
      println("Validation RMSE = " + validateRnse + " (rank = " + rank + ", lambda = " + lam + ", numIter = " + iter + ")")
      if (validateRnse < bestValidateRnse) {
        bestModel = model
        bestRank = rank
        bestIter = iter
        bestValidateRnse = validateRnse
        bestLambds = lam
      }
    }

    // // Save the recommendation results to HBase
    // predictMoive(sc, bestModel)

    // Apply the best model to the test set
    val testDataRnse = rnse(bestModel, testData_Rating, testData_Rating.count())
    println("The best model was trained with rank = " + bestRank + ", lambda = " + bestLambds +
      ", numIter = " + bestIter + ", and its RMSE on the test set is " + testDataRnse)

    // Compare against a naive baseline that always predicts the mean rating
    val meanRating = traingData_Rating.union(validateData_Rating).map(x => x.rating).mean()
    val baseLineRnse = math.sqrt(testData_Rating.map(x => (meanRating - x.rating) * (meanRating - x.rating)).mean())
    val improvent = (baseLineRnse - testDataRnse) / baseLineRnse * 100
    println("The best model improves the baseline by " + "%1.2f".format(improvent) + "%")

    // Full movie list, as tuples
    val movieList_Tuple = sc.textFile("hdfs://master:9000/user/spark/test/movies.dat").map { lines =>
      val fields = lines.split("::")
      (fields(0).toInt, fields(1), fields(2))
    }

    // Movie names as a Map: key is the movie id, value is the name
    val movies_Map = movieList_Tuple.map(x => (x._1, x._2)).collect().toMap
    // Movie genres as a Map: key is the movie id, value is the genre
    val moviesType_Map = movieList_Tuple.map(x => (x._1, x._3)).collect().toMap

    var i = 1
    println("Recommended movies for you:")
    // Ids of the movies I have already rated
    val myRatedMovieIds = myRatedData_Rating.map(_.product).collect().toSet
    // Filter those movies out of the candidate list; the remaining movies are fed to the model,
    // which predicts the rating I would give to each of them
    val recommondList = sc.parallelize(movies_Map.keys.filter(!myRatedMovieIds.contains(_)).toSeq)
    // Sort the predictions by rating in descending order and keep the top ten
    val result_movie = bestModel.predict(recommondList.map((0, _))).collect().sortBy(-_.rating).take(10)
    result_movie.foreach { r =>
      println("No.%2d".format(i) + " ---------->: \nTitle --> " + movies_Map(r.product) +
        " \nGenre --> " + moviesType_Map(r.product))
      i += 1
    }
    sc.parallelize(result_movie).saveAsTextFile("hdfs://master:9000/user/spark/movie")

    // Find users you might be interested in
    println("You might be interested in users with the following ids:")
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._
    // Convert the movie, user and rating data into DataFrames for Spark SQL
    val movies = movieList_Tuple.map(m => Movies(m._1.toInt, m._2, m._3)).toDF()
    val ratings = ratingList_Tuple.map(r => Ratings(r._1.toInt, r._2.toInt, r._3.toInt)).toDF()
    val users = sc.textFile("hdfs://master:9000/user/spark/test/users.dat").map { lines =>
      val fields = lines.split("::")
      Users(fields(0).toInt, fields(2).toInt, fields(3).toInt)
    }.toDF()

    val user = ratings.filter('rating >= 5)                 // keep only ratings of 5
      .join(movies, ratings("movieId") === movies("id"))    // join with the movies DataFrame
      .filter(movies("mType") === "Drama")                  // keep only 5-star ratings on Drama movies
      .join(users, ratings("userId") === users("id"))       // join with the users DataFrame
      .filter(users("age") === 18)                          // keep only users aged 18
      .filter(users("occupation") === 15)                   // keep only users with occupation 15
      .select(users("id"))                                  // keep only the user id
      .take(10)
    user.foreach(println)

    // val show = sc.parallelize(List((1, 10), (2, 20), (3, 30)))
    // val show = sc.parallelize(s)
    // show.foreachPartition(sparkToMysql.connectToMysql)

    // Process a batch every 60 seconds (real-time module, not used in this offline version)
    // val ssc = new StreamingContext(conf, Seconds(60))
    // Read data from kafka
    // val kafkaStream = KafkaUtils.createStream(ssc,
    //   "master:2181",                    // zookeeper used by the kafka cluster
    //   "group_spark_streaming",          // group.id used by this consumer
    //   Map[String, Int]("topic" -> 1),   // kafka topic and its number of partitions
    //   StorageLevel.MEMORY_AND_DISK_SER
    // ).map(x => x._2.split("::", -1))
    //
    // kafkaStream.foreachRDD((rdd: RDD[Array[String]], time: Time) => {
    //   val sqlContext = SQLContextSingleton.getInstance(rdd.sparkContext)
    //   import sqlContext.implicits._
    // })

    // Save and reload the model
    val iString = new SimpleDateFormat("yyyyMMddHHmmssSSS").format(new Date())
    bestModel.save(sc, "hdfs://master:9000/target/tmp/myCollaborativeFilter" + iString)
    val sameModel = MatrixFactorizationModel.load(sc, "hdfs://master:9000/target/tmp/myCollaborativeFilter" + iString)

    // Predict with the reloaded model
    // println("user 2, top 1")
    // var rs = sameModel.recommendProducts(2, 1)
    // rs.foreach(println)
    // println("user 2, top 2")
    // rs = sameModel.recommendProducts(2, 2)
    // rs.foreach(println)
    // println("user 2, top 3")
    // rs = sameModel.recommendProducts(2, 3)
    // rs.foreach(println)
    // println("user 2, top 4")
    // rs = sameModel.recommendProducts(2, 4)
    //
    // println(sameModel.predict(2, 1))
    // println(sameModel.predict(2, 2))
    // println(sameModel.predict(2, 3))
    // println(sameModel.predict(2, 4))

    // Save the data to MySQL
    val url = "jdbc:mysql://192.168.43.130:3306/hdfs?user=hive&password=mysql"
    // movies.createJDBCTable(url, "Movies", true)
    // ratings.createJDBCTable(url, "Ratings", true)
    // users.createJDBCTable(url, "Users", true)
    // movies.insertIntoJDBC(url, "Movies", true)
    // ratings.insertIntoJDBC(url, "Ratings", true)
    // users.insertIntoJDBC(url, "Users", true)
    println("Finished saving data to MySQL, please check")
  }

  /** Load the personal ratings file personalRatings.txt */
  def loadRatings(path: String): Seq[Rating] = {
    val lines = Source.fromFile(path).getLines()
    val ratings = lines.map { line =>
      val fields = line.split("::")
      Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble)
    }.filter(_.rating > 0.0)
    if (ratings.isEmpty) {
      sys.error("No ratings provided.")
    } else {
      ratings.toSeq
    }
  }

  /**
    * Compute the root-mean-square error (RMSE) of a model on a data set
    *
    * @param model          the trained model
    * @param predictionData the ratings to evaluate against
    * @param n              number of ratings
    * @return the RMSE
    */
  def rnse(model: MatrixFactorizationModel, predictionData: RDD[Rating], n: Long): Double = {
    // Use the model to predict ratings for the (user, product) pairs in the data set
    val prediction = model.predict(predictionData.map(x => (x.user, x.product)))
    // Join the predictions with the actual ratings and compute the RMSE
    val predictionAndOldRatings = prediction.map(x => ((x.user, x.product), x.rating))
      .join(predictionData.map(x => ((x.user, x.product), x.rating))).values
    math.sqrt(predictionAndOldRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).reduce(_ + _) / n)
  }

  /**
    * Case classes used for the Spark SQL implicit conversions
    */
  case class Ratings(userId: Int, movieId: Int, rating: Int)
  case class Movies(id: Int, name: String, mType: String)
  case class Users(id: Int, age: Int, occupation: Int)

  /**
    * Save recommendations for every user to HBase
    *
    * @param context
    * @param model
    */
  private def predictMoive(context: SparkContext, model: MatrixFactorizationModel) {
    val recommenders = new ArrayList[java.util.Map[String, String]]()

    // Read the users that need movie recommendations
    val userData = context.textFile("hdfs://master:9000/user/spark/test/users.dat")
    userData.map(_.split("::") match {
      case Array(id, gender, age, occupation, zip) => id
    }).collect().foreach(id => {
      // Recommend movies for this user
      val rs = model.recommendProducts(id.toInt, numRecommender)
      var value = ""
      var key = 0
      // Collect this user's recommendations into one value string
      rs.foreach(r => {
        key = r.user
        value = value + r.product + ":" + r.rating + ","
      })
      // If there are recommendations, wrap them in a put object to be inserted into HBase
      if (!value.equals("")) {
        val put = new java.util.HashMap[String, String]()
        put.put("rowKey", key.toString)
        put.put("t:info", value)
        recommenders.add(put)
      }
    })

    // Save into the HBase table [recommender].
    // recommenders is a java ArrayList; you can write your own HBase utility class in Java or Scala.
    // I do not give the concrete code here, it should be quick to write.
    // HbaseUtils.saveListMap("recommender", recommenders)
  }
}
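The HBase utility class called at the end of predictMoive is deliberately left out above. For readers who want something to start from, here is a minimal sketch of what such a helper could look like, assuming the standard HBase client API (org.apache.hadoop.hbase.client) and the "rowKey" plus "family:qualifier" map layout that predictMoive builds; the class and method names simply mirror the HbaseUtils.saveListMap call site and are otherwise placeholders, not the author's actual implementation.

package util

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.JavaConverters._

// Minimal sketch of the helper referenced as HbaseUtils.saveListMap("recommender", recommenders).
// Each map is expected to contain a "rowKey" entry plus "family:qualifier" -> value entries,
// e.g. "t:info" -> "1193:4.5,661:4.2,", matching how predictMoive fills it.
object HbaseUtils {
  def saveListMap(tableName: String, rows: java.util.List[java.util.Map[String, String]]): Unit = {
    val conf = HBaseConfiguration.create()                  // picks up hbase-site.xml from the classpath
    val connection = ConnectionFactory.createConnection(conf)
    val table = connection.getTable(TableName.valueOf(tableName))
    try {
      val puts = rows.asScala.map { row =>
        val put = new Put(Bytes.toBytes(row.get("rowKey")))
        row.asScala.filterKeys(_ != "rowKey").foreach { case (col, value) =>
          val Array(family, qualifier) = col.split(":", 2)  // e.g. "t:info"
          put.addColumn(Bytes.toBytes(family), Bytes.toBytes(qualifier), Bytes.toBytes(value))
        }
        put
      }
      table.put(puts.asJava)                                // batch insert all rows
    } finally {
      table.close()
      connection.close()
    }
  }
}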
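The commented-out createJDBCTable / insertIntoJDBC calls are the old Spark 1.3 DataFrame methods. If you run on Spark 1.4 or later, a sketch of the same MySQL save using the DataFrameWriter API could look like the snippet below; it reuses the url and table names from the code above and assumes the mysql-connector-java driver is on the classpath.

import java.util.Properties

// Sketch: persist the movies/ratings/users DataFrames to MySQL with DataFrameWriter (Spark 1.4+).
val props = new Properties()
props.put("driver", "com.mysql.jdbc.Driver")

movies.write.mode("overwrite").jdbc(url, "Movies", props)
ratings.write.mode("overwrite").jdbc(url, "Ratings", props)
users.write.mode("overwrite").jdbc(url, "Users", props)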