Building a Distributed Collaborative Filtering Recommendation System

Personalized recommendation means suggesting information and products a user is likely to be interested in, based on that user's interests and purchasing behavior. There are three basic flavors of collaborative filtering: user-based CF, item-based CF, and model-based CF; the matrix-factorization approach used in this section belongs to the model-based family.
Traditional SVD can only factorize dense matrices, i.e. the matrix being decomposed must not contain missing entries. To learn a factorization of the sparse rating matrix instead, one can use regularized matrix factorization (the approach studied in this section) or matrix factorization with bias terms.
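Regularized matrix factorization learns a low-dimensional factor vector p_u for every user and q_i for every item, fits only the observed ratings, and adds an L2 penalty so the factors stay small; this is the objective that ALS (alternating least squares) minimizes. A standard formulation, consistent with the rank and regularization settings used in the code below, is:

\min_{P,Q} \sum_{(u,i) \in \kappa} \left( r_{ui} - p_u^{\top} q_i \right)^2 + \lambda \left( \lVert p_u \rVert^2 + \lVert q_i \rVert^2 \right)

Here \kappa is the set of observed (user, item) pairs, the factor vectors have length rank (10 below), and \lambda is the regularization coefficient (0.01 below). ALS alternates between holding the item factors fixed while solving a least-squares problem for the user factors and then the reverse, which is what gives the algorithm its name.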

package ccut.spark.ml
/*
 * Collaborative filtering with Spark MLlib (ALS matrix factorization)
 */
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

object CollaborativeFiltering {
    def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setAppName("ALS").setMaster("local[2]")
        val sc = new SparkContext(conf)

        // Read the rating data and parse it into an RDD[Rating]
        val data = sc.textFile("C://Users//Administrator//Desktop//test.data")
        val ratings = data.map(_.split(',') match { case Array(user, item, rate) =>
            Rating(user.toInt, item.toInt, rate.toDouble)  // (user, item, rating) triple; what if the user ID were a String? (see the sketch after the run log)
        })

        // Alternating least squares (ALS): a matrix-factorization CF algorithm
        // solved by alternately optimizing the user and item factors
        val rank = 10           // number of latent factors (rank of the factorization)
        val numIterations = 10  // number of ALS iterations
        val lambda = 0.01       // regularization coefficient of the matrix factorization
        val model = ALS.train(ratings, rank, numIterations, lambda)

        // Use the trained model to predict some ratings (e.g. user 1's rating of item 1)
        println("Rating of user 1 towards item 1 is: " + model.predict(1, 1))
        // Recommend 2 products for user 1 (the 2 products user 1 is predicted to like most)
        model.recommendProducts(1, 2).foreach { rating =>
            println("Product " + rating.product + " rating=" + rating.rating)
        }
        // Recommend 2 users who are likely to be interested in product 1
        model.recommendUsers(1, 2).foreach { rating =>
            println("User " + rating.user + " rating=" + rating.rating)
        }
        // Compare predictions with the original ratings
        val usersProducts = ratings.map { case Rating(user, product, rate) =>
            println(user, product, rate)  // prints tuples such as (1,1,5.0)
            (user, product)
        }

        val predictions =
            model.predict(usersProducts).map { case Rating(user, product, rate) =>
                ((user, product), rate)
            }

        val ratesAndPreds = ratings.map { case Rating(user, product, rate) =>
            ((user, product), rate)
        }.join(predictions)  // join the original ratings with the predicted ratings
        // Print each original rating next to its prediction
        ratesAndPreds.map { x =>
            val user = x._1._1
            val product = x._1._2
            val rate = x._2
            println(s"user,product,rate is :$user,$product,$rate")
        }.count

        // Compute the mean squared error (MSE)
        val MSE = ratesAndPreds.map { case ((user, product), (r1, r2)) =>
            val err = r1 - r2  // or math.pow(r1 - r2, 2)
            err * err
        }.mean()  // equivalent to reduce(_ + _) / ratesAndPreds.count
        println("Mean Squared Error = " + MSE)

//      // Save and load model
//      model.save(sc, "target/tmp/myCollaborativeFilter")
//      val sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter")
    }
}
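For reference, test.data is read as a plain comma-separated file of user,item,rating triples. Judging from the (user, item, rating) tuples printed in the run log below, it presumably holds 16 such lines (the order within the file is not shown), for example:

1,1,5.0
1,2,1.0
1,3,5.0
1,4,1.0

and so on for users 2, 3 and 4, exactly as listed in the run log.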
/*
Local run output:
Rating of user 1 towards item 1 is: 4.99639776859312
Product 1 rating=4.99639776859312
Product 3 rating=4.99639776859312
User 1 rating=4.99639776859312
User 2 rating=4.99639776859312
(1,1,5.0)
(3,2,5.0)
(3,3,1.0)
(1,2,1.0)
(3,4,5.0)
(1,3,5.0)
(4,1,1.0)
(1,4,1.0)
(4,2,5.0)
(4,3,1.0)
(2,1,5.0)
(4,4,5.0)
(2,2,1.0)
(2,3,5.0)
(2,4,1.0)
(3,1,1.0)

(1,1,5.0)
(1,2,1.0)
(1,3,5.0)
(1,4,1.0)
(2,1,5.0)
(2,2,1.0)
(2,3,5.0)
(2,4,1.0)
(3,1,1.0)
(3,2,5.0)
(3,3,1.0)
(3,4,5.0)
(4,1,1.0)
(4,2,5.0)
(4,3,1.0)
(4,4,5.0)
user,product,rate is :1,4,(1.0,0.9997252436773894)
user,product,rate is :3,1,(1.0,0.9996066404676376)
user,product,rate is :2,3,(5.0,4.99639776859312)
user,product,rate is :1,2,(1.0,0.9997252436773894)
user,product,rate is :2,1,(5.0,4.99639776859312)
user,product,rate is :4,4,(5.0,4.996990768798497)
user,product,rate is :1,1,(5.0,4.99639776859312)
user,product,rate is :4,2,(5.0,4.996990768798497)
user,product,rate is :2,2,(1.0,0.9997252436773894)
user,product,rate is :4,1,(1.0,0.9996066404676376)
user,product,rate is :2,4,(1.0,0.9997252436773894)
user,product,rate is :3,2,(5.0,4.996990768798497)
user,product,rate is :3,4,(5.0,4.996990768798497)
user,product,rate is :3,3,(1.0,0.9996066404676376)
user,product,rate is :4,3,(1.0,0.9996066404676376)
user,product,rate is :1,3,(5.0,4.99639776859312)

Mean Squared Error = 5.565441572831868E-6
 * 
 */
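The comment in the parsing step asks what to do when the user ID is a String rather than an Int. MLlib's Rating requires integer user and product IDs, so string IDs have to be mapped to integers first. Below is a minimal sketch of one way to do that, reusing the SparkContext sc and the Rating import from the listing above; the file name rawRatings.csv and the "u001"-style IDs are hypothetical, not part of the example above.

// Map String user IDs to Int indices before building Rating objects.
// Assumes each line of the hypothetical rawRatings.csv is "userId,itemId,rating",
// where userId is a string such as "u001".
val raw = sc.textFile("rawRatings.csv").map(_.split(','))

// Assign a stable Int index to every distinct string user ID.
val userIdToInt: Map[String, Int] =
  raw.map(_(0)).distinct().zipWithIndex().mapValues(_.toInt).collectAsMap().toMap

// Broadcast the lookup table and build the Rating RDD with Int IDs.
val userIdToIntBc = sc.broadcast(userIdToInt)
val stringIdRatings = raw.map { case Array(user, item, rate) =>
  Rating(userIdToIntBc.value(user), item.toInt, rate.toDouble)
}

Keep the reverse mapping (Int back to String) around as well if you need to report recommendations with the original user IDs.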
