import java.sql.Date
import java.text.SimpleDateFormat

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
/**
 * Offline statistics jobs for the movie recommender: reads ratings and
 * movies from MongoDB, computes several popularity/score statistics with
 * Spark SQL, and writes each result back to its own MongoDB collection.
 *
 * NOTE(review): the object name keeps the original misspelling
 * ("Statoistics") because spark-submit configs / external callers may
 * reference it by this exact name.
 */
object StatoisticsRecommender {

  val MONGO_URI: String = "mongodb://hadoop100:27017/recom3"
  val MONGODB_DATABASE: String = "recom3"
  val MONGODB_RATING_COLLECTION = "Rating"
  val MONGODB_MOVIE_COLLECTION = "Movie"

  // Spark SQL data-source name of the MongoDB connector.
  // NOTE(review): "DRVIVE" typo kept — renaming a public member could break callers.
  val MONGO_DRVIVE_CLASS: String = "com.mongodb.spark.sql"

  // Output collections.
  // Most-rated movies overall.
  val MONGODB_RATE_MORE_MOVIES_COLLECTION = "RateMoreMovies"
  // Most-rated movies per month.
  val MONGODB_RATE_MORE_MOVIES_RECENTLY_COLLECTION = "RateMoreMoviesRecently"
  // Average score per movie.
  val MONGODB_AVERAGE_MOVIES_SCORE_COLLECTION = "AverageMoviesScore"
  // Top-10 movies per genre.
  val MONGODB_GENRES_TOP_MOVIES_COLLECTION = "GenresTopMovies"

  /**
   * Entry point: builds the SparkSession, loads the ratings and movies
   * collections from MongoDB, registers the ratings as a temp view, and
   * runs the statistics jobs. Releases Spark resources before exiting.
   */
  def main(args: Array[String]): Unit = {
    // Job parameters bundled in a map.
    val conf = Map(
      "spark.cores" -> "local[2]",
      "mongo.uri" -> MONGO_URI,
      "mongo.db" -> MONGODB_DATABASE)

    val sparkConf = new SparkConf()
      .setAppName("statisticsRecommender")
      .setMaster(conf("spark.cores"))

    val spark = SparkSession.builder().config(sparkConf).getOrCreate()

    // Implicitly threaded through every job so each can write to MongoDB.
    implicit val mongoConf = new MongoConfig(conf("mongo.uri"), conf("mongo.db"))

    // Encoders for .as[MoviesRating] / .as[Movie].
    import spark.implicits._

    // Ratings are used by every job below; cache to avoid re-reading MongoDB.
    val ratings = spark.read
      .option("uri", mongoConf.uri)
      .option("collection", MONGODB_RATING_COLLECTION)
      .format(MONGO_DRVIVE_CLASS)
      .load()
      .as[MoviesRating]
      .cache()

    val movies = spark.read
      .option("uri", mongoConf.uri)
      .option("collection", MONGODB_MOVIE_COLLECTION)
      .format(MONGO_DRVIVE_CLASS)
      .load()
      .as[Movie]
      .cache()

    // SQL view consumed by the jobs below.
    ratings.createOrReplaceTempView("ratings")

    // 1. Most-rated movies overall          => RateMoreMovies
    rateMore(spark)
    // 2. Most-rated movies per month        => RateMoreMoviesRecently
    rateMoreRecently(spark)
    // 3. Average score per movie            => AverageMoviesScore
    // 4. Per-genre top-10 (computed inside averageMovieScore) => GenresTopMovies
    averageMovieScore(spark, movies)

    // BUGFIX: the original commented "close resources" but never did —
    // release the cached datasets and stop the session.
    ratings.unpersist()
    movies.unpersist()
    spark.stop()
  }

  /**
   * Writes a DataFrame to the given MongoDB collection, replacing any
   * previous contents (mode "overwrite"). Shared by all jobs.
   *
   * @param df         result to persist
   * @param collection target collection name
   * @param mongoConf  MongoDB connection settings
   */
  private def storeDFInMongo(df: DataFrame, collection: String)(implicit mongoConf: MongoConfig): Unit =
    df.write
      .option("uri", mongoConf.uri)
      .option("collection", collection)
      .mode("overwrite")
      .format(MONGO_DRVIVE_CLASS)
      .save()

  /**
   * "Quality" movies: rating count per movie, most-rated first.
   * Reads the "ratings" view and writes the result to RateMoreMovies.
   *
   * @param spark     active session (the "ratings" view must already exist)
   * @param mongoConf target MongoDB connection
   */
  def rateMore(spark: SparkSession)(implicit mongoConf: MongoConfig): Unit = {
    val rateMoreDF = spark.sql("select mid, count(mid) as count from ratings group by mid order by count desc")
    storeDFInMongo(rateMoreDF, MONGODB_RATE_MORE_MOVIES_COLLECTION)
  }

  /**
   * "Hot" movies: rating count per movie per month, newest month first.
   * Reads the "ratings" view and writes the result to RateMoreMoviesRecently.
   *
   * @param spark     active session (the "ratings" view must already exist)
   * @param mongoConf target MongoDB connection
   */
  def rateMoreRecently(spark: SparkSession)(implicit mongoConf: MongoConfig): Unit = {
    val simpleDateFormat = new SimpleDateFormat("yyyyMM")
    // UDF turning an epoch-seconds timestamp into a yyyyMM Long
    // (* 1000L converts seconds to the milliseconds java.sql.Date expects).
    // NOTE(review): UDF name "changDate" and column "yeahmonth" keep the
    // original misspellings — the persisted schema may be consumed downstream.
    spark.udf.register("changDate", (x: Long) => simpleDateFormat.format(new Date(x * 1000L)).toLong)

    // Project each rating to (mid, uid, score, yeahmonth) and expose it as a view.
    val yearMonthOfRatings = spark.sql("select mid, uid, score, changDate(timestamp) as yeahmonth from ratings")
    yearMonthOfRatings.createOrReplaceTempView("ymRatings")

    // Count ratings per (month, movie), newest month / highest count first.
    val rateMoreRecentlyDF = spark.sql("select mid, count(mid) as count,yeahmonth from ymRatings group by yeahmonth,mid order by yeahmonth desc,count desc")
    storeDFInMongo(rateMoreRecentlyDF, MONGODB_RATE_MORE_MOVIES_RECENTLY_COLLECTION)
  }

  /**
   * Computes the average rating per movie (written to AverageMoviesScore)
   * and, from it, the ten highest-rated movies of each genre (written to
   * GenresTopMovies).
   *
   * @param spark     active session (the "ratings" view must already exist)
   * @param movies    movie dataset, joined in to attach each movie's genres
   * @param mongoConf target MongoDB connection
   */
  def averageMovieScore(spark: SparkSession, movies: Dataset[Movie])(implicit mongoConf: MongoConfig): Unit = {
    // Average score per movie; cached because the per-genre ranking reuses it.
    val averageMovieScoreDF = spark.sql("select mid, avg(score) as avg from ratings group by mid").cache()
    storeDFInMongo(averageMovieScoreDF, MONGODB_AVERAGE_MOVIES_SCORE_COLLECTION)

    import spark.implicits._

    // All genres present in the catalogue.
    // BUGFIX: "Ccrime" corrected to "Crime" — the typo meant no movie ever
    // matched that genre bucket.
    val genres = List("Action", "Adventure", "Animation", "Comedy", "Crime",
      "Documentary", "Drama", "Family", "Fantasy", "Foreign", "History",
      "Horror", "Music", "Mystery", "Romance", "Science", "Tv", "Thriller",
      "War", "Western")
    val genresRdd = spark.sparkContext.makeRDD(genres)

    // Attach each movie's genres string to its average score.
    // BUGFIX: the original joined on Seq("mid", "mid") — the key was duplicated.
    val moviesWithScoreDF = movies
      .join(averageMovieScoreDF, Seq("mid"))
      .select("mid", "avg", "genres")
      .cache()

    // genre x movie cartesian product; keep pairs whose movie belongs to the
    // genre, then rank each genre's movies by average score and keep the top 10.
    val genresTopMovies = genresRdd
      .cartesian(moviesWithScoreDF.rdd)
      .filter {
        // Case-insensitive substring match against the movie's genres string.
        // BUGFIX: the original read "xmatch" (missing space) and did not compile.
        case (genre, row) =>
          row.getAs[String]("genres").toLowerCase().contains(genre.toLowerCase)
      }
      .map {
        // Reshape to (genre, (mid, avgScore)) for grouping.
        case (genre, row) =>
          (genre, (row.getAs[Int]("mid"), row.getAs[Double]("avg")))
      }
      .groupByKey()
      .map {
        // Per genre: sort by score descending and keep the ten best.
        case (genre, items) =>
          GenresRecommendation(
            genre,
            items.toList.sortWith(_._2 > _._2).take(10).map(item => Recommendation(item._1, item._2)))
      }
      .toDF

    storeDFInMongo(genresTopMovies, MONGODB_GENRES_TOP_MOVIES_COLLECTION)

    // Release the caches created by this job.
    moviesWithScoreDF.unpersist()
    averageMovieScoreDF.unpersist()
  }

  /**
   * A single recommended item.
   *
   * @param rid item (movie) ID
   * @param r   recommendation score
   */
  case class Recommendation(rid: Int, r: Double)

  /**
   * Top recommendations for one movie genre.
   *
   * @param genres genre name
   * @param recs   ranked recommendations for that genre
   */
  case class GenresRecommendation(genres: String, recs: Seq[Recommendation])

  /**
   * MongoDB connection settings.
   *
   * @param uri MongoDB connection URI
   * @param db  database to operate on
   */
  case class MongoConfig(uri: String, db: String)

  /**
   * One movie rating.
   *
   * @param uid       rating user's ID
   * @param mid       rated movie's ID
   * @param score     score the user gave the movie
   * @param timestamp rating time (epoch seconds)
   */
  case class MoviesRating(uid: Int, mid: Int, score: Double, timestamp: Int)

  /**
   * One movie.
   *
   * @param mid       movie ID
   * @param name      title
   * @param descri    description
   * @param timelong  running time
   * @param issue     release date
   * @param shoot     shooting date
   * @param language  language
   * @param genres    genres (delimited string)
   * @param actors    actors
   * @param directors directors
   */
  case class Movie(mid: Int, name: String, descri: String, timelong: String, issue: String, shoot: String, language: String, genres: String, actors: String, directors: String)
}