《Spark商业案例与性能调优实战100课》 Lesson 6: A Business Case of Big Data Movie User Behavior Analysis with Spark SQL



package com.dt.spark.sparksql


import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.SparkConf

object Movie_Users_Analyzer_DataFrame {


  def main(args: Array[String]): Unit = {
    var masterUrl = "local[4]"
    var dataPath = "data/movielens/medium/"
    // Allow the master URL and the data path to be overridden from the command line.
    if (args.length > 0) {
      masterUrl = args(0)
    }
    if (args.length > 1) {
      dataPath = args(1)
    }
    val sparkConf = new SparkConf().setMaster(masterUrl).setAppName("Movie_Users_Analyzer_DataFrame")

    val spark = SparkSession
      .builder()
      .config(sparkConf)
      .getOrCreate()
    import spark.implicits._
    // For implicit conversions like converting RDDs to DataFrames

    val sc = spark.sparkContext

    // val sc = new SparkContext(new SparkConf().setMaster(masterUrl).setAppName("Movie_Users_Analyzer"))
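    // Load the raw data files ("::"-delimited text) as RDDs of lines.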
    val usersRDD = sc.textFile(dataPath + "users.dat")
    val moviesRDD = sc.textFile(dataPath + "movies.dat")
    val occupationsRDD = sc.textFile(dataPath + "occupation.dat")
    val ratingsRDD = sc.textFile(dataPath + "ratings.dat")
    // Count the number of movie viewers broken down by gender and age.

    // users.dat  UserID::Gender::Age::Occupation::Zip-code
    val schemaforusers = StructType("UserID::Gender::Age::Occupation::Zip-code".split("::")
      .map(column => StructField(column, StringType, true)))
    val userRDDRows = usersRDD.map(_.split("::")).map(line => Row(line(0).trim, line(1).trim,
      line(2).trim, line(3).trim, line(4).trim))

    val usersDataFrame = spark.createDataFrame(userRDDRows, schemaforusers)
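    // Note (a sketch, not from the original listing): the same DataFrame could also be built
    // without a hand-written StructType by mapping each line to a case class and calling
    // .toDF() on the resulting RDD, which works thanks to the spark.implicits._ import above.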


    //ratings.dat  UserID::MovieID::Rating::Timestamp
    val schemaforrating = StructType("UserID::MovieID::Rating::Timestamp".split("::")
      .map(column => StructField(column, StringType, true)))
    val ratingsRDDRows = ratingsRDD.map(_.split("::")).map(line => Row(line(0).trim, line(1).trim,
      line(2).trim, line(3).trim))

    val ratingsDataFrame = spark.createDataFrame(ratingsRDDRows, schemaforrating)
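
    // Filter the ratings down to MovieID 1193, join them with the user profiles on UserID,
    // and count the viewers in each (Gender, Age) group.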
    ratingsDataFrame.filter("MovieID = 1193")
      .join(usersDataFrame, "UserID")
      .select("Gender", "Age")
      .groupBy("Gender", "Age")
      .count()
      .show(10)


    // A busy-wait here (`while (true) {}`) would keep the Spark Web UI at
    // http://localhost:4040 alive for inspection, but it would also make sc.stop() unreachable.
    sc.stop()

  }
}


Running the program produces the following console output:

17/01/11 23:28:14 INFO CodeGenerator: Code generated in 15.558235 ms
+------+---+-----+
|Gender|Age|count|
+------+---+-----+
|     F| 45|   55|
|     M| 50|  102|
|     M|  1|   26|
|     F| 56|   39|
|     F| 50|   43|
|     F| 18|   57|
|     F|  1|   10|
|     M| 18|  192|
|     F| 25|  140|
|     M| 45|  136|
+------+---+-----+
only showing top 10 rows
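
Since the lesson is about Spark SQL, the same aggregation can also be expressed as a SQL statement rather than a DataFrame method chain. The snippet below is a minimal sketch, not part of the original listing; it assumes the usersDataFrame and ratingsDataFrame built in main, registers them as temporary views, and runs the equivalent query through spark.sql:

    ratingsDataFrame.createOrReplaceTempView("ratings")
    usersDataFrame.createOrReplaceTempView("users")

    // Same analysis as the DataFrame chain above, expressed in SQL.
    spark.sql(
      """
        |SELECT u.Gender, u.Age, COUNT(*) AS cnt
        |FROM ratings r JOIN users u ON r.UserID = u.UserID
        |WHERE r.MovieID = 1193
        |GROUP BY u.Gender, u.Age
      """.stripMargin)
      .show(10)

Both forms go through the same Catalyst optimizer, so choosing between them is mostly a matter of readability.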


