Spark SQL API——使用Case Class创建Dataset

package com.njbdqn.mydataset

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

/**
 * 使用dataSet完成零售商店指标统计
 * 1、使用RDD装载零售商店业务数据
 * customers.csv、orders.csv、order_items.csv、products.csv
 * 2、定义样例类
 * 将RDD转成DataSet
 */
object MyThirdSpark1 {

  // Case classes mirroring each CSV file's columns (all fields kept as String,
  // matching the source files, which quote every value).
  case class Customers(userid:String,fname:String,lname:String,tel:String,email:String,addr:String,city:String,state:String,zip:String)
  case class Orders(ordid:String,orddate:String,userid:String,ordstatus:String)
  case class OrderItems(id:String,ordid:String,proid:String,buynum:String,countPrice:String,price:String)
  case class Products(proid:String,protype:String,title:String,price:String,img:String)

  /**
   * Strips the double quotes the CSV wraps every field in, then splits on commas.
   *
   * The `-1` limit keeps trailing empty fields: the original `split(",")`
   * silently dropped them, so a row whose last columns were empty produced a
   * short array and an ArrayIndexOutOfBoundsException on access.
   *
   * NOTE(review): this is a naive CSV parse — a field that itself contains a
   * comma will be split incorrectly; confirm the source files never do.
   *
   * @param line one raw CSV line
   * @return the cleaned column values, empty trailing columns included
   */
  private def parseLine(line: String): Array[String] =
    line.replaceAll("\"", "").split(",", -1)

  def main(args: Array[String]): Unit = {

    // Local SparkSession with two worker threads.
    val spark = SparkSession.builder().master("local[2]").appName("myshops").getOrCreate()

    // Load the raw retail CSV files as RDD[String]; customers is cached
    // for potential reuse.
    val customers = spark.sparkContext.textFile("file:///d:/datas/dataset/customers.csv").cache()
    val orders = spark.sparkContext.textFile("file:///d:/datas/dataset/orders.csv")
    val orderItems = spark.sparkContext.textFile("file:///d:/datas/dataset/order_items.csv")
    val products = spark.sparkContext.textFile("file:///d:/datas/dataset/products.csv")

    // Required for the RDD -> Dataset conversions (toDS) below.
    import spark.implicits._

    // Convert each RDD into a typed Dataset via its case class.
    val cus = customers.map { line =>
      val e = parseLine(line)
      Customers(e(0), e(1), e(2), e(3), e(4), e(5), e(6), e(7), e(8))
    }.toDS()

    val ords = orders.map { line =>
      val e = parseLine(line)
      Orders(e(0), e(1), e(2), e(3))
    }.toDS()

    val orditms = orderItems.map { line =>
      val e = parseLine(line)
      OrderItems(e(0), e(1), e(2), e(3), e(4), e(5))
    }.toDS()

    // products.csv contains an empty column; the original code collapsed
    // ",," into "," after quote removal so the row still yields 5 fields.
    // Kept as-is to preserve behavior — note it shifts later columns left.
    val pro = products.map { line =>
      val cleaned = line.replaceAll("\"", "").replaceAll(",,", ",")
      val e = cleaned.split(",", -1)
      Products(e(0), e(1), e(2), e(3), e(4))
    }.toDS()

    // Sanity check of the parsed product data.
    pro.show()

    // Query 1: which product has the highest total sales volume?
    // (sum of buynum per proid, take the top row, join back for product info)
    orditms
      .groupBy("proid")
      .agg(sum("buynum").alias("num"))
      .orderBy(desc("num"))
      .limit(1)
      .join(pro, "proid")
      .show()

    // Release local Spark resources (the original never stopped the session).
    spark.stop()
  }

}

查询结果:
（此处为查询结果截图：销量最高的产品）

你可能感兴趣的:(spark)