DataFrame API&SQL练习


测试数据

  • customers.csv
  • orders.csv
  • order_items.csv
  • products.csv
  • 哪位用户买的商品最多?
    • SQL
    spark.sql("select c.customer_id, c.customer_fname, c.customer_lname, tc.order_nums 
    from customers c 
    left join 
      (select t.order_customer_id as customer_id, count(t.order_customer_id) as   order_nums 
        from (select * 
              from orders 
              where order_status = 'COMPLETE') t 
        group by t.order_customer_id) tc 
        on tc.customer_id = c.customer_id 
        order by tc.order_nums desc").show(3)
    
    +-----------+--------------+--------------+----------+                          
    |customer_id|customer_fname|customer_lname|order_nums|
    +-----------+--------------+--------------+----------+
    |       9337|          Mary|         Smith|        10|
    |       7802|          Mary|       Acevedo|         9|
    |        749|         Jesse|      Matthews|         9|
    +-----------+--------------+--------------+----------+
    
    
    spark.sql("select c.customer_id, c.customer_fname, c.customer_lname,count(c.customer_id) as order_nums, sum(oi.order_item_subtotal) as total 
    from customers c 
    inner join orders o on c.customer_id = o.order_customer_id 
    inner join order_items oi on o.order_id = oi.order_item_order_id 
    group by c.customer_id, c.customer_fname, c.customer_lname 
    order by total desc").show
     
    +-----------+--------------+--------------+----------+------------------+       
    |customer_id|customer_fname|customer_lname|order_nums|             total|
    +-----------+--------------+--------------+----------+------------------+
    |        791|          Mary|         Smith|        43|10524.169999999995|
    |       9371|          Mary|     Patterson|        44| 9299.029999999993|
    |       8766|          Mary|        Duncan|        38| 9296.139999999998|
    +-----------+--------------+--------------+----------+------------------+
    
    
    • DataFrame API
    dfCustomer.alias("c")
    .join(dfOrder.alias("o"), $"c.customer_id" === $"o.order_customer_id", "right_outer")
    .filter($"o.order_status" === "COMPLETE")
    .select("c.customer_id", "c.customer_fname", "c.customer_lname", "o.order_id")
    .groupBy($"customer_id",$"customer_fname", $"customer_lname")
    .agg(count($"order_id").as("order_nums"))
    .sort($"order_nums".desc).show(3)
    
    +-----------+--------------+--------------+----------+                          
    |customer_id|customer_fname|customer_lname|order_nums|
    +-----------+--------------+--------------+----------+
    |       9337|          Mary|         Smith|        10|
    |       7802|          Mary|       Acevedo|         9|
    |       3710|        Ashley|         Smith|         9|
    +-----------+--------------+--------------+----------+
    
  • 那个商品卖的最多?
    • SQL
    // 不考虑是否交易完成
    spark.sql("select p.product_id, p.product_name, p.product_category_id, p.product_price, tp.nums 
    from products p 
    left join 
      (select order_item_product_id as product_id, count(*) as nums  
        from order_items 
        group by order_item_product_id) tp 
     on p.product_id = tp.product_id order by tp.nums desc").show(3)
    
      +----------+--------------------+-------------------+-------------+-----+       
      |product_id|        product_name|product_category_id|product_price| nums|
      +----------+--------------------+-------------------+-------------+-----+
      |       365|Perfect Fitness P...|                 17|        59.99|24515|
      |       403|Nike Men's CJ Eli...|                 18|       129.99|22246|
      |       502|Nike Men's Dri-FI...|                 24|         50.0|21035|
      +----------+--------------------+-------------------+-------------+-----+
    
    
    // 只考虑已完成订单中的商品
    spark.sql("select p.product_id, p.product_name, p.product_category_id, p.product_price, tp.nums 
      from products p 
      right join (select order_item_product_id as product_id, count(*) as nums  
                  from (select * 
                        from order_items ot 
                        left join orders o 
                        on o.order_id = ot.order_item_order_id ) s 
                   where s.order_status = 'COMPLETE' 
                   group by order_item_product_id) tp 
                   on p.product_id = tp.product_id 
                   order by tp.nums desc").show(3)
    
      +----------+--------------------+-------------+------------+                    
      |product_id|        product_name|product_price|product_nums|
      +----------+--------------------+-------------+------------+
      |       365|Perfect Fitness P...|        59.99|        8071|
      |       403|Nike Men's CJ Eli...|       129.99|        7369|
      |       502|Nike Men's Dri-FI...|         50.0|        7036|
      +----------+--------------------+-------------+------------+
    
    
    • DataFrame API
     dfProduct.alias("p")
      .join(dfOrderItem.alias("ot"), $"p.product_id" === $"ot.order_item_product_id", "right_outer")
      .select($"product_id",$"product_name", $"product_price", $"order_item_order_id", $"product_category_id")   // 可省略
      .join(dfOrder.alias("o"), $"o.order_id" === $"order_item_order_id", "left_outer")
      .filter($"order_status" === "COMPLETE").groupBy($"product_id", $"product_name", $"product_price")
      .agg(count($"product_id").as("product_nums"))
      .sort($"product_nums".desc).show(3)
    
      +----------+--------------------+-------------+------------+                    
      |product_id|        product_name|product_price|product_nums|
      +----------+--------------------+-------------+------------+
      |       365|Perfect Fitness P...|        59.99|        8071|
      |       403|Nike Men's CJ Eli...|       129.99|        7369|
      |       502|Nike Men's Dri-FI...|         50.0|        7036|
      +----------+--------------------+-------------+------------+
    

    注:此结果只统计成功交易的订单中的商品


  • 周一到周末的购物分布情况?
    • SQL
    spark.sql("select dayOfWeek(order_date) as dayofweek, count(order_id) as order_nums 
               from orders 
                  where order_status = 'COMPLETE' 
               group by dayofweek 
               order by order_nums desc").show
    
     +---------+----------+                                                          
     |dayofweek|order_nums|
     +---------+----------+
     |        6|      3387|
     |        5|      3371|
     |        2|      3356|
     |        4|      3261|
     |        3|      3232|
     |        7|      3203|
     |        1|      3089|
     +---------+----------+
    
    • DataFrame API
    dfOrder.filter($"order_status" === "COMPLETE").withColumn("dayofweek", day_Of_Week($"order_date"))
    .select("dayofweek", "order_id").groupBy($"dayofweek").agg(count($"order_id").as("order_nums"))
    .sort($"order_nums".desc).show  
    
    +---------+----------+                                                          
    |dayofweek|order_nums|
    +---------+----------+
    |        6|      3387|
    |        5|      3371|
    |        2|      3356|
    |        4|      3261|
    |        3|      3232|
    |        7|      3203|
    |        1|      3089|
    +---------+----------+
    

    注:dayOfWeek和day_Of_Week是自定义函数


  • 一月到十二月的购物分布情况?
    • SQL
    // 使用date_format函数
    spark.sql("select t.month as month, count(t.month) as order_nums 
    from (select order_id, date_format(order_date, 'MM') as month, order_customer_id 
        from orders 
        where order_status = 'COMPLETE') t 
    group by t.month 
    order by order_nums desc").show(12)
    
      +-----+----------+                                                              
      |month|order_nums|
      +-----+----------+
      |   11|      2141|
      |   03|      1967|
      |   07|      1934|
      |   09|      1933|
      |   04|      1932|
      |   01|      1911|
      |   12|      1898|
      |   08|      1880|
      |   02|      1869|
      |   05|      1854|
      |   06|      1797|
      |   10|      1783|
      +-----+----------+
    
    
    • DataFrame API
    dfOrder.filter($"order_status" === "COMPLETE").withColumn("month", month_Of_Year($"order_date"))
    .select("month", "order_id").groupBy($"month").agg(count($"order_id").as("order_nums"))
    .sort($"order_nums".desc).show
    
      +-----+----------+                                                              
      |month|order_nums|
      +-----+----------+
      |   11|      2141|
      |    3|      1967|
      |    7|      1934|
      |    9|      1933|
      |    4|      1932|
      |    1|      1911|
      |   12|      1898|
      |    8|      1880|
      |    2|      1869|
      |    5|      1854|
      |    6|      1797|
      |   10|      1783|
      +-----+----------+
    
    

注:month_Of_Year是自定义函数

你可能感兴趣的:(DataFrame API&SQL练习)