Hadoop(二十七)HIVE的高级应用之业绩报表

一. 涉及到中重点知识

  • Hive的窗口和分析函数进阶

      • NTILE、CUME_DIST、PERCENT_RANK、LEAD、LAG、
      FIRST_VALUE、LAST_VALUE等
      • ROWS BETWEEN ... AND ...
      – CURRENT ROW,rows PRECEDING,rows FOLLOWING
    

二,项目中流程

Hadoop(二十七)HIVE的高级应用之业绩报表_第1张图片
图片.png

三.项目设计

  --========== rates ==========--

  app0  1
  app1  2
  app2  2
  app3  3
  app4  3
  app5  3
  app6  5
  app7  5
  app8  5
  app9  5

  CREATE EXTERNAL TABLE rates (
      app_name    STRING 
    , star_rates  STRING
  )
  ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
  LOCATION '/tmp/db/rates';

  create table app_ranks as
    select app_name as app
         , star_rates as stars
         , NTILE(3) OVER (ORDER BY star_rates DESC) as nt
         , row_number() OVER (ORDER BY star_rates DESC) as rn
         , rank() OVER (ORDER BY star_rates DESC) as rk
         , dense_rank() OVER (ORDER BY star_rates DESC) as drk
         , CUME_DIST() OVER (ORDER BY star_rates) as cd
         , PERCENT_RANK() OVER (ORDER BY star_rates) as pr
    from rates
    order by stars desc
  ;

  select app, stars, cd, sum(cd) OVER (PARTITION BY stars ORDER BY rn ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) from app_ranks;

  select app, stars, cd, sum(cd) OVER (PARTITION BY stars ORDER BY rn ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) from app_ranks;

  select app, stars, cd, sum(cd) OVER (PARTITION BY stars ORDER BY rn ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING) from app_ranks;

  select app, stars, rn, lead(rn) OVER (PARTITION BY stars ORDER BY rn), lag(rn) OVER (PARTITION BY stars ORDER BY rn) from app_ranks;


  --========== visitors ==========--

  d001  201301  101
  d002  201301  102
  d003  201301  103
  d001  201302  111
  d002  201302  112
  d003  201302  113
  d001  201303  121
  d002  201303  122
  d003  201303  123
  d001  201304  131
  d002  201304  132
  d003  201304  133
  d001  201305  141
  d002  201305  142
  d003  201305  143
  d001  201306  151
  d002  201306  152
  d003  201306  153
  d001  201307  201
  d002  201307  202
  d003  201307  203
  d001  201308  211
  d002  201308  212
  d003  201308  213
  d001  201309  221
  d002  201309  222
  d003  201309  223
  d001  201310  231
  d002  201310  232
  d003  201310  233
  d001  201311  241
  d002  201311  242
  d003  201311  243
  d001  201312  301
  d002  201312  302
  d003  201312  303
  d001  201401  301
  d002  201401  302
  d003  201401  303
  d001  201402  211
  d002  201402  212
  d003  201402  213
  d001  201403  271
  d002  201403  272
  d003  201403  273
  d001  201404  331
  d002  201404  332
  d003  201404  333
  d001  201405  351
  d002  201405  352
  d003  201405  353

  CREATE EXTERNAL TABLE visitors (
      domain  STRING 
    , month   STRING
    , visitor STRING
  )
  ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
  LOCATION '/tmp/db/visitors';

  select * from visitors where domain = 'd001';

  select domain
       , month
       , visitor
       , first_value(visitor) OVER (PARTITION BY domain ORDER BY month DESC)
       , last_value(visitor) OVER (PARTITION BY domain ORDER BY month DESC)
       , lead(visitor) OVER (PARTITION BY domain ORDER BY month DESC)
       , lag(visitor) OVER (PARTITION BY domain ORDER BY month DESC)
  from visitors
  where domain = 'd001';

  select domain
       , month
       , visitor
       , first_value(visitor) OVER (PARTITION BY domain ORDER BY month DESC)
       , last_value(visitor) OVER (PARTITION BY domain ORDER BY month DESC)
       , lead(visitor, 1, 0) OVER (PARTITION BY domain ORDER BY month DESC)
       , lag(visitor, 1, 0) OVER (PARTITION BY domain ORDER BY month DESC)
       , lead(visitor, 12, 0) OVER (PARTITION BY domain ORDER BY month DESC)
       , lag(visitor, 12, 0) OVER (PARTITION BY domain ORDER BY month DESC)
  from visitors
  where domain = 'd001';

  create table visitors_report as
    select domain
         , month
         , visitor
         , lead(visitor, 1, 0) OVER (PARTITION BY domain ORDER BY month DESC)            as last_mon
         , visitor - lead(visitor, 1, 0) OVER (PARTITION BY domain ORDER BY month DESC)  as delta_mon
         , lead(visitor, 12, 0) OVER (PARTITION BY domain ORDER BY month DESC)           as last_year
         , visitor - lead(visitor, 12, 0) OVER (PARTITION BY domain ORDER BY month DESC) as delta_year
    from visitors
  ;

  select * from visitors_report where domain = 'd001' and month > '2014';

  select month
       , domain
       , visitor
       , last_mon
       , last_year
  from visitors_report
  where (domain = 'd001' or domain = 'd002') and month > '2014'
  order by month desc, domain asc
  limit 100;

  select month
       , domain
       , visitor
       , max(visitor) OVER (PARTITION BY month) as max_visitors
       , min(visitor) OVER (PARTITION BY month) as min_visitors
  from visitors
  where month > '2014'
  order by month desc, domain asc;

  select *
  from (
  select month
       , domain
       , visitor
       , max(visitor) OVER (PARTITION BY domain ORDER BY month DESC ROWS BETWEEN CURRENT ROW AND 12 FOLLOWING) as max_visitors_last_12_mon
       , min(visitor) OVER (PARTITION BY domain ORDER BY month DESC ROWS BETWEEN CURRENT ROW AND 12 FOLLOWING) as min_visitors_last_12_mon
  from visitors
  ) v
  where month > '20131'
  order by month desc, domain asc;

你可能感兴趣的:(Hadoop(二十七)HIVE的高级应用之业绩报表)