15. 数据仓库分层之DWS层、ADS层--用户主题活跃

    在经过了ODS层的加载数据、DWD层的解析数据之后,这一节将具体分析特定指标:DWS层对DWD层的数据进行轻度汇总,ADS层在DWS层的基础上统计最终指标。


用户活跃主题


  1. DWS层目标:统计当日、当周、当月活跃的每个设备明细。

    1. 每日活跃设备明细

      • 建表语句
      hive (gmall)>
      -- Daily active-device detail table of the DWS layer.
      -- The table is external, so by default the DROP below only removes the
      -- metastore entry and leaves the files under LOCATION in place.
      DROP TABLE IF EXISTS dws_uv_detail_day;

      -- All attributes are stored as strings. Data is kept as Parquet and
      -- partitioned by the dt column.
      CREATE EXTERNAL TABLE dws_uv_detail_day (
          `mid_id`       STRING COMMENT '设备唯一标识',
          `user_id`      STRING COMMENT '用户标识',
          `version_code` STRING COMMENT '程序版本号',
          `version_name` STRING COMMENT '程序版本名',
          `lang`         STRING COMMENT '系统语言',
          `source`       STRING COMMENT '渠道号',
          `os`           STRING COMMENT '安卓系统版本',
          `area`         STRING COMMENT '区域',
          `model`        STRING COMMENT '手机型号',
          `brand`        STRING COMMENT '手机品牌',
          `sdk_version`  STRING COMMENT 'sdkVersion',
          `gmail`        STRING COMMENT 'gmail',
          `height_width` STRING COMMENT '屏幕宽高',
          `app_time`     STRING COMMENT '客户端日志产生时的时间',
          `network`      STRING COMMENT '网络模式',
          `lng`          STRING COMMENT '经度',
          `lat`          STRING COMMENT '纬度'
      )
      PARTITIONED BY (dt STRING)
      STORED AS PARQUET
      LOCATION '/warehouse/gmall/dws/dws_uv_detail_day';
      • 插入数据
      hive (gmall)>
      SET hive.exec.dynamic.partition.mode=nonstrict;

      -- Rebuild the 2020-02-03 partition of the daily active-device table from
      -- the start log of the same day.
      -- Rows are grouped by mid_id. For every other column, the distinct values
      -- seen for that device are joined into one string with a pipe separator.
      -- NOTE(review): the target partition is static here, so the nonstrict
      -- setting above looks unnecessary for this statement -- confirm.
      INSERT OVERWRITE TABLE dws_uv_detail_day
      PARTITION (dt = '2020-02-03')
      SELECT
          mid_id,
          concat_ws('|', collect_set(user_id))      AS user_id,
          concat_ws('|', collect_set(version_code)) AS version_code,
          concat_ws('|', collect_set(version_name)) AS version_name,
          concat_ws('|', collect_set(lang))         AS lang,
          concat_ws('|', collect_set(source))       AS source,
          concat_ws('|', collect_set(os))           AS os,
          concat_ws('|', collect_set(area))         AS area,
          concat_ws('|', collect_set(model))        AS model,
          concat_ws('|', collect_set(brand))        AS brand,
          concat_ws('|', collect_set(sdk_version))  AS sdk_version,
          concat_ws('|', collect_set(gmail))        AS gmail,
          concat_ws('|', collect_set(height_width)) AS height_width,
          concat_ws('|', collect_set(app_time))     AS app_time,
          concat_ws('|', collect_set(network))      AS network,
          concat_ws('|', collect_set(lng))          AS lng,
          concat_ws('|', collect_set(lat))          AS lat
      FROM dwd_start_log
      WHERE dt = '2020-02-03'
      GROUP BY mid_id;
      • 查看结果
      hive (gmall)> select * from dws_uv_detail_day limit 1;
    2. 每周活跃设备明细

      • 建表语句
      hive (gmall)>
      -- Weekly active-device detail table of the DWS layer.
      -- The table is external, so by default the DROP below only removes the
      -- metastore entry and leaves the files under LOCATION in place.
      DROP TABLE IF EXISTS dws_uv_detail_wk;

      -- Same device attributes as the daily table, plus the Monday and Sunday
      -- dates of the week. Data is kept as Parquet and partitioned by wk_dt.
      CREATE EXTERNAL TABLE dws_uv_detail_wk (
          `mid_id`       STRING COMMENT '设备唯一标识',
          `user_id`      STRING COMMENT '用户标识',
          `version_code` STRING COMMENT '程序版本号',
          `version_name` STRING COMMENT '程序版本名',
          `lang`         STRING COMMENT '系统语言',
          `source`       STRING COMMENT '渠道号',
          `os`           STRING COMMENT '安卓系统版本',
          `area`         STRING COMMENT '区域',
          `model`        STRING COMMENT '手机型号',
          `brand`        STRING COMMENT '手机品牌',
          `sdk_version`  STRING COMMENT 'sdkVersion',
          `gmail`        STRING COMMENT 'gmail',
          `height_width` STRING COMMENT '屏幕宽高',
          `app_time`     STRING COMMENT '客户端日志产生时的时间',
          `network`      STRING COMMENT '网络模式',
          `lng`          STRING COMMENT '经度',
          `lat`          STRING COMMENT '纬度',
          `monday_date`  STRING COMMENT '周一日期',
          `sunday_date`  STRING COMMENT '周日日期'
      )
      COMMENT '活跃用户按周明细'
      PARTITIONED BY (`wk_dt` STRING)
      STORED AS PARQUET
      LOCATION '/warehouse/gmall/dws/dws_uv_detail_wk/';
      • 插入数据
      SET hive.exec.dynamic.partition.mode=nonstrict;

      -- Rebuild the weekly active-device partition for the week that contains
      -- 2020-02-03, reading from the daily detail table.
      -- next_day with the mo argument returns the first Monday after the given
      -- date. Subtracting 7 days gives the Monday of the current week and
      -- subtracting 1 day gives its Sunday.
      -- The last SELECT expression supplies the dynamic partition value wk_dt
      -- in the form monday_sunday, which is why nonstrict mode is set above.
      INSERT OVERWRITE TABLE dws_uv_detail_wk
      PARTITION (wk_dt)
      SELECT
          mid_id,
          concat_ws('|', collect_set(user_id))      AS user_id,
          concat_ws('|', collect_set(version_code)) AS version_code,
          concat_ws('|', collect_set(version_name)) AS version_name,
          concat_ws('|', collect_set(lang))         AS lang,
          concat_ws('|', collect_set(source))       AS source,
          concat_ws('|', collect_set(os))           AS os,
          concat_ws('|', collect_set(area))         AS area,
          concat_ws('|', collect_set(model))        AS model,
          concat_ws('|', collect_set(brand))        AS brand,
          concat_ws('|', collect_set(sdk_version))  AS sdk_version,
          concat_ws('|', collect_set(gmail))        AS gmail,
          concat_ws('|', collect_set(height_width)) AS height_width,
          concat_ws('|', collect_set(app_time))     AS app_time,
          concat_ws('|', collect_set(network))      AS network,
          concat_ws('|', collect_set(lng))          AS lng,
          concat_ws('|', collect_set(lat))          AS lat,
          date_add(next_day('2020-02-03', 'mo'), -7) AS monday_date,
          date_add(next_day('2020-02-03', 'mo'), -1) AS sunday_date,
          concat(
              date_add(next_day('2020-02-03', 'mo'), -7),
              '_',
              date_add(next_day('2020-02-03', 'mo'), -1)
          ) AS wk_dt
      FROM dws_uv_detail_day
      WHERE dt >= date_add(next_day('2020-02-03', 'mo'), -7)
        AND dt <= date_add(next_day('2020-02-03', 'mo'), -1)
      GROUP BY mid_id;
      • 查询导入结果
      -- Spot-check: fetch a single row from the weekly table to confirm the load.
      select * from dws_uv_detail_wk limit 1;
    3. 每月活跃设备明细

      • 建表语句
      hive (gmall)>
      -- Monthly active-device detail table of the DWS layer.
      -- The table is external, so by default the DROP below only removes the
      -- metastore entry and leaves the files under LOCATION in place.
      DROP TABLE IF EXISTS dws_uv_detail_mn;

      -- Same device attributes as the daily table. Data is kept as Parquet and
      -- partitioned by the mn column.
      CREATE EXTERNAL TABLE dws_uv_detail_mn (
          `mid_id`       STRING COMMENT '设备唯一标识',
          `user_id`      STRING COMMENT '用户标识',
          `version_code` STRING COMMENT '程序版本号',
          `version_name` STRING COMMENT '程序版本名',
          `lang`         STRING COMMENT '系统语言',
          `source`       STRING COMMENT '渠道号',
          `os`           STRING COMMENT '安卓系统版本',
          `area`         STRING COMMENT '区域',
          `model`        STRING COMMENT '手机型号',
          `brand`        STRING COMMENT '手机品牌',
          `sdk_version`  STRING COMMENT 'sdkVersion',
          `gmail`        STRING COMMENT 'gmail',
          `height_width` STRING COMMENT '屏幕宽高',
          `app_time`     STRING COMMENT '客户端日志产生时的时间',
          `network`      STRING COMMENT '网络模式',
          `lng`          STRING COMMENT '经度',
          `lat`          STRING COMMENT '纬度'
      )
      COMMENT '活跃用户按月明细'
      PARTITIONED BY (`mn` STRING)
      STORED AS PARQUET
      LOCATION '/warehouse/gmall/dws/dws_uv_detail_mn/';
      • 插入数据
      SET hive.exec.dynamic.partition.mode=nonstrict;

      -- Rebuild the monthly active-device partition for the month that contains
      -- 2020-02-03, reading from the daily detail table.
      -- The partition column mn is fully dynamic (taken from the last SELECT
      -- expression, formatted as yyyy-MM). Hive rejects an insert with no static
      -- partition while the mode is strict, so nonstrict mode is set above to
      -- keep this statement runnable on its own, as the weekly load does.
      -- Rows are grouped by mid_id. For every other column, the distinct values
      -- seen for that device are joined into one string with a pipe separator.
      INSERT OVERWRITE TABLE dws_uv_detail_mn
      PARTITION (mn)
      SELECT
          mid_id,
          concat_ws('|', collect_set(user_id))      AS user_id,
          concat_ws('|', collect_set(version_code)) AS version_code,
          concat_ws('|', collect_set(version_name)) AS version_name,
          concat_ws('|', collect_set(lang))         AS lang,
          concat_ws('|', collect_set(source))       AS source,
          concat_ws('|', collect_set(os))           AS os,
          concat_ws('|', collect_set(area))         AS area,
          concat_ws('|', collect_set(model))        AS model,
          concat_ws('|', collect_set(brand))        AS brand,
          concat_ws('|', collect_set(sdk_version))  AS sdk_version,
          concat_ws('|', collect_set(gmail))        AS gmail,
          concat_ws('|', collect_set(height_width)) AS height_width,
          concat_ws('|', collect_set(app_time))     AS app_time,
          concat_ws('|', collect_set(network))      AS network,
          concat_ws('|', collect_set(lng))          AS lng,
          concat_ws('|', collect_set(lat))          AS lat,
          date_format('2020-02-03', 'yyyy-MM')      AS mn
      FROM dws_uv_detail_day
      WHERE date_format(dt, 'yyyy-MM') = date_format('2020-02-03', 'yyyy-MM')
      GROUP BY mid_id;
      • 查询结果
      hive (gmall)> select * from dws_uv_detail_mn limit 2;
      • 将剩余数据导入
  2. ADS层目标:计算当日、当周、当月活跃设备数

    • 建表语句
    hive (gmall)>
    -- ADS-layer result table holding active-device counts per statistic date.
    -- The table is external, so by default the DROP below only removes the
    -- metastore entry and leaves the files under LOCATION in place.
    DROP TABLE IF EXISTS ads_uv_count;

    -- Rows are stored as delimited records with tab-separated fields.
    -- The table is not partitioned.
    CREATE EXTERNAL TABLE ads_uv_count (
        `dt`          STRING COMMENT '统计日期',
        `day_count`   BIGINT COMMENT '当日用户数量',
        `wk_count`    BIGINT COMMENT '当周用户数量',
        `mn_count`    BIGINT COMMENT '当月用户数量',
        `is_weekend`  STRING COMMENT 'Y,N是否是周末,用于得到本周最终结果',
        `is_monthend` STRING COMMENT 'Y,N是否是月末,用于得到本月最终结果'
    )
    COMMENT '活跃设备数'
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    LOCATION '/warehouse/gmall/ads/ads_uv_count/';
    • 导入数据
    hive (gmall)> 
    -- Append one row with the daily, weekly and monthly active-device counts
    -- for the statistic date 2020-02-03.
    -- Each subquery is an aggregate without GROUP BY, so it yields exactly one
    -- row tagged with the same literal dt. The joins on dt therefore just place
    -- the three counts side by side.
    -- is_weekend is Y when the statistic date is the Sunday of its week, and
    -- is_monthend is Y when it is the last day of its month.
    -- Every date expression uses the same statistic date, so the statement can
    -- be parameterised by replacing that single literal. The monthly filter
    -- previously used a different hard-coded day of the same month.
    -- NOTE(review): INSERT INTO appends, so re-running for the same date adds a
    -- duplicate row -- confirm whether that is acceptable.
    INSERT INTO TABLE ads_uv_count
    SELECT
        '2020-02-03',
        daycount.ct,
        weekcount.ct,
        mncount.ct,
        if(date_add(next_day('2020-02-03', 'mo'), -1) = '2020-02-03', 'Y', 'N'),
        if(last_day('2020-02-03') = '2020-02-03', 'Y', 'N')
    FROM
    (
        -- Devices active on the statistic date.
        SELECT
            '2020-02-03' AS dt,
            count(*)     AS ct
        FROM dws_uv_detail_day
        WHERE dt = '2020-02-03'
    ) daycount
    JOIN
    (
        -- Devices active in the Monday-to-Sunday week containing the statistic date.
        SELECT
            '2020-02-03' AS dt,
            count(*)     AS ct
        FROM dws_uv_detail_wk
        WHERE wk_dt = concat(
                  date_add(next_day('2020-02-03', 'mo'), -7),
                  '_',
                  date_add(next_day('2020-02-03', 'mo'), -1)
              )
    ) weekcount
        ON daycount.dt = weekcount.dt
    JOIN
    (
        -- Devices active in the calendar month containing the statistic date.
        SELECT
            '2020-02-03' AS dt,
            count(*)     AS ct
        FROM dws_uv_detail_mn
        WHERE mn = date_format('2020-02-03', 'yyyy-MM')
    ) mncount
        ON daycount.dt = mncount.dt;
    • 加载剩余数据
    [hadoop@hadoop151 bin]$ ads_uv_log.sh 2020-01-01 2020-01-31
  3. 查询ADS层中的数据。

    hive (gmall)> select * from ads_uv_count;
    OK
    ads_uv_count.dt    ads_uv_count.day_count    ads_uv_count.wk_count    ads_uv_count.mn_count    ads_uv_count.is_weekend    ads_uv_count.is_monthend
    2020-02-03    741    741     741     N    N
    2020-01-01    521    990     1000    N    N
    2020-01-10    728    999     1000    N    N
    2020-01-11    763    999     1000    N    N
    2020-01-12    742    999     1000    Y    N
    2020-01-13    444    1000    1000    N    N
    2020-01-14    757    1000    1000    N    N
    2020-01-15    757    1000    1000    N    N
    2020-01-16    756    1000    1000    N    N
    2020-01-17    744    1000    1000    N    N
    2020-01-18    746    1000    1000    N    N
    2020-01-19    722    1000    1000    Y    N
    2020-01-02    506    990     1000    N    N
    2020-01-20    751    1000    1000    N    N
    2020-01-21    742    1000    1000    N    N
    2020-01-22    760    1000    1000    N    N
    2020-01-23    750    1000    1000    N    N
    2020-01-24    784    1000    1000    N    N
    2020-01-25    578    1000    1000    N    N
    2020-01-26    866    1000    1000    Y    N
    2020-01-27    750    999     1000    N    N
    2020-01-28    757    999     1000    N    N
    2020-01-29    745    999     1000    N    N
    2020-01-03    736    990     1000    N    N
    2020-01-30    766    999     1000    N    N
    2020-01-31    657    999     1000    N    Y
    2020-01-04    502    990     1000    N    N
    2020-01-05    759    990     1000    Y    N
    2020-01-06    762    999     1000    N    N
    2020-01-07    772    999     1000    N    N
    2020-01-08    735    999     1000    N    N
    2020-01-09    29    999     1000     N    N
    Time taken: 0.042 seconds, Fetched: 32 row(s)

    周活跃设备数接近月活跃设备数,可能是由生成的模拟埋点数据导致的;因为这里没有真实的生产数据,所以这样的结果在意料之中。

你可能感兴趣的:(数据仓库,hive)