14. 数据仓库分层之DWD层(2)

DWD层:对ODS层数据进行清洗(去除空值,脏数据,超过极限范围的数据,行式存储改为列存储,改压缩格式)

接上一节内容。在事件表解析成功后,我们需要将事件表中的数据详细解析。以字段“en”来判断事件名称,最后具体到确定的表。


  1. 商品点击表。14. 数据仓库分层之DWD层(2)_第1张图片

    • 建表语句
    hive (gmall)> 
    drop table if exists dwd_display_log;
    CREATE EXTERNAL TABLE dwd_display_log(
       `mid_id` string,
       `user_id` string,
       `version_code` string,
       `version_name` string,
       `lang` string,
       `source` string,
       `os` string,
       `area` string,
       `model` string,
       `brand` string,
       `sdk_version` string,
       `gmail` string,
       `height_width` string,
       `app_time` string,
       `network` string,
       `lng` string,
       `lat` string,
       `action` string,
       `goodsid` string,
       `place` string,
       `extend1` string,
       `category` string,
       `server_time` string
    )
    PARTITIONED BY (dt string)
    location '/warehouse/gmall/dwd/dwd_display_log/';
    • 导入数据
    hive (gmall)> 
    set hive.exec.dynamic.partition.mode=nonstrict;
    
    insert overwrite table dwd_display_log
    PARTITION (dt='2020-02-03')
    select 
       mid_id,
       user_id,
       version_code,
       version_name,
       lang,
       source,
       os,
       area,
       model,
       brand,
       sdk_version,
       gmail,
       height_width,
       app_time,
       network,
       lng,
       lat,
       get_json_object(event_json,'$.kv.action') action,
       get_json_object(event_json,'$.kv.goodsid') goodsid,
       get_json_object(event_json,'$.kv.place') place,
       get_json_object(event_json,'$.kv.extend1') extend1,
       get_json_object(event_json,'$.kv.category') category,
       server_time
    from dwd_base_event_log 
    where dt='2020-02-03' and event_name='display';
    • 测试
    hive (gmall)> select * from dwd_display_log limit 2;
  2. 商品详情页表

    • 建表语句
    hive (gmall)> drop table if exists dwd_newsdetail_log;
    CREATE EXTERNAL TABLE dwd_newsdetail_log(
       `mid_id` string,
       `user_id` string, 
       `version_code` string, 
       `version_name` string, 
       `lang` string, 
       `source` string, 
       `os` string, 
       `area` string, 
       `model` string,
       `brand` string, 
       `sdk_version` string, 
       `gmail` string, 
       `height_width` string, 
       `app_time` string,  
       `network` string, 
       `lng` string, 
       `lat` string, 
       `entry` string,
       `action` string,
       `goodsid` string,
       `showtype` string,
       `news_staytime` string,
       `loading_time` string,
       `type1` string,
       `category` string,
       `server_time` string)
    PARTITIONED BY (dt string)
    location '/warehouse/gmall/dwd/dwd_newsdetail_log/';
    • 导入数据
    hive (gmall)> set hive.exec.dynamic.partition.mode=nonstrict;
    
    insert overwrite table dwd_newsdetail_log
    PARTITION (dt='2020-02-03')
    select 
       mid_id,
       user_id,
       version_code,
       version_name,
       lang,
       source,
       os,
       area,
       model,
       brand,
       sdk_version,
       gmail,
       height_width,
       app_time,
       network,
       lng,
       lat,
       get_json_object(event_json,'$.kv.entry') entry,
       get_json_object(event_json,'$.kv.action') action,
       get_json_object(event_json,'$.kv.goodsid') goodsid,
       get_json_object(event_json,'$.kv.showtype') showtype,
       get_json_object(event_json,'$.kv.news_staytime') news_staytime,
       get_json_object(event_json,'$.kv.loading_time') loading_time,
       get_json_object(event_json,'$.kv.type1') type1,
       get_json_object(event_json,'$.kv.category') category,
       server_time
    from dwd_base_event_log
    where dt='2020-02-03' and event_name='newsdetail';
    • 测试
    hive (gmall)> select * from dwd_newsdetail_log limit 2;
  3. 商品列表页表

    • 建表语句
    hive (gmall)> 
    drop table if exists dwd_loading_log;
    CREATE EXTERNAL TABLE dwd_loading_log(
       `mid_id` string,
       `user_id` string, 
       `version_code` string, 
       `version_name` string, 
       `lang` string, 
       `source` string, 
       `os` string, 
       `area` string, 
       `model` string,
       `brand` string, 
       `sdk_version` string, 
       `gmail` string,
       `height_width` string,  
       `app_time` string,
       `network` string, 
       `lng` string, 
       `lat` string, 
       `action` string,
       `loading_time` string,
       `loading_way` string,
       `extend1` string,
       `extend2` string,
       `type` string,
       `type1` string,
       `server_time` string)
    PARTITIONED BY (dt string)
    location '/warehouse/gmall/dwd/dwd_loading_log/';
    • 导入数据
    hive (gmall)> set hive.exec.dynamic.partition.mode=nonstrict;
    
    insert overwrite table dwd_loading_log
    PARTITION (dt='2020-02-03')
    select 
       mid_id,
       user_id,
       version_code,
       version_name,
       lang,
       source,
       os,
       area,
       model,
       brand,
       sdk_version,
       gmail,
       height_width,
       app_time,
       network,
       lng,
       lat,
       get_json_object(event_json,'$.kv.action') action,
       get_json_object(event_json,'$.kv.loading_time') loading_time,
       get_json_object(event_json,'$.kv.loading_way') loading_way,
       get_json_object(event_json,'$.kv.extend1') extend1,
       get_json_object(event_json,'$.kv.extend2') extend2,
       get_json_object(event_json,'$.kv.type') type,
       get_json_object(event_json,'$.kv.type1') type1,
       server_time
    from dwd_base_event_log
    where dt='2020-02-03' and event_name='loading';
    • 测试
    hive (gmall)> select * from dwd_loading_log limit 2;
  4. 广告表

    • 建表语句
    hive (gmall)> 
    drop table if exists dwd_ad_log;
    CREATE EXTERNAL TABLE dwd_ad_log(
       `mid_id` string,
       `user_id` string, 
       `version_code` string, 
       `version_name` string, 
       `lang` string, 
       `source` string, 
       `os` string, 
       `area` string, 
       `model` string,
       `brand` string, 
       `sdk_version` string, 
       `gmail` string, 
       `height_width` string,  
       `app_time` string,
       `network` string, 
       `lng` string, 
       `lat` string, 
       `entry` string,
       `action` string,
       `content` string,
       `detail` string,
       `ad_source` string,
       `behavior` string,
       `newstype` string,
       `show_style` string,
       `server_time` string)
    PARTITIONED BY (dt string)
    location '/warehouse/gmall/dwd/dwd_ad_log/';
    • 插入数据
    hive (gmall)> 
    set hive.exec.dynamic.partition.mode=nonstrict;
    
    insert overwrite table dwd_ad_log
    PARTITION (dt='2020-02-03')
    select 
       mid_id,
       user_id,
       version_code,
       version_name,
       lang,
       source,
       os,
       area,
       model,
       brand,
       sdk_version,
       gmail,
       height_width,
       app_time,
       network,
       lng,
       lat,
       get_json_object(event_json,'$.kv.entry') entry,
       get_json_object(event_json,'$.kv.action') action,
       get_json_object(event_json,'$.kv.content') content,
       get_json_object(event_json,'$.kv.detail') detail,
       get_json_object(event_json,'$.kv.source') ad_source,
       get_json_object(event_json,'$.kv.behavior') behavior,
       get_json_object(event_json,'$.kv.newstype') newstype,
       get_json_object(event_json,'$.kv.show_style') show_style,
       server_time
    from dwd_base_event_log 
    where dt='2020-02-03' and event_name='ad';
    • 测试
    hive (gmall)> select * from dwd_ad_log limit 2;
  5. 消息通知表

    • 建表语句
    hive (gmall)> 
    drop table if exists dwd_notification_log;
    CREATE EXTERNAL TABLE dwd_notification_log(
       `mid_id` string,
       `user_id` string, 
       `version_code` string, 
       `version_name` string, 
       `lang` string,
       `source` string, 
       `os` string, 
       `area` string, 
       `model` string,
       `brand` string, 
       `sdk_version` string, 
       `gmail` string, 
       `height_width` string,  
       `app_time` string,
       `network` string, 
       `lng` string, 
       `lat` string, 
       `action` string,
       `noti_type` string,
       `ap_time` string,
       `content` string,
       `server_time` string
    )
    PARTITIONED BY (dt string)
    location '/warehouse/gmall/dwd/dwd_notification_log/';
    • 插入数据
    hive (gmall)> set hive.exec.dynamic.partition.mode=nonstrict;
    
    insert overwrite table dwd_notification_log
    PARTITION (dt='2020-02-03')
    select 
       mid_id,
       user_id,
       version_code,
       version_name,
       lang,
       source,
       os,
       area,
       model,
       brand,
       sdk_version,
       gmail,
       height_width,
       app_time,
       network,
       lng,
       lat,
       get_json_object(event_json,'$.kv.action') action,
       get_json_object(event_json,'$.kv.noti_type') noti_type,
       get_json_object(event_json,'$.kv.ap_time') ap_time,
       get_json_object(event_json,'$.kv.content') content,
       server_time
    from dwd_base_event_log
    where dt='2020-02-03' and event_name='notification';
    • 测试
    hive (gmall)> select * from dwd_notification_log limit 2;
  6. 用户前台活跃表

    • 建表语句
    hive (gmall)> 
    drop table if exists dwd_active_foreground_log;
    CREATE EXTERNAL TABLE dwd_active_foreground_log(
       `mid_id` string,
       `user_id` string,
       `version_code` string,
       `version_name` string,
       `lang` string,
       `source` string,
       `os` string,
       `area` string,
       `model` string,
       `brand` string,
       `sdk_version` string,
       `gmail` string,
       `height_width` string,
       `app_time` string,
       `network` string,
       `lng` string,
       `lat` string,
       `push_id` string,
       `access` string,
       `server_time` string)
    PARTITIONED BY (dt string)
    location '/warehouse/gmall/dwd/dwd_foreground_log/';
    • 插入数据
    hive (gmall)> 
    set hive.exec.dynamic.partition.mode=nonstrict;
    
    insert overwrite table dwd_active_foreground_log
    PARTITION (dt='2020-02-03')
    select 
       mid_id,
       user_id,
       version_code,
       version_name,
       lang,
       source,
       os,
       area,
       model,
       brand,
       sdk_version,
       gmail,
       height_width,
       app_time,
       network,
       lng,
       lat,
       get_json_object(event_json,'$.kv.push_id') push_id,
       get_json_object(event_json,'$.kv.access') access,
       server_time
    from dwd_base_event_log
    where dt='2020-02-03' and event_name='active_foreground';
    • 测试
    hive (gmall)> select * from dwd_active_foreground_log limit 2;
  7. 用户后台活跃表

    • 建表语句
    hive (gmall)> 
    drop table if exists dwd_active_background_log;
    CREATE EXTERNAL TABLE dwd_active_background_log(
       `mid_id` string,
       `user_id` string,
       `version_code` string,
       `version_name` string,
       `lang` string,
       `source` string,
       `os` string,
       `area` string,
       `model` string,
       `brand` string,
       `sdk_version` string,
       `gmail` string,
       `height_width` string,
       `app_time` string,
       `network` string,
       `lng` string,
       `lat` string,
       `active_source` string,
       `server_time` string
    )
    PARTITIONED BY (dt string)
    location '/warehouse/gmall/dwd/dwd_background_log/';
    • 插入数据
    hive (gmall)> 
    set hive.exec.dynamic.partition.mode=nonstrict;
    
    insert overwrite table dwd_active_background_log
    PARTITION (dt='2020-02-03')
    select 
       mid_id,
       user_id,
       version_code,
       version_name,
       lang,
       source,
       os,
       area,
       model,
       brand,
       sdk_version,
       gmail,
       height_width,
       app_time,
       network,
       lng,
       lat,
       get_json_object(event_json,'$.kv.active_source') active_source,
       server_time
    from dwd_base_event_log
    where dt='2020-02-03' and event_name='active_background';
    • 测试
    hive (gmall)> select * from dwd_active_background_log limit 2;
  8. 评论表

    • 建表语句
    hive (gmall)> 
    drop table if exists dwd_comment_log;
    CREATE EXTERNAL TABLE dwd_comment_log(
       `mid_id` string,
       `user_id` string,
       `version_code` string,
       `version_name` string,
       `lang` string,
       `source` string,
       `os` string,
       `area` string,
       `model` string,
       `brand` string,
       `sdk_version` string,
       `gmail` string,
       `height_width` string,
       `app_time` string,
       `network` string,
       `lng` string,
       `lat` string,
       `comment_id` int,
       `userid` int,
       `p_comment_id` int, 
       `content` string,
       `addtime` string,
       `other_id` int,
       `praise_count` int,
       `reply_count` int,
       `server_time` string
    )
    PARTITIONED BY (dt string)
    location '/warehouse/gmall/dwd/dwd_comment_log/';
    • 插入数据
    hive (gmall)> 
    set hive.exec.dynamic.partition.mode=nonstrict;
    
    insert overwrite table dwd_comment_log
    PARTITION (dt='2020-02-03')
    select 
       mid_id,
       user_id,
       version_code,
       version_name,
       lang,
       source,
       os,
       area,
       model,
       brand,
       sdk_version,
       gmail,
       height_width,
       app_time,
       network,
       lng,
       lat,
       get_json_object(event_json,'$.kv.comment_id') comment_id,
       get_json_object(event_json,'$.kv.userid') userid,
       get_json_object(event_json,'$.kv.p_comment_id') p_comment_id,
       get_json_object(event_json,'$.kv.content') content,
       get_json_object(event_json,'$.kv.addtime') addtime,
       get_json_object(event_json,'$.kv.other_id') other_id,
       get_json_object(event_json,'$.kv.praise_count') praise_count,
       get_json_object(event_json,'$.kv.reply_count') reply_count,
       server_time
    from dwd_base_event_log
    where dt='2020-02-03' and event_name='comment';
    • 测试
    hive (gmall)> select * from dwd_comment_log limit 2;
  9. 收藏表

    • 建表语句
    hive (gmall)> 
    drop table if exists dwd_favorites_log;
    CREATE EXTERNAL TABLE dwd_favorites_log(
       `mid_id` string,
       `user_id` string, 
       `version_code` string, 
       `version_name` string, 
       `lang` string, 
       `source` string, 
       `os` string, 
       `area` string, 
       `model` string,
       `brand` string, 
       `sdk_version` string, 
       `gmail` string, 
       `height_width` string,  
       `app_time` string,
       `network` string, 
       `lng` string, 
       `lat` string, 
       `id` int, 
       `course_id` int, 
       `userid` int,
       `add_time` string,
       `server_time` string
    )
    PARTITIONED BY (dt string)
    location '/warehouse/gmall/dwd/dwd_favorites_log/';
    • 插入数据
    hive (gmall)> 
    set hive.exec.dynamic.partition.mode=nonstrict;
    
    insert overwrite table dwd_favorites_log
    PARTITION (dt='2020-02-03')
       select 
       mid_id,
       user_id,
       version_code,
       version_name,
       lang,
       source,
       os,
       area,
       model,
       brand,
       sdk_version,
       gmail,
       height_width,
       app_time,
       network,
       lng,
       lat,
       get_json_object(event_json,'$.kv.id') id,
       get_json_object(event_json,'$.kv.course_id') course_id,
       get_json_object(event_json,'$.kv.userid') userid,
       get_json_object(event_json,'$.kv.add_time') add_time,
       server_time
    from dwd_base_event_log 
    where dt='2020-02-03' and event_name='favorites';
    • 测试
    hive (gmall)> select * from dwd_favorites_log limit 2;
  10. 点赞表

    • 建表语句
    hive (gmall)> 
    drop table if exists dwd_praise_log;
    CREATE EXTERNAL TABLE dwd_praise_log(
       `mid_id` string,
       `user_id` string, 
       `version_code` string, 
       `version_name` string, 
       `lang` string, 
       `source` string, 
       `os` string, 
       `area` string, 
       `model` string,
       `brand` string, 
       `sdk_version` string, 
       `gmail` string, 
       `height_width` string,  
       `app_time` string,
       `network` string, 
       `lng` string, 
       `lat` string, 
       `id` string, 
       `userid` string, 
       `target_id` string,
       `type` string,
       `add_time` string,
       `server_time` string
    )
    PARTITIONED BY (dt string)
    location '/warehouse/gmall/dwd/dwd_praise_log/';
    • 插入数据
    hive (gmall)> 
    set hive.exec.dynamic.partition.mode=nonstrict;
    
    insert overwrite table dwd_praise_log
    PARTITION (dt='2020-02-03')
    select 
       mid_id,
       user_id,
       version_code,
       version_name,
       lang,
       source,
       os,
       area,
       model,
       brand,
       sdk_version,
       gmail,
       height_width,
       app_time,
       network,
       lng,
       lat,
       get_json_object(event_json,'$.kv.id') id,
       get_json_object(event_json,'$.kv.userid') userid,
       get_json_object(event_json,'$.kv.target_id') target_id,
       get_json_object(event_json,'$.kv.type') type,
       get_json_object(event_json,'$.kv.add_time') add_time,
       server_time
    from dwd_base_event_log
    where dt='2020-02-03' and event_name='praise';
    • 测试
    hive (gmall)> select * from dwd_praise_log limit 2;
  11. 错误日志表

    • 建表语句
    hive (gmall)> 
    drop table if exists dwd_error_log;
    CREATE EXTERNAL TABLE dwd_error_log(
       `mid_id` string,
       `user_id` string, 
       `version_code` string, 
       `version_name` string, 
       `lang` string, 
       `source` string, 
       `os` string, 
       `area` string, 
       `model` string,
       `brand` string, 
       `sdk_version` string, 
       `gmail` string, 
       `height_width` string,  
       `app_time` string,
       `network` string, 
       `lng` string, 
       `lat` string, 
       `errorBrief` string, 
       `errorDetail` string, 
       `server_time` string)
    PARTITIONED BY (dt string)
    location '/warehouse/gmall/dwd/dwd_error_log/';
    • 插入数据
    hive (gmall)> 
    set hive.exec.dynamic.partition.mode=nonstrict;
    
    insert overwrite table dwd_error_log
    PARTITION (dt='2020-02-03')
    select 
       mid_id,
       user_id,
       version_code,
       version_name,
       lang,
       source,
       os,
       area,
       model,
       brand,
       sdk_version,
       gmail,
       height_width,
       app_time,
       network,
       lng,
       lat,
       get_json_object(event_json,'$.kv.errorBrief') errorBrief,
       get_json_object(event_json,'$.kv.errorDetail') errorDetail,
       server_time
    from dwd_base_event_log 
    where dt='2020-02-03' and event_name='error';
    • 测试
    hive (gmall)> select * from dwd_error_log limit 2;
  12. 将自己所建数据全部导入到数仓中

    [hadoop@hadoop151 bin]$ dwd_event_log.sh 2020-01-01 2020-01-31

    测试

    hive (gmall)> select * from dwd_comment_log where dt='2020-01-11' limit 2;

你可能感兴趣的:(数据仓库,hive)