14. Data Warehouse Layering: the DWD Layer (1)

DWD layer: cleans the data coming from the ODS layer (removes null values, dirty data, and records outside reasonable limits; converts row-oriented storage to columnar storage and changes the compression format).

Parsing Start Log Data into the DWD Layer


  1. Create the start log table dwd_start_log

    hive (gmall)> drop table if exists dwd_start_log;
    CREATE EXTERNAL TABLE dwd_start_log(
    `mid_id` string,
    `user_id` string, 
    `version_code` string, 
    `version_name` string, 
    `lang` string, 
    `source` string, 
    `os` string, 
    `area` string, 
    `model` string,
    `brand` string, 
    `sdk_version` string, 
    `gmail` string, 
    `height_width` string,  
    `app_time` string,
    `network` string, 
    `lng` string, 
    `lat` string, 
    `entry` string, 
    `open_ad_type` string, 
    `action` string, 
    `loading_time` string, 
    `detail` string, 
    `extend1` string
    )
    PARTITIONED BY (dt string)
    stored as parquet
    location '/warehouse/gmall/dwd/dwd_start_log/';
  2. Insert data into the start log table.

    insert overwrite table dwd_start_log
    PARTITION (dt='2020-02-03')
    select 
       get_json_object(line,'$.mid') mid_id,
       get_json_object(line,'$.uid') user_id,
       get_json_object(line,'$.vc') version_code,
       get_json_object(line,'$.vn') version_name,
       get_json_object(line,'$.l') lang,
       get_json_object(line,'$.sr') source,
       get_json_object(line,'$.os') os,
       get_json_object(line,'$.ar') area,
       get_json_object(line,'$.md') model,
       get_json_object(line,'$.ba') brand,
       get_json_object(line,'$.sv') sdk_version,
       get_json_object(line,'$.g') gmail,
       get_json_object(line,'$.hw') height_width,
       get_json_object(line,'$.t') app_time,
       get_json_object(line,'$.nw') network,
       get_json_object(line,'$.ln') lng,
       get_json_object(line,'$.la') lat,
       get_json_object(line,'$.entry') entry,
       get_json_object(line,'$.open_ad_type') open_ad_type,
       get_json_object(line,'$.action') action,
       get_json_object(line,'$.loading_time') loading_time,
       get_json_object(line,'$.detail') detail,
       get_json_object(line,'$.extend1') extend1
    from ods_start_log 
    where dt='2020-02-03';
  3. Test

    hive (gmall)> select * from dwd_start_log limit 2;
  4. Use a script to load all of the data into the data warehouse (the full script is kept in the notes; a rough sketch is shown below).
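
    A minimal sketch of what such a loader script might look like (the script name dwd_start_log.sh, the path to the hive binary, and the default-to-yesterday behaviour are assumptions; the actual script is the one kept in the notes). It takes a date as its only argument and runs the insert from step 2 for that partition:

    #!/bin/bash
    # Hypothetical sketch: load one day of ODS start logs into dwd_start_log.
    APP=gmall
    hive=/opt/module/hive/bin/hive

    # Use the date passed as $1; otherwise default to yesterday.
    if [ -n "$1" ]; then
        do_date=$1
    else
        do_date=$(date -d "-1 day" +%F)
    fi

    sql="
    insert overwrite table ${APP}.dwd_start_log
    PARTITION (dt='$do_date')
    select
       get_json_object(line,'$.mid') mid_id,
       get_json_object(line,'$.uid') user_id,
       get_json_object(line,'$.vc') version_code,
       get_json_object(line,'$.vn') version_name,
       get_json_object(line,'$.l') lang,
       get_json_object(line,'$.sr') source,
       get_json_object(line,'$.os') os,
       get_json_object(line,'$.ar') area,
       get_json_object(line,'$.md') model,
       get_json_object(line,'$.ba') brand,
       get_json_object(line,'$.sv') sdk_version,
       get_json_object(line,'$.g') gmail,
       get_json_object(line,'$.hw') height_width,
       get_json_object(line,'$.t') app_time,
       get_json_object(line,'$.nw') network,
       get_json_object(line,'$.ln') lng,
       get_json_object(line,'$.la') lat,
       get_json_object(line,'$.entry') entry,
       get_json_object(line,'$.open_ad_type') open_ad_type,
       get_json_object(line,'$.action') action,
       get_json_object(line,'$.loading_time') loading_time,
       get_json_object(line,'$.detail') detail,
       get_json_object(line,'$.extend1') extend1
    from ${APP}.ods_start_log
    where dt='$do_date';
    "

    $hive -e "$sql"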

Parsing Event Log Data into the DWD Layer


  1. Create the base event log detail table dwd_base_event_log

    hive (gmall)> drop table if exists dwd_base_event_log;
    CREATE EXTERNAL TABLE dwd_base_event_log(
    `mid_id` string,
    `user_id` string, 
    `version_code` string, 
    `version_name` string, 
    `lang` string, 
    `source` string, 
    `os` string, 
    `area` string, 
    `model` string,
    `brand` string, 
    `sdk_version` string, 
    `gmail` string, 
    `height_width` string, 
    `app_time` string, 
    `network` string, 
    `lng` string, 
    `lat` string, 
    `event_name` string, 
    `event_json` string, 
    `server_time` string)
    PARTITIONED BY (`dt` string)
    stored as parquet
    location '/warehouse/gmall/dwd/dwd_base_event_log/';
  2. Create a UDF to parse the common fields.

    1. Create a Maven project: hivefunction
    2. Add the following to the pom.xml file:

      
      <properties>
          <project.build.sourceEncoding>UTF8</project.build.sourceEncoding>
          <hive.version>1.2.1</hive.version>
      </properties>

      <dependencies>
          <!-- Hive dependency -->
          <dependency>
              <groupId>org.apache.hive</groupId>
              <artifactId>hive-exec</artifactId>
              <version>${hive.version}</version>
          </dependency>
      </dependencies>

      <build>
          <plugins>
              <plugin>
                  <artifactId>maven-compiler-plugin</artifactId>
                  <version>2.3.2</version>
                  <configuration>
                      <source>1.8</source>
                      <target>1.8</target>
                  </configuration>
              </plugin>
              <plugin>
                  <artifactId>maven-assembly-plugin</artifactId>
                  <configuration>
                      <descriptorRefs>
                          <descriptorRef>jar-with-dependencies</descriptorRef>
                      </descriptorRefs>
                  </configuration>
                  <executions>
                      <execution>
                          <id>make-assembly</id>
                          <phase>package</phase>
                          <goals>
                              <goal>single</goal>
                          </goals>
                      </execution>
                  </executions>
              </plugin>
          </plugins>
      </build>
    3. The UDF code is as follows:

      package com.bbxy.udf;

      import org.apache.commons.lang.StringUtils;
      import org.apache.hadoop.hive.ql.exec.UDF;
      import org.json.JSONException;
      import org.json.JSONObject;

      public class BaseFieldUDF extends UDF {

          public String evaluate(String line, String jsonkeysString) {

              // 0. Prepare a StringBuilder for the tab-separated output
              StringBuilder sb = new StringBuilder();

              // 1. Split the requested json keys: mid uid vc vn l sr os ar md ...
              String[] jsonkeys = jsonkeysString.split(",");

              // 2. Split the line: server_time | json
              String[] logContents = line.split("\\|");

              // 3. Validity check
              if (logContents.length != 2 || StringUtils.isBlank(logContents[1])) {
                  return "";
              }

              // 4. Parse the json part
              try {
                  JSONObject jsonObject = new JSONObject(logContents[1]);

                  // Get the "cm" (common fields) object
                  JSONObject base = jsonObject.getJSONObject("cm");

                  // Loop over the keys and append each value
                  for (int i = 0; i < jsonkeys.length; i++) {
                      String fieldName = jsonkeys[i].trim();

                      if (base.has(fieldName)) {
                          sb.append(base.getString(fieldName)).append("\t");
                      } else {
                          sb.append("\t");
                      }
                  }

                  sb.append(jsonObject.getString("et")).append("\t");
                  sb.append(logContents[0]).append("\t");
              } catch (JSONException e) {
                  e.printStackTrace();
              }

              return sb.toString();
          }
      }
  3. Define a UDTF to parse the individual event fields. The UDTF code is as follows:

    package com.bbxy.udtf;

    import org.apache.commons.lang.StringUtils;
    import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
    import org.apache.hadoop.hive.ql.metadata.HiveException;
    import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
    import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
    import org.json.JSONArray;
    import org.json.JSONException;

    import java.util.ArrayList;

    public class EventJsonUDTF extends GenericUDTF {
        // In this method we declare the names and types of the output columns:
        @Override
        public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {

            ArrayList<String> fieldNames = new ArrayList<String>();
            ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();

            fieldNames.add("event_name");
            fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
            fieldNames.add("event_json");
            fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);

            return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
        }

        // One input record can produce several output rows
        @Override
        public void process(Object[] objects) throws HiveException {
            // Get the incoming "et" (event array) string
            String input = objects[0].toString();

            // If the input is blank, drop the record by returning immediately
            if (StringUtils.isBlank(input)) {
                return;
            } else {
                try {
                    // Parse the array of events (ad/favorites/...)
                    JSONArray ja = new JSONArray(input);

                    if (ja == null)
                        return;

                    // Iterate over every event
                    for (int i = 0; i < ja.length(); i++) {
                        String[] result = new String[2];

                        try {
                            // Take the event name (ad/favorites/...)
                            result[0] = ja.getJSONObject(i).getString("en");

                            // Take the whole event JSON
                            result[1] = ja.getString(i);
                        } catch (JSONException e) {
                            continue;
                        }

                        // Emit one output row
                        forward(result);
                    }
                } catch (JSONException e) {
                    e.printStackTrace();
                }
            }
        }

        // Called when there are no more records to process; used for cleanup or extra output
        @Override
        public void close() throws HiveException {

        }
    }
  4. Package the project and put the built jar into the "/opt/module/hive" directory on the "hadoop151" VM, for example as sketched below.
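
    A possible way to build and copy the jar from the development machine (the hadoop user and SSH access to hadoop151 are assumptions):

    # Build the project from its root directory; maven-assembly-plugin additionally produces
    # hivefunction-1.0-SNAPSHOT-jar-with-dependencies.jar, but the plain jar is the one
    # registered in the next step.
    mvn clean package

    # Copy the jar to hadoop151 (hypothetical user/host setup).
    scp target/hivefunction-1.0-SNAPSHOT.jar hadoop@hadoop151:/opt/module/hive/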
  5. Add the jar to Hive's classpath

    hive (gmall)> add jar /opt/module/hive/hivefunction-1.0-SNAPSHOT.jar;
  6. Create temporary functions associated with the developed Java classes.

    hive (gmall)> 
    create temporary function base_analizer as 'com.bbxy.udf.BaseFieldUDF';
    create temporary function flat_analizer as 'com.bbxy.udtf.EventJsonUDTF';
  7. Insert data into the base event log detail table dwd_base_event_log.

    hive (gmall)> set hive.exec.dynamic.partition.mode=nonstrict;
    
    insert overwrite table dwd_base_event_log 
    PARTITION (dt='2020-02-03')
    select
        mid_id,
        user_id,
        version_code,
        version_name,
        lang,
        source,
        os,
        area,
        model,
        brand,
        sdk_version,
        gmail,
        height_width,
        app_time,
        network,
        lng,
        lat,
        event_name,
        event_json,
        server_time
    from
    (
        select
           split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[0]   as mid_id,
           split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[1]   as user_id,
           split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[2]   as version_code,
           split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[3]   as version_name,
           split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[4]   as lang,
           split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[5]   as source,
           split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[6]   as os,
           split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[7]   as area,
           split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[8]   as model,
           split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[9]   as brand,
           split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[10]   as sdk_version,
           split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[11]  as gmail,
           split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[12]  as height_width,
           split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[13]  as app_time,
           split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[14]  as network,
           split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[15]  as lng,
           split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[16]  as lat,
           split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[17]  as ops,
           split(base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[18]  as server_time
        from ods_event_log where dt='2020-02-03'  and  base_analizer(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la')<>'' 
    ) sdk_log lateral view flat_analizer(ops) tmp_k as event_name, event_json;
  8. Test

    hive (gmall)> select * from dwd_base_event_log limit 2;
  9. Load the remaining data into the data warehouse with the script (a rough sketch of the script follows the command below).

    [hadoop@hadoop151 bin]$ dwd_base_log.sh 2020-01-01 2020-01-31
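
    A rough sketch of what dwd_base_log.sh might look like, given that it is called with a start and an end date (the hive binary path, the GNU date arithmetic, and the overall loop structure are assumptions; the SQL body is the statement from step 7 with the literal date parameterised):

    #!/bin/bash
    # Hypothetical sketch of dwd_base_log.sh: load every day from $1 to $2 (inclusive)
    # into dwd_base_event_log.
    APP=gmall
    hive=/opt/module/hive/bin/hive

    start_date=$1
    end_date=$2
    keys="mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la"

    do_date=$start_date
    while [ "$(date -d "$do_date" +%s)" -le "$(date -d "$end_date" +%s)" ]; do
        echo "===== loading $do_date ====="
        sql="
        add jar /opt/module/hive/hivefunction-1.0-SNAPSHOT.jar;
        create temporary function base_analizer as 'com.bbxy.udf.BaseFieldUDF';
        create temporary function flat_analizer as 'com.bbxy.udtf.EventJsonUDTF';
        set hive.exec.dynamic.partition.mode=nonstrict;

        insert overwrite table ${APP}.dwd_base_event_log
        PARTITION (dt='$do_date')
        select mid_id, user_id, version_code, version_name, lang, source, os, area,
               model, brand, sdk_version, gmail, height_width, app_time, network,
               lng, lat, event_name, event_json, server_time
        from
        (
            select
                split(base_analizer(line,'$keys'),'\t')[0]  as mid_id,
                split(base_analizer(line,'$keys'),'\t')[1]  as user_id,
                split(base_analizer(line,'$keys'),'\t')[2]  as version_code,
                split(base_analizer(line,'$keys'),'\t')[3]  as version_name,
                split(base_analizer(line,'$keys'),'\t')[4]  as lang,
                split(base_analizer(line,'$keys'),'\t')[5]  as source,
                split(base_analizer(line,'$keys'),'\t')[6]  as os,
                split(base_analizer(line,'$keys'),'\t')[7]  as area,
                split(base_analizer(line,'$keys'),'\t')[8]  as model,
                split(base_analizer(line,'$keys'),'\t')[9]  as brand,
                split(base_analizer(line,'$keys'),'\t')[10] as sdk_version,
                split(base_analizer(line,'$keys'),'\t')[11] as gmail,
                split(base_analizer(line,'$keys'),'\t')[12] as height_width,
                split(base_analizer(line,'$keys'),'\t')[13] as app_time,
                split(base_analizer(line,'$keys'),'\t')[14] as network,
                split(base_analizer(line,'$keys'),'\t')[15] as lng,
                split(base_analizer(line,'$keys'),'\t')[16] as lat,
                split(base_analizer(line,'$keys'),'\t')[17] as ops,
                split(base_analizer(line,'$keys'),'\t')[18] as server_time
            from ${APP}.ods_event_log
            where dt='$do_date' and base_analizer(line,'$keys')<>''
        ) sdk_log lateral view flat_analizer(ops) tmp_k as event_name, event_json;
        "
        $hive -e "$sql"

        # Move on to the next day (GNU date).
        do_date=$(date -d "$do_date +1 day" +%F)
    done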
