1、缘由
本文主要涉及的是建表,不是计算指标。
2、mysql表
假设mysql中存在下面的八张表:
3、ods
ods层和源数据最好保持一致:
//创建用户表 drop table if exists ods_user_info; create table ods_user_info( `id` string COMMENT '用户id', `name` string COMMENT '姓名', `birthday` string COMMENT '生日' , `gender` string COMMENT '性别', `email` string COMMENT '邮箱', `user_level` string COMMENT '用户等级', `create_time` string COMMENT '创建时间' ) COMMENT '用户信息' PARTITIONED BY ( `dt` string) row format delimited fields terminated by '\t' location '/warehouse/gmall/ods/ods_user_info/' tblproperties ("parquet.compression"="snappy") //创建订单表 drop table if exists ods_order_info; create table ods_order_info ( `id` string COMMENT '订单编号', `total_amount` decimal(10,2) COMMENT '订单金额', `order_status` string COMMENT '订单状态', `user_id` string COMMENT '用户id' , `payment_way` string COMMENT '支付方式', `out_trade_no` string COMMENT '支付流水号', `create_time` string COMMENT '创建时间', `operate_time` string COMMENT '操作时间' ) COMMENT '订单表' PARTITIONED BY ( `dt` string) row format delimited fields terminated by '\t' location '/warehouse/gmall/ods/ods_order_info/' tblproperties ("parquet.compression"="snappy") ; //创建订单详情表 drop table if exists ods_order_detail; create table ods_order_detail( `id` string COMMENT '订单编号', `order_id` string COMMENT '订单号', `user_id` string COMMENT '用户id' , `sku_id` string COMMENT '商品id', `sku_name` string COMMENT '商品名称', `order_price` string COMMENT '下单价格', `sku_num` string COMMENT '商品数量', `create_time` string COMMENT '创建时间' ) COMMENT '订单明细表' PARTITIONED BY ( `dt` string) row format delimited fields terminated by '\t' location '/warehouse/gmall/ods/ods_order_detail/' tblproperties ("parquet.compression"="snappy") //创建支付流水表 drop table if exists `ods_payment_info`; create table `ods_payment_info`( `id` bigint COMMENT '编号', `out_trade_no` string COMMENT '对外业务编号', `order_id` string COMMENT '订单编号', `user_id` string COMMENT '用户编号', `alipay_trade_no` string COMMENT '支付宝交易流水编号', `total_amount` decimal(16,2) COMMENT '支付金额', `subject` string COMMENT '交易内容', `payment_type` string COMMENT '支付类型', `payment_time` string COMMENT '支付时间' ) COMMENT '支付流水表' PARTITIONED BY ( `dt` string) row format delimited fields terminated by '\t' location '/warehouse/gmall/ods/ods_payment_info/' tblproperties ("parquet.compression"="snappy") ; //创建商品一级、二级、三级分类表 drop table if exists ods_base_category1; create table ods_base_category1( `id` string COMMENT 'id', `name` string COMMENT '名称' ) COMMENT '商品一级分类' PARTITIONED BY ( `dt` string) row format delimited fields terminated by '\t' location '/warehouse/gmall/ods/ods_base_category1/' tblproperties ("parquet.compression"="snappy") ; drop table if exists ods_base_category2; create external table ods_base_category2( `id` string COMMENT ' id', `name` string COMMENT '名称', category1_id string COMMENT '一级品类id' ) COMMENT '商品二级分类' PARTITIONED BY ( `dt` string) row format delimited fields terminated by '\t' location '/warehouse/gmall/ods/ods_base_category2/' tblproperties ("parquet.compression"="snappy") ; drop table if exists ods_base_category3; create table ods_base_category3( `id` string COMMENT ' id', `name` string COMMENT '名称', category2_id string COMMENT '二级品类id' ) COMMENT '商品三级分类' PARTITIONED BY ( `dt` string) row format delimited fields terminated by '\t' location '/warehouse/gmall/ods/ods_base_category3/' tblproperties ("parquet.compression"="snappy") ;
4、dwd
①将文件的存储格式改为parquet
②对三级商品进行降维
注:从下面的建表sql中可以看出,只有商品表的创建有变化
③当然在导数据的过程中,也可以进行ETL(去空,去脏数据等)
//创建用户表 drop table if exists dwd_user_info; create external table dwd_user_info( ..... ) COMMENT '' PARTITIONED BY ( `dt` string) stored as parquet location '/warehouse/gmall/dwd/dwd_user_info/' tblproperties ("parquet.compression"="snappy") //创建订单表 drop table if exists dwd_order_info; create external table dwd_order_info ( ..... ) COMMENT '' PARTITIONED BY ( `dt` string) stored as parquet location '/warehouse/gmall/dwd/dwd_order_info/' tblproperties ("parquet.compression"="snappy") ; //创建订单详情表 drop table if exists dwd_order_detail; create external table dwd_order_detail( ..... ) COMMENT '' PARTITIONED BY ( `dt` string) stored as parquet location '/warehouse/gmall/dwd/dwd_order_detail/' tblproperties ("parquet.compression"="snappy") ;
//创建支付流水表 drop table if exists `dwd_payment_info`; create external table `dwd_payment_info`( ..... ) COMMENT '' PARTITIONED BY ( `dt` string) stored as parquet location '/warehouse/gmall/dwd/dwd_payment_info/' tblproperties ("parquet.compression"="snappy") //创建商品表 drop table if exists dwd_sku_info; create external table dwd_sku_info( `id` string COMMENT 'skuId', `spu_id` string COMMENT 'spuid', `price` decimal(10,2) COMMENT '' , `sku_name` string COMMENT '', `sku_desc` string COMMENT '', `weight` string COMMENT '', `tm_id` string COMMENT 'id', `category3_id` string COMMENT '1id', `category2_id` string COMMENT '2id', `category1_id` string COMMENT '3id', `category3_name` string COMMENT '3', `category2_name` string COMMENT '2', `category1_name` string COMMENT '1', `create_time` string COMMENT '' ) COMMENT '' PARTITIONED BY ( `dt` string) stored as parquet location '/warehouse/gmall/dwd/dwd_sku_info/' tblproperties ("parquet.compression"="snappy")
5、DWS
以每日为粒度,对用户这个实体进行轻度汇总,建立每日用户行为宽表
drop table if exists dws_user_action;
create external table dws_user_action (
user_id string comment '用户 id',
order_count bigint comment '下单次数 ',
order_amount decimal(16,2) comment '下单金额 ',
payment_count bigint comment '支付次数',
payment_amount decimal(16,2) comment '支付金额 ',
comment_count bigint comment '评论次数'
) COMMENT '每日用户行为宽表'
PARTITIONED BY ( `dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_user_action/' tblproperties ("parquet.compression"="snappy");
6、ads层
①求GVM
定义:一定时间段内的成交总额。即用户下单金额:包括未付款、已付款。因为只要下单了,就说明已经有足够的吸引力了。
需求:单日GMV,单日订单数量、单日收入等
思路:从需求可以看出:因为每天都要统计,所以在dm层(ads层)还需要一张表:
create table ads_gmv_sum_day( 'dt' string comment '统计日期', 'gmv_count' long comment '当日gmv个数', 'gmv_amount' decimal(16,2) comment '当日gmv订单总金额', 'gmv_payment' decimal(16,2) comment '当日支付金额' )comment '用户行为宽表' stored as parquet //存储格式 location '/warehouse/online_trade/ads/ads_gmv_sum_day' tblproperties("parquet.compression"="snappy") //压缩算法
②求转化率
新访问用户转化率 = 单日新访问设备数/日活数
新注册用户转化率 = 单日新注册用户数/日活数
新付费用户转化率 = 单日新付费用户数/日活数
付费用户转化率 = 单日消费用户数/单日日活数量
思路:需要在ads中建立一张表:
drop table if exists ads_user_convert_day; create table ads_user_convert_day( 'dt' string comment '统计日期', 'uv_count' bigint comment '当日日活', 'new_visitor_count' bigint comment '当日新增访问设备数量', 'new_regi_u_count' bigint comment '当日新增用户注册数量', //从user_info中的create_date ='2019-02-21' 'new_consume_u_count' bigint comment '当日新增用户消费数量' 'new_consume_count' bigint comment '当日消费数量' )comment '每日活跃用户转化情况' partitioned by('dt' string) stored as parquet //存储格式 location '/warehouse/online_trade/ads/ads_user_convert_day' tblproperties("parquet.compression"="snappy") //压缩算法
③用户行为漏斗分析
create table ads_event_convert_day( 'dt' string comment '统计日期', 'total_visitor_mcount' bigint comment '点击商品人数', 'visit2Item_convert_ratio' decimal(10,2) comment '首页到点击商品转化率', 'addcart_u_count' bigint comment '添加购物车人数', ........ comment '点击商品到添加购物车转化率', ........ comment '下单人数', ........ comment '购物车到下单转化率', ........ comment '支付人数', ........ comment '下单到支付的转化率' )comment '每日活跃用户转化情况' partitioned by('dt' string) stored as parquet //存储格式 location '/warehouse/online_trade/ads/ads_event_convert_day' tblproperties("parquet.compression"="snappy") //压缩算法