use source;

-- Reshape the sales order transaction table: rename the order date and
-- quantity columns and add an order status column right after status_date.
alter table sales_order
    change order_date status_date datetime,
    add order_status varchar(1) after status_date,
    change order_quantity quantity int;

-- Drop the primary key of sales_order. order_number is first redefined as a
-- plain "int not null" column.
-- NOTE(review): presumably this strips auto_increment so that the key can be
-- dropped; confirm against the original table definition.
alter table sales_order
    change order_number order_number int not null;
alter table sales_order
    drop primary key;

-- Create the new primary key as the first column of the table.
alter table sales_order
    add id int unsigned not null auto_increment primary key comment '主键' first;
说明:
set search_path = ext;

-- Re-create the sales order external table with the new column layout.
-- "if exists" keeps the script re-runnable when the table is not there yet.
drop external table if exists sales_order;

create external table sales_order (
    id                    bigint,
    order_number          int,
    customer_number       int,
    product_code          int,
    verification_ind      char(1),
    credit_check_flag     char(1),
    new_customer_ind      char(1),
    web_order_flag        char(1),
    status_date           timestamp,
    order_status          char(1),
    request_delivery_date timestamp,
    entry_date            timestamp,
    order_amount          decimal(10, 2),
    quantity              int
)
location ('pxf://mycluster/data/ext/sales_order?profile=hdfstextsimple')
format 'text' (delimiter=e',', null='null');

comment on table sales_order is '销售订单外部表';
comment on column sales_order.id is '业务主键';
comment on column sales_order.order_number is '订单号';
comment on column sales_order.customer_number is '客户编号';
comment on column sales_order.product_code is '产品编码';
comment on column sales_order.verification_ind is '审核标志';
comment on column sales_order.credit_check_flag is '信用检查标志';
comment on column sales_order.new_customer_ind is '客户首个订单标志';
comment on column sales_order.web_order_flag is '线上订单标志';
comment on column sales_order.status_date is '状态日期';
comment on column sales_order.order_status is '订单状态';
comment on column sales_order.request_delivery_date is '请求交付日期';
comment on column sales_order.entry_date is '登记日期';
comment on column sales_order.order_amount is '销售金额';
comment on column sales_order.quantity is '数量';
set search_path = rds;

-- Rename the order date / quantity columns and add the order status column.
alter table sales_order rename column order_date to status_date;
alter table sales_order rename column order_quantity to quantity;
alter table sales_order add column order_status char(1) default null;

comment on column sales_order.status_date is '状态日期';
comment on column sales_order.quantity is '数量';
comment on column sales_order.order_status is '订单状态';
说明:
set search_path = tds;

-- Rename the date key / quantity columns of the fact table and add the
-- order status column.
alter table sales_order_fact rename column order_date_sk to status_date_sk;
alter table sales_order_fact rename column order_quantity to quantity;
alter table sales_order_fact add column order_status char(1) default null;

comment on column sales_order_fact.status_date_sk is '状态日期外键';
comment on column sales_order_fact.quantity is '数量';
comment on column sales_order_fact.order_status is '订单状态';

-- Pivot view over the fact table: one row per grouped order, with the status
-- date key (xd) and quantity (xq) of each of the statuses N, A, P, S and R.
-- A status that has no row for the order yields null in its two columns.
create view v_sales_order_fact as
select
    order_number,
    customer_sk,
    product_sk,
    year_month,
    order_amount,
    request_delivery_date_sk,
    sales_order_attribute_sk,
    customer_zip_code_sk,
    shipping_zip_code_sk,
    max(case order_status when 'N' then status_date_sk else null end) as nd,
    max(case order_status when 'N' then quantity else null end) as nq,
    max(case order_status when 'A' then status_date_sk else null end) as ad,
    max(case order_status when 'A' then quantity else null end) as aq,
    max(case order_status when 'P' then status_date_sk else null end) as pd,
    max(case order_status when 'P' then quantity else null end) as pq,
    max(case order_status when 'S' then status_date_sk else null end) as sd,
    max(case order_status when 'S' then quantity else null end) as sq,
    max(case order_status when 'R' then status_date_sk else null end) as rd,
    max(case order_status when 'R' then quantity else null end) as rq
from sales_order_fact
group by
    order_number,
    customer_sk,
    product_sk,
    year_month,
    order_amount,
    request_delivery_date_sk,
    sales_order_attribute_sk,
    customer_zip_code_sk,
    shipping_zip_code_sk;

-- Create the four date dimension views (allocate, packing, ship, receive).
-- Each view exposes date_dim under role-specific column names.
-- NOTE(review): "select *" binds the aliases to date_dim's physical column
-- order; confirm that order matches the alias lists below.
create view v_allocate_date_dim
    (allocate_date_sk, allocate_date, month, month_name, quarter, year)
as
select * from date_dim;

create view v_packing_date_dim
    (packing_date_sk, packing_date, month, month_name, quarter, year)
as
select * from date_dim;

create view v_ship_date_dim
    (ship_date_sk, ship_date, month, month_name, quarter, year)
as
select * from date_dim;

create view v_receive_date_dim
    (receive_date_sk, receive_date, month, month_name, quarter, year)
as
select * from date_dim;
说明:
# Re-create the saved Sqoop incremental import job, carrying over the last
# imported value of the check column from the existing job definition.
last_value=$(sqoop job --show myjob_incremental_import | grep incremental.last.value | awk '{print $3}')

# Only drop the existing job once its saved last-value has been read;
# otherwise the import position would be lost and the new job would be
# created with an empty --last-value.
if [ -n "$last_value" ]; then
    sqoop job --delete myjob_incremental_import
    sqoop job --create myjob_incremental_import \
        -- import \
        --connect "jdbc:mysql://172.16.1.127:3306/source?usessl=false&user=dwtest&password=123456" \
        --table sales_order \
        --target-dir /data/ext/sales_order \
        --compress \
        --where "entry_date < current_date()" \
        --incremental append \
        --check-column id \
        --last-value "$last_value"
else
    echo "cannot read incremental.last.value of myjob_incremental_import; job left unchanged" >&2
fi
-- Regular (periodic) load: copies the external tables into the rds schema,
-- appends new versions to the customer and product dimensions, loads the
-- sales order fact rows entered inside the CDC window, rebuilds the PA
-- customer dimension and finally advances the CDC timestamps.
-- Takes no parameters and returns nothing.
-- Unqualified table and view names are resolved through the caller's
-- search_path.
create or replace function fn_regular_load ()
returns void as
$$
declare
    -- Effective dates used for the slowly changing dimension (SCD) rows
    v_cur_date date := current_date;
    v_pre_date date := current_date - 1;
    v_last_load date;
begin
    -- Analyze the external tables
    analyze ext.customer;
    analyze ext.product;
    analyze ext.sales_order;

    -- Load the external-table data into the raw data tables.
    -- customer and product are fully reloaded; sales_order is appended.
    truncate table rds.customer;
    truncate table rds.product;

    insert into rds.customer
    select * from ext.customer;

    insert into rds.product
    select * from ext.product;

    insert into rds.sales_order
    select order_number,
           customer_number,
           product_code,
           status_date,
           entry_date,
           order_amount,
           quantity,
           request_delivery_date,
           verification_ind,
           credit_check_flag,
           new_customer_ind,
           web_order_flag,
           order_status
      from ext.sales_order;

    -- Analyze the tables of the rds schema
    analyze rds.customer;
    analyze rds.product;
    analyze rds.sales_order;

    -- Set the upper bound of the CDC window: keep last_load, set the second
    -- column to today
    select last_load into v_last_load from rds.cdc_time;
    truncate table rds.cdc_time;
    insert into rds.cdc_time select v_last_load, v_cur_date;

    -- Load the customer dimension.
    -- flag: 'I' = new in rds, 'D' = missing from rds, 'U' = attributes changed.
    -- NOTE(review): the "order by ... limit 999999999999" presumably forces a
    -- stable insertion order; confirm before removing it.
    insert into tds.customer_dim
        (customer_number, customer_name, customer_street_address,
         shipping_address, isdelete, version, effective_date)
    select case flag when 'D' then a_customer_number else b_customer_number end customer_number,
           case flag when 'D' then a_customer_name else b_customer_name end customer_name,
           case flag when 'D' then a_customer_street_address else b_customer_street_address end customer_street_address,
           case flag when 'D' then a_shipping_address else b_shipping_address end shipping_address,
           case flag when 'D' then true else false end isdelete,
           case flag when 'D' then a_version when 'I' then 1 else a_version + 1 end v,
           v_pre_date
      from (select a.customer_number a_customer_number,
                   a.customer_name a_customer_name,
                   a.customer_street_address a_customer_street_address,
                   a.shipping_address a_shipping_address,
                   a.version a_version,
                   b.customer_number b_customer_number,
                   b.customer_name b_customer_name,
                   b.customer_street_address b_customer_street_address,
                   b.shipping_address b_shipping_address,
                   case when a.customer_number is null then 'I'
                        when b.customer_number is null then 'D'
                        else 'U'
                    end flag
              from v_customer_dim_latest a
              full join rds.customer b
                on a.customer_number = b.customer_number
             where a.customer_number is null -- inserted
                or b.customer_number is null -- deleted
                or (a.customer_number = b.customer_number
                    and not (coalesce(a.customer_name,'') = coalesce(b.customer_name,'')
                         and coalesce(a.customer_street_address,'') = coalesce(b.customer_street_address,'')
                         and coalesce(a.shipping_address,'') = coalesce(b.shipping_address,'')))) t
     order by coalesce(a_customer_number, 999999999999), b_customer_number
     limit 999999999999;

    -- Load the product dimension (same I/D/U scheme as the customer dimension).
    -- The attribute comparison is wrapped in coalesce, like the customer
    -- comparison above, so that a value changing to or from NULL is detected
    -- as an update instead of being dropped by three-valued logic.
    insert into tds.product_dim
        (product_code, product_name, product_category,
         isdelete, version, effective_date)
    select case flag when 'D' then a_product_code else b_product_code end product_code,
           case flag when 'D' then a_product_name else b_product_name end product_name,
           case flag when 'D' then a_product_category else b_product_category end product_category,
           case flag when 'D' then true else false end isdelete,
           case flag when 'D' then a_version when 'I' then 1 else a_version + 1 end v,
           v_pre_date
      from (select a.product_code a_product_code,
                   a.product_name a_product_name,
                   a.product_category a_product_category,
                   a.version a_version,
                   b.product_code b_product_code,
                   b.product_name b_product_name,
                   b.product_category b_product_category,
                   case when a.product_code is null then 'I'
                        when b.product_code is null then 'D'
                        else 'U'
                    end flag
              from v_product_dim_latest a
              full join rds.product b
                on a.product_code = b.product_code
             where a.product_code is null -- inserted
                or b.product_code is null -- deleted
                or (a.product_code = b.product_code
                    and not (coalesce(a.product_name,'') = coalesce(b.product_name,'')
                         and coalesce(a.product_category,'') = coalesce(b.product_category,'')))) t
     order by coalesce(a_product_code, 999999999999), b_product_code
     limit 999999999999;

    -- Load the sales order fact table: every rds.sales_order row whose
    -- entry_date falls inside [last_load, current_load) is inserted, whatever
    -- its order_status.
    insert into sales_order_fact
    select a.order_number,
           customer_sk,
           product_sk,
           e.date_sk,
           e.year * 100 + e.month,
           order_amount,
           quantity,
           f.date_sk,
           g.sales_order_attribute_sk,
           h.customer_zip_code_sk,
           i.shipping_zip_code_sk,
           a.order_status
      from rds.sales_order a,
           v_customer_dim_his c,
           v_product_dim_his d,
           date_dim e,
           date_dim f,
           sales_order_attribute_dim g,
           v_customer_zip_code_dim h,
           v_shipping_zip_code_dim i,
           rds.customer j,
           rds.cdc_time k
     where a.customer_number = c.customer_number
       and a.status_date >= c.effective_date
       and a.status_date < c.expiry_date
       and a.product_code = d.product_code
       and a.status_date >= d.effective_date
       and a.status_date < d.expiry_date
       and date(a.status_date) = e.date
       and date(a.request_delivery_date) = f.date
       and a.verification_ind = g.verification_ind
       and a.credit_check_flag = g.credit_check_flag
       and a.new_customer_ind = g.new_customer_ind
       and a.web_order_flag = g.web_order_flag
       and a.customer_number = j.customer_number
       and j.customer_zip_code = h.customer_zip_code
       and j.shipping_zip_code = i.shipping_zip_code
       and a.entry_date >= k.last_load
       and a.entry_date < k.current_load;

    -- Reload the PA customer dimension
    truncate table pa_customer_dim;
    insert into pa_customer_dim
    select distinct a.*
      from customer_dim a,
           sales_order_fact b,
           v_customer_zip_code_dim c
     where c.customer_state = 'pa'
       and b.customer_zip_code_sk = c.customer_zip_code_sk
       and a.customer_sk = b.customer_sk;

    -- Analyze the tables of the tds schema
    analyze customer_dim;
    analyze product_dim;
    analyze sales_order_fact;
    analyze pa_customer_dim;

    -- Advance the last_load column of the timestamp table to today
    truncate table rds.cdc_time;
    insert into rds.cdc_time select v_cur_date, v_cur_date;
end;
$$
language plpgsql;
需要修改定期数据装载中的相应列名。在装载事务事实表时,只用entry_date >= last_load and entry_date < current_load条件就可以过滤出所有新录入的、包括五种状态的订单,因为每种状态的订单都有自己对应的录入时间。
use source;

-- Order 141, status 'N': status/entry time is a random instant between
-- 2017-06-02 00:00:01 and 2017-06-02 12:00:00; the requested delivery time is
-- a random instant within the day five days after the current date.
set @order_date := from_unixtime(unix_timestamp('2017-06-02 00:00:01') + rand() * (unix_timestamp('2017-06-02 12:00:00') - unix_timestamp('2017-06-02 00:00:01')));
set @request_delivery_date := from_unixtime(unix_timestamp(date_add(current_date, interval 5 day)) + rand() * 86400);
set @amount := floor(1000 + rand() * 9000);
set @quantity := floor(10 + rand() * 90);

-- Explicit column list so the statement does not depend on the physical
-- column order of sales_order. id is passed as null to let auto_increment
-- assign it.
insert into source.sales_order
    (id, order_number, customer_number, product_code,
     verification_ind, credit_check_flag, new_customer_ind, web_order_flag,
     status_date, order_status, request_delivery_date, entry_date,
     order_amount, quantity)
values
    (null, 141, 1, 1,
     'y', 'y', 'y', 'y',
     @order_date, 'N', @request_delivery_date, @order_date,
     @amount, @quantity);

-- Order 142, status 'N': status/entry time is a random instant between
-- 2017-06-02 12:00:00 and 2017-06-03 00:00:00.
set @order_date := from_unixtime(unix_timestamp('2017-06-02 12:00:00') + rand() * (unix_timestamp('2017-06-03 00:00:00') - unix_timestamp('2017-06-02 12:00:00')));
set @request_delivery_date := from_unixtime(unix_timestamp(date_add(current_date, interval 5 day)) + rand() * 86400);
set @amount := floor(1000 + rand() * 9000);
set @quantity := floor(10 + rand() * 90);

insert into source.sales_order
    (id, order_number, customer_number, product_code,
     verification_ind, credit_check_flag, new_customer_ind, web_order_flag,
     status_date, order_status, request_delivery_date, entry_date,
     order_amount, quantity)
values
    (null, 142, 2, 2,
     'y', 'y', 'y', 'y',
     @order_date, 'N', @request_delivery_date, @order_date,
     @amount, @quantity);

commit;
-- Reset the CDC window: both columns of rds.cdc_time are set to 2017-06-02.
truncate table rds.cdc_time;
insert into rds.cdc_time
values (date '2017-06-02', date '2017-06-02');
# Run the regular load script from the home directory.
~/regular_etl.sh
-- List the milestone dates (order, allocate, packing, ship, receive) of every
-- order numbered above 140. Left joins keep an order in the result even when
-- a milestone key is null.
select
    a.order_number,
    c.order_date,
    d.allocate_date,
    e.packing_date,
    f.ship_date,
    g.receive_date
from v_sales_order_fact as a
left join v_order_date_dim as c
    on a.nd = c.order_date_sk
left join v_allocate_date_dim as d
    on a.ad = d.allocate_date_sk
left join v_packing_date_dim as e
    on a.pd = e.packing_date_sk
left join v_ship_date_dim as f
    on a.sd = f.ship_date_sk
left join v_receive_date_dim as g
    on a.rd = g.receive_date_sk
where a.order_number > 140
order by a.order_number;
查询结果如图1所示,只有order_date列有值,其它日期都是空,因为这两个订单是新增的,并且还没有分配库房、打包、配送或收货。
use source;

-- Order 141 -> status 'A', at a random instant between 2017-06-03 00:00:00
-- and 2017-06-03 12:00:00. The remaining attributes are copied from the
-- existing row(s) of the order.
set @order_date := from_unixtime(unix_timestamp('2017-06-03 00:00:00') + rand() * (unix_timestamp('2017-06-03 12:00:00') - unix_timestamp('2017-06-03 00:00:00')));

insert into sales_order
    (id, order_number, customer_number, product_code,
     verification_ind, credit_check_flag, new_customer_ind, web_order_flag,
     status_date, order_status, request_delivery_date, entry_date,
     order_amount, quantity)
select
    null, order_number, customer_number, product_code,
    verification_ind, credit_check_flag, new_customer_ind, web_order_flag,
    @order_date, 'A', request_delivery_date, @order_date,
    order_amount, quantity
from sales_order
where order_number = 141;

-- Order 141 -> status 'P', at a random instant between 2017-06-03 12:00:00
-- and 2017-06-04 00:00:00. The source row is the most recent row of order
-- 141 (the 'A' row inserted above) rather than a hard-coded id, so the script
-- does not depend on the current auto_increment position.
set @order_date := from_unixtime(unix_timestamp('2017-06-03 12:00:00') + rand() * (unix_timestamp('2017-06-04 00:00:00') - unix_timestamp('2017-06-03 12:00:00')));

insert into sales_order
    (id, order_number, customer_number, product_code,
     verification_ind, credit_check_flag, new_customer_ind, web_order_flag,
     status_date, order_status, request_delivery_date, entry_date,
     order_amount, quantity)
select
    null, order_number, customer_number, product_code,
    verification_ind, credit_check_flag, new_customer_ind, web_order_flag,
    @order_date, 'P', request_delivery_date, @order_date,
    order_amount, quantity
from sales_order
where order_number = 141
order by id desc
limit 1;

-- Order 142 -> status 'A', at a random instant between 2017-06-03 12:00:00
-- and 2017-06-04 00:00:00.
set @order_date := from_unixtime(unix_timestamp('2017-06-03 12:00:00') + rand() * (unix_timestamp('2017-06-04 00:00:00') - unix_timestamp('2017-06-03 12:00:00')));

insert into sales_order
    (id, order_number, customer_number, product_code,
     verification_ind, credit_check_flag, new_customer_ind, web_order_flag,
     status_date, order_status, request_delivery_date, entry_date,
     order_amount, quantity)
select
    null, order_number, customer_number, product_code,
    verification_ind, credit_check_flag, new_customer_ind, web_order_flag,
    @order_date, 'A', request_delivery_date, @order_date,
    order_amount, quantity
from sales_order
where order_number = 142;

commit;
设置时间窗口。
-- Reset the CDC window: both columns of rds.cdc_time are set to 2017-06-03.
truncate table rds.cdc_time;
insert into rds.cdc_time
values (date '2017-06-03', date '2017-06-03');
执行定期装载脚本。
~/regular_etl.sh
查询v_sales_order_fact表里的两个销售订单,确认定期装载成功。查询结果如图2所示。第一个订单具有了allocate_date和packing_date,第二个只具有allocate_date。
use source;

-- Order 141 -> status 'S', at a random instant between 2017-06-04 00:00:00
-- and 2017-06-04 12:00:00. Attributes are copied from the most recent row of
-- the order. Explicit column lists keep the inserts independent of the
-- physical column order of sales_order.
set @order_date := from_unixtime(unix_timestamp('2017-06-04 00:00:00') + rand() * (unix_timestamp('2017-06-04 12:00:00') - unix_timestamp('2017-06-04 00:00:00')));

insert into sales_order
    (id, order_number, customer_number, product_code,
     verification_ind, credit_check_flag, new_customer_ind, web_order_flag,
     status_date, order_status, request_delivery_date, entry_date,
     order_amount, quantity)
select
    null, order_number, customer_number, product_code,
    verification_ind, credit_check_flag, new_customer_ind, web_order_flag,
    @order_date, 'S', request_delivery_date, @order_date,
    order_amount, quantity
from sales_order
where order_number = 141
order by id desc
limit 1;

-- Order 141 -> status 'R', at a random instant between 2017-06-04 12:00:00
-- and 2017-06-05 00:00:00.
set @order_date := from_unixtime(unix_timestamp('2017-06-04 12:00:00') + rand() * (unix_timestamp('2017-06-05 00:00:00') - unix_timestamp('2017-06-04 12:00:00')));

insert into sales_order
    (id, order_number, customer_number, product_code,
     verification_ind, credit_check_flag, new_customer_ind, web_order_flag,
     status_date, order_status, request_delivery_date, entry_date,
     order_amount, quantity)
select
    null, order_number, customer_number, product_code,
    verification_ind, credit_check_flag, new_customer_ind, web_order_flag,
    @order_date, 'R', request_delivery_date, @order_date,
    order_amount, quantity
from sales_order
where order_number = 141
order by id desc
limit 1;

-- Order 142 -> status 'P', at a random instant between 2017-06-04 12:00:00
-- and 2017-06-05 00:00:00.
set @order_date := from_unixtime(unix_timestamp('2017-06-04 12:00:00') + rand() * (unix_timestamp('2017-06-05 00:00:00') - unix_timestamp('2017-06-04 12:00:00')));

insert into sales_order
    (id, order_number, customer_number, product_code,
     verification_ind, credit_check_flag, new_customer_ind, web_order_flag,
     status_date, order_status, request_delivery_date, entry_date,
     order_amount, quantity)
select
    null, order_number, customer_number, product_code,
    verification_ind, credit_check_flag, new_customer_ind, web_order_flag,
    @order_date, 'P', request_delivery_date, @order_date,
    order_amount, quantity
from sales_order
where order_number = 142
order by id desc
limit 1;

commit;
设置时间窗口。
-- Reset the CDC window: both columns of rds.cdc_time are set to 2017-06-04.
truncate table rds.cdc_time;
insert into rds.cdc_time
values (date '2017-06-04', date '2017-06-04');
执行定期装载脚本。
# Run the regular load script from the home directory.
~/regular_etl.sh
查询v_sales_order_fact表里的两个销售订单,确认定期装载成功。查询结果如图3所示。第一个订单号为141的订单,具有了全部日期,这意味着订单已完成(客户已经收货)。第二个订单已经打包,但是还没有配送。
图3
-- Rebuilds the monthly sales summary for one month.
--   p_year_month: month to summarize; it is compared with
--                 sales_order_fact.year_month and appended to the name of the
--                 table that is truncated first.
-- Returns nothing.
create or replace function tds.fn_month_sum(p_year_month int)
returns void as
$$
declare
    v_truncate_sql varchar(1000);
begin
    -- Idempotent: clear the existing data of the month before re-inserting.
    -- NOTE(review): the "_1_prt_p<year_month>" suffix presumably names the
    -- partition child table of month_end_sales_order_fact; confirm.
    v_truncate_sql := 'truncate table month_end_sales_order_fact_1_prt_p' || cast(p_year_month as varchar);
    execute v_truncate_sql;

    -- Insert the sales summary of the month. Only fact rows whose status is
    -- 'N' (or null) are summed. The one-row driver subquery combined with the
    -- left join yields a single zero-valued row when the month has no
    -- matching fact rows.
    insert into month_end_sales_order_fact
    select
        m.year_month,
        s.product_sk,
        coalesce(s.month_order_amount, 0),
        coalesce(s.month_order_quantity, 0)
    from (
        select p_year_month year_month
    ) m
    left join (
        select
            year_month,
            product_sk,
            sum(order_amount) month_order_amount,
            sum(quantity) month_order_quantity
        from sales_order_fact
        where year_month = p_year_month
          and coalesce(order_status, 'N') = 'N'
        group by year_month, product_sk
    ) s
        on m.year_month = s.year_month;
end;
$$
language plpgsql;