USE dw; CREATE VIEW entry_date_dim (entry_date_sk , entry_date , month_name , month , quarter , year , promo_ind) AS SELECT date_sk, date, month_name, month, quarter, year, promo_ind FROM date_dim; alter table sales_order_fact rename to sales_order_fact_old; create table sales_order_fact( order_number int COMMENT 'order number', customer_sk int COMMENT 'customer surrogate key', product_sk int COMMENT 'product surrogate key', sales_order_attribute_sk int COMMENT 'sales order attribute surrogate key', order_date_sk int COMMENT 'order date surrogate key', entry_date_sk int COMMENT 'entry date surrogate key', allocate_date_sk int COMMENT 'allocate date surrogate key', allocate_quantity int COMMENT 'allocate quantity', packing_date_sk int COMMENT 'packing date surrogate key', packing_quantity int COMMENT 'packing quantity', ship_date_sk int COMMENT 'ship date surrogate key', ship_quantity int COMMENT 'ship quantity', receive_date_sk int COMMENT 'receive date surrogate key', receive_quantity int COMMENT 'receive quantity', request_delivery_date_sk int COMMENT 'request delivery date surrogate key', order_amount decimal(10,2) COMMENT 'order amount', order_quantity int COMMENT 'order quantity') clustered by (order_number) into 8 buckets stored as orc tblproperties ('transactional'='true'); insert into sales_order_fact select order_number, customer_sk, product_sk, sales_order_attribute_sk, order_date_sk, null, allocate_date_sk, allocate_quantity, packing_date_sk, packing_quantity, ship_date_sk, ship_quantity, receive_date_sk, receive_quantity, request_delivery_date_sk, order_amount, order_quantity from sales_order_fact_old; drop table sales_order_fact_old;2. 修改销售订单定期装载脚本
-- 设置环境与时间窗口 !run /root/set_time.sql -- 装载customer维度 -- 设置已删除记录和地址相关列上SCD2的过期,用<=>运算符处理NULL值。 UPDATE customer_dim SET expiry_date = ${hivevar:pre_date} WHERE customer_dim.customer_sk IN (SELECT a.customer_sk FROM (SELECT customer_sk, customer_number, customer_street_address, customer_zip_code, customer_city, customer_state, shipping_address, shipping_zip_code, shipping_city, shipping_state FROM customer_dim WHERE expiry_date = ${hivevar:max_date}) a LEFT JOIN rds.customer b ON a.customer_number = b.customer_number WHERE b.customer_number IS NULL OR ( !(a.customer_street_address <=> b.customer_street_address) OR !(a.customer_zip_code <=> b.customer_zip_code) OR !(a.customer_city <=> b.customer_city) OR !(a.customer_state <=> b.customer_state) OR !(a.shipping_address <=> b.shipping_address) OR !(a.shipping_zip_code <=> b.shipping_zip_code) OR !(a.shipping_city <=> b.shipping_city) OR !(a.shipping_state <=> b.shipping_state) )); -- 处理customer_street_addresses列上SCD2的新增行 INSERT INTO customer_dim SELECT ROW_NUMBER() OVER (ORDER BY t1.customer_number) + t2.sk_max, t1.customer_number, t1.customer_name, t1.customer_street_address, t1.customer_zip_code, t1.customer_city, t1.customer_state, t1.shipping_address, t1.shipping_zip_code, t1.shipping_city, t1.shipping_state, t1.version, t1.effective_date, t1.expiry_date FROM ( SELECT t2.customer_number customer_number, t2.customer_name customer_name, t2.customer_street_address customer_street_address, t2.customer_zip_code customer_zip_code, t2.customer_city customer_city, t2.customer_state customer_state, t2.shipping_address shipping_address, t2.shipping_zip_code shipping_zip_code, t2.shipping_city shipping_city, t2.shipping_state shipping_state, t1.version + 1 version, ${hivevar:pre_date} effective_date, ${hivevar:max_date} expiry_date FROM customer_dim t1 INNER JOIN rds.customer t2 ON t1.customer_number = t2.customer_number AND t1.expiry_date = ${hivevar:pre_date} LEFT JOIN customer_dim t3 ON t1.customer_number = t3.customer_number AND t3.expiry_date = ${hivevar:max_date} WHERE (!(t1.customer_street_address <=> t2.customer_street_address) OR !(t1.customer_zip_code <=> t2.customer_zip_code) OR !(t1.customer_city <=> t2.customer_city) OR !(t1.customer_state <=> t2.customer_state) OR !(t1.shipping_address <=> t2.shipping_address) OR !(t1.shipping_zip_code <=> t2.shipping_zip_code) OR !(t1.shipping_city <=> t2.shipping_city) OR !(t1.shipping_state <=> t2.shipping_state) ) AND t3.customer_sk IS NULL) t1 CROSS JOIN (SELECT COALESCE(MAX(customer_sk),0) sk_max FROM customer_dim) t2; -- 处理customer_name列上的SCD1 -- 因为hive的update的set子句还不支持子查询,所以这里使用了一个临时表存储需要更新的记录,用先delete再insert代替update -- 因为SCD1本身就不保存历史数据,所以这里更新维度表里的所有customer_name改变的记录,而不是仅仅更新当前版本的记录 DROP TABLE IF EXISTS tmp; CREATE TABLE tmp AS SELECT a.customer_sk, a.customer_number, b.customer_name, a.customer_street_address, a.customer_zip_code, a.customer_city, a.customer_state, a.shipping_address, a.shipping_zip_code, a.shipping_city, a.shipping_state, a.version, a.effective_date, a.expiry_date FROM customer_dim a, rds.customer b WHERE a.customer_number = b.customer_number AND !(a.customer_name <=> b.customer_name); DELETE FROM customer_dim WHERE customer_dim.customer_sk IN (SELECT customer_sk FROM tmp); INSERT INTO customer_dim SELECT * FROM tmp; -- 处理新增的customer记录 INSERT INTO customer_dim SELECT ROW_NUMBER() OVER (ORDER BY t1.customer_number) + t2.sk_max, t1.customer_number, t1.customer_name, t1.customer_street_address, t1.customer_zip_code, t1.customer_city, t1.customer_state, t1.shipping_address, t1.shipping_zip_code, t1.shipping_city, t1.shipping_state, 1, ${hivevar:pre_date}, ${hivevar:max_date} FROM ( SELECT t1.* FROM rds.customer t1 LEFT JOIN customer_dim t2 ON t1.customer_number = t2.customer_number WHERE t2.customer_sk IS NULL) t1 CROSS JOIN (SELECT COALESCE(MAX(customer_sk),0) sk_max FROM customer_dim) t2; -- 重载PA客户维度 TRUNCATE TABLE pa_customer_dim; INSERT INTO pa_customer_dim SELECT customer_sk , customer_number , customer_name , customer_street_address , customer_zip_code , customer_city , customer_state , shipping_address , shipping_zip_code , shipping_city , shipping_state , version , effective_date , expiry_date FROM customer_dim WHERE customer_state = 'PA' ; -- 装载product维度 -- 设置已删除记录和product_name、product_category列上SCD2的过期 UPDATE product_dim SET expiry_date = ${hivevar:pre_date} WHERE product_dim.product_sk IN (SELECT a.product_sk FROM (SELECT product_sk,product_code,product_name,product_category FROM product_dim WHERE expiry_date = ${hivevar:max_date}) a LEFT JOIN rds.product b ON a.product_code = b.product_code WHERE b.product_code IS NULL OR (a.product_name <> b.product_name OR a.product_category <> b.product_category)); -- 处理product_name、product_category列上SCD2的新增行 INSERT INTO product_dim SELECT ROW_NUMBER() OVER (ORDER BY t1.product_code) + t2.sk_max, t1.product_code, t1.product_name, t1.product_category, t1.version, t1.effective_date, t1.expiry_date FROM ( SELECT t2.product_code product_code, t2.product_name product_name, t2.product_category product_category, t1.version + 1 version, ${hivevar:pre_date} effective_date, ${hivevar:max_date} expiry_date FROM product_dim t1 INNER JOIN rds.product t2 ON t1.product_code = t2.product_code AND t1.expiry_date = ${hivevar:pre_date} LEFT JOIN product_dim t3 ON t1.product_code = t3.product_code AND t3.expiry_date = ${hivevar:max_date} WHERE (t1.product_name <> t2.product_name OR t1.product_category <> t2.product_category) AND t3.product_sk IS NULL) t1 CROSS JOIN (SELECT COALESCE(MAX(product_sk),0) sk_max FROM product_dim) t2; -- 处理新增的product记录 INSERT INTO product_dim SELECT ROW_NUMBER() OVER (ORDER BY t1.product_code) + t2.sk_max, t1.product_code, t1.product_name, t1.product_category, 1, ${hivevar:pre_date}, ${hivevar:max_date} FROM ( SELECT t1.* FROM rds.product t1 LEFT JOIN product_dim t2 ON t1.product_code = t2.product_code WHERE t2.product_sk IS NULL) t1 CROSS JOIN (SELECT COALESCE(MAX(product_sk),0) sk_max FROM product_dim) t2; -- 装载product_count_fact表 insert overwrite table product_count_fact select product_sk,date_sk from (select a.product_sk product_sk, a.product_code product_code, b.date_sk date_sk, row_number() over (partition by a.product_code order by b.date_sk) rn from product_dim a,date_dim b where a.effective_date = b.date) t where rn = 1; -- 装载销售订单事实表 -- 前一天新增的销售订单 INSERT INTO sales_order_fact SELECT a.order_number, customer_sk, product_sk, g.sales_order_attribute_sk, e.order_date_sk, h.entry_date_sk, null, null, null, null, null, null, null, null, f.request_delivery_date_sk, order_amount, quantity FROM rds.sales_order a, customer_dim c, product_dim d, order_date_dim e, request_delivery_date_dim f, sales_order_attribute_dim g, entry_date_dim h, rds.cdc_time i WHERE a.order_status = 'N' AND a.customer_number = c.customer_number AND a.status_date >= c.effective_date AND a.status_date < c.expiry_date AND a.product_code = d.product_code AND a.status_date >= d.effective_date AND a.status_date < d.expiry_date AND to_date(a.status_date) = e.order_date AND to_date(a.entry_date) = h.entry_date AND to_date(a.request_delivery_date) = f.request_delivery_date AND a.verification_ind = g.verification_ind AND a.credit_check_flag = g.credit_check_flag AND a.new_customer_ind = g.new_customer_ind AND a.web_order_flag = g.web_order_flag AND a.entry_date >= i.last_load AND a.entry_date < i.current_load ; -- 处理分配库房、打包、配送和收货四个状态 DROP TABLE IF EXISTS tmp; CREATE TABLE tmp AS select t0.order_number order_number, t0.customer_sk customer_sk, t0.product_sk product_sk, t0.sales_order_attribute_sk, t0.order_date_sk order_date_sk, t0.entry_date_sk entry_date_sk, t2.allocate_date_sk allocate_date_sk, t1.quantity allocate_quantity, t0.packing_date_sk packing_date_sk, t0.packing_quantity packing_quantity, t0.ship_date_sk ship_date_sk, t0.ship_quantity ship_quantity, t0.receive_date_sk receive_date_sk, t0.receive_quantity receive_quantity, t0.request_delivery_date_sk request_delivery_date_sk, t0.order_amount order_amount, t0.order_quantity order_quantity from sales_order_fact t0, rds.sales_order t1, allocate_date_dim t2, rds.cdc_time t4 where t0.order_number = t1.order_number and t1.order_status = 'A' and to_date(t1.status_date) = t2.allocate_date and t1.entry_date >= t4.last_load and t1.entry_date < t4.current_load; DELETE FROM sales_order_fact WHERE sales_order_fact.order_number IN (SELECT order_number FROM tmp); INSERT INTO sales_order_fact SELECT * FROM tmp; DROP TABLE IF EXISTS tmp; CREATE TABLE tmp AS select t0.order_number order_number, t0.customer_sk customer_sk, t0.product_sk product_sk, t0.sales_order_attribute_sk, t0.order_date_sk order_date_sk, t0.entry_date_sk entry_date_sk, t0.allocate_date_sk allocate_date_sk, t0.allocate_quantity allocate_quantity, t2.packing_date_sk packing_date_sk, t1.quantity packing_quantity, t0.ship_date_sk ship_date_sk, t0.ship_quantity ship_quantity, t0.receive_date_sk receive_date_sk, t0.receive_quantity receive_quantity, t0.request_delivery_date_sk request_delivery_date_sk, t0.order_amount order_amount, t0.order_quantity order_quantity from sales_order_fact t0, rds.sales_order t1, packing_date_dim t2, rds.cdc_time t4 where t0.order_number = t1.order_number and t1.order_status = 'P' and to_date(t1.status_date) = t2.packing_date and t1.entry_date >= t4.last_load and t1.entry_date < t4.current_load; DELETE FROM sales_order_fact WHERE sales_order_fact.order_number IN (SELECT order_number FROM tmp); INSERT INTO sales_order_fact SELECT * FROM tmp; DROP TABLE IF EXISTS tmp; CREATE TABLE tmp AS select t0.order_number order_number, t0.customer_sk customer_sk, t0.product_sk product_sk, t0.sales_order_attribute_sk, t0.order_date_sk order_date_sk, t0.entry_date_sk entry_date_sk, t0.allocate_date_sk allocate_date_sk, t0.allocate_quantity allocate_quantity, t0.packing_date_sk packing_date_sk, t0.packing_quantity packing_quantity, t2.ship_date_sk ship_date_sk, t1.quantity ship_quantity, t0.receive_date_sk receive_date_sk, t0.receive_quantity receive_quantity, t0.request_delivery_date_sk request_delivery_date_sk, t0.order_amount order_amount, t0.order_quantity order_quantity from sales_order_fact t0, rds.sales_order t1, ship_date_dim t2, rds.cdc_time t4 where t0.order_number = t1.order_number and t1.order_status = 'S' and to_date(t1.status_date) = t2.ship_date and t1.entry_date >= t4.last_load and t1.entry_date < t4.current_load; DELETE FROM sales_order_fact WHERE sales_order_fact.order_number IN (SELECT order_number FROM tmp); INSERT INTO sales_order_fact SELECT * FROM tmp; DROP TABLE IF EXISTS tmp; CREATE TABLE tmp AS select t0.order_number order_number, t0.customer_sk customer_sk, t0.product_sk product_sk, t0.sales_order_attribute_sk, t0.order_date_sk order_date_sk, t0.entry_date_sk entry_date_sk, t0.allocate_date_sk allocate_date_sk, t0.allocate_quantity allocate_quantity, t0.packing_date_sk packing_date_sk, t0.packing_quantity packing_quantity, t0.ship_date_sk ship_date_sk, t0.ship_quantity ship_quantity, t2.receive_date_sk receive_date_sk, t1.quantity receive_quantity, t0.request_delivery_date_sk request_delivery_date_sk, t0.order_amount order_amount, t0.order_quantity order_quantity from sales_order_fact t0, rds.sales_order t1, receive_date_dim t2, rds.cdc_time t4 where t0.order_number = t1.order_number and t1.order_status = 'R' and to_date(t1.status_date) = t2.receive_date and t1.entry_date >= t4.last_load and t1.entry_date < t4.current_load; DELETE FROM sales_order_fact WHERE sales_order_fact.order_number IN (SELECT order_number FROM tmp); INSERT INTO sales_order_fact SELECT * FROM tmp; -- 更新时间戳表的last_load字段 INSERT OVERWRITE TABLE rds.cdc_time SELECT current_load, current_load FROM rds.cdc_time;3. 修改装载月底销售订单事实表脚本
-- 设置变量以支持事务 set hive.support.concurrency=true; set hive.exec.dynamic.partition.mode=nonstrict; set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; set hive.compactor.initiator.on=true; set hive.compactor.worker.threads=1; USE dw; SET hivevar:pre_month_date = add_months(current_date,-1); drop table if exists tmp; create table tmp as select a.order_month_sk order_month_sk, a.product_sk product_sk, a.month_order_amount + b.order_amount month_order_amount, a.month_order_quantity + b.order_quantity month_order_quantity from month_end_sales_order_fact a, (select d.month_sk month_sk, a.product_sk product_sk, sum(order_amount) order_amount, sum(order_quantity) order_quantity from sales_order_fact a, order_date_dim b, entry_date_dim c, month_dim d where a.order_date_sk = b.order_date_sk and a.entry_date_sk = c.entry_date_sk and c.month = month(${hivevar:pre_month_date}) and c.year = year(${hivevar:pre_month_date}) and b.month = d.month and b.order_date <> c.entry_date group by d.month_sk , a.product_sk) b where a.product_sk = b.product_sk and a.order_month_sk = b.month_sk; delete from month_end_sales_order_fact where exists (select 1 from tmp t2 where month_end_sales_order_fact.order_month_sk = t2.order_month_sk and month_end_sales_order_fact.product_sk = t2.product_sk); insert into month_end_sales_order_fact select * from tmp; insert into month_end_sales_order_fact -- 正常的订单汇总(order_date = entry_date) select b.month_sk, a.product_sk, sum(order_amount), sum(order_quantity) from sales_order_fact a, month_dim b, order_date_dim d, entry_date_dim c where a.order_date_sk = d.order_date_sk and a.entry_date_sk = c.entry_date_sk and b.month = d.month and b.year = d.year and c.month = month(${hivevar:pre_month_date}) and c.year = year(${hivevar:pre_month_date}) and d.order_date = c.entry_date group by b.month_sk, a.product_sk union all -- 迟到的事实,但尚不存在汇总数据 select d.month_sk, a.product_sk, sum(order_amount), sum(order_quantity) from sales_order_fact a, order_date_dim b, entry_date_dim c, month_dim d where a.order_date_sk = b.order_date_sk and a.entry_date_sk = c.entry_date_sk and b.order_date <> c.entry_date and c.month = month(${hivevar:pre_month_date}) and c.year = year(${hivevar:pre_month_date}) and b.month = d.month and b.year = d.year and not exists (select 1 from month_end_sales_order_fact p, month_dim s where p.order_month_sk = s.month_sk and s.month = d.month and s.year = d.year and p.product_sk = a.product_sk) group by d.month_sk , a.product_sk;4. 测试
UPDATE dw.sales_order_fact SET entry_date_sk = order_date_sk;在执行定期装载脚本前使用下面的语句查询month_end_sales_order_fact表。之后可以对比‘前’(不包含迟到事实)‘后’(包含了迟到事实)的数据,以确认装载的正确性。
use dw; select year, month, product_name, month_order_amount amt, month_order_quantity qty from month_end_sales_order_fact a, month_dim b, product_dim c where a.order_month_sk = b.month_sk and a.product_sk = c.product_sk cluster by year , month , product_name;查询结果如下图所示。
use dw; select a.product_sk,product_name,sum(order_amount) from sales_order_fact a, product_dim b where a.product_sk = b.product_sk group by a.product_sk,product_name cluster by product_name;查询结果如下图所示。
USE source; -- 迟到已存在 SET @order_date := from_unixtime(unix_timestamp('2016-06-25') + rand() * (unix_timestamp('2016-06-26') - unix_timestamp('2016-06-25'))); SET @entry_date := from_unixtime(unix_timestamp('2016-07-26') + rand() * (unix_timestamp('2016-07-27') - unix_timestamp('2016-07-26'))); SET @amount := floor(1000 + rand() * 9000); SET @quantity := floor(10 + rand() * 90); INSERT INTO sales_order VALUES (null, 141, 6, 2, 'Y', 'Y', 'Y', 'N', @order_date, 'N', '2016-06-30', @entry_date, @amount, @quantity); -- 迟到不存在 SET @order_date := from_unixtime(unix_timestamp('2016-06-26') + rand() * (unix_timestamp('2016-06-27') - unix_timestamp('2016-06-26'))); SET @entry_date := from_unixtime(unix_timestamp('2016-07-26') + rand() * (unix_timestamp('2016-07-27') - unix_timestamp('2016-07-26'))); SET @amount := floor(1000 + rand() * 9000); SET @quantity := floor(10 + rand() * 90); INSERT INTO sales_order VALUES (null, 142, 6, 3, 'Y', 'Y', 'Y', 'N', @order_date, 'N', '2016-06-30', @entry_date, @amount, @quantity); -- 非迟到 SET @order_date := from_unixtime(unix_timestamp('2016-07-26') + rand() * (unix_timestamp('2016-07-27') - unix_timestamp('2016-07-26'))); SET @amount := floor(1000 + rand() * 9000); SET @quantity := floor(10 + rand() * 90); INSERT INTO sales_order VALUES (null, 143, 12, 4, 'Y', 'N', 'Y', 'N', @order_date, 'N', '2016-07-30', @order_date, @amount, @quantity); COMMIT;新增订单数据如下图所示。
INSERT OVERWRITE TABLE rds.cdc_time SELECT '2016-07-26', '2016-07-26' FROM rds.cdc_time;将set_time.sql文件中的SET hivevar:cur_date = CURRENT_DATE();行改为SET hivevar:cur_date = '2016-07-27';后执行定期装载。
./regular_etl.sh现在已经准备好运行修改后的月底快照装载。执行下面的命令执行月底销售订单事实表装载脚本导入2016年7月的快照。
beeline -u jdbc:hive2://cdh2:10000/dw -f month_sum.sql最后,执行相同的查询获取包含了迟到事实月底销售订单数据
use dw; select year, month, product_name, month_order_amount amt, month_order_quantity qty from month_end_sales_order_fact a, month_dim b, product_dim c where a.order_month_sk = b.month_sk and a.product_sk = c.product_sk cluster by year , month , product_name;
查询结果如下图所示。
对比‘前’‘后’查询的结果可以看到: