-- Schema migration (HiveQL): introduce a shared zip_code_dim, expose it through
-- role-playing views, add zip code surrogate keys to sales_order_fact and
-- production_fact, and rebuild the customer/factory dimensions with the column
-- lists given below.
-- Every statement and comment sits on its own line so that a "--" comment can
-- no longer swallow the statements that follow it.
use dw;

-- Create the zip code (address) dimension table.
create table zip_code_dim (
    zip_code_sk    int,
    zip_code       int,
    city           varchar(30),
    state          varchar(2),
    version        int,
    effective_date date,
    expiry_date    date
)
clustered by (zip_code_sk) into 8 buckets
stored as orc
tblproperties ('transactional'='true');

-- Initial load of the zip code rows.
insert into zip_code_dim values (1,17050,'pittsburgh','PA',1,'1900-01-01','2200-01-01');
insert into zip_code_dim values (2,17051,'mc veytown','PA',1,'1900-01-01','2200-01-01');
insert into zip_code_dim values (3,17052,'mapleton depot','PA',1,'1900-01-01','2200-01-01');
insert into zip_code_dim values (4,17053,'marysville','PA',1,'1900-01-01','2200-01-01');
insert into zip_code_dim values (5,17054,'mattawana','PA',1,'1900-01-01','2200-01-01');
insert into zip_code_dim values (6,17055,'mechanicsburg','PA',1,'1900-01-01','2200-01-01');
insert into zip_code_dim values (7,44102,'cleveland','OH',1,'1900-01-01','2200-01-01');

-- Role-playing views over zip_code_dim: customer and shipping zip codes.
create view customer_zip_code_dim (
    customer_zip_code_sk,
    customer_zip_code,
    customer_city,
    customer_state,
    version,
    effective_date,
    expiry_date
) as
select zip_code_sk, zip_code, city, state, version, effective_date, expiry_date
from zip_code_dim;

create view shipping_zip_code_dim (
    shipping_zip_code_sk,
    shipping_zip_code,
    shipping_city,
    shipping_state,
    version,
    effective_date,
    expiry_date
) as
select zip_code_sk, zip_code, city, state, version, effective_date, expiry_date
from zip_code_dim;

-- Add the two zip code surrogate keys to the sales order fact table by
-- renaming the old table, creating the new layout and copying the rows over.
alter table sales_order_fact rename to sales_order_fact_old;

create table sales_order_fact (
    order_number             int           COMMENT 'order number',
    customer_sk              int           COMMENT 'customer surrogate key',
    customer_zip_code_sk     int           COMMENT 'customer zip code sk',
    shipping_zip_code_sk     int           COMMENT 'shipping zip code sk',
    product_sk               int           COMMENT 'product surrogate key',
    sales_order_attribute_sk int           COMMENT 'sales order attribute surrogate key',
    order_date_sk            int           COMMENT 'order date surrogate key',
    entry_date_sk            int           COMMENT 'entry date surrogate key',
    allocate_date_sk         int           COMMENT 'allocate date surrogate key',
    allocate_quantity        int           COMMENT 'allocate quantity',
    packing_date_sk          int           COMMENT 'packing date surrogate key',
    packing_quantity         int           COMMENT 'packing quantity',
    ship_date_sk             int           COMMENT 'ship date surrogate key',
    ship_quantity            int           COMMENT 'ship quantity',
    receive_date_sk          int           COMMENT 'receive date surrogate key',
    receive_quantity         int           COMMENT 'receive quantity',
    request_delivery_date_sk int           COMMENT 'request delivery date surrogate key',
    order_amount             decimal(10,2) COMMENT 'order amount',
    order_quantity           int           COMMENT 'order quantity'
)
clustered by (order_number) into 8 buckets
stored as orc
tblproperties ('transactional'='true');

-- Copy the existing facts; both zip code keys start out as NULL.
insert into sales_order_fact
select
    order_number,
    customer_sk,
    null,
    null,
    product_sk,
    sales_order_attribute_sk,
    order_date_sk,
    entry_date_sk,
    allocate_date_sk,
    allocate_quantity,
    packing_date_sk,
    packing_quantity,
    ship_date_sk,
    ship_quantity,
    receive_date_sk,
    receive_quantity,
    request_delivery_date_sk,
    order_amount,
    order_quantity
from sales_order_fact_old;

drop table sales_order_fact_old;

-- Initial load of the two zip code surrogate keys.
-- The keys are resolved through customer_dim, which still carries its zip code
-- columns at this point. LEFT JOINs keep orders that have no matching zip code.
drop table if exists tmp;

create table tmp as
select
    t1.order_number,
    t1.customer_sk,
    t2.customer_zip_code_sk,
    t3.shipping_zip_code_sk,
    t1.product_sk,
    t1.sales_order_attribute_sk,
    t1.order_date_sk,
    t1.entry_date_sk,
    t1.allocate_date_sk,
    t1.allocate_quantity,
    t1.packing_date_sk,
    t1.packing_quantity,
    t1.ship_date_sk,
    t1.ship_quantity,
    t1.receive_date_sk,
    t1.receive_quantity,
    t1.request_delivery_date_sk,
    t1.order_amount,
    t1.order_quantity
from sales_order_fact t1
left join (
    select
        a.order_number order_number,
        c.customer_zip_code_sk customer_zip_code_sk
    from sales_order_fact a
    inner join customer_dim b
        on a.customer_sk = b.customer_sk
    inner join customer_zip_code_dim c
        on b.customer_zip_code = c.customer_zip_code
) t2
    on t1.order_number = t2.order_number
left join (
    select
        a.order_number order_number,
        c.shipping_zip_code_sk shipping_zip_code_sk
    from sales_order_fact a
    inner join customer_dim b
        on a.customer_sk = b.customer_sk
    inner join shipping_zip_code_dim c
        on b.shipping_zip_code = c.shipping_zip_code
) t3
    on t1.order_number = t3.order_number;

-- Replace the fact rows with their zip-keyed versions (delete + insert).
delete from sales_order_fact
where sales_order_fact.order_number in (select order_number from tmp);

insert into sales_order_fact
select * from tmp;

-- Rebuild customer_dim with the column list below (no zip code columns).
alter table customer_dim rename to customer_dim_old;

create table customer_dim (
    customer_sk             int         COMMENT 'surrogate key',
    customer_number         int         COMMENT 'number',
    customer_name           varchar(50) COMMENT 'name',
    customer_street_address varchar(50) COMMENT 'address',
    shipping_address        varchar(50) COMMENT 'shipping_address',
    version                 int         COMMENT 'version',
    effective_date          date        COMMENT 'effective date',
    expiry_date             date        COMMENT 'expiry date'
)
clustered by (customer_sk) into 8 buckets
stored as orc
tblproperties ('transactional'='true');

insert into customer_dim
select
    customer_sk,
    customer_number,
    customer_name,
    customer_street_address,
    shipping_address,
    version,
    effective_date,
    expiry_date
from customer_dim_old;

drop table customer_dim_old;

-- Rebuild pa_customer_dim with the same column list as customer_dim.
alter table pa_customer_dim rename to pa_customer_dim_old;

create table pa_customer_dim (
    customer_sk             int,
    customer_number         int,
    customer_name           varchar(50),
    customer_street_address varchar(50),
    shipping_address        varchar(50),
    version                 int,
    effective_date          date,
    expiry_date             date
)
clustered by (customer_sk) into 8 buckets
stored as orc
tblproperties ('transactional'='true');

insert into pa_customer_dim
select
    customer_sk,
    customer_number,
    customer_name,
    customer_street_address,
    shipping_address,
    version,
    effective_date,
    expiry_date
from pa_customer_dim_old;

drop table pa_customer_dim_old;

-- Role-playing view over zip_code_dim: factory zip codes.
create view factory_zip_code_dim (
    factory_zip_code_sk,
    factory_zip_code,
    factory_city,
    factory_state,
    version,
    effective_date,
    expiry_date
) as
select zip_code_sk, zip_code, city, state, version, effective_date, expiry_date
from zip_code_dim;

-- Rebuild production_fact with the factory zip code surrogate key.
alter table production_fact rename to production_fact_old;

create table production_fact (
    product_sk          int,
    production_date_sk  int,
    factory_sk          int,
    factory_zip_code_sk int,
    production_quantity int
);

-- Initial load of the factory zip code surrogate key.
-- LEFT JOINs are used because production_fact_old is dropped right afterwards:
-- a fact row whose factory or zip code has no match must survive with a NULL
-- key instead of being lost (same treatment as the sales_order_fact load above).
insert into production_fact
select
    a.product_sk,
    a.production_date_sk,
    a.factory_sk,
    c.factory_zip_code_sk,
    a.production_quantity
from production_fact_old a
left join factory_dim b
    on a.factory_sk = b.factory_sk
left join factory_zip_code_dim c
    on b.factory_zip_code = c.factory_zip_code;

drop table production_fact_old;

-- Rebuild factory_dim with the column list below (no zip code column).
alter table factory_dim rename to factory_dim_old;

create table factory_dim (
    factory_sk             int,
    factory_code           int,
    factory_name           varchar(30),
    factory_street_address varchar(50),
    version                int,
    effective_date         date,
    expiry_date            date
)
clustered by (factory_sk) into 8 buckets
stored as orc
tblproperties ('transactional'='true');

insert into factory_dim
select
    factory_sk,
    factory_code,
    factory_name,
    factory_street_address,
    version,
    effective_date,
    expiry_date
from factory_dim_old;

drop table factory_dim_old;
执行完修改数据仓库模式的脚本后,可以查询customer_zip_code_dim、shipping_code_dim、factory_zip_code_dim维度表和sales_order_fact、production_fact事实表,确认邮编已经被成功分离。
-- Regular (daily) ETL script (HiveQL): loads the customer and product
-- dimensions, product_count_fact, sales_order_fact and pa_customer_dim, then
-- advances the CDC time window.
-- Every statement and comment sits on its own line so that a "--" comment can
-- no longer swallow the statements that follow it.

-- Set up the environment and the time window.
-- NOTE(review): set_time.sql is expected to define hivevar:pre_date and
-- hivevar:max_date used below - confirm against that script.
!run /root/set_time.sql

-- ---------------------------------------------------------------------------
-- Load the customer dimension
-- ---------------------------------------------------------------------------

-- Expire (SCD2) current rows whose customer was deleted from the source or
-- whose address columns changed. The <=> operator makes the comparison
-- NULL-safe.
UPDATE customer_dim
SET expiry_date = ${hivevar:pre_date}
WHERE customer_dim.customer_sk IN (
    SELECT a.customer_sk
    FROM (
        SELECT
            customer_sk,
            customer_number,
            customer_street_address,
            shipping_address
        FROM customer_dim
        WHERE expiry_date = ${hivevar:max_date}
    ) a
    LEFT JOIN rds.customer b
        ON a.customer_number = b.customer_number
    WHERE b.customer_number IS NULL
       OR ( !(a.customer_street_address <=> b.customer_street_address)
            OR !(a.shipping_address <=> b.shipping_address) )
);

-- Insert the new SCD2 version for customers whose address columns changed.
-- t3 guards against inserting a second current row for the same customer.
INSERT INTO customer_dim
SELECT
    ROW_NUMBER() OVER (ORDER BY t1.customer_number) + t2.sk_max,
    t1.customer_number,
    t1.customer_name,
    t1.customer_street_address,
    t1.shipping_address,
    t1.version,
    t1.effective_date,
    t1.expiry_date
FROM (
    SELECT
        t2.customer_number customer_number,
        t2.customer_name customer_name,
        t2.customer_street_address customer_street_address,
        t2.shipping_address shipping_address,
        t1.version + 1 version,
        ${hivevar:pre_date} effective_date,
        ${hivevar:max_date} expiry_date
    FROM customer_dim t1
    INNER JOIN rds.customer t2
        ON t1.customer_number = t2.customer_number
       AND t1.expiry_date = ${hivevar:pre_date}
    LEFT JOIN customer_dim t3
        ON t1.customer_number = t3.customer_number
       AND t3.expiry_date = ${hivevar:max_date}
    WHERE ( !(t1.customer_street_address <=> t2.customer_street_address)
            OR !(t1.shipping_address <=> t2.shipping_address) )
      AND t3.customer_sk IS NULL
) t1
CROSS JOIN (SELECT COALESCE(MAX(customer_sk),0) sk_max FROM customer_dim) t2;

-- SCD1 on customer_name.
-- Hive's UPDATE ... SET does not support subqueries, so the rows to change are
-- staged in a temporary table and applied as delete + insert.
-- SCD1 keeps no history, so every version of the customer is updated, not only
-- the current one.
DROP TABLE IF EXISTS tmp;

CREATE TABLE tmp AS
SELECT
    a.customer_sk,
    a.customer_number,
    b.customer_name,
    a.customer_street_address,
    a.shipping_address,
    a.version,
    a.effective_date,
    a.expiry_date
FROM customer_dim a, rds.customer b
WHERE a.customer_number = b.customer_number
  AND !(a.customer_name <=> b.customer_name);

DELETE FROM customer_dim
WHERE customer_dim.customer_sk IN (SELECT customer_sk FROM tmp);

INSERT INTO customer_dim
SELECT * FROM tmp;

-- Insert customers that are new in the source.
INSERT INTO customer_dim
SELECT
    ROW_NUMBER() OVER (ORDER BY t1.customer_number) + t2.sk_max,
    t1.customer_number,
    t1.customer_name,
    t1.customer_street_address,
    t1.shipping_address,
    1,
    ${hivevar:pre_date},
    ${hivevar:max_date}
FROM (
    SELECT t1.*
    FROM rds.customer t1
    LEFT JOIN customer_dim t2
        ON t1.customer_number = t2.customer_number
    WHERE t2.customer_sk IS NULL
) t1
CROSS JOIN (SELECT COALESCE(MAX(customer_sk),0) sk_max FROM customer_dim) t2;

-- ---------------------------------------------------------------------------
-- Load the product dimension
-- ---------------------------------------------------------------------------

-- Expire (SCD2) current rows whose product was deleted from the source or
-- whose product_name / product_category changed.
-- The comparison is NULL-safe (<=>), matching the customer load above; a plain
-- <> would miss a change from or to NULL.
UPDATE product_dim
SET expiry_date = ${hivevar:pre_date}
WHERE product_dim.product_sk IN (
    SELECT a.product_sk
    FROM (
        SELECT
            product_sk,
            product_code,
            product_name,
            product_category
        FROM product_dim
        WHERE expiry_date = ${hivevar:max_date}
    ) a
    LEFT JOIN rds.product b
        ON a.product_code = b.product_code
    WHERE b.product_code IS NULL
       OR ( !(a.product_name <=> b.product_name)
            OR !(a.product_category <=> b.product_category) )
);

-- Insert the new SCD2 version for products whose name or category changed.
INSERT INTO product_dim
SELECT
    ROW_NUMBER() OVER (ORDER BY t1.product_code) + t2.sk_max,
    t1.product_code,
    t1.product_name,
    t1.product_category,
    t1.version,
    t1.effective_date,
    t1.expiry_date
FROM (
    SELECT
        t2.product_code product_code,
        t2.product_name product_name,
        t2.product_category product_category,
        t1.version + 1 version,
        ${hivevar:pre_date} effective_date,
        ${hivevar:max_date} expiry_date
    FROM product_dim t1
    INNER JOIN rds.product t2
        ON t1.product_code = t2.product_code
       AND t1.expiry_date = ${hivevar:pre_date}
    LEFT JOIN product_dim t3
        ON t1.product_code = t3.product_code
       AND t3.expiry_date = ${hivevar:max_date}
    WHERE ( !(t1.product_name <=> t2.product_name)
            OR !(t1.product_category <=> t2.product_category) )
      AND t3.product_sk IS NULL
) t1
CROSS JOIN (SELECT COALESCE(MAX(product_sk),0) sk_max FROM product_dim) t2;

-- Insert products that are new in the source.
INSERT INTO product_dim
SELECT
    ROW_NUMBER() OVER (ORDER BY t1.product_code) + t2.sk_max,
    t1.product_code,
    t1.product_name,
    t1.product_category,
    1,
    ${hivevar:pre_date},
    ${hivevar:max_date}
FROM (
    SELECT t1.*
    FROM rds.product t1
    LEFT JOIN product_dim t2
        ON t1.product_code = t2.product_code
    WHERE t2.product_sk IS NULL
) t1
CROSS JOIN (SELECT COALESCE(MAX(product_sk),0) sk_max FROM product_dim) t2;

-- ---------------------------------------------------------------------------
-- Load product_count_fact
-- ---------------------------------------------------------------------------

-- One row per product_code: the row with the smallest date_sk among the
-- product's versions (rn = 1).
INSERT OVERWRITE TABLE product_count_fact
SELECT product_sk, date_sk
FROM (
    SELECT
        a.product_sk product_sk,
        a.product_code product_code,
        b.date_sk date_sk,
        ROW_NUMBER() OVER (PARTITION BY a.product_code ORDER BY b.date_sk) rn
    FROM product_dim a, date_dim b
    WHERE a.effective_date = b.date
) t
WHERE rn = 1;

-- ---------------------------------------------------------------------------
-- Load the sales order fact table
-- ---------------------------------------------------------------------------

-- Sales orders newly entered inside the CDC window (order_status = 'N').
-- Zip code keys are resolved from the current source customer row (k).
-- All dimension validity windows are half-open [effective_date, expiry_date),
-- so a status_date falling on a version boundary matches exactly one version.
INSERT INTO sales_order_fact
SELECT
    a.order_number,
    customer_sk,
    i.customer_zip_code_sk,
    j.shipping_zip_code_sk,
    product_sk,
    g.sales_order_attribute_sk,
    e.order_date_sk,
    h.entry_date_sk,
    null,
    null,
    null,
    null,
    null,
    null,
    null,
    null,
    f.request_delivery_date_sk,
    order_amount,
    quantity
FROM rds.sales_order a,
     customer_dim c,
     product_dim d,
     order_date_dim e,
     request_delivery_date_dim f,
     sales_order_attribute_dim g,
     entry_date_dim h,
     customer_zip_code_dim i,
     shipping_zip_code_dim j,
     rds.customer k,
     rds.cdc_time l
WHERE a.order_status = 'N'
  -- customer version valid at status_date
  AND a.customer_number = c.customer_number
  AND a.status_date >= c.effective_date
  AND a.status_date < c.expiry_date
  -- customer and shipping zip codes
  AND a.customer_number = k.customer_number
  AND k.customer_zip_code = i.customer_zip_code
  AND a.status_date >= i.effective_date
  AND a.status_date < i.expiry_date
  AND k.shipping_zip_code = j.shipping_zip_code
  AND a.status_date >= j.effective_date
  AND a.status_date < j.expiry_date
  -- product version valid at status_date
  AND a.product_code = d.product_code
  AND a.status_date >= d.effective_date
  AND a.status_date < d.expiry_date
  -- date dimensions
  AND to_date(a.status_date) = e.order_date
  AND to_date(a.entry_date) = h.entry_date
  AND to_date(a.request_delivery_date) = f.request_delivery_date
  -- sales order attribute (junk) dimension
  AND a.verification_ind = g.verification_ind
  AND a.credit_check_flag = g.credit_check_flag
  AND a.new_customer_ind = g.new_customer_ind
  AND a.web_order_flag = g.web_order_flag
  -- CDC window
  AND a.entry_date >= l.last_load
  AND a.entry_date < l.current_load;

-- ---------------------------------------------------------------------------
-- Reload the PA customer dimension
-- ---------------------------------------------------------------------------

TRUNCATE TABLE pa_customer_dim;

-- Customers having at least one sales order whose customer zip code is in PA.
INSERT INTO pa_customer_dim
SELECT DISTINCT a.*
FROM customer_dim a, sales_order_fact b, customer_zip_code_dim c
WHERE c.customer_state = 'PA'
  AND b.customer_zip_code_sk = c.customer_zip_code_sk
  AND a.customer_sk = b.customer_sk;

-- ---------------------------------------------------------------------------
-- Process the four milestone states: allocate (A), packing (P), ship (S) and
-- receive (R). Each step stages the updated fact rows in tmp and applies them
-- as delete + insert.
-- ---------------------------------------------------------------------------

-- Allocate ('A')
DROP TABLE IF EXISTS tmp;

CREATE TABLE tmp AS
SELECT
    t0.order_number order_number,
    t0.customer_sk customer_sk,
    t0.customer_zip_code_sk,
    t0.shipping_zip_code_sk,
    t0.product_sk product_sk,
    t0.sales_order_attribute_sk,
    t0.order_date_sk order_date_sk,
    t0.entry_date_sk entry_date_sk,
    t2.allocate_date_sk allocate_date_sk,
    t1.quantity allocate_quantity,
    t0.packing_date_sk packing_date_sk,
    t0.packing_quantity packing_quantity,
    t0.ship_date_sk ship_date_sk,
    t0.ship_quantity ship_quantity,
    t0.receive_date_sk receive_date_sk,
    t0.receive_quantity receive_quantity,
    t0.request_delivery_date_sk request_delivery_date_sk,
    t0.order_amount order_amount,
    t0.order_quantity order_quantity
FROM sales_order_fact t0,
     rds.sales_order t1,
     allocate_date_dim t2,
     rds.cdc_time t4
WHERE t0.order_number = t1.order_number
  AND t1.order_status = 'A'
  AND to_date(t1.status_date) = t2.allocate_date
  AND t1.entry_date >= t4.last_load
  AND t1.entry_date < t4.current_load;

DELETE FROM sales_order_fact
WHERE sales_order_fact.order_number IN (SELECT order_number FROM tmp);

INSERT INTO sales_order_fact
SELECT * FROM tmp;

-- Packing ('P')
DROP TABLE IF EXISTS tmp;

CREATE TABLE tmp AS
SELECT
    t0.order_number order_number,
    t0.customer_sk customer_sk,
    t0.customer_zip_code_sk,
    t0.shipping_zip_code_sk,
    t0.product_sk product_sk,
    t0.sales_order_attribute_sk,
    t0.order_date_sk order_date_sk,
    t0.entry_date_sk entry_date_sk,
    t0.allocate_date_sk allocate_date_sk,
    t0.allocate_quantity allocate_quantity,
    t2.packing_date_sk packing_date_sk,
    t1.quantity packing_quantity,
    t0.ship_date_sk ship_date_sk,
    t0.ship_quantity ship_quantity,
    t0.receive_date_sk receive_date_sk,
    t0.receive_quantity receive_quantity,
    t0.request_delivery_date_sk request_delivery_date_sk,
    t0.order_amount order_amount,
    t0.order_quantity order_quantity
FROM sales_order_fact t0,
     rds.sales_order t1,
     packing_date_dim t2,
     rds.cdc_time t4
WHERE t0.order_number = t1.order_number
  AND t1.order_status = 'P'
  AND to_date(t1.status_date) = t2.packing_date
  AND t1.entry_date >= t4.last_load
  AND t1.entry_date < t4.current_load;

DELETE FROM sales_order_fact
WHERE sales_order_fact.order_number IN (SELECT order_number FROM tmp);

INSERT INTO sales_order_fact
SELECT * FROM tmp;

-- Ship ('S')
DROP TABLE IF EXISTS tmp;

CREATE TABLE tmp AS
SELECT
    t0.order_number order_number,
    t0.customer_sk customer_sk,
    t0.customer_zip_code_sk,
    t0.shipping_zip_code_sk,
    t0.product_sk product_sk,
    t0.sales_order_attribute_sk,
    t0.order_date_sk order_date_sk,
    t0.entry_date_sk entry_date_sk,
    t0.allocate_date_sk allocate_date_sk,
    t0.allocate_quantity allocate_quantity,
    t0.packing_date_sk packing_date_sk,
    t0.packing_quantity packing_quantity,
    t2.ship_date_sk ship_date_sk,
    t1.quantity ship_quantity,
    t0.receive_date_sk receive_date_sk,
    t0.receive_quantity receive_quantity,
    t0.request_delivery_date_sk request_delivery_date_sk,
    t0.order_amount order_amount,
    t0.order_quantity order_quantity
FROM sales_order_fact t0,
     rds.sales_order t1,
     ship_date_dim t2,
     rds.cdc_time t4
WHERE t0.order_number = t1.order_number
  AND t1.order_status = 'S'
  AND to_date(t1.status_date) = t2.ship_date
  AND t1.entry_date >= t4.last_load
  AND t1.entry_date < t4.current_load;

DELETE FROM sales_order_fact
WHERE sales_order_fact.order_number IN (SELECT order_number FROM tmp);

INSERT INTO sales_order_fact
SELECT * FROM tmp;

-- Receive ('R')
DROP TABLE IF EXISTS tmp;

CREATE TABLE tmp AS
SELECT
    t0.order_number order_number,
    t0.customer_sk customer_sk,
    t0.customer_zip_code_sk,
    t0.shipping_zip_code_sk,
    t0.product_sk product_sk,
    t0.sales_order_attribute_sk,
    t0.order_date_sk order_date_sk,
    t0.entry_date_sk entry_date_sk,
    t0.allocate_date_sk allocate_date_sk,
    t0.allocate_quantity allocate_quantity,
    t0.packing_date_sk packing_date_sk,
    t0.packing_quantity packing_quantity,
    t0.ship_date_sk ship_date_sk,
    t0.ship_quantity ship_quantity,
    t2.receive_date_sk receive_date_sk,
    t1.quantity receive_quantity,
    t0.request_delivery_date_sk request_delivery_date_sk,
    t0.order_amount order_amount,
    t0.order_quantity order_quantity
FROM sales_order_fact t0,
     rds.sales_order t1,
     receive_date_dim t2,
     rds.cdc_time t4
WHERE t0.order_number = t1.order_number
  AND t1.order_status = 'R'
  AND to_date(t1.status_date) = t2.receive_date
  AND t1.entry_date >= t4.last_load
  AND t1.entry_date < t4.current_load;

DELETE FROM sales_order_fact
WHERE sales_order_fact.order_number IN (SELECT order_number FROM tmp);

INSERT INTO sales_order_fact
SELECT * FROM tmp;

-- Advance the CDC window: last_load takes the value of current_load.
INSERT OVERWRITE TABLE rds.cdc_time
SELECT current_load, current_load
FROM rds.cdc_time;
-- Test data (MySQL source database): change the addresses of customer 4 and
-- add customer 15.
UPDATE source.customer
SET
    customer_street_address = '9999 Louise Dr.',
    customer_zip_code       = 17055,
    customer_city           = 'Pittsburgh',
    shipping_address        = '9999 Louise Dr.',
    shipping_zip_code       = 17055,
    shipping_city           = 'Pittsburgh'
WHERE customer_number = 4;

INSERT INTO source.customer
VALUES (
    15,
    'Super Stores',
    '1000 Woodland St.',
    17055,
    'Pittsburgh',
    'PA',
    '1000 Woodland St.',
    17055,
    'Pittsburgh',
    'PA'
);

COMMIT;
现在在装载新的客户数据前查询最后的客户和送货邮编。后面可以用改变后的信息和此查询的输出作对比。查询语句如下。
-- Customer and shipping zip codes currently attached to each sales order fact;
-- run before the load so the output can be compared afterwards.
USE dw;

SELECT
    c.order_date_sk     AS odsk,
    d.customer_number   AS cn,
    a.customer_zip_code AS czc,
    b.shipping_zip_code AS szc
FROM sales_order_fact c
INNER JOIN customer_zip_code_dim a
    ON a.customer_zip_code_sk = c.customer_zip_code_sk
INNER JOIN shipping_zip_code_dim b
    ON b.shipping_zip_code_sk = c.shipping_zip_code_sk
INNER JOIN customer_dim d
    ON d.customer_sk = c.customer_sk;
然后使用下面的语句新增两条销售订单。
-- Test data (MySQL source database): two new sales orders with random
-- timestamps on 2016-08-08, random amounts and random quantities.

-- Order 144 (customer 4, product 3): timestamp in the first half of the day.
SET @order_date := from_unixtime(
    unix_timestamp('2016-08-08 00:00:01')
    + rand() * (unix_timestamp('2016-08-08 12:00:00') - unix_timestamp('2016-08-08 00:00:01'))
);
SET @amount := floor(1000 + rand() * 9000);
SET @quantity := floor(10 + rand() * 90);

INSERT INTO source.sales_order
VALUES (null, 144, 4, 3, 'Y', 'Y', 'Y', 'N', @order_date, 'N', '2016-08-10', @order_date, @amount, @quantity);

-- Order 145 (customer 15, product 4): timestamp in the second half of the day.
SET @order_date := from_unixtime(
    unix_timestamp('2016-08-08 12:00:00')
    + rand() * (unix_timestamp('2016-08-09 00:00:00') - unix_timestamp('2016-08-08 12:00:00'))
);
SET @amount := floor(1000 + rand() * 9000);
SET @quantity := floor(10 + rand() * 90);

INSERT INTO source.sales_order
VALUES (null, 145, 15, 4, 'Y', 'N', 'Y', 'N', @order_date, 'N', '2016-08-10', @order_date, @amount, @quantity);

COMMIT;
使用下面的SQL命令修改时间窗口。
-- Overwrite every row of rds.cdc_time with the pair ('2016-08-08', '2016-08-08').
INSERT OVERWRITE TABLE rds.cdc_time
SELECT
    '2016-08-08',
    '2016-08-08'
FROM rds.cdc_time;
执行下面的命令定期装载。
./regular_etl.sh
查询customer_dim表,确认编号为4的已修改客户和编号为15的新增客户均已正确装载。
-- All dimension versions of customers 4 and 15.
SELECT
    customer_sk             AS csk,
    customer_number         AS cnum,
    customer_name           AS cnam,
    customer_street_address AS csd,
    shipping_address        AS sd,
    version,
    effective_date,
    expiry_date
FROM dw.customer_dim
WHERE customer_number IN (4, 15);
查询结果如下图所示。
-- Fact rows of orders 144 and 145 with their customer, zip code, product and
-- date dimension values.
USE dw;

SELECT
    a.order_number      AS onum,
    f.customer_number   AS cnum,
    b.customer_zip_code AS czc,
    c.shipping_zip_code AS szc,
    g.product_code      AS pc,
    d.order_date        AS od,
    e.entry_date        AS ed,
    a.order_amount,
    a.order_quantity
FROM sales_order_fact a
INNER JOIN customer_dim f
    ON a.customer_sk = f.customer_sk
INNER JOIN product_dim g
    ON a.product_sk = g.product_sk
INNER JOIN customer_zip_code_dim b
    ON a.customer_zip_code_sk = b.customer_zip_code_sk
INNER JOIN shipping_zip_code_dim c
    ON a.shipping_zip_code_sk = c.shipping_zip_code_sk
INNER JOIN order_date_dim d
    ON a.order_date_sk = d.order_date_sk
INNER JOIN entry_date_dim e
    ON a.entry_date_sk = e.entry_date_sk
WHERE a.order_number IN (144, 145);
查询结果如下图所示。
-- Full contents of the PA customer dimension.
SELECT
    customer_sk             AS csk,
    customer_number         AS cnum,
    customer_name           AS cnam,
    customer_street_address AS csa,
    shipping_address        AS sad,
    version,
    effective_date,
    expiry_date
FROM dw.pa_customer_dim;
查询结果如下图所示。
-- Daily production ETL script (HiveQL): maintains factory_dim (SCD1) and loads
-- production_fact for the previous day.
-- Every statement and comment sits on its own line so that a "--" comment can
-- no longer swallow the statements that follow it.

-- Set up the environment and the time window.
-- NOTE(review): set_time.sql is expected to define hivevar:pre_date and
-- hivevar:max_date used below - confirm against that script.
!run /root/set_time.sql

-- Factory information rarely changes and its history is not needed, so SCD1 is
-- used: stage the rows whose name or street address differs from the source
-- and apply them as delete + insert.
DROP TABLE IF EXISTS tmp;

CREATE TABLE tmp AS
SELECT
    a.factory_sk,
    a.factory_code,
    b.factory_name,
    b.factory_street_address,
    a.version,
    a.effective_date,
    a.expiry_date
FROM factory_dim a, rds.factory_master b
WHERE a.factory_code = b.factory_code
  AND !(a.factory_name <=> b.factory_name
        AND a.factory_street_address <=> b.factory_street_address);

DELETE FROM factory_dim
WHERE factory_dim.factory_sk IN (SELECT factory_sk FROM tmp);

INSERT INTO factory_dim
SELECT * FROM tmp;

-- Insert factories that are new in the source.
INSERT INTO factory_dim
SELECT
    ROW_NUMBER() OVER (ORDER BY t1.factory_code) + t2.sk_max,
    t1.factory_code,
    t1.factory_name,
    t1.factory_street_address,
    1,
    ${hivevar:pre_date},
    ${hivevar:max_date}
FROM (
    SELECT t1.*
    FROM rds.factory_master t1
    LEFT JOIN factory_dim t2
        ON t1.factory_code = t2.factory_code
    WHERE t2.factory_sk IS NULL
) t1
CROSS JOIN (SELECT COALESCE(MAX(factory_sk),0) sk_max FROM factory_dim) t2;

-- Load the daily production fact table.
-- The product_dim validity window is half-open [effective_date, expiry_date).
-- With a closed upper bound (<=), a product whose expired version has
-- expiry_date = pre_date and whose new version has effective_date = pre_date
-- would match both versions for production_date = pre_date and the fact row
-- would be inserted twice.
INSERT INTO production_fact
SELECT
    b.product_sk,
    c.date_sk,
    d.factory_sk,
    e.factory_zip_code_sk,
    production_quantity
FROM rds.daily_production a,
     product_dim b,
     date_dim c,
     factory_dim d,
     factory_zip_code_dim e,
     rds.factory_master f
WHERE production_date = ${hivevar:pre_date}
  -- product version valid at production_date
  AND a.product_code = b.product_code
  AND a.production_date >= b.effective_date
  AND a.production_date < b.expiry_date
  -- factory zip code, resolved through the source factory row
  AND a.factory_code = f.factory_code
  AND f.factory_zip_code = e.factory_zip_code
  AND a.production_date >= e.effective_date
  AND a.production_date < e.expiry_date
  -- date and factory dimensions
  AND a.production_date = c.date
  AND a.factory_code = d.factory_code;
5. 测试修改后的产品定期装载
-- Test data (MySQL source database): add factory 5.
INSERT INTO source.factory_master
VALUES (
    5,
    'Fifth Factory',
    '90909 McNicholds Blvd.',
    17055,
    'Pittsburgh',
    'PA'
);

COMMIT;
向daily_production表里添加三个日常产品记录。
-- Test data (MySQL source database): three daily production rows dated
-- 2016-08-08.
INSERT INTO source.daily_production
VALUES
    (1, '2016-08-08', 3, 400),
    (3, '2016-08-08', 4, 200),
    (5, '2016-08-08', 5, 100);

COMMIT;
修改时间窗口。
-- Overwrite every row of rds.cdc_time with the pair ('2016-08-08', '2016-08-08').
INSERT OVERWRITE TABLE rds.cdc_time
SELECT
    '2016-08-08',
    '2016-08-08'
FROM rds.cdc_time;
执行产品定期装载。
./regular_etl_daily_production.sh
查询factory_dim表,确认导入是正确的。
-- Full contents of the factory dimension.
SELECT
    factory_sk,
    factory_code,
    factory_name,
    factory_street_address,
    version,
    effective_date,
    expiry_date
FROM dw.factory_dim;
查询结果如下图所示。
-- Production facts with their product, date, factory and factory zip code
-- dimension values.
USE dw;

SELECT
    e.product_code        AS pc,
    b.date,
    c.factory_code        AS fc,
    d.factory_zip_code    AS fzc,
    a.production_quantity AS qty
FROM production_fact a
INNER JOIN product_dim e
    ON a.product_sk = e.product_sk
INNER JOIN date_dim b
    ON a.production_date_sk = b.date_sk
INNER JOIN factory_dim c
    ON a.factory_sk = c.factory_sk
INNER JOIN factory_zip_code_dim d
    ON a.factory_zip_code_sk = d.factory_zip_code_sk;
查询结果如下图所示。