USE dw; CREATE TABLE month_dim ( month_sk INT comment 'surrogate key', month tinyint comment 'month', month_name varchar(9) comment 'month name', quarter tinyint comment 'quarter', year smallint comment 'year' ) comment 'month dimension table' clustered by (month_sk) into 8 buckets stored as orc tblproperties ('transactional'='true') ;为了从日期维度同步导入月份维度,要把月份装载嵌入到日期维度的预装载脚本中。需要修改“建立数据仓库示例模型”里生成日期维度数据的脚本。下图显示了修改后的date_dim_generate.sh文件内容。
修改后的create_table_date_dim.sql文件内容如下:
-- 设置变量以支持事务 set hive.support.concurrency=true; set hive.exec.dynamic.partition.mode=nonstrict; set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; set hive.compactor.initiator.on=true; set hive.compactor.worker.threads=1; use dw; -- 首次执行会建立日期维度临时表 create table if not exists date_dim_tmp ( date date comment 'date,yyyy-mm-dd', month tinyint comment 'month', month_name varchar(9) comment 'month name', quarter tinyint comment 'quarter', year smallint comment 'year', promo_ind char(1) comment 'promotion index' ) comment 'date dimension table' row format delimited fields terminated by ',' stored as textfile; -- 首次执行会建立日期维度表 create table if not exists date_dim ( date_sk int comment 'surrogate key', date date comment 'date,yyyy-mm-dd', month tinyint comment 'month', month_name varchar(9) comment 'month name', quarter tinyint comment 'quarter', year smallint comment 'year', promo_ind char(1) comment 'promotion index' ) comment 'date dimension table' clustered by (date_sk) into 8 buckets stored as orc tblproperties ('transactional'='true'); -- 首次执行会建立月份维度表 create table if not exists month_dim ( month_sk INT comment 'surrogate key', month tinyint comment 'month', month_name varchar(9) comment 'month name', quarter tinyint comment 'quarter', year smallint comment 'year' ) comment 'month dimension table' clustered by (month_sk) into 8 buckets stored as orc tblproperties ('transactional'='true') ;
-- 设置变量以支持事务 set hive.support.concurrency=true; set hive.exec.dynamic.partition.mode=nonstrict; set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; set hive.compactor.initiator.on=true; set hive.compactor.worker.threads=1; use dw; -- 向日期维度表追加数据 insert into date_dim select row_number() over (order by date) + t2.sk_max, t1.date, t1.month, t1.month_name, t1.quarter, t1.year, t1.promo_ind from (select * from date_dim_tmp) t1 cross join (select coalesce(max(date_sk),0) sk_max from date_dim) t2; -- 向月份维度表追加数据 insert into month_dim select row_number() over (order by t1.year,t1.month) + t2.sk_max, t1.month, t1.month_name, t1.quarter, t1.year from (select distinct month, month_name, quarter, year FROM date_dim_tmp) t1 cross join (select coalesce(max(month_sk),0) sk_max from month_dim) t2;对预装载主要做了以下三点修改:
use dw; drop table date_dim_tmp; drop table date_dim; drop table month_dim;(2)执行预装载,生成从2000年1月1日到2010年12月31日的日期数据
./date_dim_generate.sh 2000-01-01 2010-12-31这次执行维度表都是新建的,生成的日期和月份维度数据如下图所示。
./date_dim_generate.sh 2011-01-01 2020-12-31这次执行是向已有的维度表中追加日期,生成的日期和月份维度数据如下图所示。
./on_demand.sh2. 建立PA客户维度
USE dw; CREATE TABLE pa_customer_dim ( customer_sk int comment 'surrogate key', customer_number int comment 'number', customer_name varchar(50) comment 'name', customer_street_address varchar(50) comment 'address', customer_zip_code int comment 'zipcode', customer_city varchar(30) comment 'city', customer_state varchar(2) comment 'state', shipping_address varchar(50) comment 'shipping_address', shipping_zip_code int comment 'shipping_zip_code', shipping_city varchar(30) comment 'shipping_city', shipping_state varchar(2) comment 'shipping_state', version int comment 'version', effective_date date comment 'effective date', expiry_date date comment 'expiry date' ) CLUSTERED BY (customer_sk) INTO 8 BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true');注意,PA客户维度子集与月份维度子集有两点区别:
-- 设置变量以支持事务 set hive.support.concurrency=true; set hive.exec.dynamic.partition.mode=nonstrict; set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; set hive.compactor.initiator.on=true; set hive.compactor.worker.threads=1; USE dw; -- 设置SCD的生效时间和过期时间 SET hivevar:cur_date = CURRENT_DATE(); SET hivevar:pre_date = DATE_ADD(${hivevar:cur_date},-1); SET hivevar:max_date = CAST('2200-01-01' AS DATE); -- 设置CDC的上限时间 INSERT OVERWRITE TABLE rds.cdc_time SELECT last_load, ${hivevar:cur_date} FROM rds.cdc_time; -- 装载customer维度 -- 设置已删除记录和地址相关列上SCD2的过期,用<=>运算符处理NULL值。 UPDATE customer_dim SET expiry_date = ${hivevar:pre_date} WHERE customer_dim.customer_sk IN (SELECT a.customer_sk FROM (SELECT customer_sk, customer_number, customer_street_address, customer_zip_code, customer_city, customer_state, shipping_address, shipping_zip_code, shipping_city, shipping_state FROM customer_dim WHERE expiry_date = ${hivevar:max_date}) a LEFT JOIN rds.customer b ON a.customer_number = b.customer_number WHERE b.customer_number IS NULL OR ( !(a.customer_street_address <=> b.customer_street_address) OR !(a.customer_zip_code <=> b.customer_zip_code) OR !(a.customer_city <=> b.customer_city) OR !(a.customer_state <=> b.customer_state) OR !(a.shipping_address <=> b.shipping_address) OR !(a.shipping_zip_code <=> b.shipping_zip_code) OR !(a.shipping_city <=> b.shipping_city) OR !(a.shipping_state <=> b.shipping_state) )); -- 处理customer_street_addresses列上SCD2的新增行 INSERT INTO customer_dim SELECT ROW_NUMBER() OVER (ORDER BY t1.customer_number) + t2.sk_max, t1.customer_number, t1.customer_name, t1.customer_street_address, t1.customer_zip_code, t1.customer_city, t1.customer_state, t1.shipping_address, t1.shipping_zip_code, t1.shipping_city, t1.shipping_state, t1.version, t1.effective_date, t1.expiry_date FROM ( SELECT t2.customer_number customer_number, t2.customer_name customer_name, t2.customer_street_address customer_street_address, t2.customer_zip_code customer_zip_code, t2.customer_city customer_city, t2.customer_state customer_state, t2.shipping_address shipping_address, t2.shipping_zip_code shipping_zip_code, t2.shipping_city shipping_city, t2.shipping_state shipping_state, t1.version + 1 version, ${hivevar:pre_date} effective_date, ${hivevar:max_date} expiry_date FROM customer_dim t1 INNER JOIN rds.customer t2 ON t1.customer_number = t2.customer_number AND t1.expiry_date = ${hivevar:pre_date} LEFT JOIN customer_dim t3 ON t1.customer_number = t3.customer_number AND t3.expiry_date = ${hivevar:max_date} WHERE (!(t1.customer_street_address <=> t2.customer_street_address) OR !(t1.customer_zip_code <=> t2.customer_zip_code) OR !(t1.customer_city <=> t2.customer_city) OR !(t1.customer_state <=> t2.customer_state) OR !(t1.shipping_address <=> t2.shipping_address) OR !(t1.shipping_zip_code <=> t2.shipping_zip_code) OR !(t1.shipping_city <=> t2.shipping_city) OR !(t1.shipping_state <=> t2.shipping_state) ) AND t3.customer_sk IS NULL) t1 CROSS JOIN (SELECT COALESCE(MAX(customer_sk),0) sk_max FROM customer_dim) t2; -- 处理customer_name列上的SCD1 -- 因为hive的update的set子句还不支持子查询,所以这里使用了一个临时表存储需要更新的记录,用先delete再insert代替update -- 因为SCD1本身就不保存历史数据,所以这里更新维度表里的所有customer_name改变的记录,而不是仅仅更新当前版本的记录 DROP TABLE IF EXISTS tmp; CREATE TABLE tmp AS SELECT a.customer_sk, a.customer_number, b.customer_name, a.customer_street_address, a.customer_zip_code, a.customer_city, a.customer_state, a.shipping_address, a.shipping_zip_code, a.shipping_city, a.shipping_state, a.version, a.effective_date, a.expiry_date FROM customer_dim a, rds.customer b WHERE a.customer_number = b.customer_number AND !(a.customer_name <=> b.customer_name); DELETE FROM customer_dim WHERE customer_dim.customer_sk IN (SELECT customer_sk FROM tmp); INSERT INTO customer_dim SELECT * FROM tmp; -- 处理新增的customer记录 INSERT INTO customer_dim SELECT ROW_NUMBER() OVER (ORDER BY t1.customer_number) + t2.sk_max, t1.customer_number, t1.customer_name, t1.customer_street_address, t1.customer_zip_code, t1.customer_city, t1.customer_state, t1.shipping_address, t1.shipping_zip_code, t1.shipping_city, t1.shipping_state, 1, ${hivevar:pre_date}, ${hivevar:max_date} FROM ( SELECT t1.* FROM rds.customer t1 LEFT JOIN customer_dim t2 ON t1.customer_number = t2.customer_number WHERE t2.customer_sk IS NULL) t1 CROSS JOIN (SELECT COALESCE(MAX(customer_sk),0) sk_max FROM customer_dim) t2; -- 重载PA客户维度 TRUNCATE TABLE pa_customer_dim; INSERT INTO pa_customer_dim SELECT customer_sk , customer_number , customer_name , customer_street_address , customer_zip_code , customer_city , customer_state , shipping_address , shipping_zip_code , shipping_city , shipping_state , version , effective_date , expiry_date FROM customer_dim WHERE customer_state = 'PA' ; -- 装载product维度 -- 设置已删除记录和product_name、product_category列上SCD2的过期 UPDATE product_dim SET expiry_date = ${hivevar:pre_date} WHERE product_dim.product_sk IN (SELECT a.product_sk FROM (SELECT product_sk,product_code,product_name,product_category FROM product_dim WHERE expiry_date = ${hivevar:max_date}) a LEFT JOIN rds.product b ON a.product_code = b.product_code WHERE b.product_code IS NULL OR (a.product_name <> b.product_name OR a.product_category <> b.product_category)); -- 处理product_name、product_category列上SCD2的新增行 INSERT INTO product_dim SELECT ROW_NUMBER() OVER (ORDER BY t1.product_code) + t2.sk_max, t1.product_code, t1.product_name, t1.product_category, t1.version, t1.effective_date, t1.expiry_date FROM ( SELECT t2.product_code product_code, t2.product_name product_name, t2.product_category product_category, t1.version + 1 version, ${hivevar:pre_date} effective_date, ${hivevar:max_date} expiry_date FROM product_dim t1 INNER JOIN rds.product t2 ON t1.product_code = t2.product_code AND t1.expiry_date = ${hivevar:pre_date} LEFT JOIN product_dim t3 ON t1.product_code = t3.product_code AND t3.expiry_date = ${hivevar:max_date} WHERE (t1.product_name <> t2.product_name OR t1.product_category <> t2.product_category) AND t3.product_sk IS NULL) t1 CROSS JOIN (SELECT COALESCE(MAX(product_sk),0) sk_max FROM product_dim) t2; -- 处理新增的product记录 INSERT INTO product_dim SELECT ROW_NUMBER() OVER (ORDER BY t1.product_code) + t2.sk_max, t1.product_code, t1.product_name, t1.product_category, 1, ${hivevar:pre_date}, ${hivevar:max_date} FROM ( SELECT t1.* FROM rds.product t1 LEFT JOIN product_dim t2 ON t1.product_code = t2.product_code WHERE t2.product_sk IS NULL) t1 CROSS JOIN (SELECT COALESCE(MAX(product_sk),0) sk_max FROM product_dim) t2; -- 装载order维度 INSERT INTO order_dim SELECT ROW_NUMBER() OVER (ORDER BY t1.order_number) + t2.sk_max, t1.order_number, t1.version, t1.effective_date, t1.expiry_date FROM ( SELECT order_number order_number, 1 version, order_date effective_date, '2200-01-01' expiry_date FROM rds.sales_order, rds.cdc_time WHERE entry_date >= last_load AND entry_date < current_load ) t1 CROSS JOIN (SELECT COALESCE(MAX(order_sk),0) sk_max FROM order_dim) t2; -- 装载销售订单事实表 INSERT INTO sales_order_fact SELECT order_sk, customer_sk, product_sk, date_sk, order_amount, order_quantity FROM rds.sales_order a, order_dim b, customer_dim c, product_dim d, date_dim e, rds.cdc_time f WHERE a.order_number = b.order_number AND a.customer_number = c.customer_number AND a.order_date >= c.effective_date AND a.order_date < c.expiry_date AND a.product_code = d.product_code AND a.order_date >= d.effective_date AND a.order_date < d.expiry_date AND to_date(a.order_date) = e.date AND a.entry_date >= f.last_load AND a.entry_date < f.current_load ; -- 更新时间戳表的last_load字段 INSERT OVERWRITE TABLE rds.cdc_time SELECT current_load, current_load FROM rds.cdc_time;4. 测试
use source; insert into customer ( customer_name, customer_street_address, customer_zip_code, customer_city, customer_state, shipping_address, shipping_zip_code, shipping_city, shipping_state) values ('PA Customer', '1111 Louise Dr.', '17050', 'Mechanicsburg', 'PA', '1111 Louise Dr.', '17050', 'Mechanicsburg', 'PA'), ('Bigger Customers', '7777 Ridge Rd.', '44102', 'Cleveland', 'OH', '7777 Ridge Rd.', '44102', 'Cleveland', 'OH'), ('Smaller Stores', '8888 Jennings Fwy.', '44102', 'Cleveland', 'OH', '8888 Jennings Fwy.', '44102', 'Cleveland', 'OH'), ('Small-Medium Retailers', '9999 Memphis Ave.', '44102', 'Cleveland', 'OH', '9999 Memphis Ave.', '44102', 'Cleveland', 'OH'), ('OH Customer', '6666 Ridge Rd.', '44102', 'Cleveland', 'OH', '6666 Ridge Rd.', '44102','Cleveland', 'OH') ; commit;(2)执行定期装载并查看结果。
./regular_etl.sh使用下面的查询验证结果。
use dw; select customer_name, customer_state, effective_date, expiry_date from pa_customer_dim;结果是pa_customer_dim表增加了19条记录,如下图所示。