USE dw;

-- Zip code dimension table.
-- zip_code_sk is the auto-increment surrogate key; version, effective_date
-- and expiry_date fall back to their column defaults when not supplied.
CREATE TABLE zip_code_dim (
    zip_code_sk    INT         NOT NULL AUTO_INCREMENT,
    zip_code       INT(5),
    city           VARCHAR(30),
    state          VARCHAR(2),
    version        INT         DEFAULT 1,
    effective_date DATE        DEFAULT '1900-01-01',
    expiry_date    DATE        DEFAULT '2200-01-01',
    PRIMARY KEY (zip_code_sk)
);
-- Initial load of the zip code rows. Only zip_code, city and state are given;
-- the surrogate key is generated and the remaining columns use their defaults.
INSERT INTO zip_code_dim (zip_code, city, state)
VALUES
    (17050, 'PITTSBURGH', 'PA'),
    (17051, 'MC VEYTOWN', 'PA'),
    (17052, 'MAPLETON DEPOT', 'PA'),
    (17053, 'MARYSVILLE', 'PA'),
    (17054, 'MATTAWANA', 'PA'),
    (17055, 'MECHANICSBURG', 'PA'),
    (44102, 'CLEVELAND', 'OH');

COMMIT;
-- View over zip_code_dim that exposes the key, zip code, city and state
-- under customer_* column names; the versioning columns keep their names.
CREATE VIEW customer_zip_code_dim AS
SELECT
    z.zip_code_sk AS customer_zip_code_sk,
    z.zip_code    AS customer_zip_code,
    z.city        AS customer_city,
    z.state       AS customer_state,
    z.version,
    z.effective_date,
    z.expiry_date
FROM zip_code_dim AS z;
-- View over zip_code_dim that exposes the key, zip code, city and state
-- under shipping_* column names; the versioning columns keep their names.
CREATE VIEW shipping_zip_code_dim AS
SELECT
    z.zip_code_sk AS shipping_zip_code_sk,
    z.zip_code    AS shipping_zip_code,
    z.city        AS shipping_city,
    z.state       AS shipping_state,
    z.version,
    z.effective_date,
    z.expiry_date
FROM zip_code_dim AS z;
-- Add the customer and shipping zip code surrogate key columns to the fact
-- table, placed directly after customer_sk.
ALTER TABLE sales_order_fact
    ADD COLUMN customer_zip_code_sk INT AFTER customer_sk,
    ADD COLUMN shipping_zip_code_sk INT AFTER customer_zip_code_sk;

-- Both new columns reference the surrogate key of zip_code_dim.
ALTER TABLE sales_order_fact
    ADD FOREIGN KEY (customer_zip_code_sk) REFERENCES zip_code_dim (zip_code_sk),
    ADD FOREIGN KEY (shipping_zip_code_sk) REFERENCES zip_code_dim (zip_code_sk);
-- Back-fill the two zip code surrogate keys on existing fact rows.
-- Each fact row is tied to its customer_dim row through customer_sk, and the
-- zip code stored on that customer row is looked up in the matching view.

-- Customer zip code key.
UPDATE sales_order_fact AS fact
INNER JOIN customer_dim AS cust
    ON cust.customer_sk = fact.customer_sk
INNER JOIN customer_zip_code_dim AS zip
    ON zip.customer_zip_code = cust.customer_zip_code
SET fact.customer_zip_code_sk = zip.customer_zip_code_sk;

-- Shipping zip code key.
UPDATE sales_order_fact AS fact
INNER JOIN customer_dim AS cust
    ON cust.customer_sk = fact.customer_sk
INNER JOIN shipping_zip_code_dim AS zip
    ON zip.shipping_zip_code = cust.shipping_zip_code
SET fact.shipping_zip_code_sk = zip.shipping_zip_code_sk;

COMMIT;
-- Remove the zip code / city / state columns from customer_dim.
-- Must run after the fact-table back-fill above, which still reads
-- customer_zip_code and shipping_zip_code from this table.
ALTER TABLE customer_dim
    DROP COLUMN customer_zip_code,
    DROP COLUMN customer_city,
    DROP COLUMN customer_state,
    DROP COLUMN shipping_zip_code,
    DROP COLUMN shipping_city,
    DROP COLUMN shipping_state;
-- Remove the same zip code / city / state columns from pa_customer_dim.
ALTER TABLE pa_customer_dim
    DROP COLUMN customer_zip_code,
    DROP COLUMN customer_city,
    DROP COLUMN customer_state,
    DROP COLUMN shipping_zip_code,
    DROP COLUMN shipping_city,
    DROP COLUMN shipping_state;
-- View over zip_code_dim that exposes the key, zip code, city and state
-- under factory_* column names; the versioning columns keep their names.
CREATE VIEW factory_zip_code_dim AS
SELECT
    z.zip_code_sk AS factory_zip_code_sk,
    z.zip_code    AS factory_zip_code,
    z.city        AS factory_city,
    z.state       AS factory_state,
    z.version,
    z.effective_date,
    z.expiry_date
FROM zip_code_dim AS z;
-- Add the factory_zip_code_sk column to production_fact, right after factory_sk.
ALTER TABLE production_fact
    ADD COLUMN factory_zip_code_sk INT AFTER factory_sk;

-- The new column references the surrogate key of zip_code_dim.
ALTER TABLE production_fact
    ADD FOREIGN KEY (factory_zip_code_sk) REFERENCES zip_code_dim (zip_code_sk);

-- Back-fill the new key on existing rows: fact row -> factory_dim row (by
-- factory_sk) -> zip code row (by the zip code stored on the factory).
UPDATE production_fact AS fact
INNER JOIN factory_dim AS fac
    ON fac.factory_sk = fact.factory_sk
INNER JOIN factory_zip_code_dim AS zip
    ON zip.factory_zip_code = fac.factory_zip_code
SET fact.factory_zip_code_sk = zip.factory_zip_code_sk;

COMMIT;
-- Make factory_code the primary key of factory_stg and seed the staging table
-- with the factory data currently held in factory_dim. The periodic load needs
-- the staging table to hold complete factory data (including zip code, city
-- and state), and the primary key is what lets staged rows be maintained per
-- factory.
TRUNCATE TABLE factory_stg;

ALTER TABLE factory_stg
    ADD PRIMARY KEY (factory_code);

-- Explicit target columns: the copy no longer depends on the physical column
-- order of factory_stg.
INSERT INTO factory_stg (
    factory_code,
    factory_name,
    factory_street_address,
    factory_zip_code,
    factory_city,
    factory_state
)
SELECT
    fac.factory_code,
    fac.factory_name,
    fac.factory_street_address,
    fac.factory_zip_code,
    fac.factory_city,
    fac.factory_state
FROM factory_dim AS fac;

COMMIT;
-- Remove the zip code, city and state columns from factory_dim.
-- Must run after factory_stg has been seeded from these columns.
ALTER TABLE factory_dim
    DROP COLUMN factory_zip_code,
    DROP COLUMN factory_city,
    DROP COLUMN factory_state;
USE dw;

-- @pre_date is yesterday; the dimension loads below use it both as the expiry
-- date of superseded rows and as the effective date of new rows.
SET @pre_date = DATE_SUB(CURRENT_DATE, INTERVAL 1 DAY);

-- Upper bound of the CDC window. Intentionally no WHERE clause: every row of
-- cdc_time is updated.
UPDATE cdc_time
SET current_load = CURRENT_DATE;

-- Refresh the customer staging table with a full copy of source.customer.
TRUNCATE TABLE customer_stg;

INSERT INTO customer_stg
SELECT
    src.customer_number,
    src.customer_name,
    src.customer_street_address,
    src.customer_zip_code,
    src.customer_city,
    src.customer_state,
    src.shipping_address,
    src.shipping_zip_code,
    src.shipping_city,
    src.shipping_state
FROM source.customer AS src;
-- SCD2 on the customer address columns (customer_street_address and
-- shipping_address).
--
-- Addresses are compared with the NULL-safe operator <=>, so that
--   * a change from or to NULL counts as a change, and
--   * a customer whose shipping_address is NULL in BOTH the dimension and the
--     staging table is left alone (a bare "shipping_address IS NULL" test
--     would expire and re-version such a customer on every run).

-- Step 1: expire the current version of every customer whose address changed.
UPDATE customer_dim AS cur
INNER JOIN customer_stg AS stg
    ON stg.customer_number = cur.customer_number
SET cur.expiry_date = @pre_date
WHERE cur.expiry_date = '2200-01-01'
  AND (   NOT (cur.customer_street_address <=> stg.customer_street_address)
       OR NOT (cur.shipping_address <=> stg.shipping_address));

-- Step 2: insert the new version for every customer expired in step 1.
-- prev is the version that was just closed (expiry_date = @pre_date); the
-- NOT EXISTS guard skips customers that already have an open version.
-- customer_sk is passed as NULL so the key is generated on insert.
INSERT INTO customer_dim (
    customer_sk,
    customer_number,
    customer_name,
    customer_street_address,
    shipping_address,
    version,
    effective_date,
    expiry_date
)
SELECT
    NULL,
    stg.customer_number,
    stg.customer_name,
    stg.customer_street_address,
    stg.shipping_address,
    prev.version + 1,
    @pre_date,
    '2200-01-01'
FROM customer_dim AS prev
INNER JOIN customer_stg AS stg
    ON stg.customer_number = prev.customer_number
WHERE prev.expiry_date = @pre_date
  AND (   NOT (prev.customer_street_address <=> stg.customer_street_address)
       OR NOT (prev.shipping_address <=> stg.shipping_address))
  AND NOT EXISTS (
        SELECT 1
        FROM customer_dim AS open_row
        WHERE open_row.customer_number = stg.customer_number
          AND open_row.expiry_date = '2200-01-01'
      );
-- SCD1 on customer_name: overwrite the name in place on every dimension row
-- of the customer (all versions) where it differs from the staged name.
UPDATE customer_dim AS dim
INNER JOIN customer_stg AS stg
    ON stg.customer_number = dim.customer_number
   AND stg.customer_name <> dim.customer_name
SET dim.customer_name = stg.customer_name;
-- New customers: insert version 1 for every staged customer_number that has
-- no row in customer_dim yet.
-- NOT EXISTS replaces a NOT IN over a self-join of the staging table: it
-- checks the dimension directly and does not turn into "no rows" when NULLs
-- are involved. customer_sk is passed as NULL so the key is generated.
INSERT INTO customer_dim (
    customer_sk,
    customer_number,
    customer_name,
    customer_street_address,
    shipping_address,
    version,
    effective_date,
    expiry_date
)
SELECT
    NULL,
    stg.customer_number,
    stg.customer_name,
    stg.customer_street_address,
    stg.shipping_address,
    1,
    @pre_date,
    '2200-01-01'
FROM customer_stg AS stg
WHERE NOT EXISTS (
        SELECT 1
        FROM customer_dim AS dim
        WHERE dim.customer_number = stg.customer_number
      );
-- Product dimension load, part 1: refresh the staging table with a full copy
-- of source.product.
TRUNCATE TABLE product_stg;

INSERT INTO product_stg
SELECT
    src.product_code,
    src.product_name,
    src.product_category
FROM source.product AS src;
-- SCD2 on product_name and product_category.
-- Values are compared with the NULL-safe operator <=> so that a change from
-- or to NULL is detected as a change (a plain <> evaluates to UNKNOWN and
-- silently skips it).

-- Step 1: expire the current version of every product whose name or
-- category changed.
UPDATE product_dim AS cur
INNER JOIN product_stg AS stg
    ON stg.product_code = cur.product_code
SET cur.expiry_date = @pre_date
WHERE cur.expiry_date = '2200-01-01'
  AND (   NOT (cur.product_name <=> stg.product_name)
       OR NOT (cur.product_category <=> stg.product_category));

-- Step 2: insert the new version for every product expired in step 1.
-- prev is the version that was just closed (expiry_date = @pre_date); the
-- NOT EXISTS guard skips products that already have an open version.
-- product_sk is passed as NULL so the key is generated on insert.
INSERT INTO product_dim (
    product_sk,
    product_code,
    product_name,
    product_category,
    version,
    effective_date,
    expiry_date
)
SELECT
    NULL,
    stg.product_code,
    stg.product_name,
    stg.product_category,
    prev.version + 1,
    @pre_date,
    '2200-01-01'
FROM product_dim AS prev
INNER JOIN product_stg AS stg
    ON stg.product_code = prev.product_code
WHERE prev.expiry_date = @pre_date
  AND (   NOT (prev.product_name <=> stg.product_name)
       OR NOT (prev.product_category <=> stg.product_category))
  AND NOT EXISTS (
        SELECT 1
        FROM product_dim AS open_row
        WHERE open_row.product_code = stg.product_code
          AND open_row.expiry_date = '2200-01-01'
      );
-- New products: insert version 1 for every staged product_code that has no
-- row in product_dim yet.
-- NOT EXISTS replaces a NOT IN over a self-join of the staging table: it
-- checks the dimension directly and does not turn into "no rows" when NULLs
-- are involved. product_sk is passed as NULL so the key is generated.
INSERT INTO product_dim (
    product_sk,
    product_code,
    product_name,
    product_category,
    version,
    effective_date,
    expiry_date
)
SELECT
    NULL,
    stg.product_code,
    stg.product_name,
    stg.product_category,
    1,
    @pre_date,
    '2200-01-01'
FROM product_stg AS stg
WHERE NOT EXISTS (
        SELECT 1
        FROM product_dim AS dim
        WHERE dim.product_code = stg.product_code
      );
-- PRODUCT_COUNT_FACT population: rebuild the table with one row per
-- product_code, pairing a product version with the date_dim key of that
-- version's effective_date.
--
-- The earlier form grouped by product_code while selecting the non-aggregated
-- product_sk and date_sk. That is rejected when ONLY_FULL_GROUP_BY is enabled
-- and returns an arbitrary version per product when it is not. The version is
-- now chosen explicitly: the smallest product_sk per product_code among the
-- versions whose effective_date exists in date_dim.
-- NOTE(review): this assumes product_sk is assigned in insertion order, so the
-- smallest key is the earliest version of the product -- confirm against the
-- product_dim definition.
TRUNCATE TABLE product_count_fact;

INSERT INTO product_count_fact (product_sk, product_launch_date_sk)
SELECT
    p.product_sk,
    d.date_sk
FROM product_dim AS p
INNER JOIN date_dim AS d
    ON d.date = p.effective_date
INNER JOIN (
        SELECT
            pi.product_code,
            MIN(pi.product_sk) AS first_product_sk
        FROM product_dim AS pi
        INNER JOIN date_dim AS di
            ON di.date = pi.effective_date
        GROUP BY pi.product_code
     ) AS first_version
    ON first_version.first_product_sk = p.product_sk;
-- END OF PRODUCT_COUNT_FACT POPULATION
-- Load the fact table with new orders (order_status = 'N') whose entry_date
-- falls inside the CDC window [last_load, current_load).
-- Dimension rows are resolved as of the order's status_date. The zip code
-- keys are looked up through the zip codes currently held in customer_stg.
-- The insert is positional, so the SELECT list order must match the column
-- order of sales_order_fact.
-- NOTE(review): customer_dim and product_dim are matched with
-- "status_date < expiry_date", while the zip code and order attribute
-- dimensions use "<=". The bounds are reproduced exactly as they were;
-- confirm which convention those dimensions follow.
INSERT INTO sales_order_fact
SELECT
    customer_sk,
    czip.customer_zip_code_sk,
    szip.shipping_zip_code_sk,
    product_sk,
    attr.sales_order_attribute_sk,
    odate.order_date_sk,
    -- four NULL placeholders; presumably the allocate / packing / ship /
    -- receive date keys set by the later status updates -- TODO confirm
    NULL,
    NULL,
    NULL,
    NULL,
    edate.entry_date_sk,
    ord.order_number,
    rdate.request_delivery_date_sk,
    order_amount,
    quantity,
    -- four NULL placeholders; presumably the allocate / packing / ship /
    -- receive quantities set by the later status updates -- TODO confirm
    NULL,
    NULL,
    NULL,
    NULL
FROM source.sales_order AS ord
INNER JOIN customer_dim AS cust
    ON cust.customer_number = ord.customer_number
   AND ord.status_date >= cust.effective_date
   AND ord.status_date < cust.expiry_date
INNER JOIN customer_stg AS stg
    ON stg.customer_number = ord.customer_number
INNER JOIN customer_zip_code_dim AS czip
    ON czip.customer_zip_code = stg.customer_zip_code
   AND ord.status_date >= czip.effective_date
   AND ord.status_date <= czip.expiry_date
INNER JOIN shipping_zip_code_dim AS szip
    ON szip.shipping_zip_code = stg.shipping_zip_code
   AND ord.status_date >= szip.effective_date
   AND ord.status_date <= szip.expiry_date
INNER JOIN product_dim AS prod
    ON prod.product_code = ord.product_code
   AND ord.status_date >= prod.effective_date
   AND ord.status_date < prod.expiry_date
INNER JOIN order_date_dim AS odate
    ON odate.order_date = ord.status_date
INNER JOIN entry_date_dim AS edate
    ON edate.entry_date = ord.entry_date
INNER JOIN request_delivery_date_dim AS rdate
    ON rdate.request_delivery_date = ord.request_delivery_date
INNER JOIN sales_order_attribute_dim AS attr
    ON attr.verification_ind = ord.verification_ind
   AND attr.credit_check_flag = ord.credit_check_flag
   AND attr.new_customer_ind = ord.new_customer_ind
   AND attr.web_order_flag = ord.web_order_flag
   AND ord.status_date >= attr.effective_date
   AND ord.status_date <= attr.expiry_date
INNER JOIN cdc_time AS cdc
    ON ord.entry_date >= cdc.last_load
   AND ord.entry_date < cdc.current_load
WHERE ord.order_status = 'N';
-- Rebuild the PA customer dimension: every customer_dim row that is
-- referenced by at least one fact row whose customer zip code lies in state
-- 'PA'. DISTINCT collapses the one-row-per-order fan-out of the join.
TRUNCATE TABLE pa_customer_dim;

INSERT INTO pa_customer_dim
SELECT DISTINCT cust.*
FROM customer_dim AS cust
INNER JOIN sales_order_fact AS fact
    ON fact.customer_sk = cust.customer_sk
INNER JOIN customer_zip_code_dim AS zip
    ON zip.customer_zip_code_sk = fact.customer_zip_code_sk
WHERE zip.customer_state = 'PA';
-- Allocated status ('A'): for source rows entered inside the CDC window,
-- record the allocate date key and allocated quantity on the fact row with
-- the same order_number.
UPDATE sales_order_fact AS fact
INNER JOIN source.sales_order AS src
    ON src.order_number = fact.order_number
INNER JOIN allocate_date_dim AS ad
    ON ad.allocate_date = src.status_date
INNER JOIN cdc_time AS cdc
    ON src.entry_date >= cdc.last_load
   AND src.entry_date < cdc.current_load
SET
    fact.allocate_date_sk = ad.allocate_date_sk,
    fact.allocate_quantity = src.quantity
WHERE src.order_status = 'A';
-- Packed status ('P'): for source rows entered inside the CDC window, record
-- the packing date key and packed quantity on the fact row with the same
-- order_number.
UPDATE sales_order_fact AS fact
INNER JOIN source.sales_order AS src
    ON src.order_number = fact.order_number
INNER JOIN packing_date_dim AS pd
    ON pd.packing_date = src.status_date
INNER JOIN cdc_time AS cdc
    ON src.entry_date >= cdc.last_load
   AND src.entry_date < cdc.current_load
SET
    fact.packing_date_sk = pd.packing_date_sk,
    fact.packing_quantity = src.quantity
WHERE src.order_status = 'P';
-- Shipped status ('S'): for source rows entered inside the CDC window, record
-- the ship date key and shipped quantity on the fact row with the same
-- order_number.
UPDATE sales_order_fact AS fact
INNER JOIN source.sales_order AS src
    ON src.order_number = fact.order_number
INNER JOIN ship_date_dim AS sd
    ON sd.ship_date = src.status_date
INNER JOIN cdc_time AS cdc
    ON src.entry_date >= cdc.last_load
   AND src.entry_date < cdc.current_load
SET
    fact.ship_date_sk = sd.ship_date_sk,
    fact.ship_quantity = src.quantity
WHERE src.order_status = 'S';
-- Received status ('R'): for source rows entered inside the CDC window,
-- record the receive date key and received quantity on the fact row with the
-- same order_number.
UPDATE sales_order_fact AS fact
INNER JOIN source.sales_order AS src
    ON src.order_number = fact.order_number
INNER JOIN receive_date_dim AS rd
    ON rd.receive_date = src.status_date
INNER JOIN cdc_time AS cdc
    ON src.entry_date >= cdc.last_load
   AND src.entry_date < cdc.current_load
SET
    fact.receive_date_sk = rd.receive_date_sk,
    fact.receive_quantity = src.quantity
WHERE src.order_status = 'R';
-- Advance the CDC window: the upper bound of this load becomes the lower
-- bound of the next one. Intentionally no WHERE clause: every row of cdc_time
-- is updated.
UPDATE cdc_time
SET last_load = current_load;

COMMIT;
测试修改后的定期装载
执行修改后的定期装载脚本或相应的Kettle作业前,需要做一些准备工作:首先对源数据的客户信息做两处修改(具体修改内容未在此处列出)。定期装载执行完成后,查询pa_customer_dim表,确认PA客户已正确装载。查询语句和结果如下所示。
mysql> select * from pa_customer_dim;
+-------------+-----------------+------------------------+-------------------------+---------------------+---------+----------------+-------------+
| customer_sk | customer_number | customer_name | customer_street_address | shipping_address | version | effective_date | expiry_date |
+-------------+-----------------+------------------------+-------------------------+---------------------+---------+----------------+-------------+
| 1 | 1 | Really Large Customers | 7500 Louise Dr. | NULL | 1 | 2013-03-01 | 2015-03-02 |
| 2 | 2 | Small Stores | 2500 Woodland St. | NULL | 1 | 2013-03-01 | 2015-03-02 |
| 3 | 3 | Medium Retailers | 1111 Ritter Rd. | NULL | 1 | 2013-03-01 | 2015-03-02 |
| 4 | 4 | Good Companies | 9500 Scott St. | NULL | 1 | 2013-03-01 | 2015-03-02 |
| 5 | 5 | Wonderful Shops | 3333 Rossmoyne Rd. | NULL | 1 | 2013-03-01 | 2015-03-02 |
| 6 | 6 | Loyal Clients | 7070 Ritter Rd. | NULL | 1 | 2013-03-01 | 2015-03-01 |
| 7 | 7 | Distinguished Agencies | 9999 Scott St. | NULL | 1 | 2013-03-01 | 2015-03-02 |
| 8 | 6 | Loyal Clients | 7777 Ritter Rd. | NULL | 2 | 2015-03-01 | 2015-03-02 |
| 9 | 8 | Subsidiaries | 10000 Wetline Blvd. | NULL | 1 | 2015-03-01 | 2015-03-02 |
| 10 | 1 | Really Large Customers | 7500 Louise Dr. | 7500 Louise Dr. | 2 | 2015-03-02 | 2200-01-01 |
| 11 | 2 | Small Stores | 2500 Woodland St. | 2500 Woodland St. | 2 | 2015-03-02 | 2200-01-01 |
| 12 | 3 | Medium Retailers | 1111 Ritter Rd. | 1111 Ritter Rd. | 2 | 2015-03-02 | 2200-01-01 |
| 13 | 4 | Good Companies | 9500 Scott St. | 9500 Scott St. | 2 | 2015-03-02 | 2015-03-27 |
| 14 | 5 | Wonderful Shops | 3333 Rossmoyne Rd. | 3333 Rossmoyne Rd. | 2 | 2015-03-02 | 2200-01-01 |
| 15 | 6 | Loyal Clients | 7777 Ritter Rd. | 7777 Ritter Rd. | 3 | 2015-03-02 | 2200-01-01 |
| 16 | 7 | Distinguished Agencies | 9999 Scott St. | 9999 Scott St. | 2 | 2015-03-02 | 2200-01-01 |
| 17 | 8 | Subsidiaries | 10000 Wetline Blvd. | 10000 Wetline Blvd. | 2 | 2015-03-02 | 2200-01-01 |
| 18 | 9 | Online Distributors | 2323 Louise Dr. | 2323 Louise Dr. | 1 | 2015-03-02 | 2200-01-01 |
| 22 | 13 | PA Customer | 1111 Louise Dr. | 1111 Louise Dr. | 1 | 2015-03-03 | 2200-01-01 |
| 24 | 4 | Good Companies | 9999 Louise Dr. | 9999 Louise Dr. | 3 | 2015-03-27 | 2200-01-01 |
| 25 | 15 | Super Stores | 1000 Woodland St. | 1000 Woodland St. | 1 | 2015-03-27 | 2200-01-01 |
+-------------+-----------------+------------------------+-------------------------+---------------------+---------+----------------+-------------+
21 rows in set (0.00 sec)
USE dw;

-- @pre_date is yesterday; it is used as the effective date of new factory
-- rows and as the production date to load.
SET @pre_date = DATE_SUB(CURRENT_DATE, INTERVAL 1 DAY);

-- Load the factory CSV into the staging table. REPLACE makes an input row
-- with an already-present unique key value replace the existing row. The
-- first line of the file is skipped.
-- NOTE(review): "" is an empty string, i.e. no enclosing character is
-- defined. If fields in the file can be wrapped in double quotes this should
-- probably be '"' -- confirm against the file format.
LOAD DATA INFILE '/root/data-integration/factory.csv'
    REPLACE
    INTO TABLE factory_stg
    FIELDS
        TERMINATED BY ','
        OPTIONALLY ENCLOSED BY ""
    LINES
        TERMINATED BY '\n'
    IGNORE 1 LINES
    (
        factory_code,
        factory_name,
        factory_street_address,
        factory_zip_code,
        factory_city,
        factory_state
    );
-- SCD1 on the factory dimension: overwrite name and street address in place
-- with the staged values for every factory_code present in both tables.
UPDATE factory_dim AS dim
INNER JOIN factory_stg AS stg
    ON stg.factory_code = dim.factory_code
SET
    dim.factory_name = stg.factory_name,
    dim.factory_street_address = stg.factory_street_address;
-- New factories: insert a row for every staged factory_code that has no row
-- in factory_dim yet.
-- NOT EXISTS replaces a NOT IN over a self-join of the staging table: it
-- checks the dimension directly and does not turn into "no rows" when NULLs
-- are involved.
-- The insert is positional. The leading NULL is presumably the generated
-- surrogate key and the trailing 1 / @pre_date / '2200-01-01' presumably
-- version, effective date and expiry date -- confirm against the factory_dim
-- definition.
INSERT INTO factory_dim
SELECT
    NULL,
    stg.factory_code,
    stg.factory_name,
    stg.factory_street_address,
    1,
    @pre_date,
    '2200-01-01'
FROM factory_stg AS stg
WHERE NOT EXISTS (
        SELECT 1
        FROM factory_dim AS dim
        WHERE dim.factory_code = stg.factory_code
      );
-- Load yesterday's production (production_date = @pre_date) into
-- production_fact. The factory zip code key is looked up through the zip code
-- held in factory_stg. The insert is positional, so the SELECT list order
-- must match the column order of production_fact.
--
-- product_dim is matched with a half-open validity range
-- (effective_date <= production_date < expiry_date). The SCD2 product load
-- closes the old version and opens the new one on the same @pre_date, so an
-- inclusive upper bound would match both versions of a product that changed
-- on the production date and insert the production row twice.
-- NOTE(review): the zip code lookup keeps its inclusive upper bound, since no
-- SCD2 maintenance of zip_code_dim is visible here -- confirm its convention.
INSERT INTO production_fact
SELECT
    prod.product_sk,
    dt.date_sk,
    fac.factory_sk,
    zip.factory_zip_code_sk,
    production_quantity
FROM source.daily_production AS dp
INNER JOIN product_dim AS prod
    ON prod.product_code = dp.product_code
   AND dp.production_date >= prod.effective_date
   AND dp.production_date < prod.expiry_date
INNER JOIN factory_stg AS stg
    ON stg.factory_code = dp.factory_code
INNER JOIN factory_zip_code_dim AS zip
    ON zip.factory_zip_code = stg.factory_zip_code
   AND dp.production_date >= zip.effective_date
   AND dp.production_date <= zip.expiry_date
INNER JOIN date_dim AS dt
    ON dt.date = dp.production_date
INNER JOIN factory_dim AS fac
    ON fac.factory_code = dp.factory_code
WHERE dp.production_date = @pre_date;

COMMIT;