TPC-DS是与真实场景非常接近的一个测试集,它包含7张事实表、17张维度表,平均每张表含有18列。用这个数据集能够很好地模拟企业数据查询、报表生成以及数据挖掘等复杂应用。
下列是7个事实表与17个维度表之间的ER图。
-- 1. Customers having a single ticket whose summed sales price exceeds 900.
-- Conditions applied to the underlying sales rows:
--   * sold in 1999-2001,
--   * on day 1-3 or 25-28 of the month,
--   * at a store whose county is 'williamson county'.
-- Output: one row per (customer name fields, ticket total) plus the number of
-- tickets sharing that total (cnt).
with sales_record (ss_ticket_number, ss_customer_sk, sales) as
(
    -- Total sales price per (ticket, customer) over the qualifying sales rows.
    select
        ss.ss_ticket_number,
        ss.ss_customer_sk,
        sum(ss.ss_sales_price) as sales
    from
        store_sales ss
        inner join date_dim dd
            on ss.ss_sold_date_sk = dd.d_date_sk
        inner join store s
            on ss.ss_store_sk = s.s_store_sk
        -- No household attribute is filtered here; the join only drops sales
        -- rows that have no matching household_demographics row.
        inner join household_demographics hd
            on ss.ss_hdemo_sk = hd.hd_demo_sk
    where
        (dd.d_dom between 1 and 3 or dd.d_dom between 25 and 28)
        and dd.d_year in (1999, 2000, 2001)
        and s.s_county in ('williamson county')
    group by
        ss.ss_ticket_number,
        -- Fixed: was misspelled as ss_cutomer_sk, which made the query fail.
        ss.ss_customer_sk
)
select
    c.c_last_name,
    c.c_first_name,
    c.c_salutation,
    c.c_preferred_cust_flag,
    sr.sales,
    count(*) as cnt
from
    sales_record sr
    inner join customer c
        on sr.ss_customer_sk = c.c_customer_sk
where
    sr.sales > 900
group by
    c.c_last_name,
    c.c_first_name,
    c.c_salutation,
    c.c_preferred_cust_flag,
    sr.sales
order by
    c.c_last_name,
    c.c_first_name,
    c.c_salutation,
    c.c_preferred_cust_flag desc;
-- 2. Store 4: pair up the top-10 items of two rankings by average profit,
-- where profit per sales row is ss_net_paid - ss_ext_wholesale_cost.
--   ascending  : items whose average profit is above 0.8x the store average,
--                ranked from lowest to highest.
--   descending : items whose average profit is below 0.2x the store average,
--                ranked from highest to lowest.
-- NOTE(review): the original comment on "descending" said "below 0.8 of the
-- average" while the code uses 0.2 -- confirm which threshold is intended.
-- NOTE(review): the lowest-first ranking feeds best_performing and the
-- highest-first ranking feeds worst_performing -- confirm the labels.
with item_profit as
(
    -- Average profit per item sold in store 4.
    select
        ss_item_sk as item_sk,
        avg(ss_net_paid - ss_ext_wholesale_cost) as avg_net_profit
    from
        store_sales
    where
        ss_store_sk = 4
    group by
        ss_item_sk
),
store_profit as
(
    -- Average profit over all store 4 sales rows (at most one row).
    select
        avg(ss_net_paid - ss_ext_wholesale_cost) as avg_net_profit
    from
        store_sales
    where
        ss_store_sk = 4
    group by
        ss_store_sk
),
ascending as
(
    -- Fixed: the filter previously compared avg(ss_net_paid - ss_ext_discount_amt)
    -- against the wholesale-cost based store average; it now uses the same
    -- profit expression that is ranked and compared against.
    select
        item_sk,
        rank() over (order by avg_net_profit asc) as rn
    from
        item_profit
    where
        avg_net_profit > 0.8 * (select avg_net_profit from store_profit)
),
descending as
(
    select
        item_sk,
        rank() over (order by avg_net_profit desc) as rn
    from
        item_profit
    where
        avg_net_profit < 0.2 * (select avg_net_profit from store_profit)
)
select
    ascending.rn,
    i1.i_product_name as best_performing,
    i2.i_product_name as worst_performing
from
    ascending
    inner join descending
        on ascending.rn = descending.rn
    inner join item i1
        on i1.i_item_sk = ascending.item_sk
    inner join item i2
        on i2.i_item_sk = descending.item_sk
where
    ascending.rn <= 10
    and descending.rn <= 10
order by
    ascending.rn;  -- terminator added: the statement used to run into the next query
-- 3. Ticket totals above 900 per customer for qualifying sales, sorted by the
-- customer name fields.
-- Qualifying sales rows: years 1999-2001, day of month 1-3 or 25-28, store
-- county 'williamson county', household buy potential '>10000'.
with ticket_totals as
(
    -- Summed sales price per (ticket, customer).
    select
        ss.ss_ticket_number,
        ss.ss_customer_sk,
        sum(ss.ss_sales_price) as sales
    from
        store_sales ss
        inner join date_dim dd
            on dd.d_date_sk = ss.ss_sold_date_sk
        inner join store s
            on s.s_store_sk = ss.ss_store_sk
        inner join household_demographics hd
            on hd.hd_demo_sk = ss.ss_hdemo_sk
    where
        dd.d_year in (1999, 2000, 2001)
        and (dd.d_dom between 1 and 3 or dd.d_dom between 25 and 28)
        and s.s_county in ('williamson county')
        and hd.hd_buy_potential = '>10000'
    group by
        ss.ss_ticket_number,
        ss.ss_customer_sk
)
select
    c.c_last_name,
    c.c_first_name,
    c.c_salutation,
    c.c_preferred_cust_flag,
    tt.sales,
    count(*) as cnt
from
    ticket_totals tt
    inner join customer c
        on c.c_customer_sk = tt.ss_customer_sk
where
    tt.sales > 900
group by
    c.c_last_name,
    c.c_first_name,
    c.c_salutation,
    c.c_preferred_cust_flag,
    tt.sales
order by
    c.c_last_name,
    c.c_first_name,
    c.c_salutation,
    c.c_preferred_cust_flag desc;
-- 4. Demographic profile counts.
-- Counts distinct customers per demographic combination, restricted to
-- customers whose current address is in state 'nm', 'ky' or 'ga' and who,
-- during months 4-6 of 2001, have a store sale but no web sale (as bill
-- customer) and no catalog sale (as ship customer).
select
    cd.cd_gender,
    cd.cd_marital_status,
    cd.cd_education_status,
    cd.cd_purchase_estimate,
    cd.cd_credit_rating,
    count(distinct c.c_customer_sk) as cust_num
from
    customer c
    inner join customer_address ca
        on ca.ca_address_sk = c.c_current_addr_sk
    inner join customer_demographics cd
        on cd.cd_demo_sk = c.c_current_cdemo_sk
where
    ca.ca_state in ('nm', 'ky', 'ga')
    -- bought in a store during the period
    and exists (
        select 1
        from
            store_sales ss
            inner join date_dim d
                on d.d_date_sk = ss.ss_sold_date_sk
        where
            ss.ss_customer_sk = c.c_customer_sk
            and d.d_year = 2001
            and d.d_moy between 4 and 6
    )
    -- did not buy on the web during the period
    and not exists (
        select 1
        from
            web_sales ws
            inner join date_dim d
                on d.d_date_sk = ws.ws_sold_date_sk
        where
            ws.ws_bill_customer_sk = c.c_customer_sk
            and d.d_year = 2001
            and d.d_moy between 4 and 6
    )
    -- did not buy from the catalog during the period
    and not exists (
        select 1
        from
            catalog_sales cs
            inner join date_dim d
                on d.d_date_sk = cs.cs_sold_date_sk
        where
            cs.cs_ship_customer_sk = c.c_customer_sk
            and d.d_year = 2001
            and d.d_moy between 4 and 6
    )
group by
    cd.cd_gender,
    cd.cd_marital_status,
    cd.cd_education_status,
    cd.cd_purchase_estimate,
    cd.cd_credit_rating
order by
    cd.cd_gender,
    cd.cd_marital_status,
    cd.cd_education_status,
    cd.cd_purchase_estimate,
    cd.cd_credit_rating;
-- 5. Months of 2001 whose catalog sales differ from that year's average
-- monthly sales by more than 10%, per (item category, brand, call center).
with v1 as
(
    -- Monthly catalog sales per (category, brand, call center) with yearly
    -- average, yearly total, running total within the year, and a sequence
    -- number (rn) over the months. December 2000 and January 2002 are included
    -- so that every month of 2001 has both a previous and a next month.
    select
        i_category,
        i_brand,
        cc_name,
        d_year,
        d_moy,
        sum(cs_sales_price) as current_sum_sales,
        avg(sum(cs_sales_price)) over (
            partition by i_category, i_brand, cc_name, d_year
        ) as avg_year_sales,
        sum(sum(cs_sales_price)) over (
            partition by i_category, i_brand, cc_name, d_year
        ) as sum_year_sales,
        sum(sum(cs_sales_price)) over (
            partition by i_category, i_brand, cc_name, d_year
            order by d_moy
            rows between unbounded preceding and current row
        ) as accumulated_sum_year_sales,
        rank() over (
            partition by i_category, i_brand, cc_name
            order by d_year, d_moy
        ) as rn
    from
        catalog_sales
        inner join item
            on cs_item_sk = i_item_sk
        inner join date_dim
            on cs_sold_date_sk = d_date_sk
        inner join call_center
            on cc_call_center_sk = cs_call_center_sk
    where
        d_year = 2001
        or (d_year = 2001 - 1 and d_moy = 12)
        or (d_year = 2001 + 1 and d_moy = 1)
    group by
        i_category,
        i_brand,
        cc_name,
        d_year,
        d_moy
),
v2 as
(
    -- Attach the previous and next month's sales through rn - 1 / rn + 1.
    -- Inner joins: months lacking either neighbour are dropped.
    select
        v1.i_category,
        v1.i_brand,
        v1.cc_name,
        v1.d_year,
        v1.d_moy,
        v1.avg_year_sales,
        v1_lag.current_sum_sales as previous_sum_sales,
        v1.current_sum_sales,
        v1_lead.current_sum_sales as next_sum_sales,
        v1.sum_year_sales,
        v1.accumulated_sum_year_sales
    from
        v1
        inner join v1 v1_lag
            on v1.i_category = v1_lag.i_category
            and v1.i_brand = v1_lag.i_brand
            and v1.cc_name = v1_lag.cc_name
            and v1.rn = v1_lag.rn + 1
        inner join v1 v1_lead
            on v1.i_category = v1_lead.i_category
            and v1.i_brand = v1_lead.i_brand
            and v1.cc_name = v1_lead.cc_name
            and v1.rn = v1_lead.rn - 1
)
select
    i_category,
    i_brand,
    cc_name,
    d_year,
    d_moy,
    avg_year_sales,
    previous_sum_sales,
    current_sum_sales,
    next_sum_sales,
    sum_year_sales,
    accumulated_sum_year_sales
from
    v2
where
    d_year = 2001
    and avg_year_sales > 0
    -- The CASE keeps the division safe regardless of predicate evaluation order.
    and (
        case
            when avg_year_sales > 0
                then abs(current_sum_sales - avg_year_sales) / avg_year_sales
            else null
        end
    ) > 0.1
order by
    current_sum_sales - avg_year_sales;  -- terminator added: the statement used to run into the next query
-- Store channel, year 2000: customers with large returns at stores in state
-- 'tn', listed by returned amount in descending order.
-- "Large" means the per-(customer, store) total of sr_fee exceeds 1.5 times
-- the average of those totals across all (customer, store) pairs.
with returns_2000 as
(
    -- Total sr_fee per (customer, store) for returns dated in 2000.
    select
        sr.sr_customer_sk,
        sr.sr_store_sk,
        sum(sr.sr_fee) as total_return
    from
        store_returns sr
        inner join date_dim d
            on d.d_date_sk = sr.sr_returned_date_sk
    where
        d.d_year = 2000
    group by
        sr.sr_customer_sk,
        sr.sr_store_sk
)
select
    c.c_customer_id,
    r.total_return
from
    returns_2000 r
    inner join store s
        on s.s_store_sk = r.sr_store_sk
    inner join customer c
        on c.c_customer_sk = r.sr_customer_sk
where
    s.s_state = 'tn'
    and r.total_return > (
        select avg(total_return) * 1.5
        from returns_2000
    )
order by
    r.total_return desc;
-- Year-over-year change of combined web + catalog sales for each quarter of
-- 2002 relative to the same quarter of 2001: (sales_2002 - sales_2001) / sales_2001.
with s1 as
(
    -- All web and catalog sales rows reduced to (sold date key, extended sales price).
    select
        ws_sold_date_sk as sold_date_sk,
        ws_ext_sales_price as sales_price
    from
        web_sales
    union all
    select
        cs_sold_date_sk as sold_date_sk,
        cs_ext_sales_price as sales_price
    from
        catalog_sales
),
s2 as
(
    -- Single row holding the sales total of every quarter of 2001 and 2002.
    select
        sum(case when (d_year = 2001 and d_qoy = 1) then sales_price else 0 end) as sales_2001_q1,
        sum(case when (d_year = 2001 and d_qoy = 2) then sales_price else 0 end) as sales_2001_q2,
        sum(case when (d_year = 2001 and d_qoy = 3) then sales_price else 0 end) as sales_2001_q3,
        sum(case when (d_year = 2001 and d_qoy = 4) then sales_price else 0 end) as sales_2001_q4,
        sum(case when (d_year = 2002 and d_qoy = 1) then sales_price else 0 end) as sales_2002_q1,
        sum(case when (d_year = 2002 and d_qoy = 2) then sales_price else 0 end) as sales_2002_q2,
        sum(case when (d_year = 2002 and d_qoy = 3) then sales_price else 0 end) as sales_2002_q3,
        sum(case when (d_year = 2002 and d_qoy = 4) then sales_price else 0 end) as sales_2002_q4
    from
        s1
        inner join date_dim
            on d_date_sk = sold_date_sk
)
select
    -- Fixed: Q2-Q4 used to be divided by the Q1 2001 total; each quarter is now
    -- divided by its own 2001 total. nullif() yields NULL instead of a
    -- division-by-zero error when a 2001 quarter has no sales.
    round((sales_2002_q1 - sales_2001_q1) / nullif(sales_2001_q1, 0), 2) as q1_sales_ratio,
    round((sales_2002_q2 - sales_2001_q2) / nullif(sales_2001_q2, 0), 2) as q2_sales_ratio,
    round((sales_2002_q3 - sales_2001_q3) / nullif(sales_2001_q3, 0), 2) as q3_sales_ratio,
    round((sales_2002_q4 - sales_2001_q4) / nullif(sales_2001_q4, 0), 2) as q4_sales_ratio
from
    s2;