tcp-ds数据集sql查询

TPC-DS是与真实场景非常接近的一个测试集,它包含7张事实表,17张纬度表,平均每张表含有18列。用这个数据集能够很好的模拟企业数据查询、报表生成以及数据挖掘等复杂应用。

下列是7个事实表与17个维度表之间的ER图。

1.jpg
2.png
3.png
4.png
5.png
6.png
7.png
-- 1.符合某条件的每各客户单次消费额总额大于900的客户表
-- 条件:
-- 在1999-2001年期间,
-- 每月的1-3或25-28日的,
-- 来自williamson county的客户。

with sales_record(ss_ticket_number,ss_customer_sk,sales)as
(
    select
        ss_ticket_number,
        ss_customer_sk,
        sum(ss_sales_price)as sales
    from
        store_sales ss,
        date_dim dd,
        store s,
        household_demographics hd
    where
        ss.ss_sold_date_sk=dd.d_date_sk
        and ss.ss_store_sk=s.s_store_sk
        and ss.ss_hdemo_sk=hd.hd_demo_sk
        and(dd.d_dom between 1 and 3 or dd.d_dom between 25 and 28)
        and dd.d_year in (1999,2000,2001)
        and s.s_county in('williamson county')
    group by
        ss_ticket_number,
        ss_cutomer_sk
)
select
    c_last_name,
    c_first_name,
    c_salutation,
    c_preferred_cust_flag,
    sales,
    count(*) as cnt
from
    sales_record,
    customer
where
    ss_customer_sk=c_customer_sk
    and sales>900
group by
    c_last_name,
    c_first_name,
    c_salutation,
    c_preferred_cust_flag,
    sales 
order by
    c_last_name,
    c_first_name,
    c_salutation,
    c_preferred_cust_flag desc;


-- 2.4号店利润率最好最差的top10名产品名称


with ascending as
(
    select
        item_sk,
        rank()over(order by avg_net_profit asc)as rn
    from
    (
    select
            ss_item_sk as item_sk,
            avg(ss_net_paid-ss_ext_wholesale_cost)as avg_net_profit
        from
            store_sales
        where
            ss_store_sk=4
        group by
            ss_item_sk
        having
            avg(ss_net_paid-ss_ext_discount_amt)>0.8*
            (
                select
                    avg(ss_net_paid-ss_ext_wholesale_cost)
                from
                    store_sales
                where
                    ss_store_sk=4
                group by
                ss_store_sk
                )
        
        
        )s
),
descending as -- 小于平均水平的0.8
(
    select
        item_sk,
        rank()over(order by avg_net_profit desc)as rn
    from
    (select
        ss_item_sk as item_sk,
        avg(ss_net_paid-ss_ext_wholesale_cost)as avg_net_profit
    from
        store_sales
    where
        ss_store_sk=4
    group by
        ss_item_sk
    having
        avg(ss_net_paid-ss_ext_wholesale_cost)<0.2*
        (select
            avg(ss_net_paid-ss_ext_wholesale_cost)
        from
            store_sales
        where
            ss_store_sk=4
        group by
            ss_store_sk)
    )s
)

select
    ascending.rn,
    i1.i_product_name as best_performing,
    i2.i_product_name as worst_performing
from
    item i1,
    item i2,
    ascending,
    descending
where
    ascending.rn=descending.rn
    and ascending.rn<=10
    and descending.rn<=10
    and i1.i_item_sk=ascending.item_sk
    and i2.i_item_sk=descending.item_sk
order by
    ascending.rn


-- 3.符合某条件的客户的消费总额,按。。排序

select
    c_last_name,
    c_first_name,
    c_salutation,
    c_preferred_cust_flag,
    sales,
    count(*)as cnt
from
    (
    select
        ss_ticket_number,
        ss_customer_sk,
        sum(ss_sales_price)as sales
    from
        store_sales ss,
        date_dim dd,
        store s,
        household_demographics hd
    where
        ss_sold_date_sk=d_date_sk
        and ss_store_sk=s_store_sk
        and ss_hdemo_sk=hd_demo_sk
        and(d_dom between 1 and 3 or d_dom between 25 and 28)
        and hd_buy_potential='>10000'
        and d_year in (1999,2000,2001)
        and s_county in ('williamson county')
    group by
        ss_ticket_number,
        ss_customer_sk)as sales_record(ss_ticket_number,ss_customer_sk,sales),
    customer
where
    ss_customer_sk=c_customer_sk
    and sales>900
group by
    c_last_name,
    c_first_name,
    c_salutation,
    c_preferred_cust_flag,
    sales
order by
    c_last_name,
    c_first_name,
    c_salutation,
    c_preferred_cust_flag desc;
        
        

-- 4.人群画像统计
-- 在2001年4-6月内,在store,没在web、catalog上买的用户a

SELECT 
    cd_gender,
    cd_marital_status,
    cd_education_status,
    cd_purchase_estimate,
    cd_credit_rating,
    COUNT(DISTINCT c_customer_sk) AS cust_num
FROM
    customer c,
    customer_address ca,
    customer_demographics cd
WHERE
    c_current_addr_sk = ca_address_sk
        AND ca_state IN ('nm' , 'ky', 'ga')
        AND cd_demo_sk = c_current_cdemo_sk
        -- 在xx时间内 没在web catalog 在store
        AND EXISTS( SELECT 
            *
        FROM
            store_sales,
            date_dim
        WHERE
            c_customer_sk = ss_customer_sk
                AND ss_sold_date_sk = d_date_sk
                AND d_year = 2001
                AND d_moy BETWEEN 4 AND 4 + 2)
        AND NOT EXISTS( SELECT 
            *
        FROM
            web_sales,
            date_dim
        WHERE
            c_customer_sk = ws_bill_customer_sk
                AND ws_sold_date_sk = d_date_sk
                AND d_year = 2001
                AND d_moy BETWEEN 4 AND 4 + 2)
        AND NOT EXISTS( SELECT 
            *
        FROM
            catalog_sales,
            date_dim
        WHERE
            c_customer_sk = cs_ship_customer_sk
                AND cs_sold_date_sk = d_date_sk
                AND d_year = 2001
                AND d_moy BETWEEN 4 AND 4 + 2)
GROUP BY cd_gender , cd_marital_status  , cd_education_status, cd_purchase_estimate , cd_credit_rating
ORDER BY cd_gender , cd_marital_status , cd_education_status , cd_purchase_estimate , cd_credit_rating;

    

-- 5月销售额与年平均销售额差10%以上

with v1 as
(   select
        i_category,
        i_brand,
        cc_name,
        d_year,
        d_moy,
        sum(cs_sales_price)as current_sum_sales,
        avg(sum(cs_sales_price))over(partition by i_category,i_brand,cc_name,d_year)as avg_year_sales,
        sum(sum(cs_sales_price))over(partition by i_category,i_brand,cc_name,d_year)as sum_year_sales,
        sum(sum(cs_sales_price))over(partition by i_category,i_brand,cc_name,d_year order by d_moy rows 
            between unbounded preceding and current row)as accumulated_sum_year_sales,
        rank()over(partition by i_category,i_brand,cc_name order by d_year,d_moy)as rn
    from
        item,
        catalog_sales,
        date_dim,
        call_center
    where
        cs_item_sk=i_item_sk
        and cs_sold_date_sk=d_date_sk
        and cc_call_center_sk=cs_call_center_sk
        and(d_year=2001 or(d_year=2001-1 and d_moy=12)or (d_year=2001+1 and d_moy=1))
    group by
        i_category,
        i_brand,
        cc_name,
        d_year,
        d_moy
),
v2 as
(
    select 
        v1.i_category,
        v1.i_brand,
        v1.cc_name,
        v1.d_year,
        v1.d_moy,
        v1.avg_year_sales,
        v1_lag.current_sum_sales as previous_sum_sales,
        v1.current_sum_sales,
        v1_lead.current_sum_sales as next_sum_sales,
        v1.sum_year_sales,
        v1.accumulated_sum_year_sales
    from
        v1,
        v1 v1_lag,
        v1 v1_lead
    where
        v1.i_category=v1_lag.i_category
        and v1.i_category=v1_lead.i_category
        and v1.i_brand=v1_lag.i_brand
        and v1.i_brand=v1_lead.i_brand
        and v1.cc_name=v1_lag.cc_name
        and v1.cc_name=v1_lead.cc_name
        and v1.rn=v1_lag.rn+1
        and v1.rn=v1_lead.rn-1
)
select
    *
from
    v2
where
    d_year=2001
    and avg_year_sales>0
    and(case when avg_year_sales>0 then abs(current_sum_sales-avg_year_sales)/avg_year_sales else null end)>0.1
    order by
        current_sum_sales-avg_year_sales

-- 2000年store网中,tn州退货比较大的用户,按金额降序排列
-- 退款金额是store网用户平均退货金额的1.5倍
with
customer_total_return as
(
    select
        sr_customer_sk,
        sr_store_sk,
        sum(sr_fee)as total_return
    from 
        store_returns,
        date_dim
    where
        sr_returned_date_sk=d_date_sk
        and d_year=2000
    group by
        sr_customer_sk,
        sr_store_sk
)

select
    c_customer_id,
    total_return
from
    customer_total_return ctr,
    store s,
    customer c
where
    ctr.total_return>
    (select
        avg(total_return)*1.5
    from
        customer_total_return)
    and s.s_store_sk=ctr.sr_store_sk
    and ctr.sr_customer_sk=c.c_customer_sk
    and s.s_state='tn'
order by
    total_return desc;

--2002年四个季度web和catalog销售同比情况。

with s1 as 
(
    select 
        ws_sold_date_sk as sold_date_sk,
        ws_ext_sales_price as sales_price
    from 
        web_sales
    union all
    select
        cs_sold_date_sk as sold_date_sk,
        cs_ext_sales_price as sales_price
    from
        catalog_sales
),
s2 as
(
    select
        sum(case when (d_year=2001 and d_qoy=1) then sales_price else 0 end) as sales_2001_q1,
        sum(case when (d_year=2001 and d_qoy=2) then sales_price else 0 end) as sales_2001_q2,
        sum(case when (d_year=2001 and d_qoy=3) then sales_price else 0 end) as sales_2001_q3,
        sum(case when (d_year=2001 and d_qoy=4) then sales_price else 0 end) as sales_2001_q4,
        sum(case when (d_year=2002 and d_qoy=1) then sales_price else 0 end) as sales_2002_q1,
        sum(case when (d_year=2002 and d_qoy=2) then sales_price else 0 end) as sales_2002_q2,
        sum(case when (d_year=2002 and d_qoy=3) then sales_price else 0 end) as sales_2002_q3,
        sum(case when (d_year=2002 and d_qoy=4) then sales_price else 0 end) as sales_2002_q4
    from
        s1,
        date_dim
    where
        d_date_sk=sold_date_sk
)
select 
    round((sales_2002_q1-sales_2001_q1)/sales_2001_q1,2)as q1_sales_ratio,
    round((sales_2002_q2-sales_2001_q2)/sales_2001_q1,2)as q2_sales_ratio,
    round((sales_2002_q3-sales_2001_q3)/sales_2001_q1,2)as q3_sales_ratio,
    round((sales_2002_q4-sales_2001_q4)/sales_2001_q1,2)as q4_sales_ratio
from
    s2

你可能感兴趣的:(tcp-ds数据集sql查询)