1.假定你当前有两张淘宝交易订单表order和sub_oder,存储于hive环境,其表结构信息如下,一个订单ID下可能多个子订单,一个子订单代表一个买家在一个卖家购买的一种商品,可能购买多件,整个支付金额是在主订单上。
create table order(
order_id bigint --订单ID
,sub_order_id bigint --子订单ID
,seller_id bigint --卖家ID
,buyer_id bigint --买家ID
,pay_time string --支付时间
,pay_amt double --实际支付金额(元)
,adjust_amt double --主订单优惠券金额(元)
)
create table sub_order (
order_id bigint --订单ID
,sub_order_id bigint --子订单ID
,product_id bigint --商品ID
,price double --商品价格(元)
,quantity bigint --购买商品数量
)
现在需要你设计和开发一段数据处理逻辑SQL实现,将实际支付金额基于每个子订单的(商品价格*购买数量)占总的订单的(商品价格*购买数量)比例进行拆分,获得每个子订单分摊实际支付金额,并输出表结构如下:
create table order(
order_id bigint --订单ID
,sub_order_id bigint --子订单ID
,seller_id bigint --卖家ID
,buyer_id bigint --买家ID
,product_id bigint --商品ID
,pay_time string --支付时间
,price double --商品价格(元)
,quantity bigint --购买商品数量
,sub_pay_amt double --子订单分摊实际支付金额(元)
)
请注意几个要求:
1)拆分后金额精确到小数点两位;
2)拆分后的汇总金额要与拆分前完全一致;
3)拆分的金额保持,每次程序重新运行计算的结果是一致的;
4)如有业务理解有异议的,你可以进行一定假设,在代码注释中标明;
第一题:
方法:实际支付金额+优惠金额=所有订单的实际金额,计算出每个订单总金额在父订单所有订单的实际金额的比例*实际支付金额,就是每个子订单分摊的金额。
,考虑到数据量大的情况,pay_amt 乘以分摊的比例price*quantity*1.0/(pay_amt+adjust_amt) 结果保留两位小数相加后的金额可能与原来的数据不一样。
需要做数据校验,是否存在差额,如果存在差距,用差额再乘以订单金额占比然后分摊到每一个子订单上,减去差额的分摊。
示例:
步骤一:临时表 计算初步订单金额占比、分摊金额
create TABLE as `order_test`
select
order.order_id
,sub_order.sub_order_id
,order.seller_id
,order.buyer_id
,sub_order.product_id
,order.pay_time
,sub_order.price
,sub_order.quantity
,order.pay_amt
-- 订单金额占比
,price*quantity*1.0/(pay_amt+adjust_amt) amt_ratio
-- 支付金额*订单占比 = 分摊金额
,cast(pay_amt* (price*quantity*1.0/(pay_amt+adjust_amt) ) as decimal(10,2) ) sub_pay_amt
from
(
select
101 as order_id,
2000 as seller_id,
300 as buyer_id,
'20200224' as pay_time,
1059.40 as pay_amt,
150.00 as adjust_amt
) order
left outer join
(
select
101 as order_id,
10101 as sub_order_id,
1000 as product_id ,
200.50 as price,
2 as quantity
union all
select
101 as order_id,
10102 as sub_order_id,
1001 as product_id ,
400.40 as price,
1 as quantity
union all
select
101 as order_id,
10102 as sub_order_id,
1002 as product_id ,
102.00 as price,
4 as quantity
) sub_order
-- 假设每个order和sub_order为订单粒度,并且两张表的计算日期分区和下单时间分区是同一天
on order.order_id=sub_order.order_id
步骤二:数据校验示例代码
create TABLE as `order`
select
a.order_id
,a.sub_order_id
,a.seller_id
,a.buyer_id
,a.product_id
,a.pay_time
,a.price
,a.quantity
-- 如果存在误差则减去差额占比
,cast(a.sub_pay_amt-b.diff_pay_amt*a.amt_ratio as decimal(10,2) ) as sub_pay_amt
from order_test a
left outer join
(
--父订单数据校验算出差距金额
select
order_id
,case when abs(sub_pay_amt - pay_amt) > 0 then sub_pay_amt- pay_amt else 0 end diff_pay_amt
from
(
select
order_id
,max(pay_amt) pay_amt
,sum(sub_pay_amt) sub_pay_amt
from order_test
group by 1
) a
) b
on a.order_id=b.order_id
第二题:
方法1:把id按照打卡日期分组升序排序得到日期对应排序值date_rank,id对应的打卡日期check_in_date和date_rank之间,
如果日期是连续的则差值一样,做差值得到每个员工不同连续日期之间的标记diff,根据id对diff做聚合找出最大值即是最长时间。
示例如下:
select
id,max(continue_num) max_continue_num
from
( select
diff,id,sum(1) continue_num
from
(
select
id,date_to_num-date_rank as diff
from(
select
emp_id as id,check_in_date as dt
,cast(regexp_replace(check_in_date,'-','') as bigint) date_to_num
,row_number() over(partition by emp_id order by check_in_date ) date_rank
from
(
select 1 as emp_id , '2019-01-01' as check_in_date union all
select 1 as emp_id , '2019-01-02' as check_in_date union all
select 1 as emp_id , '2019-01-12' as check_in_date union all
select 1 as emp_id , '2019-01-13' as check_in_date union all
select 1 as emp_id , '2019-01-14' as check_in_date union all
select 1 as emp_id , '2019-01-15' as check_in_date union all
select 2 as emp_id , '2019-01-01' as check_in_date union all
select 2 as emp_id , '2019-01-03' as check_in_date union all
select 2 as emp_id , '2019-01-04' as check_in_date union all
select 2 as emp_id , '2019-01-05' as check_in_date
) a
) a
) b
group by 1,2
) c
group by 1
思考:本方法适合对周期为一个月连续打卡最长的时间记录,若统计时间超过一个月需要选择另外的日期排序参照物。
方法2:主要思路:还是对每个id按时间排序,然后运用函数LAG(check_in_date,n,null)统计窗口内往上第n行值lag_n_date,如果连续打卡n天则check_in_date与lag_n_date的差值就是n
后续
在第一题中能否用一个SQL解决求分摊值及分摊后不相等问题?
思路: 当一个父订单存在N个商品需要参与分摊时(N大于或等于2),按小计数值从小到大排列,其中前N-1个项目或商品按比例分摊,第N个通过总计减去前N-1个之和。
create TABLE as `order`
select
order_id
,sub_order_id
,seller_id
,buyer_id
,product_id
,pay_time
,price
,quantity
,pay_amt
-- 第n个按照分摊pay_amt-sub_pay_amt
,case when sub_order_num-sub_order_rank>0 then sub_pay_amt
else cast(pay_amt- n_1_sum as decimal(10,2)) end sub_pay_amt
from
(
select
order_id
,sub_order_id
,seller_id
,buyer_id
,product_id
,pay_time
,price
,quantity
,pay_amt
-- 订单金额占比
,amt_ratio
-- 子订单金额降序排序
,sub_order_rank
,sub_pay_amt
-- 前n-1个计算求和
,sum(sub_pay_amt) over(partition by order_id ) n_1_sum
,sub_order_num
from
(
select
order_id
,sub_order_id
,seller_id
,buyer_id
,product_id
,pay_time
,price
,quantity
,pay_amt
-- 订单金额占比
,amt_ratio
-- 子订单金额降序排序
,sub_order_rank
-- 前n-1个按照分摊比例计算
,case when sub_order_num-sub_order_rank>0 then cast(pay_amt* (price*quantity*1.0/(pay_amt+adjust_amt) ) as decimal(10,2) )
else 0 end sub_pay_amt
,sub_order_num
from
(
select
order.order_id
,sub_order.sub_order_id
,order.seller_id
,order.buyer_id
,sub_order.product_id
,order.pay_time
,sub_order.price
,sub_order.quantity
,order.pay_amt
,adjust_amt
-- 订单金额占比
,price*quantity*1.0/(pay_amt+adjust_amt) amt_ratio
-- 子订单金额降序排序
,row_number() over(partition by order.order_id order by price*quantity desc) sub_order_rank
-- 子订单数
,c.sub_order_num
-- 支付金额*订单占比 = 分摊金额
--,cast(pay_amt* (price*quantity*1.0/(pay_amt+adjust_amt) ) as decimal(10,2) ) sub_pay_amt
from
(
select
101 as order_id,
2000 as seller_id,
300 as buyer_id,
'20200224' as pay_time,
1059.40 as pay_amt,
150.00 as adjust_amt
) order
left outer join
(
select
101 as order_id,
10101 as sub_order_id,
1000 as product_id ,
200.50 as price,
2 as quantity
union all
select
101 as order_id,
10102 as sub_order_id,
1001 as product_id ,
400.40 as price,
1 as quantity
union all
select
101 as order_id,
10102 as sub_order_id,
1002 as product_id ,
102.00 as price,
4 as quantity
) sub_order
-- 假设每个order和sub_order为订单粒度,并且两张表的计算日期分区和下单时间分区是同一天
on order.order_id=sub_order.order_id
-- 子订单数
left outer join
(
select order_id
,count(distinct sub_order_id) sub_order_num
from
(
select
101 as order_id,
10101 as sub_order_id,
1000 as product_id ,
200.50 as price,
2 as quantity
union all
select
101 as order_id,
10102 as sub_order_id,
1001 as product_id ,
400.40 as price,
1 as quantity
union all
select
101 as order_id,
10103 as sub_order_id,
1002 as product_id ,
102.00 as price,
4 as quantity
) a
group by 1
) c
on order.order_id=c.order_id
) a
) b
) c