union all 合并后,按指定的条件去重

-- Deduplicate the combined rows of t1 and t2 on column a:
-- for each distinct value of a, keep only the row that comes first
-- when ordered by (b, c).
with temp as (
    -- union all keeps every row from both inputs; the dedup happens below
    select a, b, c from t1
    union all
    select a, b, c from t2
)
select
    a,
    b,
    c
from (
    select
        a,
        b,
        c,
        -- partition by is paired with order by in a window specification
        -- (sort by belongs with distribute by)
        row_number() over (partition by a order by b, c) as rn
    from temp
) ranked  -- derived tables are given an alias; some engine versions require it
where rn = 1
;
根据a列去重:a相同的多行中,只保留按b、c排序后的第一行

业务场景:工单的去重

数据来源于mysql和es:一个工单在mysql中完成后,会同步到es
如果mysql中该条数据没有删除,则同一个工单会被同步两次,在后面old和new的full outer join中将会导致错误
因此需要在ods层处理该问题
采用窗口函数,按照orderid分区,按照来源(comefrom)升序排序;'es'排在'mysql'之前,所以如果工单重复,rn=1取到的是来源于es的数据

-- Load ods_order for one dt from two sources, keeping one row per orderid.
-- Both sources are stacked with union all and tagged with a comefrom literal;
-- row_number() then ranks the rows of each orderid by comefrom ascending.
-- 'es' sorts before 'mysql', so when an order exists in both sources the
-- es row gets rn = 1 and is the one written.
with temp as (
    -- rows from the es-backed table
    select
        orderid,
        ordercreatetime,
        companyId,
        areasid,
        institutionid,
        platfromfiledcode,
        orderlargertype,
        ordersecondtype,
        orderthirdlytype,
        serviceflowalias,
        ordersource,
        ordersourcename,
        orderstatus,
        orderstatusname,
        actualhour,
        urgent,
        supervisenum,
        reworknum,
        importance,
        dealuserids,
        dealuserorgids,
        'es' as comefrom,
        dt
    from wfs_order_list_index
    where dt = '$do_date'

    union all

    -- rows from the mysql-backed table; every column is aliased to the name
    -- used by the first branch so the two branches line up by name as well
    -- as by position. Columns with no counterpart here are filled with ''.
    select
        id                 as orderid,
        create_time        as ordercreatetime,
        company_id         as companyId,
        areas_id           as areasid,
        institution_id     as institutionid,
        ''                 as platfromfiledcode,
        order_larger_type  as orderlargertype,
        order_second_type  as ordersecondtype,
        order_thirdly_type as orderthirdlytype,
        ''                 as serviceflowalias,
        order_source       as ordersource,
        ''                 as ordersourcename,
        status             as orderstatus,
        ''                 as orderstatusname,
        actual_hour        as actualhour,
        urgent             as urgent,
        supervise_num      as supervisenum,
        rework_num         as reworknum,
        importance         as importance,
        deal_user_ids      as dealuserids,
        deal_user_org_ids  as dealuserorgids,
        'mysql'            as comefrom,
        dt                 as dt
    from tbwork_order
    where dt = '$do_date'
)

-- NOTE(review): partition(dt) is a dynamic-partition insert; this presumably
-- relies on dynamic partitioning being enabled in the session -- confirm.
insert overwrite table ods_order partition(dt)
select
    orderid,
    ordercreatetime,
    companyId,
    areasid,
    institutionid,
    platfromfiledcode,
    orderlargertype,
    ordersecondtype,
    orderthirdlytype,
    serviceflowalias,
    ordersource,
    ordersourcename,
    orderstatus,
    orderstatusname,
    actualhour,
    urgent,
    supervisenum,
    reworknum,
    importance,
    dealuserids,
    dealuserorgids,
    comefrom,
    -- NOTE(review): rn is always 1 here because of the filter below; it is
    -- kept so the column count written to ods_order is unchanged -- confirm
    -- that the target table really has a column for it.
    rn,
    dt  -- dynamic partition column, must be last
from (
    select
        orderid,
        ordercreatetime,
        companyId,
        areasid,
        institutionid,
        platfromfiledcode,
        orderlargertype,
        ordersecondtype,
        orderthirdlytype,
        serviceflowalias,
        ordersource,
        ordersourcename,
        orderstatus,
        orderstatusname,
        actualhour,
        urgent,
        supervisenum,
        reworknum,
        importance,
        dealuserids,
        dealuserorgids,
        comefrom,
        dt,
        -- partition by is paired with order by in a window specification
        -- (sort by belongs with distribute by)
        row_number() over (partition by orderid order by comefrom) as rn
    from temp
) ranked  -- derived tables are given an alias; some engine versions require it
where rn = 1
;

https://wenwen.sogou.com/z/q705629911.htm

你可能感兴趣的:(大数据)