有一条 SQL 语句需要在 Hive 中执行,但所使用的 Hive 版本不支持 INTERSECT,因此需要换一种写法来实现相同的效果。
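INTERSECT 的作用是取两个结果集的交集,并且结果会自动去重。可以用 INNER JOIN 按该列把两个子查询连接起来,只取其中一侧的列,再用 DISTINCT 去重,即可得到相同的结果。需要注意:INTERSECT 会把 NULL 视为相等,而 INNER JOIN 的等值条件不会匹配 NULL;本例中的邮编已经过 IN 列表过滤,不会出现 NULL,所以两种写法等价。原始语句如下: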
-- Original form of the query (uses INTERSECT).
-- Sums ss_net_profit per s_store_name for sales rows whose date row has
-- d_qoy = 1 and d_year = 2002, keeping only stores whose two-character ZIP
-- prefix matches the prefix of at least one qualifying customer ZIP.
-- Qualifying ZIPs are the intersection of:
--   1. five-character ZIPs taken from the fixed literal list, and
--   2. five-character ZIPs of ca_zip groups that join to more than 10
--      customers flagged c_preferred_cust_flag = 'Y'.
SELECT
    s_store_name,
    SUM(ss_net_profit)
FROM store_sales
INNER JOIN date_dim
    ON ss_sold_date_sk = d_date_sk
INNER JOIN store
    ON ss_store_sk = s_store_sk
INNER JOIN (
    SELECT ca_zip
    FROM (
        SELECT SUBSTR(ca_zip, 1, 5) AS ca_zip
        FROM customer_address
        WHERE SUBSTR(ca_zip, 1, 5) IN (
            '27385', '58049', '58200', '16808', '21360',
            '32961', '18586', '79307', '15492'
        )
        INTERSECT
        SELECT ca_zip
        FROM (
            SELECT
                SUBSTR(ca_zip, 1, 5) AS ca_zip,
                COUNT(*) AS cnt
            FROM customer_address
            INNER JOIN customer
                ON ca_address_sk = c_current_addr_sk
            WHERE c_preferred_cust_flag = 'Y'
            GROUP BY ca_zip
            HAVING COUNT(*) > 10
        ) AS preferred_zip
    ) AS matched_zip
) AS zip_filter
    ON SUBSTR(s_zip, 1, 2) = SUBSTR(zip_filter.ca_zip, 1, 2)
WHERE d_qoy = 1
  AND d_year = 2002
GROUP BY s_store_name
ORDER BY s_store_name
LIMIT 100;
下面是用 INNER JOIN 加 DISTINCT 改写后的查询;如果排版不便阅读,测试前可以先格式化一下:
-- Hive-compatible rewrite of the INTERSECT query: the intersection of the two
-- ZIP sets is computed with INNER JOIN + DISTINCT instead of INTERSECT.
-- Sums ss_net_profit per s_store_name for sales rows whose date row has
-- d_qoy = 1 and d_year = 2002, keeping only stores whose two-character ZIP
-- prefix matches the prefix of at least one qualifying customer ZIP.
--
-- NOTE(review): table qualifiers below are assumed from the column-name
-- prefixes (ss_ = store_sales, d_ = date_dim, s_ = store,
-- ca_ = customer_address, c_ = customer) -- confirm against the schema.

-- Five-character ZIPs taken from the fixed literal list (may contain duplicates).
WITH listed_zip AS (
    SELECT SUBSTR(ca.ca_zip, 1, 5) AS ca_zip
    FROM customer_address AS ca
    WHERE SUBSTR(ca.ca_zip, 1, 5) IN (
        '27385', '58049', '58200', '16808', '21360',
        '32961', '18586', '79307', '15492'
    )
),

-- Five-character ZIPs of ca_zip groups that join to more than 10 customers
-- flagged as preferred. Grouping is on the full table column ca.ca_zip (not on
-- the five-character alias), exactly as in the INTERSECT version.
preferred_zip AS (
    SELECT SUBSTR(ca.ca_zip, 1, 5) AS ca_zip
    FROM customer_address AS ca
    INNER JOIN customer AS c
        ON ca.ca_address_sk = c.c_current_addr_sk
    WHERE c.c_preferred_cust_flag = 'Y'
    GROUP BY ca.ca_zip
    HAVING COUNT(*) > 10
),

-- INTERSECT replacement: the join keeps ZIPs present on both sides and
-- DISTINCT removes the duplicates the join produces. Neither side can yield a
-- NULL that would need to match, because listed_zip is filtered by an IN list.
zip_filter AS (
    SELECT DISTINCT l.ca_zip
    FROM listed_zip AS l
    INNER JOIN preferred_zip AS p
        ON l.ca_zip = p.ca_zip
)

SELECT
    s.s_store_name,
    SUM(ss.ss_net_profit)
FROM store_sales AS ss
INNER JOIN date_dim AS d
    ON ss.ss_sold_date_sk = d.d_date_sk
INNER JOIN store AS s
    ON ss.ss_store_sk = s.s_store_sk
INNER JOIN zip_filter AS z
    ON SUBSTR(s.s_zip, 1, 2) = SUBSTR(z.ca_zip, 1, 2)
WHERE d.d_qoy = 1
  AND d.d_year = 2002
GROUP BY s.s_store_name
-- ORDER BY is left unqualified on purpose: it names the select-list output
-- column, which avoids relying on table aliases being visible after GROUP BY.
ORDER BY s_store_name
LIMIT 100;
欢迎点赞