HiveSQL基础之表连接

1. inner join

  • 内连接,返回两个表的交集
  • 既在user_list_1又在user_list_2的用户
-- Inner join: keep only the rows whose user_id appears in both lists.
SELECT *
FROM user_list_1 AS a
INNER JOIN user_list_2 AS b
    ON a.user_id = b.user_id;
  • 表连接时,建议给每个表起别名;子查询作为连接对象时必须起别名
  • on后面使用的连接字段应在各表中唯一(起到唯一键值的作用),否则连接结果会出现重复行
  • inner可以省略不写,效果一样
  • 一定要先去重,再做表连接,养成良好习惯
  • 在2019年购买后又退款的用户
-- Users with a purchase in 2019 who also have a refund record in 2019.
-- Both sides are deduplicated before the join, so each user is returned once.
-- The query does not check that the refund came after the purchase.
SELECT a.user_name
FROM
    (SELECT DISTINCT user_name
     FROM user_trade
     WHERE year(dt) = 2019) AS a
  INNER JOIN
    -- year() yields an integer, so both filters compare against the integer
    -- 2019 rather than mixing in a string literal that needs an implicit cast.
    (SELECT DISTINCT user_name
     FROM user_refund
     WHERE year(dt) = 2019) AS b
    ON a.user_name = b.user_name;
  • 在2017年和2018年都购买的用户
-- Users with at least one user_trade row in 2017 and at least one in 2018.
-- Each year is reduced to a distinct user list before the self-join.
SELECT a.user_name
FROM
    -- year() yields an integer, so the filters compare against integer
    -- literals instead of strings that would need an implicit cast.
    (SELECT DISTINCT user_name
     FROM user_trade
     WHERE year(dt) = 2017) AS a
  INNER JOIN
    (SELECT DISTINCT user_name
     FROM user_trade
     WHERE year(dt) = 2018) AS b
    ON a.user_name = b.user_name;
  • 在2017,2018,2019年都有交易的用户
-- Users that appear in all three yearly trade tables (2017, 2018 and 2019).
-- GROUP BY collapses each table to one row per user_name before joining.
SELECT a.user_name
FROM
    (SELECT user_name
     FROM trade_2017
     GROUP BY user_name) AS a
  INNER JOIN
    (SELECT user_name
     FROM trade_2018
     GROUP BY user_name) AS b
    ON a.user_name = b.user_name
  INNER JOIN
    (SELECT user_name
     FROM trade_2019
     GROUP BY user_name) AS c
    ON b.user_name = c.user_name;

2. left join

  • 左连接,以左边为全集,返回能够匹配上右表的匹配结果,没有匹配上的显示NULL
-- Left join: every row of user_list_1 is kept, and the user_list_2 columns
-- are NULL wherever no row shares the same user_id.
SELECT *
FROM user_list_1 AS a
LEFT OUTER JOIN user_list_2 AS b
    ON a.user_id = b.user_id;
  • 在user_list_1中但是不在user_list_2中
-- Anti-join: users in user_list_1 that have no matching user_id in user_list_2.
SELECT
    a.user_id,
    a.user_name
FROM user_list_1 AS a
LEFT OUTER JOIN user_list_2 AS b
    ON a.user_id = b.user_id
-- Unmatched left rows carry NULL in the right-side columns.
WHERE b.user_id IS NULL;
  • 在2019年购买,但是没有退款的用户
-- Users with a purchase in 2019 and no refund record in 2019.
-- Implemented as a left anti-join: keep buyers whose refund side is NULL.
SELECT a.user_name
FROM
    -- year() yields an integer, so the filters compare against the integer
    -- 2019 instead of a string literal that would need an implicit cast.
    (SELECT DISTINCT user_name
     FROM user_trade
     WHERE year(dt) = 2019) AS a
  LEFT JOIN
    (SELECT DISTINCT user_name
     FROM user_refund
     WHERE year(dt) = 2019) AS b
    ON a.user_name = b.user_name
WHERE b.user_name IS NULL;
  • 2019年购买用户的学历分布
-- Education distribution of the users who purchased in 2019.
-- Buyers with no user_info match are counted under a NULL education.
SELECT
    b.education,
    COUNT(a.user_name)
FROM
    -- year() yields an integer, so compare against the integer 2019 instead
    -- of a string literal that would need an implicit cast.
    (SELECT DISTINCT user_name
     FROM user_trade
     WHERE year(dt) = 2019) AS a
  LEFT JOIN
    -- education is extracted from the JSON text stored in extra1.
    (SELECT
         user_name,
         get_json_object(extra1, '$.education') AS education
     FROM user_info) AS b
    ON a.user_name = b.user_name
GROUP BY b.education;
  • 在2017,2018年都购买,但是在2019年没有购买的用户
-- Users present in both trade_2017 and trade_2018 but absent from trade_2019.
-- The first join keeps the 2017/2018 intersection, and the left join plus the
-- NULL filter then removes anyone who also shows up in 2019.
SELECT a.user_name
FROM
    (SELECT user_name
     FROM trade_2017
     GROUP BY user_name) AS a
  INNER JOIN
    (SELECT user_name
     FROM trade_2018
     GROUP BY user_name) AS b
    ON a.user_name = b.user_name
  LEFT OUTER JOIN
    (SELECT user_name
     FROM trade_2019
     GROUP BY user_name) AS c
    ON b.user_name = c.user_name
WHERE c.user_name IS NULL;

3. full join

  • full join(即full outer join)返回左表和右表的全部行:两边能匹配上的合并为一行,匹配不上的一侧字段显示NULL。
  • user_list_1和user_list_2的所有用户
-- Every user_name found in either list.
-- COALESCE returns the left name when present, otherwise the right one.
SELECT COALESCE(a.user_name, b.user_name)
FROM user_list_1 AS a
FULL OUTER JOIN user_list_2 AS b
    ON a.user_name = b.user_name;
  • coalesce(expression_1,expression_2,…,expression_n)是一个函数,依次检查各参数表达式,遇到非NULL值即停止并返回该值;如果所有表达式都是NULL,最终返回NULL

4. union all

  • 联合所有
  • 字段名必须一致
  • 字段顺序必须一致
  • 没有连接条件
  • 将user_list_1和user_list_3合并在一起
-- Stack the rows of user_list_1 and user_list_3 into one result.
-- UNION ALL keeps duplicate rows.
SELECT
    user_name,
    user_id
FROM user_list_1
UNION ALL
SELECT
    user_name,
    user_id
FROM user_list_3;
  • 2017-2019年有交易的用户数
-- Number of distinct users with a row in any of the three yearly trade tables.
-- Each table is reduced to one row per user first, and the outer
-- COUNT(DISTINCT ...) removes users that appear in more than one year.
SELECT COUNT(DISTINCT a.user_name)
FROM (
    SELECT user_name
    FROM trade_2017
    GROUP BY user_name
    UNION ALL
    SELECT user_name
    FROM trade_2018
    GROUP BY user_name
    UNION ALL
    SELECT user_name
    FROM trade_2019
    GROUP BY user_name
) AS a;
  • 2019年每个用户的支付和退款金额汇总
-- Total payment and total refund per user for 2019, built with UNION ALL.
-- Each branch fills the amount it does not own with 0, so the outer SUM
-- folds the payment row and the refund row of a user into a single row.
SELECT
    a.user_name,
    SUM(a.pay_amount),
    SUM(a.refund_amount)
FROM
    -- year() yields an integer, so both branches compare against the integer
    -- 2019 instead of a string literal that would need an implicit cast.
    (SELECT
         user_name,
         SUM(pay_amount) AS pay_amount,
         0 AS refund_amount
     FROM user_trade
     WHERE year(dt) = 2019
     GROUP BY user_name
     UNION ALL
     SELECT
         user_name,
         0 AS pay_amount,
         SUM(refund_amount) AS refund_amount
     FROM user_refund
     WHERE year(dt) = 2019
     GROUP BY user_name) AS a
GROUP BY a.user_name;
  • 如何用full join来实现
-- Total payment and total refund per user for 2019, built with a FULL JOIN.
-- A user missing from one side gets 0 for that side via COALESCE.
SELECT
    COALESCE(a.user_name, b.user_name),
    COALESCE(a.pay_amount, 0),
    COALESCE(b.refund_amount, 0)
FROM
    -- year() yields an integer, so both filters compare against the integer
    -- 2019 instead of a string literal that would need an implicit cast.
    (SELECT
         user_name,
         SUM(pay_amount) AS pay_amount
     FROM user_trade
     WHERE year(dt) = 2019
     GROUP BY user_name) AS a
  FULL JOIN
    (SELECT
         user_name,
         SUM(refund_amount) AS refund_amount
     FROM user_refund
     WHERE year(dt) = 2019
     GROUP BY user_name) AS b
    ON a.user_name = b.user_name;
  • 变形:2019年每个支付用户的支付金额和退款金额
-- Payment and refund totals for 2019, restricted to users who paid in 2019.
-- The LEFT JOIN keeps every paying user, and a user without refunds gets 0.
SELECT
    a.user_name,
    COALESCE(a.pay_amount, 0),
    COALESCE(b.refund_amount, 0)
FROM
    -- year() yields an integer, so both filters compare against the integer
    -- 2019 instead of a string literal that would need an implicit cast.
    (SELECT
         user_name,
         SUM(pay_amount) AS pay_amount
     FROM user_trade
     WHERE year(dt) = 2019
     GROUP BY user_name) AS a
  LEFT JOIN
    (SELECT
         user_name,
         SUM(refund_amount) AS refund_amount
     FROM user_refund
     WHERE year(dt) = 2019
     GROUP BY user_name) AS b
    ON a.user_name = b.user_name;

5. 综合练习

  • 首次激活时间在2017年,但是一直没有支付的用户年龄段分布
-- Age-band distribution of users first activated in 2017 who have no row in
-- user_trade. Implemented as a left anti-join against the distinct payer list.
SELECT
    a.age_level,
    COUNT(a.user_name)
FROM
    (SELECT
         user_name,
         -- NOTE(review): a NULL age falls through to the ELSE band - confirm
         -- that this is intended.
         CASE
             WHEN age < 20 THEN '20岁以下'
             WHEN age < 30 THEN '20-30岁'
             WHEN age < 40 THEN '30-40岁'
             ELSE '40岁以上'
         END AS age_level
     FROM user_info
     -- year() yields an integer, so compare against the integer 2017.
     WHERE year(firstactivetime) = 2017) AS a
  LEFT JOIN
    -- dt is compared with a string literal. A numeric 0 forces dt to be cast
    -- to a number, which gives NULL for date-like text and would empty this
    -- subquery, making every user look unpaid.
    -- NOTE(review): assumes dt is a string column and that this predicate is
    -- meant to match every row - confirm.
    (SELECT DISTINCT user_name
     FROM user_trade
     WHERE dt > '0') AS b
    ON a.user_name = b.user_name
WHERE b.user_name IS NULL
GROUP BY a.age_level;
  • 2018,2019年交易的用户,其激活时间段分布
-- Distribution of first-activation hour for users found in trade_2018 or
-- trade_2019. COUNT(DISTINCT ...) counts a user once per hour even when the
-- user appears in both yearly tables.
SELECT
    hour(b.firstactivetime),
    COUNT(DISTINCT a.user_name)
FROM
    (SELECT user_name
     FROM trade_2018
     GROUP BY user_name
     UNION ALL
     SELECT user_name
     FROM trade_2019
     GROUP BY user_name) AS a
  LEFT OUTER JOIN user_info AS b
    ON a.user_name = b.user_name
GROUP BY hour(b.firstactivetime);
  • 在2019年购买后又退款的用户性别分布
-- Gender distribution of users who have both a purchase and a refund in 2019.
-- Users without a user_info match are counted under a NULL sex.
SELECT
    c.sex,
    COUNT(a.user_name)
FROM
    (SELECT DISTINCT user_name
     FROM user_trade
     WHERE year(dt) = 2019) AS a
  INNER JOIN
    (SELECT DISTINCT user_name
     FROM user_refund
     WHERE year(dt) = 2019) AS b
    ON a.user_name = b.user_name
  LEFT OUTER JOIN
    (SELECT
         user_name,
         sex
     FROM user_info) AS c
    ON b.user_name = c.user_name
GROUP BY c.sex;
  • 在2018年购买,但是没在2019年购买的用户的城市分布
-- City distribution of users who purchased in 2018 but not in 2019.
-- The inner query is a left anti-join (2018 buyers with no 2019 match), and
-- the outer query attaches the city from user_info and counts per city.
SELECT
    d.city,
    COUNT(c.user_name)
FROM
    (SELECT a.user_name
     FROM
         (SELECT DISTINCT user_name
          FROM user_trade
          WHERE year(dt) = 2018) AS a
       LEFT OUTER JOIN
         (SELECT DISTINCT user_name
          FROM user_trade
          WHERE year(dt) = 2019) AS b
         ON a.user_name = b.user_name
     WHERE b.user_name IS NULL) AS c
  LEFT OUTER JOIN
    (SELECT
         user_name,
         city
     FROM user_info) AS d
    ON c.user_name = d.user_name
GROUP BY d.city;
  • 在2017-2019年,有交易但是没退款的用户的手机品牌分布
-- Phone-brand distribution of users who appear in any of the 2017-2019 trade
-- tables and have no row in user_refund.
SELECT
    d.phonebrand,
    COUNT(c.user_name)
FROM
    (SELECT a.user_name
     FROM
         -- UNION (without ALL) already removes duplicate user names.
         (SELECT user_name
          FROM trade_2017
          UNION
          SELECT user_name
          FROM trade_2018
          UNION
          SELECT user_name
          FROM trade_2019) AS a
       LEFT OUTER JOIN
         -- NOTE(review): the dt filter looks like an always-true predicate
         -- over string dates - confirm.
         (SELECT user_name
          FROM user_refund
          WHERE dt > '0'
          GROUP BY user_name) AS b
         ON a.user_name = b.user_name
     WHERE b.user_name IS NULL) AS c
  LEFT OUTER JOIN
    -- phonebrand is looked up by key in the extra2 column.
    (SELECT
         user_name,
         extra2['phonebrand'] AS phonebrand
     FROM user_info) AS d
    ON c.user_name = d.user_name
GROUP BY d.phonebrand;

你可能感兴趣的:(SQL,HiveSQL)