HiveSQL基础之常用函数

1. 如何把时间戳转换成日期

  • 语法
from_unixtime(bigint unixtime, string format)
  • format
1. yyyy-MM-dd hh:mm:ss

2. yyyy-MM-dd hh

3. yyyy-MM-dd hh:mm

4. yyyyMMdd
  • 举例
SELECT 
from_unixtime(paytime,'yyyy-MM-dd hh:mm:ss')
FROM user_trade
WHERE dt='2019-04-09';
  • 拓展:把日期转换为时间戳
  • unix_timestamp
SELECT 
unix_timestamp(firstactivetime)
FROM user_info
WHERE level=5;

2. 如何计算日期间隔

  • 语法
datediff(string enddate, string startdate)
  • 举例:用户首次激活时间,与2019年5月1日的日期间隔
SELECT user_name,
	   datediff('2019-05-01', to_date(firstactivetime))
FROM user_info
LIMIT 10;
  • 拓展:日期增加,减少函数
  • 语法
date_add(string startdate, int days)
date_sub (string startdate, int days)
  • 举例
SELECT user_name,
	   date_add(firstactivetime,7)
FROM user_info
LIMIT 10;

3. case when 条件函数

  • 统计以下4个年龄段20岁以下,20-30岁,30-40岁,40岁以上的用户分布
SELECT case when age<20 then '20岁以下'
			when age<30 then '20-30岁'
			when age<40 then '30-40岁'
			else '40岁以上' end,
		count(user_id) as user_num
FROM user_info
GROUP BY case when age<20 then '20岁以下'
			when age<30 then '20-30岁'
			when age<40 then '30-40岁'
			else '40岁以上' end;

4. if函数

  • 统计每个性别用户等级高低的分布情况
SELECT sex,
	   if(level>5,'高','低'),
	   count(user_id) as user_num
FROM user_info
GROUP BY sex,
		 if(level>5,'高','低');

5. 字符串函数

  • 语法
substr(string A, int start, int len)
  • 每个月新激活用户数
SELECT substr(firstactivetime,1,7) as month,
	   count(user_id) as user_num
FROM user_info
GROUP BY substr(firstactivetime,1,7);
  • 语法
get_json_object(string json_string, string path)
  • 不同手机品牌的用户数
SELECT get_json_object(extra1,'$.phonebrand') as phone_brand,
	   count(user_id) as user_num
FROM user_info
GROUP BY get_json_object(extra1,'$.phonebrand');
  • 不同手机品牌学历分布用户数
SELECT extra2['phonebrand'] as phone_brand,
	   extra2['education'] as education,
	   count(user_id) as user_num
FROM user_info
GROUP BY extra2['phonebrand'],extra2['education'];

6. 聚合统计函数

  • ELLA用户2018年平均支付金额,以及2018年最大支付日期和最小支付日期的时间间隔
SELECT avg(pay_amount) as avg_amount,
	   datediff(max(from_unixtime(pay_time,'yy-MM-dd')),
	   min(from_unixtime(pay_time,'yy-MM-dd')))
FROM user_trade
WHERE year(dt)='2018'
	  and user_name='ELLA';
# 另一种写法
SELECT avg(pay_amount) as avg_amount,
	   datediff(from_unixtime(max(pay_time),'yy-MM-dd'),
	   from_unixtime(min(pay_time),'yy-MM-dd'))
FROM user_trade
WHERE year(dt)='2018'
	  and user_name='ELLA';

7. 综合练习

  • 2018年购买的商品品类在两个以上的用户数
SELECT count(a.user_name)
FROM (SELECT user_name,
	   count(distinct goods_category) as category_num
FROM user_trade
WHERE year(dt)='2018'
GROUP BY user_name HAVING count(distinct goods_category) > 2) as a;
  • 用户激活时间在2018年,年龄段在20-30岁和30-40岁的婚姻状况分布
SELECT a.age_type, 
	   if(marriage_status=1,'已婚','未婚'),
	   count(a.user_id)
FROM
	(SELECT case when age<20 then '20岁以下'
				when age<30 then '20-30岁'
		        when age<40 then '30-40岁'
		        else '40岁以上' end as age_type,
		   get_json_object(extra1, '$.marriage_status') as marriage_status,
		   user_id
	FROM user_info
	WHERE year(firstactivetime)='2018') as a
WHERE a.age_type in ('20-30岁', '30-40岁')
GROUP BY a.age_type, 
	   if(marriage_status=1,'已婚','未婚');
  • 激活天数距今超过300天的男女分布情况
SELECT sex,
	   count(user_id)
FROM user_info
WHERE datediff('2019-08-07', substr(firstactivetime,1,10))>300
GROUP BY sex;
  • 不同性别,教育程度的分布情况
SELECT sex,
	   get_json_object(extra1,'$.education'),
	   count(user_id)
FROM user_info
GROUP BY sex,
	     get_json_object(extra1,'$.education');
  • 2019年1月1日到2019年4月30日,每个时段的不同品类购买金额分布
select substr(from_unixtime(pay_time,'yyyy-MM-dd hh'),12),
       goods_category,
       sum(pay_amount)
from user_trade
where dt between '2019-01-01' and '2019-04-30'
group by substr(from_unixtime(pay_time,'yyyy-MM-dd hh'),12),
         goods_category;

8. 常见错误

  • 标点符号错误(使用了全角符号)
  • 没有对子查询的表进行重命名
  • 使用错误的字段名
  • 丢了逗号分隔符

你可能感兴趣的:(HiveSQL,SQL)