数据分析-Hive学习 Day3

一、基础语法

1. select from where

数据分析-Hive学习 Day3_第1张图片

# 选择城市在北京,性别为女的10个用户名
SELECT user_name
FROM user_info
WHERE city='beijing' and sex='female'
limit 10


数据分析-Hive学习 Day3_第2张图片

分区表必须对分区字段进行限制

-- 选出在2019年4月9日,购买商品品类为food的用户名、购买数量,支付金额
SELECT user_name,
		piece,
		pay_amount	
FROM user_trade
WHERE dt='2014-04-09' AND gppds_category='food'; # 分区表必须限制分区字段,这里对dt进行限制

2. group by

-- 2019年一月至四月,每个品类有多少人购买,累计金额是多少
SELECT   goods_category
         count(distinct user_name),
		sum(pay_amount)
FROM user_trade
WHERE dt BETWEEN '2019-01-01' AND '2019-04-01'
GROUP BY goods_category

group by -- having

--2019年4月,支付金额超过5万元的用户
SELECT user_name,
     sum(pay_amount) as total_amount
FROM user_trade
WHERE dt BETWEEN '2019-04-01' AND '2019-04-30'
GROUP BY user_name HAVING sum(pay_amount)>50000;

3. ORDER BY

--2019年4月,支付金额最多的top5用户
SELECT user_name,
		sum(pay_amount) as 'per_amount'
FROM user_trade
WHERE dt BETWEEN '2019-04-01' AND '2019-04-30'
GROUP BY user_name
ORDER BY per_amount DESC LIMIT 5;
-- ORDER BY 执行顺序在select后面

二、常用函数

1. 时间戳转化为日期

SELECT pay_time,
       from_unixtime(pay_time, 'yyyy-MM-dd hh:mm:ss')
FROM user_trade
WHERE dt ='2019-04-09'

2. 如何计算日期间隔

datediff(string enddate, string startdate):结束日期机器拿去开始日期的天数

date_add

date_sub

-- 用户的首次激活时间,与2019年5月1日的日期间隔
SELECT user_name, 
       datediff('2019-05-01', to_date(firstactivetime))
FROM user_info
limit 10

3. CASE WHEN

--统计以下四个年龄段20岁以下,20-30岁,30-40岁,40岁以上的用户数
SELECT CASE WHEN age<20 THEN '20岁以下'
		WHEN age>20 AND age<30 THEN '20-30岁'
		WHEN age>=30 and age<40 THEN '30-40岁'
		ELSE '40岁及以上'
		END,
		COUNT(DISTINCT user_id) user_num
FROM user_info
GROUP BY CASE WHEN age<20 THEN '20岁以下'
			WHEN age>20 and age<30 THEN '20-30岁'
			WHEN age>=30 and age<40 THEN '30-40岁'
			ELSE '40岁及以上' 
			END
			

4. 统计每个性别用户等级高低的分布情况

-- 统计每个性别用户等级高低的分布情况,level>5为高级
SELECT sex
		if(level>5, '高', '低'),
		count(distinct user_id) user_num
FROM user_info
GROUP BY sex,
		if(level>5, '高', '低')

5. 字符串函数

substr(stringA, int start, int len)

备注:如果不指定截取的长度,则从开始一直截取到最后一位

-- 每个月新激活的用户数
SELECT substr(firstactivetime, 1, 7) as month
		count(distinct user_id) user_num
FROM user_info
GROUP BY substr(firstactivetime, 1, 7)

get_json_object(stringjson_string, string path)

param1: 需要解析的json字段

param2:用key取出想要获取的value

数据分析-Hive学习 Day3_第3张图片

--不同品牌的用户数
# 第一种情况
SELECT get_json_object(extra1, '$.phonebrand') as phone_brand,
		count(distinct user_id) user_num
FROM user_info
GROUP BY get_json_object(extra1,'$.phonebrand') as phone_brand;

-- 第二种情况
SELECT extra2['phonebrand'] as phone_brand,
		count(distinct user_id) user_num
FROM user_info
GROUP BY extra2['phonebrand'];

6. 聚合统计函数

-- ELLA用户的2018年的平均支付金额,以及2018年最大的支付日期与最小的支付日期的间隔
SELECT AVG(pay_amount) as avg_amount,
		datediff(max(from_unixtime(pay_time, 'yyyy-MM--dd')),min(from_unixtime(pay_time,'yyyy-MM-dd')))
FROM user_trade
WHERE year(dt) = '2018'
		and user_name = 'ELLA';
		

三、练习

  1. 2018年购买的商品品类在两个以上的用户数
SELECT sum(t.user_name)
FROM (SELECT user_name, count(distinct goods_category) count_category
	  FROM  user_trade
      WHERE year(dt) = '2018'
      group by user_name having count(distinct goods_category) >2) t
   
  1. 激活时间在2018年,年龄段在20-30岁和30-40岁的婚姻状况分布

    SELECT A.age_type,
    		if(a.marriage_status=1, '已婚', '未婚')
    		count(distinct A.user_id)
    FROM(
    	SELECT CASE WHEN age<20 THEN '20岁及以下'
    				WHEN  age>=20 and age<30 THEN '20-30岁'
    				WHEN  age>=30 and age <40 THEN '30-40岁'
                    ELSE '40岁'及以上
                    END AS age_type,
                 get_json_object(extral, '$.marriage_status') as 														marriage_status,
                 user_id
           FROM user_info
           WHERE to_date(firstactivetime) BETWEEN '2018-01-01'  AND '2018-12-31'
        ) A
    WHERE a.age_type in ('20-30岁', '30-40岁')
    GROUP BY A.age_type,
    		if(a.marriage_status=1, '已婚', '未婚');
    

你可能感兴趣的:(hive,数据分析,学习)