1、查看数据
$ head orders.csv order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order 2539329,1,prior,1,2,08, 2398795,1,prior,2,3,07,15.0 473747,1,prior,3,3,12,21.0 2254736,1,prior,4,4,07,29.0 431534,1,prior,5,4,15,28.0 3367565,1,prior,6,2,07,19.0 550135,1,prior,7,1,09,20.0 3108588,1,prior,8,1,14,14.0 2295261,1,prior,9,1,16,0.0 $ head order_products__prior.csv order_id,product_id,add_to_cart_order,reordered 2,33120,1,1 2,28985,2,1 2,9327,3,0 2,45918,4,1 2,30035,5,0 2,17794,6,1 2,40141,7,1 2,1819,8,1 2,43668,9,0
2、创建表结构
create table orders( order_id string, user_id string, eval_set string, order_number bigint, order_dow string, order_hour_of_day string, days_since_prior_order string )row format delimited fields terminated by ',' lines terminated by '\n'; create table order_products__prior( order_id string, product_id string, add_to_cart_order string, reordered string )row format delimited fields terminated by ',' lines terminated by '\n';
3、每个用户有多少个订单[orders](聚合count)
select user_id,count(*) from orders group by user_id order by user_id desc limit 3; 99999 35 99998 6 99997 4
4、每个用户买了多少商品(表连接)
select user_id,count(*) from orders join order_products__prior on(orders.order_id=order_products__prior.order_id) group by user_id order by user_id desc limit 3; 99999 329 99998 21 99997 38
5、每个用户平均每个订单是多少商品(聚合函数avg)
select user_id,avg(cnt) from orders join ( select order_id,count(*) as cnt from order_products__prior group by order_id)t on (orders.order_id=t.order_id) group by user_id order by user_id desc limit 3; 99999 9.676470588235293 99998 4.2 99997 12.666666666666666
6、每个用户一周中的订单分布(行装列,case when)
select user_id, sum(case when order_dow=='0' then '1' else '0' end )as dow_0, sum(case when order_dow=='1' then '1' else '0' end )as dow_1, sum(case when order_dow=='2' then '1' else '0' end )as dow_2, sum(case when order_dow=='3' then '1' else '0' end )as dow_3, sum(case when order_dow=='4' then '1' else '0' end )as dow_4, sum(case when order_dow=='5' then '1' else '0' end )as dow_5, sum(case when order_dow=='6' then '1' else '0' end )as dow_6 from orders group by user_id order by user_id desc limit 10;
7、每个用户平均每个购买天中,购买的商品数量(if补充缺失值)
--方法1 select user_id,avg(s_cnt) as avg_cnt from (select user_id, if(days_since_prior_order="",0,days_since_prior_order) as days_since_prior_order, sum(cnt) as s_cnt from orders join (select order_id,count(*) as cnt from order_products__prior group by order_id)t on (orders.order_id=t.order_id) group by user_id,days_since_prior_order)t2 group by user_id order by user_id desc limit 3; --方法二 select user_id,sum(cnt)/count(distinct days_since_prior_order) from orders join (select order_id,count(*) as cnt from order_products__prior group by order_id)t on (orders.order_id=t.order_id) group by user_id order by user_id desc limit 3;
8、每个用户最喜欢的3个商品(字段链接concat_ws,列转行collect_list或者concat_set,排序row_number() over(partition by x order by y) as z)
select user_id,collect_list(concat_ws("_",product_id,cast(r as string))) as top3 from ( select user_id, product_id, row_number() over(partition by user_id order by cnt desc) as r, cnt from (select user_id,product_id,count(*) as cnt from orders join order_products__prior on(orders.order_id=order_products__prior.order_id) group by user_id,product_id )t1 )t2 where r<=3 group by user_id order by user_id desc limit 10;
总结:
1、函数的使用:
concat_ws
collect_list
row_number() over(partition by xxx order by)
2、hive优化: