大数据经典面试题分享(蚂蚁金服)

一.题目说明:

以下表记录了用户每天的蚂蚁森林低碳生活领取的记录流水。
table_name:user_low_carbon
user_id data_dt low_carbon
用户 日期 减少碳排放(g)

蚂蚁森林植物换购表,用于记录申领环保植物所需要减少的碳排放量
table_name: plant_carbon
plant_id plant_name low_carbon
植物编号 植物名 换购植物所需要的碳

----题目
1.蚂蚁森林植物申领统计
问题:假设2017年1月1日开始记录低碳数据(user_low_carbon),假设2017年10月1日之前满足申领条件的用户都申领了一棵p004-胡杨,
剩余的能量全部用来领取“p002-沙柳”。
统计在10月1日累计申领“p002-沙柳”排名前10的用户信息,以及他比后一名多领了几棵沙柳。
得到的统计结果如下表样式:
user_id plant_count less_count(比后一名多领了几棵沙柳)
u_101 1000 100
u_088 900 400
u_103 500 …

2、蚂蚁森林低碳用户排名分析
问题:查询user_low_carbon表中每日流水记录,条件为:
用户在2017年,连续三天(或以上)的天数里,
每天减少碳排放(low_carbon)都超过100g的用户低碳流水。
需要查询返回满足以上条件的user_low_carbon表中的记录流水。
例如用户u_002符合条件的记录如下,因为2017/1/2~2017/1/5连续四天,每天减少的碳排放量之和都超过100g:
seq(key) user_id data_dt low_carbon
xxxxx10 u_002 2017/1/2 150
xxxxx11 u_002 2017/1/2 70
xxxxx12 u_002 2017/1/3 30
xxxxx13 u_002 2017/1/3 80
xxxxx14 u_002 2017/1/4 150
xxxxx15 u_002 2017/1/5 101
备注:统计方法不限于sql、procedure、python,java等

提供的数据说明:
user_low_carbon:
u_001 2017/1/1 10
u_001 2017/1/2 150
u_001 2017/1/2 110
u_001 2017/1/2 10
u_001 2017/1/4 50
u_001 2017/1/4 10
u_001 2017/1/6 45
u_001 2017/1/6 90
u_002 2017/1/1 10
u_002 2017/1/2 150
u_002 2017/1/2 70
u_002 2017/1/3 30
u_002 2017/1/3 80
u_002 2017/1/4 150
u_002 2017/1/5 101
u_002 2017/1/6 68

plant_carbon:
p001 梭梭树 17
p002 沙柳 19
p003 樟子树 146
p004 胡杨 215

1.创建表
-- Ledger of low-carbon collections: user, date kept as text (sample rows look
-- like 2017/1/1), and grams of carbon saved. Fields are tab-separated.
-- The delimiter must be written with plain ASCII quotes; typographic quotes
-- are not string delimiters and make the statement fail to parse.
create table user_low_carbon(user_id String,data_dt String,low_carbon int) row format delimited fields terminated by '\t';
-- Plant exchange catalogue: plant id, plant name, and the carbon needed to redeem it.
create table plant_carbon(plant_id string,plant_name String,low_carbon int) row format delimited fields terminated by '\t';

2.加载数据
-- Load the two tab-separated text files from the local filesystem.
-- Paths are quoted with plain ASCII quotes; typographic quotes do not parse.
load data local inpath '/opt/module/data/low_carbon.txt' into table user_low_carbon;
load data local inpath '/opt/module/data/plant_carbon.txt' into table plant_carbon;

3.设置本地模式
-- Enable automatic local-mode execution (the "set local mode" step of the setup).
set hive.exec.mode.local.auto=true;

二.答案

-- Question 1: top 10 users by number of p002 plants claimed, plus how many
-- more each user claimed than the user ranked directly below.
--   t1: total carbon per user for records dated 2017-1-1 through 2017-10-1 (inclusive)
--   t2: carbon cost of one p004 (hy);  t3: carbon cost of one p002 (sl)
--   t4: p002 count per user
select
    user_id,
    plant_count,
    -- lead() reads the next row in descending plant_count order, i.e. the
    -- user ranked one place lower; it is NULL for the very last user.
    plant_count - lead(plant_count,1) over(order by plant_count desc) as less_count
from (
    select
        user_id,
        -- Only users who reached the p004 threshold spend hy on it; everyone
        -- else keeps all of their carbon for p002. Subtracting hy
        -- unconditionally would give those users a negative count.
        case
            when total_carbon >= hy then floor((total_carbon - hy)/sl)
            else floor(total_carbon/sl)
        end as plant_count
    from (
        select user_id,sum(low_carbon) as total_carbon
        from user_low_carbon
        where datediff(regexp_replace(data_dt,"/","-"),"2017-1-1")>=0
          and datediff(regexp_replace(data_dt,"/","-"),"2017-10-1")<=0
        group by user_id
    ) t1
    -- t2 and t3 each select by a single plant_id, so these joins only attach
    -- the two constants to every user row.
    cross join (select low_carbon as hy from plant_carbon where plant_id = "p004") t2
    cross join (select low_carbon as sl from plant_carbon where plant_id = "p002") t3
) t4
-- limit without order by returns an arbitrary 10 rows; sort explicitly so the
-- result really is the top 10.
order by plant_count desc
limit 10;



-- t1: total carbon per user for records dated 2017-1-1 through 2017-10-1 (inclusive).
-- data_dt uses "/" separators, so it is rewritten with "-" before datediff().
select user_id,sum(low_carbon) as sum
from user_low_carbon
where datediff(regexp_replace(data_dt,"/","-"),"2017-1-1")>=0
  and datediff(regexp_replace(data_dt,"/","-"),"2017-10-1")<=0
group by user_id;

-- t2: carbon needed to redeem one p004 plant.
select low_carbon as hy from plant_carbon where plant_id = "p004";

-- t3: carbon needed to redeem one p002 plant.
select low_carbon as sl from plant_carbon where plant_id = "p002";

-- t4: number of p002 plants each user can claim.
-- No limit here: ranking and the top-10 cut belong to the outer query, and a
-- limit without order by would drop arbitrary users before they are ranked.
select
    user_id,
    -- Only users who reached the p004 threshold spend hy on it; subtracting
    -- hy unconditionally would give the others a negative count.
    case
        when sum >= hy then floor((sum - hy)/sl)
        else floor(sum/sl)
    end as plant_count
from (
    select user_id,sum(low_carbon) as sum
    from user_low_carbon
    where datediff(regexp_replace(data_dt,"/","-"),"2017-1-1")>=0
      and datediff(regexp_replace(data_dt,"/","-"),"2017-10-1")<=0
    group by user_id
) t1
cross join (select low_carbon as hy from plant_carbon where plant_id = "p004") t2
cross join (select low_carbon as sl from plant_carbon where plant_id = "p002") t3;





答案:
----------第一题---------------
user_id	sum_low_carbon	sl_count	_c3
u_007	1470	66	3
u_013	1430	63	10
u_008	1240	53	7
u_005	1100	46	1
u_010	1080	45	1
u_014	1060	44	5
u_011	960		39	2
u_009	930		37	5
u_006	830		32	9
u_002	659		23	1

----------第二题---------------
t.user_id	t.data_dt	t.low_carbon
u_002	2017/1/2	150
u_002	2017/1/2	70
u_002	2017/1/3	30
u_002	2017/1/3	80
u_002	2017/1/4	150
u_002	2017/1/5	101
u_005	2017/1/2	50
u_005	2017/1/2	80
u_005	2017/1/3	180
u_005	2017/1/4	180
u_005	2017/1/4	10
u_008	2017/1/4	260
u_008	2017/1/5	360
u_008	2017/1/6	160
u_008	2017/1/7	60
u_008	2017/1/7	60
u_009	2017/1/2	70
u_009	2017/1/2	70
u_009	2017/1/3	170
u_009	2017/1/4	270
u_010	2017/1/4	90
u_010	2017/1/4	80
u_010	2017/1/5	90
u_010	2017/1/5	90
u_010	2017/1/6	190
u_010	2017/1/7	90
u_010	2017/1/7	90
u_011	2017/1/1	110
u_011	2017/1/2	100
u_011	2017/1/2	100
u_011	2017/1/3	120
u_013	2017/1/2	150
u_013	2017/1/2	50
u_013	2017/1/3	150
u_013	2017/1/4	550
u_013	2017/1/5	350
u_014	2017/1/5	250
u_014	2017/1/6	120
u_014	2017/1/7	270
u_014	2017/1/7	20



-- t1: carbon saved per user per day in 2017; the date is emitted with "-"
-- separators so the date functions used in later steps can read it.
select user_id,regexp_replace(data_dt,"/","-") as data_dt,sum(low_carbon) as sum
from user_low_carbon
where year(regexp_replace(data_dt,"/","-")) = 2017
group by user_id,data_dt;

-- t2: keep only the user-days whose daily total exceeds 100.
select user_id,data_dt,sum
from (
    select user_id,regexp_replace(data_dt,"/","-") as data_dt,sum(low_carbon) as sum
    from user_low_carbon
    where year(regexp_replace(data_dt,"/","-")) = 2017
    group by user_id,data_dt
) t1
where sum > 100;

-- t3: for every qualifying user-day, fetch the qualifying dates two before
-- (qt), one before (zt), one after (mt) and two after (ht) for the same user.
-- "1970-1-1" is the filler when no such neighbour exists.
-- The window is ordered by a day number, not by the date text: the dates are
-- not zero-padded, so as strings "2017-1-10" would sort before "2017-1-2".
select
    user_id,
    data_dt,
    lag(data_dt,2,"1970-1-1")  over(partition by user_id order by datediff(data_dt,"1970-1-1")) as qt,
    lag(data_dt,1,"1970-1-1")  over(partition by user_id order by datediff(data_dt,"1970-1-1")) as zt,
    lead(data_dt,1,"1970-1-1") over(partition by user_id order by datediff(data_dt,"1970-1-1")) as mt,
    lead(data_dt,2,"1970-1-1") over(partition by user_id order by datediff(data_dt,"1970-1-1")) as ht
from (
    select user_id,data_dt,sum
    from (
        select user_id,regexp_replace(data_dt,"/","-") as data_dt,sum(low_carbon) as sum
        from user_low_carbon
        where year(regexp_replace(data_dt,"/","-")) = 2017
        group by user_id,data_dt
    ) t1
    where sum > 100
) t2;

-- t4: day distance from each qualifying date to its four neighbours.
-- Earlier neighbours give positive values (1, 2 when adjacent), later ones
-- negative values (-1, -2 when adjacent).
-- data_dt already uses "-" separators here, so it goes to datediff() directly.
select
    user_id,
    data_dt,
    datediff(data_dt,qt) as sub_qt,
    datediff(data_dt,zt) as sub_zt,
    datediff(data_dt,mt) as sub_mt,
    datediff(data_dt,ht) as sub_ht
from (
    -- t3: neighbouring qualifying dates; ordered by day number because the
    -- unpadded date text does not sort chronologically.
    select
        user_id,
        data_dt,
        lag(data_dt,2,"1970-1-1")  over(partition by user_id order by datediff(data_dt,"1970-1-1")) as qt,
        lag(data_dt,1,"1970-1-1")  over(partition by user_id order by datediff(data_dt,"1970-1-1")) as zt,
        lead(data_dt,1,"1970-1-1") over(partition by user_id order by datediff(data_dt,"1970-1-1")) as mt,
        lead(data_dt,2,"1970-1-1") over(partition by user_id order by datediff(data_dt,"1970-1-1")) as ht
    from (
        select user_id,data_dt,sum
        from (
            select user_id,regexp_replace(data_dt,"/","-") as data_dt,sum(low_carbon) as sum
            from user_low_carbon
            where year(regexp_replace(data_dt,"/","-")) = 2017
            group by user_id,data_dt
        ) t1
        where sum > 100
    ) t2
) t3;

-- t5: user-days that belong to a run of at least three consecutive qualifying
-- days. A day qualifies when it is the first, the middle, or the last day of
-- some three-day run. The date is converted back to "/" separators so it can
-- be matched against user_low_carbon.data_dt.
select user_id,regexp_replace(data_dt,"-","/") as data_dt
from (
    -- t4: day distance to the two previous and two next qualifying dates.
    select
        user_id,
        data_dt,
        datediff(data_dt,qt) as sub_qt,
        datediff(data_dt,zt) as sub_zt,
        datediff(data_dt,mt) as sub_mt,
        datediff(data_dt,ht) as sub_ht
    from (
        -- t3: neighbouring qualifying dates; ordered by day number because
        -- the unpadded date text does not sort chronologically.
        select
            user_id,
            data_dt,
            lag(data_dt,2,"1970-1-1")  over(partition by user_id order by datediff(data_dt,"1970-1-1")) as qt,
            lag(data_dt,1,"1970-1-1")  over(partition by user_id order by datediff(data_dt,"1970-1-1")) as zt,
            lead(data_dt,1,"1970-1-1") over(partition by user_id order by datediff(data_dt,"1970-1-1")) as mt,
            lead(data_dt,2,"1970-1-1") over(partition by user_id order by datediff(data_dt,"1970-1-1")) as ht
        from (
            select user_id,data_dt,sum
            from (
                select user_id,regexp_replace(data_dt,"/","-") as data_dt,sum(low_carbon) as sum
                from user_low_carbon
                where year(regexp_replace(data_dt,"/","-")) = 2017
                group by user_id,data_dt
            ) t1
            where sum > 100
        ) t2
    ) t3
) t4
where (sub_mt = -1 and sub_ht = -2)   -- first day of a run
   or (sub_zt = 1 and sub_mt = -1)    -- middle day of a run
   or (sub_zt = 1 and sub_qt = 2);    -- last day of a run

-- Question 2, approach 1 (lag/lead): return the original ledger rows for every
-- user-day that is part of at least three consecutive days whose daily total
-- exceeds 100. Joining back to user_low_carbon yields one row per ledger
-- record, so a day with several records appears several times.
select t5.user_id,t5.data_dt,t6.low_carbon
from (
    -- t5: qualifying user-days, date restored to "/" separators for the join.
    select user_id,regexp_replace(data_dt,"-","/") as data_dt
    from (
        -- t4: day distance to the two previous and two next qualifying dates.
        select
            user_id,
            data_dt,
            datediff(data_dt,qt) as sub_qt,
            datediff(data_dt,zt) as sub_zt,
            datediff(data_dt,mt) as sub_mt,
            datediff(data_dt,ht) as sub_ht
        from (
            -- t3: neighbouring qualifying dates; ordered by day number
            -- because the unpadded date text does not sort chronologically.
            select
                user_id,
                data_dt,
                lag(data_dt,2,"1970-1-1")  over(partition by user_id order by datediff(data_dt,"1970-1-1")) as qt,
                lag(data_dt,1,"1970-1-1")  over(partition by user_id order by datediff(data_dt,"1970-1-1")) as zt,
                lead(data_dt,1,"1970-1-1") over(partition by user_id order by datediff(data_dt,"1970-1-1")) as mt,
                lead(data_dt,2,"1970-1-1") over(partition by user_id order by datediff(data_dt,"1970-1-1")) as ht
            from (
                -- t2: user-days whose daily total exceeds 100.
                select user_id,data_dt,sum
                from (
                    -- t1: daily total per user in 2017.
                    select user_id,regexp_replace(data_dt,"/","-") as data_dt,sum(low_carbon) as sum
                    from user_low_carbon
                    where year(regexp_replace(data_dt,"/","-")) = 2017
                    group by user_id,data_dt
                ) t1
                where sum > 100
            ) t2
        ) t3
    ) t4
    where (sub_mt = -1 and sub_ht = -2)   -- first day of a run
       or (sub_zt = 1 and sub_mt = -1)    -- middle day of a run
       or (sub_zt = 1 and sub_qt = 2)     -- last day of a run
) t5
join user_low_carbon t6
  on t5.user_id = t6.user_id and t5.data_dt = t6.data_dt;


//如果只查询单个临时表(子查询),可以不指定字段属于哪张表;如果是join
操作,两张表中同名的字段必须指定属于哪张表(例如 t5.user_id),不重名的字段(例如 low_carbon)可以省略
//临时表(子查询)一定要用()包围并取一个别名




------------------------------------第二种思路-------------------------------------------------------

-- t1: group by user and date to get the carbon each user saved per day in 2017.
select user_id,data_dt,sum(low_carbon) as sum
from user_low_carbon
where year(regexp_replace(data_dt,"/","-")) = 2017
group by user_id,data_dt;

-- t2: keep only the user-days whose daily total exceeds 100.
select user_id,data_dt,sum
from (
    select user_id,data_dt,sum(low_carbon) as sum
    from user_low_carbon
    where year(regexp_replace(data_dt,"/","-")) = 2017
    group by user_id,data_dt
) t1
where sum > 100;

先对同一用户的记录按日期排序并用 row_number() over() 编号,再用日期减去该序号;如果相减得到的日期相同,说明这些日期是连续的。
然后按用户和相减得到的日期分组计数 count() over(),用where过滤出计数>=3的数据

-- t3: number each user's qualifying days in chronological order.
-- The window is ordered by a day number, not by the date text: data_dt is not
-- zero-padded, so as a string "2017/1/10" would sort before "2017/1/2".
select
    user_id,
    data_dt,
    sum,
    row_number() over(partition by user_id order by datediff(regexp_replace(data_dt,"/","-"),"1970-1-1")) as rank
from (
    select user_id,data_dt,sum
    from (
        select user_id,data_dt,sum(low_carbon) as sum
        from user_low_carbon
        where year(regexp_replace(data_dt,"/","-")) = 2017
        group by user_id,data_dt
    ) t1
    where sum > 100
) t2;

-- t4: subtract the row number from the date. Consecutive days advance the
-- date and the row number in lockstep, so every day of one consecutive run
-- ends up with the same sub_date.
select
    user_id,
    data_dt,
    sum,
    rank,
    date_sub(regexp_replace(data_dt,"/","-"),rank) as sub_date
from (
    -- t3: chronological row number per user; ordered by day number because
    -- the unpadded date text does not sort chronologically.
    select
        user_id,
        data_dt,
        sum,
        row_number() over(partition by user_id order by datediff(regexp_replace(data_dt,"/","-"),"1970-1-1")) as rank
    from (
        select user_id,data_dt,sum
        from (
            select user_id,data_dt,sum(low_carbon) as sum
            from user_low_carbon
            where year(regexp_replace(data_dt,"/","-")) = 2017
            group by user_id,data_dt
        ) t1
        where sum > 100
    ) t2
) t3;

-- t5: for each user, count how many qualifying days share the same sub_date,
-- i.e. the length of the consecutive run each day belongs to.
select
    user_id,
    data_dt,
    sub_date,
    count(sub_date) over(partition by user_id,sub_date) as sumdate
from (
    -- t4: date minus row number; equal within one consecutive run.
    select
        user_id,
        data_dt,
        sum,
        rank,
        date_sub(regexp_replace(data_dt,"/","-"),rank) as sub_date
    from (
        -- t3: chronological row number per user; ordered by day number
        -- because the unpadded date text does not sort chronologically.
        select
            user_id,
            data_dt,
            sum,
            row_number() over(partition by user_id order by datediff(regexp_replace(data_dt,"/","-"),"1970-1-1")) as rank
        from (
            select user_id,data_dt,sum
            from (
                select user_id,data_dt,sum(low_carbon) as sum
                from user_low_carbon
                where year(regexp_replace(data_dt,"/","-")) = 2017
                group by user_id,data_dt
            ) t1
            where sum > 100
        ) t2
    ) t3
) t4;

-- t6: keep the user-days that sit in a run of three or more consecutive days
-- whose daily total exceeds 100.
select user_id,data_dt,sumdate
from (
    -- t5: length of the consecutive run each qualifying day belongs to.
    select
        user_id,
        data_dt,
        sub_date,
        count(sub_date) over(partition by user_id,sub_date) as sumdate
    from (
        -- t4: date minus row number; equal within one consecutive run.
        select
            user_id,
            data_dt,
            sum,
            rank,
            date_sub(regexp_replace(data_dt,"/","-"),rank) as sub_date
        from (
            -- t3: chronological row number per user; ordered by day number
            -- because the unpadded date text does not sort chronologically.
            select
                user_id,
                data_dt,
                sum,
                row_number() over(partition by user_id order by datediff(regexp_replace(data_dt,"/","-"),"1970-1-1")) as rank
            from (
                select user_id,data_dt,sum
                from (
                    select user_id,data_dt,sum(low_carbon) as sum
                    from user_low_carbon
                    where year(regexp_replace(data_dt,"/","-")) = 2017
                    group by user_id,data_dt
                ) t1
                where sum > 100
            ) t2
        ) t3
    ) t4
) t5
where sumdate >= 3;

-- Question 2, approach 2 (date minus row number): return the original ledger
-- rows for every user-day that is part of at least three consecutive days
-- whose daily total exceeds 100. Joining back to user_low_carbon brings in
-- low_carbon and yields one row per ledger record, so the row count grows
-- for days with several records.
select t6.user_id,t6.data_dt,t.low_carbon
from (
    -- t6: user-days in a run of three or more consecutive qualifying days.
    select user_id,data_dt,sumdate
    from (
        -- t5: length of the consecutive run each qualifying day belongs to.
        select
            user_id,
            data_dt,
            sub_date,
            count(sub_date) over(partition by user_id,sub_date) as sumdate
        from (
            -- t4: date minus row number; equal within one consecutive run.
            select
                user_id,
                data_dt,
                sum,
                rank,
                date_sub(regexp_replace(data_dt,"/","-"),rank) as sub_date
            from (
                -- t3: chronological row number per user; ordered by day
                -- number because the unpadded date text does not sort
                -- chronologically.
                select
                    user_id,
                    data_dt,
                    sum,
                    row_number() over(partition by user_id order by datediff(regexp_replace(data_dt,"/","-"),"1970-1-1")) as rank
                from (
                    -- t2: user-days whose daily total exceeds 100.
                    select user_id,data_dt,sum
                    from (
                        -- t1: daily total per user in 2017.
                        select user_id,data_dt,sum(low_carbon) as sum
                        from user_low_carbon
                        where year(regexp_replace(data_dt,"/","-")) = 2017
                        group by user_id,data_dt
                    ) t1
                    where sum > 100
                ) t2
            ) t3
        ) t4
    ) t5
    where sumdate >= 3
) t6
join user_low_carbon t
  on t6.user_id = t.user_id and t6.data_dt = t.data_dt;



你可能感兴趣的:(大数据)