数据:
u01 2017/1/21 5
u02 2017/1/23 6
u03 2017/1/22 8
u04 2017/1/20 3
u01 2017/1/23 6
u01 2017/2/21 8
u02 2017/1/23 6
u01 2017/2/22 4
建表语句
-- Visit log: one row per user (userId) and visit date (visitDate) with that day's visit count.
-- visitDate stays a string because the sample data uses the 'yyyy/M/d' layout (e.g. 2017/1/21);
-- the queries normalise it when they read it.
-- visitCount is numeric so that sum() adds integers instead of relying on an implicit
-- string-to-double conversion.
create table action(
    userId string,
    visitDate string,
    visitCount int)
row format delimited fields terminated by "\t";
-- Load the tab-separated sample file from the local filesystem into the table.
load data local inpath '/home/data/test.txt' into table action;
1、输出的日期格式不一样,需进行格式化
-- Step 1 (result referred to as t1 by the later steps): turn visitDate into a 'yyyy-MM'
-- month key by replacing '/' with '-' and formatting the result.
select
    userId,
    date_format(regexp_replace(visitDate, '/', '-'), 'yyyy-MM') as mn,
    visitCount
from
    action;
2、计算每个用户的小计
-- Step 2 (result referred to as t2 by the later step): visit subtotal per user and month.
-- The step-1 query is inlined as the CTE t1 so this statement runs on its own.
with t1 as (
    select
        userId,
        date_format(regexp_replace(visitDate, '/', '-'), 'yyyy-MM') as mn,
        visitCount
    from
        action
)
select
    userId,
    mn,
    sum(visitCount) as sum_visitCount
from
    t1
group by
    userId,
    mn;
3、按照人和时间进行累加
计算累加列,开窗函数根据用户id分区,按照格式化后的日期进行排序
-- Step 3: running total of each user's monthly subtotal, accumulated in month order.
-- t1 (month key) and t2 (monthly subtotal) are inlined as CTEs so this statement runs on its own.
-- t2 has one row per (userId, mn), so the row-based frame adds each month exactly once.
with t1 as (
    select
        userId,
        date_format(regexp_replace(visitDate, '/', '-'), 'yyyy-MM') as mn,
        visitCount
    from
        action
),
t2 as (
    select
        userId,
        mn,
        sum(visitCount) as sum_visitCount
    from
        t1
    group by
        userId,
        mn
)
select
    userId,
    mn,
    sum_visitCount,
    sum(sum_visitCount) over (
        partition by userId
        order by mn
        rows between unbounded preceding and current row
    ) as total_visitCount
from
    t2;
二、
1、每个店铺的UV(访客数)
步骤一、去重
//group by 去重
-- Step 1 (result referred to as t1 by the next step): one row per (shop, user_id) pair;
-- grouping on both columns removes repeated visits by the same user to the same shop.
select
    shop,
    user_id
from
    visit
group by
    shop,
    user_id;
步骤二、计数
count(*)进行计数
-- Step 2: unique-visitor count (uv) per shop.
-- The de-duplicated (shop, user_id) pairs are inlined as the CTE t1, so count(*) per shop
-- counts each visitor once.
with t1 as (
    select
        shop,
        user_id
    from
        visit
    group by
        shop,
        user_id
)
select
    shop,
    count(*) as uv
from
    t1
group by
    shop;
2、每个店铺访问次数top3的访客信息。输出店铺名称、访客id、访问次数
步骤一
输出结果应该类似:
a u1 3
a u2 2
先计算每个人访问店铺的总次数
-- Step 1 (result referred to as t1 by the next step): number of visits (ct) each user made
-- to each shop.
select
    shop,
    user_id,
    count(*) as ct
from
    visit
group by
    shop,
    user_id;
步骤二、对同一店铺,对访问次数进行倒序排序 添加rank值,并取前三名
-- Step 2 (result referred to as t2 by the next step): number the visitors of each shop by
-- visit count, highest first. row_number() gives distinct positions even when counts tie.
-- The per-(shop, user_id) visit counts are inlined as the CTE t1 so this statement runs on its own.
with t1 as (
    select
        shop,
        user_id,
        count(*) as ct
    from
        visit
    group by
        shop,
        user_id
)
select
    shop,
    user_id,
    ct,
    row_number() over (partition by shop order by ct desc) as rk
from
    t1;
//取前三名
-- Final step: keep the three most frequent visitors of every shop and output
-- shop, visitor id and visit count.
-- t1 (visit counts) and t2 (per-shop row numbers) are inlined as CTEs so this statement runs on its own.
with t1 as (
    select
        shop,
        user_id,
        count(*) as ct
    from
        visit
    group by
        shop,
        user_id
),
t2 as (
    select
        shop,
        user_id,
        ct,
        row_number() over (partition by shop order by ct desc) as rk
    from
        t1
)
select
    shop,
    user_id,
    ct
from
    t2
where
    rk <= 3;
三、
建表、导入数据,设置Hive的本地模式(运行更快)
-- user_low_carbon: low-carbon amount (low_carbon) recorded per user (user_id) and date string (data_dt).
create table user_low_carbon(user_id String,data_dt String,low_carbon int) row format delimited fields terminated by '\t';
-- plant_carbon: one row per plant (plant_id, plant_name) with its low_carbon value.
create table plant_carbon(plant_id string,plant_name String,low_carbon int) row format delimited fields terminated by '\t';
-- Load the tab-separated local files into the two tables.
load data local inpath "/home/data/user_low_carbon.txt" into table user_low_carbon;
load data local inpath "/home/data/plant_carbon.txt" into table plant_carbon;
-- Enable automatic local mode (the notes use it so the queries run faster).
set hive.exec.mode.local.auto=true;
1、统计每个用户到2017/10/1总低碳量
日期格式需要转换;按user_id分组(group by)求和。申领沙柳棵数的前10名可以转换为低碳总量的前10名(每个人扣除的胡杨能量和每棵沙柳所需的能量都相同);第10名还要和第11名作差,至少需要11行,保险起见limit 15
-- Step 1 (result referred to as t1 by the later steps): total low carbon per user for
-- records dated up to and including 2017-10-01, keeping the 15 largest totals.
-- data_dt is rewritten from '/' to '-' separators and formatted as 'yyyy-MM-dd' before the comparison.
-- Filtering in the where clause, before grouping, reduces the rows that are aggregated.
select
    user_id,
    sum(low_carbon) as sum_low_carbon
from
    user_low_carbon
where
    date_format(regexp_replace(data_dt, '/', '-'), 'yyyy-MM-dd') <= '2017-10-01'
group by
    user_id
order by
    sum_low_carbon desc
limit 15;
2、取出胡杨的能量、取出沙柳的能量
-- Result referred to as t2: low_carbon value of plant p004
-- (huyang, going by the order in the note's heading; confirm against plant_carbon.txt).
select
    low_carbon
from
    plant_carbon
where
    plant_id = 'p004';
-- Result referred to as t3: low_carbon value of plant p002
-- (shaliu, going by the order in the note's heading; confirm against plant_carbon.txt).
select
    low_carbon
from
    plant_carbon
where
    plant_id = 'p002';
3、每个人申领沙柳的棵数
floor取整
-- Step 3 (result referred to as t4 by the later steps): plant_count per user, computed as
-- floor((user total - p004 low_carbon) / p002 low_carbon).
-- t1 (user totals), t2 (p004 value) and t3 (p002 value) are inlined as CTEs so this statement
-- runs on its own. t2 and t3 are expected to hold a single row each, so the cross joins only
-- attach those two values to every t1 row.
with t1 as (
    select
        user_id,
        sum(low_carbon) as sum_low_carbon
    from
        user_low_carbon
    where
        date_format(regexp_replace(data_dt, '/', '-'), 'yyyy-MM-dd') <= '2017-10-01'
    group by
        user_id
    order by
        sum_low_carbon desc
    limit 15
),
t2 as (
    select
        low_carbon
    from
        plant_carbon
    where
        plant_id = 'p004'
),
t3 as (
    select
        low_carbon
    from
        plant_carbon
    where
        plant_id = 'p002'
)
select
    t1.user_id,
    floor((t1.sum_low_carbon - t2.low_carbon) / t3.low_carbon) as plant_count
from
    t1
cross join
    t2
cross join
    t3;
4、按照申领沙柳棵数排序,将下一行数据中的plant_count放置到当前行
//当前行减去下一行,lead
-- Step 4 (result referred to as t5 by the last step): next to each user's plant_count, put the
-- plant_count of the following row when rows are ordered by plant_count descending.
-- '9999' is the default returned by lead() for the last row, which has no following row.
-- user_id breaks ties so the window order and the outer order agree, and the outer order by
-- makes limit 10 keep the ten highest rows.
-- t1..t4 are inlined as CTEs so this statement runs on its own.
with t1 as (
    select
        user_id,
        sum(low_carbon) as sum_low_carbon
    from
        user_low_carbon
    where
        date_format(regexp_replace(data_dt, '/', '-'), 'yyyy-MM-dd') <= '2017-10-01'
    group by
        user_id
    order by
        sum_low_carbon desc
    limit 15
),
t2 as (
    select
        low_carbon
    from
        plant_carbon
    where
        plant_id = 'p004'
),
t3 as (
    select
        low_carbon
    from
        plant_carbon
    where
        plant_id = 'p002'
),
t4 as (
    select
        t1.user_id,
        floor((t1.sum_low_carbon - t2.low_carbon) / t3.low_carbon) as plant_count
    from
        t1
    cross join
        t2
    cross join
        t3
)
select
    user_id,
    plant_count,
    lead(plant_count, 1, '9999') over (order by plant_count desc, user_id) as lead_plant_count
from
    t4
order by
    plant_count desc,
    user_id
limit 10;
5、求相差的棵数
-- Step 5: for each of the ten highest rows, less_count is the user's plant_count minus the
-- plant_count of the next-ranked user.
-- t1..t5 are inlined as CTEs so this statement runs on its own; user_id breaks ties so the
-- window order, the limit and the final order all agree.
with t1 as (
    select
        user_id,
        sum(low_carbon) as sum_low_carbon
    from
        user_low_carbon
    where
        date_format(regexp_replace(data_dt, '/', '-'), 'yyyy-MM-dd') <= '2017-10-01'
    group by
        user_id
    order by
        sum_low_carbon desc
    limit 15
),
t2 as (
    select
        low_carbon
    from
        plant_carbon
    where
        plant_id = 'p004'
),
t3 as (
    select
        low_carbon
    from
        plant_carbon
    where
        plant_id = 'p002'
),
t4 as (
    select
        t1.user_id,
        floor((t1.sum_low_carbon - t2.low_carbon) / t3.low_carbon) as plant_count
    from
        t1
    cross join
        t2
    cross join
        t3
),
t5 as (
    select
        user_id,
        plant_count,
        lead(plant_count, 1, '9999') over (order by plant_count desc, user_id) as lead_plant_count
    from
        t4
    order by
        plant_count desc,
        user_id
    limit 10
)
select
    user_id,
    plant_count,
    (plant_count - lead_plant_count) as less_count
from
    t5
order by
    plant_count desc,
    user_id;