=============hql补强点!练习题===============================
01,01,80
01,02,90
01,03,99
02,01,70
02,03,80
03,03,80
04,01,50
04,02,30
create table test1(
name int,
course int,
score int
)
row format delimited
fields terminated by ',';
left join(很好的利用了反向,) --- 推荐!
select *
from test1 t
left join(
select name
from test1
where score < 80
) tmp
on t.name = tmp.name
where tmp.name is null;
----
编写sql完成如下查询,一次查询实现最好,也可以写多次查询实现:
1、查出每个学期每门课程最高分记录(包含全部5个字段)
---- 又有聚合结果,又有详细信息; 1、开窗(聚合操作) 2、自连接,双join
select id,
userid,
course,
score,
term
from (
select id,
userid,
course,
score,
term,
row_number() over (partition by term,course order by score desc) maxScore
from course_score
) tt
where tt.maxScore=1;
---
select
s.id,
s.userid,
s.course,
s.score,
s.term
from
course_score s
join
(select
course,
userid,
max(score) score
from course_score
group by course,userid) t1
on s.course=t1.course ----》 多个条件确定唯一一行值
and
s.userid=t1.userid
and
s.score=t1.score
;
2、查出单个学期中语文课在90分以上的学生的数学成绩记录(包含全部5个字段)
---- 理解有些偏差,符合条件的单个学期的学生对应的该学期的数学成绩,还是对应所有学期数学成绩(注意:的,的,理解就对了-这里偏所有)
select
s.id,
s.userid,
s.course,
s.score,
s.term
from
course_score s
join
(select
id,
userid,
course,
score,
term
from
course_score
where
score >=70
and
course="语文") t1
on s.userid=t1.userid
where s.course="数学"
and s.term = t1.term ----》 加一个判断就是,对应的该学期的数学成绩
;
解法二:
select
*
from course_score t1
join course_score t2
on t1.userid = t2.userid and t1.course='语文' and t1.score>=90
and t2.id is not null and t2.course='数学'
---------------------------------
5分钟连续点击日志3次(当前点击日志时间 - 前3次点击时间 < 300s)
1,2011-12-07 13:01:03
1,2011-12-07 13:03:04
1,2011-12-07 13:02:03
1,2011-12-07 13:06:04
1,2011-12-07 13:08:09
2,2011-12-07 13:01:12
2,2011-12-07 13:01:15
2,2011-12-07 13:02:12
2,2011-12-07 13:03:12
2,2011-12-07 13:04:12
3,2011-12-07 13:04:12
3,2011-12-07 13:05:12
3,2011-12-07 13:07:12
3,2011-12-07 13:06:12
3,2011-12-07 13:10:12
create table url(
uid int,
dt timestamp
)
row format delimited
fields terminated by ',';
load data local inpath '/root/url.txt' into table url;
select distinct tmp.uid
from (
select uid,dt,
lag(dt,3) over(partition by uid order by dt asc) t
from url
) tmp
where (unix_timestamp(tmp.dt)-unix_timestamp(tmp.t))/60 < 5
=================================================
用一条语句写出每个人的居住地址和工作地址
user1
1,001,002
2,003,002
3,005,002
4,007,002
region
001,北京
002,上海
003,杭州
create table user1(
id int,
hc string,
jc string
)
row format delimited
fields terminated by ','
load data local inpath '/root/user1.txt' into table user1;
create table region(
code string,
city string
)
row format delimited
fields terminated by ','
load data local inpath '/root/region.txt' into table region;
select u1.id,r1.city,r2.city
from user1 u1
left join region r1
on u1.hc=r1.code
left join region r2
on u1.jc = r2.code
-------------------列转行-----------------------------
id,name,math,computer,english
1,huang,34,58,67
2,quan,54,60,69
create table score(
id int,
name string,
math int,
computer int,
english int
) row format delimited
fields terminated by ','
load data local inpath '/root/score.txt' into table score;
select id,name,course,score
from score
lateral view explode(str_to_map(concat('Math',':',Math,',','Computer',':',Computer,',','English',':',English),',')) tmp as course,score
结果:
id,name,course,score
1,huang,Math,34
1,huang,Computer,58
1,huang,English,67
2,quan,Math,54
2,quan,Computer,60
2,quan,English,69
select id,name,'Math' as course,math as score from score
union all
select id , name,'Computer' as course,computer as score from score
union all
select id,name,'English' as course,english as score from score
0 as computer: 直接写0会有默认的别名,可以自己设置as
常数可以直接输出呀!
select name,1 as computer from score
--------------
ID | type_flag | tags |
---|---|---|
10001 | 3 | 11_20_30,11_22_34,12_23_30,13_24_36 |
10002 | 2 | 11_20,11_22,12_23,13_24 |
10003 | 1 | 11,12 |
ID type_flag tag1 tag2 tag3
10001 3 11 20 30
10001 3 11 22 34
10001 3 12 23 30
10001 3 13 24 36
10002 2 11 20
10002 2 11 22
10002 2 12 23
10002 2 13 24
10003 1 11
10003 1 12
巧思:1、建表 2、type_flag对应数组的个数 :type_flag in (1,2,3),
create table flag
(
ID bigint,
type_flag int,
tags array
)
row format delimited
fields terminated by '\t'
collection items terminated by ',';
select
ID,type_flag,
(case when type_flag in (1,2,3) then arraySplit[0] else '' end) tag1,
(case when type_flag in (2,3) then arraySplit[1] else '' end) tag2,
(case when type_flag in (3) then arraySplit[2] else '' end) tag3
from (
select
ID,type_flag,
split(temptag,'_') arraySplit
from flag
lateral view explode(tags) temp as temptag
) tmp;
------------------
叶子:如果这个节点没有任何孩子节点。
根:如果这个节点是整棵树的根,即没有父节点。
内部节点:如果这个节点既不是叶子节点也不是根节点。
写一个查询语句,输出所有节点的编号和节点的类型,并将结果按照节点编号排序。
id p_id ====》 id Type
1 null ====》 1 Root
2 1 ====》 2 Inner
3 1 ====》 3 Leaf
4 2 ====》 4 Leaf
5 2 ====》 5 Leaf
create table stu(
id int,
p_id int
)row format delimited
fields terminated by ' ';
SELECT DISTINCT t1.id,
(CASE
WHEN t1.p_id IS NULL
THEN 'Root'
ELSE
CASE
WHEN t2.id IS NULL
THEN 'Leaf'
ELSE 'Inner'
END
END) AS Type
FROM stu t1 LEFT JOIN stu t2 ON t1.id=t2.p_id
----------------每个用户连续登录最大天数-------------------------------
id datetime
1,2019-07-26
1,2019-07-27
1,2019-07-30
1,2019-07-31
1,2019-08-01
2,2019-07-26
2,2019-07-27
2,2019-07-28
2,2019-07-30
2,2019-07-31
每个用户连续登录最大天数
create table login(
id int,
datetime string
) row format delimited
fields terminated by ','
load data local inpath '/root/login.txt' into table login;
select temp.id,max(temp.num)
from(
select tmp.id id,count(1) num
from (
select id,date_sub(datetime,row_number() over(partition by id order by datetime asc)) t
from login
group by id,datetime -- 保证同一天只有一条数据
) tmp
group by tmp.id,tmp.t
) temp
group by temp.id
最近七天内连续三天活跃用户数:
select mid_id
from
(
select mid_id
from
(
select
mid_id,
date_sub(dt,rank) date_diff
from
(
select
mid_id,
dt,
dense_rank() over(partition by mid_id order by dt) rank
from "$APP".dws_uv_detail_day
where dt>=date_add('$do_date',-6) and dt<='$do_date'
)t1
)t2
group by mid_id,date_diff
having count(*)>=3
)t3
group by mid_id
--------------两人通话记录总时长----------------------------------------
大不了自定义udf解决!
b,a,13:01:03 b,a,13:01:03
b,a,14:01:05 b,a,14:01:05
a,b,13:01:09 a,b,13:01:09
b,d,13:01:03 b,d,13:01:03
c,a,23:01:13 c,a,23:01:13
d,b,15:03:03 d,b,15:03:03
create table phone(
name string,
other string,
talktime string
) row format delimited
fields terminated by ','
load data local inpath '/root/phone.txt' overwrite into table phone;
from_unixtime(cast(sum(tt) as bigint),'mm'): 转换成日期格式,如果超过24小时会轮回的
所以这里只能使用累加分钟数即可(+号)
字符串比较大小等于(>、<、=),连接用concat
case when: 对一行数据进行判断数据;可以有多个case when输出多列;
sum(case when) : 适用于分组后,多行值;可以有多个case when输出多列;
select tmp.`one`,tmp.`two`,sum(tmp.`duration`)
from(
select
case when name>=other then name else other end as `one`,
case when name>=other then other else name end as `two`,
split(talktime,':')[0]*60*60 + split(talktime,':')[1]*60 + split(talktime,':')[2] as `duration`
from phone
) tmp
group by tmp.`one`,tmp.`two`
----------------------------------------------------------------------
======== 需求三:用户留存主题===============================
留存率:留存用户占当时新增用户(活跃用户)的比例即是留存率
10日对于11日留存率:
10日新增设备数的留存率= 10日新增设备数 且 11日活跃数 / 10日的新增设备数
`create_date` string comment '设备新增时间',
`retention_day` int comment '截止当前日期留存天数' (1,2,3,4,n)
留存怎么验证是否有数据呢?(修改成事务表)
把数据导出成文件,修改日期,再导回去?
(1,2,3,n天留存用户明细表): union all (开窗、left join、left semi join、join 、union all看情况选用)
insert overwrite table dws_user_retention_day
partition(dt="2019-09-22")
select
nm.mid_id,
nm.user_id,
nm.version_code,
nm.version_name,
nm.lang,
nm.source,
nm.os,
nm.area,
nm.model,
nm.brand,
nm.sdk_version,
nm.gmail,
nm.height_width,
nm.app_time,
nm.network,
nm.lng,
nm.lat,
nm.create_date,
1 retention_day
from dws_uv_detail_day ud join dws_new_mid_day nm on ud.mid_id =nm.mid_id
where ud.dt='2019-09-22' and nm.create_date=date_add('2019-09-22',-1)
union all
select
nm.mid_id,
nm.user_id ,
nm.version_code ,
nm.version_name ,
nm.lang ,
nm.source,
nm.os,
nm.area,
nm.model,
nm.brand,
nm.sdk_version,
nm.gmail,
nm.height_width,
nm.app_time,
nm.network,
nm.lng,
nm.lat,
nm.create_date,
2 retention_day
from dws_uv_detail_day ud join dws_new_mid_day nm on ud.mid_id =nm.mid_id
where ud.dt='2019-09-22' and nm.create_date=date_add('2019-09-22',-2)
union all
select
nm.mid_id,
nm.user_id ,
nm.version_code ,
nm.version_name ,
nm.lang ,
nm.source,
nm.os,
nm.area,
nm.model,
nm.brand,
nm.sdk_version,
nm.gmail,
nm.height_width,
nm.app_time,
nm.network,
nm.lng,
nm.lat,
nm.create_date,
3 retention_day
from dws_uv_detail_day ud join dws_new_mid_day nm on ud.mid_id =nm.mid_id
where ud.dt='2019-09-22' and nm.create_date=date_add('2019-09-22',-3);
-----------
ADS层
留存用户数:
retention_ratio decimal(10,2)
create external table ads_user_retention_day_count
(
create_date string comment '设备新增日期',
retention_day int comment '截止当前日期留存天数',
retention_count bigint comment '留存数量'
)COMMENT '每日用户留存情况'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_user_retention_day_count/';
insert into table ads_user_retention_day_count
select
create_date,
retention_day,
count(*) retention_count
from dws_user_retention_day
where dt='2019-09-22'
group by create_date,retention_day;
留存用户比率:
create external table ads_user_retention_day_rate
(
stat_date string comment '统计日期',
create_date string comment '设备新增日期',
retention_day int comment '截止当前日期留存天数',
retention_count bigint comment '留存数量',
new_mid_count bigint comment '当日设备新增数量',
retention_ratio decimal(10,2) comment '留存率'
) COMMENT '每日用户留存情况'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_user_retention_day_rate/';
insert into table ads_user_retention_day_rate
select
'2019-09-22', //指定一下,'统计日期',用于查看数据
ur.create_date,
ur.retention_day,
ur.retention_count,
nc.new_mid_count,
ur.retention_count/nc.new_mid_count*100
from ads_user_retention_day_count ur join ads_new_mid_count nc
on nc.create_date=ur.create_date;
------------------------------------------
表user_id,visit_date,page_name,plat
1,请统计近7天每天到访的新用户数; (注意:“新”字)
思路:1.求出每个用户及其第一次登陆的日期,也就是成为新用户的那天,过滤出近7天;2.在第1步所得表基础上提取每个用户成为新用户的日期,按这个时间分组,求得每天的用户数
select temp.firstDate, count(temp.user_id)
from
(select user_id, min(date_format(visit_date,'yyyy-MM-dd')) as firstDate
from person_visit
group by user_id
having firstDate >=date_sub(current_date(),7) and
firstDate
group by temp.firstDate;
2,统计每个访问渠道plat7天前的新用户的3日留存率和7日留存率 (注意:7天前(D-7)的那一天的新用户)
思路:1.求得每个渠道7前的总的新用户数;2.求3日留存用户数;3.求7日留存用户数;4.手动计算留存率
3日后的用户数/七天前的新用户数 = 3日留存 ;
3日留存/七天前的新用户数=3日留存率;
七天前的新用户数
select tmp.firstDate, tmp.plat, count(tmp.user_id) from(
select temp.user_id user_id,temp.firstDate firstDate,pv.plat plat
from
(select user_id, min(date_format(visit_date,'yyyy-MM-dd')) as firstDate
from person_visit
group by user_id
havng firstDate=date_sub(current_date(),7)
) as temp join person_visit pv
on temp.user_id=pv.user_id and temp.firstDate=date_format(pv.visit_date,'yyyy-MM-dd')
)tmp
group by tmp.plat;
3日留存
select plat,count(user_id)
from person_visit
where date_format(visit_date,'yyyy-MM-dd')=date_sub(current_date(),4)
and user_id in
(
select user_id
from person_visit
group by user_id
having min(date_format(visit_date,'yyyy-MM-dd'))=date_sub(current_date(),7)
) as temp
group by plat