hive经典案例需求

=============hql补强点!练习题===============================
  
01,01,80
01,02,90
01,03,99
02,01,70
02,03,80
03,03,80
04,01,50
04,02,30


-- Sample table of (name, course, score) integer triples, parsed from
-- comma-delimited text rows.
CREATE TABLE test1 (
    name   INT,
    course INT,
    score  INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';
    
left join 反向过滤(anti join):先找出存在低于 80 分成绩的学生,再用 left join + is null 把他们排除,得到所有成绩都不低于 80 分的学生   ---  推荐!
-- Anti-join: return the rows of every name that has no score below 80.
-- The subquery lists each name with at least one score < 80 (DISTINCT so the
-- join never multiplies rows); the LEFT JOIN + IS NULL filter then removes
-- all rows belonging to those names.
-- Only test1's own columns are returned: the joined column is always NULL
-- after the filter, so it carries no information.
SELECT
    t.name,
    t.course,
    t.score
FROM test1 t
LEFT JOIN (
    SELECT DISTINCT name
    FROM test1
    WHERE score < 80
) low
    ON t.name = low.name
WHERE low.name IS NULL;

 

----

编写sql完成如下查询,一次查询实现最好,也可以写多次查询实现:

1、查出每个学期每门课程最高分记录(包含全部5个字段)     

----  又有聚合结果,又有详细信息; 1、开窗(聚合操作)      2、自连接,双join

-- Highest-scoring record(s) for every (term, course) pair, all five columns.
-- RANK() is used instead of ROW_NUMBER() so that records tied for the top
-- score are all returned rather than one arbitrary row per group.
SELECT
    ranked.id,
    ranked.userid,
    ranked.course,
    ranked.score,
    ranked.term
FROM (
    SELECT
        id,
        userid,
        course,
        score,
        term,
        RANK() OVER (PARTITION BY term, course ORDER BY score DESC) AS score_rank
    FROM course_score
) ranked
WHERE ranked.score_rank = 1;

---

-- Highest-scoring record(s) for every (term, course) pair, self-join variant.
-- The derived table computes the maximum score per term and course; joining
-- back on all three columns recovers the full rows that hold that maximum.
-- (Grouping by course/userid would instead yield each user's personal best.)
SELECT
    s.id,
    s.userid,
    s.course,
    s.score,
    s.term
FROM course_score s
INNER JOIN (
    SELECT
        term,
        course,
        MAX(score) AS score
    FROM course_score
    GROUP BY term, course
) best
    -- all three conditions together identify the top row(s) of each group
    ON  s.term   = best.term
    AND s.course = best.course
    AND s.score  = best.score
;

 

2、查出单个学期中语文课在90分以上的学生的数学成绩记录(包含全部5个字段)   

----  理解有些偏差,符合条件的单个学期的学生对应的该学期的数学成绩,还是对应所有学期数学成绩(注意:的,的,理解就对了-这里偏所有)

-- Math records (all five columns) of students whose Chinese score in a term
-- is 90 or above.
-- The derived table picks the qualifying (userid, term) pairs; the threshold
-- is 90 as stated in the requirement.
SELECT
    s.id,
    s.userid,
    s.course,
    s.score,
    s.term
FROM course_score s
INNER JOIN (
    SELECT
        userid,
        term
    FROM course_score
    WHERE score >= 90
      AND course = "语文"
) t1
    ON s.userid = t1.userid
WHERE s.course = "数学"
  -- this extra predicate restricts the result to the math score of the same
  -- term; drop it to return the student's math scores from all terms
  AND s.term = t1.term
;

-- Solution 2: plain self-join. t1 is the Chinese record scoring 90 or above,
-- t2 is the same student's math record. Only t2's five columns are returned,
-- because the requirement asks for the math records. No term predicate is
-- applied, so math records from every term are returned.
SELECT
    t2.id,
    t2.userid,
    t2.course,
    t2.score,
    t2.term
FROM course_score t1
INNER JOIN course_score t2
    ON t1.userid = t2.userid
WHERE t1.course = '语文'
  AND t1.score >= 90
  AND t2.course = '数学';

---------------------------------

5分钟连续点击日志3次(当前点击日志时间 - 前3次点击时间 < 300s)
 
  
1,2011-12-07 13:01:03
1,2011-12-07 13:03:04
1,2011-12-07 13:02:03
1,2011-12-07 13:06:04
1,2011-12-07 13:08:09
2,2011-12-07 13:01:12
2,2011-12-07 13:01:15
2,2011-12-07 13:02:12
2,2011-12-07 13:03:12
2,2011-12-07 13:04:12
3,2011-12-07 13:04:12
3,2011-12-07 13:05:12
3,2011-12-07 13:07:12
3,2011-12-07 13:06:12
3,2011-12-07 13:10:12


-- Click log: one row per click, holding the user id and the click timestamp.
-- Rows are comma-delimited text.
CREATE TABLE url (
    uid INT,
    dt  TIMESTAMP
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';


-- Populate the table from a file on the local filesystem.
LOAD DATA LOCAL INPATH '/root/url.txt'
INTO TABLE url;


-- Users with at least one click that happened less than five minutes after
-- the click three positions earlier (same uid, ordered by time).
-- Rows with fewer than three earlier clicks have a NULL lag value and are
-- dropped by the comparison.
SELECT DISTINCT clicks.uid
FROM (
    SELECT
        uid,
        dt,
        LAG(dt, 3) OVER (PARTITION BY uid ORDER BY dt ASC) AS t
    FROM url
) clicks
WHERE (UNIX_TIMESTAMP(clicks.dt) - UNIX_TIMESTAMP(clicks.t)) / 60 < 5


=================================================


用一条语句写出每个人的居住地址和工作地址

user1
1,001,002
2,003,002
3,005,002
4,007,002


region
001,北京
002,上海
003,杭州

-- One row per user with two region codes (hc, jc) that reference region.code.
-- Rows are comma-delimited text.
CREATE TABLE user1 (
    id INT,
    hc STRING,
    jc STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';  -- terminator required before the next statement


-- Populate the table from a file on the local filesystem.
LOAD DATA LOCAL INPATH '/root/user1.txt' INTO TABLE user1;

-- Lookup table mapping a region code to its city name.
-- Rows are comma-delimited text.
CREATE TABLE region (
    code STRING,
    city STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';  -- terminator required before the next statement


-- Populate the table from a file on the local filesystem.
LOAD DATA LOCAL INPATH '/root/region.txt' INTO TABLE region;

-- Resolve both region codes of each user to city names in one statement by
-- joining the region lookup twice. LEFT JOINs keep users whose code has no
-- match in region (the city is then NULL).
-- NOTE(review): assumes hc is the home code and jc the job code - confirm.
SELECT
    u.id,
    home_region.city AS home_city,
    job_region.city  AS job_city
FROM user1 u
LEFT JOIN region home_region
    ON u.hc = home_region.code
LEFT JOIN region job_region
    ON u.jc = job_region.code;

-------------------列转行-----------------------------

id,name,math,computer,english
1,huang,34,58,67
2,quan,54,60,69


-- Wide score table: one row per student, one integer column per subject.
-- Rows are comma-delimited text.
CREATE TABLE score (
    id       INT,
    name     STRING,
    math     INT,
    computer INT,
    english  INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';  -- terminator required before the next statement

-- Populate the table from a file on the local filesystem.
LOAD DATA LOCAL INPATH '/root/score.txt' INTO TABLE score;

-- Unpivot the three subject columns into (course, score) rows.
-- CONCAT builds a "Math:<v>,Computer:<v>,English:<v>" string, STR_TO_MAP
-- splits it into a map on ',' (key/value split on the default ':'), and
-- EXPLODE emits one row per map entry.
SELECT
    id,
    name,
    course,
    score
FROM score
LATERAL VIEW EXPLODE(
    STR_TO_MAP(
        CONCAT('Math', ':', Math, ',', 'Computer', ':', Computer, ',', 'English', ':', English),
        ','
    )
) tmp AS course, score


结果:
id,name,course,score
1,huang,Math,34
1,huang,Computer,58
1,huang,English,67
2,quan,Math,54
2,quan,Computer,60
2,quan,English,69

-- Unpivot via UNION ALL: one branch per subject column, each tagging its rows
-- with a constant course label.
SELECT
    id,
    name,
    'Math' AS course,
    math   AS score
FROM score
UNION ALL
SELECT
    id,
    name,
    'Computer' AS course,
    computer   AS score
FROM score
UNION ALL
SELECT
    id,
    name,
    'English' AS course,
    english   AS score
FROM score


0 as computer: 直接写0会有默认的别名,可以自己设置as
常数可以直接输出呀!
-- A literal can be selected directly; AS gives the constant column a name.
SELECT
    name,
    1 AS computer
FROM score

 

--------------

ID type_flag tags
10001 3 11_20_30,11_22_34,12_23_30,13_24_36
10002 2 11_20,11_22,12_23,13_24
10003 1 11,12

ID    type_flag    tag1    tag2    tag3
10001    3    11    20    30
10001    3    11    22    34
10001    3    12    23    30
10001    3    13    24    36
10002    2    11    20    
10002    2    11    22    
10002    2    12    23    
10002    2    13    24    
10003    1    11        
10003    1    12    

巧思:1、建表     2、type_flag对应数组的个数  :type_flag  in (1,2,3),

-- One row per ID with a type flag and a list of tag groups.
-- Columns are tab-delimited; the tags column is itself a comma-separated
-- list, read as an array of strings (ARRAY must declare its element type).
CREATE TABLE flag (
    ID        BIGINT,
    type_flag INT,
    tags      ARRAY<STRING>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    COLLECTION ITEMS TERMINATED BY ',';

-- Emit one row per tag group and spread its '_'-separated parts over
-- tag1..tag3. type_flag decides how many parts are shown; the remaining tag
-- columns are returned as empty strings.
SELECT
    ID,
    type_flag,
    CASE WHEN type_flag IN (1, 2, 3) THEN parts[0] ELSE '' END AS tag1,
    CASE WHEN type_flag IN (2, 3)    THEN parts[1] ELSE '' END AS tag2,
    CASE WHEN type_flag IN (3)       THEN parts[2] ELSE '' END AS tag3
FROM (
    -- one row per element of the tags array, already split on '_'
    SELECT
        ID,
        type_flag,
        SPLIT(tag_group, '_') AS parts
    FROM flag
    LATERAL VIEW EXPLODE(tags) exploded AS tag_group
) per_group;

------------------

叶子:如果这个节点没有任何孩子节点。

根:如果这个节点是整棵树的根,即没有父节点。

内部节点:如果这个节点既不是叶子节点也不是根节点。

写一个查询语句,输出所有节点的编号和节点的类型,并将结果按照节点编号排序。

id p_id  ====》 id Type 
1 null   ====》 1 Root 
2 1      ====》 2 Inner 
3 1      ====》 3 Leaf 
4 2      ====》 4 Leaf 
5 2      ====》 5 Leaf

 

-- Tree stored as an adjacency list: each row is a node id and its parent id.
-- Rows are space-delimited text.
CREATE TABLE stu (
    id   INT,
    p_id INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ' ';


-- Classify every node as Root / Inner / Leaf and list the result by node id.
-- Each node is left-joined to its children: a node without a parent is the
-- root, a non-root node with no matching child row is a leaf, and any other
-- node is an inner node. DISTINCT collapses the one-row-per-child fan-out.
SELECT DISTINCT
    node.id,
    CASE
        WHEN node.p_id IS NULL THEN 'Root'
        WHEN child.id IS NULL  THEN 'Leaf'
        ELSE 'Inner'
    END AS Type
FROM stu node
LEFT JOIN stu child
    ON node.id = child.p_id
-- the requirement asks for the output sorted by node id
ORDER BY id;

 

----------------每个用户连续登录最大天数-------------------------------

id   datetime
1,2019-07-26
1,2019-07-27
1,2019-07-30
1,2019-07-31
1,2019-08-01
2,2019-07-26
2,2019-07-27
2,2019-07-28
2,2019-07-30
2,2019-07-31

每个用户连续登录最大天数

-- Login log: user id and login date kept as a string.
-- Rows are comma-delimited text.
CREATE TABLE login (
    id       INT,
    datetime STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';  -- terminator required before the next statement

-- Populate the table from a file on the local filesystem.
LOAD DATA LOCAL INPATH '/root/login.txt' INTO TABLE login;

-- Longest run of consecutive login days per user.
-- Subtracting the per-user row number from the login date yields the same
-- anchor date for every day of one consecutive run, so counting rows per
-- (id, anchor) gives each run's length and MAX picks the longest.
SELECT
    runs.id,
    MAX(runs.num)
FROM (
    SELECT
        days.id  AS id,
        COUNT(1) AS num
    FROM (
        SELECT
            id,
            DATE_SUB(
                datetime,
                ROW_NUMBER() OVER (PARTITION BY id ORDER BY datetime ASC)
            ) AS t
        FROM login
        -- collapse repeated logins so each day appears only once per user
        GROUP BY id, datetime
    ) days
    GROUP BY days.id, days.t
) runs
GROUP BY runs.id

 

最近七天内连续三天活跃用户数:
 select mid_id
    from
    (
        select mid_id
        from 
        (
            select
                mid_id,
                date_sub(dt,rank) date_diff
            from 
            (
                select 
                    mid_id,
                    dt,
                    dense_rank() over(partition by mid_id order by dt) rank
                from "$APP".dws_uv_detail_day
                where dt>=date_add('$do_date',-6) and dt<='$do_date'
            )t1
        )t2
        group by mid_id,date_diff
        having count(*)>=3
    )t3 
    group by mid_id     

 

--------------两人通话记录总时长----------------------------------------

大不了自定义udf解决!

b,a,13:01:03         b,a,13:01:03    
b,a,14:01:05         b,a,14:01:05
a,b,13:01:09         a,b,13:01:09
b,d,13:01:03         b,d,13:01:03
c,a,23:01:13         c,a,23:01:13
d,b,15:03:03         d,b,15:03:03

-- Call records: the two parties and a talk time string in HH:mm:ss form.
-- Rows are comma-delimited text.
CREATE TABLE phone (
    name     STRING,
    other    STRING,
    talktime STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';  -- terminator required before the next statement

-- Replace the table contents with a file from the local filesystem.
LOAD DATA LOCAL INPATH '/root/phone.txt' OVERWRITE INTO TABLE phone;


from_unixtime(cast(sum(tt) as bigint),'mm'): 转换成日期格式,如果超过24小时会轮回的
所以这里只能使用累加分钟数即可(+号)
字符串比较大小等于(>、<、=),连接用concat

case when: 对一行数据进行判断数据;可以有多个case when输出多列;
sum(case when) : 适用于分组后,多行值;可以有多个case when输出多列;


-- Total talk time in seconds for each unordered pair of callers.
-- The two names are put in a fixed order (larger string first) so that
-- a->b and b->a land in the same group; talktime is split on ':' and
-- converted to seconds before summing.
SELECT
    pairs.`one`,
    pairs.`two`,
    SUM(pairs.`duration`)
FROM (
    SELECT
        CASE WHEN name >= other THEN name  ELSE other END AS `one`,
        CASE WHEN name >= other THEN other ELSE name  END AS `two`,
        hms[0] * 60 * 60 + hms[1] * 60 + hms[2]           AS `duration`
    FROM (
        SELECT
            name,
            other,
            SPLIT(talktime, ':') AS hms
        FROM phone
    ) calls
) pairs
GROUP BY pairs.`one`, pairs.`two`

 

----------------------------------------------------------------------

======== 需求三:用户留存主题===============================

留存率:留存用户占当时新增用户(活跃用户)的比例即是留存率

10日对于11日留存率:
10日新增设备的1日留存率 = (10日新增 且 11日活跃的设备数) / 10日新增设备数

`create_date`    string  comment '设备新增时间',
`retention_day`  int comment '截止当前日期留存天数'   (1,2,3,4,n)

留存怎么验证是否有数据呢?(修改成事务表)
把数据导出成文件,修改日期,再导回去?


(1,2,3,n天留存用户明细表):  union all   (开窗、left join、left semi join、join 、union all看情况选用)
-- Retention detail for activity date 2019-09-22.
-- A device is retained when it is active on 2019-09-22 and was first seen
-- 1, 2 or 3 days earlier. The active-device table is joined to the
-- new-device table once, and retention_day is derived from the distance
-- between the activity date and create_date, instead of repeating the same
-- join in three UNION ALL branches that differ only in the day offset.
INSERT OVERWRITE TABLE dws_user_retention_day
PARTITION (dt = "2019-09-22")
SELECT
    nm.mid_id,
    nm.user_id,
    nm.version_code,
    nm.version_name,
    nm.lang,
    nm.source,
    nm.os,
    nm.area,
    nm.model,
    nm.brand,
    nm.sdk_version,
    nm.gmail,
    nm.height_width,
    nm.app_time,
    nm.network,
    nm.lng,
    nm.lat,
    nm.create_date,
    -- 1, 2 or 3 for the rows admitted by the WHERE clause below
    DATEDIFF('2019-09-22', nm.create_date) AS retention_day
FROM dws_uv_detail_day ud
INNER JOIN dws_new_mid_day nm
    ON ud.mid_id = nm.mid_id
WHERE ud.dt = '2019-09-22'
  AND (
         nm.create_date = DATE_ADD('2019-09-22', -1)
      OR nm.create_date = DATE_ADD('2019-09-22', -2)
      OR nm.create_date = DATE_ADD('2019-09-22', -3)
  );

-----------

ADS层
留存用户数:
retention_ratio decimal(10,2)


-- ADS table holding the number of retained devices per
-- (create_date, retention_day). External table stored as tab-delimited text
-- at the given location.
CREATE EXTERNAL TABLE ads_user_retention_day_count (
    create_date     STRING COMMENT '设备新增日期',
    retention_day   INT    COMMENT '截止当前日期留存天数',
    retention_count BIGINT COMMENT '留存数量'
)
COMMENT '每日用户留存情况'
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ads/ads_user_retention_day_count/';


-- Append the retained-device counts computed from the 2019-09-22 partition
-- of the retention detail table, one row per (create_date, retention_day).
INSERT INTO TABLE ads_user_retention_day_count
SELECT
    create_date,
    retention_day,
    COUNT(*) AS retention_count
FROM dws_user_retention_day
WHERE dt = '2019-09-22'
GROUP BY
    create_date,
    retention_day;


留存用户比率:
-- ADS table holding the retention ratio per (create_date, retention_day),
-- together with the counts it is derived from. External table stored as
-- tab-delimited text at the given location.
CREATE EXTERNAL TABLE ads_user_retention_day_rate (
    stat_date       STRING         COMMENT '统计日期',
    create_date     STRING         COMMENT '设备新增日期',
    retention_day   INT            COMMENT '截止当前日期留存天数',
    retention_count BIGINT         COMMENT '留存数量',
    new_mid_count   BIGINT         COMMENT '当日设备新增数量',
    retention_ratio DECIMAL(10, 2) COMMENT '留存率'
)
COMMENT '每日用户留存情况'
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ads/ads_user_retention_day_rate/';

-- Append retention ratios: retained count divided by the number of devices
-- first seen on the same create_date, expressed as a percentage.
INSERT INTO TABLE ads_user_retention_day_rate
SELECT
    '2019-09-22',             -- stat_date: fixed statistics date, used when inspecting the data
    ur.create_date,
    ur.retention_day,
    ur.retention_count,
    nc.new_mid_count,
    ur.retention_count / nc.new_mid_count * 100
FROM ads_user_retention_day_count ur
INNER JOIN ads_new_mid_count nc
    ON nc.create_date = ur.create_date;

 

------------------------------------------

表user_id,visit_date,page_name,plat

1,请统计近7天每天到访的新用户数; (注意:“新”字)

    思路:1.求出每个用户及其第一次登陆的日期,也就是成为新用户的那天,过滤出近7天;2.在第1步所得表基础上提取每个用户成为新用户的日期,按这个时间分组,求得每天的用户数

-- New users per day over the recent window.
-- A user's first visit date is the day they became a new user; users whose
-- first visit falls inside the window are kept and counted per first-visit
-- day. The aggregate is repeated in HAVING rather than referenced by alias.
-- NOTE(review): the original second HAVING predicate was left unfinished;
-- an upper bound of today is assumed here - confirm the intended window.
SELECT
    temp.firstDate,
    COUNT(temp.user_id)
FROM (
    SELECT
        user_id,
        MIN(DATE_FORMAT(visit_date, 'yyyy-MM-dd')) AS firstDate
    FROM person_visit
    GROUP BY user_id
    HAVING MIN(DATE_FORMAT(visit_date, 'yyyy-MM-dd')) >= DATE_SUB(CURRENT_DATE(), 7)
       AND MIN(DATE_FORMAT(visit_date, 'yyyy-MM-dd')) <= CURRENT_DATE()
) temp
GROUP BY temp.firstDate;

 

2,统计每个访问渠道plat7天前的新用户的3日留存率和7日留存率  (注意:7天前(D-7)的那一天的新用户)
 
 思路:1.求得每个渠道7天前的新用户总数;2.求3日留存用户数;3.求7日留存用户数;4.用留存用户数除以新用户数得到留存率

 3日留存用户数 = 七天前的新用户中,新增后第3天仍然访问的用户数;
 3日留存率 = 3日留存用户数 / 七天前的新用户数;
 

七天前的新用户数

-- Number of new users per plat for the cohort whose first visit was exactly
-- seven days ago.
-- The inner query finds each cohort user's first visit date; joining back to
-- person_visit on that date recovers the plat(s) used on the first day.
-- firstDate is part of the GROUP BY because it is selected, and users are
-- counted with DISTINCT so several visits on the first day count once.
SELECT
    tmp.firstDate,
    tmp.plat,
    COUNT(DISTINCT tmp.user_id)
FROM (
    SELECT
        temp.user_id   AS user_id,
        temp.firstDate AS firstDate,
        pv.plat        AS plat
    FROM (
        SELECT
            user_id,
            MIN(DATE_FORMAT(visit_date, 'yyyy-MM-dd')) AS firstDate
        FROM person_visit
        GROUP BY user_id
        HAVING MIN(DATE_FORMAT(visit_date, 'yyyy-MM-dd')) = DATE_SUB(CURRENT_DATE(), 7)
    ) temp
    INNER JOIN person_visit pv
        ON  temp.user_id   = pv.user_id
        AND temp.firstDate = DATE_FORMAT(pv.visit_date, 'yyyy-MM-dd')
) tmp
GROUP BY tmp.firstDate, tmp.plat;


3日留存
-- 3-day retained users per plat: members of the cohort first seen seven days
-- ago who visited again three days after that (i.e. four days ago).
-- An IN subquery takes no alias, and users are counted with DISTINCT so that
-- several visits on the retention day count once.
SELECT
    plat,
    COUNT(DISTINCT user_id)
FROM person_visit
WHERE DATE_FORMAT(visit_date, 'yyyy-MM-dd') = DATE_SUB(CURRENT_DATE(), 4)
  AND user_id IN (
      SELECT user_id
      FROM person_visit
      GROUP BY user_id
      HAVING MIN(DATE_FORMAT(visit_date, 'yyyy-MM-dd')) = DATE_SUB(CURRENT_DATE(), 7)
  )
GROUP BY plat;

你可能感兴趣的:(大数据资料笔记整理)