Table of Contents
1. Tables
   1. Internal vs. external tables
   2. Regular vs. partitioned tables
   3. Static vs. dynamic partitions
2. Converting between internal and external tables
3. Complex data types
   1. arrays
   2. maps
   3. structs (like a Java bean)
4. Window functions
Create an external table:
CREATE EXTERNAL TABLE emp_manager2 (
empno decimal(4,0) ,
ename string ,
job string ,
mgr decimal(4,0) ,
hiredate string ,
sal decimal(7,2) ,
comm decimal(7,2) ,
deptno decimal(2,0)
)
row format delimited fields terminated by ','
stored as textfile;
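The TOC's "converting" item: a table can be flipped between internal (managed) and external by changing the EXTERNAL table property. A minimal sketch on the table above (note the TRUE/FALSE values are case-sensitive in older Hive versions, so uppercase is the safe choice):
alter table emp_manager2 set tblproperties('EXTERNAL'='FALSE');  -- external -> internal
alter table emp_manager2 set tblproperties('EXTERNAL'='TRUE');   -- internal -> external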
Sample data:
7369,SMITH,CLERK,7902,1980-12-17 ,800,,20
7499,ALLEN,SALESMAN,7698,1981-2-20 ,1600,300,30
7521,WARD,SALESMAN,7698,1981-2-22 ,1250,500,30
7566,JONES,MANAGER,7839,1981-4-2 ,2975,,20
7654,MARTIN,SALESMAN,7698,1981-9-28 ,1250,1400,30
7698,BLAKE,MANAGER,7839,1981-5-1 ,2850,,30
7782,CLARK,MANAGER,7839,1981-6-9 ,2450,,10
7788,SCOTT,ANALYST,7566,1982-12-9 ,3000,,20
7839,KING,PRESIDENT,,1981-11-17 ,5000,,10
7844,TURNER,SALESMAN,7698,1981-9-8 ,1500,0,30
7876,ADAMS,CLERK,7788,1983-1-12 ,1100,,20
7900,JAMES,CLERK,7698,1981-12-3 ,950,,30
7902,FORD,ANALYST,7566,1981-12-3 ,3000,,20
7934,MILLER,CLERK,7782,1982-1-23 ,1300,,10
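To load this file into the table (the local path here is made up; adjust to wherever the file actually lives):
load data local inpath '/home/hadoop/tmp/emp.txt' into table emp_manager2;
The inserts below read from a table emp with the same schema and the same data.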
Create a partitioned table:
CREATE TABLE emp_p (
empno decimal(4,0) ,
ename string ,
job string ,
mgr decimal(4,0) ,
hiredate string ,
sal decimal(7,2) ,
comm decimal(7,2)
)
PARTITIONED BY (deptno decimal(2,0))
row format delimited fields terminated by ','
stored as textfile;
Static partition: the partition value is fixed in the statement
insert into table emp_p partition(deptno=20)
select
empno,
ename,
job ,
mgr ,
hiredate,
sal ,
comm
from emp where deptno=20;
-- overwrite the partition's existing data
insert overwrite table emp_p partition(deptno=20)
select
empno,
ename,
job ,
mgr ,
hiredate,
sal ,
comm
from emp where deptno=20;
Dynamic partitions: the partition value comes from the last select column, so nonstrict mode must be enabled first
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table emp_p partition(deptno)
select
empno,
ename,
job ,
mgr ,
hiredate,
sal ,
comm ,
deptno
from emp;
Same, but restricted to one department:
insert overwrite table emp_p partition(deptno)
select
empno,
ename,
job ,
mgr ,
hiredate,
sal ,
comm ,
deptno
from emp where deptno=20;
Equivalent, supplying the partition value as a constant in the select list:
insert overwrite table emp_p partition(deptno)
select
empno,
ename,
job ,
mgr ,
hiredate,
sal ,
comm ,
20 as deptno
from emp where deptno=20;
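To verify which partitions the statements above created:
show partitions emp_p;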
Complex type 1: arrays
create table hive_array(
name string,
locations array<string>
)
row format delimited fields terminated by '\t'
collection items terminated by ',';
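Hypothetical sample data (tab between the two columns, commas inside the array) plus a load; the file path is made up:
zhangsan	beijing,shanghai,tianjin
lisi	changchun,chengdu,wuhan
load data local inpath '/home/hadoop/tmp/hive_array.txt' into table hive_array;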
-- index into the array
select name, locations[0] as first_loc_work from hive_array;
-- array length
select name, size(locations) from hive_array;
-- membership test
select * from hive_array where array_contains(locations,'shanghai');
-- flatten: one output row per array element
select name, location
from hive_array lateral view explode(locations) loc_table as location;
Complex type 2: maps
create table hive_map(
id int comment 'user id',
name string comment 'user name',
relation map<string,string> comment 'family members',
age int comment 'age'
)
row format delimited fields terminated by ','
collection items terminated by '#'
map keys terminated by ':';
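Hypothetical sample data matching the delimiters above (',' between fields, '#' between map entries, ':' between key and value), plus a load; path made up:
1,zhangsan,father:xiaoming#mother:xiaohuang#brother:xiaoxu,28
2,lisi,father:mayun#mother:huangyi,22
load data local inpath '/home/hadoop/tmp/hive_map.txt' into table hive_map;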
-- look up a value by key
select id,name,age,relation['father'] as father from hive_map;
-- all keys / all values of the map
select id,name,age,map_keys(relation) as members from hive_map;
select id,name,age,map_values(relation) as members from hive_map;
select
id,name,age,relation['brother'] as brother
from hive_map
where
relation['brother'] is not null;
Or equivalently:
select
id,name,age,relation['brother'] as brother
from hive_map
where
array_contains(map_keys(relation), 'brother');
Complex type 3: structs
create table hive_struct(
ip string,
userinfo struct<name:string,age:int>
)
row format delimited fields terminated by '#'
collection items terminated by ':';
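The original struct field list was lost in these notes; assuming userinfo is struct<name:string,age:int> as written above, the data format and field access look like this (path and values made up):
192.168.1.1#zhangsan:40
192.168.1.2#lisi:50
load data local inpath '/home/hadoop/tmp/hive_struct.txt' into table hive_struct;
-- struct fields are accessed with dot notation
select ip, userinfo.name, userinfo.age from hive_struct;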
Functions:
1. Built into window functions:
   1. ranking functions
   2. sequence functions (lag/lead-style)
2. Aggregate functions: many rows are collapsed into one row by some rule;
   in principle, rows after aggregation <= rows before aggregation.
rank()
rank() over(partition by xx order by xxx) as rk
Numbers the rows in each group starting from 1; ties share the same rank and leave gaps after them.
row_number()
row_number() over(partition by xx order by xxx) as rn
Numbers the rows in each group starting from 1; ties still get distinct, consecutive numbers.
dense_rank()
dense_rank() over(partition by xx order by xxx) as dk
Numbers the rows in each group starting from 1; ties share the same rank and leave no gaps.
Example
Data
甜甜,2022-11-10,1
甜甜,2022-11-11,5
甜甜,2022-11-12,5
甜甜,2022-11-13,3
甜甜,2022-11-14,2
甜甜,2022-11-15,4
甜甜,2022-11-16,4
Create table: create table user_mt3 like user_mt2;
Load data: load data local inpath '/home/hadoop/tmp/date/mt_test.txt' into table user_mt3;
Query:
select
name,
dt,
cnt,
rank() over(partition by name order by cnt desc) as rk,
row_number() over(partition by name order by cnt desc) as rn,
dense_rank() over(partition by name order by cnt desc) as dk
from user_mt3;
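Expected result on the data above (row_number breaks the two ties arbitrarily):
name  dt          cnt  rk  rn  dk
甜甜  2022-11-11  5    1   1   1
甜甜  2022-11-12  5    1   2   1
甜甜  2022-11-15  4    3   3   2
甜甜  2022-11-16  4    3   4   2
甜甜  2022-11-13  3    5   5   3
甜甜  2022-11-14  2    6   6   4
甜甜  2022-11-10  1    7   7   5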
Case study
Show the pre-aggregation rows together with a result computed across them.
id name sal
1  zs   3w
2  ls   2.5w
3  ww   2w
Requirement: order by salary descending and show each row's rank alongside it:
id name sal   rank
1  zs   3w    1
2  ls   2.5w  2
3  ww   2w    3
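A minimal sketch of that requirement, assuming the three rows sit in a table emp_sal(id, name, sal) (hypothetical table name):
select
id,
name,
sal,
rank() over(order by sal desc) as rk
from emp_sal;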
Case 1
Data
haige,2022-11-10,1
haige,2022-11-11,5
haige,2022-11-12,7
haige,2022-11-13,3
haige,2022-11-14,2
haige,2022-11-15,4
haige,2022-11-16,4
Create table
create table user_mt2 (
name string,
dt string,
cnt int
)
row format delimited fields terminated by ',' ;
Load data: load data local inpath '/home/hadoop/tmp/mt.txt' into table user_mt2;
Problem: running totals, i.e. each user's cumulative takeout-order count day by day.
select
name ,
dt ,
cnt ,
sum(cnt) over(partition by name order by dt ) as sum_cnt
from user_mt2;
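Expected result: since dt is unique per user, the sum simply accumulates down the date order:
name   dt          cnt  sum_cnt
haige  2022-11-10  1    1
haige  2022-11-11  5    6
haige  2022-11-12  7    13
haige  2022-11-13  3    16
haige  2022-11-14  2    18
haige  2022-11-15  4    22
haige  2022-11-16  4    26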
Note: window functions are evaluated after GROUP BY, so in a single grouped query a window function may only reference grouping keys and aggregates; for anything else, aggregate in a subquery first (as Case 2 below does). The legal combined form is sketched right after this note.
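A sketch of the legal combination (the window function touches only the grouping key and an aggregate), reusing the emp table from earlier:
select
deptno,
sum(sal) as sal_sum,
rank() over(order by sum(sal) desc) as rk
from emp
group by deptno;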
Specifying the window frame (window size)
select
name ,
dt ,
cnt ,
sum(cnt) over(partition by name order by dt ) as sum_cnt,
sum(cnt) over(partition by name order by dt ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW ) as sum_cnt2,
sum(cnt) over(partition by name order by dt ROWS BETWEEN 3 PRECEDING AND CURRENT ROW ) as sum_cnt3,
sum(cnt) over(partition by name order by dt ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING ) as sum_cnt4
from user_mt2;
ROWS BETWEEN 3 PRECEDING AND CURRENT ROW: the previous three rows plus the current row
ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING: the current row plus the next row
(With an ORDER BY but no frame clause, the default frame is RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW.)
select
name ,
dt ,
cnt ,
sum(cnt) over(partition by name order by dt ) as sum_cnt,
sum(cnt) over( order by dt ) as sum_cnt1
from user_mt2;
Case 2
Data
userid dt cnt
u01,2017/01/21,5
u02,2017/01/23,6
u03,2017/01/22,8
u04,2017/01/20,3
u01,2017/01/23,6
u01,2017/02/21,8
u02,2017/01/23,6
u01,2017/02/22,4
Requirement:
Use SQL to compute each user's cumulative visit count by month.
userid  month    monthly  cumulative
u01     2017-01  11       11
u01     2017-02  12       23
u02     2017-01  12       12
Create table
create table user_log(
userid string,
dt string,
cnt int
)
row format delimited fields terminated by ',' ;
Load data
load data local inpath "/home/hadoop/tmp/data/exemple/user_visit.txt" into table user_log;
Steps
1. Compute each user's visit count per month.
dt: 2017/01/21 => 2017-01-21
mon: 2017-01
select
userid,
date_format(replace(dt,'/','-'),'yyyy-MM') as mon,
sum(cnt) as cnt_sum
from user_log
group by
userid,date_format(replace(dt,'/','-'),'yyyy-MM');
2. On top of that result, compute the cumulative visit count.
select
userid,
mon,
cnt_sum,
sum(cnt_sum) over(partition by userid order by mon) as cnt_all
from
(
select
userid,
date_format(replace(dt,'/','-'),'yyyy-MM') as mon,
sum(cnt) as cnt_sum
from user_log
group by
userid,date_format(replace(dt,'/','-'),'yyyy-MM')
) as a;
Requirement:
JD.com shop visit analysis
Data
[hadoop@bigdata33 data]$ cat user_shop.txt
user_id shop
u1,a
u2,b
u1,b
u1,a
u3,c
u4,b
u1,a
u2,c
u5,b
u4,b
u6,c
u2,c
u1,b
u2,a
u2,a
u3,a
u5,a
u5,a
u5,a
pv => page views: 3 users each visiting 10 pages gives pv = 30
uv => unique visitors: 3 users each visiting 10 pages gives uv = 3
Requirements:
1. Each shop's UV:
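A sketch for this one, counting distinct visitors per shop (table name taobao, as the queries below use):
select
shop,
count(distinct user_id) as uv
from taobao
group by shop;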
2. The total number of shops:
select
count(distinct shop) as shop_cnt
from taobao;
3. Shops ranked by visit count (the query keeps the top 3):
select
shop,
cnt,
rk
from
(
select
shop,
cnt,
rank() over(order by cnt desc) as rk
from
(
select
shop,
count(1) as cnt
from taobao
group by shop
)as a
)as a
where rk < 4;
4. For each shop, the top-3 visitors by visit count
Output: shop name, visitor id, visit count
select
shop,
user_id,
cnt,
rk
from
(
select
shop,
user_id,
cnt,
rank() over(partition by shop order by cnt desc) as rk
from
(
select
shop,
user_id,
count(1) as cnt
from taobao
group by shop, user_id
)as a
)as a
where rk < 4;