Contents
table
1. Create tables: create table
Common data types
Creating a Hive table
2. Loading data
3. DML:
1. load: loading data
2. insert
3. Truncating table data
4. select
1. where: filter conditions
2. order by: sort syntax
3. like: fuzzy matching
4. Combining tables
5. NULL handling
6. Grouping, aggregate functions, join
7. join
8. Window functions
5. command line
create table test (
  id int comment 'user id',
  name string,
  age bigint
) comment 'test table'
row format delimited fields terminated by ','
stored as textfile;
// emp table data (emp.txt)
7369,SMITH,CLERK,7902,1980-12-17 14:00,800,,20
7499,ALLEN,SALESMAN,7698,1981-2-20 14:00,1600,300,30
7521,WARD,SALESMAN,7698,1981-2-22 14:00,1250,500,30
7566,JONES,MANAGER,7839,1981-4-2 14:00,2975,,20
7654,MARTIN,SALESMAN,7698,1981-9-28 13:00,1250,1400,30
7698,BLAKE,MANAGER,7839,1981-5-1 13:00,2850,,30
7782,CLARK,MANAGER,7839,1981-6-9 13:00,2450,,10
7788,SCOTT,ANALYST,7566,1982-12-9 14:00,3000,,20
7839,KING,PRESIDENT,,1981-11-17 14:00,5000,,10
7844,TURNER,SALESMAN,7698,1981-9-8 13:00,1500,0,30
7876,ADAMS,CLERK,7788,1983-1-12 14:00,1100,,20
7900,JAMES,CLERK,7698,1981-12-3 14:00,950,,30
7902,FORD,ANALYST,7566,1981-12-3 14:00,3000,,20
7934,MILLER,CLERK,7782,1982-1-23 14:00,1300,,10
Create the table (in Hive)
CREATE TABLE emp (
  empno decimal(4,0),
  ename string,
  job string,
  mgr decimal(4,0),
  hiredate string,
  sal decimal(7,2),
  comm decimal(7,2),
  deptno decimal(2,0)
)
row format delimited fields terminated by ','
stored as textfile;
Load the emp data
hive (bigdata_hive2)> load data local inpath '/home/hadoop/tmp/emp.txt' into table emp;
1. Load local data
load data local inpath '/home/hadoop/tmp/emp.txt' INTO TABLE emp;
2. Load HDFS data
[hadoop@bigdata13 tmp]$ hadoop fs -ls /data
[hadoop@bigdata13 tmp]$ hadoop fs -put ./emp.txt /data
[hadoop@bigdata13 tmp]$ hadoop fs -ls /data
Found 1 items
-rw-r--r-- 3 hadoop supergroup 799 2022-11-30 21:53 /data/emp.txt
load data inpath '/data/emp.txt' INTO TABLE emp;
1. What a table is:
data files on HDFS + metadata in the metastore
Note: load data inpath (from HDFS) moves the source file into the table directory rather than copying it, so /data/emp.txt is gone after the load.
You could also drop files into the table directory yourself with hadoop fs -mv xxx /table/ [not recommended for now]
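You can verify both halves yourself: ask Hive for the table's location, then list that directory. A quick check; the warehouse path below is the Hive default and may differ on your cluster:
hive (bigdata_hive2)> desc formatted emp;    -- the Location field shows the table's HDFS directory
[hadoop@bigdata13 tmp]$ hadoop fs -ls /user/hive/warehouse/bigdata_hive2.db/emp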
3. Overwrite table data
load data local inpath '/home/hadoop/tmp/emp.txt' OVERWRITE INTO TABLE emp;
Avoid UPDATE and DELETE in Hive; row-level changes are inefficient there.
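The idiomatic substitute for a row-level DELETE is to rewrite the table, keeping only the rows you want. A sketch (the deptno = 30 filter is just an example, not from the original notes):
-- simulate DELETE FROM emp2 WHERE deptno = 30 by rewriting the table
insert overwrite table emp2
select * from emp2 where deptno <> 30;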
emp2 (same structure as emp):
// Insert emp's rows into emp2
insert into table emp2
select * from emp;
// Overwrite existing data, inserting only the rows of emp with deptno = 10
insert overwrite table emp2
select * from emp where deptno=10;
TRUNCATE [TABLE] table_name
truncate table emp2;
select
sal
from emp
order by sal desc;
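Note that order by in Hive is a global sort and pushes all rows through a single reducer. On large tables a common alternative (a sketch, not part of the original notes) is distribute by + sort by, which sorts within each reducer:
-- rows with the same deptno go to the same reducer;
-- each reducer's output is sorted by sal descending (no global order)
select sal, deptno
from emp
distribute by deptno
sort by sal desc;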
create table a(id int ,name string) row format delimited fields terminated by ',' ;
create table b(id int ,name string) row format delimited fields terminated by ',' ;
load data local inpath "/home/hadoop/tmp/a.txt" into table a;
load data local inpath "/home/hadoop/tmp/b.txt" into table b;
select name from a
union all
select name from b;
select name from a
union all
select name from b
union all
select "lisi" as name ;
select name,"1" as pk from a
union all
select name,"2" as pk from b
union all
select "lisi" as name,"3" as id ;
select
empno,
ename,
job,
mgr,
hiredate,
sal,
nvl(comm,0) as comm_alias,
deptno
from emp;
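nvl takes exactly two arguments. coalesce is the more general form: it returns its first non-null argument, so coalesce(comm, 0) behaves like nvl(comm, 0):
select
empno,
ename,
coalesce(comm, 0) as comm_alias
from emp;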
select
sum(sal) as sal_sum,
max(sal) as sal_max,
min(sal) as sal_min,
avg(sal) as sal_avg,
count(1) as cnt
from emp ;
select
job,
sum(sal) as sal_sum,
max(sal) as sal_max,
min(sal) as sal_min,
avg(sal) as sal_avg,
count(1) as cnt
from emp
group by job
having sal_sum > 6000;
// Subquery: the equivalent written with the filter in an outer query
select
job,
sal_sum,
sal_max,
sal_min,
sal_avg,
cnt
from
(
select
job,
sum(sal) as sal_sum,
max(sal) as sal_max,
min(sal) as sal_min,
avg(sal) as sal_avg,
count(1) as cnt
from emp
group by job
) as a
where sal_sum > 6000;
select
a.*,
b.*
from
(
select
*
from a
) as a join
(
select
*
from b
) as b
on a.id = b.id;
select
a.*,
b.*
from
(
select
*
from a
) as a left join
(
select
*
from b
) as b
on a.id = b.id;
select
a.*,
b.*
from
(
select
*
from a
) as a right join
(
select
*
from b
) as b
on a.id = b.id;
select
a.*,
b.*
from
(
select
*
from a
) as a full join
(
select
*
from b
) as b
on a.id = b.id;
Example: table emp
Dimensions: department, hire year
Metric: headcount
where: none
select
<department>, <hire year>,
<headcount>
from emp
group by <department>, <hire year>;
"hire year" => needs ETL on hiredate:
'1980-12-17 00:00:00' => use a date-handling function, here date_format
select
deptno, date_format(hiredate,'yyyy') as hire_year,
count(1) as cnt
from emp
group by deptno, date_format(hiredate,'yyyy');
select
date_format(hiredate,'yyyy-MM') as ymonth,
count(1) as cnt
from emp
group by date_format(hiredate,'yyyy-MM');
In theory, the row count after aggregation is <= the row count before aggregation.
Window functions: window + function
Window: the range of rows the function computes over at run time
Function: the function that runs over that window
Syntax:
function over([partition by xxx,...] [order by xxx,...])
over: opens the window over the table
partition by: the table columns to partition (group) by
order by: the table columns to sort by within each partition
Function: a windowing function or an aggregate function
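What partition by and order by each contribute is easiest to see side by side. A sketch against emp (not part of the original notes):
select
ename, deptno, sal,
sum(sal) over()                                 as total_all,    -- one window: the whole table
sum(sal) over(partition by deptno)              as total_dept,   -- one window per department
sum(sal) over(partition by deptno order by sal) as running_dept  -- running total within each department
from emp;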
Use case: show the detail rows and an aggregated/derived value at the same time.
e.g.:
id name sal
1  zs   30k
2  ls   25k
3  ww   20k
Requirement: order by salary descending and also show each row's rank
id name sal  rank
1  zs   30k  1
2  ls   25k  2
3  ww   20k  3
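A window function produces this in one pass. A sketch, assuming the three rows above live in a table t(id, name, sal):
-- rank() over(order by sal desc): rows with equal sal would share a rank
select
id, name, sal,
rank() over(order by sal desc) as sal_rank
from t;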
Data:
// mt.txt
haige,2022-11-10,1
haige,2022-11-11,5
haige,2022-11-12,7
haige,2022-11-13,3
haige,2022-11-14,2
haige,2022-11-15,4
haige,2022-11-16,4
甜甜,2022-11-10,1
甜甜,2022-11-11,5
甜甜,2022-11-12,7
甜甜,2022-11-13,3
甜甜,2022-11-14,2
甜甜,2022-11-15,4
甜甜,2022-11-16,4
Create the table
create table user_mt (
name string,
dt string,
cnt int
)
row format delimited fields terminated by ',' ;
load data local inpath '/home/hadoop/tmp/mt.txt' overwrite into table user_mt;
select
name ,
dt ,
cnt ,
sum(cnt) over(partition by name order by dt ) as sum_cnt
from user_mt;
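Because the over() clause contains an order by, Hive applies a default window frame, so sum(cnt) is a per-name running total. Writing the frame out explicitly gives the same result here (dt has no duplicates within a name):
select
name,
dt,
cnt,
sum(cnt) over(partition by name order by dt rows between unbounded preceding and current row) as sum_cnt
from user_mt;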
beeline => connects to Hive (HiveServer2) => which reads/writes HDFS
First, configure HDFS proxy-user settings:
Path: /home/hadoop/app/hadoop/etc/hadoop
core-site.xml:
<property>
  <name>hadoop.proxyuser.hadoop.hosts</name>
  <value>*</value>
</property>
<property>
  <name>hadoop.proxyuser.hadoop.groups</name>
  <value>*</value>
</property>
Distribute it to every node: [hadoop@bigdata13 hadoop]$ xsync core-site.xml
Restart HDFS
[hadoop@bigdata13 hadoop]$ hadoop-cluster stop
[hadoop@bigdata13 hadoop]$ hadoop-cluster start
Start HiveServer2
Path: /home/hadoop/app/hive/bin/
[hadoop@bigdata13 bin]$ ./hiveserver2
beeline> !connect jdbc:hive2://localhost:10000 hadoop
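The same connection can also be opened non-interactively by passing the JDBC URL and user name on the beeline command line:
[hadoop@bigdata13 bin]$ beeline -u jdbc:hive2://localhost:10000 -n hadoop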