1.建库语句:
CREATE DATABASE [IF NOT EXISTS] database_name
[COMMENT database_comment]
[LOCATION hdfs_path]
[WITH DBPROPERTIES (property_name=property_value, ...)];
例:
-- Example: create the database only when it does not exist yet,
-- attaching a comment and an explicit HDFS location.
create DATABASE if NOT EXISTS hive_db2
comment "my first database"
location "/hive_db2";
2.库的修改:
alter database hive_db2 set DBPROPERTIES ("createtime"="2018-12-19");
3.库的删除
-- IF EXISTS must come right after DROP DATABASE; CASCADE goes last and
-- also drops the tables contained in the database.
drop database if exists db_hive cascade;
3.建表语句:
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] table_name
[(col_name data_type [COMMENT col_comment], ...)]
[COMMENT table_comment]
[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
[CLUSTERED BY (col_name, col_name, ...)
[SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
[ROW FORMAT row_format]
DELIMITED [FIELDS TERMINATED BY char] [COLLECTION ITEMS TERMINATED BY char]
[MAP KEYS TERMINATED BY char] [LINES TERMINATED BY char]
| SERDE serde_name [WITH SERDEPROPERTIES (property_name=property_value, property_name=property_value, ...)]
[STORED AS file_format] Textfile
[LOCATION hdfs_path]
[TBLPROPERTIES (property_name=property_value, ...)]
[AS select_statement]
例1:
create table student2(id int COMMENT "xuehao", name string COMMENT "mingzi")
COMMENT "xueshengbiao"
ROW format delimited
fields terminated by '\t'
STORED as Textfile
location '/student2'
TBLPROPERTIES ("createtime"="2018-12-18");
例2:
create table student(id int, name string)
row format delimited
fields terminated by '\t';
load data local inpath '/opt/module/datas/student.txt' into table student;
例3:
create table student4 like student2;
4.导入数据语句
4.1 不加local则导入HDFS上的文件,且会移动(剪切)原文件;加local则从本地文件系统复制文件,原文件保留
load data [local] inpath '/opt/module/datas/student.txt'
[overwrite] into table student [partition (partcol1=val1,…)];
4.2 创建表并导入数据(依据已存在的表)
create table student6 as select * from student;
4.3 覆盖插入
insert overwrite table student3 select * from student;
4.4 插入带分区的表
insert into table stu_par partition(month = '08') select id ,name from stu_par where month = '09';
4.5 将单表中数据导入多表
from student
insert into table student4 select *
insert into table student5 select *;
4.6 多分区导入单表
from stu_par
insert into table stu_par partition(month = '06')
select id ,name where month = '08'
insert into table stu_par partition(month = '07')
select id,name where month = '10';
5.表的修改操作
5.1 修改表的属性
alter table stu_ex set TBLPROPERTIES ('EXTERNAL' = 'TRUE');
5.2 重命名表名
alter table student4 rename to student3;
5.3 修改表的serde属性(序列化和反序列化)
-- The keyword is SERDEPROPERTIES; here it sets the field delimiter used by the SerDe to a tab.
alter table table_name set serdeproperties('field.delim'='\t');
6.列的更新操作
6.1 修改列语法
ALTER TABLE table_name CHANGE [COLUMN] col_old_name col_new_name column_type [COMMENT col_comment] [FIRST|AFTER column_name]
6.2 增加或替换列语法
ALTER TABLE table_name ADD|REPLACE COLUMNS (col_name data_type [COMMENT col_comment], ...)
例1:
增加列:alter table student2 add COLUMNS (score double);
例2:
修改列:alter table student2 CHANGE COLUMN score score int AFTER id;
例3:
替换列(全部替换):alter table student2 replace COLUMNS (id int, name string);
7.带有分区的表
7.0 查看分区
show partitions table_name;
7.1 创建单个分区
create table stu_par(id int, name string)
partitioned by (month string)
ROW format delimited
FIELDS terminated by '\t';
create table stu_par2(id int, name string)
partitioned by (id int)
ROW format delimited
FIELDS terminated by '\t'; 错!!!!(分区字段不能与表中已定义的字段重名)
load data local inpath '/opt/module/datas/student.txt' into table stu_par partition(month = '12');
load data local inpath '/opt/module/datas/student.txt' into table stu_par partition(month = '11');
select * from stu_par where month = '11'
union
select * from stu_par where month = '12';
7.2 增加多个分区
alter table stu_par add partition (month = '08') partition(month='07');
7.3 删除多个分区
alter table stu_par drop partition(month='08'),partition(month='09');
7.4 创建多级分区
create table stu_par2(id int, name string)
partitioned by (month string, day string)
row format delimited
FIELDS terminated by '\t';
7.5 导入数据到多级分区
load data local inpath '/opt/module/datas/student.txt' into table stu_par2
partition (month='12',day='19');
7.6 向多级分区增加分区
alter table stu_par2 add partition(month = '12', day = '17');
7.7 查询多级分区中的数据
select * from stu_par2 where day = '18';
7.8 修复分区(也可以使用添加分区的语句)
msck repair table dept_partition2;
8.创建外部表(删除表不会删除表中数据,仅删除表的元数据)
create external table stu_ex2(id int, name string)
ROW format delimited
FIELDS terminated by '\t'
location '/student';
8.1 外部表与内部表的转换
alter table stu_ex set TBLPROPERTIES ('EXTERNAL' = 'TRUE');
9.数据的导出
9.1 导出同时格式化(不加local则导出到hdfs)
insert overwrite local directory '/opt/module/datas/student'
row format delimited
fields terminated by '\t'
select * from student;
9.2 hadoop命令导出到本地
dfs -get /user/hive/warehouse/student/month=201709/000000_0 /opt/module/datas/export/student3.txt;
9.3 shell命令导出
hive -f/-e 执行语句或者脚本 > file
9.4 export仅可以导出到hdfs,常用于hdfs集群间hive表的迁移
export table default.student to '/user/hive/warehouse/export/student';
10.数据的导入(仅能导入export导出的数据,因为需要获取表的元数据)
import table table_name from 'export导出数据的路径';
11.清除表中数据
truncate table student;
12.Like、RLike:RLike可以使用java的正则表达式
13.group by及having的使用
select deptno, avg(sal) avg_sal from emp group by deptno having avg_sal > 2000;
14.mapreduce的join
14.1 mapreduce中的reducejoin特点:在mapper阶段进行数据关联标记,在reducer阶段进行数据聚合
14.2 mapreduce中的mapjoin特点:将小表加载到内存中,在mapper阶段根据内存中的数据对大表进行数据处理,没有reduce阶段
15.HQL的join
15.1 仅支持等值连接不支持非等值连接
例:不支持select * from A left join B on A.id != B.id;
15.2 不支持在on条件中使用‘or’
15.3 每个join都会启动一个mapreduce任务,但hive默认开启mapreduce优化
关闭mapreduce优化:set hive.auto.convert.join=false;
16.order by
会进行全局排序,则reduce数量被看作1个,效率低下
17.sort by
对于每个mapreduce各分区进行局部排序,分区中的数据随机给定
18.distribute by
18.1 即mapreduce中自定义分区操作,hql书写规则:先分区后排序
18.2 distribute by的分区规则是根据分区字段的hash码与reduce的个数进行模除后,余数相同的分到一个区。
19.cluster by
当distribute和sort字段相同时可用cluster进行替代,默认正序,但不支持desc倒序
20.分桶
20.1 开启分桶的设置
set hive.enforce.bucketing=true;
20.2 分桶表的创建
create table stu_buck(id int, name string)
clustered by(id)
into 4 buckets
row format delimited fields terminated by '\t';
20.3 分桶的规则
用分桶字段的hash值与桶的个数求余,来决定数据存放在哪个桶。
20.4 分桶与分区区别
a. 分桶结果在表的目录下存在多个分桶文件
b. 分区是将数据存放在表所在目录不同文件路径下
c. 分区针对是数据存储路径,分桶针对的是数据文件,分桶可以在分区的基础粒度细化
21.分桶的抽样
21.1 抽样语法
select * from table_name tablesample(bucket x out of y on bucketKey);
21.2 抽样规则
a. y用来决定抽样比例,必须为bucket数的倍数或者因子,
例:bucket数为4时,当y=2时,4/2=2,则抽取两个桶的数据,具体抽取哪个桶由x决定
b. x用来决定抽取哪个桶中的数据
例1:当bucket=4, y=4, x=2时,则需要抽取的数据量为bucket/y=1个桶,抽取第x桶的数据
例2:当bucket=4, y=2, x=2时,则需要抽取的数据量为bucket/y=2个桶,抽取第x桶和第x+y桶的数据
例3:当bucket=12, y=3, x=2时,抽bucket/y=4个桶,抽取第x、x+y、x+2y、x+3y桶(即第2、5、8、11桶)的数据
22.NVL函数
NVL(column_name, default_value),如果该行字段值为null,则返回default_value的值
23.CONCAT_WS()函数
使用规则:concat_ws(separator, [string | array(string)]+)
例:select concat_ws('_', 'www', array('achong','com')) 拼接结果:www_achong_com
24.COLLECT_SET(col)函数
使用规则:仅接受基本数据类型,将字段去重汇总,并返回array类型
例(行转列):表结构
name xingzuo blood
孙悟空 白羊座 A
大海 射手座 A
宋宋 白羊座 B
猪八戒 白羊座 A
凤姐 射手座 A
需求:把星座和血型一样的人归类到一起
射手座,A 大海|凤姐
白羊座,A 孙悟空|猪八戒
白羊座,B 宋宋
查询语句:
-- Group people by (constellation, blood type) and join the de-duplicated
-- names of each group with '|'.
SELECT CONCAT_WS(',', xingzuo, blood), CONCAT_WS('|', COLLECT_SET(NAME))
FROM xingzuo
GROUP BY xingzuo, blood;
25.EXPLODE(爆炸函数)及LATERAL VIEW(侧写函数)
25.1 explode:将列中的array或者map结构拆分成多行
25.2 lateral_view: LATERAL VIEW udtf(expression) 表别名 AS 列别名
例(列转行)
select movie, category_name
from movie_info
lateral view explode(category) table_tmp as category_name;
26.开窗函数
26.1 语法:UDAF() over (PARTITION By col1,col2 order by col3 窗口子句(rows between .. and ..)) AS 列别名
(partition by .. order by)可替换为(distribute by .. sort by ..)
26.2 over(): 指定分析数据窗口大小
26.3 窗口子句
26.3.01 n PRECEDING:往前n行数据
26.3.02 n FOLLOWING:往后n行数据
例:select name, orderdate, cost, sum(cost) over(
partition by name
order by orderdate
rows between 1 PRECEDING and 1 FOLLOWING
) from business;
26.3.03 CURRENT ROW:当前行
26.3.04 UNBOUNDED PRECEDING 表示从前面的起点
26.3.05 UNBOUNDED FOLLOWING表示到后面的终点
例:select name, orderdate, cost, sum(cost) over(
partition by name
order by orderdate
rows between CURRENT ROW and UNBOUNDED FOLLOWING
) from business;
27.LAG(col,n,default_val):往前第n行数据
28.LEAD(col,n, default_val):往后第n行数据
例:select name, orderdate, cost, lag(orderdate, 1, 'null') over(partition by name order by orderdate)
from business;
29.ntile(n):把有序分区中的行分为n组,每组编号从1开始
例:select name,orderdate,cost, ntile(5) over(order by orderdate) num from business
30.Rank函数
rank() 出现相同排序时名次重复,后续名次跳过,总数不变(如1,1,3)
dense_rank() 出现相同排序时名次重复,后续名次不跳过,总数减少(如1,1,2)
row_number() 不会出现相同排序,按顺序依次编号(如1,2,3)
sql执行顺序
from... where...group by... having.... select ... order by...
hql执行顺序
from … where … group by … having … select … order by … 或
from … on … join … where … group by … having … select … distinct … order by … limit
存在开窗函数时,起码在order by之前执行
例题1:
{
"name": "songsong",
"friends": ["bingbing" , "lili"] ,
"children": {
"xiao song": 18 ,
"xiaoxiao song": 19
},
"address": {
"street": "hui long guan" ,
"city": "beijing"
}
}
基于上述数据结构,我们在Hive里创建对应的表,并导入数据。
1.1 格式化数据为:
songsong,bingbing_lili,xiao song:18_xiaoxiao song:19,hui long guan_beijing
yangyang,caicai_susu,xiao yang:18_xiaoxiao yang:19,chao yang_beijing
1.2 建表语句:
create table test(name string,
friends array<string>,
children map<string, int>,
address struct<street:string, city:string>)
row format delimited
fields terminated by ','
collection items terminated by '_'
map keys terminated by ':';
1.3 数据写入语句
load data local inpath '/opt/module/datas/test.txt' into table test;
1.4 查询语句
select friends[0] friend,children['xiao song'] age,address.city from test where name = 'songsong';