-- Create an internal (managed) table.
CREATE TABLE IF NOT EXISTS student (
    id   INT,
    name STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/home/hadoop/hive/warehouse/student';
-- Look up the table type together with the rest of the table metadata.
DESCRIBE FORMATTED student;
-- Print the DDL statement that defines the table.
SHOW CREATE TABLE student;
-- Create an external table over delimited text files.
-- IF NOT EXISTS is optional: when the table already exists the statement is skipped.
-- The column list declares every column with its data type. Complex types carry
-- their element types (they match the values inserted elsewhere in these notes).
-- COMMENT is optional.
-- Delimiters: '|' between fields, ',' between collection items and map entries,
-- ':' between a map key and its value.
-- STORED AS names the file storage format.
CREATE EXTERNAL TABLE IF NOT EXISTS employee_external (
    name         STRING,
    work_place   ARRAY<STRING>,
    sex_age      STRUCT<sex:STRING, age:INT>,
    skills_score MAP<STRING, INT>,
    depart_title MAP<STRING, ARRAY<STRING>>
)
COMMENT 'This is an external table'
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '|'
    COLLECTION ITEMS TERMINATED BY ','
    MAP KEYS TERMINATED BY ':'
STORED AS TEXTFILE
LOCATION '/home/hadoop/hive/warehouse/employee';
Hive中默认分隔符
字段:^A(\001)
集合:^B(\002)
映射:^C(\003)
Hive中建表时可以指定分隔符
row format delimited fields terminated by '|'
SerDe:序列化和反序列化
Hive 支持不同类型的Storage SerDe
LazySimpleSerDe: TEXTFILE
BinarySerializerDeserializer:
SEQUENCEFILE
ColumnarSerDe: ORC, RCFILE
ParquetHiveSerDe: PARQUET
AvroSerDe: AVRO
OpenCSVSerDe: for CSV/TSV
JSONSerDe
RegExSerDe
HBaseSerDe
例如
-- Example: a Hive table mapped onto the HBase table "test_serde" through the HBase SerDe.
-- hbase.columns.mapping has one entry per Hive column, in column order.
-- ":key" binds the first column (id) to the HBase row key, and the remaining
-- columns map to qualifiers in the "info" column family.
CREATE TABLE test_serde_hb (
    id   STRING,
    name STRING,
    sex  STRING,
    age  STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.hbase.HBaseSerDe'
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
    "hbase.columns.mapping" = ":key,info:name,info:sex,info:age"
)
TBLPROPERTIES ("hbase.table.name" = "test_serde");
例
-- Example: a tab-separated text table read through OpenCSVSerde.
-- Add further columns to the column list as needed.
-- separatorChar is the field separator, quoteChar the quoting character and
-- escapeChar the escape character. Each property must be a single character.
CREATE TABLE my_table (
    a STRING,
    b STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
    "separatorChar" = "\t",
    "quoteChar"     = "'",
    "escapeChar"    = "\\"
)
STORED AS TEXTFILE;
临时表是应用程序自动管理在复杂查询期间生成的中间数据的方法
表只对当前session有效,session退出后自动删除
表空间位于/tmp/hive-<user_name>
-- Three ways to create a temporary table.
-- 1. From an explicit column list.
CREATE TEMPORARY TABLE tmp_table_name1 (c1 STRING);
-- 2. From a query result (CTAS).
CREATE TEMPORARY TABLE tmp_table_name2 AS
SELECT c1
FROM tmp_table_name1;
-- 3. By copying the definition of an existing table.
CREATE TEMPORARY TABLE tmp_table_name3 LIKE tmp_table_name1;
CTAS - as select 方式建表
-- CTAS: create the table from the result of a query.
CREATE TABLE ctas_employee AS
SELECT *
FROM employee;
CTAS不能创建partition,external,bucket table
CTE(ctas with common table expression)
-- CTAS whose query is built from common table expressions.
-- r2 holds the names of male employees, r1 narrows r2 down to the name 'henry'.
-- r2 is listed first so that it is defined before r1 refers to it, and there is
-- no comma after the last CTE.
CREATE TABLE cte_employee AS
WITH
    r2 AS (
        SELECT name
        FROM employee
        WHERE sex_age.sex = 'male'
    ),
    r1 AS (
        SELECT name
        FROM r2
        WHERE name = 'henry'
    )
SELECT name FROM r1
UNION ALL
SELECT name FROM r2;
LIKE
-- Create employee_like with the same definition as employee.
CREATE TABLE employee_like
LIKE employee;
删除
-- Drop the table when it exists. Appending the optional PURGE keyword
-- (DROP TABLE IF EXISTS employee PURGE) deletes the data right away
-- instead of moving it to the trash directory.
DROP TABLE IF EXISTS employee;
purge直接删除(可选),否则会放到.Trash目录
-- Empty the table data.
TRUNCATE TABLE employee;
修改(alter针对元数据)
-- Rename the table.
ALTER TABLE employee
    RENAME TO new_employee;
-- Set a table property (here the table comment).
ALTER TABLE c_employee
    SET TBLPROPERTIES ('comment'='new name,comments');
-- Change the file format recorded for the table.
ALTER TABLE c_employee
    SET FILEFORMAT RCFILE;
-- Column-level changes.
-- Rename a column.
ALTER TABLE employee_internal
    CHANGE old_name new_name STRING;
-- Add a column.
ALTER TABLE c_employee
    ADD COLUMNS (works STRING);
-- Replace the column list.
ALTER TABLE c_employee
    REPLACE COLUMNS (name STRING);
load用于在hive中移动数据
-- With the LOCAL keyword the source file lives on the local Linux filesystem
-- and the load copies it.
-- Without LOCAL the path refers to HDFS and the load moves the file.
LOAD DATA LOCAL INPATH '/home/dayongd/Downloads/employee.txt'
OVERWRITE INTO TABLE employee;
-- Same local file, loaded into one specific partition.
LOAD DATA LOCAL INPATH '/home/dayongd/Downloads/employee.txt'
OVERWRITE INTO TABLE employee_partitioned
PARTITION (year=2013, month=12);
LOCAL:指定文件位于本地文件系统,执行后为拷贝数据
OVERWRITE:表示覆盖表中现有数据
分区主要用于提高性能
1:分区列的值将表划分为一个个的文件夹
2:查询时语法使用“分区”列和常规列类似
3:查询时Hive会只从指定分区查询数据,提高查询效率
分为静态分区和动态分区
创建分区表
-- Partitioned table. The PARTITIONED BY clause defines the partition column.
CREATE TABLE dept_partition (
    deptno INT,
    dname  STRING,
    loc    STRING
)
PARTITIONED BY (month STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t';
静态分区操作
-- Add partitions: a single one, then two in one statement.
ALTER TABLE dept_partition
    ADD PARTITION (month='201906');
ALTER TABLE dept_partition
    ADD PARTITION (month='201905')
        PARTITION (month='201904');
-- Drop partitions: a single one, then two in one statement.
ALTER TABLE dept_partition
    DROP PARTITION (month='201904');
ALTER TABLE dept_partition
    DROP PARTITION (month='201905'),
         PARTITION (month='201906');
分区表操作
-- List the partitions of the partitioned table.
SHOW PARTITIONS dept_partition;
-- Load a local file into one partition of the table.
LOAD DATA LOCAL INPATH '/opt/datas/dept.txt'
INTO TABLE dept_partition
PARTITION (month='201909');
可以创建多级分区
-- Create a table with two partition levels (month, then day).
CREATE TABLE dept_partition2 (
    deptno INT,
    dname  STRING,
    loc    STRING
)
PARTITIONED BY (month STRING, day STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t';
-- Load a local file into one two-level partition.
LOAD DATA LOCAL INPATH '/opt/datas/dept.txt'
INTO TABLE dept_partition2
PARTITION (month='201909', day='13');
动态分区操作
使用动态分区需设定属性
-- Enable dynamic partitioning for the session.
SET hive.exec.dynamic.partition=true;
-- Set the partition mode to nonstrict.
SET hive.exec.dynamic.partition.mode=nonstrict;
动态分区建表语句和静态分区相同
动态分区插入数据
-- Add data to dynamic partitions with INSERT.
-- The partition columns (year, month) carry no literal values here. They are
-- taken from the last two columns of the SELECT list.
INSERT INTO TABLE employee_partitioned PARTITION (year, month)
SELECT
    name,
    array('Toronto') AS work_place,
    named_struct("sex","male","age",30) AS sex_age,
    map("python",90) AS skills_score,
    map("r&d", array('developer')) AS depart_title,
    year(start_date) AS year,
    month(start_date) AS month
FROM employee_hr eh
WHERE eh.employee_id = 102;
分桶
分桶对应于HDFS中的文件
1:更高效的查询处理
2:使抽样更高效
3:一般根据"桶列"的哈希函数将数据进行分桶
分桶只有动态分桶
-- Session setting: switch hive.enforce.bucketing on before inserting into a bucketed table.
SET hive.enforce.bucketing = true;
定义分桶
CLUSTERED BY (employee_id) INTO 2 BUCKETS
分桶的列是表中已有的列;分桶数最好是2的n次方
必须使用insert方式加载数据
随机抽样基于整行数据
-- Random sample over whole rows: bucket 3 out of 32, bucketed on rand().
-- table_name is a placeholder for the table to sample.
SELECT *
FROM table_name TABLESAMPLE(BUCKET 3 OUT OF 32 ON rand()) s;
随机抽样基于指定列(使用分桶列更高效)
-- Sample on a chosen column: bucket 3 out of 32, bucketed on id.
SELECT *
FROM table_name
    TABLESAMPLE(BUCKET 3 OUT OF 32 ON id) s;
视图
视图概述
1:通过隐藏子查询,连接和函数来简化查询的逻辑结构
2:只保存定义,不存储数据
3:如果删除或更改基础表,则查询视图将失败
应用场景
1:将特定的列提供给用户,保护数据隐私
2:用于查询语句复杂的场景
视图操作命令
create 、show、drop、alter
-- Create a view. The defining query may use CTE, ORDER BY, LIMIT, JOIN and so on.
CREATE VIEW view_name
AS SELECT statement;
-- Find views (SHOW VIEWS is available after Hive v2.2.0).
SHOW TABLES;
-- Show the view definition.
SHOW CREATE TABLE view_name;
-- Drop a view. The VIEW keyword is required.
DROP VIEW view_name;
-- Change a view property.
ALTER VIEW view_name
    SET TBLPROPERTIES ('comment' = 'This is a view');
-- Change the view definition.
ALTER VIEW view_name
AS SELECT statement;
与表生成函数结合使用,将函数的输入和输出连接
outer关键字:即使output为空也会生成结果
-- LATERAL VIEW OUTER still produces result rows when the table function
-- (here explode over a NULL split) outputs nothing.
SELECT
    name,
    work_place,
    loc
FROM employee
LATERAL VIEW OUTER explode(split(null,',')) a AS loc;
支持多层级
-- Two chained lateral views: one explodes the work_place array,
-- the other explodes the skills_score map into key and value columns.
SELECT
    name,
    wps,
    skill,
    score
FROM employee
LATERAL VIEW explode(work_place) work_place_single AS wps
LATERAL VIEW explode(skills_score) sks AS skill, score;
通常用于规范化行或解析json