Hive建表语句

Hive建内部表

-- Create an internal table stored as tab-delimited text files.
CREATE TABLE IF NOT EXISTS student (
    id   INT,
    name STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/home/hadoop/hive/warehouse/student';

-- Inspect the table's type and print the DDL Hive recorded for it.
DESCRIBE FORMATTED student;
SHOW CREATE TABLE student;

建表语句分析

-- Annotated CREATE TABLE example: an external table using Hive's complex column types.
--   * IF NOT EXISTS is optional: if the table already exists the statement is ignored.
--   * The parenthesised list declares every column with its data type.
--   * COMMENT is optional.
--   * FIELDS TERMINATED BY            : delimiter between columns.
--   * COLLECTION ITEMS TERMINATED BY  : delimiter between collection (array/struct/map) items.
--   * MAP KEYS TERMINATED BY          : delimiter between a map key and its value.
--   * STORED AS                       : file storage format.
-- HiveQL only understands "--" comments, so the annotations live here instead of
-- inside the statement.
-- NOTE(review): the element types of the complex columns were missing in the
-- original notes and have been reconstructed - confirm them against the data file.
create external table if not exists employee_external (
    name         string,
    work_place   array<string>,
    sex_age      struct<sex:string, age:int>,
    skills_score map<string, int>,
    depart_title map<string, array<string>>
)
comment 'This is an external table'
row format delimited
fields terminated by '|'
collection items terminated by ','
map keys terminated by ':'
stored as textfile
location '/home/hadoop/hive/warehouse/employee';

Hive建表 - 分隔符

Hive中默认分隔符
字段:^A(\001)
集合:^B(\002)
映射:^C(\003)
Hive中建表时可以指定分隔符
-- Clause fragment (used inside CREATE TABLE): sets '|' as the field delimiter.
row format delimited fields terminated by '|'

hive建表 - Storage SerDe

SerDe:序列化和反序列化
Hive 支持不同类型的Storage SerDe

LazySimpleSerDe: TEXTFILE
BinarySerializerDeserializer: SEQUENCEFILE
ColumnarSerDe: ORC, RCFILE
ParquetHiveSerDe: PARQUET
AvroSerDe: AVRO
OpenCSVSerDe: for CSV/TSV
JSONSerDe
RegExSerDe
HBaseSerDe  
例如
-- Hive table backed by the HBase table "test_serde" through the HBase storage handler.
-- hbase.columns.mapping has one entry per Hive column, in declaration order:
--   ":key"      -> id   (the HBase row key; the leading colon is required)
--   "info:name" -> name, "info:sex" -> sex, "info:age" -> age  (qualifiers in family "info")
create table test_serde_hb (
    id   string,
    name string,
    sex  string,
    age  string
)
row format serde 'org.apache.hadoop.hive.hbase.HBaseSerDe'
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties (
    "hbase.columns.mapping" = ":key,info:name,info:sex,info:age"
)
tblproperties ("hbase.table.name" = "test_serde");
例
-- Text table parsed by OpenCSVSerde with an explicit separator, quote and escape character.
-- Only two columns are declared; extend the column list as needed.
-- Each SerDe property must be a single character: an empty quoteChar is not valid,
-- so the single quote from the Hive documentation example is used here.
create table my_table (
    a string,
    b string
)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
with serdeproperties (
    "separatorChar" = "\t",
    "quoteChar"     = "'",
    "escapeChar"    = "\\"
)
stored as textfile;

创建临时表

临时表是应用程序自动管理在复杂查询期间生成的中间数据的方法
表只对当前session有效,session退出后自动删除
表空间位于 /tmp/hive-<user_name>
如果创建的临时表表名与已有的表同名,当前session中实际使用的是临时表

-- Three ways to create a temporary table (visible only to the current session).
-- 1) With an explicit column list:
create temporary table tmp_table_name1(c1 string);
-- 2) From a query; the trailing ".." is a placeholder for the SELECT to supply:
create temporary table tmp_table_name2 AS..
-- 3) From another table's definition; the trailing "..." is a placeholder for the source table name:
create temporary table tmp_table_name3 like...

Hive建表的高阶语句 - CTAS and WITH

CTAS - as select 方式建表
-- CTAS: create a table and fill it from a query result in a single statement.
CREATE TABLE ctas_employee AS
SELECT *
FROM employee;
CTAS不能创建partition,external,bucket table
CTE(ctas with common table expression)
-- CTAS combined with common table expressions (WITH).
--   r2: names of male employees.
--   r1: the rows of r2 whose name is 'henry'.
-- r2 is declared before r1 so that r1 only refers to a CTE that is already defined,
-- and the last CTE is not followed by a comma (a comma before SELECT is a syntax error).
create table cte_employee as
with
r2 as (select name from employee where sex_age.sex = 'male'),
r1 as (select name from r2 where name = 'henry')
select name from r1
union all
select name from r2;
LIKE
-- LIKE: create a new table from the definition of an existing one.
CREATE TABLE employee_like
LIKE employee;

表操作 - 删除/修改

删除
-- Drop the table. "[purge]" is syntax notation for an optional keyword:
-- write PURGE without the brackets, or leave it out.
drop table if exists employee [purge];
purge(可选)表示直接删除数据,否则数据会被移动到 .Trash 目录
truncate table employee;-- empty the table's data
修改(alter针对元数据)
-- ALTER statements below operate on table metadata.

-- Rename a table.
ALTER TABLE employee RENAME TO new_employee;

-- Set a table property (here: the table comment).
ALTER TABLE c_employee SET TBLPROPERTIES ('comment'='new name,comments');

-- Change the table's file format.
ALTER TABLE c_employee SET FILEFORMAT RCFILE;

-- Column operations --
-- Rename a column.
ALTER TABLE employee_internal CHANGE old_name new_name STRING;

-- Add a column.
ALTER TABLE c_employee ADD COLUMNS (works STRING);

-- Replace the column list.
ALTER TABLE c_employee REPLACE COLUMNS (name STRING);

装载数据:load

load用于在hive中移动数据
-- LOCAL: the source file lives on the local (Linux) filesystem and is copied into the table.
-- OVERWRITE: replaces the data already present in the target.
load data local inpath '/home/dayongd/Downloads/employee.txt' overwrite into table employee;
-- Same load, but into one specific partition of a partitioned table.
load data local inpath '/home/dayongd/Downloads/employee.txt' overwrite into table employee_partitioned partition (year=2013, month=12);
-- Without the LOCAL keyword (load data inpath ...) the path refers to HDFS,
-- and the file is moved into the table instead of being copied.
LOCAL:指定文件位于本地文件系统,执行后为拷贝数据
OVERWRITE:表示覆盖表中现有数据

Hive分区

分区主要用于提高性能

1:分区列的值将表划分为一个个的文件夹
2:查询时语法使用“分区”列和常规列类似
3:查询时Hive会只从指定分区查询数据,提高查询效率

分为静态分区和动态分区
创建分区表

-- Partitioned table: PARTITIONED BY declares the partition column (month).
-- HiveQL has no "//" comment syntax, so the explanation is kept outside the statement.
create table dept_partition (
    deptno int,
    dname  string,
    loc    string
)
partitioned by (month string)
row format delimited fields terminated by '\t';

静态分区操作

-- Add partitions: a single one, then two in one statement (space separated).
ALTER TABLE dept_partition ADD PARTITION (month='201906');
ALTER TABLE dept_partition ADD
    PARTITION (month='201905')
    PARTITION (month='201904');

-- Drop partitions: a single one, then two in one statement (comma separated).
ALTER TABLE dept_partition DROP PARTITION (month='201904');
ALTER TABLE dept_partition DROP
    PARTITION (month='201905'),
    PARTITION (month='201906');
分区表操作
-- List the partitions of the table.
SHOW PARTITIONS dept_partition;

-- Load a local file into one partition of the table.
LOAD DATA LOCAL INPATH '/opt/datas/dept.txt'
INTO TABLE dept_partition
PARTITION (month='201909');
可以创建多级分区
-- Two-level partitioned table: partitioned by month, then by day.
CREATE TABLE dept_partition2 (
    deptno INT,
    dname  STRING,
    loc    STRING
)
PARTITIONED BY (month STRING, day STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t';

-- Load a local file into one (month, day) partition.
LOAD DATA LOCAL INPATH '/opt/datas/dept.txt'
INTO TABLE dept_partition2
PARTITION (month='201909', day='13');

动态分区操作

使用动态分区需设定属性
-- Enable dynamic partitioning.
set hive.exec.dynamic.partition=true;
-- Switch the dynamic-partition mode to nonstrict (every partition column may be dynamic).
set hive.exec.dynamic.partition.mode=nonstrict;
动态分区建表语句和静态分区相同
动态分区插入数据
-- Dynamic-partition insert: PARTITION (year, month) names the partition columns
-- without values; the values come from the last two SELECT columns, in that order.
insert into table employee_partitioned partition (year, month)
select
    name,
    array('Toronto')                        as work_place,
    named_struct("sex", "male", "age", 30)  as sex_age,
    map("python", 90)                       as skills_score,
    map("r&d", array('developer'))          as depart_title,
    year(start_date)                        as year,
    month(start_date)                       as month
from employee_hr eh
where eh.employee_id = 102;

分桶

分桶对应于HDFS中的文件
1:更高效的查询处理
2:使抽样更高效
3:一般根据"桶列"的哈希函数将数据进行分桶
分桶只有动态分桶
-- Turn on bucketing enforcement for this session.
-- NOTE(review): newer Hive releases reportedly always enforce bucketing and no
-- longer need this property - confirm for the Hive version in use.
SET hive.enforce.bucketing = true;
定义分桶
-- Clause fragment (used inside CREATE TABLE): bucket on employee_id into 2 buckets.
CLUSTERED BY (employee_id) INTO 2 BUCKETS
分桶的列是表中已有的列;分桶数最好是2的n次方
必须使用insert方式加载数据
随机抽样基于整行数据
-- Random sampling over whole rows: rand() spreads rows over 32 sampling buckets; bucket 3 is returned.
SELECT * FROM table_name TABLESAMPLE(BUCKET 3 OUT OF 32 ON rand()) s;
随机抽样基于指定列(使用分桶列更高效)
-- Sampling on a column: rows are assigned to 32 sampling buckets by id; bucket 3 is returned.
SELECT *
FROM table_name TABLESAMPLE (BUCKET 3 OUT OF 32 ON id) s;

视图

视图概述
1:通过隐藏子查询,连接和函数来简化查询的逻辑结构
2:只保存定义,不存储数据
3:如果删除或更改基础表,则查询视图将失败
应用场景
1:将特定的列提供给用户,保护数据隐私
2:用于查询语句复杂的场景
视图操作命令
create 、show、drop、alter
-- Create a view; the defining query may use CTE, ORDER BY, LIMIT, JOIN, etc.
-- ("SELECT statement" is a placeholder for the defining query.)
CREATE VIEW view_name AS SELECT statement;
-- Find views (SHOW VIEWS is only available from Hive v2.2.0 onwards).
SHOW TABLES;
-- Show the view definition.
SHOW CREATE TABLE view_name;
-- Drop the view (the VIEW keyword is required).
DROP VIEW view_name;
-- Change view properties.
ALTER VIEW view_name SET TBLPROPERTIES ('comment' = 'This is a view');
-- Change the view definition.
ALTER VIEW view_name AS SELECT statement;
侧视图(Lateral View):与表生成函数(如 explode)结合使用,将函数的输入和输出连接
outer关键字:即使output为空也会生成结果
-- LATERAL VIEW OUTER: a result row is still produced when the table-generating function outputs nothing.
SELECT
    name,
    work_place,
    loc
FROM employee
LATERAL VIEW OUTER explode(split(null,',')) a AS loc;
支持多层级
-- Chained lateral views: explode the work_place array, then the skills_score map.
SELECT
    name,
    wps,
    skill,
    score
FROM employee
LATERAL VIEW explode(work_place) work_place_single AS wps
LATERAL VIEW explode(skills_score) sks AS skill, score;
通常用于规范化行或解析json

你可能感兴趣的:(Hadoop,hive,hadoop)