数据类型:
普通:tinyint smallint int bigint boolean float double string timestamp binary
集合:struct map array
建表:
create table employees(
name string,
salary float,
sub array<string>,
deductions map<string,float>,
address struct<street:string,city:string,state:string,zip:int>
)
hive默认分隔符:
\n 行分隔符(每行一条记录)
^A 分隔字段(列)
^B 分隔 array、struct 中的元素,以及 map 中的各个键值对
^C 分隔 map 中的 key 和 value
数据定义:
创建:
show databases;
create database test_database;
create database test location '/my/test/store' 修改默认位置
create database test comment 'zhushi' 添加描述信息
create database test with dbproperties('creator'='lijie','date'='2016-1-1')
详情:
describe database extended test; 显示详细信息
use test 使用那个库
drop database if exists test 删除database
drop database if exists test cascade 如果库里面有表直接删除 如果是restrict有表不能删除(默认)
修改:
alter database test set dbproperties ('creator'='lijie1')
查看表的详细信息:
describe extended test.table_name (也会显示分区表信息)
describe formatted test.table_name
外部表:关键字 external (外部表,hive不认为能完全拥有这个数据,所以删除该表并不会删除数据,只是删除描述表的元数据)
create external table if not exists table_name(
.....
)
复制表:(如果有external 表示复制表为外部表,若没有则和被复制的表一样)(外部表和管理表)
create table if not exists copy_table like table_name location '/path/data'
分区(如果表中的数据和分区个数非常大,执行一个包含所有分区的查询可能会触发一个巨大的mapreduce,
1.建议使用strict模式(set hive.mapred.mode=strict),这样查询没有对分区字段加where过滤的话不让提交
2.设置成nonstrict(set hive.mapred.mode=nonstrict),可以提交
可以通过 show partitions table_name; 查看所有分区
show partitions table_name partition (country = 'US'); 查看部分分区
)
create external table if not exists log_message(
hms int,
severity string,
server string,
process_id int,
message string
)
partitioned by (year int,month int,day int)
row format delimited fields terminated by '\t';
管理表中用户通过载入数据创建分区:
load data local inpath '${env:HOME}/california-employees'
into table employees
partition (country = 'US',state='CA');
自定义存储格式
stored as textfile;
删除表:
drop table if exists table_name;
表重命名
alter table log_message rename to logmsg;
修改添加删除表分区
alter table log_message add if not exists
partition (year = 2016,month=1,day=1) location '/logs/2016/1/1'
partition (year = 2016,month=1,day=2) location '/logs/2016/1/2'
partition (year = 2016,month=1,day=3) location '/logs/2016/1/3'
修改分区路径
alter table log_message partition (year = 2016,month=1,day=3) set location 's3n://ourbucket/logs/2016/01/01'
删除分区
alter table log_message drop if exists partition (year = 2016,month=1,day=3)
修改列
增加列
删除替换列
修改表属性
修改存储属性
数据操作
装载数据 inpath下不能包含任何文件夹
load data local inpath '${env:HOME}/california-employees'
overwrite into table employees
partition (country='US',state='CA')
select 插入
insert into table employees
partition(country='US',state='OR')
select * from old_table ot
where ot.cnty='US' and ot.st='OR' 扫描多次
----------------------
from old_table ot
insert overwrite table employees
partition(country='US',state='OR')
select * where ot.cnty='US' and ot.st='OR'
insert overwrite table employees
partition(country='US',state='ORR')
select * where ot.cnty='US' and ot.st='ORR'
insert overwrite table employees
partition(country='US',state='ORRRR')
select * where ot.cnty='US' and ot.st='ORRRR' 扫描表一次
动态分区插入 (根据位置)
insert overwrite table employees
partition (country,state)
select ...,ot.cnty,ot.st
from old_table ot;
(动态静态结合)
insert overwrite table employees
partition (country='US',state)
select ...,ot.cnty,ot.st
from old_table ot
where ot.cnty='US';
导出数据:
insert overwrite local directory '/temp/employees'
select name,salary,address from employees where state = 'CA';
多个文件输出
from old_table ot
insert overwrite directory '/tmp/california-employees'
select * where ot.country='US' and ot.st='ca'
insert overwrite directory '/tmp/ccalifornia-employees'
select * where ot.country='US' and ot.st='cca'
insert overwrite directory '/tmp/cccalifornia-employees'
select * where ot.country='US' and ot.st='ccca'