CREATE [EXTERNAL] TABLE [IF NOT EXISTS] table_name
[(col_name data_type [COMMENT col_comment], ...)]
[COMMENT table_comment]
[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
[CLUSTERED BY (col_name, col_name, ...)
[SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
[ROW FORMAT row_format]
[STORED AS file_format]
[LOCATION hdfs_path]
CREATE TABLE
创建一个指定名字的表。如果相同名字的表已经存在,则抛出异常;用户可以用 IF NOT EXIST
选项来忽略这个异常
EXTERNAL
关键字可以让用户创建一个外部表,在建表的同时指定一个指向实际数据的路径(LOCATION
)
LIKE
允许用户复制现有的表结构,但是不复制数据
COMMENT
可以为表与字段增加描述
ROW FORMAT
DELIMITED [FIELDS TERMINATED BY char] [COLLECTION ITEMS TERMINATED BY char]
[MAP KEYS TERMINATED BY char] [LINES TERMINATED BY char]
| SERDE serde_name [WITH SERDEPROPERTIES (property_name=property_value, property_name=property_value, ...)]
用户在建表的时候可以自定义 SerDe 或者使用自带的 SerDe。如果没有指定 ROW FORMAT 或者 ROW FORMAT DELIMITED,将会使用自带的 SerDe。在建表的时候,用户还需要为表指定列,用户在指定表的列的同时也会指定自定义的 SerDe,Hive 通过 SerDe 确定表的具体的列的数据。
STORED AS
TEXTFILE
SEQUENCEFILE
RCFILE
ORCFILE(0.11以后出现)
PARQUET
1、其中TEXTFILE为默认格式,建表时不指定默认为这个格式,导入数据时会直接把数据文件拷贝到hdfs上不进行处理;
SEQUENCEFILE,RCFILE,ORCFILE,PARQUET格式的表不能直接从本地文件导入数据,数据要先导入到textfile格式的表中, 然后再从表中用insert导入SequenceFile,RCFile,ORCFile,PARQUET表中;或者用复制表结构及数据的方式(create table as select * from table )。
textfile
默认格式;
存储方式为行存储;
磁盘开销大 数据解析开销大;
但使用这种方式,hive不会对数据进行切分,从而无法对数据进行并行操作。sequencefile
二进制文件,以的形式序列化到文件中;
存储方式:行存储;
可分割 压缩;
一般选择block压缩;
优势是文件和Hadoop api中的mapfile是相互兼容的refile
存储方式:数据按行分块 每块按照列存储;
压缩快 快速列存取;
读记录尽量涉及到的block最少;
读取需要的列只需要读取每个row group 的头部定义;
读取全量数据的操作 >性能可能比sequencefile没有明显的优势,orcfile
存储方式:数据按行分块 每块按照列存储;
压缩快 快速列存取;
效率比rcfile高,是rcfile的改良版本。parquet
类似于orc,相对于orc文件格式,hadoop生态系统中大部>分工程都支持parquet文件。
hive> CREATE TABLE pokes (foo INT, bar STRING);
CREATE EXTERNAL TABLE page_view(
viewTime INT,
userid BIGINT,
page_url STRING,
referrer_url STRING,
ip STRING COMMENT 'IP Address of the User',
country STRING COMMENT 'country of origination')
COMMENT 'This is the staging page view table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\054'
STORED AS TEXTFILE
LOCATION '';
CREATE TABLE par_table(
viewTime INT,
userid BIGINT,
page_url STRING,
referrer_url STRING,
ip STRING COMMENT 'IP Address of the User')
COMMENT 'This is the page view table'
PARTITIONED BY(date STRING, pos STRING)
ROW FORMAT DELIMITED ‘\t’ FIELDS TERMINATED BY '\n'
STORED AS SEQUENCEFILE;
CREATE TABLE par_table(
viewTime INT,
userid BIGINT,
page_url STRING,
referrer_url STRING,
ip STRING COMMENT 'IP Address of the User')
COMMENT 'This is the page view table'
PARTITIONED BY(date STRING, pos STRING)
CLUSTERED BY(userid) SORTED BY(viewTime) INTO 32 BUCKETS
ROW FORMAT DELIMITED ‘\t’ FIELDS TERMINATED BY '\n'
STORED AS SEQUENCEFILE;
hive> CREATE TABLE invites (foo INT, bar STRING) PARTITIONED BY (ds STRING);
CREATE TABLE empty_key_value_store
LIKE key_value_store;
create table user_info (user_id int, cid string, ckid string, username string)
row format delimited
fields terminated by '\t'
lines terminated by '\n';
导入数据表的数据格式是:字段之间是tab键分割,行之间是断行。
及要我们的文件内容格式:
100636 100890 c5c86f4cddc15eb7 yyyvybtvt
100612 100865 97cc70d411c18b6f gyvcycy
100078 100087 ecd6026a15ffddf5 qa000100
hive> SHOW TABLES;
按正条件(正则表达式)显示表,
hive> SHOW TABLES '.*s';
hive> ALTER TABLE pokes ADD COLUMNS (new_col INT);
hive> ALTER TABLE invites ADD COLUMNS (new_col2 INT COMMENT 'a comment');
hive> ALTER TABLE events RENAME TO 3koobecaf;
hive> DROP TABLE pokes;
ALTER TABLE table_name ADD [IF NOT EXISTS] partition_spec [ LOCATION 'location1' ] partition_spec [ LOCATION 'location2' ] ...
partition_spec:
: PARTITION (partition_col = partition_col_value, partition_col = partiton_col_value, ...)
ALTER TABLE table_name DROP partition_spec, partition_spec,...
ALTER TABLE table_name CHANGE [COLUMN] col_old_name col_new_name
column_type [COMMENT col_comment] [FIRST|AFTER column_name]
# 这个命令可以允许改变列名、数据类型、注释、列位置或者它们的任意组合
ALTER TABLE table_name ADD|REPLACE COLUMNS (col_name data_type [COMMENT col_comment], ...)
ADD
是代表新增一字段,字段位置在所有列后面(partition列前)REPLACE
则是表示替换表中所有字段。ALTER TABLE table_name SET TBLPROPERTIES table_properties table_properties:
:[property_name = property_value…..]
# 用户可以用这个命令向表中增加metadata
ALTER TABLE table_name SET FILEFORMAT file_format
ALTER TABLE table_name CLUSTERED BY(userid) SORTED BY(viewTime) INTO num_buckets BUCKETS
# 这个命令修改了表的物理存储属性
CREATE VIEW [IF NOT EXISTS] view_name
[ (column_name [COMMENT column_comment], ...) ]
[COMMENT view_comment]
[TBLPROPERTIES (property_name = property_value, ...)]
AS SELECT
# 增加视图
# 如果没有提供表名,视图列的名字将由定义的SELECT表达式自动生成
# 如果修改基本表的属性,视图中不会体现,无效查询将会失败
# 视图是只读的,不能用LOAD/INSERT/ALTER
# DROP VIEW view_name
# 删除视图
CREATE DATABASE name
show tables;
show databases;
show partitions ;
show functions
describe extended table_name dot col_name
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE]
INTO TABLE tablename
PARTITION (partcol1=val1, partcol2=val2 ...)]
Load
操作只是单纯的复制/移动操作,将数据文件移动到 Hive 表对应的位置。filepath
hive> LOAD DATA LOCAL INPATH './examples/files/kv1.txt' OVERWRITE INTO TABLE pokes;
filepath
可以引用一个文件(这种情况下,Hive 会将文件移动到表所对应的目录中)或者是一个目录(在这种情况下,Hive 会将目录中的所有文件移动至表所对应的目录中)LOCAL
关键字
filepath
。如果发现是相对路径,则路径会被解释为相对于当前用户的当前路径。用户也可以为本地文件指定一个完整的 URI,比如:file:///user/hive/project/data1
load
命令会将 filepath
中的文件复制到目标文件系统中。目标文件系统由表的位置属性决定。被复制的数据文件移动到表的数据对应的位置hive> LOAD DATA LOCAL INPATH './examples/files/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-15');
hive> LOAD DATA INPATH '/user/myname/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-15');
The above command will load data from an HDFS file/directory to the table. Note that loading data from HDFS will result in moving the file/directory. As a result, the operation is almost instantaneous.
OVERWRITE
INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)]
select_statement1 FROM from_statement
FROM from_statement
INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)]
select_statement1
[INSERT OVERWRITE TABLE tablename2 [PARTITION ...]
select_statement2]...
#开启动态分区,默认是false
hive.exec.dynamici.partition=true;
#开启允许所有分区都是动态的,否则必须要有静态分区才能使用。
set hive.exec.dynamic.partition.mode=nonstrict;
----注意:因为dpartition表中只有两个字段,所以当我们查询了三个字段时(多了city字段),所以系统默认以最后一个字段city为分区名,因为分区表的分区字段默认也是该表中的字段,且依次排在表中字段的最后面。所以分区需要分区的字段只能放在后面,不能把顺序弄错。如果我们查询了四个字段的话,则会报错,因为该表加上分区字段也才三个。要注意系统是根据查询字段的位置推断分区名的,而不是字段名称。
insert overwrite table dpartition partition(ct) select id ,name,city frommytest_tmp2_p;
set hive.exec.dynamici.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table ds_parttion
partition(state='china',ct) #state分区为静态,ct为动态分区,以查询的city字段为分区名
select id ,city from mytest_tmp2_p;
set hive.exec.dynamici.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table ds_parttion partition(state,ct) select id ,country,city from mytest_tmp2_p;
# 注意:字段的个数和顺序不能弄错。
# 1
INSERT OVERWRITE [LOCAL] DIRECTORY directory1 SELECT ... FROM ...
# 2
FROM from_statement
INSERT OVERWRITE [LOCAL] DIRECTORY directory1 select_statement1
[INSERT OVERWRITE [LOCAL] DIRECTORY directory2 select_statement2]
# 3
INSERT INTO TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1 FROM from_statement
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_reference
[WHERE where_condition]
[GROUP BY col_list [HAVING condition]]
[ CLUSTER BY col_list
| [DISTRIBUTE BY col_list] [SORT BY| ORDER BY col_list]
]
[LIMIT number]
# Limit 可以限制查询的记录数
SELECT * FROM t1 LIMIT 5
# 实现Top k 查询
# 下面的查询语句查询销售记录最大的 5 个销售代表。
SET mapred.reduce.tasks = 1
SELECT * FROM test SORT BY amount DESC LIMIT 5
SELECT 语句可以使用正则表达式做列选择,下面的语句查询除了 ds 和 hr 之外的所有列:
SELECT `(ds|hr)?+.+` FROM test
hive> SELECT a.foo FROM invites a WHERE a.ds='';
hive> INSERT OVERWRITE DIRECTORY '/tmp/hdfs_out' SELECT a.* FROM invites a WHERE a.ds='';
hive> INSERT OVERWRITE LOCAL DIRECTORY '/tmp/local_out' SELECT a.* FROM pokes a;
hive> INSERT OVERWRITE TABLE events SELECT a.* FROM profiles a;
hive> INSERT OVERWRITE TABLE events SELECT a.* FROM profiles a WHERE a.key < 100;
hive> INSERT OVERWRITE LOCAL DIRECTORY '/tmp/reg_3' SELECT a.* FROM events a;
hive> INSERT OVERWRITE DIRECTORY '/tmp/reg_4' select a.invites, a.pokes FROM profiles a;
hive> INSERT OVERWRITE DIRECTORY '/tmp/reg_5' SELECT COUNT(1) FROM invites a WHERE a.ds='';
hive> INSERT OVERWRITE DIRECTORY '/tmp/reg_5' SELECT a.foo, a.bar FROM invites a;
hive> INSERT OVERWRITE LOCAL DIRECTORY '/tmp/sum' SELECT SUM(a.pc) FROM pc1 a;
hive> FROM invites a INSERT OVERWRITE TABLE events SELECT a.bar, count(1) WHERE a.foo > 0 GROUP BY a.bar;
hive> INSERT OVERWRITE TABLE events SELECT a.bar, count(1) FROM invites a WHERE a.foo > 0 GROUP BY a.bar;
JOIN
hive> FROM pokes t1 JOIN invites t2 ON (t1.bar = t2.bar) INSERT OVERWRITE TABLE events SELECT t1.bar, t1.foo, t2.foo;
FROM src
INSERT OVERWRITE TABLE dest1 SELECT src.* WHERE src.key < 100
INSERT OVERWRITE TABLE dest2 SELECT src.key, src.value WHERE src.key >= 100 and src.key < 200
INSERT OVERWRITE TABLE dest3 PARTITION(ds='2008-04-08', hr='12') SELECT src.key WHERE src.key >= 200 and src.key < 300
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/dest4.out' SELECT src.value WHERE src.key >= 300;
hive> FROM invites a INSERT OVERWRITE TABLE events SELECT TRANSFORM(a.foo, a.bar) AS (oof, rab) USING '/bin/cat' WHERE a.ds > '2008-08-09';
This streams the data in the map phase through the script /bin/cat (like hadoop streaming). Similarly - streaming can be used on the reduce side (please see the Hive Tutorial or examples)
# 语法
join_table:
table_reference JOIN table_factor [join_condition]
| table_reference {LEFT|RIGHT|FULL} [OUTER] JOIN table_reference join_condition
| table_reference LEFT SEMI JOIN table_reference join_condition
table_reference:
table_factor
| join_table
table_factor:
tbl_name [alias]
| table_subquery alias
| ( table_references )
join_condition:
ON equality_expression ( AND equality_expression )*
equality_expression:
expression = expression
Hive 只支持等值连接(equality joins)、外连接(outer joins)和(left semi joins)。Hive 不支持所有非等值的连接,因为非等值连接非常难转化到 map/reduce 任务
LEFT,RIGHT和FULL OUTER关键字用于处理join中空记录的情况
LEFT SEMI JOIN
是 IN/EXISTS
子查询的一种更高效的实现
join 时,每次 map/reduce 任务的逻辑是这样的:reducer 会缓存 join 序列中除了最后一个表的所有表的记录,再通过最后一个表将结果序列化到文件系统
实践中,应该把最大的那个表写在最后
join 查询时,需要注意几个关键点
只支持等值join
SELECT a.* FROM a JOIN b ON (a.id = b.id)
SELECT a.* FROM a JOIN b ON (a.id = b.id AND a.department = b.department)
可以 join 多于 2 个表,例如
SELECT a.val, b.val, c.val FROM a JOIN b ON (a.key = b.key1) JOIN c ON (c.key = b.key2)
如果join中多个表的 join key 是同一个,则 join 会被转化为单个 map/reduce 任务
LEFT,RIGHT和FULL OUTER
例子
SELECT a.val, b.val FROM a LEFT OUTER JOIN b ON (a.key=b.key)
如果你想限制 join 的输出,应该在 WHERE 子句中写过滤条件——或是在 join 子句中写
容易混淆的问题是表分区的情况
SELECT c.val, d.val
FROM c LEFT OUTER JOIN d ON (c.key=d.key)
WHERE c.ds='2010-07-07' AND d.ds='2010-07-07‘
# 如果 d 表中找不到对应 c 表的记录,d 表的所有列都会列出 NULL,包括 ds 列。也就是说,join 会过滤 d 表中不能找到匹配 c 表 join key 的所有记录。这样的话,LEFT OUTER 就使得查询结果与 WHERE 子句无关
# 解决办法
SELECT c.val, d.val FROM c LEFT OUTER JOIN d
ON (c.key=d.key AND d.ds='2009-07-07' AND c.ds='2009-07-07')
SELECT a.key, a.value
FROM a
WHERE a.key in
(SELECT b.key
FROM B);
# 可以被重写为:
SELECT a.key, a.val
FROM a LEFT SEMI JOIN b on (a.key = b.key)
select_statement UNION ALL select_statement UNION ALL select_statement ...
select * from dual a,dual b where a.key = b.key;
select * from dual a join dual b on a.key = b.key;
SELECT t1.a1 as c1, t2.b1 as c2FROM t1, t2 WHERE t1.a2 = t2.b2
select concat(key,concat(';',key)) from dual;
FAILED: Parse Error: line 0:-1 mismatched input '' expecting ) in function specification
select concat(key,concat('\073',key)) from dual
null
代表空值, 值得警惕的是, 在HiveQL中String类型的字段若是空(empty)字符串, 即长度为0, 那么对它进行IS NULL
的判断结果是False.
FROM (
MAP doctext USING 'python wc_mapper.py' AS (word, cnt)
FROM docs
CLUSTER BY word
) a
REDUCE word, cnt USING 'python wc_reduce.py';
--doctext:
是输入
--word, cnt:
是map程序的输出
--CLUSTER BY:
将wordhash后,又作为reduce程序的输入
FROM (
FROM session_table
SELECT sessionid, tstamp, data
DISTRIBUTE BY sessionid SORT BY tstamp
) a
REDUCE sessionid, tstamp, data USING 'session_reducer.sh';
–
DISTRIBUTE BY:
用于给reduce程序分配行数据