【hive内置数据类型】
Hive的内置数据类型可以分为两大类:(1)、基础数据类型;(2)、复杂数据类型。其中,基础数据类型包括:TINYINT,SMALLINT,INT,BIGINT,BOOLEAN,FLOAT,DOUBLE,STRING,BINARY,TIMESTAMP,DECIMAL,CHAR,VARCHAR,DATE。下面的表格列出这些基础类型所占的字节以及从什么版本开始支持这些类型。
数据类型 | 所占字节 | 开始支持版本 |
TINYINT | 1byte,-128 ~ 127 | |
SMALLINT | 2byte,-32,768 ~ 32,767 | |
INT | 4byte,-2,147,483,648 ~ 2,147,483,647 | |
BIGINT | 8byte,-9,223,372,036,854,775,808 ~ 9,223,372,036,854,775,807 | |
BOOLEAN | ||
FLOAT | 4byte单精度 | |
DOUBLE | 8byte双精度 | |
STRING | ||
BINARY | 从Hive0.8.0开始支持 | |
TIMESTAMP | 从Hive0.8.0开始支持 | |
DECIMAL | 从Hive0.11.0开始支持 | |
CHAR | 从Hive0.13.0开始支持 | |
VARCHAR | 从Hive0.12.0开始支持 | |
DATE | 从Hive0.12.0开始支持 |
复杂类型包括ARRAY,MAP,STRUCT,UNION,这些复杂类型是由基础类型组成的。
创建一个包含复杂类型的表格可以如下
【复合数据类型用法】
一、map、struct、array 这3种的用法:
1、Array的使用
2、Map 的使用
3、Struct 的使用
4、数据组合 (不支持组合的复杂数据类型)
二、hive中的一些不常见函数的用法:
1、array_contains (Collection Functions)
2、get_json_object (Misc. Functions)
3、parse_url_tuple
三、ref:
目前 hive 支持的复合数据类型有以下几种:
map
(key1, value1, key2, value2, ...) Creates a map with the given key/value pairs
struct
(val1, val2, val3, ...) Creates a struct with the given field values. Struct field names will be col1, col2, ...
named_struct
(name1, val1, name2, val2, ...) Creates a struct with the given field names and values. (as of Hive 0.8.0)
array
(val1, val2, ...) Creates an array with the given elements
create_union
(tag, val1, val2, ...) Creates a union type with the value that is being pointed to by the tag parameter
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
|
创建数据库表,以array作为数据类型
-- Create a table with an ARRAY-typed column.
-- NOTE: the angle-bracket generics and the closing paren were stripped by the
-- HTML extraction of the original post; restored here (ARRAY<STRING> matches
-- the sample data: comma-separated city names per person).
CREATE TABLE person(
    name           STRING,
    work_locations ARRAY<STRING>
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'            -- columns are tab-separated in person.txt
COLLECTION ITEMS TERMINATED BY ',';  -- array elements separated by commas
数据
biansutao beijing,shanghai,tianjin,hangzhou
linan changchu,chengdu,wuhan
入库数据
-- Load person.txt from the local filesystem into the person table,
-- overwriting any rows already present.
LOAD DATA LOCAL INPATH '/home/hadoop/person.txt'
OVERWRITE INTO TABLE person;
查询
hive>
select
*
from
person;
biansutao [
"beijing"
,
"shanghai"
,
"tianjin"
,
"hangzhou"
]
linan [
"changchu"
,
"chengdu"
,
"wuhan"
]
Time
taken: 0.355 seconds
hive>
select
name
from
person;
linan
biansutao
Time
taken: 12.397 seconds
hive>
select
work_locations[0]
from
person;
changchu
beijing
Time
taken: 13.214 seconds
hive>
select
work_locations
from
person;
[
"changchu"
,
"chengdu"
,
"wuhan"
]
[
"beijing"
,
"shanghai"
,
"tianjin"
,
"hangzhou"
]
Time
taken: 13.755 seconds
hive>
select
work_locations[3]
from
person;
NULL
hangzhou
Time
taken: 12.722 seconds
hive>
select
work_locations[4]
from
person;
NULL
NULL
Time
taken: 15.958 seconds
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
|
创建数据库表
-- Create a table with a MAP-typed column.
-- NOTE: the MAP's type parameters and the closing paren were stripped by the
-- HTML extraction; restored as MAP<STRING, INT> to match the sample data
-- (subject name -> numeric score, e.g. '数学':80).
CREATE TABLE score(
    name  STRING,
    score MAP<STRING, INT>
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'            -- columns are tab-separated in score.txt
COLLECTION ITEMS TERMINATED BY ','   -- map entries separated by commas
MAP KEYS TERMINATED BY ':';          -- key and value separated by a colon
要入库的数据
biansutao
'数学'
:80,
'语文'
:89,
'英语'
:95
jobs
'语文'
:60,
'数学'
:80,
'英语'
:99
入库数据
-- Load score.txt from the local filesystem into the score table,
-- replacing existing contents.
LOAD DATA LOCAL INPATH '/home/hadoop/score.txt'
OVERWRITE INTO TABLE score;
查询
hive>
select
*
from
score;
biansutao {
"数学"
:80,
"语文"
:89,
"英语"
:95}
jobs {
"语文"
:60,
"数学"
:80,
"英语"
:99}
Time
taken: 0.665 seconds
hive>
select
name
from
score;
jobs
biansutao
Time
taken: 19.778 seconds
hive>
select
t.score
from
score t;
{
"语文"
:60,
"数学"
:80,
"英语"
:99}
{
"数学"
:80,
"语文"
:89,
"英语"
:95}
Time
taken: 19.353 seconds
hive>
select
t.score[
'语文'
]
from
score t;
60
89
Time
taken: 13.054 seconds
hive>
select
t.score[
'英语'
]
from
score t;
99
95
Time
taken: 13.769 seconds
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
|
创建数据表
-- Create a table with a STRUCT-typed column.
-- NOTE: the STRUCT's field list and the closing paren were stripped by the
-- HTML extraction; restored as STRUCT<course:STRING, score:INT>, which matches
-- the query output below ({"course":"english","score":80}).
CREATE TABLE test(
    id     INT,
    course STRUCT<course:STRING, score:INT>
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'            -- columns are tab-separated in test.txt
COLLECTION ITEMS TERMINATED BY ',';  -- struct fields separated by commas
数据
1 english,80
2 math,89
3 chinese,95
入库
-- Load test.txt from the local filesystem into the test table,
-- overwriting any existing data.
LOAD DATA LOCAL INPATH '/home/hadoop/test.txt'
OVERWRITE INTO TABLE test;
查询
hive>
select
*
from
test;
OK
1 {
"course"
:
"english"
,
"score"
:80}
2 {
"course"
:
"math"
,
"score"
:89}
3 {
"course"
:
"chinese"
,
"score"
:95}
Time
taken: 0.275 seconds
hive>
select
course
from
test;
{
"course"
:
"english"
,
"score"
:80}
{
"course"
:
"math"
,
"score"
:89}
{
"course"
:
"chinese"
,
"score"
:95}
Time
taken: 44.968 seconds
select
t.course.course
from
test t;
english
math
chinese
Time
taken: 15.827 seconds
hive>
select
t.course.score
from
test t;
80
89
95
Time
taken: 13.235 seconds
|
1
2
3
4
5
6
7
8
9
|
-- (Repeat of the earlier load.) Overwrite the test table with test.txt
-- from the local filesystem.
LOAD DATA LOCAL INPATH '/home/hadoop/test.txt'
OVERWRITE INTO TABLE test;
-- Combined complex type: a MAP whose values are ARRAYs.
-- NOTE: the type parameters and the closing paren were stripped by the HTML
-- extraction; restored as MAP<STRING, ARRAY<INT>> to match the sample rows
-- below (e.g. "english:80,90,70" -> key 'english', value array [80,90,70]).
CREATE TABLE test1(
    id INT,
    a  MAP<STRING, ARRAY<INT>>
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'            -- columns are tab-separated
COLLECTION ITEMS TERMINATED BY ','   -- array elements separated by commas
MAP KEYS TERMINATED BY ':';          -- map key separated from value by a colon
1 english:80,90,70
2 math:89,78,86
3 chinese:99,100,82
-- Load test1.txt from the local filesystem into test1,
-- overwriting any existing data.
LOAD DATA LOCAL INPATH '/home/hadoop/test1.txt'
OVERWRITE INTO TABLE test1;
|
常见的函数就不废话了,和标准sql类似,下面我们要聊到的基本是HQL里面专有的函数,
hive里面的函数大致分为如下几种:Built-in、Misc.、UDF、UDTF、UDAF
我们就挑几个标准SQL里没有,但是在HIVE SQL在做统计分析常用到的来说吧。
这是内置的对集合进行操作的函数,用法举例:
1
2
3
4
|
-- External table over pre-existing tab-delimited files under /hive/dw.
-- EXTERNAL means dropping the table does not delete the underlying data.
CREATE EXTERNAL TABLE IF NOT EXISTS userInfo (
    id    INT,
    sex   STRING,
    age   INT,
    name  STRING,
    email STRING,
    sd    STRING,
    ed    STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/hive/dw';
-- Verbose exclusion filter: one inequality per excluded id.
-- (Shown for contrast with the array_contains version below.)
SELECT *
FROM userinfo
WHERE sex = 'male'
  AND (id != 1 AND id != 2 AND id != 3 AND id != 4 AND id != 5)
  AND age < 30;
-- Same exclusion expressed with array_contains: split the id blacklist string
-- into an array and reject ids found in it, then filter on age outside.
SELECT *
FROM (
    SELECT *
    FROM userinfo
    WHERE sex = 'male'
      AND !array_contains(split('1,2,3,4,5', ','), CAST(id AS STRING))
) tb1
WHERE tb1.age < 30;
|
其中建表所用的测试数据你可以用如下链接的脚本自动生成:
http://my.oschina.net/leejun2005/blog/76631
测试数据:
first {"store":{"fruit":[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],"bicycle":{"price":19.951,"color":"red1"}},"email":"amy@only_for_json_udf_test.net","owner":"amy1"} third
first {"store":{"fruit":[{"weight":9,"type":"apple"},{"weight":91,"type":"pear"}],"bicycle":{"price":19.952,"color":"red2"}},"email":"amy@only_for_json_udf_test.net","owner":"amy2"} third
first {"store":{"fruit":[{"weight":10,"type":"apple"},{"weight":911,"type":"pear"}],"bicycle":{"price":19.953,"color":"red3"}},"email":"amy@only_for_json_udf_test.net","owner":"amy3"} third
1
2
3
4
|
-- External table over space-delimited JSON test files.
-- FIX: the original ran the CREATE TABLE and the SELECT together with no
-- statement terminator; a semicolon is required after LOCATION.
CREATE EXTERNAL TABLE IF NOT EXISTS t_json(
    f1 STRING,
    f2 STRING,
    f3 STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '  -- fields separated by a space
LOCATION '/test/json';

-- Extract the top-level "owner" field from the JSON held in f2.
SELECT get_json_object(t_json.f2, '$.owner')
FROM t_json;
-- Filter rows by a nested JSON path: first fruit's weight must equal 9.
SELECT *
FROM t_json
WHERE get_json_object(t_json.f2, '$.store.fruit[0].weight') = 9;

-- A path that does not exist in the JSON yields NULL rather than an error.
SELECT get_json_object(t_json.f2, '$.non_exist_key')
FROM t_json;
|
json_tuple
A new json_tuple() UDTF is introduced in hive 0.7. It takes a set of names (keys) and a JSON string, and returns a tuple of values using one function. This is much more efficient than calling GET_JSON_OBJECT to retrieve more than one key from a single JSON string. In any case where a single JSON string would be parsed more than once, your query will be more efficient if you parse it once, which is what JSON_TUPLE is for. As JSON_TUPLE is a UDTF, you will need to use the LATERAL VIEW syntax in order to achieve the same goal.
For example,
1
|
-- Parse the same JSON column twice with get_json_object (inefficient on
-- purpose — compare with the json_tuple version below).
-- FIX: the second call referenced a.appenvets, a typo for a.appevents
-- (the column used in the first call).
-- `timestamp` is backticked because it is a Hive reserved word.
SELECT a.`timestamp`,
       get_json_object(a.appevents, '$.eventid'),
       get_json_object(a.appevents, '$.eventname')
FROM log a;
|
1
2
|
-- Parse the JSON once with the json_tuple UDTF via LATERAL VIEW, producing
-- both keys as columns f1 and f2 in a single pass.
SELECT a.timestamp, b.*
FROM log a
LATERAL VIEW json_tuple(a.appevent, 'eventid', 'eventname') b AS f1, f2;
|
通过Lateral view可以方便的将UDTF得到的行转列的结果集合在一起提供服务,因为直接在SELECT使用UDTF会存在限制,即仅仅能包含单个字段,不光是多个UDTF,仅仅单个UDTF加上其他字段也是不可以,hive提示在UDTF中仅仅能有单一的表达式。如下:
hive> select my_test("abcef:aa") as qq, 'abcd' from sunwg01;
FAILED: Error in semantic analysis: Only a single expression in the SELECT clause is supported with UDTF’s
使用Lateral view可以实现上面的需求,Lateral view语法如下:
lateralView: LATERAL VIEW udtf(expression) tableAlias AS columnAlias (',' columnAlias)*
fromClause: FROM baseTable (lateralView)*
hive> create table sunwg ( a array<int>, b array<string> )
    > ROW FORMAT DELIMITED
    > FIELDS TERMINATED BY '\t'
    > COLLECTION ITEMS TERMINATED BY ',';
OK
Time taken: 1.145 seconds
hive> load data local inpath '/home/hjl/sunwg/sunwg.txt' overwrite into table sunwg;
Copying data from file:/home/hjl/sunwg/sunwg.txt
Loading data to table sunwg
OK
Time taken: 0.162 seconds
hive> select * from sunwg;
OK
[10,11] ["tom","mary"]
[20,21] ["kate","tim"]
Time taken: 0.069 seconds
hive>
> SELECT a, name
> FROM sunwg LATERAL VIEW explode(b) r1 AS name;
OK
[10,11] tom
[10,11] mary
[20,21] kate
[20,21] tim
Time taken: 8.497 seconds
hive> SELECT id, name
> FROM sunwg LATERAL VIEW explode(a) r1 AS id
> LATERAL VIEW explode(b) r2 AS name;
OK
10 tom
10 mary
11 tom
11 mary
20 kate
20 tim
21 kate
21 tim
Time taken: 9.687 seconds
测试数据:
url1 http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1
url2 https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF#LanguageManualUDF-getjsonobject
url3 https://www.google.com.hk/#hl=zh-CN&newwindow=1&safe=strict&q=hive+translate+example&oq=hive+translate+example&gs_l=serp.3...10174.11861.6.12051.8.8.0.0.0.0.132.883.0j7.7.0...0.0...1c.1j4.8.serp.0B9C1T_n0Hs&bav=on.2,or.&bvm=bv.44770516,d.aGc&fp=e13e41a6b9dab3f6&biw=1241&bih=589
1
2
|
-- External table over space-delimited URL test files.
CREATE EXTERNAL TABLE IF NOT EXISTS t_url(
    f1 STRING,
    f2 STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '  -- fields separated by a space
LOCATION '/test/url';
-- Decompose each URL in one pass with the parse_url_tuple UDTF:
-- host, path, full query string, and the value of query parameter k1.
SELECT f1, b.*
FROM t_url
LATERAL VIEW parse_url_tuple(f2, 'HOST', 'PATH', 'QUERY', 'QUERY:k1') b
    AS host, path, query, query_id;
|
url1 facebook.com /path1/p.php k1=v1&k2=v2 v1
url2 cwiki.apache.org /confluence/display/Hive/LanguageManual+UDF NULL NULL
url3 www.google.com.hk / NULL NULL
http://blog.csdn.net/wf1982/article/details/7474601
http://www.cnblogs.com/ggjucheng/archive/2013/01/08/2850797.html
http://www.oratea.net/?p=650
https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF#LanguageManualUDF-parseurltuple
https://cwiki.apache.org/confluence/display/Hive/Tutorial
http://blog.csdn.net/inte_sleeper/article/details/7196114 hive lateral view语句:列拆分成行
转自:http://blog.csdn.net/chenxingzhen001/article/details/20901045
http://my.oschina.net/leejun2005/blog/120463