Hive Basic Query Notes

==Using regular expressions==

hive (ods)> select symbol, `price.*` from stocks;

(The regex column specification must be backquoted; single quotes would just select the literal string 'price.*'. On Hive 0.13+ this also requires set hive.support.quoted.identifiers=none;.)

==Table structure==

hive (ods)> 
          > desc emp1;
OK
col_name        data_type       comment
name                    string                                      
salary                  float                                       
subordinates            array<string>                               
deductions              map<string,float>                           
address                 struct<street:string,city:string,state:string,zip:int>                      
country                 string                                      
state                   string                                      

# Partition Information          
# col_name              data_type               comment             

country                 string                                      
state                   string

==Querying elements of arrays, structs, and maps==

hive (ods)> select name, subordinates[0], deductions["shebao"] ,address.city from emp1;
OK
name    _c1     _c2     city
lucy    aLucy   100.0   Beijing
hive (ods)> select * from emp1;
OK
emp1.name       emp1.salary     emp1.subordinates       emp1.deductions emp1.address    emp1.country    emp1.state
lucy    10000.0 ["aLucy"]       {"shebao":100.0}        {"street":"xidan","city":"Beijing","state":"Dong","zip":100000} BJ      SHOUDU
Time taken: 0.137 seconds, Fetched: 1 row(s)

==Calculations==

hive (ods)> 
          > 
          > select upper(name), salary, deductions["shebao"], round(salary * (1-deductions["shebao"])) from emp1;
OK
_c0     salary  _c2     _c3
LUCY    10000.0 100.0   -990000.0
Time taken: 0.187 seconds, Fetched: 1 row(s)

(The result is negative because deductions["shebao"] stores an absolute amount, 100.0, rather than a rate, so 1 - 100.0 = -99.0.)
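A sketch of the presumably intended calculation for an absolute deduction amount:

select upper(name), salary, deductions["shebao"],
       round(salary - deductions["shebao"]) as net   -- subtract the deduction directly
from emp1;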

==Aggregate queries==

hive (ods)> 
          > 
          > 
          > select count(*), avg(salary) from emp1;

==Setting a parameter to improve aggregation performance==

hive.map.aggr=true enables partial (map-side) aggregation, which is faster but uses more memory:

set hive.map.aggr=true;
hive (ods)> 
          > select count(distinct symbol) from stocks;

Table-generating functions

Splitting one row into multiple rows in Hive: lateral view explode(col3) col3 as name

explode(ARRAY): emits one row for each element of the array

explode(MAP): emits one row per key-value pair, with the key in one column and the value in another (see the sketch below)
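
A minimal sketch of exploding a map, using the deductions column of the emp1 table above (dkey/dvalue are illustrative alias names):

select name, dkey, dvalue
from emp1
lateral view explode(deductions) d as dkey, dvalue;   -- one row per map entry
-- => lucy    shebao    100.0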

hive (ods)> 
          > create table explode_test(
          > col1 string,
          > col2 string,
          > col3 string
          > )
          > row format delimited fields terminated by '\t'
          > stored as textfile;
OK
Time taken: 0.207 seconds
hive (ods)> 
          > 
          > load data local inpath '/home/hadoop/study_hadoop/explode.txt' into table explode_test;
Loading data to table ods.explode_test
Table ods.explode_test stats: [numFiles=1, totalSize=20]
OK
Time taken: 0.622 seconds
hive (ods)> select * from explode_test;
OK
explode_test.col1       explode_test.col2       explode_test.col3
a       b       1,2,3
c       d       4,5,6
Time taken: 0.121 seconds, Fetched: 2 row(s)

==Iterating over each element of an array==

hive (ods)> select col1, col2, name
          > from explode_test
          > lateral view explode(split(col3,',')) col3 as name;
OK
col1    col2    name
a       b       1
a       b       2
a       b       3
c       d       4
c       d       5
c       d       6
Time taken: 0.124 seconds, Fetched: 6 row(s)

Method 2: iterating over each element of an array (stored as a real array column)

hive (ods)> 
          > 
          > create table hzl_test
          > (
          > col1 string,
          > col2 string,
          > col3 array<int>
          > )
          > row format delimited
          > fields terminated by '\t'
          > collection items terminated by ',';
OK
Time taken: 0.135 seconds
hive (ods)> 
          > 
          > load data local inpath '/home/hadoop/study_hadoop/explode.txt' into table hzl_test;
Loading data to table ods.hzl_test
Table ods.hzl_test stats: [numFiles=1, totalSize=20]
OK
Time taken: 0.466 seconds
hive (ods)> select * from hzl_test;
OK
hzl_test.col1   hzl_test.col2   hzl_test.col3
a       b       [1,2,3]
c       d       [4,5,6]
Time taken: 0.117 seconds, Fetched: 2 row(s)
hive (ods)> 
          > 
          > 
          > select col1,col2,name
          > from hzl_test
          > lateral view explode(col3) col3 as name;
OK
col1    col2    name
a       b       1
a       b       2
a       b       3
c       d       4
c       d       5
c       d       6
Time taken: 0.12 seconds, Fetched: 6 row(s)

==Addendum:==

hive (ods)> 
          > 
          > select t.list[0],t.list[1],t.list[2] from (
          > select (split(col3,',')) list from explode_test) t;
OK
_c0     _c1     _c2
1       2       3
4       5       6

==Checking array length with size()==

hive (ods)> 
          > 
          > select size(split(col3,',')) list from explode_test ;
OK
list
3
3

Collapsing rows into a delimited string in Hive: concat_ws(',', collect_set(col3))

hive (ods)> 
          > 
          > select * from tmp_jiangzl_test;
OK
tmp_jiangzl_test.col1   tmp_jiangzl_test.col2   tmp_jiangzl_test.col3
a       b       1
a       b       2
a       b       3
c       d       4
c       d       5
c       d       6
Time taken: 0.116 seconds, Fetched: 6 row(s)
hive (ods)> 
          > 
          > select col1,col2,concat_ws(',',collect_set(col3))
          > from tmp_jiangzl_test
          > group by col1,col2;

col1    col2    _c2
a       b       1,2,3
c       d       4,5,6
Time taken: 34.791 seconds, Fetched: 2 row(s)          
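
Note that collect_set removes duplicate values while aggregating; to keep duplicates (Hive 0.13+), a sketch with collect_list:

select col1, col2, concat_ws(',', collect_list(col3))   -- collect_list keeps duplicates
from tmp_jiangzl_test
group by col1, col2;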

parse_url_tuple

hive (ods)> select * from t_url;
OK
t_url.f1        t_url.f2
url1    http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1
url2    https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF#LanguageManualUDF-getjsonobject
url3    https://www.google.com.hk/#hl=zh-CN&newwindow=1&safe=strict&q=hive+translate+example&oq=hive+translate+example&gs_l=serp.3...10174.11861.6.12051.8.8.0.0.0.0.132.883.0j7.7.0...0.0...1c.1j4.8.serp.0B9C1T_n0Hs&bav=on.2,or.&bvm=bv.44770516,d.aGc&fp=e13e41a6b9dab3f6&biw=1241&bih=589
Time taken: 0.122 seconds, Fetched: 3 row(s)
hive (ods)> select f1,b.* from t_url lateral view parse_url_tuple(f2,'HOST','PATH','QUERTY','QUERTY:k1')b as host,path,querty,querty_id;
OK
f1      b.host  b.path  b.querty        b.querty_id
url1    facebook.com    /path1/p.php    NULL    NULL
url2    cwiki.apache.org        /confluence/display/Hive/LanguageManual+UDF     NULL    NULL
url3    www.google.com.hk       /       NULL    NULL
Time taken: 0.142 seconds, Fetched: 3 row(s)

(The querty columns come back NULL because 'QUERTY' is a typo; the part name is 'QUERY'. With parse_url_tuple(f2, 'HOST', 'PATH', 'QUERY', 'QUERY:k1'), url1 would yield k1=v1&k2=v2 and v1.)


lateral view

  • lateral view is the construct Hive provides for use with UDTFs; it solves the problem that a UDTF cannot appear alongside additional select columns. When we split a column of a Hive table and want to turn it into a 1-to-N shape (one row becoming many rows), Hive does not allow other select expressions next to a bare UDTF call; lateral view lifts that restriction (see the sketch below).
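
A minimal sketch of the restriction, against the explode_test table from above:

-- select col1, explode(split(col3, ',')) from explode_test;
--   fails with a SemanticException: only a single expression is supported with a UDTF
-- lateral view lifts the restriction:
select col1, name
from explode_test
lateral view explode(split(col3, ',')) t as name;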

get_json_object

hive (ods)> select get_json_object(t_json.f2, '$.owner') from t_json;
OK
_c0
amy1
amy2
amy3
Time taken: 0.106 seconds, Fetched: 3 row(s)
hive (ods)> 
          > 
          > select * from t_json;
OK
t_json.f1       t_json.f2       t_json.f3
first   {"store":{"fruit":[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],"bicycle":{"price":19.951,"color":"red1"}},"email":"amy@only_for_json_udf_test.net","owner":"amy1"}    third
first   {"store":{"fruit":[{"weight":9,"type":"apple"},{"weight":91,"type":"pear"}],"bicycle":{"price":19.952,"color":"red2"}},"email":"amy@only_for_json_udf_test.net","owner":"amy2"}   third
first   {"store":{"fruit":[{"weight":10,"type":"apple"},{"weight":911,"type":"pear"}],"bicycle":{"price":19.953,"color":"red3"}},"email":"amy@only_for_json_udf_test.net","owner":"amy3"} third
Time taken: 0.102 seconds, Fetched: 3 row(s)
hive (ods)>
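
get_json_object also handles nested paths and array indexes; a sketch against the same t_json table (the first row would yield 19.951 and apple):

select get_json_object(f2, '$.store.bicycle.price') as price,       -- nested field
       get_json_object(f2, '$.store.fruit[0].type') as first_fruit  -- array element
from t_json;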

URL parsing function: parse_url

hive (ods)> 
          > 
          > 
          > select parse_url('https://www.baidu.com/s?cl=3&tn=baidutop10&fr=top1000&wd=%E8%BF%AA%E5%A3%AB%E5%B0%BC%E6%94%B6%E8%B4%AD%E7%A6%8F%E5%85%8B%E6%96%AF&rsv_idx=2','HOST') from dual;
OK
_c0
www.baidu.com
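
Besides HOST, parse_url accepts PATH, QUERY, REF, PROTOCOL, AUTHORITY, FILE and USERINFO, and a third argument extracts a single query parameter; a sketch:

select parse_url('https://www.baidu.com/s?cl=3&tn=baidutop10', 'QUERY', 'tn') from dual;
-- => baidutop10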

Other built-in functions

Regular expressions

Regex replace function: regexp_replace

hive (ods)> 
          > select regexp_replace('foobar', 'oo|ar', '') from dual;
OK
_c0
fb
Time taken: 0.112 seconds, Fetched: 1 row(s)
hive (ods)> select regexp_replace('foobar', 'oo|ar', '-') from dual;
OK
_c0
f-b-
Time taken: 0.105 seconds, Fetched: 1 row(s)
hive (ods)>

Regex extract function: regexp_extract

hive (ods)> select regexp_extract('foothebar', 'foo(.*?)(bar)', 1) from dual;
OK
_c0
the
Time taken: 0.105 seconds, Fetched: 1 row(s)
hive (ods)> select regexp_extract('foothebar', 'foo(.*?)(bar)', 0) from dual;
OK
_c0
foothebar
Time taken: 0.104 seconds, Fetched: 1 row(s)
hive (ods)> select regexp_extract('foothebar', 'foo(.*?)(bar)', 2) from dual;
OK
_c0
bar

The limit clause

hive (ods)> select * from staged_employees order by id limit 3;

Nested select statements

hive (ods)> 
          > 
          > from (
          > select upper(emp1.name) as name, emp1.subordinates[0] as sub, emp1.salary, emp1.deductions["shebao"] as shebao, emp1.address.city 
          > from emp1 ) e
          > select e.name, e.sub,e.salary,e.shebao;
OK
e.name  e.sub   e.salary        e.shebao
LUCY    aLucy   10000.0 100.0
Time taken: 0.166 seconds, Fetched: 1 row(s)

case when

hive (ods)> select * from emp1;
OK
emp1.name       emp1.salary     emp1.subordinates       emp1.deductions emp1.address    emp1.country    emp1.state
lucy    10000.0 ["aLucy"]       {"shebao":100.0}        {"street":"xidan","city":"Beijing","state":"Dong","zip":100000} BJ      SHOUDU
Time taken: 0.109 seconds, Fetched: 1 row(s)
hive (ods)> select name,salary,
          > case when salary < 800 then 'low'
          > when salary >= 800 and salary <=5000 then 'middle'
          > when salary >5000 and salary <10000 then 'high'
          > else 'very high'
          > end as bracket 
          > from emp1;
OK
name    salary  bracket
lucy    10000.0 very high
Time taken: 0.3 seconds, Fetched: 1 row(s)
hive (ods)>

Enable local mode: set hive.exec.mode.local.auto=true;

Column aliases

  • A column alias cannot be referenced in the where clause, but a nested select works around this (see the sketch below).
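
A minimal sketch of the workaround, using the emp1 table from above (annual is an illustrative alias):

-- select name, salary * 12 as annual from emp1 where annual > 100000;   -- fails: invalid column reference
select e.name, e.annual
from (select name, salary * 12 as annual from emp1) e
where e.annual > 100000;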

like rlike

hive (ods)> 
          > 
          > select emp1.address.street from emp1 where emp1.address.street like '%Dong%';
OK
street
DongDan
DongDan
Time taken: 0.103 seconds, Fetched: 2 row(s)
hive (ods)> select emp1.address.street from emp1 where emp1.address.street rlike '.*Dong|HouHai.*';
OK
street
DongDan
DongDan
HouHai
Time taken: 0.164 seconds, Fetched: 3 row(s)

group by having

hive (ods)> select aaa,symbol,ymd,count(*) from stocks1 group by aaa,symbol,ymd having count(*)>1;

Query ID = hadoop_20171218101126_b400d584-6699-447f-8011-1aeb3019a1de
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
Starting Job = job_1513562135174_0002, Tracking URL = http://master:8088/proxy/application_1513562135174_0002/
Kill Command = /home/hadoop/hadoop-2.6.4/bin/hadoop job  -kill job_1513562135174_0002
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1
2017-12-18 10:11:38,117 Stage-1 map = 0%,  reduce = 0%
2017-12-18 10:11:49,277 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 1.72 sec
2017-12-18 10:12:02,299 Stage-1 map = 100%,  reduce = 100%, Cumulative CPU 4.86 sec
MapReduce Total cumulative CPU time: 4 seconds 860 msec
Ended Job = job_1513562135174_0002
MapReduce Jobs Launched: 
Stage-Stage-1: Map: 1  Reduce: 1   Cumulative CPU: 4.86 sec   HDFS Read: 9900 HDFS Write: 63 SUCCESS
Total MapReduce CPU Time Spent: 4 seconds 860 msec

OK
aaa     symbol  ymd     _c3
aa      ok      '2017-12-11'    2
aa      ok      '2017-12-12'    2
bb      ok      '2017-12-11'    2
Time taken: 37.285 seconds, Fetched: 3 row(s)

Use the /*+STREAMTABLE(table_name)*/ hint to specify which table should be streamed through the join, i.e. the large table.

hive (ods)> select /*+STREAMTABLE(s)*/s.ymd, s.symbol, d.* from stocks s join dividends d on s.ymd = d.ymd;

join group by

select * from
(select label, qu from temp_testjoin_ta where dt = '2014-08-08') a
join
(select qu, inmyway from temp_testjoin_tb where dt = '2014-08-08') b
on a.qu = b.qu
group by label, inmyway, a.qu;

(This Hive version prints AST-style names for unaliased group-by output columns:)

(tok_table_or_col label)        (. (tok_table_or_col a) qu)     (tok_table_or_col inmyway)
l1      q1      i1
l1      q1      i2
l1      q1      i3
Time taken: 51.026 seconds, Fetched: 3 row(s)

left semi join

hive (ods)> Select * from
          > (select label,qu from temp_testjoin_ta where dt = '2014-08-08') a
          > left semi Join
          > (select qu,inmyway from temp_testjoin_tb where dt = '2014-08-08') b
          > on a.qu = b.qu
          > ;

a.label a.qu
l1      q1
hive (ods)> 
          > 
          > 
          > Select * from
          > (select qu,inmyway from temp_testjoin_tb where dt = '2014-08-08') b
          > left semi Join
          > (select label,qu from temp_testjoin_ta where dt = '2014-08-08') a
          > On b.qu = a.qu
          > ;


b.qu    b.inmyway
q1      i1
q1      i1
q1      i2
q1      i3
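
A left semi join returns only the left table's columns (the right table may be referenced only in the on clause), and since Hive 0.13 an in subquery is an equivalent spelling; a sketch (dt filters omitted):

select label, qu
from temp_testjoin_ta
where qu in (select qu from temp_testjoin_tb);   -- Hive 0.13+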

map side join

hive (ods)> 
          > 
          > 
          > Select /*+MAPJOIN(a)*/count(*) from
          > (select label,qu from temp_testjoin_ta where dt = '2014-08-08') a
          > left outer Join
          > (select qu,inmyway from temp_testjoin_tb where dt = '2014-08-08') b
          > on a.qu = b.qu
          > ;
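
The /*+MAPJOIN(a)*/ hint loads the small table a into memory so the join runs map-side with no reduce phase. Newer Hive versions can do the conversion automatically, making the hint unnecessary; a sketch of the relevant settings:

set hive.auto.convert.join=true;                 -- convert to map join automatically
set hive.mapjoin.smalltable.filesize=25000000;   -- byte-size threshold for the "small" table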

order by note: desc sorts descending, asc ascending.

hive (ods)> select qu,inmyway from temp_testjoin_tb where dt = '2014-08-08' order by qu;

OK
qu      inmyway
q1      i3
q1      i2
q1      i1
q1      i1
q2      i2
q2      i1
q3      i10
Time taken: 33.083 seconds, Fetched: 7 row(s)

Avoiding data skew

set hive.groupby.skewindata=true;
  • How hive.groupby.skewindata=true works: it load-balances when the data is skewed. With the option set to true, the generated query plan contains two MR jobs. In the first MR job, the map output is distributed randomly across the reducers; each reducer performs a partial aggregation and emits its result, so rows with the same group-by key may land on different reducers, which balances the load. The second MR job then distributes the pre-aggregated results across the reducers by the group-by key (this time guaranteeing that identical group-by keys go to the same reducer) and completes the final aggregation.

  • Summary: to avoid data-skew problems with group by or distinct, set hive.groupby.skewindata=true.

cluster by / sort by / distribute by

hive (ods)> select * from temp_testjoin_tb distribute by qu sort by qu;

q1      i3      2014-08-08
q1      i2      2014-08-08
q1      i1      2014-08-08
q1      i1      2014-08-08
q3      i10     2014-08-08
q2      i2      2014-08-08
q2      i1      2014-08-08

hive (ods)> 
          > 
          > select * from temp_testjoin_tb cluster by qu ;

q1      i3      2014-08-08
q1      i2      2014-08-08
q1      i1      2014-08-08
q1      i1      2014-08-08
q3      i10     2014-08-08
q2      i2      2014-08-08
q2      i1      2014-08-08
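
sort by only sorts within each reducer (which is why the outputs above are not globally ordered), and cluster by qu is just shorthand for distribute by qu sort by qu, ascending only; to control the direction, spell it out:

select * from temp_testjoin_tb distribute by qu sort by qu desc;   -- per-reducer descending sort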

Type conversion: cast(value as TYPE)

hive (ods)> select concat('$',cast(salary as string)) from emp1;
OK
_c0
$7500.0
$1200.0
$1200.0
$10000.0
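
cast returns NULL when the value cannot be converted; a sketch:

select cast(salary as int), cast('abc' as int) from emp1;   -- 'abc' does not parse as int, so the second column is NULL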

Hive sampling queries

  • Bucket sampling with rand()
hive (ods)> select * from numbers tablesample(bucket 3 out of 10 on rand());
OK
numbers.number
4
5

In tablesample(bucket x out of y on ...), the numerator x picks which bucket to take and the denominator y is how many buckets the rows are hashed into. Sampling on a real column instead of rand() is deterministic, as the repeated runs below show:

hive (ods)> select * from numbers tablesample(bucket 3 out of 10 on number);
OK
numbers.number
2
Time taken: 0.122 seconds, Fetched: 1 row(s)
hive (ods)> select * from numbers tablesample(bucket 3 out of 10 on number);
OK
numbers.number
2
Time taken: 0.098 seconds, Fetched: 1 row(s)
hive (ods)> select * from numbers tablesample(bucket 3 out of 10 on number);
OK
numbers.number
2
Time taken: 0.101 seconds, Fetched: 1 row(s)
hive (ods)>
  • Block sampling: percent
hive (ods)> select * from numbers tablesample(9 percent);
OK
numbers.number
1
Time taken: 0.089 seconds, Fetched: 1 row(s)
hive (ods)> select * from numbers tablesample(50 percent);
OK
numbers.number
1
2
3
4
5
6
Time taken: 0.075 seconds, Fetched: 6 row(s)
hive (ods)> select * from numbers tablesample(40 percent);
OK
numbers.number
1
2
3
4
5
Time taken: 0.092 seconds, Fetched: 5 row(s)
hive (ods)>
  • Input pruning on bucketed tables
hive (ods)> set hive.enforce.bucketing = true;
hive (ods)> dfs -ls /user/hive/warehouse/ods.db/numbers_bucketed;
Found 3 items
-rwxrwxrwx   3 hadoop supergroup          6 2017-12-18 17:17 /user/hive/warehouse/ods.db/numbers_bucketed/000000_0
-rwxrwxrwx   3 hadoop supergroup          9 2017-12-18 17:17 /user/hive/warehouse/ods.db/numbers_bucketed/000001_0
-rwxrwxrwx   3 hadoop supergroup          6 2017-12-18 17:17 /user/hive/warehouse/ods.db/numbers_bucketed/000002_0
hive (ods)>
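
These notes don't show how numbers_bucketed was created; a presumed setup consistent with the three bucket files above (rows hash to buckets by number % 3):

create table numbers_bucketed (number int)
clustered by (number) into 3 buckets;
set hive.enforce.bucketing = true;   -- route the insert through one reducer per bucket
insert overwrite table numbers_bucketed select number from numbers;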


hive (ods)> select * from numbers_bucketed;
OK
numbers_bucketed.number
9
6
3
10
7
4
1
8
5
2
Time taken: 0.102 seconds, Fetched: 10 row(s)
hive (ods)> dfs -ls /user/hive/warehouse/ods.db/numbers_bucketed;
Found 3 items
-rwxrwxrwx   3 hadoop supergroup          6 2017-12-18 17:17 /user/hive/warehouse/ods.db/numbers_bucketed/000000_0
-rwxrwxrwx   3 hadoop supergroup          9 2017-12-18 17:17 /user/hive/warehouse/ods.db/numbers_bucketed/000001_0
-rwxrwxrwx   3 hadoop supergroup          6 2017-12-18 17:17 /user/hive/warehouse/ods.db/numbers_bucketed/000002_0
hive (ods)> dfs -cat /user/hive/warehouse/ods.db/numbers_bucketed/000000_0
          > ;
9
6
3
hive (ods)> dfs -cat /user/hive/warehouse/ods.db/numbers_bucketed/000001_0;
10
7
4
1
hive (ods)> dfs -cat /user/hive/warehouse/ods.db/numbers_bucketed/000002_0;
8
5
2
hive (ods)> 

Sampling from the buckets:

hive (ods)> select * from numbers_bucketed tablesample(bucket 2 out of 3 on number);
OK
numbers_bucketed.number
10
7
4
1
Time taken: 0.107 seconds, Fetched: 4 row(s)
hive (ods)> select * from numbers_bucketed tablesample(bucket 1 out of 3 on number);
OK
numbers_bucketed.number
9
6
3
Time taken: 0.099 seconds, Fetched: 3 row(s)
hive (ods)> select * from numbers_bucketed tablesample(bucket 3 out of 3 on number);
OK
numbers_bucketed.number
8
5
2
Time taken: 0.
