==Using regular expressions to select columns==
hive (ods)> select symbol, `price.*` from stocks;
(Note: the regex column specification uses backticks, not single quotes; on Hive 0.13+ it also requires set hive.support.quoted.identifiers=none;.)
==Table structure==
hive (ods)>
> desc emp1;
OK
col_name data_type comment
name string
salary float
subordinates array<string>
deductions map<string,float>
address struct<street:string,city:string,state:string,zip:int>
country string
state string
# Partition Information
# col_name data_type comment
country string
state string
==Querying elements of arrays, structs, and maps==
hive (ods)> select name, subordinates[0], deductions["shebao"] ,address.city from emp1;
OK
name _c1 _c2 city
lucy aLucy 100.0 Beijing
hive (ods)> select * from emp1;
OK
emp1.name emp1.salary emp1.subordinates emp1.deductions emp1.address emp1.country emp1.state
lucy 10000.0 ["aLucy"] {"shebao":100.0} {"street":"xidan","city":"Beijing","state":"Dong","zip":100000} BJ SHOUDU
Time taken: 0.137 seconds, Fetched: 1 row(s)
==Calculations==
hive (ods)>
>
> select upper(name), salary, deductions["shebao"], round(salary * (1-deductions["shebao"])) from emp1;
OK
_c0 salary _c2 _c3
LUCY 10000.0 100.0 -990000.0
Time taken: 0.187 seconds, Fetched: 1 row(s)
(The negative result is expected here: deductions["shebao"] holds the absolute amount 100.0 rather than a rate, so salary * (1 - 100.0) = -990000.0.)
==Aggregate queries==
hive (ods)>
>
>
> select count(*), avg(salary) from emp1;
==Setting parameters to improve aggregation performance==
set hive.map.aggr=true;
(hive.map.aggr enables hash-based partial aggregation inside the mappers, at the cost of extra memory per map task.)
hive (ods)>
> select count(distinct symbol) from stocks;
explode(ARRAY): generates one row for each element of the array
explode(MAP): generates one row for each key-value pair in the map, with the key in one column and the value in another
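For reference, a minimal sketch of both forms (hypothetical literals, run against the one-row dual helper table used later in these notes):
select explode(array(1,2,3)) as num from dual;          -- three rows: 1, 2, 3
select explode(map('k1',1,'k2',2)) as (k, v) from dual; -- two rows: (k1,1), (k2,2)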
hive (ods)>
> create table explode_test(
> col1 string,
> col2 string,
> col3 string
> )
> row format delimited fields terminated by '\t'
> stored as textfile;
OK
Time taken: 0.207 seconds
hive (ods)>
>
> load data local inpath '/home/hadoop/study_hadoop/explode.txt' into table explode_test;
Loading data to table ods.explode_test
Table ods.explode_test stats: [numFiles=1, totalSize=20]
OK
Time taken: 0.622 seconds
hive (ods)> select * from explode_test;
OK
explode_test.col1 explode_test.col2 explode_test.col3
a b 1,2,3
c d 4,5,6
Time taken: 0.121 seconds, Fetched: 2 row(s)
==Iterating over every element of an array==
hive (ods)> select col1, col2, name
> from explode_test
> lateral view explode(split(col3,',')) col3 as name;
OK
col1 col2 name
a b 1
a b 2
a b 3
c d 4
c d 5
c d 6
Time taken: 0.124 seconds, Fetched: 6 row(s)
hive (ods)>
> create table hzl_test
> (
> col1 string,
> col2 string,
> col3 array<int>
> )
> row format delimited
> fields terminated by '\t'
> collection items terminated by ',' ;
OK
Time taken: 0.135 seconds
hive (ods)>
>
> load data local inpath '/home/hadoop/study_hadoop/explode.txt' into table hzl_test;
Loading data to table ods.hzl_test
Table ods.hzl_test stats: [numFiles=1, totalSize=20]
OK
Time taken: 0.466 seconds
hive (ods)> select * from hzl_test;
OK
hzl_test.col1 hzl_test.col2 hzl_test.col3
a b [1,2,3]
c d [4,5,6]
Time taken: 0.117 seconds, Fetched: 2 row(s)
hive (ods)>
>
>
> select col1,col2,name
> from hzl_test
> lateral view explode(col3) col3 as name;
OK
col1 col2 name
a b 1
a b 2
a b 3
c d 4
c d 5
c d 6
Time taken: 0.12 seconds, Fetched: 6 row(s)
==Addendum:==
hive (ods)>
>
> select t.list[0],t.list[1],t.list[2] from (
> select (split(col3,',')) list from explode_test) t;
OK
_c0 _c1 _c2
1 2 3
4 5 6
==Getting array length with size()==
hive (ods)>
>
> select size(split(col3,',')) list from explode_test ;
OK
list
3
3
hive (ods)>
>
> select * from tmp_jiangzl_test;
OK
tmp_jiangzl_test.col1 tmp_jiangzl_test.col2 tmp_jiangzl_test.col3
a b 1
a b 2
a b 3
c d 4
c d 5
c d 6
Time taken: 0.116 seconds, Fetched: 6 row(s)
hive (ods)>
>
> select col1,col2,concat_ws(',',collect_set(col3))
> from tmp_jiangzl_test
> group by col1,col2;
col1 col2 _c2
a b 1,2,3
c d 4,5,6
Time taken: 34.791 seconds, Fetched: 2 row(s)
hive (ods)> select * from t_url;
OK
t_url.f1 t_url.f2
url1 http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1
url2 https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF#LanguageManualUDF-getjsonobject
url3 https://www.google.com.hk/#hl=zh-CN&newwindow=1&safe=strict&q=hive+translate+example&oq=hive+translate+example&gs_l=serp.3...10174.11861.6.12051.8.8.0.0.0.0.132.883.0j7.7.0...0.0...1c.1j4.8.serp.0B9C1T_n0Hs&bav=on.2,or.&bvm=bv.44770516,d.aGc&fp=e13e41a6b9dab3f6&biw=1241&bih=589
Time taken: 0.122 seconds, Fetched: 3 row(s)
hive (ods)> select f1,b.* from t_url lateral view parse_url_tuple(f2,'HOST','PATH','QUERY','QUERY:k1') b as host,path,query,query_id;
OK
f1 b.host b.path b.query b.query_id
url1 facebook.com /path1/p.php k1=v1&k2=v2 v1
url2 cwiki.apache.org /confluence/display/Hive/LanguageManual+UDF NULL NULL
url3 www.google.com.hk / NULL NULL
Time taken: 0.142 seconds, Fetched: 3 row(s)
==Parsing JSON with get_json_object==
hive (ods)> select get_json_object(t_json.f2, '$.owner') from t_json;
OK
_c0
amy1
amy2
amy3
Time taken: 0.106 seconds, Fetched: 3 row(s)
hive (ods)>
>
> select * from t_json;
OK
t_json.f1 t_json.f2 t_json.f3
first {"store":{"fruit":[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],"bicycle":{"price":19.951,"color":"red1"}},"email":"amy@only_for_json_udf_test.net","owner":"amy1"} third
first {"store":{"fruit":[{"weight":9,"type":"apple"},{"weight":91,"type":"pear"}],"bicycle":{"price":19.952,"color":"red2"}},"email":"amy@only_for_json_udf_test.net","owner":"amy2"} third
first {"store":{"fruit":[{"weight":10,"type":"apple"},{"weight":911,"type":"pear"}],"bicycle":{"price":19.953,"color":"red3"}},"email":"amy@only_for_json_udf_test.net","owner":"amy3"} third
Time taken: 0.102 seconds, Fetched: 3 row(s)
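Since get_json_object accepts a subset of JSONPath, nested fields and array elements of the documents above can be extracted directly; a minimal sketch against the same t_json rows:
select get_json_object(f2, '$.store.bicycle.price'),   -- nested field
       get_json_object(f2, '$.store.fruit[0].type')    -- array element
from t_json;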
hive (ods)>
hive (ods)>
>
>
> select parse_url('https://www.baidu.com/s?cl=3&tn=baidutop10&fr=top1000&wd=%E8%BF%AA%E5%A3%AB%E5%B0%BC%E6%94%B6%E8%B4%AD%E7%A6%8F%E5%85%8B%E6%96%AF&rsv_idx=2','HOST') from dual;
OK
_c0
www.baidu.com
hive (ods)>
> select regexp_replace('foobar', 'oo|ar', '') from dual;
OK
_c0
fb
Time taken: 0.112 seconds, Fetched: 1 row(s)
hive (ods)> select regexp_replace('foobar', 'oo|ar', '-') from dual;
OK
_c0
f-b-
Time taken: 0.105 seconds, Fetched: 1 row(s)
hive (ods)>
hive (ods)> select regexp_extract('foothebar', 'foo(.*?)(bar)', 1) from dual;
OK
_c0
the
Time taken: 0.105 seconds, Fetched: 1 row(s)
hive (ods)> select regexp_extract('foothebar', 'foo(.*?)(bar)', 0) from dual;
OK
_c0
foothebar
Time taken: 0.104 seconds, Fetched: 1 row(s)
hive (ods)> select regexp_extract('foothebar', 'foo(.*?)(bar)', 2) from dual;
OK
_c0
bar
hive (ods)> select * from staged_employees order by id limit 3;
hive (ods)>
>
> from (
> select upper(emp1.name) as name, emp1.subordinates[0] as sub, emp1.salary, emp1.deductions["shebao"] as shebao, emp1.address.city
> from emp1 ) e
> select e.name, e.sub,e.salary,e.shebao;
OK
e.name e.sub e.salary e.shebao
LUCY aLucy 10000.0 100.0
Time taken: 0.166 seconds, Fetched: 1 row(s)
hive (ods)> select * from emp1;
OK
emp1.name emp1.salary emp1.subordinates emp1.deductions emp1.address emp1.country emp1.state
lucy 10000.0 ["aLucy"] {"shebao":100.0} {"street":"xidan","city":"Beijing","state":"Dong","zip":100000} BJ SHOUDU
Time taken: 0.109 seconds, Fetched: 1 row(s)
hive (ods)> select name,salary,
> case when salary < 800 then 'low'
> when salary >= 800 and salary <=5000 then 'middle'
> when salary >5000 and salary <10000 then 'high'
> else 'very high'
> end as bracket
> from emp1;
OK
name salary bracket
lucy 10000.0 very high
Time taken: 0.3 seconds, Fetched: 1 row(s)
hive (ods)>
>
> select emp1.address.street from emp1 where emp1.address.street like '%Dong%';
OK
street
DongDan
DongDan
Time taken: 0.103 seconds, Fetched: 2 row(s)
hive (ods)> select emp1.address.street from emp1 where emp1.address.street rlike '.*Dong|HouHai.*';
OK
street
DongDan
DongDan
HouHai
Time taken: 0.164 seconds, Fetched: 3 row(s)
hive (ods)> select aaa,symbol,ymd,count(*) from stocks1 group by aaa,symbol,ymd having count(*)>1;
Query ID = hadoop_20171218101126_b400d584-6699-447f-8011-1aeb3019a1de
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=
In order to set a constant number of reducers:
set mapreduce.job.reduces=
Starting Job = job_1513562135174_0002, Tracking URL = http://master:8088/proxy/application_1513562135174_0002/
Kill Command = /home/hadoop/hadoop-2.6.4/bin/hadoop job -kill job_1513562135174_0002
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1
2017-12-18 10:11:38,117 Stage-1 map = 0%, reduce = 0%
2017-12-18 10:11:49,277 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 1.72 sec
2017-12-18 10:12:02,299 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 4.86 sec
MapReduce Total cumulative CPU time: 4 seconds 860 msec
Ended Job = job_1513562135174_0002
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 4.86 sec HDFS Read: 9900 HDFS Write: 63 SUCCESS
Total MapReduce CPU Time Spent: 4 seconds 860 msec
OK
aaa symbol ymd _c3
aa ok '2017-12-11' 2
aa ok '2017-12-12' 2
bb ok '2017-12-11' 2
Time taken: 37.285 seconds, Fetched: 3 row(s)
The /*+STREAMTABLE(s)*/ hint tells Hive to stream table s through the join instead of buffering it (by default the rightmost table in the join is the one streamed):
hive (ods)> select /*+STREAMTABLE(s)*/s.ymd, s.symbol, d.* from stocks s join dividends d on s.ymd = d.ymd;
hive (ods)> select label, a.qu, inmyway from
> (select label,qu from temp_testjoin_ta where dt = '2014-08-08') a
> join
> (select qu,inmyway from temp_testjoin_tb where dt = '2014-08-08') b
> on a.qu = b.qu
> group by label,inmyway,a.qu;
(tok_table_or_col label) (. (tok_table_or_col a) qu) (tok_table_or_col inmyway)
l1 q1 i1
l1 q1 i2
l1 q1 i3
Time taken: 51.026 seconds, Fetched: 3 row(s)
hive (ods)> select * from
> (select label,qu from temp_testjoin_ta where dt = '2014-08-08') a
> left semi join
> (select qu,inmyway from temp_testjoin_tb where dt = '2014-08-08') b
> on a.qu = b.qu
> ;
a.label a.qu
l1 q1
hive (ods)>
>
>
> select * from
> (select qu,inmyway from temp_testjoin_tb where dt = '2014-08-08') b
> left semi join
> (select label,qu from temp_testjoin_ta where dt = '2014-08-08') a
> on b.qu = a.qu
> ;
b.qu b.inmyway
q1 i1
q1 i1
q1 i2
q1 i3
The /*+MAPJOIN(a)*/ hint asks Hive to load the small table a into each mapper's memory and perform the join on the map side, skipping the reduce phase entirely:
hive (ods)>
> select /*+MAPJOIN(a)*/ count(*) from
> (select label,qu from temp_testjoin_ta where dt = '2014-08-08') a
> left outer join
> (select qu,inmyway from temp_testjoin_tb where dt = '2014-08-08') b
> on a.qu = b.qu
> ;
hive (ods)> select qu,inmyway from temp_testjoin_tb where dt = '2014-08-08' order by qu;
OK
qu inmyway
q1 i3
q1 i2
q1 i1
q1 i1
q2 i2
q2 i1
q3 i10
Time taken: 33.083 seconds, Fetched: 7 row(s)
set hive.groupby.skewindata=true;
How hive.groupby.skewindata=true works: it load-balances skewed data by generating a query plan with two MR jobs. In the first job, the map output is distributed randomly across the reducers, and each reducer performs a partial aggregation; because rows with the same Group By key may land on different reducers, the load is balanced. The second job then distributes the partially aggregated results by Group By key (this time guaranteeing that identical keys reach the same reducer) and performs the final aggregation.
Summary: to avoid data skew with group by or distinct, set hive.groupby.skewindata=true.
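A minimal sketch (reusing the temp_testjoin_tb table from above): with the flag on, this group by is compiled into two MR jobs, a partial aggregation over randomly distributed keys followed by the final aggregation by qu.
set hive.groupby.skewindata=true;
select qu, count(*) from temp_testjoin_tb group by qu;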
hive (ods)> select * from temp_testjoin_tb distribute by qu sort by qu;
q1 i3 2014-08-08
q1 i2 2014-08-08
q1 i1 2014-08-08
q1 i1 2014-08-08
q3 i10 2014-08-08
q2 i2 2014-08-08
q2 i1 2014-08-08
hive (ods)>
>
> select * from temp_testjoin_tb cluster by qu ;
q1 i3 2014-08-08
q1 i2 2014-08-08
q1 i1 2014-08-08
q1 i1 2014-08-08
q3 i10 2014-08-08
q2 i2 2014-08-08
q2 i1 2014-08-08
hive (ods)> select concat('$',cast(salary as string)) from emp1;
OK
_c0
$7500.0
$1200.0
$1200.0
$10000.0
hive (ods)> select * from numbers tablesample(bucket 3 out of 10 on rand());
OK
numbers.number
4
5
In tablesample(bucket x out of y ...), x (the numerator) is which bucket to take and y (the denominator) is how many buckets the data is split into.
hive (ods)> select * from numbers tablesample(bucket 3 out of 10 on number);
OK
numbers.number
2
Time taken: 0.122 seconds, Fetched: 1 row(s)
hive (ods)> select * from numbers tablesample(bucket 3 out of 10 on number);
OK
numbers.number
2
Time taken: 0.098 seconds, Fetched: 1 row(s)
hive (ods)> select * from numbers tablesample(bucket 3 out of 10 on number);
OK
numbers.number
2
Time taken: 0.101 seconds, Fetched: 1 row(s)
hive (ods)>
hive (ods)> select * from numbers tablesample(9 percent);
OK
numbers.number
1
Time taken: 0.089 seconds, Fetched: 1 row(s)
hive (ods)> select * from numbers tablesample(50 percent);
OK
numbers.number
1
2
3
4
5
6
Time taken: 0.075 seconds, Fetched: 6 row(s)
hive (ods)> select * from numbers tablesample(40 percent);
OK
numbers.number
1
2
3
4
5
Time taken: 0.092 seconds, Fetched: 5 row(s)
hive (ods)>
hive (ods)> set hive.enforce.bucketing = true;
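The numbers_bucketed table queried below was created beforehand; a sketch of the likely DDL, assuming 3 buckets on the number column (consistent with the 3 output files and the modulo-3 grouping shown below):
create table numbers_bucketed (number int)
clustered by (number) into 3 buckets;
-- with hive.enforce.bucketing=true, the insert spawns one reducer per bucket
insert overwrite table numbers_bucketed
select number from numbers;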
hive (ods)> dfs -ls /user/hive/warehouse/ods.db/numbers_bucketed;
Found 3 items
-rwxrwxrwx 3 hadoop supergroup 6 2017-12-18 17:17 /user/hive/warehouse/ods.db/numbers_bucketed/000000_0
-rwxrwxrwx 3 hadoop supergroup 9 2017-12-18 17:17 /user/hive/warehouse/ods.db/numbers_bucketed/000001_0
-rwxrwxrwx 3 hadoop supergroup 6 2017-12-18 17:17 /user/hive/warehouse/ods.db/numbers_bucketed/000002_0
hive (ods)>
hive (ods)> select * from numbers_bucketed;
OK
numbers_bucketed.number
9
6
3
10
7
4
1
8
5
2
Time taken: 0.102 seconds, Fetched: 10 row(s)
hive (ods)> dfs -ls /user/hive/warehouse/ods.db/numbers_bucketed;
Found 3 items
-rwxrwxrwx 3 hadoop supergroup 6 2017-12-18 17:17 /user/hive/warehouse/ods.db/numbers_bucketed/000000_0
-rwxrwxrwx 3 hadoop supergroup 9 2017-12-18 17:17 /user/hive/warehouse/ods.db/numbers_bucketed/000001_0
-rwxrwxrwx 3 hadoop supergroup 6 2017-12-18 17:17 /user/hive/warehouse/ods.db/numbers_bucketed/000002_0
hive (ods)> dfs -cat /user/hive/warehouse/ods.db/numbers_bucketed/000000_0
> ;
9
6
3
hive (ods)> dfs -cat /user/hive/warehouse/ods.db/numbers_bucketed/000001_0;
10
7
4
1
hive (ods)> dfs -cat /user/hive/warehouse/ods.db/numbers_bucketed/000002_0;
8
5
2
hive (ods)>
hive (ods)> select * from numbers_bucketed tablesample(bucket 2 out of 3 on number);
OK
numbers_bucketed.number
10
7
4
1
Time taken: 0.107 seconds, Fetched: 4 row(s)
hive (ods)> select * from numbers_bucketed tablesample(bucket 1 out of 3 on number);
OK
numbers_bucketed.number
9
6
3
Time taken: 0.099 seconds, Fetched: 3 row(s)
hive (ods)> select * from numbers_bucketed tablesample(bucket 3 out of 3 on number);
OK
numbers_bucketed.number
8
5
2
Time taken: 0.