-- List the built-in functions
hive> show functions;
-- Show the usage of a built-in function
hive> desc function upper;
-- Show the detailed usage of a built-in function
hive> desc function extended upper;
round(double a)
Return type: BIGINT. Returns the integer part of the double value, rounded half up.
hive> select round(3.1415926);
3
hive> select round(3.5);
4
round(double a, int d)
Returns a double rounded to d decimal places.
hive> select round(3.1415926, 4);
3.1416
floor(double a)
Returns the largest integer that is less than or equal to the double value.
hive> select floor(3.1415926);
3
hive> select floor(25);
25
ceil(double a)
Returns the smallest integer that is greater than or equal to the double value.
hive> select ceil(3.1415926);
4
hive> select ceil(46);
46
ceiling(double a)
Same as ceil.
hive> select ceiling(3.1415926);
4
hive> select ceiling(46);
46
rand(), rand(int seed)
Returns a random number between 0 and 1. If a seed is specified, the function produces a stable (repeatable) sequence of random numbers.
hive> select rand();
0.5577432776034763
hive> select rand();
0.6638336467363424
hive> select rand(100);
0.7220096548596434
hive> select rand(100);
0.7220096548596434
from_unixtime(bigint unixtime[, string format])
Return type: string. Converts a UNIX timestamp (seconds since 1970-01-01 00:00:00 UTC) to a time string in the current time zone, in the given format.
hive> select from_unixtime(1323308943, 'yyyyMMdd');
20111208
unix_timestamp()
Return type: bigint. Returns the current UNIX timestamp in the current time zone.
-- Get the current UNIX timestamp
hive> select unix_timestamp();
1638971143
unix_timestamp(string date)
Return type: bigint. Converts a date in the format "yyyy-MM-dd HH:mm:ss" to a UNIX timestamp.
hive> select unix_timestamp('2021-12-07 13:01:03');
1638882063
unix_timestamp(string date, string pattern)
Converts a date in the given pattern to a UNIX timestamp.
hive> select unix_timestamp('20211207 23:01:03','yyyyMMdd HH:mm:ss');
1638918063
to_date(string datetime)
Return type: string. Returns the date part of a datetime value.
hive> select to_date('2021-12-08 10:03:01');
2021-12-08
year(string date)
Return type: int. Returns the year of the date.
hive> select year('2021-12-08 10:03:01');
2021
month(string date)
Return type: int. Returns the month of the date.
hive> select month('2021-12-08 10:03:01');
12
day(string date)
Return type: int. Returns the day of the date.
hive> select day('2021-12-08 10:03:01');
8
hour(string date)
Return type: int. Returns the hour of the date.
hive> select hour('2021-12-08 10:03:01');
10
minute(string date)
Return type: int. Returns the minute of the date.
hive> select minute('2021-12-08 10:03:01');
3
second(string date)
Return type: int. Returns the second of the date.
hive> select second('2021-12-08 10:03:01');
1
weekofyear(string date)
Return type: int. Returns the week number of the year in which the date falls.
hive> select weekofyear('2021-12-08 10:03:01');
49
datediff(string enddate, string startdate)
Return type: int. Returns the number of days from startdate to enddate.
hive> select datediff('2021-12-08','2021-05-09');
213
date_add(string startdate, int days)
Return type: string. Returns the date that is days days after startdate.
hive> select date_add('2021-12-08',10) ;
2021-12-18
date_sub(string startdate, int days)
Return type: string. Returns the date that is days days before startdate.
hive> select date_sub('2021-12-18',10) ;
2021-12-08
if(boolean testCondition, T valueTrue, T valueFalseOrNull)
Returns valueTrue when testCondition is TRUE; otherwise returns valueFalseOrNull.
hive> select if(1=2,100,200);
200
hive> select if(1=1,100,200);
100
COALESCE(T v1, T v2, …)
Returns the first non-NULL argument; returns NULL if all arguments are NULL.
hive> select COALESCE(null,'100','50') ;
100
CASE a WHEN b THEN c [WHEN d THEN e]* [ELSE f] END
Returns c if a equals b; returns e if a equals d; otherwise returns f.
hive> select case 100 when 50 then 'tom' when 100 then 'mary' else 'tim' end;
mary
hive> select case 200 when 50 then 'tom' when 100 then 'mary' else 'tim' end;
tim
CASE WHEN a THEN b [WHEN c THEN d]* [ELSE e] END
Returns b if a is TRUE; returns d if c is TRUE; otherwise returns e.
hive> select case when 1=2 then 'tom' when 2=2 then 'mary' else 'tim' end;
mary
hive> select case when 1=1 then 'tom' when 2=2 then 'mary' else 'tim' end;
tom
length(string A)
Return type: int. Returns the length of string A.
hive> select length('abcedfg');
7
reverse(string A)
Return type: string. Returns string A reversed.
hive> select reverse('abcdefg');
gfedcba
concat(string A, string B…)
Return type: string. Returns the concatenation of the input strings; accepts any number of arguments.
hive> select concat('abc','def','gh');
abcdefgh
concat_ws(string SEP, string A, string B…)
Return type: string. Returns the concatenation of the input strings with SEP as the separator between them.
hive> select concat_ws(',','abc','def','gh') ;
abc,def,gh
substr(string A, int start), substring(string A, int start)
Return type: string. Returns the substring of A from position start to the end.
hive> select substr('abcdeere',3) ;
cdeere
hive> select substring('abcde',3);
cde
hive> select substr('abcde',-2);
de -- (a negative start counts from the end of the string)
substr(string A, int start, int len), substring(string A, int start, int len)
Return type: string. Returns the substring of A starting at position start with length len.
hive> select substr('abcde',3,2);
cd
hive> select substring('abcde',3,2);
cd
hive> select substring('abcde',-3,2);
cd
upper(string A), ucase(string A)
Return type: string. Returns string A in upper case.
hive> select upper('abSEd');
ABSED
hive> select ucase('abSEd');
ABSED
lower(string A), lcase(string A)
Return type: string. Returns string A in lower case.
hive> select lower('abSEd') ;
absed
hive> select lcase('abSEd');
absed
trim(string A)
Return type: string. Removes spaces from both ends of the string.
hive> select trim(' ab c ');
ab c
parse_url(string urlString, string partToExtract [, string keyToExtract])
Return type: string. Returns the specified part of a URL. Valid values of partToExtract are HOST, PATH, QUERY, REF, PROTOCOL, AUTHORITY, FILE, and USERINFO.
hive> select parse_url('https://www.tableName.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST');
www.tableName.com
hive> select parse_url('https://www.tableName.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k1');
v1
get_json_object(string json_string, string path)
Return type: string. Parses the JSON string json_string and returns the content specified by path. Returns NULL if the input JSON string is invalid.
hive> select get_json_object('{"store":{"fruit":\[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}], "bicycle":{"price":19.95,"color":"red"} },"email":"amy@only_for_json_udf_test.net","owner":"amy"}','$.owner');
amy
repeat(string str, int n)
Return type: string. Returns str repeated n times.
hive> select repeat('abc', 5);
abcabcabcabcabc
split(string str, string pat)
Return type: array. Splits str around pat and returns the resulting array of strings.
hive> select split('abtcdtef','t');
["ab","cd","ef"]
count(*), count(expr), count(DISTINCT expr[, expr...])
Return type: int
hive> select count(*) from tableName;
21
hive> select count(distinct name) from tableName;
11
sum(col), sum(DISTINCT col)
Return type: double
hive> select sum(t) from tableName;
100
hive> select sum(distinct t) from tableName;
70
avg(col), avg(DISTINCT col)
Return type: double
hive> select avg(t) from tableName;
50
hive> select avg(distinct t) from tableName;
30
min(col)
Return type: double. Returns the minimum value of column col in the result set.
hive> select min(t) from tableName;
20
max(col)
Return type: double. Returns the maximum value of column col in the result set.
hive> select max(t) from tableName;
120
map(key1, value1, key2, value2, …)
Builds a map from the given key-value pairs.
Sample data (score_map.txt):
zhangsan sx:80,yw:89,zz:95
lisi sx:60,yw:80,zz:99
-- Create the table
create table score_map(name string, score map<string, int>)
row format delimited fields terminated by '\t'
collection items terminated by ','
map keys terminated by ':';
-- Load the data into the Hive table
load data local inpath '/bigdata/install/hivedatas/score_map.txt' overwrite into table score_map;
-- Accessing map data:
-- Get all the values:
select name, map_values(score) from score_map;
-- Get all the keys:
select name, map_keys(score) from score_map;
-- Look up a value by key
select name, score["sx"] from score_map;
-- Get the number of map entries
select name, size(score) from score_map;
-- Build a map literal
select map(1, 'zs', 2, 'lisi');
struct(val1, val2, val3, …)
Builds a struct from the given values, similar to a struct in C; fields are accessed with dot notation (X.X). Suppose the data means: movie ABC was rated by 1254 people, with a score of 7.4.
Sample data (struct.txt):
ABC 1254:7.4
DEF 256:4.9
XYZ 456:5.4
-- Create the struct table
hive> create table movie_score(name string, info struct<number:int,score:float>)
row format delimited fields terminated by "\t"
collection items terminated by ":";
-- Load the data
load data local inpath '/bigdata/install/hivedatas/struct.txt' overwrite into table movie_score;
-- Query the data
hive> select * from movie_score;
hive> select name, info.number, info.score from movie_score;
OK
name number score
ABC 1254 7.4
DEF 256 4.9
XYZ 456 5.4
-- Build a struct literal
hive> select struct(1, 'anzhulababy', 'moon', 1.68);
OK
_c0
{"col1":1,"col2":"anzhulababy","col3":"moon","col4":1.68}
array(val1, val2, …)
Builds an array from the given values.
Sample data (person.txt):
biansutao beijing,shanghai,tianjin,hangzhou
linan changchun,chengdu,wuhan
hive> create table person(name string, work_locations array<string>)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY ',';
-- Load the data
hive> load data local inpath '/bigdata/install/hivedatas/person.txt' overwrite into table person;
-- Query all the data
hive> select * from person;
OK
person.name person.work_locations
biansutao ["beijing","shanghai","tianjin","hangzhou"]
linan ["changchun","chengdu","wuhan"]
-- Query by array index
hive> select work_locations[0] from person;
OK
_c0
beijing
changchun
-- Query the whole array column
hive> select work_locations from person;
OK
work_locations
["beijing","shanghai","tianjin","hangzhou"]
["changchun","chengdu","wuhan"]
-- Get the number of elements
hive> select size(work_locations) from person;
OK
_c0
4
3
-- Build array literals
select array(1, 2, 1);
select array(1, 'a', 1.0);
select array(1, 2, 1.0);
size(Map)
Return type: int. Returns the number of entries in the map.
hive> select size(map(1, 'zs', 2, 'anzhulababy'));
2
size(Array)
Return type: int. Returns the length of the array.
hive> select size(t) from tableName;
4
cast(expr as <type>)
Return type: the target type. Returns expr converted to the specified type.
hive> select cast('1' as bigint);
1
CONCAT(string A/col, string B/col…)
Returns the concatenation of the input strings; accepts any number of arguments.
CONCAT_WS(separator, str1, str2,...)
A special form of CONCAT() that takes a separator as its first argument and places it between the remaining arguments.
COLLECT_SET(col)
Accepts only primitive types; it de-duplicates the values of a column and aggregates them into an array-typed field.
Sample data:

name | constellation | blood_type
---|---|---
孙悟空 | 白羊座 | A
老王 | 射手座 | A
宋宋 | 白羊座 | B
猪八戒 | 白羊座 | A
按住啦baby | 射手座 | A

Expected output (names grouped by constellation and blood type):
射手座,A 老王|按住啦baby
白羊座,A 孙悟空|猪八戒
白羊座,B 宋宋
Contents of constellation.txt:
孙悟空 白羊座 A
老王 射手座 A
宋宋 白羊座 B
猪八戒 白羊座 A
凤姐 射手座 A
-- Create the table
hive> create table person_info(name string, constellation string, blood_type string) row format delimited fields terminated by "\t";
-- Load the data
hive> load data local inpath '/bigdata/install/hivedatas/constellation.txt' into table person_info;
-- Query according to the requirement
hive> select t1.base, concat_ws('|', collect_set(t1.name)) name from (select name, concat(constellation, "," , blood_type) base from person_info) t1 group by t1.base;
EXPLODE(col)
Splits a complex array or map column of a Hive row into multiple rows.
LATERAL VIEW
Syntax: LATERAL VIEW udtf(expression) tableAlias AS columnAlias
Sample data (movie.txt):
《疑犯追踪》 悬疑,动作,科幻,剧情
《Lie to me》 悬疑,警匪,动作,心理,剧情
《战狼2》 战争,动作,灾难
Expected result (one row per movie and category):
《疑犯追踪》 悬疑
《疑犯追踪》 动作
《疑犯追踪》 科幻
《疑犯追踪》 剧情
《Lie to me》 悬疑
《Lie to me》 警匪
《Lie to me》 动作
《Lie to me》 心理
《Lie to me》 剧情
《战狼2》 战争
《战狼2》 动作
《战狼2》 灾难
-- Create the Hive table
hive> create table movie_info(movie string, category array<string>)
row format delimited fields terminated by "\t"
collection items terminated by ",";
-- Load the data
hive> load data local inpath "/bigdata/install/hivedatas/movie.txt" into table movie_info;
-- Query according to the requirement
hive> select movie, category_name from movie_info lateral view explode(category) table_tmp as category_name;
-- Fetch task conversion: simple queries can be answered by a direct fetch instead of launching MapReduce
select * from score;
-- With the setting at none, every query goes through MapReduce
set hive.fetch.task.conversion=none;
select * from score;
select s_id from score;
select s_id from score limit 3;
-- With the setting at more (the default in recent versions), simple SELECT/filter/LIMIT queries are fetched directly without MapReduce
set hive.fetch.task.conversion=more;
select * from score;
select s_id from score;
select s_id from score limit 3;
-- Enable local mode and run the query; default is false
set hive.exec.mode.local.auto=true; -- enable local MapReduce
-- Maximum input size for local mode; local mode is used when the input is smaller than this value. Default is 134217728, i.e. 128M
set hive.exec.mode.local.auto.inputbytes.max=50000000;
-- Maximum number of input files for local mode; local mode is used when the number of input files is smaller than this value
set hive.exec.mode.local.auto.input.files.max=5;
-- Run the query
select * from score;
-- Turn local mode off again
set hive.exec.mode.local.auto=false;
select * from table1 A
left join table2 B on A.id = B.id
left join table3 C on B.item_id = C.id
use myhive;
-- Create the tables
create table ori(id bigint, time_statmp bigint, uid string, keyword string, url_rank int, click_num int, click_url string) row format delimited fields terminated by '\t';
create table nullidtable(id bigint, time_statmp bigint, uid string, keyword string, url_rank int, click_num int, click_url string) row format delimited fields terminated by '\t';
create table jointable(id bigint, time_statmp bigint, uid string, keyword string, url_rank int, click_num int, click_url string) row format delimited fields terminated by '\t';
-- Load the data
load data local inpath '/bigdata/install/hivedatas/hive_big_table/*' into table ori;
load data local inpath '/bigdata/install/hivedatas/hive_have_null_id/*' into table nullidtable;
-- Without filtering NULL ids:
INSERT OVERWRITE TABLE jointable SELECT a.* FROM nullidtable a JOIN ori b ON a.id = b.id;
-- Filtering NULL ids first:
INSERT OVERWRITE TABLE jointable SELECT a.* FROM (SELECT * FROM nullidtable WHERE id IS NOT NULL ) a JOIN ori b ON a.id = b.id;
-- Default is 256000000, i.e. 256M
set hive.exec.reducers.bytes.per.reducer=32123456;
set mapreduce.job.reduces=7;
INSERT OVERWRITE TABLE jointable SELECT a.* FROM nullidtable a LEFT JOIN ori b ON CASE WHEN a.id IS NULL THEN 'hive' ELSE a.id END = b.id;
No rows affected (119.142 seconds)
-- Result: every NULL id is replaced by the same string, which easily causes data skew (all those keys are identical, and rows with the same key go to the same reducer).
-- To avoid this, use Hive's rand function to assign a random value to each NULL id, so the rows are spread across reducers and the skew disappears.
set hive.exec.reducers.bytes.per.reducer=32123456;
set mapreduce.job.reduces=7;
INSERT OVERWRITE TABLE jointable SELECT a.* FROM nullidtable a LEFT JOIN ori b ON CASE WHEN a.id IS NULL THEN concat('hive', rand()) ELSE a.id END = b.id;
No rows affected (119.67 seconds)
-- Create the big table, the small table, and the table that will hold the join result
create table bigtable(id bigint, time_statmp bigint, uid string, keyword string, url_rank int, click_num int, click_url string) row format delimited fields terminated by '\t';
create table smalltable(id bigint, time_statmp bigint, uid string, keyword string, url_rank int, click_num int, click_url string) row format delimited fields terminated by '\t';
create table jointable2(id bigint, time_statmp bigint, uid string, keyword string, url_rank int, click_num int, click_url string) row format delimited fields terminated by '\t';
-- Load data into the big table and the small table
load data local inpath '/bigdata/install/hivedatas/big_data' into table bigtable;
load data local inpath '/bigdata/install/hivedatas/small_data' into table smalltable;
-- Enable the MapJoin parameter; default is true
set hive.auto.convert.join = true;
-- Threshold between big and small tables (tables under 25M are treated as small by default)
set hive.mapjoin.smalltable.filesize=26214400;
-- Enable MapJoin
-- Whether Hive may automatically optimize a common join: if one side of the join is below the small-table threshold, the join is converted into a map join; default is true
set hive.auto.convert.join = true;
-- Small table JOIN big table
INSERT OVERWRITE TABLE jointable2
SELECT b.id, b.time_statmp, b.uid, b.keyword, b.url_rank, b.click_num, b.click_url FROM smalltable s JOIN bigtable b ON s.id = b.id;
-- Big table JOIN small table
INSERT OVERWRITE TABLE jointable2
SELECT b.id, b.time_statmp, b.uid, b.keyword, b.url_rank, b.click_num, b.click_url FROM bigtable b JOIN smalltable s ON s.id = b.id;
-- Whether to aggregate on the map side; default is true
set hive.map.aggr = true;
-- Number of entries processed per map-side aggregation batch; default 100000
set hive.groupby.mapaggr.checkinterval = 100000;
-- Load-balance when the data is skewed (default is false)
set hive.groupby.skewindata = true;
-- When this option is true, the query plan contains two MR jobs. In the first job, map output is distributed to reducers at random, and each reducer performs a partial aggregation; rows with the same GROUP BY key may end up in different reducers, which balances the load. The second job then distributes the pre-aggregated results by GROUP BY key (so rows with the same key are guaranteed to reach the same reducer) and performs the final aggregation.
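A minimal sketch of applying these switches to a skewed aggregation (the user_clicks table and its columns are hypothetical, not part of the examples above):
-- assume user_clicks(uid string, clicks int), where a handful of uids dominate the data
set hive.map.aggr = true;
set hive.groupby.skewindata = true;
-- with skewindata=true the sum is computed in the two MR jobs described above
select uid, sum(clicks) as total_clicks from user_clicks group by uid;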
create table bigtable(id bigint, time_statmp bigint, uid string, keyword string, url_rank int, click_num int, click_url string) row format delimited fields terminated by '\t';
load data local inpath '/bigdata/install/hivedatas/data/100万条大表数据(id除以10取整)/bigtable' into table bigtable;
-- Amount of data processed by each reduce task; default 256000000 (256M)
set hive.exec.reducers.bytes.per.reducer=321234560000;
select count(distinct ip ) from log_text;
Error: Error while processing statement: FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.mr.MapRedTask (state=08S01,code=2)
-- Rewrite as
set hive.exec.reducers.bytes.per.reducer=32123456;
select count(ip) from (select ip from log_text group by ip) t;
-- This uses one extra job, but with large data volumes it is well worth it.
-- Example SQL
select A.col1, B.col2 from
(select count(*) as col1 from table1) as A,
(select count(*) as col2 from table2) as B;
-- Enable parallel execution; default false
set hive.exec.parallel=true;
-- Maximum degree of parallelism allowed for a single SQL statement; default 8
set hive.exec.parallel.thread.number=16;
-- Set non-strict mode (the default is nonstrict)
set hive.mapred.mode=nonstrict;
-- Set strict mode
set hive.mapred.mode=strict;
-- In strict mode this statement fails; in non-strict mode it is allowed
select * from score; -- score is a partitioned table
-- 异常信息:FAILED: SemanticException [Error 10056]: Queries against partitioned tables without a partition filter are disabled for safety reasons.
-- The correct form is:
select * from score where month='201806';
-- In strict mode this statement fails; in non-strict mode it is allowed
select * from score where month='201806' order by s_score;
-- FAILED: SemanticException 1:50 Order by-s without limit are disabled for safety reasons.
-- The correct form is:
select * from score where month='201806' order by s_score limit 3;
<property>
  <name>mapreduce.job.jvm.numtasks</name>
  <value>10</value>
  <description>How many tasks to run per jvm. If set to -1, there is no limit.
  </description>
</property>
set mapred.job.reuse.jvm.num.tasks=10;
This setting enables JVM reuse.
<property>
<name>mapreduce.map.speculative</name>
<value>true</value>
<description>If true, then multiple instances of some map tasks
may be executed in parallel.</description>
</property>
<property>
<name>mapreduce.reduce.speculative</name>
<value>true</value>
<description>If true, then multiple instances of some reduce tasks
may be executed in parallel.</description>
</property>
<property>
  <name>hive.mapred.reduce.tasks.speculative.execution</name>
  <value>true</value>
  <description>Whether speculative execution for reducers should be turned on.</description>
</property>
-- Set to true to enable intermediate data compression; default is false (disabled)
set hive.exec.compress.intermediate=true;
-- Enable compression of map output in MapReduce; default false
set mapreduce.map.output.compress=true;
-- Set the compression codec for the intermediate data
set mapred.map.output.compression.codec= org.apache.hadoop.io.compress.SnappyCodec;
-- Enable compression of the final job output; default false
set hive.exec.compress.output=true;
-- Set the compression codec for the final output
set mapred.output.compression.codec= org.apache.hadoop.io.compress.SnappyCodec;
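A minimal sketch of the switches in use (the compressed_result and source_table names are hypothetical): enable the settings above, then write a table, so both the intermediate map output and the final files are compressed.
-- assumed example tables: compressed_result and source_table
set hive.exec.compress.intermediate=true;
set mapreduce.map.output.compress=true;
set hive.exec.compress.output=true;
set mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
insert overwrite table compressed_result select * from source_table;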
explain select * from score where month='201806';
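The same command works for more involved statements; for instance, reusing the score table from the earlier examples:
-- inspect the stage plan of an aggregation query
explain select s_id, count(*) from score where month='201806' group by s_id;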
Is a larger number of map tasks always better?
Is it enough to make sure each map task processes a file close to one 128M block?
-- Maximum size (bytes) of a combined split
set mapred.max.split.size=112345600;
-- Minimum split size per node when combining files
set mapred.min.split.size.per.node=112345600;
-- Minimum split size per rack when combining files
set mapred.min.split.size.per.rack=112345600;
-- Combine small files into larger splits before the map stage
set hive.input.format= org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
mapreduce.input.fileinputformat.split.minsize defaults to 1.
mapreduce.input.fileinputformat.split.maxsize defaults to Long.MAX_VALUE, so by default the split size equals the block size.
maxsize (maximum split size): if set smaller than the block size, splits shrink to exactly this value.
minsize (minimum split size): if set larger than the block size, splits become larger than the block size.
-- Set maxsize to 10M, so each FileSplit is at most 10M
set mapreduce.input.fileinputformat.split.maxsize=10485760;
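As a rough illustration of the effect, assuming a single input file of about 128M and a 128M HDFS block size:
-- default (maxsize = Long.MAX_VALUE): split size = block size = 128M, so 1 map task
-- with maxsize = 10485760 (10M): split size = 10M, so roughly 128M / 10M ≈ 13 map tasks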
set hive.exec.reducers.bytes.per.reducer=256000000;
set hive.exec.reducers.max=1009;
Formula for the number of reducers: N = min(hive.exec.reducers.max, total input size / hive.exec.reducers.bytes.per.reducer)
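A rough worked example under the two defaults above (the input size is made up for illustration):
-- total input ≈ 512,000,000 bytes
-- N = min(1009, 512000000 / 256000000) = min(1009, 2) = 2 reducers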
Method 2 for adjusting the number of reducers
-- Set the number of reducers for each job directly
set mapreduce.job.reduces=3;