-- Functions
-- List every function the current system provides
SHOW FUNCTIONS;
-- Fuzzy-match function names against a pattern
SHOW FUNCTIONS LIKE 'x*y*';
-- Show the usage of one specific function (函数名称 is a placeholder for the function name)
DESCRIBE FUNCTION 函数名称;
-- Show the usage of one specific function, including examples
DESCRIBE FUNCTION EXTENDED 函数名称;
-- String functions (note: single and double quotes are interchangeable)
-- Number of characters in a string
SELECT
    length('abc'),
    length('你我他');
SELECT length(name) AS length_name
FROM employee;
-- Concatenate several strings
SELECT concat('ab', 'cd', 'ef');
-- Concatenate several strings with a separator (first argument)
SELECT concat_ws('-', 'ab', 'cd', 'ef');
-- Case conversion
SELECT
    name,
    lower(name),
    upper(name)
FROM employee;
-- Replacement (the second argument is a regular expression)
SELECT
    regexp_replace('aabbaacc', 'aa', ''),
    regexp_replace('aabbaacc', 'aa', 'X'),
    regexp_replace('aabbaacc', 'a', 'XX'),
    regexp_replace('12ab34cd', '\\d', '');
-- Splitting on a regular expression
SELECT split('aa bb\tcc', '\\s');
-- Substring by start position and length
SELECT substr('abcdefg', 2, 3);
-- Strip surrounding whitespace; the two lengths show the difference
SELECT
    length(' abc '),
    length(trim(' abc '));
-- Math functions
-- Arithmetic operators; note that division yields a floating-point result
SELECT 10 + 5, 10 - 5, 10 * 5, 10 / 5, 10 % 3;
-- Rounding: no scale argument, scale 0, and scale 3
SELECT
    round(3.1415926),
    round(3.1415926, 0),
    round(3.1415926, 3);
-- Round up
SELECT ceil(3.14), ceil(3);
-- Round down
SELECT floor(3.14), floor(3);
-- Random numbers; note: returns a pure fraction from 0 up to, but not including, 1
SELECT rand(), rand(), rand();
-- Same function called with an explicit seed argument
SELECT rand(1), rand(1), rand(2);
-- Random integer from 50 to 70 inclusive: floor(low + (high - low + 1) * rand()).
-- The "+ 1" is needed because rand() never reaches 1; without it the
-- result would stop at 69 and 70 could never be produced.
SELECT floor(50 + (70 - 50 + 1) * rand());
-- Conversion functions
-- Explicit casts: string -> int, int -> string, string -> double
SELECT
    cast("123" AS int),
    cast(123 AS string),
    cast('123' AS double);
-- MD5 of a string
SELECT md5("123");
-- JSON: a lightweight notation for describing objects
--   {} denotes one object and wraps all of its attributes: {"attr1":value1,"attr2":value2,...}
--   [] denotes a collection and wraps all of its objects:  [{},{},...]
-- Example - a student object:
--   student id: 1001
--   name:       tom
--   sex:        male
--   age:        22
--   hobbies:    reading,game,internet
--   {"id":"1001","name":"tom","sex":"male","age":22,"hobbies":["reading","game","internet"]}
-- Extract a whole attribute with a '$.attr' path
SELECT get_json_object('{"id":"1001","name":"tom","sex":"male","age":22,"hobbies":["reading","game","internet"]}','$.hobbies');
-- Same document written as a double-quoted literal (inner quotes escaped); index into the array
SELECT get_json_object("{\"id\":\"1001\",\"name\":\"tom\",\"sex\":\"male\",\"age\":22,\"hobbies\":[\"reading\",\"game\",\"internet\"]}",'$.hobbies[1]');
-- Top-level array: select an element by index, then an attribute, then an array item
SELECT get_json_object(
'
[
{"id":"1001","name":"tom","sex":"male","age":22,"hobbies":["reading","game","internet"]},
{"id":"1002","name":"jack","sex":"male","age":23,"hobbies":["ball","game","sing"]},
{"id":"1003","name":"rose","sex":"female","age":20,"hobbies":["dance","draw","internet"]}
]
','$[1].hobbies[2]'
);
-- json_tuple pulls several top-level keys out of one document in a single call
SELECT json_tuple('{"id":"1001","name":"tom","sex":"male","age":22,"hobbies":["reading","game","internet"]}','id','name','sex','hobbies');
-- The same fields via repeated get_json_object calls
SELECT
    get_json_object('{"id":"1001","name":"tom","sex":"male","age":22,"hobbies":["reading","game","internet"]}','$.id'),
    get_json_object('{"id":"1001","name":"tom","sex":"male","age":22,"hobbies":["reading","game","internet"]}','$.name'),
    get_json_object('{"id":"1001","name":"tom","sex":"male","age":22,"hobbies":["reading","game","internet"]}','$.sex'),
    get_json_object('{"id":"1001","name":"tom","sex":"male","age":22,"hobbies":["reading","game","internet"]}','$.hobbies[1]');
-- Keep the document in a Hive variable so it is not repeated in every call.
-- The stored value carries its own single quotes, which is why ${json} is used unquoted below.
SET hivevar:json='{"id":"1001","name":"tom","sex":"male","age":22,"hobbies":["reading","game","internet"]}';
SELECT
    get_json_object(${json},'$.id') AS id,
    get_json_object(${json},'$.name') AS name,
    get_json_object(${json},'$.sex') AS sex,
    get_json_object(${json},'$.hobbies[1]') AS hobby;
-- Date and time functions
-- Current date and current timestamp
SELECT CURRENT_DATE(), CURRENT_TIMESTAMP();
-- Unix timestamps: no argument, a default-format string, and a string with an explicit pattern
SELECT
    unix_timestamp(),
    unix_timestamp('2023-12-25 15:37:00'),
    unix_timestamp('2023/12/25-15-37-00','yyyy/MM/dd-HH-mm-ss');
-- Unix timestamp back to text, with the default and with a custom output pattern
SELECT
    from_unixtime(1703489820),
    from_unixtime(1703489820,'yyyy年MM月dd日 HH点mm分ss秒');
-- Date part of a timestamp string
SELECT to_date('2023-12-25 15:37:00');
-- Individual fields of a timestamp string
SELECT
    year('2023-12-25 15:37:00'),
    month('2023-12-25 15:37:00'),
    day('2023-12-25 15:37:00'),
    hour('2023-12-25 15:37:00'),
    minute('2023-12-25 15:37:00'),
    second('2023-12-25 15:37:00'),
    dayofweek('2023-12-25 15:37:00');
-- Re-format a timestamp string with a pattern
SELECT date_format('2023-12-25 15:37:00','HH:mm:ss');
-- Difference between two dates in days
SELECT datediff('2023-12-25','2023-12-20');
-- Difference between two dates in months
SELECT months_between('2023-12-25','2023-10-20');
-- Add / subtract days; both functions are also shown with a negative day count
SELECT
    date_add('2023-12-25',5),
    date_add('2023-12-25',-5),
    date_sub('2023-12-25',5),
    date_sub('2023-12-25',-5);
-- Mapping table from the original notes (apparently month -> quarter - TODO confirm):
--    1 -> 1     2 -> 1     3 -> 1
--    4 -> 2     5 -> 2     6 -> 2
--    7 -> 3     8 -> 3     9 -> 3
--   10 -> 4    11 -> 4    12 -> 4
-- Every row of the table satisfies right = ceil(left / 3), which the query below demonstrates
SELECT ceil(3/3), ceil(6/3), ceil(9/3), ceil(12/3);
-- Cast a timestamp string to a date
SELECT cast('2023-12-27 10:02:30' AS date);
-- Collection functions
-- Build an array from literals
SELECT array(1, 2, 3, 'a', 'b', 'c');
-- Number of elements in an array
SELECT size(array(1, 2, 3, 'a', 'b', 'c'));
-- Membership test
SELECT array_contains(array(1, 2, 3, 'a', 'b', 'c'), '2');
-- Sort an array
SELECT sort_array(array(2, 4, 5, 3, 6, 1));
-- Build a map from alternating key/value arguments
SELECT map('a', 1, 'b', 2, 'c', 3);
-- Parse text into a map using the default delimiters
SELECT str_to_map('a:1,b:2,c:3');
-- Parse text into a map with explicit pair and key/value delimiters
SELECT str_to_map('a-1.b-2.c-3', '[.]', '-');
-- Number of entries in a map
SELECT size(str_to_map('a-1.b-2.c-3', '[.]', '-'));
-- Struct built from positional values
SELECT struct('1001', 'tom', 22, 'male');
-- Struct built from alternating field-name/value arguments
SELECT named_struct('id', '1001', 'name', 'tom', 'age', 22, 'sex', 'male');
-- Conditional functions
-- NULL tests on a column
SELECT
    gid,
    isnull(gid),
    isnotnull(gid)
FROM student;
-- nvl(value, fallback), shown with a NULL and a non-NULL first argument
SELECT
    nvl(null, 0),
    nvl(1, 0);
SELECT
    gid,
    nvl(gid, 0)
FROM student;
-- coalesce, shown with leading NULL arguments and with a non-NULL first argument
SELECT
    coalesce(null, null, 1, 2),
    coalesce(0, null, 1, 2);
-- IF(condition, expression_when_true, expression_when_false)
SELECT id, name, sex, mark, IF(mark >= 60, '及格', '不及格') AS `等级`
FROM score;
-- Grade bands implemented by the nested IF below:
--   >= 90  优
--   >= 80  良
--   >= 70  中
--   >= 60  及格
--   <  60  不及格
-- A NULL mark is tested first and reported as 缺考.
SELECT
    id,
    name,
    sex,
    mark,
    IF(isnull(mark), '缺考',
        IF(mark >= 90, '优',
            IF(mark >= 80, '良',
                IF(mark >= 70, '中',
                    IF(mark >= 60, '及格', '不及格')))))
FROM score;
-- Simple CASE: the value of isnull(mark) is compared against each WHEN
SELECT
    score.*,
    CASE isnull(mark)
        WHEN true THEN '缺考'
        ELSE '不缺考'
    END AS `等级`
FROM score;
-- Searched CASE: the first WHEN whose condition holds wins.
-- There is no explicit isnull(mark) branch: a NULL mark satisfies none of
-- the comparisons below, so it falls through to the ELSE ('缺考').
SELECT
    score.*,
    CASE
        WHEN mark >= 90 THEN '优秀'
        WHEN mark >= 80 THEN '良好'
        WHEN mark >= 70 THEN '中等'
        WHEN mark >= 60 THEN '及格'
        WHEN mark < 60 THEN '不及格'
        ELSE '缺考'
    END AS `等级`
FROM score;
-- Aggregate functions
-- Counting: one column, distinct values of a column, and all rows (two spellings)
SELECT
    count(mark),
    count(DISTINCT mark),
    count(*),
    count(1)
FROM score;
-- Sum, distinct sum, average (built-in and computed by hand), maximum, minimum
SELECT
    sum(mark),
    sum(DISTINCT mark),
    avg(mark),
    sum(mark) / count(mark) AS avg2,
    max(mark),
    min(mark)
FROM score;
-- Table-generating functions
-- explode over an array
SELECT explode(array('aa', 'bb', 'cc'));
-- explode over a map
SELECT explode(map('aa', 11, 'bb', 22, 'cc', 33));
-- inline over an array of structs
SELECT inline(array(named_struct('id', '1001', 'name', 'tom', 'age', 22)));
-- Demo table comparing how FLOAT, DOUBLE and DECIMAL store the same values.
-- DECIMAL(3,2): 3 significant digits, 2 of them after the decimal point; values
-- are rounded, only 1 integer digit fits, and anything larger is stored as NULL.
CREATE TABLE t12
(
    c1 FLOAT,
    c2 DOUBLE,
    c3 DECIMAL(3,2)
);
-- Explicit column lists keep the inserts correct if the column order ever changes
INSERT INTO t12 (c1, c2, c3) VALUES (12.1234567890, 12.1234567890, 12.1234567890);
INSERT INTO t12 (c1, c2, c3) VALUES (0, 0, 1.23), (0, 0, 12.3), (0, 0, 123), (0, 0, 0.123);
-- 自定义函数(UDF)的开发步骤:
-- 1、继承父类 GenericUDF,并重写 initialize、evaluate、getDisplayString 三个方法
/**
 * Hive generic UDF that converts its single argument to upper case.
 *
 * The argument is converted with {@code toString()} before upper-casing,
 * and the declared return type is a Java String.
 */
public class MyUpper extends GenericUDF {

    /**
     * Checks the call signature and declares the return type.
     *
     * @param arguments inspectors for the arguments of the call
     * @return the inspector describing the result (a Java String)
     * @throws UDFArgumentException if the call does not pass exactly one argument
     */
    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        if (arguments.length != 1) {
            throw new UDFArgumentException("参数必须只能有一个");
        }
        return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
    }

    /**
     * Computes the upper-cased value for one row.
     *
     * @param arguments deferred argument values; only the first one is read
     * @return the upper-cased text, or null when the argument is null
     * @throws HiveException if the deferred value cannot be obtained
     */
    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        Object value = arguments[0].get();
        // A null argument must yield null; calling toString() on it
        // would fail with a NullPointerException instead.
        if (value == null) {
            return null;
        }
        return value.toString().toUpperCase();
    }

    /**
     * Returns the fixed text used to display this function.
     */
    @Override
    public String getDisplayString(String[] children) {
        return "This is UDF of MyUpper";
    }
}
-- 2、编译并打包为 jar 包
-- 3、将 jar 包上传到 linux 本地或 hdfs
-- 4、加载 jar 包路径【在 hive 中执行】(以下为语法模板,路径二选一)
add jar linux_path | hdfs_path;
list jar;
delete jar linux_path | hdfs_path;
-- 5、注册函数(以下为语法模板,temporary 为可选项,表示注册为临时函数)
create [temporary] function 函数名 as '函数类的全路径';
-- 6、使用自定义函数(调用方式与内置函数相同,见下方示例)
-- Examples:
-- Temporary function: add the jar from the local file system, register, then call
ADD JAR /root/jars/prj-1.0-SNAPSHOT.jar;
CREATE TEMPORARY FUNCTION myupper01 AS 'com.kgc.functions.MyUpper';
SELECT myupper01('abc');
-- Permanent function: copy the jar to HDFS and register the function against that jar
dfs -mkdir /jars;
dfs -put /root/jars/prj-1.0-SNAPSHOT.jar /jars;
CREATE FUNCTION myupper03 AS 'com.kgc.functions.MyUpper'
    USING JAR 'hdfs://hadoop101:9000/jars/prj-1.0-SNAPSHOT.jar';
SELECT myupper03('def');
-- Remove a function (函数名 is a placeholder for the function name)
DROP FUNCTION 函数名;
-- Performance tuning: execution plans
-- Plan for a grouped aggregation
EXPLAIN
SELECT
    sex,
    avg(mark)
FROM score
GROUP BY sex;
-- Plan for a plain full-table select
EXPLAIN
SELECT *
FROM student;
-- Plan for the same select wrapped in a subquery
EXPLAIN
SELECT *
FROM (
    SELECT *
    FROM student
) t;
-- Each SET below names a property without assigning a value.
-- The value(s) recorded next to each property in the original notes are kept in the comments.

-- Fetch task conversion
-- Values noted: more, none, minimal
SET hive.fetch.task.conversion;

-- Local mode
-- Value noted: false
SET hive.exec.mode.local.auto;
-- Value noted: 134217728
SET hive.exec.mode.local.auto.inputbytes.max;
-- Value noted: 4
SET hive.exec.mode.local.auto.input.files.max;

-- Map join of a small table with a large table
-- Value noted: true
SET hive.auto.convert.join;
-- Value noted: 25000000
SET hive.mapjoin.smalltable.filesize;
-- Use DISTINCT sparingly; prefer GROUP BY instead
-- Plan for the DISTINCT form
EXPLAIN
SELECT count(DISTINCT mark)
FROM score;
-- Plan for the GROUP BY form
EXPLAIN
SELECT count(mark)
FROM (
    SELECT mark
    FROM score
    GROUP BY mark
) t;
-- Do not produce Cartesian products
-- NOTE(review): this join has no ON condition, so it pairs every student row
-- with every grade row; it appears to be shown as the pattern to avoid - confirm.
SELECT *
FROM student s
INNER JOIN grade g;