-- Hive 函数及性能优化

-- Functions
-- List every function available in the current session
SHOW FUNCTIONS;
-- List only the functions whose names match a pattern
SHOW FUNCTIONS LIKE 'x*y*';
-- Show how a specific function is used (函数名称 is a placeholder for the function name)
DESCRIBE FUNCTION 函数名称;
-- Show how a specific function is used, including examples
DESCRIBE FUNCTION EXTENDED 函数名称;

-- String functions (note: single and double quotes are interchangeable)
-- Number of characters in a string
SELECT
    LENGTH('abc'),
    LENGTH('你我他');
SELECT LENGTH(name) AS length_name
FROM employee;
-- Concatenate several strings
SELECT CONCAT('ab', 'cd', 'ef');
-- Concatenate several strings with a separator (first argument)
SELECT CONCAT_WS('-', 'ab', 'cd', 'ef');
-- Case conversion
SELECT
    name,
    LOWER(name),
    UPPER(name)
FROM employee;
-- Regex-based replacement
SELECT
    REGEXP_REPLACE('aabbaacc', 'aa', ''),
    REGEXP_REPLACE('aabbaacc', 'aa', 'X'),
    REGEXP_REPLACE('aabbaacc', 'a', 'XX'),
    REGEXP_REPLACE('12ab34cd', '\\d', '');
-- Split on a regex delimiter
SELECT SPLIT('aa bb\tcc', '\\s');
-- Substring by start position and length
SELECT SUBSTR('abcdefg', 2, 3);
-- Strip surrounding whitespace
SELECT
    LENGTH('   abc   '),
    LENGTH(TRIM('   abc   '));

-- Math functions
-- Arithmetic operators; note that division yields a floating-point result
SELECT
    10 + 5,
    10 - 5,
    10 * 5,
    10 / 5,
    10 % 3;
-- Rounding to the nearest value, optionally to a number of decimal places
SELECT
    ROUND(3.1415926),
    ROUND(3.1415926, 0),
    ROUND(3.1415926, 3);
-- Round up to an integer
SELECT
    CEIL(3.14),
    CEIL(3);
-- Round down to an integer
SELECT
    FLOOR(3.14),
    FLOOR(3);
-- Random numbers; note: each value is a fraction from 0 (inclusive) to 1 (exclusive)
SELECT
    RAND(),
    RAND(),
    RAND();
-- RAND called with an explicit seed argument
SELECT
    RAND(1),
    RAND(1),
    RAND(2);
-- Random integer between 50 and 70, both ends included:
--   FLOOR(low + (high - low + 1) * RAND())
-- RAND() never reaches 1, so without the "+ 1" the upper bound 70 could never be produced.
SELECT FLOOR(50 + (70 - 50 + 1) * RAND());

-- Conversion functions
-- Explicit casts between string and numeric types
SELECT
    CAST("123" AS INT),
    CAST(123 AS STRING),
    CAST('123' AS DOUBLE);
-- MD5 digest of a string
SELECT MD5("123");

-- json:JavaScript Object Notation,一种轻量级的数据交换格式
-- {}表示一个对象,封装了该对象的所有属性:{"attr1":value1,"attr2":value2,...}
-- []表示一个数组(集合),封装了该集合的所有元素:[{},{},...]
示例:
学生对象:
学号:1001
姓名:tom
性别:male
年龄:22
爱好:reading,game,internet
{"id":"1001","name":"tom","sex":"male","age":22,"hobbies":["reading","game","internet"]}
-- Read the "hobbies" array out of a JSON object
SELECT get_json_object(
    '{"id":"1001","name":"tom","sex":"male","age":22,"hobbies":["reading","game","internet"]}',
    '$.hobbies'
);
-- Same document written as a double-quoted literal with escaped quotes; read one array element
SELECT get_json_object(
    "{\"id\":\"1001\",\"name\":\"tom\",\"sex\":\"male\",\"age\":22,\"hobbies\":[\"reading\",\"game\",\"internet\"]}",
    '$.hobbies[1]'
);

-- The document is a JSON array of objects: index into the array, then into a nested array.
-- The multi-line literal is kept exactly as written because its line breaks are part of the value.
SELECT get_json_object(
'
[
{"id":"1001","name":"tom","sex":"male","age":22,"hobbies":["reading","game","internet"]},
{"id":"1002","name":"jack","sex":"male","age":23,"hobbies":["ball","game","sing"]},
{"id":"1003","name":"rose","sex":"female","age":20,"hobbies":["dance","draw","internet"]}
]
',
'$[1].hobbies[2]'
);

-- Extract several top-level keys in a single call
SELECT json_tuple(
    '{"id":"1001","name":"tom","sex":"male","age":22,"hobbies":["reading","game","internet"]}',
    'id',
    'name',
    'sex',
    'hobbies'
);

-- Extract several values with one get_json_object call per value
SELECT
    get_json_object('{"id":"1001","name":"tom","sex":"male","age":22,"hobbies":["reading","game","internet"]}', '$.id'),
    get_json_object('{"id":"1001","name":"tom","sex":"male","age":22,"hobbies":["reading","game","internet"]}', '$.name'),
    get_json_object('{"id":"1001","name":"tom","sex":"male","age":22,"hobbies":["reading","game","internet"]}', '$.sex'),
    get_json_object('{"id":"1001","name":"tom","sex":"male","age":22,"hobbies":["reading","game","internet"]}', '$.hobbies[1]');

-- Keep the document in a Hive variable; the stored value includes the surrounding quotes,
-- so ${json} is substituted as a complete string literal.
SET hivevar:json='{"id":"1001","name":"tom","sex":"male","age":22,"hobbies":["reading","game","internet"]}';
SELECT
    get_json_object(${json}, '$.id')         AS id,
    get_json_object(${json}, '$.name')       AS name,
    get_json_object(${json}, '$.sex')        AS sex,
    get_json_object(${json}, '$.hobbies[1]') AS hobby;

-- Date and time functions
-- Current date and current timestamp
SELECT
    CURRENT_DATE(),
    CURRENT_TIMESTAMP();

-- Convert to a Unix timestamp: now, a default-format string, and a string with a custom pattern
SELECT
    UNIX_TIMESTAMP(),
    UNIX_TIMESTAMP('2023-12-25 15:37:00'),
    UNIX_TIMESTAMP('2023/12/25-15-37-00', 'yyyy/MM/dd-HH-mm-ss');

-- Convert a Unix timestamp back to text, with the default and with a custom pattern
SELECT
    FROM_UNIXTIME(1703489820),
    FROM_UNIXTIME(1703489820, 'yyyy年MM月dd日 HH点mm分ss秒');

-- Date part of a timestamp string
SELECT TO_DATE('2023-12-25 15:37:00');

-- Individual fields of a timestamp string
SELECT
    YEAR('2023-12-25 15:37:00'),
    MONTH('2023-12-25 15:37:00'),
    DAY('2023-12-25 15:37:00'),
    HOUR('2023-12-25 15:37:00'),
    MINUTE('2023-12-25 15:37:00'),
    SECOND('2023-12-25 15:37:00'),
    DAYOFWEEK('2023-12-25 15:37:00');

-- Format a timestamp string with a pattern
SELECT DATE_FORMAT('2023-12-25 15:37:00', 'HH:mm:ss');

-- Difference between two dates, in days and in months
SELECT DATEDIFF('2023-12-25', '2023-12-20');
SELECT MONTHS_BETWEEN('2023-12-25', '2023-10-20');

-- Shift a date by a number of days; negative offsets are passed to both functions as well
SELECT
    DATE_ADD('2023-12-25', 5),
    DATE_ADD('2023-12-25', -5),
    DATE_SUB('2023-12-25', 5),
    DATE_SUB('2023-12-25', -5);

-- 月份与季度的对应关系(月份------季度),即:季度 = ceil(月份/3)
-- 1------1
-- 2------1
-- 3------1
-- 4------2
-- 5------2
-- 6------2
-- 7------3
-- 8------3
-- 9------3
-- 10-----4
-- 11-----4
-- 12-----4

-- CEIL(n / 3) for n = 3, 6, 9, 12
-- NOTE(review): presumably n is a month number and the result its quarter — confirm against the notes around this query
SELECT
    CEIL(3 / 3),
    CEIL(6 / 3),
    CEIL(9 / 3),
    CEIL(12 / 3);

-- Cast a timestamp string to DATE
SELECT CAST('2023-12-27 10:02:30' AS DATE);

-- Collection functions
-- Build an array and inspect it
SELECT ARRAY(1, 2, 3, 'a', 'b', 'c');
SELECT SIZE(ARRAY(1, 2, 3, 'a', 'b', 'c'));
SELECT ARRAY_CONTAINS(ARRAY(1, 2, 3, 'a', 'b', 'c'), '2');
SELECT SORT_ARRAY(ARRAY(2, 4, 5, 3, 6, 1));
-- Build a map from alternating keys and values
SELECT MAP('a', 1, 'b', 2, 'c', 3);
-- Parse text into a map: with no delimiter arguments, then with explicit pair and key/value delimiters
SELECT STR_TO_MAP('a:1,b:2,c:3');
SELECT STR_TO_MAP('a-1.b-2.c-3', '[.]', '-');
SELECT SIZE(STR_TO_MAP('a-1.b-2.c-3', '[.]', '-'));
-- Build a struct, without and with field names
SELECT STRUCT('1001', 'tom', 22, 'male');
SELECT NAMED_STRUCT('id', '1001', 'name', 'tom', 'age', 22, 'sex', 'male');

-- Conditional functions
-- NULL tests on a column
SELECT
    gid,
    ISNULL(gid),
    ISNOTNULL(gid)
FROM student;
-- NVL(value, fallback): substitute a fallback for NULL
SELECT
    NVL(NULL, 0),
    NVL(1, 0);
SELECT
    gid,
    NVL(gid, 0)
FROM student;
-- COALESCE: first non-NULL argument
SELECT
    COALESCE(NULL, NULL, 1, 2),
    COALESCE(0, NULL, 1, 2);
-- IF(condition, expression used when true, expression used when false)
SELECT
    id,
    name,
    sex,
    mark,
    IF(mark >= 60, '及格', '不及格') AS `等级`
FROM score;
-- 成绩等级划分标准:
-- >=90 优
-- >=80 良
-- >=70 中
-- >=60 及格
-- <60  不及格
-- Grade label via nested IF: a NULL mark is labelled first, then thresholds are tested from high to low
SELECT
    id,
    name,
    sex,
    mark,
    IF(
        ISNULL(mark),
        '缺考',
        IF(
            mark >= 90,
            '优',
            IF(
                mark >= 80,
                '良',
                IF(
                    mark >= 70,
                    '中',
                    IF(mark >= 60, '及格', '不及格')
                )
            )
        )
    )
FROM score;

-- Simple CASE: compare the result of ISNULL(mark) against TRUE
SELECT
    score.*,
    CASE ISNULL(mark)
        WHEN TRUE THEN '缺考'
        ELSE '不缺考'
    END AS `等级`
FROM score;

-- Searched CASE: the first matching branch wins.
-- A NULL mark satisfies none of the comparisons, so it falls through to ELSE;
-- a leading "WHEN isnull(mark) THEN '缺考'" branch would be an equivalent way to handle it.
SELECT
    score.*,
    CASE
        WHEN mark >= 90 THEN '优秀'
        WHEN mark >= 80 THEN '良好'
        WHEN mark >= 70 THEN '中等'
        WHEN mark >= 60 THEN '及格'
        WHEN mark < 60 THEN '不及格'
        ELSE '缺考'
    END AS `等级`
FROM score;

-- Aggregate functions
-- Counting: a column, its distinct values, and rows
SELECT
    COUNT(mark),
    COUNT(DISTINCT mark),
    COUNT(*),
    COUNT(1)
FROM score;
-- Sum, average (built-in and computed as SUM / COUNT), maximum and minimum
SELECT
    SUM(mark),
    SUM(DISTINCT mark),
    AVG(mark),
    SUM(mark) / COUNT(mark) AS avg2,
    MAX(mark),
    MIN(mark)
FROM score;

-- Table-generating functions
-- Expand an array
SELECT EXPLODE(ARRAY('aa', 'bb', 'cc'));
-- Expand a map
SELECT EXPLODE(MAP('aa', 11, 'bb', 22, 'cc', 33));
-- Expand an array of structs
SELECT INLINE(
    ARRAY(
        NAMED_STRUCT('id', '1001', 'name', 'tom', 'age', 22)
    )
);

-- Demo table comparing FLOAT, DOUBLE and DECIMAL storage.
-- Original author's note on c3 DECIMAL(3,2): 3 significant digits with 2 of them after the
-- decimal point; values are rounded; only one integer digit fits, otherwise the value is NULL.
CREATE TABLE t12 (
    c1 FLOAT,
    c2 DOUBLE,
    c3 DECIMAL(3, 2)
);
-- The same literal stored in all three column types
INSERT INTO t12 VALUES (12.1234567890, 12.1234567890, 12.1234567890);
-- c3 probed with different numbers of integer and fractional digits
INSERT INTO t12 VALUES
    (0, 0, 1.23),
    (0, 0, 12.3),
    (0, 0, 123),
    (0, 0, 0.123);

-- 自定义函数步骤:
1、继承指定的父类GenericUDF并重写方法
public class MyUpper extends GenericUDF {
    //完成初始化工作
    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        int length = arguments.length;
        if(length!=1){
            throw new UDFArgumentException("参数必须只能有一个");
        }
        ObjectInspector objectInspector =
                PrimitiveObjectInspectorFactory.javaStringObjectInspector;
        return objectInspector;
    }

    //完成计算过程
    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        return arguments[0].get().toString().toUpperCase();
    }

    @Override
    public String getDisplayString(String[] children) {
        return "This is UDF of MyUpper";
    }
}
2、编译打包
3、上传到linux或hdfs
4、加载jar包路径【在hive中执行】
add jar linux_path | hdfs_path;
list jar;
delete jar linux_path | hdfs_path;
5、注册函数
create [temporary] function 函数名 as '函数类的全路径';
6、使用自定义函数

-- Example
-- Temporary function: add the jar from the local file system, register the class, call it
ADD JAR /root/jars/prj-1.0-SNAPSHOT.jar;
CREATE TEMPORARY FUNCTION myupper01 AS 'com.kgc.functions.MyUpper';
SELECT myupper01('abc');
-- Permanent function: upload the jar to HDFS and register the class with a reference to that jar
dfs -mkdir /jars;
dfs -put /root/jars/prj-1.0-SNAPSHOT.jar /jars;
CREATE FUNCTION myupper03
    AS 'com.kgc.functions.MyUpper'
    USING JAR 'hdfs://hadoop101:9000/jars/prj-1.0-SNAPSHOT.jar';
SELECT myupper03('def');

-- Remove a function (函数名 is a placeholder for the function name)
DROP FUNCTION 函数名;


-- Performance tuning: execution plans
-- Plan of a grouped aggregation
EXPLAIN
SELECT
    sex,
    AVG(mark)
FROM score
GROUP BY sex;

-- Plan of a plain full-table select
EXPLAIN
SELECT *
FROM student;

-- Plan of the same select wrapped in a subquery
EXPLAIN
SELECT *
FROM (
    SELECT *
    FROM student
) t;

-- Each statement below names a property without assigning it, so it only displays the current value.
-- The bracketed notes from the original text are kept as comment lines so the script parses.

-- Fetch task conversion
-- Noted values: more, none, minimal
SET hive.fetch.task.conversion;

-- Local mode
-- Noted value: false
SET hive.exec.mode.local.auto;
-- Noted value: 134217728
SET hive.exec.mode.local.auto.inputbytes.max;
-- Noted value: 4
SET hive.exec.mode.local.auto.input.files.max;

-- Map join of a small table with a large table
-- Noted value: true
SET hive.auto.convert.join;
-- Noted value: 25000000
SET hive.mapjoin.smalltable.filesize;

-- Prefer GROUP BY over DISTINCT where possible
-- Plan of the DISTINCT form
EXPLAIN
SELECT COUNT(DISTINCT mark)
FROM score;

-- Plan of the GROUP BY form: de-duplicate in a subquery, then count its rows
EXPLAIN
SELECT COUNT(mark)
FROM (
    SELECT mark
    FROM score
    GROUP BY mark
) t;

-- Avoid Cartesian products
-- Anti-pattern shown on purpose: the join has no ON condition, so every student row is paired with every grade row
SELECT *
FROM student AS s
INNER JOIN grade AS g;

你可能感兴趣的:(hive,hadoop,数据仓库)