show functions;
desc functions extended +;
# 大小比较
select 1>1;
select 1=1;
select 1>=1;
# 空值判断
select 'asd' is null;
select 'asd' is not null;
# 模糊查询
select 'itcast' like 'it____'; # _代表匹配一个字符
select 'itcast' like 'it%'; # _代表匹配任意个数字符
# 正则查询
select 'itcast' rlike 'it.';
select 'itcast' regexp 'it.';
# 加减乘除
select 1+1;
select 1/2;
select 1/0;
# 取整取余
select 3 div 2;
select 4 % 2;
# 位运算
0 0000
1 0001
2 0010
3 0011
4 0100
5 0101
6 0110
select 4 & 6; 0100 * 0110 0100
select 4 | 6 0100 | 0110 只要有一个位数是1 该位置上的数据就是1
# 与或非
select 1>1 and 1=1; and 所有条件成立
select 1>1 or 1=1; or 一个条件成则返回true
select !1>1; 取反
# 在范围内
select 1 in (2,3);
select 1 not in (2,3);
-- 字符串长度函数
select length('asd');
-- 字符串反转函数
select reverse('asd');
-- 字符串连接函数
select concat('asd','zxc');
-- 分隔符字符串连接函数
select concat_ws('-','asd','zxc','aaa')
-- 字符串截取函数
select substr('itcast',2,2); -- 第一个截取字符的位置 第二个截取字符的个数
select substr('itcast',-2)
-- 字符串转大写函数
select upper('asd');
select lower('ASD');
-- 去空格函数
select trim(' aaa ');
select ltrim(' aaaa ');
select rtrim(' aaaa ');
-- 正则表达式替换函数
select regexp_replace('asd123','(\\w+)\d','aaa')
-- URL解析函数
select parse_url('http://www.baidu.com','HOST');
-- 割字符串函数
select split('asd,aaa',',')
-- 获取当前日期
select `current_date`();
-- 获取当前时间戳
select `current_timestamp`();
-- UNIX时间戳转日期函数
select from_unixtime(1634528834);
-- 获取当前UNIX时间戳函数
select unix_timestamp();
-- 日期转UNIX时间戳函数
select unix_timestamp('2021-10-10 10:10:10');
-- 抽取日期函数
select to_date('2021-10-10 10:10:10');
-- 日期转年函数
select year('2021-10-10 10:10:10');
-- 日期比较函数
select datediff('2021-10-10 10:10:10','2021-10-12 10:10:10');
-- 日期增加函数
select date_add('2021-10-10 10:10:10',5);
select date_sub('2021-10-10 10:10:10',5);
-- 取整函数
select round(3.1415);
-- 指定精度取整函数
select round(3.1415,2);
-- 向下取整函数
select `floor`(-1.1);
-- 向上取整函数
select ceil(1.1);
-- 进制转换函数
select conv()
-- 取随机数函数
select round(rand(3)*100);
-- 绝对值函数
select abs(-10)
-- 集合元素数
select size(`array`('a','n','b'));
-- 取map集合keys函数
select map_keys(`map`('name','python','age',19));
-- 取map集合values函数
select map_values(`map`('name','python','age',19));
-- 判断数组是否包含指定元素
select array_contains(`array`(1,2,3),4);
select 1 in (1,2,3);
-- 数组排序函数
select sort_array(`array`(1,5,3,2));
select sort_array(`array`('g','a','f','c'));
-- if条件判断
if(boolean testCondition, T valueTrue, T valueFalseOrNull)
-- 空判断函数
isnull( a )
-- 非空判断函数
isnotnull ( a )
-- 空值转换函数
nvl(T value, T default_value)
-- 非空查找函数
select coalesce(null,null,2);
-- 条件转换函数
CASE a WHEN b THEN c [WHEN d THEN e]* [ELSE f] END
select case 100 when 50 then 'tom' when 100 then 'mary' else 'tim' end;
select case sex when '男' then 'man' else 'women' end from student limit 3;
--nullif( a, b ):
-- 果a = b,则返回NULL;否则返回NULL。否则返回一个
select nullif(11,11);
select nullif(11,12);
--assert_true(condition)
--如果'condition'不为真,则引发异常,否则返回null
SELECT assert_true(11 >= 0);
SELECT assert_true(-1 >= 0);
select cast('12' as int);
select mask('asd123ZXC');
select mask('asd123ZXC','*','@','#');
-- 对前n个进行脱敏替换
select mask_first_n('asd123',3);
select mask_last_n('itcastpython',3);
-- 除了前n个字符,其余进行掩码处理
select mask_show_first_n('asdzxc',2);
select mask_show_last_n('qwertyu',2);
-- 返回字符串的hash编码。xxxxxxxxxx select mask('asd123ZXC');select mask('asd123ZXC','*','@','#');--对前n个进行脱敏替换select mask_first_n('asd123',3);select mask_last_n('itcastpython',3);--除了前n个字符,其余进行掩码处理select mask_show_first_n('asdzxc',2);select mask_show_last_n('qwertyu',2);--返回字符串的hash编码。-- 脱敏--对前n个进行脱敏替换--除了前n个字符,其余进行掩码处理--返回字符串的hash编码。
-- 反射函数
select reflect("java.lang.Math","max",11,22);
-- 取哈希值函数
select hash("allen");
-- SHA-1加密
select sha1("allen");
--SHA-2 家族算法加密
select sha2("allen",224);
select sha2("allen",512);
-- MD5加密
select md5("allen");
--crc32加密:
select crc32("allen");
UDF 普通函数 输入一行输出一行
UDAF 聚合函数 输入多行输出一行
UDTF 输入一行输出多行
explode(UDTF
-- NBA总冠军球队
create table the_nba_championship(
team_name string,
champion_year array
) row format delimited
fields terminated by ','
collection items terminated by '|';
-- 使用explode函数对champion_year进行拆分
lateral View
行列转换 concat
列比较
select count(case when gender='男' then id end) as '男',count(case when gender='女' then id end) as '女' from students
行比较
select gender,count(id) as count from student group by gender
create table row2col2(
col1 string,
col2 string,
col3 int
)row format delimited fields terminated by '\t';
create table col2row2(
col1 string,
col2 string,
col3 string
)row format delimited fields terminated by '\t';
-- 单个字段
create table tb_json_test1 (
json string
);
get_json_object(json,'$.deviceType')
-- 多个字段
create table tb_json_test2 (
device string,
deviceType string,
signal double,
`time` string
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE;
json_tuple(json,'device','deviceType')
--serder
create table tb_json_test2 (
device string,
deviceType string,
signal double,
`time` string
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE;
-- 拓展--xml数据处理
desc function extended xpath;
SELECT xpath('b1b2b3c1 c2 ', 'a/b/text()')
聚合函数、分析函数、排序函数 over(partition by 字段1... order by 字段1... rows 范围)
---建表并且加载数据
create table website_pv_info(
cookieid string,
createtime string, --day
pv int
) row format delimited
fields terminated by ',';
create table website_url_info (
cookieid string,
createtime string, --访问时间
url string --访问页面
) row format delimited
fields terminated by ','
select cookieid,sum(pv) from website_pv_info group by cookieid;
select cookieid,sum(pv) over(partition by cookieid) from website_pv_info;
-- 求出每个用户截止到当天,累积的总pv数 between unbounded preceding and current row 等价 unbounded preceding
-- order by 默认有一个计算范围 unbounded preceding
select cookieid,sum(pv) over(partition by cookieid order by createtime rows between unbounded preceding and current row ) from website_pv_info;
select cookieid,sum(pv) over(partition by cookieid order by createtime rows unbounded preceding) from website_pv_info;
select cookieid,sum(pv) over(partition by cookieid order by createtime ) from website_pv_info;
unbounded preceding
-- 排序 rank 遇到重复会生成不连续的序号 dense_rank连续序号 row_number生成行号
select *,rank() over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
select *,dense_rank() over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
select *,row_number() over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
select *,ntile(3) over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
with tb1 as ( select *,dense_rank() over (partition by cookieid order by pv desc ) as rank_num from website_pv_info )
from tb1
select *
where rank_num <= 3;
with tb1 as (select *,ntile(3) over (partition by cookieid order by pv desc ) as rank_num from website_pv_info)
from tb1
select *
where rank_num = 1;
select *,lag(pv) over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
-- 向下取值
select *,lead(pv) over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
-- 取第一个值
select *,first_value(pv) over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
-- 去最后一个值 注意取值范围 因为orderby 所用取值范围需要增加向下的范围
select *,last_value(pv) over (partition by cookieid order by pv desc rows between unbounded preceding and unbounded following) as rank_num from website_pv_info;
select sum(pv) sum(sum(pv)) over() from website_pv_info
压缩算法
map压缩
1)开启hive中间传输数据压缩功能
set hive.exec.compress.intermediate=true;
2)开启mapreduce中map输出压缩功能
set mapreduce.map.output.compress=true;
3)设置mapreduce中map输出数据的压缩方式
Set mapreduce.map.output.compress.codec= org.apache.hadoop.io.compress.SnappyCodec;
1)开启hive最终输出数据压缩功能
set hive.exec.compress.output=true;
2)开启mapreduce最终输出数据压缩
set mapreduce.output.fileoutputformat.compress=true;
3)设置mapreduce最终数据输出压缩方式
set mapreduce.output.fileoutputformat.compress.codec = org.apache.hadoop.io.compress.SnappyCodec;
4)设置mapreduce最终数据输出压缩为块压缩
set mapreduce.output.fileoutputformat.compress.type=BLOCK;
行存储和列存储
TextFILE
ORC
PAROUET
格式对比
TEXTfile
create table log_text (
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE ;
orc
create table log_orc(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS orc ;
parquet
create table log_parquet(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS PARQUET ;