HQL的DQL操作基本与SQL相同
hive (bigdata)> select * from student;
hive (bigdata)> select name from student;
hive (bigdata)> select score+10 from score;
hive (bigdata)> select count(*) count from score;
hive (bigdata)> select max(score) max_score from score;
hive (bigdata)> select sum(score) sum_score from score;
hive (bigdata)> show functions;
hive (bigdata)> select * from score limit 5;
hive (bigdata)> select * from score limit 3,3;
hive (bigdata)> select * from score where score > 60;
hive (bigdata)> select * from score where score > 60 and score < 80;
hive (bigdata)> select * from score where score between 60 and 80;
hive (bigdata)> select * from score where score is null;
hive (bigdata)> select * from score where score in(88,100);
like,rlike
rlike:Hive中对like的一个扩展,可以直接使用正则表达式
hive (bigdata)> select * from score where name like %鹏;
hive (bigdata)> select * from score where score like '6%';
hive (bigdata)> select * from score where score like '_6%';
hive (bigdata)> select * from score where score like '%4%';
hive (bigdata)> select * from score where name like '%永%';
hive (bigdata)> select * from score where name rlike '[永]';
select * from score where subject = '数学' and score > 60;
select * from score where subject not in('数学','语文');
#group by
select score.subject,avg(score.score) from score group by subject;
select score.name,avg(score.score) from score group by name;
#having
select score.name,avg(score.score) avg_score from score group by name having avg_score > 70;
having 与 where 不同点:
#内连接:只有进行连接的两个表中都存在与连接条件相匹配的数据才会显示出来。
#使用join查询员工的信息:ID,姓名,部门,职位,薪资
hive (bigdata)> select e.id,e.name,d.name,e.position,e.salary from employee e join department d on e.department_id = d.id;
#左外连接:将左侧都显示,右侧不满足条件的替换为null
hive (bigdata)> select e.id,e.name,d.name,e.position,e.salary from employee e left join department d on e.department_id = d.id;
#右外连接:将右侧显示,左侧不满足条件的替换为null
hive (bigdata)> select e.id,e.name,d.name,e.position,e.salary from employee e right join department d on e.department_id = d.id;
#满外链接:两个表都显示,不满足条件的都替换为null
hive (bigdata)> select e.id,e.name,d.name,e.position,e.salary from employee e full join department d on e.department_id = d.id;
#笛卡尔积连接(不要使用)
hive (bigdata)> set hive.mapred.mode='strict';
hive (bigdata)> select e.id,e.name,d.name,e.position,e.salary from employee e join department d;
全排序(此结果排序),部分排序(区内排序),二次排序(compareTo),辅助排序(组内排序)
#全排序
hive (bigdata)> select * from score order by score;
hive (bigdata)> select * from score order by score desc;
hive (bigdata)> select * from score order by score,subject desc;
#部分排序
hive (bigdata)> set mapreduce.job.reduces=4;
hive (bigdata)> select * from score sort by score desc;
#分区排序
#distribute by:类似于MR中分区,先分区后排序
hive (bigdata)> select * from score distribute by subject sort by score desc;
#cluster by
#distribute by+ sort by 可以使用cluster by(只能按照倒序)代替(排序的字段和分区的字段是一样的)
hive (bigdata)> select * from score cluster by subject;
分区针对的是数据的存储路径;分桶针对的是数据文件。
分桶表数据存储
#创建分桶表
hive (bigdata)> create table student_bucket(id int,name string)
> clustered by(id) into 4 buckets
> row format delimited fields terminated by '\t';
#直接导入数据,发现并未分桶
hive (bigdata)> load data local inpath '/opt/test/student' into table student_bucket;
#设置并使用insert方式导入数据
hive (bigdata)> set hive.enforce.bucketing=true;
hive (bigdata)> insert into table student_bucket select * from student cluster by(id);
分桶表抽样查询
对于非常大的数据集,有时用户需要使用的是一个具有代表性的查询结果而不是全部结果。Hive 可以通过对表进行抽样来满足这个需求。
select * from tablename tablesample(bucket x out of y on id);
例如,table 总共分了 4 份,当 y=2 时,抽取(4/2=)2 个 bucket 的数据,当 y=8 时,抽取(4/8=)1/2个bucket 的数据。
例如,table 总 bucket 数为4,tablesample(bucket 4 out of 4),表示总共抽取(4/4=)1 个bucket 的数据,抽取第 4 个 bucket 的数据。
x 的值必须小于等于 y 的值,否则会报错