hive学习之经典sql 50题 hive版
建表:
create table student(s_id string,s_name string,s_birth string,s_sex string) row format delimited fields terminated by '\t';
create table course(c_id string,c_name string,t_id string) row format delimited fields terminated by '\t';
create table teacher(t_id string,t_name string) row format delimited fields terminated by '\t';
create table score(s_id string,c_id string,s_score int) row format delimited fields terminated by '\t';
生成数据
vi /export/data/hivedatas/student.csv
01 赵雷 1990-01-01 男
02 钱电 1990-12-21 男
03 孙风 1990-05-20 男
04 李云 1990-08-06 男
05 周梅 1991-12-01 女
06 吴兰 1992-03-01 女
07 郑竹 1989-07-01 女
08 王菊 1990-01-20 女
vi /export/data/hivedatas/course.csv
01 语文 02
02 数学 01
03 英语 03
vi /export/data/hivedatas/teacher.csv
01 张三
02 李四
03 王五
vi /export/data/hivedatas/score.csv
01 01 80
01 02 90
01 03 99
02 01 70
02 02 60
02 03 80
03 01 80
03 02 80
03 03 80
04 01 50
04 02 30
04 03 20
05 01 76
05 02 87
06 01 31
06 03 34
07 02 89
07 03 98
导数据到hive
load data local inpath '/export/data/hivedatas/student.csv' into table student;
load data local inpath '/export/data/hivedatas/course.csv' into table course;
load data local inpath '/export/data/hivedatas/teacher.csv' into table teacher;
load data local inpath '/export/data/hivedatas/score.csv' into table score;
–注:–hive查询语法
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_reference
[WHERE where_condition]
[GROUP BY col_list [HAVING condition]]
[CLUSTER BY col_list
| [DISTRIBUTE BY col_list] [SORT BY| ORDER BY col_list]
]
[LIMIT number]
– 1、查询"01"课程比"02"课程成绩高的学生的信息及课程分数:
select student.*,a.s_score as 01_score,b.s_score as 02_score
from student
join score a on student.s_id=a.s_id and a.c_id='01'
left join score b on student.s_id=b.s_id and b.c_id='02'
where a.s_score>b.s_score;
–答案2
select student.*,a.s_score as 01_score,b.s_score as 02_score
from student
join score a on a.c_id='01'
join score b on b.c_id='02'
where a.s_id=student.s_id and b.s_id=student.s_id and a.s_score>b.s_score;
– 2、查询"01"课程比"02"课程成绩低的学生的信息及课程分数:
select student.*,a.s_score as 01_score,b.s_score as 02_score
from student
join score a on student.s_id=a.s_id and a.c_id='01'
left join score b on student.s_id=b.s_id and b.c_id='02'
where a.s_score
–答案2
select student.*,a.s_score as 01_score,b.s_score as 02_score
from student
join score a on a.c_id='01'
join score b on b.c_id='02'
where a.s_id=student.s_id and b.s_id=student.s_id and a.s_score
– 3、查询平均成绩大于等于60分的同学的学生编号和学生姓名和平均成绩:
select student.s_id,student.s_name,tmp.平均成绩 from student
join (
select score.s_id,round(avg(score.s_score),1)as 平均成绩
from score group by s_id)as tmp
on tmp.平均成绩>=60
where student.s_id = tmp.s_id
–答案2
select student.s_id,student.s_name,round(avg (score.s_score),1) as 平均成绩 from student
join score on student.s_id = score.s_id
group by student.s_id,student.s_name
having avg (score.s_score) >= 60;
– 4、查询平均成绩小于60分的同学的学生编号和学生姓名和平均成绩:
– (包括有成绩的和无成绩的)
select student.s_id,student.s_name,tmp.avgScore from student
join (
select score.s_id,round(avg(score.s_score),1)as avgScore from score group by s_id)as tmp
on tmp.avgScore < 60
where student.s_id=tmp.s_id
union all
select s2.s_id,s2.s_name,0 as avgScore from student s2
where s2.s_id not in
(select distinct sc2.s_id from score sc2);
–答案2
select score.s_id,student.s_name,round(avg (score.s_score),1) as avgScore from student
inner join score on student.s_id=score.s_id
group by score.s_id,student.s_name
having avg (score.s_score) < 60
union all
select s2.s_id,s2.s_name,0 as avgScore from student s2
where s2.s_id not in
(select distinct sc2.s_id from score sc2);
– 5、查询所有同学的学生编号、学生姓名、选课总数、所有课程的总成绩:
select student.s_id,student.s_name,(count(score.c_id) )as total_count,sum(score.s_score)as total_score
from student
left join score on student.s_id=score.s_id
group by student.s_id,student.s_name ;
– 6、查询"李"姓老师的数量:
select t_name,count(1) from teacher where t_name like '李%' group by t_name;
– 7、查询学过"张三"老师授课的同学的信息:
select student.* from student
join score on student.s_id =score.s_id
join course on course.c_id=score.c_id
join teacher on course.t_id=teacher.t_id and t_name='张三';
– 8、查询没学过"张三"老师授课的同学的信息:
select student.* from student
left join (select s_id from score
join course on course.c_id=score.c_id
join teacher on course.t_id=teacher.t_id and t_name='张三')tmp
on student.s_id =tmp.s_id
where tmp.s_id is null;
– 9、查询学过编号为"01"并且也学过编号为"02"的课程的同学的信息:
select * from student
join (select s_id from score where c_id =1 )tmp1
on student.s_id=tmp1.s_id
join (select s_id from score where c_id =2 )tmp2
on student.s_id=tmp2.s_id;
– 10、查询学过编号为"01"但是没有学过编号为"02"的课程的同学的信息:
select student.* from student
join (select s_id from score where c_id =1 )tmp1
on student.s_id=tmp1.s_id
left join (select s_id from score where c_id =2 )tmp2
on student.s_id =tmp2.s_id
where tmp2.s_id is null;
– 11、查询没有学全所有课程的同学的信息:
–先查询出课程的总数量
select count(1) from course;
–再查询所需结果
select student.* from student
left join(
select s_id
from score
group by s_id
having count(c_id)=3)tmp
on student.s_id=tmp.s_id
where tmp.s_id is null;
–方法二(一步到位):
select student.* from student
join (select count(c_id)num1 from course)tmp1
left join(
select s_id,count(c_id)num2
from score group by s_id)tmp2
on student.s_id=tmp2.s_id and tmp1.num1=tmp2.num2
where tmp2.s_id is null;
– 12、查询至少有一门课与学号为"01"的同学所学相同的同学的信息:
select student.* from student
join (select c_id from score where score.s_id=01)tmp1
join (select s_id,c_id from score)tmp2
on tmp1.c_id =tmp2.c_id and student.s_id =tmp2.s_id
where student.s_id not in('01')
group by student.s_id,s_name,s_birth,s_sex;
– 13、查询和"01"号的同学学习的课程完全相同的其他同学的信息:
–备注:hive不支持group_concat方法,可用 concat_ws(’|’, collect_set(str)) 实现
select student.*,tmp1.course_id from student
join (select s_id ,concat_ws('|', collect_set(c_id)) course_id from score
group by s_id having s_id not in (1))tmp1
on student.s_id = tmp1.s_id
join (select concat_ws('|', collect_set(c_id)) course_id2
from score where s_id=1)tmp2
on tmp1.course_id = tmp2.course_id2;
– 14、查询没学过"张三"老师讲授的任一门课程的学生姓名:
select student.* from student
left join (select s_id from score
join (select c_id from course join teacher on course.t_id=teacher.t_id and t_name='张三')tmp2
on score.c_id=tmp2.c_id )tmp
on student.s_id = tmp.s_id
where tmp.s_id is null;
– 15、查询两门及其以上不及格课程的同学的学号,姓名及其平均成绩:
select student.s_id,student.s_name,tmp.avg_score from student
inner join (select s_id from score
where s_score<60
group by score.s_id having count(s_id)>1)tmp2
on student.s_id = tmp2.s_id
left join (
select s_id,round(AVG (score.s_score)) avg_score
from score group by s_id)tmp
on tmp.s_id=student.s_id;
– 16、检索"01"课程分数小于60,按分数降序排列的学生信息:
select student.*,s_score from student,score
where student.s_id=score.s_id and s_score<60 and c_id='01'
order by s_score desc;
– 17、按平均成绩从高到低显示所有学生的所有课程的成绩以及平均成绩:
select a.s_id,tmp1.s_score as chinese,tmp2.s_score as math,tmp3.s_score as english,
round(avg (a.s_score),2) as avgScore
from score a
left join (select s_id,s_score from score s1 where c_id='01')tmp1 on tmp1.s_id=a.s_id
left join (select s_id,s_score from score s2 where c_id='02')tmp2 on tmp2.s_id=a.s_id
left join (select s_id,s_score from score s3 where c_id='03')tmp3 on tmp3.s_id=a.s_id
group by a.s_id,tmp1.s_score,tmp2.s_score,tmp3.s_score order by avgScore desc;
– 18.查询各科成绩最高分、最低分和平均分:以如下形式显示:课程ID,课程name,最高分,最低分,平均分,及格率,中等率,优良率,优秀率:
–及格为>=60,中等为:70-80,优良为:80-90,优秀为:>=90
select course.c_id,course.c_name,tmp.maxScore,tmp.minScore,tmp.avgScore,tmp.passRate,tmp.moderate,tmp.goodRate,tmp.excellentRates from course
join(select c_id,max(s_score) as maxScore,min(s_score)as minScore,
round(avg(s_score),2) avgScore,
round(sum(case when s_score>=60 then 1 else 0 end)/count(c_id),2)passRate,
round(sum(case when s_score>=60 and s_score<70 then 1 else 0 end)/count(c_id),2) moderate,
round(sum(case when s_score>=70 and s_score<80 then 1 else 0 end)/count(c_id),2) goodRate,
round(sum(case when s_score>=80 and s_score<90 then 1 else 0 end)/count(c_id),2) excellentRates
from score group by c_id)tmp on tmp.c_id=course.c_id;
– 19、按各科成绩进行排序,并显示排名:
– row_number() over()分组排序功能(mysql没有该方法)
select s1.*,row_number()over(order by s1.s_score desc) Ranking
from score s1 where s1.c_id='01'order by Ranking asc
union all select s2.*,row_number()over(order by s2.s_score desc) Ranking
from score s2 where s2.c_id='02'order by Ranking asc
union all select s3.*,row_number()over(order by s3.s_score desc) Ranking
from score s3 where s3.c_id='03'order by Ranking asc;
– 20、查询学生的总成绩并进行排名:
select score.s_id,s_name,sum(s_score) sumscore,row_number()over(order by sum(s_score) desc) Ranking
from score ,student
where score.s_id=student.s_id
group by score.s_id,s_name order by sumscore desc;
后续部分参见:
https://blog.csdn.net/Thomson617/article/details/83280617
Hive下的SQL语法总结:
(1).Hive不支持join的非等值连接,不支持or
分别举例如下及实现解决办法。
不支持不等值连接
错误:select * from a inner join b on a.id<>b.id
替代方法:select * from a inner join b on a.id=b.id and a.id is null;
不支持or
错误:select * from a inner join b on a.id=b.id or a.name=b.name
替代方法:select * from a inner join b on a.id=b.id
union all
select * from a inner join b on a.name=b.name
两个sql union all的字段名必须一样或者列别名要一样。
(2).分号字符:不能智能识别concat(‘;’,key),只会将‘;’当做SQL结束符号。
•分号是SQL语句结束标记,在HiveQL中也是,但是在HiveQL中,对分号的识别没有那么智慧,例如:
•select concat(key,concat(';',key)) from dual;
•但HiveQL在解析语句时提示:
FAILED: Parse Error: line 0:-1 mismatched input '' expecting ) in function specification
•解决的办法是,使用分号的八进制的ASCII码进行转义,那么上述语句应写成:
•select concat(key,concat('\073',key)) from dual;
(3).不支持INSERT INTO 表 Values(), UPDATE, DELETE等操作.这样的话,就不要很复杂的锁机制来读写数据。
INSERT INTO syntax is only available starting in version 0.8。INSERT INTO就是在表或分区中追加数据。
(4).HiveQL中String类型的字段若是空(empty)字符串, 即长度为0, 那么对它进行IS NULL的判断结果是False,使用left join可以进行筛选行。
(5).不支持 ‘< dt <’这种格式的范围查找,可以用dt in(”,”)或者between替代。
(6).Hive不支持将数据插入现有的表或分区中,仅支持覆盖重写整个表,示例如下:
INSERT OVERWRITE TABLE t1 SELECT * FROM t2;
(7).group by的字段,必须是select后面的字段,select后面的字段不能比group by的字段多.
如果select后面有聚合函数,则该select语句中必须有group by语句
而且group by后面不能使用别名
(8).hive的0.13版之前select , where 及 having 之后不能跟子查询语句(一般使用left join、right join 或者inner join替代)
(9).先join(及inner join) 然后left join或right join
(10).hive不支持group_concat方法,可用 concat_ws('|', collect_set(str)) 实现
(11).not in 和 <> 不起作用,可用left join tmp on tableName.id = tmp.id where tmp.id is null 替代实现
... ...