第一章 环境准备
1.1 建表语句
hive>
-- Create the student table.
-- The DROP targets the same name that is created below, so re-running
-- this script really starts from an empty table.
DROP TABLE IF EXISTS student_info;
CREATE TABLE IF NOT EXISTS student_info (
    stu_id   STRING COMMENT '学生id',
    stu_name STRING COMMENT '学生姓名',
    birthday STRING COMMENT '出生日期',
    sex      STRING COMMENT '性别'
)
-- Comma-delimited text, matching the layout of student_info.txt.
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
-- Create the course table.
-- The DROP targets the same name that is created below, so re-running
-- this script really starts from an empty table.
DROP TABLE IF EXISTS course_info;
CREATE TABLE IF NOT EXISTS course_info (
    course_id   STRING COMMENT '课程id',
    course_name STRING COMMENT '课程名',
    tea_id      STRING COMMENT '任课老师id'
)
-- Comma-delimited text, matching the layout of course_info.txt.
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
-- Create the teacher table.
-- The DROP targets the same name that is created below, so re-running
-- this script really starts from an empty table.
DROP TABLE IF EXISTS teacher_info;
CREATE TABLE IF NOT EXISTS teacher_info (
    tea_id   STRING COMMENT '老师id',
    -- Column comment corrected: this column holds the teacher name.
    tea_name STRING COMMENT '老师姓名'
)
-- Comma-delimited text, matching the layout of teacher_info.txt.
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
-- Create the score table.
-- The DROP targets the same name that is created below, so re-running
-- this script really starts from an empty table.
DROP TABLE IF EXISTS score_info;
CREATE TABLE IF NOT EXISTS score_info (
    stu_id    STRING COMMENT '学生id',
    course_id STRING COMMENT '课程id',
    score     INT    COMMENT '成绩'
)
-- Comma-delimited text, matching the layout of score_info.txt.
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
1.2 数据准备
(1)创建/opt/module/data目录
[atguigu@hadoop102 module]$ mkdir data
(2)将如下4个文件放到/opt/module/data目录下
[atguigu@hadoop102 data]$ vim student_info.txt
001,彭于晏,1995-05-16,男
002,胡歌,1994-03-20,男
003,周杰伦,1995-04-30,男
004,刘德华,1998-08-28,男
005,唐国强,1993-09-10,男
006,陈道明,1992-11-12,男
007,陈坤,1999-04-09,男
008,吴京,1994-02-06,男
009,郭德纲,1992-12-05,男
010,于谦,1998-08-23,男
011,潘长江,1995-05-27,男
012,杨紫,1996-12-21,女
013,蒋欣,1997-11-08,女
014,赵丽颖,1990-01-09,女
015,刘亦菲,1993-01-14,女
016,周冬雨,1990-06-18,女
017,范冰冰,1992-07-04,女
018,李冰冰,1993-09-24,女
019,邓紫棋,1994-08-31,女
020,宋丹丹,1991-03-01,女
[atguigu@hadoop102 data]$ vim course_info.txt
01,语文,1003
02,数学,1001
03,英语,1004
04,体育,1002
05,音乐,1002
[atguigu@hadoop102 data]$ vim teacher_info.txt
1001,张高数
1002,李体音
1003,王子文
1004,刘丽英
[atguigu@hadoop102 data]$ vim score_info.txt
001,01,94
002,01,74
004,01,85
005,01,64
006,01,71
007,01,48
008,01,56
009,01,75
010,01,84
011,01,61
012,01,44
013,01,47
014,01,81
015,01,90
016,01,71
017,01,58
018,01,38
019,01,46
020,01,89
001,02,63
002,02,84
004,02,93
005,02,44
006,02,90
007,02,55
008,02,34
009,02,78
010,02,68
011,02,49
012,02,74
013,02,35
014,02,39
015,02,48
016,02,89
017,02,34
018,02,58
019,02,39
020,02,59
001,03,79
002,03,87
004,03,89
005,03,99
006,03,59
007,03,70
008,03,39
009,03,60
010,03,47
011,03,70
012,03,62
013,03,93
014,03,32
015,03,84
016,03,71
017,03,55
018,03,49
019,03,93
020,03,81
001,04,54
002,04,100
004,04,59
005,04,85
007,04,63
009,04,79
010,04,34
013,04,69
014,04,40
016,04,94
017,04,34
020,04,50
005,05,85
007,05,63
009,05,79
015,05,59
018,05,87
1.3 插入数据
(1)插入数据
hive>
-- Load the four local text files into their tables.
-- INTO TABLE without OVERWRITE appends to whatever the table already holds.
LOAD DATA LOCAL INPATH '/opt/module/data/student_info.txt'
    INTO TABLE student_info;
LOAD DATA LOCAL INPATH '/opt/module/data/course_info.txt'
    INTO TABLE course_info;
LOAD DATA LOCAL INPATH '/opt/module/data/teacher_info.txt'
    INTO TABLE teacher_info;
LOAD DATA LOCAL INPATH '/opt/module/data/score_info.txt'
    INTO TABLE score_info;
(2)验证插入数据情况
hive>
-- Sanity check: peek at up to five rows of each table.
SELECT * FROM student_info LIMIT 5;
SELECT * FROM course_info  LIMIT 5;
SELECT * FROM teacher_info LIMIT 5;
SELECT * FROM score_info   LIMIT 5;
第二章 简单查询
2.1 查找特定条件
2.1.1 查询姓名中带“冰”的学生名单
-- Students whose name contains the given character anywhere.
SELECT *
FROM student_info
WHERE stu_name LIKE '%冰%';
2.1.3 检索课程编号为“04”且分数小于60的学生的课程信息,结果按分数降序排列
-- Score rows for course 04 with a score below 60, highest score first.
SELECT stu_id, course_id, score
FROM score_info
WHERE score < 60
  AND course_id = '04'
ORDER BY score DESC;
2.1.4 查询数学成绩不及格的学生和其对应的成绩,按照学号升序排序
答案一:
-- Students with a math score below 60, ordered by student id.
-- The course is resolved by name through course_info.
select
    student.stu_id,
    student.stu_name,
    score.score
from student_info student
join score_info score
    on student.stu_id = score.stu_id
join course_info c
    on score.course_id = c.course_id
where score.score < 60
  and c.course_name = '数学'
order by student.stu_id asc;
答案二:
-- Same result as the join-based answer, but the failing math rows are
-- filtered in a derived table first and then joined to the students.
-- The scalar subquery looks up the course id of the math course by name.
SELECT
    s.stu_id,
    s.stu_name,
    t1.score
FROM student_info s
JOIN (
    SELECT *
    FROM score_info
    WHERE score < 60
      AND course_id = (
          SELECT course_id
          FROM course_info
          WHERE course_name = '数学'
      )
) t1
    ON s.stu_id = t1.stu_id
ORDER BY s.stu_id;
第三章 汇总分析
3.1 汇总分析
3.1.1 查询编号为“02”的课程的总成绩
-- Total of all scores recorded for course 02.
SELECT sum(score)
FROM score_info
WHERE course_id = '02';
3.2 分组
3.2.1 查询各科成绩最高和最低的分,以如下的形式显示:课程号,最高分,最低分
-- Highest and lowest score per course.
SELECT
    course_id,
    max(score),
    min(score)
FROM score_info
GROUP BY course_id;
3.2.2 查询每门课程有多少学生参加了考试(有考试成绩)
-- Number of score rows with a non-null student id, per course.
SELECT
    course_id,
    count(stu_id)
FROM score_info
GROUP BY course_id;
3.3 分组结果的条件
3.3.3 查询同姓(假设每个学生姓名的第一个字为姓)的学生名单并统计同姓人数大于等于2的姓
答案一:
-- Surnames (first character of the name) shared by at least two students.
-- String positions are 1-based, so the surname starts at position 1.
-- The surname column gets an explicit alias instead of a generated name.
SELECT
    substring(stu_name, 1, 1) AS first_name,
    count(*)                  AS cnt
FROM student_info
GROUP BY substring(stu_name, 1, 1)
HAVING cnt >= 2;
答案二:
-- Surnames (first character of the name) shared by at least two students.
-- The surname is extracted once in a derived table and then grouped.
-- String positions are 1-based, so the surname starts at position 1.
-- The derived table exposes only the column the outer query uses.
SELECT
    t1.first_name,
    count(*) AS count_first_name
FROM (
    SELECT substr(stu_name, 1, 1) AS first_name
    FROM student_info
) t1
GROUP BY t1.first_name
HAVING count_first_name >= 2;
3.4 查询结果排序&分组指定条件
3.4.2 按照如下格式显示学生的语文、数学、英语三科成绩,没有成绩的输出为0,按照学生的有效平均成绩降序显示
学生id 语文 数学 英语 有效课程数 有效平均成绩
答案一:
-- Pivot each student's Chinese / Math / English scores into columns.
-- A subject with no matching row contributes 0 to its sum.
-- The course count and the average cover every scored course of the
-- student, not only the three pivoted subjects.
SELECT
    si.stu_id,
    sum(CASE WHEN c.course_name = '语文' THEN score ELSE 0 END) `语文`,
    sum(CASE WHEN c.course_name = '数学' THEN score ELSE 0 END) `数学`,
    sum(CASE WHEN c.course_name = '英语' THEN score ELSE 0 END) `英语`,
    count(*) `有效课程数`,
    avg(si.score) `平均成绩`
FROM score_info si
JOIN course_info c
    ON si.course_id = c.course_id
GROUP BY si.stu_id
ORDER BY `平均成绩` DESC;
答案二:
-- One row per student that has at least one score: Chinese (01), Math (02)
-- and English (03) scores, number of scored courses and average score,
-- ordered by the average score from high to low.
--
-- The per-student aggregate is the driving table and every subject is
-- LEFT JOINed to it. A student missing a subject therefore keeps a
-- stu_id and shows 0 for that subject, instead of the NULL id and NULL
-- score that a chain of FULL JOINs between the subjects produces.
SELECT
    agg.stu_id,
    COALESCE(chinese_sc.score, 0) AS `语文`,
    COALESCE(math_sc.score, 0)    AS `数学`,
    COALESCE(english_sc.score, 0) AS `英语`,
    agg.cnt                       AS `有效课程数`,
    agg.avg_score                 AS `有效平均成绩`
FROM (
    -- Scored-course count and average per student. The join to
    -- student_info keeps only scores that belong to a known student.
    SELECT
        stu.stu_id,
        count(*)      AS cnt,
        avg(si.score) AS avg_score
    FROM student_info stu
    INNER JOIN score_info si ON stu.stu_id = si.stu_id
    WHERE si.score IS NOT NULL
    GROUP BY stu.stu_id
) agg
LEFT JOIN (
    SELECT stu_id, score FROM score_info WHERE course_id = '01'
) chinese_sc ON agg.stu_id = chinese_sc.stu_id
LEFT JOIN (
    SELECT stu_id, score FROM score_info WHERE course_id = '02'
) math_sc ON agg.stu_id = math_sc.stu_id
LEFT JOIN (
    SELECT stu_id, score FROM score_info WHERE course_id = '03'
) english_sc ON agg.stu_id = english_sc.stu_id
ORDER BY `有效平均成绩` DESC;
3.4.3 查询一共参加三门课程且其中一门为语文课程的学生的id和姓名
答案一:
-- Students with exactly three score rows whose set of courses
-- includes course 01.
SELECT stu_id, stu_name
FROM student_info
WHERE stu_id IN (
    SELECT stu_id
    FROM (
        SELECT
            stu_id,
            count(*)               AS n_courses,
            collect_set(course_id) AS taken
        FROM score_info
        GROUP BY stu_id
        HAVING n_courses = 3
           AND array_contains(taken, '01')
    ) three_course_students
);
答案二:
-- Same question answered without collect_set:
-- 1. keep the score rows of students that have a row for course 01
-- 2. keep students with exactly three such rows
-- 3. join to student_info to pick up the name
SELECT
    t2.stu_id,
    s.stu_name
FROM (
    SELECT with_chinese.stu_id
    FROM (
        SELECT stu_id, course_id
        FROM score_info
        WHERE stu_id IN (
            SELECT stu_id
            FROM score_info
            WHERE course_id = "01"
        )
    ) with_chinese
    GROUP BY with_chinese.stu_id
    HAVING count(with_chinese.course_id) = 3
) t2
JOIN student_info s
    ON t2.stu_id = s.stu_id;
输出:
stu_id stu_name
006 陈道明
008 吴京
011 潘长江
012 杨紫
019 邓紫棋
第四章 复杂查询
4.1 子查询
4.1.1 查询所有课程成绩均小于60分的学生的学号、姓名
答案一:
-- Students whose every recorded score is below 60, with id and name.
-- Grouping per student and testing the highest score returns each
-- student once, and the join to student_info supplies the name that
-- the task asks for.
SELECT
    s.stu_id,
    s.stu_name
FROM student_info s
JOIN (
    SELECT stu_id
    FROM score_info
    GROUP BY stu_id
    -- If the best score is below 60, all scores are below 60.
    HAVING max(score) < 60
) t1
    ON s.stu_id = t1.stu_id;
答案二:
-- Students with no score of 60 or above: count passing scores per
-- student, keep those whose count is zero, then look up the name.
SELECT
    s.stu_id,
    s.stu_name
FROM (
    SELECT
        stu_id,
        sum(if(score >= 60, 1, 0)) AS pass_cnt
    FROM score_info
    GROUP BY stu_id
    HAVING pass_cnt = 0
) all_failed
JOIN student_info s
    ON s.stu_id = all_failed.stu_id;
结果:
s.stu_id s.stu_name
008 吴京
017 范冰冰
第五章 多表查询
5.1 表联结
5.1.1 查询有两门以上的课程不及格的同学的学号及其平均成绩
① 先找出有两门以上不及格的学生名单,按照学生分组,过滤组内成绩低于60的并进行count,count>=2。
② 接着做出一张表查询学生的平均成绩并和上一个子查询中的学生学号进行连接
-- Students with at least two scores below 60, with their average score.
-- The average is taken over all of the student scores, not only the
-- failing ones. The join keeps only ids present in student_info.
SELECT
    t1.stu_id,
    t1.avg_score
FROM (
    SELECT
        stu_id,
        sum(if(score < 60, 1, 0)) AS fail_cnt,
        avg(score)                AS avg_score
    FROM score_info
    GROUP BY stu_id
    HAVING fail_cnt >= 2
) t1
JOIN student_info stu
    ON t1.stu_id = stu.stu_id;
5.2 多表连接
5.2.6 查询学过“李体音”老师所教的所有课的同学的学号、姓名
答案一:
-- Students that have a score row for every course of the named teacher.
-- NOTE(review): the literal 2 is the number of courses this teacher has
-- in the sample data, so it must change if the course list changes.
SELECT stu_id, stu_name
FROM student_info
WHERE stu_id IN (
    SELECT stu_id
    FROM score_info
    WHERE course_id IN (
        SELECT course_id
        FROM course_info
        WHERE tea_id IN (
            SELECT tea_id
            FROM teacher_info
            WHERE tea_name = '李体音'
        )
    )
    GROUP BY stu_id
    HAVING count(*) = 2
);
答案二:
-- Join-based variant: find the student ids first, then attach the name.
-- NOTE(review): the literal 2 is the number of courses this teacher has
-- in the sample data, so it must change if the course list changes.
SELECT
    t1.stu_id,
    si.stu_name
FROM (
    SELECT stu_id
    FROM score_info sc
    WHERE course_id IN (
        -- All courses taught by the named teacher.
        SELECT course_id
        FROM course_info crs
        JOIN teacher_info tch
            ON crs.tea_id = tch.tea_id
        WHERE tea_name = '李体音'
    )
    GROUP BY stu_id
    -- Students that took every one of those courses.
    HAVING count(*) = 2
) t1
LEFT JOIN student_info si
    ON t1.stu_id = si.stu_id;
结果:
s.stu_id s.stu_name
005 唐国强
007 陈坤
009 郭德纲
Time taken: 27.16 seconds, Fetched: 3 row(s)
5.2.7 查询学过“李体音”老师所讲授的任意一门课程的学生的学号、姓名
答案一:
-- Students with a score row for at least one course of the named teacher.
SELECT stu_id, stu_name
FROM student_info
WHERE stu_id IN (
    SELECT stu_id
    FROM score_info
    WHERE course_id IN (
        SELECT course_id
        FROM course_info
        WHERE tea_id IN (
            SELECT tea_id
            FROM teacher_info
            WHERE tea_name = '李体音'
        )
    )
);
答案二:
-- Join-based variant: collect the distinct student ids that appear on
-- any course of the named teacher, then attach the student name.
SELECT
    t1.stu_id,
    si.stu_name
FROM (
    SELECT stu_id
    FROM score_info sc
    WHERE course_id IN (
        SELECT course_id
        FROM course_info crs
        JOIN teacher_info tch
            ON crs.tea_id = tch.tea_id
        WHERE tea_name = '李体音'
    )
    -- GROUP BY collapses several matching courses into one row per student.
    GROUP BY stu_id
) t1
LEFT JOIN student_info si
    ON t1.stu_id = si.stu_id;
结果:
s.stu_id s.stu_name
001 彭于晏
002 胡歌
004 刘德华
005 唐国强
007 陈坤
009 郭德纲
010 于谦
013 蒋欣
014 赵丽颖
015 刘亦菲
016 周冬雨
017 范冰冰
018 李冰冰
020 宋丹丹
5.2.8 查询没学过"李体音"老师讲授的任一门课程的学生姓名
答案一:
-- Students with no score row on any course of the named teacher.
-- NOTE(review): NOT IN returns no rows at all if the subquery yields a
-- NULL stu_id, so confirm score_info.stu_id is never NULL.
SELECT stu_id, stu_name
FROM student_info
WHERE stu_id NOT IN (
    SELECT stu_id
    FROM score_info
    WHERE course_id IN (
        SELECT course_id
        FROM course_info
        WHERE tea_id IN (
            SELECT tea_id
            FROM teacher_info
            WHERE tea_name = '李体音'
        )
    )
);
答案二:
-- Variant that resolves the teacher courses with a join instead of a
-- nested IN, then excludes every student found on those courses.
-- NOTE(review): NOT IN returns no rows at all if the subquery yields a
-- NULL stu_id, so confirm score_info.stu_id is never NULL.
SELECT
    stu_id,
    stu_name
FROM student_info
WHERE stu_id NOT IN (
    SELECT stu_id
    FROM score_info sc
    WHERE course_id IN (
        SELECT course_id
        FROM course_info crs
        JOIN teacher_info tch
            ON crs.tea_id = tch.tea_id
        WHERE tea_name = '李体音'
    )
    GROUP BY stu_id
);
结果:
stu_id stu_name
003 周杰伦
006 陈道明
008 吴京
011 潘长江
012 杨紫
019 邓紫棋
5.2.9 查询至少有一门课与学号为“001”的学生所学课程相同的学生的学号和姓名
答案一:
-- Students other than 001 that share at least one course with student 001.
SELECT stu_id, stu_name
FROM student_info
WHERE stu_id != '001'
  AND stu_id IN (
      SELECT stu_id
      FROM score_info
      WHERE course_id IN (
          SELECT course_id
          FROM score_info
          WHERE stu_id = '001'
      )
  );
答案二:
-- Join-based variant: match score rows against the courses of student
-- 001, drop student 001, and collapse to one row per student.
SELECT
    si.stu_id,
    si.stu_name
FROM score_info sc
JOIN student_info si
    ON sc.stu_id = si.stu_id
-- Exclude student 001.
WHERE sc.stu_id <> '001'
  AND sc.course_id IN (
      -- Courses taken by student 001.
      SELECT course_id
      FROM score_info
      WHERE stu_id = '001'
  )
GROUP BY si.stu_id, si.stu_name;
结果:
s1.stu_id s2.stu_name
002 胡歌
004 刘德华
005 唐国强
006 陈道明
007 陈坤
008 吴京
009 郭德纲
010 于谦
011 潘长江
012 杨紫
013 蒋欣
014 赵丽颖
015 刘亦菲
016 周冬雨
017 范冰冰
018 李冰冰
019 邓紫棋
020 宋丹丹