从哪里查询取决于FROM关键字后面的table_reference。可以是普通物理表、视图、join结果或子查询结果。
[WITH CommonTableExpression (, CommonTableExpression)*]
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_reference
[WHERE where_condition]
[GROUP BY col_list]
[ORDER BY col_list]
[CLUSTER BY col_list
| [DISTRIBUTE BY col_list] [SORT BY col_list]
]
[LIMIT [offset,] rows];
用于指定查询返回结果中重复的行如何处理。
--返回所有匹配的行
select state from t_usa_covid19_p;
--相当于
select all state from t_usa_covid19_p;
--返回所有匹配的行 去除重复的结果
select distinct state from t_usa_covid19_p;
--多个字段distinct 整体去重
select county,state from t_usa_covid19_p;
select distinct county,state from t_usa_covid19_p;
select distinct sex from student;
select * from t_usa_covid19_p where 1 > 2; -- 1 > 2 返回false
select * from t_usa_covid19_p where 1 = 1; -- 1 = 1 返回true
--where条件中使用函数 找出州名字母长度超过10位的有哪些
select * from t_usa_covid19_p where length(state) >10 ;
--where子句支持子查询
SELECT *
FROM A
WHERE A.a IN (SELECT foo FROM B);
反例:
--注意:where条件中不能使用聚合函数
--报错 SemanticException:Not yet supported place for UDAF 'count'
--聚合函数要使用它的前提是结果集已经确定。
--而where子句还处于“确定”结果集的过程中,因而不能使用聚合函数。
select state,count(deaths)
from t_usa_covid19_p where count(deaths) >100 group by state;
--找出来自加州,累计死亡人数大于1000的县 state字段就是分区字段 进行分区裁剪 避免全表扫描
select * from t_usa_covid19_p where state ="California" and deaths > 1000;
--多分区裁剪
select * from t_usa_covid19_p where count_date = "2021-01-28" and state ="California" and deaths > 1000;
--根据state州进行分组
--SemanticException:Expression not in GROUP BY key 'deaths'
--deaths不是分组字段 报错
--state是分组字段 可以直接出现在select_expr中
select state,deaths
from t_usa_covid19_p where count_date = "2021-01-28" group by state;
–被聚合函数应用
select state,sum(deaths)
from t_usa_covid19_p where count_date = “2021-01-28” group by state;
--统计死亡病例数大于10000的州
--where语句中不能使用聚合函数 语法报错
select state,sum(deaths)
from t_usa_covid19_p where count_date = "2021-01-28" and sum(deaths) >10000 group by state;
--先where分组前过滤(此处是分区裁剪),再进行group by分组, 分组后每个分组结果集确定 再使用having过滤
select state,sum(deaths)
from t_usa_covid19_p
where count_date = "2021-01-28"
group by state
having sum(deaths) > 10000;
--这样写更好 即在group by的时候聚合函数已经作用得出结果 having直接引用结果过滤 不需要再单独计算一次了
select state,sum(deaths) as cnts
from t_usa_covid19_p
where count_date = "2021-01-28"
group by state
having cnts> 10000;
--没有限制返回2021.1.28 加州的所有记录
select * from t_usa_covid19_p
where count_date = "2021-01-28"
and state ="California";
--返回结果集的前5条
select * from t_usa_covid19_p
where count_date = "2021-01-28"
and state ="California"
limit 5;
--返回结果集从第1行开始 共3行
select * from t_usa_covid19_p
where count_date = "2021-01-28"
and state ="California"
limit 2,3; --注意 第一个参数偏移量是从0开始的
select * from student;
--不指定reduce task个数
--日志显示:Number of reduce tasks not specified. Estimated from input data size: 1
select * from student cluster by num;
--手动设置reduce task个数
set mapreduce.job.reduces =2;
select * from student cluster by num;
CLUSTER BY无法单独完成,因为分和排序的字段只能是同一个;
ORDER BY更不能在这里使用,因为是全局排序,只有一个输出,无法满足分的需求。
--案例:把学生表数据根据性别分为两个部分,每个分组内根据年龄的倒序排序。
select * from student distribute by sex sort by age desc;
5. 如果DISTRIBUTE BY +SORT BY的字段一样,则:CLUSTER BY=DISTRIBUTE BY +SORT BY
--下面两个语句执行结果一样
select * from student distribute by num sort by num;
select * from student cluster by num;
select_statement
UNION [ALL | DISTINCT]
select_statement
UNION [ALL | DISTINCT]
select_statement ...;
--使用DISTINCT关键字与使用UNION默认值效果一样,都会删除重复行。
select num,name from student_local
UNION
select num,name from student_hdfs;
--和上面一样
select num,name from student_local
UNION DISTINCT
select num,name from student_hdfs;
--使用ALL关键字会保留重复行。
select num,name from student_local
UNION ALL
select num,name from student_hdfs;
--如果要将ORDER BY,SORT BY,CLUSTER BY,DISTRIBUTE BY或LIMIT应用于单个SELECT
--请将子句放在括住SELECT的括号内
SELECT num,name FROM (select num,name from student_local LIMIT 2) subq1
UNION
SELECT num,name FROM (select num,name from student_hdfs LIMIT 3) subq2
--如果要将ORDER BY,SORT BY,CLUSTER BY,DISTRIBUTE BY或LIMIT子句应用于整个UNION结果
--请将ORDER BY,SORT BY,CLUSTER BY,DISTRIBUTE BY或LIMIT放在最后一个之后。
select num,name from student_local
UNION
select num,name from student_hdfs
order by num desc;
--from子句中子查询(Subqueries)
--子查询
SELECT num
FROM (
select num,name from student_local
) tmp;
--包含UNION ALL的子查询的示例
SELECT t3.name
FROM (
select num,name from student_local
UNION distinct
select num,name from student_hdfs
) t3;
--where子句中子查询(Subqueries)
--不相关子查询,相当于IN、NOT IN,子查询只能选择一个列。
--(1)执行子查询,其结果不被显示,而是传递给外部查询,作为外部查询的条件使用。
--(2)执行外部查询,并显示整个结果。
SELECT *
FROM student_hdfs
WHERE student_hdfs.num IN (select num from student_local limit 2);
--相关子查询,指EXISTS和NOT EXISTS子查询
--子查询的WHERE子句中支持对父查询的引用
SELECT A
FROM T1
WHERE EXISTS (SELECT B FROM T2 WHERE T1.X = T2.Y);
--select语句中的CTE
with q1 as (select num,name,age from student where num = 95002)
select *
from q1;
-- from风格
with q1 as (select num,name,age from student where num = 95002)
from q1
select *;
-- chaining CTEs 链式
with q1 as ( select * from student where num = 95002),
q2 as ( select num,name,age from q1)
select * from (select num from q2) a;
-- union
with q1 as (select * from student where num = 95002),
q2 as (select * from student where num = 95004)
select * from q1 union all select * from q2;
--视图,CTAS和插入语句中的CTE
-- insert
create table s1 like student;
with q1 as ( select * from student where num = 95002)
from q1
insert overwrite table s1
select *;
select * from s1;
-- ctas
create table s2 as
with q1 as ( select * from student where num = 95002)
select * from q1;
-- view
create view v1 as
with q1 as ( select * from student where num = 95002)
select * from q1;
select * from v1;
table_reference:是join查询中使用的表名,也可以是子查询别名(查询结果当成表参与join)。
table_factor:与table_reference相同,是联接查询中使用的表名,也可以是子查询别名。
join_condition:join查询关联的条件,如果在两个以上的表上需要连接,则使用AND关键字。
join_table:
table_reference [INNER] JOIN table_factor [join_condition]
| table_reference {LEFT|RIGHT|FULL} [OUTER] JOIN table_reference join_condition
| table_reference LEFT SEMI JOIN table_reference join_condition
| table_reference CROSS JOIN table_reference [join_condition] (as of Hive 0.10)
join_condition:
ON expression
左半开连接(LEFT SEMI JOIN)会返回左边表的记录,前提是其记录对于右边的表满足ON语句中的判定条件。
从效果上来看有点像inner join之后只返回左表的结果。
select *
from employee e left semi join employee_address e_addr
on e.id =e_addr.id;
--相当于 inner join 只不过效率高一些
select e.*
from employee e inner join employee_address e_addr
on e.id =e_addr.id;
SELECT a.* FROM a JOIN b ON (a.id = b.id)
SELECT a.* FROM a JOIN b ON (a.id = b.id AND a.department = b.department)
SELECT a.* FROM a LEFT OUTER JOIN b ON (a.id <> b.id)
SELECT a.val, b.val, c.val FROM a JOIN b ON (a.key = b.key1) JOIN c ON (c.key = b.key2)
SELECT a.val, b.val, c.val FROM a JOIN b ON (a.key = b.key1) JOIN c ON (c.key = b.key1)
--由于联接中仅涉及b的key1列,因此被转换为1个MR作业来执行
SELECT a.val, b.val, c.val FROM a JOIN b ON (a.key = b.key1) JOIN c ON (c.key = b.key2)
--会转换为两个MR作业,因为在第一个连接条件中使用了b中的key1列,而在第二个连接条件中使用了b中的key2列。
-- 第一个map / reduce作业将a与b联接在一起,然后将结果与c联接到第二个map / reduce作业中。
SELECT a.val, b.val, c.val FROM a JOIN b ON (a.key = b.key1) JOIN c ON (c.key = b.key1)
--由于联接中仅涉及b的key1列,因此被转换为1个MR作业来执行,并且表a和b的键的特定值的值被缓冲在reducer的内存中。然后,对于从c中检索的每一行,将使用缓冲的行来计算联接。
SELECT a.val, b.val, c.val FROM a JOIN b ON (a.key = b.key1) JOIN c ON (c.key = b.key2)
--计算涉及两个MR作业。其中的第一个将a与b连接起来,并缓冲a的值,同时在reducer中流式传输b的值。
-- 在第二个MR作业中,将缓冲第一个连接的结果,同时将c的值通过reducer流式传输。
5. 在join的时候,可以通过语法STREAMTABLE提示指定要流式传输的表。如果省略STREAMTABLE提示,则Hive将流式传输最右边的表。
SELECT /*+ STREAMTABLE(a) */ a.val, b.val, c.val FROM a JOIN b ON (a.key = b.key1) JOIN c ON (c.key = b.key1)
--a,b,c三个表都在一个MR作业中联接,并且表b和c的键的特定值的值被缓冲在reducer的内存中。
-- 然后,对于从a中检索到的每一行,将使用缓冲的行来计算联接。如果省略STREAMTABLE提示,则Hive将流式传输最右边的表。
6. join在WHERE条件之前进行
7. 如果除一个要连接的表之外的所有表都很小,则可以将其作为仅map作业执行(mapjoin)。
SELECT /*+ MAPJOIN(b) */ a.key, a.value FROM a JOIN b ON a.key = b.key
--不需要reducer。对于A的每个Mapper,B都会被完全读取。限制是不能执行FULL / RIGHT OUTER JOIN b。