SELECT用于筛选符合指定查询条件的行
Hive SELECT是数据库标准SQL的子集
-- Basic SELECT examples (keyword casing and spacing normalized; the
-- original mixed upper/lower case within one snippet set).
-- Constant select: Hive allows SELECT without a FROM clause.
select 1;

-- Full-table scan; prefer an explicit column list outside ad-hoc queries.
select * from table_name;

-- Projection with a row filter.
select id, name, age from people where age > 20;

-- != works in Hive, but <> is the standard SQL inequality operator.
select * from employee where name <> 'Lucy' limit 5;

-- Average salary per department.
select t.deptno, avg(t.sal) as avg_sal
from emp t
group by t.deptno;

-- HAVING filters groups after aggregation (WHERE filters rows before it).
select deptno, avg(sal) as avg_sal
from emp
group by deptno
having avg_sal > 2000;
Hive查询 - CTE和嵌套查询
CTE(Common Table Expression)
-- CTE syntax: name a subquery once, then reference it in the main query.
-- (Template only; the Unicode ellipsis '…' of the original is replaced
-- with '...' so the template is plain ASCII.)
-- WITH t1 AS (SELECT ...) SELECT * FROM t1;

-- CTE demo: project three columns of people, then select from the CTE.
with tab1 as (
    select id, name, age from people
)
select * from tab1;
嵌套查询
-- Nested (inline-view) query: in Hive the subquery must be given an alias.
select * from (select * from employee) e;
Hive JOIN - 关联查询
指对多表进行联合查询
JOIN用于将两个或多个表中的行组合在一起查询
类似于SQL JOIN,但是Hive仅支持等值连接
-- Implicit (comma) join: no JOIN keyword; the join condition lives in WHERE.
-- This conceptually forms the Cartesian product first and filters afterwards,
-- which is very expensive on large tables; an explicit JOIN lets Hive filter
-- while joining and is more efficient. Kept here deliberately as the
-- teaching example of the implicit-join form.
select * from emp_basic eb,emp_psn ep
where eb.emp_id = ep.emp_id;
JOIN发生在WHERE子句之前
请分别实现以下需求
-- Customers table: load from HDFS (moves the file into the table's
-- location), then sanity-check the rows.
load data inpath '/data/retail_db/customers.csv' into table customers;
select * from customers;
-- Departments table: same load-then-check pattern.
load data inpath '/data/retail_db/departments.csv' into table departments;
select * from departments;
-- Products table: same load-then-check pattern.
load data inpath '/data/retail_db/products.csv' into table products;
select * from products;
-- Customers located in New York, NY.
-- Fix: LIKE with no wildcard is just equality in disguise; use = to state
-- the intent plainly (LIKE also implied a pattern that never existed).
select *
from customers
where customer_state = 'NY'
  and customer_city = 'New York';
-- Count how many distinct customers have placed at least one order:
-- the CTE collapses orders to one row per customer, the outer query
-- counts those rows. (Equivalent to the nested-subquery form, and avoids
-- a single-reducer COUNT(DISTINCT).)
with per_customer as (
    select count(1)
    from orders
    group by order_customer_id
)
select count(1)
from per_customer;
-- Peek at a few product rows.
select * from products limit 5;

-- Build the customer's full name directly with a string function —
-- no subquery needed for a simple per-row expression.
select concat(customer_fname, ' ', customer_lname)
from customers;
-- Filter customers with regular expressions (RLIKE uses Java regex).
-- Fix: inside a character class a comma is a LITERAL character, so
-- '[1,2,3][4,5,6]' also matched ',' — the intended sets are '[123][456]'
-- and '[ar]' rather than '[a,r]'.
select * from customers
where customer_id rlike '[123][456]'
  and customer_fname rlike '^Ma+[ar]'
  and customer_lname rlike '^S'
  and customer_state rlike '^N'
  and customer_city rlike 'Green';
-- Customers who have never placed an order.
-- NOT EXISTS stops at the first match and is NULL-safe,
-- unlike NOT IN over a nullable column.
select c.*
from customers c
where not exists (
    select 1
    from orders o
    where o.order_customer_id = c.customer_id
);
MapJoin操作在Map端完成
开启mapjoin操作
-- Enable automatic conversion of common joins to map joins
-- (the small table is loaded into memory and joined on the map side).
set hive.auto.convert.join=true;
-- Fix: Hive comments use '--', not '//', and every statement needs ';'.
-- Tables at or below this size in bytes count as the "small" side.
set hive.mapjoin.smalltable.filesize=25000000;
运行时自动将连接转换为MAPJOIN
MAPJOIN操作不支持:在UNION ALL、LATERAL VIEW、GROUP BY/JOIN/SORT BY/CLUSTER BY/DISTRIBUTE BY之后使用;也不支持在UNION、JOIN以及其他MAPJOIN之前使用
UNION用于合并多个SELECT语句的结果集:所有子查询的数据必须具有相同的列名称和类型
目前使用的版本为1.1
可以在顶层查询中使用(0.13.0之后)
ORDER BY, SORT BY, CLUSTER BY, DISTRIBUTE BY 和LIMIT适用于合并后的整个结果
集合其他操作可以使用JOIN/OUTER JOIN来实现
使用INSERT语句将数据插入表/分区
-- INSERT supports OVERWRITE (replace contents) and INTO (append).
-- Generic syntax (template — made a comment because the 'OVERWRITE/INTO'
-- slash form is not executable; also fixed the typo 'fileds' -> 'fields'):
--   INSERT OVERWRITE/INTO TABLE tablename1
--   [PARTITION (partcol1=val1, partcol2=val2 ...)]
--   SELECT fields, ... FROM tb_other;

-- Insert a literal via a FROM-less SELECT.
-- NOTE(review): the original note flags this form as unsupported; FROM-less
-- SELECT support depends on the Hive version (0.13.0+) — verify on 1.1.
insert overwrite table test select 'hello';

-- Populate a table from a query.
insert into employee select * from ctas_employee;
-- Multi-insert: scan ctas_employee once, write to two target tables.
from ctas_employee
insert overwrite table employee
    select *
insert overwrite table employee_internal
    select *;
-- Insert into partitions. The trailing literals '2018' and '09' supply the
-- values for the dynamic partition columns (year, month) listed in PARTITION.
-- NOTE(review): 'ctas_patitioned' looks like a typo for 'ctas_partitioned'
-- — confirm the actual table name before running.
from ctas_patitioned
insert overwrite table employee PARTITION (year, month)
select *,'2018','09';
-- Insert by naming target columns (with INSERT INTO the TABLE keyword
-- may be omitted).
insert into employee(name) select 'John' from test limit 1;

-- Insert literal rows directly.
-- Fix: Hive requires the keyword VALUES; 'value' is a syntax error.
insert into employee(name) values ('Judy'), ('John');
使用insert语句将数据插入/导出到文件
-- One scan of ctas_employee, three destinations:
-- a local directory, an HDFS directory, and a table.
from ctas_employee
insert overwrite local directory '/tmp/out1'
    select *
insert overwrite directory '/tmp/out1'
    select *
insert overwrite table employee_internal
    select *;
-- Export with an explicit field delimiter (',') instead of
-- Hive's default \001 separator.
insert overwrite directory '/tmp/out3'
    row format delimited
    fields terminated by ','
select * from ctas_employee;
-- 其他方式从表获取文件
hdfs dfs -getmerge <table_file_path> <local_file_path>
IMPORT和EXPORT用于数据导入和导出
使用EXPORT导出数据
-- Export a whole table (data plus metadata) to an HDFS directory.
EXPORT TABLE employee TO '/tmp/output3'; -- export to HDFS
-- Export a single partition of a partitioned table.
EXPORT TABLE employee_partitioned partition (year=2014, month=11) TO '/tmp/output5';
使用IMPORT导入数据
-- Re-create a table from an EXPORT directory (data plus metadata).
IMPORT TABLE employee FROM '/tmp/output3';
-- Import a single exported partition into a partitioned table.
IMPORT TABLE employee_partitioned partition (year=2014, month=11) FROM '/tmp/output5';
ORDER BY (ASC|DESC)类似于标准SQL
-- Allow positional references (e.g. ORDER BY 2) in GROUP BY / ORDER BY.
-- NOTE(review): positional references are fragile when the SELECT list
-- changes; prefer referencing columns by name.
set hive.groupby.orderby.position.alias=true;
SORT BY对每个Reducer中的数据进行排序
DISTRIBUTE BY类似于标准SQL中的GROUP BY
-- DISTRIBUTE BY routes all rows of one department to the same reducer;
-- SORT BY then orders rows within each reducer (DESC; default is ASC).
SELECT department_id, name, employee_id, evaluation_score
FROM employee_hr
DISTRIBUTE BY department_id
SORT BY evaluation_score DESC;
CLUSTER BY = DISTRIBUTE BY + SORT BY
练习2:实现Hive数据加载及排序
-- Load the order_items data from HDFS, then sanity-check the rows.
load data inpath '/data/retail_db/order_items.csv' into table order_items;
select * from order_items;
-- Create an empty, structurally identical copy of order_items.
create table order_test2 like order_items;
-- Fill the copy using the FROM-first (multi-insert style) syntax.
from order_items
insert into table order_test2 select *;
-- Create two empty copies of order_items...
create table o_01 like order_items;
create table o_02 like order_items;
-- ...then populate both with a single scan of the source (multi-insert).
from order_items
insert into table o_01 select *
insert into table o_02 select *;
-- Export query results to a LOCAL directory.
-- NOTE(review): despite the '.txt' suffix, the path names a DIRECTORY —
-- Hive writes output files under it, and OVERWRITE wipes it first.
from order_items
insert overwrite local directory '/root/order_items.txt' select *;
-- Export the table (data plus metadata) to HDFS.
EXPORT TABLE order_items TO '/tmp/output1';
-- Top 10 orders by total item quantity.
-- Fix: replaced the meaningless alias 'e' with a descriptive name
-- (this is a standalone exercise query with no downstream consumers).
select order_item_order_id, sum(order_item_quantity) as total_quantity
from order_items o
group by order_item_order_id
order by total_quantity desc
limit 10;