SELECT projects the rows that match the specified query conditions.
A Hive SELECT query works much like its SQL counterpart.
CTE(Common Table Expression)
with t1 as (select ...)
select * from t1;
Nested subquery:
select * from (select * from employee) a;
A CTE and a nested subquery can be rewritten into each other.
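For example, the same filter written both ways — a minimal sketch assuming an employee table with name and salary columns (the threshold is illustrative):
--CTE form
with t1 as (select name, salary from employee where salary > 5000)
select * from t1;
--equivalent subquery form
select * from (select name, salary from employee where salary > 5000) a;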
JOIN refers to querying across multiple tables.
JOIN combines rows from two or more tables in a single query.
Similar to a SQL JOIN, but Hive supports only equi-joins, not non-equi joins.
Inner join: INNER JOIN
Outer join: OUTER JOIN
RIGHT JOIN, LEFT JOIN, FULL OUTER JOIN
Cross join: CROSS JOIN
Implicit join: comma-separated tables joined through a WHERE condition
JOIN is evaluated before the WHERE clause (see the ON-vs-WHERE sketch after the join results below).
Data:
stu
id,name
1,zhangsan
2,lisi
3,wangwu
score
id,scores
2,90
3,80
4,87
Results of each join type:
Inner join: only rows for which matching data exists in both joined tables are kept.
select * from stu join score on stu.id=score.id;
Result:
2,lisi 2,90
3,wangwu 3,80
Left outer join: returns all records from the table on the left of the JOIN operator that satisfy the WHERE clause.
select * from stu left join score on stu.id=score.id;
Result:
1,zhangsan null
2,lisi 90
3,wangwu 80
Right outer join: returns all records from the table on the right of the JOIN operator that satisfy the WHERE clause.
select * from stu right join score on stu.id=score.id;
Result:
2,lisi 90
3,wangwu 80
4,null 87
Full outer join: returns all records from both tables that satisfy the WHERE clause; whenever one side has no matching value for a column, NULL is used instead.
select * from stu full join score on stu.id=score.id;
Result:
1,zhangsan null
2,lisi 90
3,wangwu 80
4,null 87
Cross join (Cartesian product):
select * from stu cross join score;
-- the implicit form "select * from stu, score" is equivalent;
-- adding "where stu.id=score.id" would reduce it to an inner join
Result:
1,zhangsan 2,90
1,zhangsan 3,80
1,zhangsan 4,87
2,lisi 2,90
2,lisi 3,80
2,lisi 4,87
3,wangwu 2,90
3,wangwu 3,80
3,wangwu 4,87
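Because the ON condition is applied during the join while WHERE is applied afterwards, the placement of a filter changes an outer join's result — a sketch using the stu/score tables above (the threshold 85 is illustrative):
--filter in ON: unmatched left rows are kept, with NULL on the right
select * from stu left join score on stu.id = score.id and score.scores > 85;
--filter in WHERE: applied after the join, so the NULL rows are dropped
select * from stu left join score on stu.id = score.id where score.scores > 85;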
Implement each of the following requirements.
--Load the customers, departments, and products data into Hive
load data local inpath '/root/test/customers.csv' into table customers;
load data local inpath '/root/test/departments.csv' into table departments;
load data local inpath '/root/test/products.csv' into table products;
--Query customers whose state is 'NY' and whose city is 'New York'
select * from customers where customer_state='NY' and customer_city='New York';
--Count how many distinct customers have placed orders
select count(distinct order_customer_id) from orders;
--Query the first 5 products
select * from products limit 5;
--Query each customer's full name (with a CTE and with a subquery, as sketched below)
select concat(customer_fname,' ',customer_lname) from customers;
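Sketches of the CTE form and the subquery form the requirement asks for (full_names/full_name are illustrative names):
--CTE version
with full_names as (
  select concat(customer_fname, ' ', customer_lname) as full_name from customers
)
select full_name from full_names;
--subquery version
select a.full_name from (
  select concat(customer_fname, ' ', customer_lname) as full_name from customers
) a;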
--Use regular expressions to match the ID, name, and city columns of the customers table
select * from customers
where customer_id rlike '[1,2,3][4,5,6]'
and customer_fname rlike '^Ma+[a,r]'
and customer_lname rlike '^S'
and customer_state rlike '^N'
and customer_city rlike 'Green' ;
--Use a correlated query to find all customers without orders
select customer_fname from customers c where c.customer_id
not in
(select order_customer_id from orders); --remember: the customers table must be aliased or this query returns nothing!
Note: RLIKE supports Java regular expressions; a subquery must be given an alias.
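An equivalent anti-join form that avoids NOT IN — a sketch; the same LEFT JOIN ... IS NULL pattern appears in the set-difference example further below:
select c.customer_fname
from customers c
left join orders o on c.customer_id = o.order_customer_id
where o.order_customer_id is null;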
Map join: joining a small table to a large table; inside a map join, non-equi conditions are allowed.
Put the small table on the left and the large table on the right.
set hive.auto.convert.join=true;
With this setting, eligible joins are automatically converted to map joins at runtime.
A MAPJOIN is not supported:
after UNION ALL, LATERAL VIEW, GROUP BY/JOIN/SORT BY/CLUSTER BY/DISTRIBUTE BY
and similar operations;
before UNION, JOIN, or another MAPJOIN.
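Besides automatic conversion, a map join can be requested explicitly with a hint — a minimal sketch reusing the stu/score tables above, assuming stu is the small table:
select /*+ MAPJOIN(stu) */ stu.name, score.scores
from stu join score on stu.id = score.id;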
--UNION ALL keeps duplicate rows
select * from customers where customer_fname='Mary'
union all
select * from customers where customer_fname='Mary';
--Set difference (EXCEPT equivalent)
SELECT a.name FROM employee a LEFT JOIN employee_hr b
ON a.name = b.name WHERE b.name IS NULL;
--Intersection (INTERSECT equivalent)
SELECT a.name FROM employee a
JOIN employee_hr b ON a.name = b.name;
hive>load data [local] inpath '/opt/datas/student.txt' [overwrite] into table
student [partition (partcol1=val1,…)];
1) load data: load data
2) local: load from the local filesystem; without it, load from HDFS
3) inpath: the path of the data to load
4) into table: which table to load into
5) student: the specific table name
6) overwrite: overwrite the table's existing data; without it, data is appended
7) partition: load into the specified partition
Contents of student.txt:
1001 zhangsan
1002 lishi
1003 wangwu
--(1) Create a table
hive (default)> create table student(id string, name string) row format delimited
fields terminated by '\t';
--(2) Load a local file into Hive
hive (default)> load data local inpath '/opt/datas/student.txt' into table
student;
--(3) Load an HDFS file into Hive
-- Upload the file to HDFS
hive (default)> dfs -put /opt/datas/student.txt /kgc/hive;
-- Load the data from HDFS
hive (default)>load data inpath '/kgc/hive/student.txt' into table student;
--(4) Load data, overwriting the table's existing data
-- Upload the file to HDFS
hive (default)> dfs -put /opt/datas/student.txt /kgc/hive;
-- Load the data, overwriting the existing data
hive (default)>load data inpath '/kgc/hive/student.txt' overwrite into table
student;
INSERT OVERWRITE/INTO TABLE tablename1
[PARTITION (partcol1=val1, partcol2=val2 ...)]
select fields,... from tb_other;
Example:
Note: create table ctas_partitoned as select * from employee_partitioned;
A table created through CTAS is not partitioned, even though it selects directly from a partitioned table.
hive> desc ctas_partitoned;
OK
name string
employe_id int
number string
year int
month int
Time taken: 0.053 seconds, Fetched: 5 row(s)
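A workaround sketch: CREATE TABLE ... LIKE keeps the partition definition, and a dynamic-partition INSERT then fills the copy (the copy's name is illustrative):
set hive.exec.dynamic.partition.mode=nonstrict; --allow fully dynamic partition values
create table ctas_like_partitioned like employee_partitioned; --LIKE, unlike CTAS, keeps the partition columns
insert into table ctas_like_partitioned partition (year, month)
select * from employee_partitioned; --partition columns come last in select *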
insert into ctas_employee select * from employee;
from ctas_employee
insert overwrite table employee select *
insert overwrite table employee_internal select *;
from ctas_partitoned
insert into table employee_partitioned PARTITION (year, month)
select name,work_place,sex_age,skills_score,depart_title,'2018','09';
(INSERT INTO may omit the TABLE keyword)
insert into employee(name) select 'John' from test limit 1;
insert into table employee(name)values('July'),('John');
create table if not exists student3
as select id, name from student;
hive (default)> create table if not exists student5(
id int, name string)
row format delimited fields terminated by '\t'
location '/kgc/hive/student5';
--Upload the data
hive (default)> dfs -put /opt/datas/student.txt /kgc/hive/student5;
Use INSERT to write/export data to files.
Writing to files supports only OVERWRITE.
Multiple inserts from the same source table are supported.
LOCAL: write to the local filesystem.
By default, data is written in TEXT format with columns separated by ^A (Ctrl-A).
Custom delimiters are supported, so files can be exported in other formats such as CSV, JSON, etc.
from ctas_employee
insert overwrite local directory '/tmp/out1' select *
insert overwrite directory '/tmp/out1' select *
insert overwrite table employee_internal select *;
insert overwrite directory '/tmp/out3'
row format delimited fields terminated by ','
select * from ctas_employee;
hdfs dfs -getmerge <table_file_path>
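For example, to merge a multi-file export into a single local file (reusing the /tmp/out1 path from above; the local file name is illustrative):
hdfs dfs -getmerge /tmp/out1 /tmp/out1_merged.txt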
--Import data into the specified Hive table
hive (default)> import table student2 partition(month='201709') from
'/user/hive/warehouse/export/student';
IMPORT TABLE employee FROM '/tmp/output3';
IMPORT TABLE employee_partitioned partition (year=2014, month=11) FROM '/tmp/output5';
Exporting to the local filesystem requires LOCAL; exporting to HDFS does not.
--Export query results to the local filesystem
hive (default)> insert overwrite local directory '/opt/datas/export/student' select * from student;
--Export query results, formatted, to the local filesystem
hive (default)> insert overwrite local directory '/opt/datas/export/student1'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY '\n'
select * from student;
--Export query results to HDFS (no LOCAL)
hive (default)> insert overwrite directory '/user/hive/warehouse/student2'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY '\n'
select * from student;
hive (default)> hdfs dfs -get /user/hive/warehouse/student/month=201709/000000_0
/opt/datas/export/student3.txt;
Basic syntax: hive -e '<statement>' or hive -f <script>, redirected with > file
[hadoop@hadoop00 hive]$ bin/hive -e 'select * from default.student;' > /opt/datas/export/student4.txt;
EXPORT TABLE employee TO '/tmp/output3';
EXPORT TABLE employee_partitioned partition (year=2014, month=11) TO '/tmp/output5';
Use Sqoop (e.g., to export to a relational database).
hive (default)> truncate table student;
Note: TRUNCATE only deletes data from managed tables; it cannot delete data from external tables.
Note:
The difference between IMPORT and EXPORT: EXPORT writes a table's (or partition's) data together with its metadata to an HDFS path; IMPORT recreates the table or partition from such a path.
ORDER BY: global sort using a single reducer; with large inputs this takes a long time, but the result is globally ordered.
set hive.groupby.orderby.position.alias=true;
select * from employee_hr order by employe_id desc limit 1;
--separates rows with employe_id > 95 from the rest
select * from employee_hr
order by case when employe_id>95 then 1 else 0 end desc
limit 10;
--rows flagged 1 all have employe_id > 95, but are not sorted among themselves
--rows flagged 0 all have employe_id <= 95, likewise unsorted among themselves
select * from employee_hr order by 1; --with position aliases enabled, "order by 1" sorts the whole table by its first column
1) Difference: under hive.mapred.mode = strict, ORDER BY must be accompanied by LIMIT, otherwise the query fails.
2) Error message:
set hive.mapred.mode=strict;
hive> select * from test order by id;
FAILED: Error in semantic analysis: 1:28 In strict mode, if ORDER BY is specified,
LIMIT must also be specified. Error encountered near token 'id'
3) Reason:
With ORDER BY, all data is sent to a single server for the reduce step, i.e. there is only one reducer. With a large data volume, that reducer may never manage to produce a result. With "limit n", only n * (number of mappers) rows reach the reduce side, which a single reducer can handle.
SORT BY: sorts within each reducer; the overall result set is not necessarily ordered (see the sketch below).
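A minimal SORT BY sketch against the emp table used just below: with 3 reducers, each reducer's output is sorted, but the combined result is not globally ordered.
set mapreduce.job.reduces=3;
select * from emp sort by empno desc;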
DISTRIBUTE BY: similar to the partitioner in MapReduce; distributes rows across reducers, and is typically combined with SORT BY.
--Distribute by department number, then sort by employee number descending
hive (default)> set mapreduce.job.reduces=3;
hive (default)> select * from emp distribute by deptno sort by empno desc;
CLUSTER BY = DISTRIBUTE BY + SORT BY (usable when the DISTRIBUTE BY and SORT BY columns are the same)
-- The following two statements are equivalent
select * from emp cluster by empno;
select * from emp distribute by empno sort by empno;
--Load order_items.csv into the order_items table with LOAD
load data inpath '/data/retail_db/order_items.csv' into table order_items;
--Load the data from order_items into order_test2
create table order_test2 like order_items;
from order_items
insert into table order_test2 select *;
--Load the data from order_items into both o_01 and o_02 at once (multi-insert)
create table o_01 like order_items;
create table o_02 like order_items;
from order_items
insert into table o_01 select *
insert into table o_02 select *;
--Export the data from order_items to the local filesystem and to HDFS
from order_items
insert overwrite local directory '/root/order_items.txt' select *;
EXPORT TABLE order_items TO '/tmp/output1';
--Rank orders in order_items by total quantity (top 10)
select order_item_order_id, sum(order_item_quantity) e
from order_items o
group by order_item_order_id
order by e desc
limit 10;
GROUP BY is used to group rows.
set hive.groupby.orderby.position.alias=true; --allow referring to columns by position in GROUP BY/ORDER BY
case when <condition> then <value when it holds>
     when <condition> then <value when it holds>
     else <value when nothing holds>
end
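A minimal CASE WHEN sketch against the employee_contract table used later in this section (the thresholds and labels are illustrative):
select name, salary,
  case when salary >= 6000 then 'high'
       when salary >= 5000 then 'medium'
       else 'low'
  end as salary_level
from employee_contract;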
select name,count(1) from employee group by name
having count(1)>1;
--Using HAVING
select sex_age.age from employee group by sex_age.age having count(*) <= 1;
--Using a subquery instead of HAVING
select a.age from ( select count(*) as cnt, sex_age.age
from employee group by sex_age.age ) a where a.cnt <= 1;
PV (page views): total site views — count(1)
UV (unique visitors): number of distinct visiting users — count(distinct user_id)
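A sketch against a hypothetical web_logs(user_id, url) table:
select count(1) as pv, count(distinct user_id) as uv
from web_logs;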
Window functions are a special class of SQL functions. Like aggregate functions, a window function takes multiple rows as input. The difference is that an aggregate function operates on the groups produced by a GROUP BY clause, whereas a window function operates on a window — a set of rows defined by an OVER clause.
An aggregate function outputs one result per group it operates on;
a window function outputs one result per row of the window it operates on.
Function (arg1,..., arg n) OVER ([PARTITION BY <...>] [ORDER BY <....>] [<window_clause>])
Syntax notes:
By function, window functions fall into three groups: ranking, aggregate, and analytic.
Ranking functions:
row_number(): unique, consecutive numbers over all values, no duplicates
rank(): equal values get the same number, and the next number is skipped (1,1,3)
dense_rank(): equal values get the same number, and the next number is consecutive (1,1,2)
ntile(n): distributes the ordered rows evenly into n buckets and assigns each row its bucket number; the bucket number lets you select the top or bottom n-th fraction of the rows
percent_rank(): (current rank - 1) / (total rows - 1) — the percentage rank of a value relative to the set
value | row_number() | rank() | dense_rank()
---|---|---|---
90 | 1 | 1 | 1
90 | 2 | 1 | 1
80 | 3 | 3 | 2
Examples:
--row_number()(1,2,3)
select name,dept_num,employee_id,salary,
row_number() over(partition by dept_num order by salary desc) rn
from employee_contract;
--rank()(1,1,3)
select name,dept_num,employee_id,salary,
rank() over(partition by dept_num order by salary desc) rn
from employee_contract;
--dense_rank(): no gaps after ties (1,1,2)
select name,dept_num,employee_id,salary,
dense_rank() over(partition by dept_num order by salary desc) rn
from employee_contract;
--ntile
select name,dept_num,salary,
ntile(2) over(partition by dept_num order by salary desc) as nlite
from employee_contract;
--percent_rank
select name,dept_num,salary,
percent_rank() over(order by salary ) as pr
from employee_contract;
COUNT: counts rows; can be combined with DISTINCT, e.g.:
SELECT
COUNT(DISTINCT a) OVER (PARTITION BY c ORDER BY d ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)
SUM: totals
AVG: averages
MAX/MIN: maximum/minimum values
Since Hive 2.1.0, aggregate functions with DISTINCT are supported in the OVER clause.
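A complete sketch of a windowed DISTINCT count on employee_contract (requires Hive 2.1.0+):
select name, dept_num, salary,
  count(distinct salary) over (partition by dept_num) as distinct_salaries
from employee_contract;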
Example:
SELECT
name, dept_num, salary,
COUNT(*) OVER (PARTITION BY dept_num) AS row_cnt,
--COUNT(DISTINCT *) OVER (PARTITION BY dept_num) AS row_cnt_dis,
SUM(salary) OVER(PARTITION BY dept_num ORDER BY dept_num) AS deptTotal, --accumulates only within the partition
SUM(salary) OVER(ORDER BY dept_num) AS runningTotal1, --each new dept_num adds that whole department's total
SUM(salary) OVER(ORDER BY dept_num, name rows unbounded preceding) AS runningTotal2, --each new name adds that row's salary
AVG(salary) OVER(PARTITION BY dept_num) AS avgDept,
MIN(salary) OVER(PARTITION BY dept_num) AS minDept,
MAX(salary) OVER(PARTITION BY dept_num) AS maxDept
FROM employee_contract
ORDER BY dept_num, name;
CUME_DIST: (number of rows less than or equal to the current value) / (total rows in the group).
For example, cume_dist at salary 6000 = (rows with salary <= 6000) / (total rows).
--cume_dist: rows <= current value / total rows in the group
SELECT
name, dept_num, salary,
cume_dist()over(partition by dept_num order by salary) as cume_dist
from employee_contract;
SELECT
name, dept_num, salary,
cume_dist()over( ) as cume_dist
from employee_contract;
LEAD/LAG: the value of a column n rows after/before the current row (n is optional, default 1; LAG also accepts a default value as a third argument).
--lead
SELECT
name, dept_num, salary,
lead(salary,2)over(partition by dept_num order by salary) as lead
from employee_contract;
--lead returns the salary two rows ahead within the group, so the last two rows of each group are always NULL
+----------+-----------+---------+-------+--+
| name | dept_num | salary | lead |
+----------+-----------+---------+-------+--+
| Wendy | 1000 | 4000 | 5000 |
| Will | 1000 | 4000 | 5500 |
| Michael | 1000 | 5000 | 6400 |
| Lucy | 1000 | 5500 | NULL |
| Steven | 1000 | 6400 | NULL |
| Lily | 1001 | 5000 | 6400 |
| Jess | 1001 | 6000 | NULL |
| Mike | 1001 | 6400 | NULL |
| Yun | 1002 | 5500 | 8000 |
| Wei | 1002 | 7000 | NULL |
| Richard | 1002 | 8000 | NULL |
+----------+-----------+---------+-------+--+
--lag
SELECT
name, dept_num, salary,
lag(salary,2)over(partition by dept_num order by salary) as lag
from employee_contract;
--lag returns the salary two rows back within the group, so the first two rows of each group are always NULL
+----------+-----------+---------+-------+--+
| name | dept_num | salary | lag |
+----------+-----------+---------+-------+--+
| Wendy | 1000 | 4000 | NULL |
| Will | 1000 | 4000 | NULL |
| Michael | 1000 | 5000 | 4000 |
| Lucy | 1000 | 5500 | 4000 |
| Steven | 1000 | 6400 | 5000 |
| Lily | 1001 | 5000 | NULL |
| Jess | 1001 | 6000 | NULL |
| Mike | 1001 | 6400 | 5000 |
| Yun | 1002 | 5500 | NULL |
| Wei | 1002 | 7000 | NULL |
| Richard | 1002 | 8000 | 5500 |
+----------+-----------+---------+-------+--+
FIRST_VALUE: the first value of the column in the window so far.
LAST_VALUE: the last value in the window up to the current row.
--first_value,last_value
SELECT
name, dept_num, salary,
first_value(salary)over(partition by dept_num order by salary) as first_value,
last_value(salary)over(partition by dept_num order by salary) as last_value
from employee_contract;
+----------+-----------+---------+--------------+-------------+--+
| name | dept_num | salary | first_value | last_value |
+----------+-----------+---------+--------------+-------------+--+
| Wendy | 1000 | 4000 | 4000 | 4000 |
| Will | 1000 | 4000 | 4000 | 4000 |
| Michael | 1000 | 5000 | 4000 | 5000 |
| Lucy | 1000 | 5500 | 4000 | 5500 |
| Steven | 1000 | 6400 | 4000 | 6400 |
| Lily | 1001 | 5000 | 5000 | 5000 |
| Jess | 1001 | 6000 | 5000 | 6000 |
| Mike | 1001 | 6400 | 5000 | 6400 |
| Yun | 1002 | 5500 | 5500 | 5500 |
| Wei | 1002 | 7000 | 5500 | 7000 |
| Richard | 1002 | 8000 | 5500 | 8000 |
+----------+-----------+---------+--------------+-------------+--+
--first_value: the first row of the current group
--last_value: with the default frame, the last row up to the current row, i.e. the current row itself
SELECT
name, dept_num, salary,
--salary two rows ahead
LEAD(salary, 2) OVER(PARTITION BY dept_num ORDER BY salary) AS lead,
--salary two rows back (default 0 when out of range)
LAG(salary, 2, 0) OVER(PARTITION BY dept_num ORDER BY salary) AS lag,
FIRST_VALUE(salary) OVER (PARTITION BY dept_num ORDER BY salary) AS first_value,
LAST_VALUE(salary) OVER (PARTITION BY dept_num ORDER BY salary) AS last_value_default,
LAST_VALUE(salary) OVER (PARTITION BY dept_num ORDER BY salary RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS last_value
FROM employee_contract
ORDER BY dept_num, salary;
Window frames: ROWS BETWEEN
--Note: partitioned by dept_num and ordered by name
SELECT
name, dept_num AS dept, salary AS sal,
--max salary from 2 rows before the current row to the current row (compared within the partition)
MAX(salary) OVER (PARTITION BY dept_num ORDER BY name ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) win1,
--max salary from 2 rows before the current row to the last row
MAX(salary) OVER (PARTITION BY dept_num ORDER BY name ROWS BETWEEN 2 PRECEDING AND UNBOUNDED FOLLOWING) win2,
--max salary from 1 row before to 2 rows after the current row
MAX(salary) OVER (PARTITION BY dept_num ORDER BY name ROWS BETWEEN 1 PRECEDING AND 2 FOLLOWING) win3,
--max salary from 2 rows before to 1 row before the current row
MAX(salary) OVER (PARTITION BY dept_num ORDER BY name ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING) win4,
--max salary from 1 row after to 2 rows after the current row
MAX(salary) OVER (PARTITION BY dept_num ORDER BY name ROWS BETWEEN 1 FOLLOWING AND 2 FOLLOWING) win5,
--max salary from the current row to the current row (the current row itself)
MAX(salary) OVER (PARTITION BY dept_num ORDER BY name ROWS BETWEEN CURRENT ROW AND CURRENT ROW) win6,
--max salary from the current row to 1 row after it
MAX(salary) OVER (PARTITION BY dept_num ORDER BY name ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING) win7,
--max salary from the current row to the last row
MAX(salary) OVER (PARTITION BY dept_num ORDER BY name ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) win8,
--max salary from the first row to the current row
MAX(salary) OVER (PARTITION BY dept_num ORDER BY name ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) win9,
--max salary from the first row to 1 row after the current row
MAX(salary) OVER (PARTITION BY dept_num ORDER BY name ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING) win10,
--max salary from the first row to the last row
MAX(salary) OVER (PARTITION BY dept_num ORDER BY name ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) win11,
--max salary from 2 rows before the current row to the current row (shorthand for win1)
MAX(salary) OVER (PARTITION BY dept_num ORDER BY name ROWS 2 PRECEDING) win12
FROM employee_contract ORDER BY dept, name;
-- If the current row's close value is 3000, the frame covers the partition's rows with close between 2500 and 4000
SUM(close) OVER (ORDER BY close RANGE BETWEEN 500 PRECEDING AND 1000 FOLLOWING)
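Building on that, a sketch against employee_contract — sum the salaries within 500 below and 1000 above the current row's salary, per department (offsets are illustrative):
select name, dept_num, salary,
  sum(salary) over (partition by dept_num order by salary
                    range between 500 preceding and 1000 following) as near_salary_sum
from employee_contract;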
--Top 10 best-selling products in order_items
select order_item_product_id,sum(order_item_quantity) e
from order_items
group by order_item_product_id
order by e desc
limit 10;
--Count the product subcategories under each product category
select product_category_id,count(product_name) c
from products
group by product_category_id
order by product_category_id;
--Rank the subcategories within each category by product id
--using the ROW_NUMBER() function
select product_category_id,product_id,
row_number()over(partition by product_category_id order by product_id)
from products;
--For each order in order_items: number of distinct products, order total, and the max/min/average order total across all orders
select order_item_order_id, count(distinct order_item_product_id),
sum(order_item_subtotal),
max(sum(order_item_subtotal)) over () max_total,
min(sum(order_item_subtotal)) over () min_total,
avg(sum(order_item_subtotal)) over () avg_total
from order_items
group by order_item_order_id;