-- Smallest possible query: a literal with no FROM clause.
SELECT 1;
-- Syntax template (not runnable as-is): the brackets mark DISTINCT as optional.
SELECT [DISTINCT] column_name_list FROM table_name;
-- Every column of every row in the table.
SELECT * FROM table_name;
-- At most five employee rows whose name is not 'Lucy'.
SELECT * FROM employee WHERE name!='Lucy' LIMIT 5;
-- CTE (common table expression) syntax.
-- Template (not runnable as-is; the ellipsis stands for any query body).
WITH t1 AS (SELECT …) SELECT * FROM t1;
-- Worked example: each score as a percentage of its class total.
-- a1 = total score per class; a2 = user rows joined to their score rows.
-- The percentage column is aliased zb, matching the nested-query version of this report.
WITH
a1 AS (
    SELECT className, SUM(score) countScore
    FROM score
    GROUP BY className
),
a2 AS (
    SELECT u.*, s.className, s.score
    FROM userinfos u
    INNER JOIN score s ON u.userid = s.userid
)
SELECT
    a2.username,
    a2.className,
    a2.score,
    (a2.score / a1.countScore * 100) zb
FROM a1
INNER JOIN a2 ON a1.className = a2.className;
-- Nested (derived-table) query examples.
-- The subquery in FROM is aliased as a.
SELECT *
FROM (
    SELECT * FROM employee
) a;
-- l = total score per class, r = user rows joined to their score rows;
-- zb = a score expressed as a percentage of its class total.
SELECT
    r.username,
    r.className,
    r.score,
    (r.score / l.countScore * 100) zb
FROM (
    SELECT className, SUM(score) countScore
    FROM score
    GROUP BY className
) l
INNER JOIN (
    SELECT u.*, s.className, s.score
    FROM userinfos u
    INNER JOIN score s ON u.userid = s.userid
) r ON l.className = r.className;
-- With quoted identifiers set to none, a backquoted name in the SELECT list
-- is treated as a regular expression over column names.
SET hive.support.quoted.identifiers = none;
-- Columns of offers whose names match ^o.*
SELECT
    `^o.*`
FROM offers;
-- Columns of userinfos whose names match a.*
-- The original note says this fetches only the age attribute (presumably the
-- only matching column in that table - verify against the table definition).
SELECT
    `a.*`
FROM userinfos;
两个连续下划线,用于数据验证
INPUT__FILE__NAME:Mapper Task的输入文件名称
BLOCK__OFFSET__INSIDE__FILE:当前全局文件位置
Hive——join的使用
常见的join有:
数据表格包以及建表文档:
链接:https://pan.baidu.com/s/14eiNjfvgAVX-ndUh6-CEkw 提取码:u62m
数据表格
建表语句
-- External table over the comma-separated files under /data/retail_db/customers.
-- NOTE(review): the Hive docs state that OpenCSVSerde reports every column as
-- STRING regardless of the declared type - confirm before relying on INT here.
CREATE EXTERNAL TABLE IF NOT EXISTS customers (
    customer_id       INT,
    customer_fname    VARCHAR(45),
    customer_lname    VARCHAR(45),
    customer_email    VARCHAR(45),
    customer_password VARCHAR(45),
    customer_street   VARCHAR(255),
    customer_city     VARCHAR(45),
    customer_state    VARCHAR(45),
    customer_zipcode  VARCHAR(45)
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES ("separatorChar"=",")
LOCATION '/data/retail_db/customers';
-- External table over the comma-separated files under /data/retail_db/categories.
-- NOTE(review): the Hive docs state that OpenCSVSerde reports every column as
-- STRING regardless of the declared type - confirm before relying on INT here.
CREATE EXTERNAL TABLE IF NOT EXISTS categories (
    category_id            INT,
    category_department_id INT,
    category_name          VARCHAR(45)
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES ("separatorChar"=",")
LOCATION '/data/retail_db/categories';
-- External table over the comma-separated files under /data/retail_db/departments.
-- NOTE(review): the Hive docs state that OpenCSVSerde reports every column as
-- STRING regardless of the declared type - confirm before relying on INT here.
CREATE EXTERNAL TABLE IF NOT EXISTS departments (
    department_id   INT,
    department_name VARCHAR(45)
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES ("separatorChar"=",")
LOCATION '/data/retail_db/departments';
-- External table over the comma-separated files under /data/retail_db/order_items.
-- NOTE(review): the Hive docs state that OpenCSVSerde reports every column as
-- STRING regardless of the declared type - confirm before doing arithmetic on
-- the INT/FLOAT columns without an explicit CAST.
CREATE EXTERNAL TABLE IF NOT EXISTS order_items (
    order_item_id            INT,
    order_item_order_id      INT,
    order_item_product_id    INT,
    order_item_quantity      INT,
    order_item_subtotal      FLOAT,
    order_item_product_price FLOAT
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES ("separatorChar"=",")
LOCATION '/data/retail_db/order_items';
-- External table over the comma-separated files under /data/retail_db/orders.
-- NOTE(review): the Hive docs state that OpenCSVSerde reports every column as
-- STRING regardless of the declared type - confirm before relying on the
-- INT/DATE declarations here.
CREATE EXTERNAL TABLE IF NOT EXISTS orders (
    order_id          INT,
    order_date        DATE,
    order_customer_id INT,
    order_status      VARCHAR(45)
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES ("separatorChar"=",")
LOCATION '/data/retail_db/orders';
-- External table over the comma-separated files under /data/retail_db/products.
-- NOTE(review): the Hive docs state that OpenCSVSerde reports every column as
-- STRING regardless of the declared type - confirm before relying on the
-- INT/FLOAT declarations here.
CREATE EXTERNAL TABLE IF NOT EXISTS products (
    product_id          INT,
    product_category_id INT,
    product_name        VARCHAR(45),
    product_description VARCHAR(255),
    product_price       FLOAT,
    product_image       VARCHAR(255)
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES ("separatorChar"=",")
LOCATION '/data/retail_db/products';
# Create the HDFS directories that the external tables' LOCATION clauses point at.
# -p is used on every mkdir so a rerun does not fail on an already existing directory.
[root@zjw opt]# hdfs dfs -mkdir -p /data
[root@zjw opt]# hdfs dfs -mkdir -p /data/retail_db/customers
[root@zjw opt]# hdfs dfs -mkdir -p /data/retail_db/categories
[root@zjw opt]# hdfs dfs -mkdir -p /data/retail_db/departments
[root@zjw opt]# hdfs dfs -mkdir -p /data/retail_db/order_items
[root@zjw opt]# hdfs dfs -mkdir -p /data/retail_db/orders
[root@zjw opt]# hdfs dfs -mkdir -p /data/retail_db/products
# Upload each local CSV file into the directory of the table with the same name.
[root@zjw opt]# hdfs dfs -put /opt/retail_db-csv/customers.csv /data/retail_db/customers
[root@zjw opt]# hdfs dfs -put /opt/retail_db-csv/categories.csv /data/retail_db/categories
[root@zjw opt]# hdfs dfs -put /opt/retail_db-csv/departments.csv /data/retail_db/departments
[root@zjw opt]# hdfs dfs -put /opt/retail_db-csv/order_items.csv /data/retail_db/order_items
[root@zjw opt]# hdfs dfs -put /opt/retail_db-csv/orders.csv /data/retail_db/orders
[root@zjw opt]# hdfs dfs -put /opt/retail_db-csv/products.csv /data/retail_db/products
然后建表塞数据:
hive> create database myexp;
OK
Time taken: 4.162 seconds
hive> use myexp;
OK
Time taken: 0.025 seconds
hive> CREATE EXTERNAL TABLE IF NOT EXISTS customers (
> customer_id int,
> customer_fname varchar(45),
> customer_lname varchar(45),
> customer_email varchar(45),
> customer_password varchar(45),
> customer_street varchar(255),
> customer_city varchar(45),
> customer_state varchar(45),
> customer_zipcode varchar(45)
> )
> ROW FORMAT serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
> with serdeproperties ("separatorChar"=",")
> LOCATION '/data/retail_db/customers';
OK
Time taken: 0.177 seconds
hive> CREATE EXTERNAL TABLE IF NOT EXISTS categories (
> category_id int,
> category_department_id int,
> category_name varchar(45)
> )
> ROW FORMAT serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
> with serdeproperties ("separatorChar"=",")
> LOCATION '/data/retail_db/categories';
OK
Time taken: 0.053 seconds
hive> CREATE EXTERNAL TABLE IF NOT EXISTS departments (
> department_id int,
> department_name varchar(45)
> )
> ROW FORMAT serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
> with serdeproperties ("separatorChar"=",")
> LOCATION '/data/retail_db/departments';
OK
Time taken: 0.059 seconds
hive>
> CREATE EXTERNAL TABLE IF NOT EXISTS order_items (
> order_item_id int,
> order_item_order_id int,
> order_item_product_id int,
> order_item_quantity int,
> order_item_subtotal float,
> order_item_product_price float)
> ROW FORMAT serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
> with serdeproperties ("separatorChar"=",")
> LOCATION '/data/retail_db/order_items';
OK
Time taken: 0.041 seconds
hive>
> CREATE EXTERNAL TABLE IF NOT EXISTS orders (
> order_id int,
> order_date date,
> order_customer_id int,
> order_status varchar(45)
> )
> ROW FORMAT serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
> with serdeproperties ("separatorChar"=",")
> LOCATION '/data/retail_db/orders';
OK
Time taken: 0.055 seconds
hive> CREATE EXTERNAL TABLE IF NOT EXISTS products (
> product_id int,
> product_category_id int,
> product_name varchar(45),
> product_description varchar(255),
> product_price float,
> product_image varchar(255))
> ROW FORMAT serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
> with serdeproperties ("separatorChar"=",")
> LOCATION '/data/retail_db/products';
OK
Time taken: 0.082 seconds
1、查询顾客表中所在州为“NY”、所在城市为“New York”的用户
-- Customers whose state is 'NY' and whose city is 'New York'.
-- (The interactive "hive> " prompt is dropped so the line is a plain statement.)
SELECT *
FROM customers
WHERE customer_state = 'NY'
  AND customer_city = 'New York';
2、查询订单表中共有多少不同顾客下过订单
多种写法
方法1:
-- String concatenation function: concat(str1, str2, ...).
-- Customers that appear in orders, with first and last name joined by '-'.
-- b = one row per distinct order_customer_id.
SELECT
    b.order_customer_id,
    CONCAT(a.customer_fname, '-', a.customer_lname)
FROM customers a
INNER JOIN (
    SELECT order_customer_id
    FROM orders
    GROUP BY order_customer_id
) b ON a.customer_id = b.order_customer_id;
方法2:(with as)
-- CTE variant: t1 holds one row per customer id found in orders,
-- then each id is joined to its customer row to build "first.last".
WITH t1 AS (
    SELECT order_customer_id
    FROM orders
    GROUP BY order_customer_id
)
SELECT
    c.customer_id,
    CONCAT(c.customer_fname, '.', c.customer_lname) AS name
FROM t1 t
INNER JOIN customers c ON t.order_customer_id = c.customer_id;
方法3:
-- Derived-table variant: t is the de-duplicated list of customer ids in orders.
SELECT
    c.customer_id,
    CONCAT(c.customer_fname, '.', c.customer_lname) AS name
FROM (
    SELECT order_customer_id
    FROM orders
    GROUP BY order_customer_id
) t
INNER JOIN customers c ON t.order_customer_id = c.customer_id;
方法4:
-- IN variant: keep customers whose id appears in the grouped list of order customer ids.
SELECT
    c.customer_id,
    CONCAT(c.customer_fname, '.', c.customer_lname) AS name
FROM customers c
WHERE c.customer_id IN (
    SELECT order_customer_id
    FROM orders
    GROUP BY order_customer_id
);
方法5:(exists)
-- EXISTS variant: keep a customer when at least one orders row references it.
SELECT
    c.customer_id,
    CONCAT(c.customer_fname, '.', c.customer_lname) AS name
FROM customers c
WHERE EXISTS (
    SELECT order_customer_id
    FROM orders s
    WHERE s.order_customer_id = c.customer_id
);
方法6:
-- DISTINCT variant: t1 de-duplicates the customer ids in orders with
-- SELECT DISTINCT instead of GROUP BY, then joins them to customers.
WITH t1 AS (
    SELECT DISTINCT order_customer_id
    FROM orders
)
SELECT
    c.customer_id,
    CONCAT(c.customer_fname, '.', c.customer_lname) AS name
FROM t1 t
INNER JOIN customers c ON t.order_customer_id = c.customer_id;
-- Peek at up to five product rows.
SELECT *
FROM products
LIMIT 5;
-- Customers for which no orders row references their customer_id.
SELECT
    c.customer_id,
    CONCAT(c.customer_fname, '.', c.customer_lname) AS name
FROM customers c
WHERE NOT EXISTS (
    SELECT order_customer_id
    FROM orders s
    WHERE s.order_customer_id = c.customer_id
);
MapJoin是Hive的一种优化操作,其适用于小表JOIN大表的场景,由于表的JOIN操作是在Map端且在内存进行的,所以其并不需要启动Reduce任务也就不需要经过shuffle阶段,从而能在一定程度上节省资源提高JOIN效率
方法一:
在Hive0.11前,必须使用MAPJOIN标记来显式地启动该优化操作,由于其需要将小表加载进内存,所以要注意小表的大小
-- Explicit map-join request: the hint names smalltable as the table to map-join.
-- key exists on both sides of the join, so it must carry a table qualifier.
-- NOTE(review): value is left unqualified as in the original; qualify it too
-- if both tables have a column of that name.
SELECT /*+ MAPJOIN(smalltable)*/ smalltable.key,value
FROM smalltable JOIN bigtable ON smalltable.key = bigtable.key;
方法二:
在Hive0.11后,Hive默认启动该优化,也就是不再需要显式地使用MAPJOIN标记,其会在必要的时候触发该优化操作,将普通JOIN转换成MapJoin,可以通过以下两个属性来设置该优化的触发时机
hive.auto.convert.join
默认值为true,自动开启MAPJOIN优化
hive.mapjoin.smalltable.filesize
默认值为25000000(约25MB),通过配置该属性来确定使用该优化的表的大小,如果表的大小小于此值就会被加载进内存中
注意:使用默认启动该优化的方式如果出现莫名其妙的BUG(比如MAPJOIN并不起作用),就将hive.auto.convert.join和hive.ignore.mapjoin.hint这两个属性置为false,手动使用MAPJOIN标记来启动该优化
-- Two chained map-join hints: the inner query joins bigTable to smallTableOne,
-- and the outer query joins that result (aliased firstjoin) to smallTableTwo.
SELECT /*+MAPJOIN(smallTableTwo)*/ idOne, idTwo, value
FROM (
    SELECT /*+MAPJOIN(smallTableOne)*/ idOne, idTwo, value
    FROM bigTable
    JOIN smallTableOne ON (bigTable.idOne = smallTableOne.idOne)
) firstjoin
JOIN smallTableTwo ON (firstjoin.idTwo = smallTableTwo.idTwo)
但是,如果使用的是方法二(即没有MAPJOIN标记,由Hive自动转换),则以上查询语句将会被作为两个MJ执行;进一步的,如果预先知道表大小是能够被加载进内存的,则可以通过以下属性来将两个MJ合并成一个MJ
hive.auto.convert.join.noconditionaltask:Hive在基于输入文件大小的前提下将普通JOIN转换成MapJoin,并是否将多个MJ合并成一个
hive.auto.convert.join.noconditionaltask.size:多个MJ合并成一个MJ时,其表的总的大小须小于该值,同时hive.auto.convert.join.noconditionaltask必须为true
关于map join的一点小坑
map join虽然很好,但是会有如下问题:
1)map join关联多个小表时,都放入内存,则考虑内存大小需要针对上述小表大小进行累加
2)大表B表map join关联分区小表A表(200M)时,即使限制了A的分区(取10M),但依旧放入内存的大小依旧是A表的原先大小(200M)
-- MINUS emulation: names in employee that have no match in employee_hr.
-- The LEFT JOIN keeps every employee row; the IS NULL filter then keeps only
-- the rows for which no employee_hr row was joined.
SELECT a.name
FROM employee a
LEFT JOIN employee_hr b
ON a.name = b.name
WHERE b.name IS NULL;
-- INTERSECT emulation: names that occur in both employee and employee_hr.
-- Unlike a true INTERSECT, the join does not remove duplicate names.
SELECT a.name
FROM employee a
JOIN employee_hr b
ON a.name = b.name;
如果还需要对UNION的结果集进行一些其他的处理,整个语句表达式可以嵌入到FROM子句中,如下所示:
-- Template (not runnable as-is): select_statement stands for any query.
-- The whole UNION ALL is wrapped as a FROM-clause subquery with an alias so
-- its combined rows can be processed further by the outer query.
SELECT *
FROM (
    select_statement
    UNION ALL
    select_statement
) unionResultAlias
例如,假设我们有两个不同的表分别表示哪个用户发布了一个视频,以及哪个用户发布了一个评论,那么下面的查询将UNION ALL的结果与用户表join在一起,为所有视频发布和评论发布创建一个注释流:
-- For every video or comment posted on 2008-06-03: the posting user's id and
-- the action date.
-- Both UNION ALL branches project the date column as well as uid, because the
-- outer query reads actions.date; projecting only uid would leave that
-- reference unresolved.
SELECT u.id, actions.date
FROM (
    SELECT av.uid AS uid, av.date
    FROM action_video av
    WHERE av.date = '2008-06-03'
    UNION ALL
    SELECT ac.uid AS uid, ac.date
    FROM action_comment ac
    WHERE ac.date = '2008-06-03'
) actions JOIN users u ON (u.id = actions.uid);
-- Sort on a computed key: the CASE yields 1 when offerid = 1 and 0 otherwise,
-- so with the default ascending order the offerid = 1 rows come last.
SELECT *
FROM offers
ORDER BY CASE WHEN offerid = 1 THEN 1 ELSE 0 END;
-- ORDER BY a number instead of a column name.
-- NOTE(review): whether 1 is read as "first select column" or as a constant
-- depends on the Hive version/settings - confirm on the target cluster.
SELECT *
FROM offers
ORDER BY 1;
举例使用:
创表:
-- Demo table used to show how rows without a name are ordered.
CREATE TABLE myniltab (
    mntid   INT,
    nmtname STRING
);
-- Rows 1 and 2 carry a name.
INSERT INTO myniltab (mntid, nmtname)
VALUES (1, 'hehe'), (2, 'xixi');
-- Rows 3 and 4 supply only the id; nmtname is not given (presumably stored as NULL).
INSERT INTO myniltab (mntid)
VALUES (3), (4);
INSERT INTO myniltab (mntid, nmtname)
VALUES (5, 'chacha');
-- Plain descending sort on the name column.
SELECT *
FROM myniltab
ORDER BY nmtname DESC;
-- Sort key is 1 for NULL names and 0 otherwise, so NULL names sort last.
SELECT *
FROM myniltab
ORDER BY CASE WHEN nmtname IS NULL THEN 1 ELSE 0 END;
-- Request two reduce tasks for the queries that follow.
SET mapred.reduce.tasks=2;
如图已经开启了两个reduce task
distribute by:
-- DISTRIBUTE BY chooses the column used to spread rows across reducers;
-- SORT BY then orders the rows by nmtname within each reducer.
SELECT *
FROM myniltab
DISTRIBUTE BY mntid
SORT BY nmtname;
用法:
-- CLUSTER BY name: distribute and sort on the same column in one clause.
SELECT
    name,
    employee_id
FROM employee_hr
CLUSTER BY name;