hive
------------------
数据仓库,OLAP,分析处理,存储和分析,延迟较高。
数据库: OLTP,在线事务处理,低延迟,事务支持。
运行在hadoop,类SQL方法方式运行,SQL(HiveQL,HQL),MR运算。
操纵的结构化数据。
schema(模式,元信息存放到数据库中),HDFS文件。derby,mysql。
数据库和表都是路径。
hive
------------------
类似mysql
[配置hive]
1.conf/hive-env.sh
HADOOP_HOME=...
//不配也可以
2.conf/hive-site.xml
${system:java.io.tmpdir}
//配置本地临时目录
$hive>schematool -initSchema -dbType derby
//初始化模式
$hive>hive
//进入hive命令行(CLI)
hive常用命令
----------------------
$hive>!clear ;
//hive中执行shell命令
$hive>dfs -lsr / ;
//hive中执行hdfs命令
$hive>create table hive1.t as select * from hive2.t2 ;
//复制表
将hive中的schema存放到外部的mysql中
---------------------------------------
1.编写hive-site.xml,添加mysql连接信息
[hive/conf/hive-site.xml]
...
javax.jdo.option.ConnectionDriverName
com.mysql.jdbc.Driver
javax.jdo.option.ConnectionURL
jdbc:mysql://192.168.231.1:3306/myhive
javax.jdo.option.ConnectionUserName
root
javax.jdo.option.ConnectionPassword
root
2.在mysql中创建myhive数据库
mysql>create database myhive ;
3.mysql驱动程序(jar)放置到hive classpath下。
...
4.重新初始化hive schema元数据库。
$>hive/bin/schematool -initSchema -dbType mysql
启动hiveserver2服务,接收多个客户端连接请求,
使得client通过jdbc连接操纵hive数据仓库
--------------------------------------------
$>hive/bin/hive --service hiveserver2 start
//启动服务
$>netstat -ano | grep 10000
//查看端口
在eclipse中创建maven(暂时不用)项目
----------------------------------
1.创建java项目
2.引入外部jar包
181个
3.修改hive-site.xml配置文件
使用OS操作系统的认证方式。
[hive-site.xml]
hive.server2.enable.doAs=false
hive.metastore.sasl.enabled=false
hive.server2.authentication=NONE
3'.重启hiveserver2服务器
$hive>hive --service hiveserver2 stop
$hive>hive --service hiveserver2 start &
$hive>netstat -ano | grep 10000
//验证是否启动10000端口
4.编写App程序
/**
 * Connects to HiveServer2 over JDBC, runs "select * from t" and prints every
 * row to standard output as "id,name,age".
 *
 * @param args command-line arguments (unused)
 * @throws Exception if the driver class cannot be loaded or any JDBC call fails
 */
public static void main(String[] args) throws Exception {
    // Load the Hive JDBC driver class explicitly.
    Class.forName("org.apache.hive.jdbc.HiveDriver");

    // try-with-resources closes the result set, statement and connection
    // (in that order) even when the query or the row iteration throws,
    // so nothing leaks on the failure path.
    try (Connection conn = DriverManager.getConnection("jdbc:hive2://192.168.231.100:10000/hive1","ubuntu","123456");
         PreparedStatement ppst = conn.prepareStatement("select * from t");
         ResultSet rs = ppst.executeQuery()) {
        while (rs.next()) {
            int id = rs.getInt("id");
            String name = rs.getString("name");
            int age = rs.getInt("age");
            System.out.println(id + "," + name + "," + age);
        }
    }
}
5.常用聚集函数
count()
sum()
avg()
max()
min()
6.解决beeline命令行终端的上下键导航历史命令的bug
[bin/beeline]
修改行
if [[ ! $(ps -o stat= -p $$) =~ + ]]; then
为
if [[ ! $(ps -o stat= -p $$) =~ "+" ]]; then
hive命令
-------------
$hive>dfs -lsr /
//执行dfs命令
$hive>!clear ;
//执行shell脚本
$>hive -e "select * from test"
//-e execute
$>hive -S -e "select * from test"
//-S 静默,不输出OK,...
$>hive -f /x/x/x/a.sql
//-f 执行一个文件,通常用于批处理
$hive>tab tab
//显示所有命令
$hive>-- this is a comment !
//注释(以 -- 开头的行是注释)
$hive>set hive.cli.print.header=true;
//显示字段名称(头)
$hive>create database if not exists xxx
//
$hive>create database hive3 with dbproperties('author'='xupc','createtime'='today')
$hive>alter database hive3 set dbproperties('author'='you')
[创建表语法]
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.] table_name
[(col_name data_type [COMMENT col_comment], ...)]
[COMMENT table_comment]
[ROW FORMAT row_format]
[STORED AS file_format]
[创建表例子]
CREATE TABLE IF NOT EXISTS employee( eid int, name String, salary String, destination String)
COMMENT 'Employee details'
//注释
PARTITIONED BY (PNAME PTYPE,...)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
//字段结束符
LINES TERMINATED BY '\n'
//行结束符
STORED AS TEXTFILE;
//存储成何种文件
[创建表带分区]
create table hive1.test5(id int ,name string ,age int ) partitioned by (province string , city string) row format DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n' STORED AS TEXTFILE;
[加载数据===insert]
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
LOAD DATA LOCAL INPATH '/home/user/sample.txt' OVERWRITE INTO TABLE employee;
[创建分区表]
create table test5(id int,name string,age int) partitioned by (province string,city string);
//按照省份和城市分区
[加载数据到指定分区]
load data local inpath '/home/ubuntu/employees.txt' into table hive1.test5 partition(province='hebei',city='baoding');
[查看hdfs]
/user/hive/warehouse/hive1.db/test5/province=hebei/city=baoding/employees.txt
//
[查询分区表]
$hive>select * from hive1.test5 where province = 'hebei' and city = 'baoding';
[分区表的查询模式:strict / nonstrict]
$hive>set hive.mapred.mode=strict
//严格模式,默认是nonstrict
[查看分区表有哪些分区]
$hive>show partitions hive1.test5 ;
$hive>show partitions hive1.test5 partition(province='hebei') ;
//查看具体分区的细节信息
$hive>desc extended hive1.test5 ;
//查看扩展表信息
[手动增加分区]
$hive>alter table hive1.test5 add partition(province='hanan',city='pingdingshan')
//
$hive>alter table hive1.test5 add partition(area='huabei',province='hanan',city='pingdingshan')
//增加不存在的分区列,是非法的。
[修改表]
$hive>alter table hive1.test5 rename to hive1.test6 ;
//重命名
$hive>alter table hive1.test5 add partition(province='hebei',city='zhangjiakou') location 'xxx'
//添加多个分区
partition(province='hebei',city='zhangjiakou')
partition(province='hebei',city='zhangjiakou')
partition(province='hebei',city='zhangjiakou')
$hive>alter table hive1.test5 partition(province=..,city=..) set location 'xxxx'
//移动分区
$hive>alter table hive1.test5 change column name name1 string
//修改列,语法:change [column] 旧列名 新列名 类型
$hive>alter table hive1.test5 add columns(birth string , fire string);
//增加列
$hive>alter table hive1.test5 replace columns(birth string , fire string);
//替换列(用新的列定义整体替换原有的所有列)
[修改表属性]
$hive>alter table hive1.test5 set tblproperties('a'='x',...)
//修改表属性
[启用归档]
$hive>set hive.archive.enabled=true
//设置可以归档,默认false????
[复制数据到分区表]
$hive>insert into hive1.test5 partition(province='hebei',city='baoding') select * from hive1.test2 ;//
insert into hive1.test6 partition(province='hebei',city='baoding') select * from hive1.test6
//字段个数相同
where province='hebei' and city='shijiazhuang' and id > 5 ;
//查询时,分区通过where子句指定,
//插入时,分区用partition(..)指定
[动态分区]
$hive>insert overwrite table hive1.test6 partition(province,city) select id,...,province ,city
//动态分区.
from table2 ;
$hive>create table hive1.test1(id int) tblproperties('author'='xupc');
//创建表,指定属性
$hive>create table hive1.test1(id int) LOCATION '/user/ubuntu/test1'
$hive>create external table hive1.test3 like hive1.test1 ;
//创建外部表external,只复制表结构,没有数据。
$hive>create table hive1.test4 as select * from hive1.test1;
//CTAS方式,复制表结构和数据。注意:CTAS不能创建external表。
$hive>desc extended hive1.test1;
//显示扩展信息
$hive>desc formatted hive1.test1;
//显示格式化的信息
$hive>create table hive2.test4 like hive1.test1;
//复制表
$hive>show tables in hive1;
//显示指定数据库的表集合,默认是当前库.
$hive>hive>drop database if exists xxx
//存在即删除
$hive>hive>drop database if exists xxx
//存在即删除
$hive>hive>drop database if exists xxx cascade
//级联删除
$hive>hive>create database hive2 location '/user/ubuntu/';
$hive>hive>desc[ribe] database hive2
//显示db信息,不包含扩展信息
$hive>hive>desc[ribe] database extended hive3
//显示db信息,包含扩展信息
$hive>hive>use hive3;
//使用哪个库
分区表
-----------------
托管表
----------------
hive默认表都是托管表。hive控制其数据的生命周期。删除托管表时,元数据和数据都被删除。
外部表
---------------
hive只控制元数据。删除外部表时,只删除元数据,数据不被删除。
create external table hive1.test3 like hive1.test1 ;
使用beeline客户端可以实现远程jdbc连接
--------------------------------------
1.连接
$>hive --service beeline -u jdbc:hive2://s100:10000/hive1
$>beeline -u jdbc:hive2://s100:10000/hive1
$beeline>
$beeline>!sh clear ;
//执行shell脚本
$beeline>show databases;
//查看库
$beeline>!help;
//帮助
$beeline>!dbinfo
//查看数据库(连接)信息
配置hive的仓库位置
---------------------
[hive-site.xml]
hive.metastore.warehouse.dir=/user/hive/warehouse/
hive数据类型
----------------------
类型
Size
案例
TINYINT
1 byte signed integer.
20
//byte
SMALLINT
2 byte signed integer.
20
//short
INT
4 byte signed integer.
20
//int
BIGINT
8 byte signed integer.
20
//long
BOOLEAN
Boolean true or false.
TRUE
//boolean
FLOAT
Single precision floating point.
3.14159
//float
DOUBLE
Double precision floating point.
3.14159
//double
STRING
'Now is the time', "for all
good men"
//字符串'' / ""
TIMESTAMP
BINARY
字节数组
[集合类型]
STRUCT
struct('John', 'Doe')
MAP
map('first', 'John','last', 'Doe')
ARRAY
array('John', 'Doe')
Hive所谓的读模式
-----------------------
hive在写操作时不校验,读时校验。
-----------------------------------------------------
创建分区表
------------
create external table hive1.test2(id int , name string ,age int)
partitioned by(province string , city string)
row format DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
手动添加分区
-------------
alter table hive1.test2 add partition(province='hebei',city='baoding')
插入数据到分区表
----------------
insert into hive1.test2 partition(province='hebei',city='baoding')
select * from hive1.test6 where province='hebei' and city='shijiazhuang' and id > 5 ;
动态分区
----------------
$hive>-- 创建test3分区表
$hive>create table test3(id int,name string,age int)
partitioned by (province string,city string)
row format delimited
fields terminated by '\t'
lines terminated by '\n'
stored as textfile ;
$hive>-- 动态分区,复制一个表数据到分区表,动态创建分区
$hive>-- 如果两个都是动态分区需要关闭严格模式
$hive>set hive.exec.dynamic.partition.mode=nonstrict;
//关闭动态分区的严格模式
$hive>insert into hive1.test2 partition(province,city)
//
select id,name,age, 'henan' as province , 'kaifeng' as city
from table2 ;
$hive>-- 使用分区动静混合
$hive>insert into hive1.test2 partition(province='henan',city)
select id,name,age, 'kaifeng' as city
from table2 ;
$hive>-- 查询期间,动态创建表,并将数据写入创建表中
$hive>create table test3 as select id,name from test2 where province = 'hebei' and city = 'baoding';
$hive>-- 导出hive数据到本地目录(下载)
$hive>insert overwrite local directory '/home/ubuntu/hive' select * from test2 where province = 'hebei';
$hive>-- 导出hive数据到HDFS目录
$hive>insert overwrite directory 'hdfs://s100:8020/user/ubuntu/xxx' select * from test2 where province = 'hebei';
$>-- 查询数据向多个目录同时输出??????????????????
$>from test2 t
insert overwrite local directory '/home/ubuntu/hebei' select * where t.province = 'hebei'
insert overwrite local directory '/home/ubuntu/henan' select * where t.province = 'henan' ;
查询
-------------------------------
$>-- 查询,投影查询,指定表的别名
$>select col1,col2,... from table t ;
$>-- 查询,使用函数
$>select upper(name) from test2 ;
$>select lower(name) from test2 ;
$>-- 数学函数
$>select round(12.345) ;
//四舍五入
$>select floor(12.345) ;
//地板
$>select ceil(12.345) ;
//天花板
$>select rand(10) ;
//随机数
$>-- 聚合函数
$>select count(*) from test2 ;
$>select sum(age) from test2 ;
$>select avg(age) from test2 ;
$>select max(age) from test2 ;
$>select min(age) from test2 ;
$>-- 去重.distinct
$>select count(distinct name) from test2 ;
$>-- 表生成函数
$>select explode(array('tom','tomas','tomsLee')) ;
$>-- ascii函数,字符串首个字母ascii值
$>select ascii('abc') ;
$>-- base64字符串编码
$>select base64(binary('httpt://localhost:8080/helloworld')) ;//输出该字符串经base64编码后的结果
$>-- base64函数的参数必须是binary类型,所以先用binary()转换
$>select base64(binary('httpt://localhost:8080/helloworld')) ;//输出该字符串经base64编码后的结果
$>-- binary函数,将字符串转换成二进制
$>select binary('httpt://localhost:8080/helloworld') ;
$>-- 类型转换
$>select cast('120' as bigint) + 200 ;
//320
$>-- 字符串连接
$>select concat('120',200) ;
//120200
$>-- 分页查询limit
$>select * from test2 limit 1,1 ;
//offset , length
$>-- 嵌套子查询
$>from (select * from test2 where province = 'hebei') e select e.id,e.name,e.age where e.city = 'baoding';
//OK
$>select e.id,e.name,e.age from (select * from test2 where province = 'hebei') e where e.city = 'baoding';
//OK
$>-- case .. when .. then 相当于if
$>select id,name,case when age <= 12 then 'young'
when age > 12 and age <= 13 then 'middle'
when age > 13 and age <= 15 then 'old'
else 'too old'
end as yearstate from test2 ;
$>-- 不能在where使用列别名
$>select id,name n ,age from test2 where n like 't%' ;
//wrong,where中不能字段别名。
$>-- 范围运算
$>select id,name n ,age from test2 where age <= 14 and age >= 12 ;
$>select id,name n ,age from test2 where age between 12 and 14 ;
$>-- 浮点数比较的规避方案
$>select cast(0.2 as float) ;
$>-- group by查询
$>select count(*),province from test2 group by province ;
//
$>select count(*) as c ,province from test2 group by province having c > 3 ;
//having是对分组聚合之后的结果(各个组)进行过滤
[Hive的Join操作,只支持等值连接]
$>-- 创建customers和orders表,一对多关系。
$>-- customers
$>create table customers(id int , name string , age int)
row format delimited
fields terminated by '\t'
lines terminated by '\n'
stored as textfile ;
$>-- orders
$>create table orders(id int , orderno string , price float , cid int)
row format delimited
fields terminated by '\t'
lines terminated by '\n'
stored as textfile ;
$>-- 准备数据
customers.txt数据
------------------
1
tom1
12
2
tom2
13
3
tom3
14
orders.txt数据
-------------------
1
No001
121.34
1
2
No002
121.35
1
3
No003
121.00
1
4
No004
22.66
2
5
No005
300.65
2
6
No006
800.56
2
7
No007
1000.12
$>-- 加载数据到customers + orders中.
$>load data local inpath '/home/ubuntu/customers.txt' into table hive1.customers;
$>load data local inpath '/home/ubuntu/orders.txt' into table hive1.orders;
$>select * from customers;
$> --内连接 join .. on
$>select a.id,a.name,b.id,b.orderno,b.price from customers a join orders b on a.id = b.cid ;
1
tom1
1
No001
121.34
1
tom1
2
No002
121.35
1
tom1
3
No003
121.0
2
tom2
4
No004
22.66
2
tom2
5
No005
300.65
2
tom2
6
No006
800.56
$>-- 连接查询优化手段:查询表的大小从左到右是递增的。
$>select c.id,c.name,c.age,o.orderno,o.price from customers c join orders o on c.id = o.cid where ...;
//right
$>select c.id,c.name,c.age,o.orderno,o.price from orders o join customers c on c.id = o.cid where ...;
//wrong
$>-- 使用查询暗示 hint
$>select /*+streamtable(c)*/ c.id,c.name,c.age,o.orderno,o.price from orders o join customers c on c.id = o.cid;
$>-- left outer join
$>select c.id,c.name,o.id,o.orderno,o.price from customers c left outer join orders o on c.id = o.cid ;
1
tom1
1
No001
121.34
1
tom1
2
No002
121.35
1
tom1
3
No003
121.0
2
tom2
4
No004
22.66
2
tom2
5
No005
300.65
2
tom2
6
No006
800.56
3
tom3
NULL
NULL
NULL
$>-- right outer join
$>select c.id,c.name,o.id,o.orderno,o.price from customers c right outer join orders o on c.id = o.cid ;
1
tom1
1
No001
121.34
1
tom1
2
No002
121.35
1
tom1
3
No003
121.0
2
tom2
4
No004
22.66
2
tom2
5
No005
300.65
2
tom2
6
No006
800.56
NULL
NULL
7
No007
1000.12
$>-- full outer join
$>select c.id,c.name,o.id,o.orderno,o.price from customers c full outer join orders o on c.id = o.cid ;
NULL
NULL
7
No007
1000.12
1
tom1
3
No003
121.0
1
tom1
2
No002
121.35
1
tom1
1
No001
121.34
2
tom2
6
No006
800.56
2
tom2
5
No005
300.65
2
tom2
4
No004
22.66
3
tom3
NULL
NULL
NULL
$>-- 左半连接,select和where子句不能引用到右边表字段
$>-- 左表的记录在右表中一旦找到对应的记录,右侧表即停止扫描。
$>select c.id,c.name from customers c left semi join orders o on c.id = o.cid ;
$>-- hive不支持右半连接操作. right semi join xxxx
$>-- 笛卡尔连接 m x n
$>select c.id,c.name,o.orderno from customers c join orders o ;
$>-- map端连接,一张小表,通过mapper的时候,将小表完全载入内存。
$>-- 暗示 mapjoin(c)在0.7之前使用。
$>select /*+mapjoin(c)*/ c.id,c.name,o.orderno from customers c join orders o ;
$>select /*+mapjoin(o)*/ c.id,c.name,o.orderno from customers c join orders o ;
$>set hive.auto.convert.join=true
--转换连接,map端优化,在右外连接和全外连接中不支持
$>set hive.mapjoin.smalltable.filesize=25000000
--设置小表阈值(字节)
$>-- order by 全排序,对所有数据通过一个reduce进行排序。
$>-- 如果开启了hive.mapred.mode=strict,在全排序时必须结合limit使用。
$>-- 现在推荐使用hive.strict.checks.*属性.
$>select * from orders order by cid asc , price desc ;
--全局排序
$>-- sort by 每个reduce进行排序(局部排序)。
$>select * from orders sort by cid asc , price desc ;
--局部排序
$>-- distribute by等价于自定义分区函数,写在sort by之前.
$>select * from orders distribute by cid sort by price desc ;
--局部排序
$>--cluster by === distribute by ... sort by ...
$>-- 分桶采样
$>select * from orders tablesample(bucket 3 out of 10 on number) ;
$>-- 按照数据块百分比采样,例如共100块,10 percent 即抽取10块;如果总共只有1块,则没有采样效果。
$>select * from orders tablesample(10 percent) ;
$>-- union all 联合操作,字段类型和个数需要匹配。
$>select id , name from customers union all select id ,orderno from orders ;
$>-- view(虚表),降低查询的复杂度
$>-- create view v_name as select ...
$>create view view1 as select c.id,c.name,c.age,o.id as oid,o.orderno,o.price
--创建视图
from customers c left outer join orders o on c.id = o.cid ;
$>-- 通过视图直接查询
$>select * from view1 ;
$>select * from view1 where price > 200 ;
$>-- 使用like方式创建view
$>create view v2 like view1 ;
$>--删除视图
$>drop view if exists v2;
$>--索引,hive没有key(primary key + auto_increment)
$>--创建索引,DEFERRED REBUILD该选项时,索引为空白状态,需要rebuild才能够初始化。
$>CREATE INDEX idx_customers_id ON TABLE customers (id) AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' WITH DEFERRED REBUILD IDXPROPERTIES ('creator' = 'me') IN TABLE customers_index COMMENT 'this is a comment!';
$>-- order index
$>CREATE INDEX idx_orders_orderno ON TABLE orders (orderno) AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' WITH DEFERRED REBUILD IN TABLE orders_index;
$>alter index idx_customers_id on customers rebuild ;
--重建索引,产生索引表(hdfs文件)
hdfs://s100:8020/../customers_index/000000_0
--索引文件(表)
[内容如下]
1
hdfs://s100:8020/user/hive/warehouse/hive1.db/customers/customers.txt
[0]
2
hdfs://s100:8020/user/hive/warehouse/hive1.db/customers/customers.txt
[10]
3
hdfs://s100:8020/user/hive/warehouse/hive1.db/customers/customers.txt
[20]
$>alter index idx_customers_id on customers rebuild ;
--重建索引
$>alter index idx_orders_orderno on orders rebuild ;
--重建索引
No001hdfs://s100:8020/user/hive/warehouse/hive1.db/orders/orders.txt0
No002hdfs://s100:8020/user/hive/warehouse/hive1.db/orders/orders.txt17
No003hdfs://s100:8020/user/hive/warehouse/hive1.db/orders/orders.txt34
No004hdfs://s100:8020/user/hive/warehouse/hive1.db/orders/orders.txt51
No005hdfs://s100:8020/user/hive/warehouse/hive1.db/orders/orders.txt67
No006hdfs://s100:8020/user/hive/warehouse/hive1.db/orders/orders.txt84
No007hdfs://s100:8020/user/hive/warehouse/hive1.db/orders/orders.txt101
$>-- 删除索引
$>drop index idx_customers_id on customers;
$>--分区是路径,是目录,是文件逻辑隔离。有效降低查询量。
$>--桶表(bucket),是文件。
$>--创建桶表
$>create table ... clustered by (field_name) into n buckets ;
$>create table orderitems (id int , itemname string , oid int) clustered by (oid) into 3 buckets row format delimited fields terminated by '\t' lines terminated by '\n' stored as textfile ;
1
item1
1
2
item2
1
3
item3
1
4
item4
2
5
item5
2
6
item6
2
7
item7
3
8
item8
3
9
item9
3
10
item10
3
12
item10
4
13
item10
4
14
item10
4
15
item10
4
***** 何时Hive可以避免MR操作 *****
--------------------------------------
不是mr的job就是本地模式。
1.全表扫描:没有where子句。
select * from test2 ;
2.where子句作用只有分区字段,也不需要mr.
select * from test2 where province = 'hebei' ;
3.设置hive.exec.mode.local.auto=true
该属性hive会尽量使用local模式查询。
4.其余的所有查询都会转换成MR.