tar -zxvf apache-hive-3.1.2-bin.tar.gz -C /bigdata/install/
cd /bigdata/install/
# rename
mv apache-hive-3.1.2-bin/ hive-3.1.2
cd hive-3.1.2/conf
# hive-site.xml does not ship by default; create it under conf/
vim hive-site.xml
<configuration>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://node03:3306/metastore?useSSL=false</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>root</value>
  </property>
  <property>
    <name>hive.metastore.warehouse.dir</name>
    <value>/user/hive/warehouse</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>123456</value>
  </property>
  <property>
    <name>hive.metastore.schema.verification</name>
    <value>false</value>
  </property>
  <property>
    <name>hive.metastore.event.db.notification.api.auth</name>
    <value>false</value>
  </property>
  <property>
    <name>hive.cli.print.current.db</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.cli.print.header</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.server2.thrift.bind.host</name>
    <value>node03</value>
  </property>
  <property>
    <name>hive.server2.thrift.port</name>
    <value>10000</value>
  </property>
</configuration>
# create the hive log directory
mkdir -p /bigdata/install/hive-3.1.2/logs
# enable the log4j2 config (in $HIVE_HOME/conf)
mv hive-log4j2.properties.template hive-log4j2.properties
# point property.hive.log.dir at the directory created above
vim hive-log4j2.properties
# upload mysql-connector-java-5.1.38.jar to the /bigdata/soft directory,
# then copy it into Hive's lib directory
cp /bigdata/soft/mysql-connector-java-5.1.38.jar /bigdata/install/hive-3.1.2/lib/
# resolve the logging jar conflict with Hadoop (the jar lives in $HIVE_HOME/lib)
mv log4j-slf4j-impl-2.10.0.jar log4j-slf4j-impl-2.10.0.jar.bak
# configure Hive environment variables (e.g. append to /etc/profile)
export HIVE_HOME=/bigdata/install/hive-3.1.2
export PATH=$PATH:$HIVE_HOME/bin
# reload the profile
$ source /etc/profile
# log in to MySQL and create the metastore database
mysql -uroot -p123456
mysql> create database metastore;
mysql> show databases;
+--------------------+
| Database |
+--------------------+
| information_schema |
| metastore |
| mysql |
| performance_schema |
| sys |
+--------------------+
5 rows in set (0.00 sec)
mysql> exit
# initialize the metastore schema
$ schematool -initSchema -dbType mysql -verbose
# start the hive CLI
hive
-- list databases to verify the client works
show databases;
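As a quick smoke test of the freshly initialized metastore (a minimal sketch; the database and table names here are arbitrary), create and drop a throwaway table:

-- confirm that the metastore accepts DDL
create database if not exists smoke_test;
use smoke_test;
create table t_probe(id int, msg string);
show tables;
-- clean up
use default;
drop database smoke_test cascade;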
vim /bigdata/install/hadoop-3.1.4/etc/hadoop/core-site.xml
# add the following, which lets the hadoop user proxy other users for hiveserver2:
<property>
  <name>hadoop.proxyuser.hadoop.hosts</name>
  <value>*</value>
</property>
<property>
  <name>hadoop.proxyuser.hadoop.groups</name>
  <value>*</value>
</property>
# distribute the modified core-site.xml to the other nodes (run from the hadoop etc/hadoop directory)
scp core-site.xml node02:$PWD
scp core-site.xml node03:$PWD
cd /bigdata/install/hive-3.1.2
$ source /etc/profile # otherwise you will hit: Cannot find hadoop installation: $HADOOP_HOME or $HADOOP_PREFIX must be set or hadoop must be in the path
$ bin/hiveserver2 # the startup warnings can be ignored
# in another session, start beeline
bin/beeline --color=true
# connect to hiveserver2 over JDBC; enter the username and password when prompted
beeline> !connect jdbc:hive2://node03:10000
A possible problem: jobs submitted through Hive may fail because a cluster node runs short of virtual memory. The simplest fix is to disable YARN's virtual-memory check: add the property below to yarn-site.xml, distribute the file to node02 and node03, restart the Hadoop cluster, and restart hiveserver2.
<property>
  <name>yarn.nodemanager.vmem-check-enabled</name>
  <value>false</value>
</property>
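After the restart, a query that actually launches a YARN job is a good way to confirm the fix (a sketch; it assumes a populated table such as the score table from later in these notes):

-- unlike a plain select *, an aggregation typically triggers a MapReduce job,
-- exercising the containers whose vmem check was just disabled
select count(*) from score;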
Why layer a data warehouse?
Aspect | Hive | RDBMS |
---|---|---|
Query language | HQL | SQL |
Data storage | HDFS | raw device or local FS |
Execution engine | MapReduce | its own executor |
Data insertion | bulk load / single-row insert | single-row or bulk insert |
Data operations | overwrite and append | row-level update and delete |
Data scale | large | small |
Query latency | high | low |
Partitioning | supported | supported |
Indexing | simple indexes since version 0.8 | rich, complex indexes |
Scalability | high (good) | limited (poor) |
Load mode | schema on read (fast) | schema on write (slow) |
Typical use | querying massive data sets | real-time queries |
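The "schema on read" row deserves a concrete illustration (a minimal sketch; the table and file names are made up): Hive accepts a data file at load time without validating it against the declared schema, and mismatches only surface as NULLs when you query, whereas an RDBMS validates on write and rejects bad rows at insert time.

create table t_sor(id int, amount double)
row format delimited fields terminated by ',';
-- the load succeeds even if some fields are not numeric;
-- nothing is validated at load time (schema on read)
load data local inpath '/tmp/bad_data.csv' into table t_sor;
-- mismatched fields simply come back as NULL at query time
select * from t_sor;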
# for comparison, submitting a plain MapReduce job by hand:
hadoop jar xxxx.jar xxx.class /input /output
# start hiveserver2 in the foreground
hive --service hiveserver2
# start it in the background
nohup hive --service hiveserver2 &
$ beeline --color=true
Beeline version 3.1.2 by Apache Hive
beeline> !connect jdbc:hive2://node03:10000
0: jdbc:hive2://node03:10000> help
0: jdbc:hive2://node03:10000> !quit
Closing: 0: jdbc:hive2://node03:10000
# use the -e option to execute an hql statement directly
hive -e "show databases"
# use the -f option to execute a file containing hql statements
hive -f hive.sql
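For reference, the file passed to -f is just a sequence of HiveQL statements (the contents below are made up for illustration):

-- hive.sql: statements execute top to bottom, as in an interactive session
create database if not exists demo;
use demo;
show tables;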
Type | Description | Example |
---|---|---|
boolean | true/false | true |
tinyint | 1-byte signed integer | 1 |
smallint | 2-byte signed integer | 1 |
int | 4-byte signed integer | 1 |
bigint | 8-byte signed integer | 1 |
float | 4-byte single-precision floating point | 1.0 |
double | 8-byte double-precision floating point | 1.0 |
string | string, no declared length | 'abc' |
varchar | string of length 1-65535; longer values are truncated | 'abc' |
timestamp | timestamp | 1563157873 |
date | date | '2019-07-15' |
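A quick way to get a feel for these types is to cast literals in a standalone query (a sketch; no table is needed):

select
  cast(1 as tinyint)         as t,
  cast(1 as bigint)          as b,
  cast(1.0 as double)        as d,
  cast('abc' as varchar(5))  as v,
  cast('2019-07-15' as date) as dt,
  current_timestamp()        as ts;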
Type | Description | Example |
---|---|---|
array | an ordered collection of fields, all of the same type: array(elem1, elem2) | array(1, 2, 3) |
map | an unordered collection of key/value pairs: map(k1, v1, k2, v2) | map('a', 1, 'b', 2) |
struct | a collection of named fields whose types may differ: struct(elem1, elem2) | struct('a', 1, 2.0) |
create table complex(
col1 array<int>,
col2 map<string,int>,
col3 struct<a:string,b:int,c:double>
)
ROW FORMAT DELIMITED
[FIELDS TERMINATED BY char [ESCAPED BY char]]
[COLLECTION ITEMS TERMINATED BY char]
[MAP KEYS TERMINATED BY char]
[LINES TERMINATED BY char]
FIELDS TERMINATED BY char: the delimiter between fields within a row
COLLECTION ITEMS TERMINATED BY char: the delimiter between elements of a complex type
MAP KEYS TERMINATED BY char: the delimiter between each key and its value in a map
LINES TERMINATED BY char: the row terminator; the default, \n, is usually what you want
1 zhangsan beijing,shanghai
2 lisi shanghai,tianjin
create table t_array(
id string,
name string,
locations array<string>
) row format delimited fields terminated by ' ' collection items terminated by ',';
load data local inpath '/bigdata/install/t_array.txt' into table t_array;
select id, name, locations[0], locations[1] from t_array;
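Beyond indexing into the array, a few built-in functions are handy (queries against the t_array table above):

-- size() returns the element count, array_contains() tests membership
select id, name, size(locations), array_contains(locations, 'beijing') from t_array;
-- explode() flattens the array into one row per element
select explode(locations) as location from t_array;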
1 name:zhangsan#age:30
2 name:lisi#age:40
create table t_map(
id string,
info map<string,string>
) row format delimited fields terminated by ' ' collection items terminated by '#' map keys terminated by ':';
load data local inpath '/bigdata/install/t_map.txt' into table t_map;
select id, info['name'], info['age'] from t_map;
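The analogous built-ins for maps (queries against the t_map table above):

-- map_keys()/map_values() return the keys and values as arrays; size() counts entries
select id, map_keys(info), map_values(info), size(info) from t_map;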
1 zhangsan:30:beijing
2 lisi:40:shanghai
create table t_struct(
id string,
info struct<name:string, age:int, address:string>
) row format delimited fields terminated by ' ' collection items terminated by ':' ;
load data local inpath '/bigdata/install/t_struct.txt' into table t_struct;
select id,info.name,info.age,info.address from t_struct;
-- create a database
create database db_hive;
-- or
create database if not exists db_hive;
-- show all databases
show databases;
-- filter databases by name pattern
show databases like 'db_hive*';
-- show database details
desc database db_hive;
-- show extended database details
desc database extended db_hive;
-- switch to the database
use db_hive;
-- drop the database
drop database db_hive;
-- or: if the database may not exist, guard the drop with if exists
drop database if exists db_hive;
-- or: if the database still contains tables, force the drop with cascade
drop database if exists db_hive cascade;
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] table_name
[(col_name data_type [COMMENT col_comment], ...)]
[COMMENT table_comment]
[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]   -- partitioning
[CLUSTERED BY (col_name, col_name, ...)   -- bucketing
[SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
[ROW FORMAT row_format]   -- e.g. row format delimited fields terminated by 'delimiter'
[STORED AS file_format]
[LOCATION hdfs_path]
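To make the skeleton concrete, here is one hypothetical table that exercises most of the optional clauses at once (all names are made up):

create external table if not exists page_views(
  user_id string comment 'visitor id',
  url     string
)
comment 'raw page view events'
partitioned by (dt string)
clustered by (user_id) sorted by (user_id asc) into 4 buckets
row format delimited fields terminated by '\t'
stored as textfile
location '/data/page_views';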
-- create a table directly with the standard syntax
use myhive;
create table stu(id int, name string);
insert into stu(id,name) values(1,"zhangsan"); -- in practice you would never load data with single-row inserts
select * from stu;
-- CTAS: create a table from the result of an AS SELECT subquery
create table if not exists myhive.stu1 as select id, name from stu;
-- verify that the new table contains the queried data
select * from stu1;
-- LIKE: create a table with the same structure as an existing one
create table if not exists myhive.stu2 like stu;
-- verify: the structure is copied, but like creates an empty table with no data
select * from stu2;
-- check the table type
desc formatted myhive.stu;
-- hql example: create an internal table, specifying the field delimiter, the file format, and the storage location
create table if not exists myhive.stu3(id int, name string)
row format delimited fields terminated by '\t'
stored as textfile
location '/user/hive/mytable/stu3'; -- storage location set explicitly; if omitted, the table goes under the default from hive-site.xml: /user/hive/warehouse/myhive.db/stu3
-- create an external table
create external table myhive.teacher (t_id string, t_name string)
row format delimited fields terminated by '\t';
load data local inpath '/bigdata/install/hivedatas/teacher.csv' into table myhive.teacher;
# stage the data file on HDFS first
cd /bigdata/install/hivedatas
hdfs dfs -mkdir -p /bigdata/hdfsload/hivedatas
hdfs dfs -put teacher.csv /bigdata/hdfsload/hivedatas
# run inside the hive client; a load from HDFS moves the files into the table's location
load data inpath '/bigdata/hdfsload/hivedatas' overwrite into table myhive.teacher;
-- convert the internal table stu into an external table
alter table stu set tblproperties('EXTERNAL'='TRUE');
-- convert the external table teacher into an internal table
alter table teacher set tblproperties('EXTERNAL'='FALSE');
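To confirm a conversion took effect, check the Table Type field in the table details:

-- Table Type shows MANAGED_TABLE or EXTERNAL_TABLE
desc formatted stu;
desc formatted teacher;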
Syntax difference at creation time: an external table must be created with the external keyword.
Difference when dropped: dropping an internal (managed) table removes both the metadata and the data on HDFS, while dropping an external table removes only the metadata and leaves the HDFS data intact.
-- syntax for creating a partitioned table
create table score(s_id string, c_id string, s_score int) partitioned by (month string) row format delimited fields terminated by '\t';
-- create a table with multiple partition columns
create table score2 (s_id string,c_id string, s_score int) partitioned by (year string, month string, day string) row format delimited fields terminated by '\t';
-- load data into the partitioned table
load data local inpath '/bigdata/install/hivedatas/score.csv' into table score partition (month='201806');
-- load data into the multi-partition table
load data local inpath '/bigdata/install/hivedatas/score.csv' into table score2 partition(year='2018', month='06', day='01');
-- show partitions
show partitions score;
-- add a partition
alter table score add partition(month='201805');
-- add several partitions at once
alter table score add partition(month='201804') partition(month = '201803');
-- after adding a partition, a new directory appears under the table's directory on HDFS
-- drop a partition
alter table score drop partition(month = '201806');
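Filtering on the partition column lets Hive read only the matching directories (partition pruning); a couple of sketch queries against the score table above:

-- only the month='201805' directory is scanned, not the whole table
select * from score where month = '201805';
-- partition columns can be selected like ordinary columns
select s_id, s_score, month from score limit 5;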
# create the partition directory on HDFS by hand (and upload the data file into it)
hdfs dfs -mkdir -p /scoredatas/day=20180607
-- create an external partitioned table on top of that directory
create external table score4(s_id string, c_id string, s_score int) partitioned by (day string) row format delimited fields terminated by '\t' location '/scoredatas';
-- sync the metastore with the partition directories that already exist on HDFS
msck repair table score4;
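After the repair, the partition discovered on HDFS should be registered in the metastore (a quick check):

-- the day=20180607 partition should now be listed
show partitions score4;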
-- run a local shell command from inside the hive CLI
!ls /home;
-- run an hdfs dfs command from inside the hive CLI
dfs -ls /user;