内部表(managed table)
外部表(external table)
-- List every database registered in the metastore.
SHOW DATABASES;

-- Create the sysoa database (no-op if it already exists) at an explicit
-- location; per the original note this path lives on HDFS.
CREATE DATABASE IF NOT EXISTS sysoa
    COMMENT 'OA数据库'
    LOCATION '/user/database/hive/warehouse/sysoa.db';
# 删除数据库,CASCADE:删除数据库之前先删除其中所有的表,例如:drop database if exists sysoa cascade;
-- Switch to the class database for the statements that follow.
USE class;

-- Managed (internal) table with tab-delimited fields.
CREATE TABLE IF NOT EXISTS students2 (
    name     string,
    age      int,
    sex      string,
    brithday date
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t';

-- Load a file from the local filesystem, replacing any rows already present.
LOAD DATA LOCAL INPATH '/home/fonttian/database/hive/students2'
OVERWRITE INTO TABLE students2;
-- External table that receives the raw tab-delimited text file.
-- It is stored as TEXTFILE rather than ORC: LOAD DATA only moves the file
-- into the table directory without converting it, so the table's storage
-- format has to match the format of the file being loaded.
CREATE EXTERNAL TABLE IF NOT EXISTS students3 (
    name     string,
    age      int,
    sex      string,
    brithday date
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
-- Remove all rows but keep the table definition.
-- TRUNCATE works on managed tables only; Hive rejects it for external tables.
TRUNCATE TABLE students2;

-- Drop the table. For a managed table this removes both the data and the
-- metadata; for an external table only the metadata is removed.
-- IF EXISTS keeps the statement safe to re-run.
DROP TABLE IF EXISTS students2;
指定存储格式为 SequenceFile(ORC、Parquet 等同理)时,如果直接用 load data 把 txt 格式的数据导入表中,hive 会报文件格式错误。解决方案为:先将 txt 数据导入一张 textfile 格式的表,然后用 insert ... select 从该表把数据写入目标格式的表。
-- Load the local text file into students3. LOAD DATA does not convert file
-- formats, so the target table must use the same storage format as the file.
LOAD DATA LOCAL INPATH '/home/fonttian/database/hive/students2'
OVERWRITE INTO TABLE students3;

-- External ORC table. STORED AS ORC is required for the table to actually
-- use ORC; without it the table is created in the default file format.
CREATE EXTERNAL TABLE IF NOT EXISTS students3_orc (
    name     string,
    age      int,
    sex      string,
    brithday date
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS ORC;

-- Populate tables from other tables. INSERT ... SELECT writes the rows in
-- the storage format of the target table.
INSERT INTO TABLE students3
SELECT * FROM students2;

INSERT INTO TABLE students3_orc
SELECT * FROM students3;
-- External table partitioned by a DATE column named day.
CREATE EXTERNAL TABLE IF NOT EXISTS students4 (
    name     string,
    age      int,
    sex      string,
    brithday date
)
PARTITIONED BY (day date)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t';

-- Load the file into partition day='2018-03-26'.
-- The value is written in the zero-padded yyyy-MM-dd form that Hive uses for
-- DATE, so the partition spec matches how the value is stored and compared.
LOAD DATA LOCAL INPATH '/home/fonttian/database/hive/students2'
INTO TABLE students4
PARTITION (day = '2018-03-26');
-- Fallback if queries against the date-partitioned table do not work:
-- partition by an integer column instead.
CREATE EXTERNAL TABLE IF NOT EXISTS students5 (
    name     string,
    age      int,
    sex      string,
    brithday date
)
PARTITIONED BY (pt_int int)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t';

-- Load the same local file into two separate partitions.
LOAD DATA LOCAL INPATH '/home/fonttian/database/hive/students2'
INTO TABLE students5
PARTITION (pt_int=1);

LOAD DATA LOCAL INPATH '/home/fonttian/database/hive/students2'
INTO TABLE students5
PARTITION (pt_int=2);

-- All rows across every partition.
SELECT *
FROM students5;

-- Rows from a single partition.
SELECT *
FROM students5
WHERE pt_int = 1;

-- Rows from every partition whose key is positive.
SELECT *
FROM students5
WHERE pt_int > 0;
-- External table stored as Parquet.
CREATE EXTERNAL TABLE IF NOT EXISTS students3_parquet (
    name     string,
    age      int,
    sex      string,
    brithday date
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS PARQUET;

-- Fill it from students3.
INSERT INTO TABLE students3_parquet
SELECT * FROM students3;
-- Query with two combined conditions: students older than 30 with a given sex.
-- students2 has no Dept column, so the second predicate filters on an
-- existing column, and the string literal is quoted.
-- NOTE(review): 'male' is a placeholder — adjust it to the values actually
-- present in the data.
SELECT *
FROM students2
WHERE age > 30
  AND sex = 'male';
-- Check whether students5 is partitioned by listing its partitions.
-- SHOW PARTITIONS requires the table name.
SHOW PARTITIONS students5;

-- Alternatively, inspect the table metadata with the describe commands.
DESCRIBE EXTENDED students5;
DESC FORMATTED students5;

-- Drop a single partition.
ALTER TABLE students5 DROP PARTITION (pt_int=2);
-- Export data with INSERT OVERWRITE LOCAL DIRECTORY, using the default
-- output format.
INSERT OVERWRITE LOCAL DIRECTORY "/home/fonttian/database/hive/learnhive"
SELECT * FROM students5;

-- Same export with explicit delimiters for the files that are written.
INSERT OVERWRITE LOCAL DIRECTORY "/home/fonttian/database/hive/learnhive"
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    COLLECTION ITEMS TERMINATED BY '\n'
SELECT * FROM students5;
# Stream-style export: must be run from the shell; the query output on stdout
# is redirected into a local text file.
bin/hive -e "use class;select * from students5;" > /home/fonttian/database/hive/learnhive/students5.txt
-- Optional tuning before running the query (fill in values as needed):
-- set hive.exec.reducers.max=
-- set mapreduce.job.reduces=

-- Query students5 ordered by age using SORT BY (ordering is applied within
-- each reducer).
SELECT *
FROM students5
SORT BY age ASC;
类似于MapReduce中的分区(partition),对数据进行分区,需结合sort by使用。同样要注意的是,这里仍然需要指定输出数据的格式,这样才可以直接读取导出的数据。
-- Export students5 with rows distributed to reducers by pt_int and sorted by
-- age within each reducer; explicit delimiters keep the output readable.
INSERT OVERWRITE LOCAL DIRECTORY '/home/fonttian/database/hive/learnhive/students5_distribute_by'
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    COLLECTION ITEMS TERMINATED BY '\n'
SELECT *
FROM students5
DISTRIBUTE BY pt_int
SORT BY age ASC;
distribute by必须在sort by之前
cluster by
当distribute by字段和sort by字段相同时,就可以使用cluster by替代(cluster by只能升序排序,不能指定asc/desc)。
-- CLUSTER BY age is shorthand for DISTRIBUTE BY age SORT BY age.
-- It cannot be combined with DISTRIBUTE BY and accepts no ASC/DESC modifier
-- (the order is always ascending), so it is used on its own here.
INSERT OVERWRITE LOCAL DIRECTORY '/home/fonttian/database/hive/learnhive/students5_distribute_by'
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    COLLECTION ITEMS TERMINATED BY '\n'
SELECT *
FROM students5
CLUSTER BY age;
-- External table of per-student scores, stored as tab-delimited text.
CREATE EXTERNAL TABLE IF NOT EXISTS score (
    name    string,
    math    int,
    chinese int,
    english int
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;

-- Load the local score file, replacing existing rows.
LOAD DATA LOCAL INPATH '/home/fonttian/database/hive/score'
OVERWRITE INTO TABLE score;

-- External table mapping a student name to a "likes" value, stored as
-- tab-delimited text.
CREATE EXTERNAL TABLE IF NOT EXISTS job (
    name  string,
    likes string
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;

-- Load the local job file, replacing existing rows.
LOAD DATA LOCAL INPATH '/home/fonttian/database/hive/job'
OVERWRITE INTO TABLE job;
- 可以连接两个以上表
-- Three-way inner join keyed on the student name.
SELECT
    students2.name,
    students2.age,
    score.math,
    job.likes
FROM students2
JOIN score
    ON (students2.name = score.name)
JOIN job
    ON (job.name = score.name);
- 如果连接多个表的join key 是同一个,则被转化为单个map/reduce任务
- join时大表放在最后。因为每次map/reduce任务的逻辑是这样的:reduce会缓存join序列中除最后一个表之外的所有表的记录,再通过最后一个表将结果序列化到文件系统中。
- 如果想要限制join的输出,就需要在where子句中写过滤条件,或是在join子句写。建议后者,以避免部分错误发生。
-- score is the preserved (left) side; the pt_int filter sits in the ON
-- clause, so it restricts which students5 rows can match.
SELECT
    students5.name,
    score.math
FROM score
LEFT OUTER JOIN students5
    ON (score.name = students5.name AND students5.pt_int = 1);

-- Same ON condition with students5 as the preserved (left) side.
SELECT
    students5.name,
    score.math
FROM students5
LEFT OUTER JOIN score
    ON (score.name = students5.name AND students5.pt_int = 1);
- LEFT SEMI JOIN 是IN/EXISTS子查询的一种更高效的实现。其限制为:join子句中的右边表只能在ON子句中设置过滤条件,在where子句、select子句或其他地方过滤都不行
-- IN-subquery form: job rows whose name also appears in score.
SELECT
    job.name,
    job.likes
FROM job
WHERE job.name IN (SELECT score.name FROM score);

-- Equivalent LEFT SEMI JOIN form. The join key is job.name; job has no
-- column named score.
SELECT
    job.name,
    job.likes
FROM job
LEFT SEMI JOIN score
    ON (score.name = job.name);
操作类型: strings
描述: 功能与RLIKE相同
-- Count the rows whose name does not match the pattern for eight
-- consecutive digits.
SELECT count(*)
FROM students5
WHERE name NOT REGEXP '\\d{8}';
-- Count the rows whose name does not start with 'T'.
-- REGEXP succeeds when any substring matches, so the pattern is anchored
-- with ^ to test only the beginning of the name.
SELECT count(*)
FROM students5
WHERE name NOT REGEXP '^T';
语法: regexp_extract(string subject, string pattern, int index)
返回值: string
-- Pattern '(I)(.*?)(You)' against 'IloveYou': index 1 returns the first
-- capture group, 'I'.
SELECT regexp_extract('IloveYou', '(I)(.*?)(You)', 1)
FROM students5
LIMIT 1;

-- Pattern 'I(.*?)(You)' against 'IloveYou': the leading I is not captured
-- here, so index 2 returns the second capture group, 'You'.
SELECT regexp_extract('IloveYou', 'I(.*?)(You)', 2)
FROM students5
LIMIT 1;

-- Index 0 returns the entire match: 'IloveYou'.
SELECT regexp_extract('IloveYou', '(I)(.*?)(You)', 0)
FROM students5
LIMIT 1;
语法: regexp_replace(string A, string B, string C)
返回值: string
-- Result: 'Ilove' (the match 'You' is replaced by the empty string).
SELECT regexp_replace("IloveYou", "You", "")
FROM students5
LIMIT 1;

-- Result: 'Ilovelili'. Reads from students5, the same table the other
-- examples use, so the example runs without an extra table.
SELECT regexp_replace("IloveYou", "You", "lili")
FROM students5
LIMIT 1;
# Start HiveServer2 in the background
$ nohup bin/hive --service hiveserver2 &
# Check whether HiveServer2 is running
$ ps -aux| grep hiveserver2
# Stop it; 20670 is an example PID, presumably taken from the ps output above; substitute your own
$ kill -9 20670
# Launch the Beeline client
$ bin/beeline
# Connect to Hive with the default account
beeline> !connect jdbc:hive2://localhost:10000 scott tiger
# Connect to Hive with the username/password from the configuration
beeline> !connect jdbc:hive2://localhost:10000 fonttian 123456
# Quit Beeline
beeline> !quit