cd /export/servers/apache-hive-2.1.1-bin/
bin/hive
create database if not exists mytest;
Execute Hive HQL statements directly without entering the Hive client:
cd /export/servers/apache-hive-2.1.1-bin
bin/hive -e 'create database if not exists mytest;'
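hive -e also combines well with shell redirection when you want to capture query output in a local file; a small sketch (the output path is illustrative):
bin/hive -e 'show databases;' > /export/servers/databases.txt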
Alternatively, write the HQL statements into a SQL script and execute that:
cd /export/servers
vim hive.sql
create database if not exists mytest;
use mytest;
create table stu(id int, name string);
Execute the SQL script with hive -f:
bin/hive -f /export/servers/hive.sql
create database if not exists myhive;
use myhive;
Note: where Hive stores table data is controlled by a property in hive-site.xml:
hive.metastore.warehouse.dir
/user/hive/warehouse
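From the Hive CLI, the current value can be checked with set (given a property name and no value, set prints the property's current setting):
set hive.metastore.warehouse.dir;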
Create a database at a specified location:
create database myhive2 location '/myhive2';
A database can carry descriptive key-value metadata, added at creation time:
create database foo with dbproperties('owner' = 'itcast', 'date' = '20190120');
View the database's key-value metadata (foo is the database name):
describe database extended foo;
Modify the database's key-value metadata:
alter database foo set dbproperties('owner' = 'itheima');
View more detailed database information:
desc database extended foo;
Drop an empty database; if it still contains tables, an error is raised:
drop database myhive2;
Force-drop a database, deleting its tables along with it:
drop database myhive cascade;
create [external] table [if not exists] table_name(
col_name data_type [comment 'column comment'],
col_name data_type [comment 'column comment']
)
[comment 'table comment']
[partitioned by (col_name data_type, ...)]
[clustered by (col_name, col_name, ...)]
[sorted by (col_name [asc|desc], ...) into num_buckets buckets]
[row format row_format]
[stored as file_format]
[location 'table path']
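As a sketch of how these optional clauses combine (the table name, columns, and location are made up for illustration):
create external table if not exists access_log(
    ip string comment 'client ip address',
    url string comment 'requested url'
)
comment 'raw web access logs'
partitioned by (day string)
row format delimited fields terminated by '\t'
stored as textfile
location '/access_log';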
Notes: all bracketed clauses are optional; the examples below cover the most common ones.
Getting started with table creation:
use myhive;
create table stu(id int, name string);
insert into stu values(1, "zhangsan"); -- insert data
select * from stu;
Create a table and specify the field delimiter:
create table if not exists stu2(id int, name string) row format delimited fields terminated by '\t';
Create a table and specify where the table files are stored:
create table if not exists stu2(id int, name string) row format delimited fields terminated by '\t' location '/user/stu2';
Create a table from a query result:
create table stu3 as select * from stu2; -- copies both the schema and the data
Create a table matching an existing table's schema (structure only, no data):
create table stu4 like stu;
View detailed table information:
desc formatted stu2;
Drop a table:
drop table stu2;
Because an external table points at data in an HDFS path outside Hive's control, Hive does not treat itself as the sole owner of that data; when the Hive table is dropped, the data stays in HDFS and is not deleted.
Typical scenario: website logs collected each day flow into HDFS text files; heavy statistical analysis is done on top of external tables (the raw log tables), while intermediate and result tables are stored as internal (managed) tables and populated via SELECT + INSERT.
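A minimal sketch of the difference (table names and the external location are made up; the managed path assumes the default warehouse directory):
create table managed_t(id int);                              -- internal (managed) table
create external table external_t(id int) location '/ext_t'; -- external table
drop table managed_t;    -- its data under /user/hive/warehouse is deleted
drop table external_t;   -- only the metadata is dropped; data under /ext_t remains in HDFS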
Create external teacher and student tables and load data into them:
create external table teacher(t_id string, t_name string) row format delimited fields terminated by '\t';
Create the student table:
create external table student(s_id string, s_name string, s_birth string) row format delimited fields terminated by '\t';
Load data:
load data local inpath '/export/servers/hivedatas/student.csv' into table student;
Load data and overwrite the existing data:
load data local inpath '/export/servers/hivedatas/student.csv' overwrite into table student;
Load data into a table from HDFS (upload the data to HDFS first; note that loading from an HDFS path moves the file into the table's directory rather than copying it):
cd /export/servers/hivedatas
hdfs dfs -mkdir -p /hivedatas
hdfs dfs -put teacher.csv /hivedatas/
load data inpath '/hivedatas/teacher.csv' into table teacher;
One of the most common ideas in big data is divide and conquer: split a large file into many small ones, so that each operation only has to touch a small file. Hive supports the same idea: a large dataset can be split by month or by day into small files stored in separate directories. Create a partitioned table with a single partition column:
create table score(s_id string, c_id string, s_score int) partitioned by(month string) row format delimited fields terminated by '\t';
Create a table with multiple partition columns:
create table score2(s_id string, c_id string, s_score int) partitioned by(year string, month string, day string) row format delimited fields terminated by '\t';
Load data into a partition of the table:
load data local inpath '/export/servers/hivedatas/score.csv' into table score partition(month='201806');
Query multiple partitions together (using union all):
select * from score where month = '201806' union all select * from score where month = '201805';
View partitions:
show partitions score; -- score is the table name
Add a partition:
alter table score add partition(month = '201805');
Drop a partition:
alter table score drop partition(month = '201806');
Repair the table (rebuild the mapping between the table and its data files):
msck repair table score4;
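For context, a sketch of the typical scenario (score4, its schema, and the /scoredatas path are assumptions modeled on the score table above): partition directories are created and filled directly in HDFS, so the metastore does not know about them until the table is repaired:
create external table score4(s_id string, c_id string, s_score int) partitioned by (month string) row format delimited fields terminated by '\t' location '/scoredatas';
hdfs dfs -mkdir -p /scoredatas/month=201806
hdfs dfs -put score.csv /scoredatas/month=201806/
msck repair table score4;  -- the metastore now sees partition month=201806
show partitions score4;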
Bucketing divides a table's data into multiple files by a specified column; bucketing in Hive corresponds to partitioning in MapReduce.
set hive.enforce.bucketing=true;
set mapreduce.job.reduces=3;
Create a bucketed table:
create table course(c_id string, c_name string,t_id string) clustered by(c_id) into 3 buckets row format delimited fields terminated by '\t';
Loading data into a bucketed table: neither hdfs dfs -put nor load data populates the buckets correctly, so the data can only be loaded via insert overwrite.
Create a regular table, then load its data into the bucketed table with insert overwrite plus a query:
create table course_common(c_id string, c_name string, t_id string) row format delimited fields terminated by '\t';
Load data into the regular table:
load data local inpath '/export/servers/hivedatas/course.csv' into table course_common;
Load the bucketed table with insert overwrite:
insert overwrite table course select * from course_common cluster by(c_id);
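To verify the bucketing, list the table's directory (the path below assumes the default warehouse location and a database named myhive); with 3 buckets clustered by c_id there should be three files, one per bucket:
hdfs dfs -ls /user/hive/warehouse/myhive.db/course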
Rename a table:
alter table old_table_name rename to new_table_name;
Rename table score4 to score5:
alter table score4 rename to score5;
desc score5;
Add columns:
alter table score5 add columns(mycol string, mysco int);
Change a column's name or type:
alter table score5 change column mysco mysconew int;
drop table score5;
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_reference
[WHERE where_condition]
[GROUP BY col_list [HAVING condition]]
[ORDER BY col_list]
[CLUSTER BY col_list | [DISTRIBUTE BY col_list] [SORT BY col_list]]
[LIMIT number]
Therefore, when the distribute and sort columns are the same, cluster by = distribute by + sort by.
select * from score; -- score is the table name
select s_id, c_id from score; -- score is the table name
Column aliases: (1) rename a column; (2) convenient for calculations; (3) placed immediately after the column name, optionally with the keyword AS in between.
select s_id as myid, c_id from score; -- s_id and c_id are column names, score is the table name
select count(1) from score;
select max(s_score) from score;
select min(s_score) from score;
select sum(s_score) from score;
select avg(s_score) from score;
A typical query returns many rows; the LIMIT clause restricts the number of rows returned.
select * from score limit 3;
Query rows with a score greater than 60:
select * from score where s_score > 60;
Query rows with a score equal to 80:
select * from score where s_score = 80;
Query rows with a score between 80 and 100:
select * from score where s_score between 80 and 100;
Query rows where the score is null:
select * from score where s_score is null;
Query rows where the score is 80 or 90:
select * from score where s_score in(80, 90);
% matches zero or more characters (any number of characters)
_ matches exactly one character
1. Find all rows where the score starts with 8:
select * from score where s_score like '8%';
2. Find all rows where the second digit of the score is 9:
select * from score where s_score like '_9%';
3. Find rows whose s_id contains a 1:
select * from score where s_id rlike '[1]'; -- equivalent to like '%1%'
Operator | Meaning |
---|---|
AND | logical AND |
OR | logical OR |
NOT | logical NOT |
Query rows where the score is greater than 80 and s_id is 01:
select * from score where s_score > 80 and s_id = '01';
Query rows where the score is greater than 80 or s_id is 01:
select * from score where s_score > 80 or s_id = '01';
Query students whose s_id is neither 01 nor 02:
select * from score where s_id not in ('01', '02');
The GROUP BY statement
GROUP BY is usually used together with aggregate functions: results are grouped by one or more columns, and an aggregate operation is performed on each group.
Compute each student's average score:
select s_id, avg(s_score) from score group by s_id;
Compute each student's highest score:
select s_id, max(s_score) from score group by s_id;
Differences between HAVING and WHERE: WHERE filters individual rows and cannot use aggregate functions, while HAVING filters grouped results and can use aggregate functions; HAVING can only appear after GROUP BY.
Compute each student's average score:
select s_id, avg(s_score) from score group by s_id;
Find students whose average score is greater than 85:
select s_id, avg(s_score) avgscore from score group by s_id having avgscore > 85;
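For contrast, the same filter cannot be written with WHERE, since WHERE runs before grouping and cannot reference aggregate results; a query like the following is rejected:
select s_id, avg(s_score) from score where avg(s_score) > 85 group by s_id; -- invalid: aggregates are not allowed in WHERE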
Hive supports the usual SQL JOIN statements, but only equi-joins; non-equi-joins are not supported.
Query the name corresponding to each score:
select s.s_id, s.s_score, stu.s_name, stu.s_birth from score s join student stu on s.s_id = stu.s_id;
Benefits of table aliases: (1) they simplify the query; (2) prefixing columns with the alias makes it clear which table each column comes from.
Join the teacher and course tables:
select * from teacher t join course c on t.t_id = c.t_id;
Inner join: only rows for which the join condition matches in both tables are kept.
select * from teacher t inner join course c on t.t_id = c.t_id;
Left outer join: all records from the table on the left of the JOIN operator that satisfy the WHERE clause are returned. Query each teacher's courses:
select * from teacher t left join course c on t.t_id = c.t_id;
Right outer join: all records from the table on the right of the JOIN operator that satisfy the WHERE clause are returned.
select * from teacher t right join course c on t.t_id = c.t_id;
Note: joining n tables requires at least n-1 join conditions. For example, joining three tables requires at least two join conditions.
select * from teacher t
left join course c
on t.t_id = c.t_id
left join score s
on s.c_id = c.c_id
left join student stu
on s.s_id = stu.s_id;
In most cases, Hive launches one MapReduce job for each pair of JOIN objects.
Order By: global ordering, using a single reducer.
Query students' scores, sorted by score in descending order:
SELECT * FROM student s left join score sco on s.s_id = sco.s_id order by sco.s_score DESC;
Query students' scores, sorted by score in ascending order:
SELECT * FROM student s left join score sco on s.s_id = sco.s_id order by sco.s_score ASC;
Sort by the average score:
select s_id, avg(s_score) avg from score group by s_id order by avg;
Sort by student id and average score:
select s_id, avg(s_score) avg from score group by s_id order by s_id, avg;
Sort By: sorts within each reducer; the global result set is not sorted.
1. Set the number of reducers:
set mapreduce.job.reduces = 3;
2. Check the configured reducer count:
set mapreduce.job.reduces;
3. Query scores sorted by score in descending order:
select * from score sort by s_score desc;
4. Write the query result to a file (sorted by score in descending order):
insert overwrite local directory '/export/servers/hivedatas/sort' select * from score sort by s_score desc;
Distribute By: analogous to partition in MR; partitions the data, and is used together with sort by.
Note: Hive requires the DISTRIBUTE BY clause to come before SORT BY.
When testing distribute by, be sure to allocate multiple reducers; otherwise the effect of distribute by is not visible.
1. Set the number of reducers, so that rows are divided among reducers by s_id:
set mapreduce.job.reduces = 7;
2. Partition the data with distribute by:
insert overwrite local directory '/export/servers/hivedatas/sort' select * from score distribute by s_id sort by s_score;
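With 7 reducers, the output directory should contain one file per reducer, and all rows sharing an s_id land in the same file; this can be checked locally:
ls /export/servers/hivedatas/sort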
When the distribute by and sort by columns are the same, cluster by can be used instead.
cluster by combines the functionality of distribute by with that of sort by, but the sort order is fixed to ascending; ASC or DESC cannot be specified.
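In other words, the following two queries are equivalent:
select * from score cluster by s_id;
select * from score distribute by s_id sort by s_id;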