-- Create the database only if it does not already exist.
CREATE DATABASE IF NOT EXISTS myhive;

-- Create a second database at an explicit HDFS location.
-- NOTE(review): named myhive2 because the statements below describe and drop
-- myhive2 and myhive is already created above - confirm the intended name.
CREATE DATABASE myhive2 LOCATION '/myhive2';

-- Attach a key/value property to a database.
ALTER DATABASE myhive SET DBPROPERTIES ('createtime'='20191201');

-- Show basic database information, then the extended form.
DESC DATABASE myhive2;
DESC DATABASE EXTENDED myhive2;

-- Drop a database. Without CASCADE this only succeeds when it holds no tables.
DROP DATABASE myhive2;

-- Alternative form: drop a database together with the tables it contains.
-- IF EXISTS keeps the script runnable when the statement above already removed it.
DROP DATABASE IF EXISTS myhive2 CASCADE;
;基本格式:
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] table_name
[(col_name data_type [COMMENT col_comment], ...)]
[COMMENT table_comment]
[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
[CLUSTERED BY (col_name, col_name, ...)
[SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
[ROW FORMAT row_format]
[STORED AS file_format]
[LOCATION hdfs_path]
字段参考:https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Types
创建完成后,表的数据会存放在HDFS的/user/hive/warehouse/你选择的数据库中
由hive-site.xml配置文件中的一个属性指定
hive.metastore.warehouse.dir
/user/hive/warehouse
如果在创建表时需要指定分隔符和存放位置,就要用到相应的关键字(ROW FORMAT DELIMITED FIELDS TERMINATED BY 和 LOCATION)。Hive的默认分隔符是\001(键盘无法直接输入,因为它是ASCII码中的非打印字符,这么做是为了避免分隔符与数据内容冲突)
-- Create a managed table with an explicit field delimiter and storage location.
CREATE TABLE IF NOT EXISTS stu2 (
    id   INT,
    name STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
LOCATION '/user/stu2';

-- Copy both the structure and the data of an existing table.
CREATE TABLE stu3 AS SELECT * FROM stu2;

-- Copy only the structure of an existing table, without its data.
CREATE TABLE stu4 LIKE stu2;

-- Show detailed table metadata, which includes the table type.
DESC FORMATTED stu2;
-- External, tab-delimited table of students.
CREATE EXTERNAL TABLE student (
    s_id    STRING,
    s_name  STRING,
    s_birth STRING,
    s_sex   STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t';

-- Append a file from the local filesystem to the table.
LOAD DATA LOCAL INPATH '/export/servers/hivedatas/student.csv'
INTO TABLE student;

-- Load the same local file again, this time replacing the existing contents.
LOAD DATA LOCAL INPATH '/export/servers/hivedatas/student.csv'
OVERWRITE INTO TABLE student;

-- Load from a non-local path (no LOCAL keyword).
LOAD DATA INPATH '/hivedatas/teacher.csv'
INTO TABLE teacher;
如果删掉student外部表,HDFS的数据仍然存在,并且重新创建表之后,表中就直接存在数据了,
-- Tab-delimited score table partitioned by a single month column.
CREATE TABLE score (
    s_id    STRING,
    c_id    STRING,
    s_score INT
)
PARTITIONED BY (month STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t';
外部分区表:
-- External, tab-delimited score table partitioned by month.
-- PARTITIONED BY declares the partition column and its type. A concrete
-- partition value such as '201806' is supplied later, when data is loaded
-- or a partition is added.
CREATE EXTERNAL TABLE score4 (
    s_id    STRING,
    c_id    STRING,
    s_score INT
)
PARTITIONED BY (month STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t';
-- Score table with a three-level partition scheme (year, month, day).
CREATE TABLE score2 (
    s_id    STRING,
    c_id    STRING,
    s_score INT
)
PARTITIONED BY (year STRING, month STRING, day STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t';

-- Load a local file into a single-level partition.
LOAD DATA LOCAL INPATH '/export/servers/hivedatas/score.csv'
INTO TABLE score PARTITION (month='201806');

-- Load a local file into a multi-level partition.
LOAD DATA LOCAL INPATH '/export/servers/hivedatas/score.csv'
INTO TABLE score2 PARTITION (year='2018', month='06', day='01');

-- List the partitions of a table.
SHOW PARTITIONS score;

-- Add one partition.
ALTER TABLE score ADD PARTITION (month='201805');

-- Add several partitions in one statement.
ALTER TABLE score ADD PARTITION (month='201804') PARTITION (month='201803');

-- Remove a partition.
ALTER TABLE score DROP PARTITION (month='201806');
-- Tab-delimited course table, clustered on c_id into 3 buckets.
CREATE TABLE course (
    c_id   STRING,
    c_name STRING,
    t_id   STRING
)
CLUSTERED BY (c_id) INTO 3 BUCKETS
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t';

-- Fill the bucketed table from course_common, clustering the rows on the
-- same column the table is bucketed by.
INSERT OVERWRITE TABLE course
SELECT * FROM course_common CLUSTER BY (c_id);

-- Rename a table.
ALTER TABLE score4 RENAME TO score5;

-- Append new columns to a table.
ALTER TABLE score5 ADD COLUMNS (mycol STRING, mysco STRING);

-- Rename a column and change its type. This targets score5 because that is
-- the table the mysco column was added to.
ALTER TABLE score5 CHANGE COLUMN mysco mysconew INT;
-- Insert a literal row directly into one partition of a partitioned table
-- (this approach is not recommended).
INSERT INTO TABLE score3 PARTITION(month='201807') VALUES('001','002','003');

-- Inserting data through a query:
-- (1) first load the source data into a partition
LOAD DATA LOCAL INPATH '/export/servers/hivedatas/score.csv' OVERWRITE INTO TABLE score PARTITION(month='201806');

-- (2) then write the query result into a partition
-- NOTE(review): the original notes state that the OVERWRITE keyword is
-- mandatory here - Hive also accepts INSERT INTO ... SELECT, confirm the intent.
INSERT OVERWRITE TABLE score PARTITION(month='201806') SELECT s_id,c_id,s_score FROM score;
多插入模式:
-- Multi-insert form: one FROM clause feeds several INSERT ... SELECT branches.
FROM score
INSERT OVERWRITE TABLE score_first PARTITION(month='201806')
    SELECT s_id, c_id, s_score
INSERT OVERWRITE TABLE score_second PARTITION(month='201806')
    SELECT s_id, c_id, s_score;
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_reference
[WHERE where_condition]
[GROUP BY col_list [HAVING condition]]
[CLUSTER BY col_list | [DISTRIBUTE BY col_list] [SORT BY| ORDER BY col_list]]
[LIMIT number]
-- Select every column of the table.
SELECT * FROM score;

-- Select specific columns.
SELECT s_id, c_id FROM score;

-- Give a column an alias.
SELECT s_id AS studentID, c_id FROM score;

-- Total row count.
SELECT COUNT(1) FROM score;

-- Maximum value of a column.
SELECT MAX(s_score) FROM score;

-- Minimum value of a column.
SELECT MIN(s_score) FROM score;

-- Sum of a column.
SELECT SUM(s_score) FROM score;

-- Average of a column.
SELECT AVG(s_score) FROM score;

-- Limit the number of rows returned.
SELECT * FROM score LIMIT 3;

-- Rows whose s_score starts with 8.
SELECT * FROM score WHERE s_score LIKE '8%';

-- Rows whose s_score has 9 as its second character.
SELECT * FROM score WHERE s_score LIKE '_9%';

-- Rows whose s_score contains a 9 (regular-expression match).
SELECT * FROM score WHERE s_score RLIKE '[9]';

-- Average score for each s_id.
SELECT s_id, AVG(s_score) FROM score GROUP BY s_id;
按照s_id进行分组,并求出每个学生的平均分数。==(SELECT后面的非聚合字段必须出现在GROUP BY后面,也就是说SELECT后面的非聚合字段不能比GROUP BY后面的字段多)==
HAVING和WHERE的区别:WHERE针对表中的行进行过滤,在分组之前生效,后面不能使用聚合函数;HAVING针对分组后的结果进行过滤,后面可以使用聚合函数。
-- Keep only the groups whose average score is above 85.
SELECT
    s_id,
    AVG(s_score) AS avgScore
FROM score
GROUP BY s_id
HAVING avgScore > 85;
(Hive 2.2.0之前的版本只支持等值连接,不支持非等值连接)
==INNER JOIN(内连接):==只有进行连接的两个表中都存在与连接条件相匹配的数据才会被保留下来。
-- Inner join of teacher and course on t_id.
SELECT *
FROM teacher AS t
INNER JOIN course AS c
    ON t.t_id = c.t_id;
LEFT JOIN(左外连接): JOIN操作符左边表中符合WHERE子句的所有记录将会被返回。
-- Left outer join: teacher is the preserved (left) side.
SELECT *
FROM teacher AS t
LEFT JOIN course AS c
    ON t.t_id = c.t_id;
RIGHT JOIN(右外连接): JOIN操作符右边表中符合WHERE子句的所有记录将会被返回。
-- Right outer join: course is the preserved (right) side.
SELECT *
FROM teacher AS t
RIGHT JOIN course AS c
    ON t.t_id = c.t_id;
FULL JOIN(满外连接): 将会返回所有表中符合WHERE语句条件的所有记录。如果任一表的指定字段没有符合条件的值的话,那么就使用NULL值替代。
-- Full outer join of teacher and course on t_id.
SELECT *
FROM teacher AS t
FULL JOIN course AS c
    ON t.t_id = c.t_id;
多表连接: 连接 n个表,至少需要n-1个连接条件。
-- Chain of left joins across four tables, using three join conditions.
SELECT *
FROM teacher AS t
LEFT JOIN course AS c
    ON t.t_id = c.t_id
LEFT JOIN score AS s
    ON s.c_id = c.c_id
LEFT JOIN student AS stu
    ON s.s_id = stu.s_id;
-- Sort the joined rows by score with ORDER BY.
-- The direction is either DESC or ASC (ASC is the default when omitted).
-- Only one of the two may be written, and DESC is used here.
SELECT *
FROM student AS stu
LEFT JOIN score AS s
    ON stu.s_id = s.s_id
ORDER BY s.s_score DESC;
多个列
-- Sort on several keys: first s_id, then the computed average.
SELECT
    s_id,
    AVG(s_score) AS avgScore
FROM score
GROUP BY s_id
ORDER BY s_id, avgScore;
只启动一个reduce
每个MapReduce内部进行排序,对全局结果集来说不是排序。
-- Set the number of reducers. The comment sits on its own line because any
-- text after the value on a SET line would become part of the value.
SET mapreduce.job.reduces=3;

-- Sort rows by score with SORT BY.
SELECT * FROM score SORT BY s_score;

-- Write the SORT BY result to a local directory.
INSERT OVERWRITE LOCAL DIRECTORY '/export/servers/hivedatas/sort'
SELECT * FROM score SORT BY s_score;
类似MR中partition,进行分区,结合sort by使用,并且要将DISTRIBUTE BY的语句写在SORT BY之前。
(对于distribute by进行测试,一定要分配多reduce进行处理,否则无法看到distribute by的效果)
先按照学生id进行分区,再按照学生成绩进行排序
-- First set the number of reducers to 7.
SET mapreduce.job.reduces=7;

-- Partition the rows with DISTRIBUTE BY (student id), then order them with
-- SORT BY (score), and write the result to a local directory.
INSERT OVERWRITE LOCAL DIRECTORY '/export/servers/hivedatas/sort'
SELECT *
FROM score
DISTRIBUTE BY s_id
SORT BY s_score;
当DISTRIBUTE BY和SORT BY字段相同时,可以使用CLUSTER BY方式。
CLUSTER BY除了具有DISTRIBUTE BY的功能外还兼具SORT BY的功能。但是排序只能是升序排序,不能指定排序规则为ASC或者DESC。
-- CLUSTER BY on a single column ...
SELECT * FROM score CLUSTER BY s_id;

-- ... is equivalent to DISTRIBUTE BY plus SORT BY on that same column.
SELECT * FROM score DISTRIBUTE BY s_id SORT BY s_id;
格式:hive [-hiveconf x=y] * [<-i filename>]* [<-f filename>|<-e query-string>] [-S]
参数 | 功能 |
---|---|
-i | 从文件初始化HQL |
-e | 从命令行执行指定的HQL |
-f | 执行HQL脚本 |
-v | 输出执行的HQL语句到控制台 |
-hiveconf | 设置hive运行时候的参数配置 |
配置文件:Hive的配置文件包括用户自定义配置文件($HIVE_CONF_DIR/hive-site.xml)和默认配置文件($HIVE_CONF_DIR/hive-default.xml),用户自定义配置会覆盖默认配置。另外,Hive也会读入Hadoop的配置,因为Hive是作为Hadoop的客户端启动的,Hive的配置会覆盖Hadoop的配置。配置文件的设定对本机启动的所有Hive进程都有效。
命令行参数:启动Hive时,可以在命令行添加 -hiveconf param=value 来设定参数。例如 bin/hive -hiveconf hive.root.logger=INFO,console。命令行参数只对本次启动的Session(对于Server方式启动,则是所有请求的Sessions)有效。
参数声明:可以在HQL中使用SET关键字设定参数,例如 set mapred.reduce.tasks=100;。这一设定的作用域也是session级的。
优先级:参数声明 > 命令行参数 > 配置文件参数(hive)
参考文档:https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF
-- List the built-in functions.
SHOW FUNCTIONS;

-- Show the usage of a built-in function.
DESC FUNCTION UPPER;

-- Show more detailed usage of a built-in function.
DESC FUNCTION EXTENDED UPPER;
官方文档: https://cwiki.apache.org/confluence/display/Hive/HivePlugins
当Hive提供的内置函数无法满足你的业务处理需要时,此时就可以考虑使用用户自定义函数.
自定义函数可以分为三种:
UDF(User Defined Function)
:一进一出类型的,比如大小写转换
UDAF(User Defined Aggregation Function)
:聚集函数,多进一出,比如求最值和总数等;
UDTF(User Defined Table-Generating Function)
:一进多出,比如lateral view explode()
<!-- Cloudera artifact repository (presumably needed for the -cdh versioned artifacts below). -->
<repositories>
    <repository>
        <id>cloudera</id>
        <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
</repositories>

<!-- Hadoop and Hive libraries the UDF code compiles against. -->
<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.6.0-cdh5.14.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-exec</artifactId>
        <version>1.1.0-cdh5.14.0</version>
    </dependency>
</dependencies>
<build>
    <plugins>
        <!-- Compile as Java 1.8 with UTF-8 source encoding. -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.0</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
                <encoding>UTF-8</encoding>
            </configuration>
        </plugin>
        <!-- Build a shaded jar during the package phase. -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>2.2</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                    <configuration>
                        <filters>
                            <filter>
                                <artifact>*:*</artifact>
                                <!-- Exclude signature files from every artifact.
                                     Each pattern needs its own exclude element. -->
                                <excludes>
                                    <exclude>META-INF/*.SF</exclude>
                                    <exclude>META-INF/*.DSA</exclude>
                                    <exclude>META-INF/*.RSA</exclude>
                                </excludes>
                            </filter>
                        </filters>
                    </configuration>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
package cn.itcast.udf;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
/**
 * Hive UDF that converts a text value to upper case.
 */
public class ItcastUDF extends UDF {
    /**
     * Upper-cases the given text.
     *
     * @param s the input value, may be {@code null}
     * @return a new {@link Text} holding the upper-cased string, or
     *         {@code null} when {@code s} is {@code null}
     */
    public Text evaluate(final Text s) {
        if (s == null) {
            return null;
        }
        // Locale.ROOT keeps the result independent of the JVM default locale
        // (for example, a Turkish locale maps 'i' to a dotted capital I).
        return new Text(s.toString().toUpperCase(java.util.Locale.ROOT));
    }
}
mv original-day06_hive-1.0-SNAPSHOT.jar udf.jar
-- Make the packaged UDF jar available to Hive.
ADD JAR /export/servers/hive-1.1.0-cdh5.14.0/lib/udf.jar;

-- Bind a temporary function name to the UDF class.
CREATE TEMPORARY FUNCTION touppercase AS 'cn.itcast.udf.ItcastUDF';

-- Call the new function.
SELECT touppercase('abc');
开启map输出阶段压缩可以减少job中MapTask和ReduceTask间数据传输量。
-- Compress intermediate and map output, using the Snappy codec.
SET hive.exec.compress.intermediate=true;
SET mapreduce.map.output.compress=true;
SET mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;

-- Compress the final output, using the Snappy codec and BLOCK compression type.
SET hive.exec.compress.output=true;
SET mapreduce.output.fileoutputformat.compress=true;
SET mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
SET mapreduce.output.fileoutputformat.compress.type=BLOCK;
Hive支持的存储数据的格式主要有:TEXTFILE(行式存储)、SEQUENCEFILE(行式存储)、ORC(列式存储)、PARQUET(列式存储)。
一般拿到原始数据都是TextFile格式,需要经过分析后,通过
INSERT OVERWRITE TABLE tbl_name SELECT * FROM tbl_name2;
将分析出的结果导入到另一张临时表中就可以使用Parquet或者orc这些列式存储格式了。
注意: 存储格式和压缩方式没有关系,但是实际工作当中存储格式一般都会和压缩方式一起使用
orc存储格式本身自带一个叫zlib的压缩方式,就算orc去除掉zlib压缩,它因为列式存储的优势,还是可以将原有的数据变小。
虽然用zlib压缩的文件大小更小,但是压缩和解压都比较耗时,实际工作中一般都是将orc的存储格式和snappy的压缩方式一起用。