CTAS – Create Table As Select
CREATE TABLE ctas_employee AS SELECT * FROM employee; -- 基于select查询的结果生成表
CTAS cannot create a partitioned, external, or bucketed table (不能生成分区表、外部表、桶表)
Create a table like another table (fast, copies structure only):
CREATE TABLE employee_like LIKE employee; -- 复制表的结构,不携带数据
-- Common Table Expression (CTE) combined with CTAS.
-- NOTE(review): r1 references r2, which is defined AFTER it in the WITH
-- clause. Hive resolves CTE names across the whole WITH list, so this is
-- accepted here — verify on your Hive version before relying on it.
-- The final query deliberately uses only r1 and r3 (r2 feeds r1).
CREATE TABLE cte_employee AS
WITH
r1 AS (SELECT name FROM r2 WHERE name = 'Michael'),
r2 AS (SELECT name FROM employee WHERE sex_age.sex= 'Male'),
r3 AS (SELECT name FROM employee WHERE sex_age.sex= 'Female')
SELECT * FROM r1 UNION ALL SELECT * FROM r3;
-- Temporary table: visible only to the current session and dropped
-- automatically when the session ends.
CREATE TEMPORARY TABLE tmp_table_name1 (c1 string);
-- Temporary tables also support CTAS and CREATE ... LIKE, for example:
-- CREATE TEMPORARY TABLE tmp_table_name2 AS SELECT ...;
-- CREATE TEMPORARY TABLE tmp_table_name3 LIKE employee;
-- Drop a database (fails if it still contains tables).
drop database if exists db_name;
-- Force-drop a database together with all of its tables (CASCADE).
drop database if exists db_name cascade;
-- Drop a table.
drop table if exists employee;
-- Empty a table.
truncate table employee;
-- Alternative way to empty a table: overwrite it with an empty result set.
insert overwrite table employee select * from employee where 1=0;
-- Drop partitions matching a range predicate.
alter table employee_table drop partition (stat_year_month>='2018-01');
-- "Delete" rows by condition: overwrite the table keeping only the rows
-- that should remain (here: id > '180203a15f').
insert overwrite table employee_table select * from employee_table where id>'180203a15f';
-- By default Hive does not support UPDATE/DELETE — only INSERT and SELECT.
-- Transactional (ACID) Hive tables do support UPDATE and DELETE.
-- Repoint a table at a different HDFS location (metadata-only change).
alter table employee_hr set LOCATION 'hdfs:///user/data_employee_hr';
To increase performance Hive has the capability to partition data (为了提高性能,Hive可对数据进行分区)
The values of partitioned column divide a table into segments (folders) (分区列的值将表分成段)
Entire partitions can be ignored at query time (查询时可以忽略整个分区)
Partitions have to be properly created by users. When inserting data must specify a partition (必须由用户正确创建分区。 插入数据时必须指定分区)
There is no difference in schema between “partition” columns and regular columns when using in query (在查询中使用时,“分区”列和常规列之间没有区别)
At query time, Hive will automatically filter out partitions not being used for better performance (在查询时,Hive将自动过滤掉未使用的分区以获得更好的性能)
静态分区:先把空的分区表创建好,然后再手动导入分区数据。
一级分区:
-- Single-level partitioned table: one partition column (year).
-- Partition columns are not stored in the data files; each partition
-- value maps to its own HDFS subdirectory.
CREATE TABLE dept_partition (
    deptno INT,
    dename STRING,
    loc    STRING
)
PARTITIONED BY (year STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
-- Loading data into a static partition:
-- 1. Add the partition manually.
alter table dept_partition add partition(year='2021');
-- 2. Load data into that partition.
load data local inpath '/root/hdp/hive_stage/dept.txt' into table dept_partition partition(year='2021');
-- Two-level partitioning: two partition columns (year, month).
-- NOTE(review): the complex-type parameters were garbled in the original
-- (angle brackets stripped by markup); restored to the usual employee
-- schema used elsewhere in these notes — confirm against the data file.
CREATE TABLE emp_partition_2 (
name string,
work_place ARRAY<string>,
sex_age STRUCT<sex:string, age:int>,
skills_score MAP<string, int>,
depart_title MAP<string, ARRAY<string>>
)
PARTITIONED BY (year INT, month INT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':';
-- Add a partition. (Fixed table name: the table created above is
-- emp_partition_2, not employee_partitioned.)
alter table emp_partition_2 add partition (year=2020,month=11);
-- List all partitions of the table.
show partitions emp_partition_2;
-- Drop a partition.
alter table emp_partition_2 drop if exists partition (year=2020,month=11);
-- Data files can also be moved directly into the partition's HDFS directory.
-- Static partitions are NOT created automatically: they must be
-- added/dropped with ALTER TABLE statements as above.
hive提供了一个严格模式,可以防止用户执行那些可能产生意想不到的不好的效果的查询。即某些查询在严格 模式下无法执行。通过设置hive.mapred.mode的值为strict,可以禁止三种类型的查询。
带有分区的表的查询如果在一个分区表执行SQL,除非where语句中包含分区字段过滤条件来显示数据范围,否则不允许执行。换句话说,就是用户不允许扫描所有的分区。进行这个限制的原因是,通常分区表都拥有非常大的数据集,而且数据增加迅速。如果没有进行分区限制的查询可能会消耗令人不可接受的巨大资源来处理这个表:
hive> SELECT DISTINCT(planner_id) FROM fracture_ins WHERE planner_id=5; FAILED: Error in semantic analysis: No Partition Predicate Found for Alias "fracture_ins" Table "fracture_ins"
如下这个语句在where语句中增加了一个分区过滤条件(也就是限制了表分区):SELECT DISTINCT(planner_id) FROM fracture_ins > WHERE planner_id=5 AND hit_date=20120101; … normal results …
带有order by的查询 对于使用了order by的查询,要求必须有limit语句。因为orderby为了执行排序过程会将所有的结果分发到同一个reducer中进行处理,强烈要求用户增加这个limit语句可以防止reducer额外执行很长一段时间:
hive> SELECT * FROM fracture_ins WHERE hit_date>2012 ORDER BY planner_id; FAILED: Error in semantic analysis: line 1:56 In strict mode, limit must be specified if ORDER BY is present planner_id
只需要增加limit语句就可以解决这个问题: hive> SELECT * FROM fracture_ins WHERE hit_date>2012 ORDER BY planner_id > LIMIT 100000; … normal results …
限制笛卡尔积的查询对关系型数据库非常了解的用户可能期望在执行join查询的时候不使用on语句而是使用where语句,这样关系数据库的执行优化器,就可以高效的将where语句转换成那个on语句。不幸的是,hive不会执行这种优化,因此,如果表足够大,那么这个查询就会 出现不可控的情况: hive> SELECT * FROM fracture_act JOIN fracture_ads : Error in semantic analysis: In strict mode, cartesian product is not allowed. If you really want to perform the operation, +set hive.mapred.mode=nonstrict+
下面这个才是正确的使用join和on语句的查询: hive> SELECT * FROM fracture_act JOIN fracture_ads > ON (fracture_act.planner_id = fracture_ads.planner_id);
Hive also supports dynamically giving partition values. This is useful when data volume is large and we don’t know what will be the partition values (Hive还支持动态提供分区值。 当数据量很大并且我们不知道分区值是什么时,这很有用)
By default, the user must specify at least one static partition column. This is to avoid accidentally overwriting partitions. To disable this restriction, we can set the partition mode from the default strict mode to nonstrict via `set hive.exec.dynamic.partition.mode=nonstrict;` (默认情况下,用户必须至少指定一个静态分区列。 这是为了避免意外覆盖分区。 要禁用此限制,可以将分区模式从默认的严格模式设置为非严格模式)
实验数据
-- Source table for the dynamic-partition example (plain text, '|'-delimited).
CREATE TABLE employee_hr (
    name        STRING,
    employee_id STRING,
    sin_number  STRING,
    start_date  STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
STORED AS TEXTFILE
LOCATION '/hive_stage/employee_hr';
动态分区表:
-- Target table for dynamic partitioning (partition values come from the
-- SELECT at insert time rather than being spelled out).
-- NOTE(review): the complex-type parameters were garbled in the original
-- (angle brackets stripped by markup); restored to the usual employee
-- schema used elsewhere in these notes — confirm against the source data.
CREATE TABLE emp_partition_dynamic (
name string,
work_place ARRAY<string>,
sex_age STRUCT<sex:string, age:int>,
skills_score MAP<string, int>,
depart_title MAP<string, ARRAY<string>>
)
PARTITIONED BY (year INT, month INT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':';
从实验数据表把数据导入动态分区表
-- Enable dynamic partitioning.
set hive.exec.dynamic.partition=true;
-- nonstrict: allow ALL partition columns to be dynamic; the default
-- strict mode requires at least one static partition column to guard
-- against accidentally overwriting partitions.
set hive.exec.dynamic.partition.mode=nonstrict;
-- Dynamic-partition data must be written with INSERT ... SELECT; the
-- trailing SELECT columns (year, month) feed the partition columns in
-- declaration order. (Fixed table name: the dynamic table created above
-- is emp_partition_dynamic, not employee_partitioned.)
insert into table emp_partition_dynamic partition(year,month)
select
name,
array('Toronto') as work_place,
named_struct("sex","Male","age",30) as sex_age,
map("Python",90) as skills_score,
map("R&D",array('Developer')) as depart_title,
year(start_date) as year,
month(start_date) as month
from employee_hr eh;
实验数据表
-- Source table for the bucketing example.
-- NOTE(review): the complex-type parameters were garbled in the original
-- (angle brackets stripped by markup); restored to the usual employee
-- schema used elsewhere in these notes — confirm against employee_id.txt.
create table if not exists employee_id (
name string,
employee_id int,
work_place array<string>,
sex_age struct<sex:string,age:int>,
skills_score map<string,int>,
depart_title map<string,array<string>>
)
row format delimited fields terminated by '|'
collection items terminated by ','
map keys terminated by ':';
-- Load (and overwrite) the table from a file already in HDFS.
load data inpath '/hive_stage/employee_id/employee_id.txt' overwrite into table employee_id;
-- Bucketed table: rows are hashed on employee_id into 2 buckets (files).
-- Make INSERT honor the declared bucket count (needed on Hive < 2.0;
-- removed in Hive 2.x where bucketing is always enforced).
set hive.enforce.bucketing = true;
-- NOTE(review): the complex-type parameters were garbled in the original
-- (angle brackets stripped by markup); restored to match employee_id.
create table if not exists employee_id_buckets (
name string,
employee_id int,
work_place array<string>,
sex_age struct<sex:string,age:int>,
skills_score map<string,int>,
depart_title map<string,array<string>>
)
clustered by (employee_id) into 2 buckets
row format delimited fields terminated by '|'
collection items terminated by ','
map keys terminated by ':';
-- Populate the buckets from the source table.
INSERT OVERWRITE TABLE employee_id_buckets
SELECT * FROM employee_id;
--Verify the buckets in the HDFS
查看更多大数据开发教程