create table t_archer(
id int comment "ID",
name string comment "hero name",
hp_max int comment "max HP",
mp_max int comment "max MP",
attack_max int comment "max physical attack",
defense_max int comment "max physical defense",
attack_range string comment "attack range",
role_main string comment "primary role",
role_assist string comment "secondary role"
) comment "Honor of Kings archer info"
row format delimited fields terminated by "\t";
hadoop fs -put archer.txt /user/hive/warehouse/test.db/t_archer
select * from t_archer;
create table t_hot_hero_skin_price(
id int,
name string,
win_rate int,
skin_price map<string,int>
) row format delimited
fields terminated by ',' --delimiter between fields
collection items terminated by '-' --delimiter between collection items
map keys terminated by ':'; --delimiter between map keys and values
hadoop fs -put hot_hero_skin_price.txt /user/hive/warehouse/test.db/t_hot_hero_skin_price
select * from t_hot_hero_skin_price;
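The map column can be read by key or inspected with Hive's map functions; a quick sketch (the key 'classic' is a hypothetical skin name, not necessarily one present in the data file):
select name, skin_price['classic'] from t_hot_hero_skin_price; -- look up the price for one key
select name, size(skin_price), map_keys(skin_price) from t_hot_hero_skin_price; -- entry count and all keys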
create table t_team_ace_player(
id int,
team_name string,
ace_player_name string
); --no row format clause is specified, so the default \001 is used as the field delimiter
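Because \001 is a non-printing control character, a data file for this table has to be generated with that byte between the fields; for example with printf and octal escapes (the row values here are hypothetical):
printf '1\001Wolves\001SomePlayer\n' > team_ace_player.txt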
hadoop fs -put team_ace_player.txt /user/hive/warehouse/test.db/t_team_ace_player
select * from t_team_ace_player;
create table t_team_ace_player_location(
id int,
team_name string,
ace_player_name string
)
location '/data'; --use the location keyword to specify where this table's data is stored on HDFS
hadoop fs -put team_ace_player.txt /data
select * from t_team_ace_player_location;
Note: a partition field cannot be a column that already exists in the table, because the partition field also ends up displayed in the table schema, as a virtual column.
--note the syntax rules for creating a partitioned table
--create the partitioned table
create table t_all_hero_part(
id int,
name string,
hp_max int,
mp_max int,
attack_max int,
defense_max int,
attack_range string,
role_main string,
role_assist string
) partitioned by (role string) --note: this is the partition field
row format delimited fields terminated by "\t";
--two-level partitioned table, partitioned by province and city
--partition fields form a hierarchy, so pay attention to their order: which comes first and which second
create table t_user_province_city (id int, name string, age int)
partitioned by (province string, city string);
Static partition loading
load data local inpath '/root/data/all_hero/archer.txt' into table t_all_hero_part partition(role='sheshou');
load data local inpath '/root/data/all_hero/assassin.txt' into table t_all_hero_part partition(role='cike');
load data local inpath '/root/data/all_hero/mage.txt' into table t_all_hero_part partition(role='fashi');
load data local inpath '/root/data/all_hero/support.txt' into table t_all_hero_part partition(role='fuzhu');
load data local inpath '/root/data/all_hero/tank.txt' into table t_all_hero_part partition(role='tanke');
load data local inpath '/root/data/all_hero/warrior.txt' into table t_all_hero_part partition(role='zhanshi');
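As the note above says, the partition field shows up in the schema as a virtual column and can be filtered on directly; with the static partitions loaded, a quick check:
describe t_all_hero_part; -- role is listed under "# Partition Information"
select name, role from t_all_hero_part where role = 'sheshou'; -- scans only the sheshou partition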
Dynamic partition loading
set hive.exec.dynamic.partition=true; --enable dynamic partitioning
set hive.exec.dynamic.partition.mode=nonstrict; --nonstrict mode: all partition fields may be dynamic
--create a new partitioned table t_all_hero_part_dynamic
create table t_all_hero_part_dynamic(
id int,
name string,
hp_max int,
mp_max int,
attack_max int,
defense_max int,
attack_range string,
role_main string,
role_assist string
) partitioned by (role string)
row format delimited
fields terminated by "\t";
insert into table t_all_hero_part_dynamic partition(role) --note: the partition value is not hard-coded here
select tmp.*,tmp.role_main from t_all_hero tmp;
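To verify which partitions the dynamic insert created:
show partitions t_all_hero_part_dynamic;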
--loading data into the two-level partitioned table: static partition loading
load data local inpath '/root/hivedata/user.txt' into table t_user_province_city
partition(province='zhejiang',city='hangzhou');
load data local inpath '/root/hivedata/user.txt' into table t_user_province_city
partition(province='zhejiang',city='ningbo');
load data local inpath '/root/hivedata/user.txt' into table t_user_province_city
partition(province='shanghai',city='pudong');
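On HDFS each partition level becomes a nested directory, which is why the order of the partition fields matters; the three loads above produce a layout roughly like this (the exact warehouse path depends on your configuration):
hadoop fs -ls -R /user/hive/warehouse/test.db/t_user_province_city
t_user_province_city/province=shanghai/city=pudong/user.txt
t_user_province_city/province=zhejiang/city=hangzhou/user.txt
t_user_province_city/province=zhejiang/city=ningbo/user.txt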
--querying the two-level partitioned table: filter on the partition fields to avoid a full table scan and speed up the query
select * from t_user_province_city where province= "zhejiang" and city ="hangzhou";
CREATE TABLE t_usa_covid19_bucket(
count_date string,
county string,
state string,
fips int,
cases int,
deaths int
)
CLUSTERED BY(state) INTO 5 BUCKETS; --the bucketing field must be a column that already exists in the table
CREATE TABLE t_usa_covid19(
count_date string,
county string,
state string,
fips int,
cases int,
deaths int
)
row format delimited fields terminated by ",";
hadoop fs -put us-covid19-counties.dat /user/hive/warehouse/test.db/t_usa_covid19
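The bucketed table itself is populated with an insert ... select so that Hive hashes every row into its bucket file; loading raw files into it directly would not distribute the rows into buckets:
insert into t_usa_covid19_bucket select * from t_usa_covid19;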
--query the data from the state of New York via the bucketing field state
--no full-table scan and filter is needed
--the bucketing rule hash_function(New York) mod 5 yields the bucket number
--only that bucket needs to be read to find the result: a bucket scan instead of a full table scan
select *
from t_usa_covid19_bucket where state="New York";
Ordinary query time: 181 ms
Query on the bucketing field: 175 ms
Filtering on the bucketing field state needs no full-table scan: the bucketing rule hash_function(New York) mod 5 yields the bucket number, and reading just that bucket finds the result, so this is a bucket scan rather than a full table scan.
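You can preview which bucket a given value lands in with the built-in hash() and pmod() functions (an illustrative approximation of the bucketing rule, not Hive's internal code path):
select pmod(hash('New York'), 5); -- bucket number that state = 'New York' hashes into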
When the data volume is extremely large and processing the full dataset is impractical, sampling becomes especially important. A sample lets you estimate and infer the characteristics of the whole, and it is an economical, effective method widely used in scientific experiments, quality inspection, and social surveys.
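Bucketing makes sampling cheap, because TABLESAMPLE can read a single bucket instead of the whole table; a minimal sketch against the bucketed table above:
select * from t_usa_covid19_bucket tablesample(bucket 1 out of 5 on state); -- reads 1 of the 5 existing buckets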
Creating a transactional table requires several things: the config parameters enabled, a bucketed table, ORC storage format, and the transactional table property.
Enable the transaction configuration (set it with set for the current session, or put it in hive-site.xml):
set hive.support.concurrency = true; --whether Hive supports concurrency
set hive.enforce.bucketing = true; --enable bucketing; no longer needed since Hive 2.0
set hive.exec.dynamic.partition.mode = nonstrict; --dynamic partition mode: nonstrict
set hive.txn.manager = org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; --use the DbTxnManager transaction manager
set hive.compactor.initiator.on = true; --whether to run the compaction initiator and cleaner threads on this metastore instance
set hive.compactor.worker.threads = 1; --how many compactor worker threads to run on this metastore instance
create table trans_student(
id int,
name String,
age int
)
clustered by (id) into 2 buckets
stored as orc
TBLPROPERTIES('transactional'='true');
insert into trans_student values(1,"allen",18);
update trans_student
set age = 20
where id = 1;
delete from trans_student where id =1;
select *
from trans_student;
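Each update and delete on a transactional table is written as delta files that the background compactor later merges; the standard commands below show that machinery at work:
show transactions; -- open transactions
show compactions; -- running and completed compactions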
--1. create a view
create view v_usa_covid19 as select count_date, county,state,deaths from t_usa_covid19 limit 5;
--2. create a view from an existing view
create view v_usa_covid19_from_view as select * from v_usa_covid19 limit 2;
show tables;
show views; --supported since Hive 2.2.0
select * from v_usa_covid19;
show create table v_usa_covid19;
drop view v_usa_covid19_from_view;
alter view v_usa_covid19 set TBLPROPERTIES ('comment' = 'This is a view');
alter view v_usa_covid19 as select county,deaths from t_usa_covid19 limit 2;
create table userinfo(firstname string, lastname string, ssn string, password string);
create view safer_user_info as select firstname, lastname from userinfo;
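The point of safer_user_info is to expose only the non-sensitive columns; with SQL-standard-based authorization enabled, access could then be granted on the view alone (a sketch; some_user is a hypothetical principal):
grant select on table safer_user_info to user some_user;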
from (
select * from people join cart
on (cart.people_id = people.id) where firstname = 'john'
) a select a.lastname where a.id = 3;
--turn the nested subquery into a view
create view shorter_join as
select * from people join cart
on (cart.people_id = people.id) where firstname = 'john';
--query via the view
select lastname from shorter_join where id = 3;
create table student(
num int,
name string,
sex string,
age int,
dept string)
row format delimited
fields terminated by ',';
select * from student;
CREATE TABLE student_trans (
sno int,
sname string,
sdept string
)
clustered by (sno) into 2 buckets stored as orc TBLPROPERTIES('transactional'='true');
insert overwrite table student_trans
select num,name,dept
from student;
SELECT sdept, count(*) as sdept_cnt from student_trans group by sdept;
CREATE MATERIALIZED VIEW student_trans_agg
AS SELECT sdept, count(*) as sdept_cnt from student_trans group by sdept;
SELECT sdept, count(*) as sdept_cnt from student_trans group by sdept;
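To confirm the rewrite actually happened, inspect the plan; with rewriting enabled it should scan student_trans_agg instead of student_trans:
explain select sdept, count(*) as sdept_cnt from student_trans group by sdept;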
--disable automatic query rewriting for the materialized view
ALTER MATERIALIZED VIEW student_trans_agg DISABLE REWRITE;
--enable automatic query rewriting for the materialized view
ALTER MATERIALIZED VIEW student_trans_agg ENABLE REWRITE;
SELECT sdept, count(*) as sdept_cnt from student_trans group by sdept;
After rewriting is disabled, queries no longer hit the materialized view and run inefficiently.
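Note also that once new rows are written to student_trans the materialized view becomes stale and stops serving rewrites until it is rebuilt (standard Hive syntax):
alter materialized view student_trans_agg rebuild;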
drop materialized view student_trans_agg;
show materialized views;