通过impala对kudu进行sql操作
-- Describe a table's columns and types.
-- BUG FIX: "tabel_name" was a typo for "table_name".
DESCRIBE table_name;
-- Show the partition layout of a table.
SHOW PARTITIONS table_name;
-- Show the database currently in use.
SELECT current_database();
-- Show the DDL that was used to create a table.
-- BUG FIX: the original was missing the statement terminator.
SHOW CREATE TABLE table_name;
-- Creating a database in Impala works the same as in Hive: CREATE DATABASE db_name.
-- The database is only a namespace on the Impala side; the Kudu documentation
-- has no database concept, so presumably Kudu itself has none. A table
-- table_test created in Impala database test maps to test:table_test in Kudu.
-- Create the database (idempotent thanks to IF NOT EXISTS).
CREATE DATABASE IF NOT EXISTS POC_TEST;
-- Switch to the database.
USE POC_TEST;
-- NOTE(review): this whole section is a verbatim duplicate of the preceding
-- create-database section in this file; one copy could be removed.
-- Creating a database in Impala works the same as in Hive: create database db_name,
-- but the database is only a namespace on the Impala side;
-- the Kudu documentation does not mention a database concept, so presumably Kudu has none.
-- A table table_test created in Impala database test maps to test:table_test in Kudu.
-- Create the database (idempotent thanks to IF NOT EXISTS).
CREATE DATABASE IF NOT EXISTS POC_TEST;
-- Switch to the database.
use POC_TEST;
-- Kudu中的分区方法主要有两种:partition by hash 和 partition by range
-- Hash partitioning assigns each row to a tablet by the hash of the primary key;
-- the number of partitions (i.e. tablets) must be declared in the CREATE TABLE
-- statement. When no hash column list is given, the hash defaults to the whole
-- primary key.
-- Two primary-key columns, no explicit hash column list.
CREATE TABLE kudu_first_table (
    id INT,
    name STRING,
    age INT,
    gender STRING,
    PRIMARY KEY (id, name)
)
PARTITION BY HASH PARTITIONS 4
STORED AS KUDU;
-- One primary-key column, no explicit hash column list (hash over the key).
CREATE TABLE my_first_table (
    id BIGINT,
    name STRING,
    PRIMARY KEY (id)
)
PARTITION BY HASH PARTITIONS 16
STORED AS KUDU;
-- Create only if absent; one primary-key column, explicit hash column.
-- BUG FIX: the original terminated the statement with ';' after STORED AS KUDU
-- and left TBLPROPERTIES as a dangling fragment (a syntax error); TBLPROPERTIES
-- must be part of the same CREATE TABLE statement.
CREATE TABLE IF NOT EXISTS POC_TEST.sdc (
    id STRING,
    name STRING,
    PRIMARY KEY (id)
)
PARTITION BY HASH(id) PARTITIONS 2
STORED AS KUDU
TBLPROPERTIES(
    'kudu.master_addresses' = 'master.msxf.hadoop:7051',
    'kudu.num_tablet_replicas' = '1'
);
-- Two primary-key columns, hash column explicitly specified.
CREATE TABLE specify_partition_column (
    id INT,
    name STRING,
    age INT,
    gender STRING,
    PRIMARY KEY (id, name)
)
PARTITION BY HASH(id) PARTITIONS 3
STORED AS KUDU;
-- Single primary-key column, hash column explicitly specified.
CREATE TABLE specify_partition_one_column (
    id INT,
    name STRING,
    age INT,
    gender STRING,
    PRIMARY KEY (id)
)
PARTITION BY HASH(id) PARTITIONS 3
STORED AS KUDU;
--区别:未指定分区字段时,其分区字段默认是主键,若主键有两个列则分区字段为两个,指定分区字段时,
-- 需要分区列是主键的子集;否则会报错「Only key columns can be used in PARTITION BY」
--不指定分区:表依然会创建,但是只有一个分区,会提示「Unpartitioned Kudu tables are inefficient for large data sizes」
-- Range partitioning assigns each row to a tablet based on value ranges of the
-- designated primary-key column(s); the range key columns and every range bound
-- must be declared in the CREATE TABLE statement.
CREATE TABLE cust_behavior (
_id BIGINT PRIMARY KEY,
salary STRING,
edu_level INT,
usergender STRING,
`group` STRING,
city STRING,
postcode STRING,
last_purchase_price FLOAT,
last_purchase_date BIGINT,
category STRING,
sku STRING,
rating INT,
fulfilled_date BIGINT
)
-- Ranges below cover the whole key space: an open lower bound, six half-open
-- intervals, and an open upper bound, so no insert can miss every range.
PARTITION BY RANGE (_id)
(
PARTITION VALUES < 1439560049342,
PARTITION 1439560049342 <= VALUES < 1439566253755,
PARTITION 1439566253755 <= VALUES < 1439572458168,
PARTITION 1439572458168 <= VALUES < 1439578662581,
PARTITION 1439578662581 <= VALUES < 1439584866994,
PARTITION 1439584866994 <= VALUES < 1439591071407,
PARTITION 1439591071407 <= VALUES
)
STORED AS KUDU;
--优势:可以根据数据的具体情况建立分区,比如:建立2017年之前的分区,2017-2018,2018-2019,2019-2020,2020-2021等
--劣势:如果使用单级range分区的话,容易产生数据热点问题(可混合hash分区使用)
-- 在range分区中,如果有不止一个字段作为分区字段的话也可以,语法暂时不清楚;
-- 如果插入一条主键的值不落在任何range区间时会插入失败,并报错
-- Combined partitioning: hash on user_id to spread writes, plus range on
-- event_date so scans can prune by time.
CREATE TABLE tw_details4 (
    user_id STRING,
    event_date STRING,
    event STRING,
    properties STRING,
    customer_id INT,
    project_id INT,
    PRIMARY KEY (event_date, event, user_id)
)
PARTITION BY HASH(user_id) PARTITIONS 3,
             RANGE(event_date) (
    PARTITION VALUES < '2017-01-01',
    PARTITION '2017-01-01' <= VALUES < '2018-01-01',
    PARTITION '2018-01-01' <= VALUES < '2019-01-01',
    PARTITION '2019-01-01' <= VALUES < '2020-01-01',
    PARTITION '2020-01-01' <= VALUES < '2021-01-01'
)
STORED AS KUDU;
-- Advantage: scans can be pruned by time to reduce the tablets read, and
-- inserts do not hot-spot a single tablet server.
-- CTAS into Kudu: the primary key and partitioning must still be declared
-- explicitly; the column list comes from the SELECT.
CREATE TABLE kudu_ti_event_fact_copy
primary key(user_id,event_date)
partition by hash(user_id) partitions 3
stored as kudu
as select user_id,event_date,properties from auto3.ti_event_fact;
-- Drop a table / database / view. The bracketed Chinese tokens are
-- placeholders (table name / database name / view name), not literal syntax.
DROP TABLE [表名];
DROP DATABASE [数据库名];
DROP VIEW [视图名];
--impala 允许使用标准 SQL 语句将数据插入 Kudu
-- Single-row insert.
-- NOTE(review): the column list below does not match the my_first_table(id, name)
-- definition earlier in this file; kept as a generic example -- confirm against
-- the real schema before running.
insert into my_first_table(time, uid, event_id, action_value) values(123, "v2", "123", 2);
insert into table1 values(v1, v2, v3);
-- Multi-row insert.
-- BUG FIX: the original listed four columns but supplied two values per row;
-- the column list must match the value tuples.
INSERT INTO my_first_table(id, name) VALUES (1, "john"), (2, "jane"), (3, "jim");
-- Batch insert: from both Impala's and Kudu's point of view the
-- best-performing bulk-load approach is usually INSERT INTO ... SELECT.
INSERT INTO my_kudu_table SELECT * FROM legacy_data_import_table;
insert into table1 select v1, v2, v3 from table2;
-- UPSERT: keyed on the primary key -- update the row if it already exists,
-- otherwise insert it.
upsert into table1 values(v1, v2, v3);
-- Single-row update.
UPDATE my_first_table SET name = "bob" WHERE id = 3;
-- Batch update.
UPDATE my_first_table SET name = "bob" WHERE id > 2;
-- The WHERE column does not have to be part of the primary key, but a
-- non-key predicate widens the set of rows that get changed.
-- Primary-key columns themselves cannot be updated; delete and re-insert instead.
UPDATE kudu_first_table SET age = 32 WHERE id = 2;
UPDATE kudu_first_table SET age = 31 WHERE gender = 'female';
-- Fetch rows for a single day: cast the timestamp to STRING, then take the
-- first 10 characters (the yyyy-MM-dd prefix).
-- BUG FIX: the original statement was missing its terminator.
SELECT substr(CAST(CREATE_DATE AS STRING), 1, 10)
FROM CBEE_ELIST
WHERE substr(CAST(CREATE_DATE AS STRING), 1, 10) = '2001-02-01';
-- Rename the table; this changes only the mapping name on the Impala side.
alter table kudu_ti_event_fact_copy rename to kudu_ti_event_fact_copy_rename;
-- Rename the underlying Kudu table; the Impala-side mapping name is unchanged,
-- so Impala still accesses the table under the previous name.
ALTER TABLE kudu_ti_event_fact_copy_rename
SET TBLPROPERTIES('kudu.table_name' = 'kudu_ti_event_fact_copy');
-- Altering a column's attributes:
-- -- not supported --
-- Add columns.
alter table kudu_ti_event_fact_copy_rename add columns(method string,time_stamp string);
-- Drop a column.
ALTER table kudu_ti_event_fact_copy_rename drop column method;
-- Drop a range partition.
ALTER TABLE range_partition_table DROP RANGE PARTITION VALUES < '2017-01-01';
--添加分区(原文缺少示例),例如: ALTER TABLE range_partition_table ADD RANGE PARTITION '2021-01-01' <= VALUES < '2022-01-01';
1,zs,addr:bj-age:18-marry:false
2,ls,addr:sh-age:28-marry:true
3,ww,addr:sz-age:26-marry:false-inc:2000
-- Hive table with a MAP column matching the sample data lines above
-- (fields split by ',', map entries by '-', key/value by ':').
-- BUG FIX: the original declared the column as bare `map`, which is invalid;
-- Hive requires the key and value types, e.g. map<string,string>.
create table custom(
    id int,
    name string,
    info map<string,string>
)
row format delimited fields terminated by ','
collection items terminated by '-'
map keys terminated by ':'
;
-- BUG FIX: "loada" was a typo for "load".
load data local inpath '/root/mp.txt' into table custom;
0: jdbc:hive2://localhost:10000> select id,name,info['age'] as age from custom;
+-----+-------+------+
| id | name | age |
+-----+-------+------+
| 1 | zs | 18 |
| 2 | ls | 28 |
| 3 | ww | 26 |
+-----+-------+------+
[doitedu01:21000] default> select * from custom;
Query: select * from custom
Query submitted at: 2020-08-21 22:54:38 (Coordinator: http://doitedu01:25000)
ERROR: NotImplementedException: Scan of table 'default.custom' in format 'TEXT' is not supported because the table has a column 'info' with a complex type 'MAP'.
Complex types are supported for these file formats: PARQUET.
# 如果只查询不含map类型的字段
select id,name from custom
依然不支持
-- Impala cannot scan complex types stored as TEXT, so copy the data into a
-- Parquet table (complex types are supported for the Parquet format only).
-- BUG FIX: bare `map` is invalid; the key/value types must be declared.
create table custom_parquet(
    id int,
    name string,
    info map<string,string>
)
stored as parquet
;
insert into table custom_parquet
select * from custom
;
然后继续在impala中查询:
[doitedu01:21000] default> select * from custom_parquet;
+----+------+
| id | name |
+----+------+
| 1 | zs |
| 2 | ls |
| 3 | ww |
+----+------+
-- 发现,虽然查询 * ,但只显示出非复杂类型字段
-- 如果需要查询map中的数据,则需要用如下语法:
-- Querying a MAP column in Impala: join the table with its own map column;
-- each map entry is then exposed through the pseudo-columns `key` and `value`.
select
id,
name,
info.key,
info.value
from
custom_parquet,
custom_parquet.info as info
;
-- 查询结果如下:
+----+------+-------+-------+
| id | name | key | value |
+----+------+-------+-------+
| 1 | zs | addr | bj |
| 1 | zs | age | 18 |
| 1 | zs | marry | false |
| 2 | ls | addr | sh |
| 2 | ls | age | 28 |
| 2 | ls | marry | true |
| 3 | ww | addr | sz |
| 3 | ww | age | 26 |
| 3 | ww | marry | false |
| 3 | ww | inc | 2000 |
+----+------+-------+-------+
-- Hive equivalent: LATERAL VIEW explode() turns each map entry into its own
-- row with key/value columns aliased via `o`.
select
id,
name,
o.key,
o.value
from
custom
lateral view
explode (info) o as key,value
;
+-----+-------+--------+----------+
| id | name | o.key | o.value |
+-----+-------+--------+----------+
| 1 | zs | addr | bj |
| 1 | zs | age | 18 |
| 1 | zs | marry | false |
| 2 | ls | addr | sh |
| 2 | ls | age | 28 |
| 2 | ls | marry | true |
| 3 | ww | addr | sz |
| 3 | ww | age | 26 |
| 3 | ww | marry | false |
| 3 | ww | inc | 2000 |
+-----+-------+--------+----------+
1,战狼,吴京:于兰:王宝强
2,八百,李小璐:宋喆
-- Hive table with an ARRAY column for the movie sample data above
-- (fields split by ',', array items by ':').
-- BUG FIX: bare `array` is invalid; the element type must be declared,
-- e.g. array<string>.
create table movie(
    id int,
    name string,
    actors array<string>
)
row format delimited fields terminated by ','
collection items terminated by ':'
;
load data local inpath '/root/movie.txt' into table movie;
-- Parquet copy so Impala can query the complex column.
-- BUG FIX: bare `array` is invalid; the element type must be declared.
create table movie_parquet(
    id int,
    name string,
    actors array<string>
)
stored as parquet
;
insert into table movie_parquet select * from movie;
-- Impala queries the array in flattened form: join the table with its own
-- array column; actors.* exposes the array's pseudo-columns.
-- BUG FIX: both statements were missing their terminators.
select
    id, name,
    actors.*
from movie_parquet, movie_parquet.actors as actors;
-- Count the number of actors per movie.
select
    id, name, count(1) as actor_nbr
from movie_parquet, movie_parquet.actors as actors
group by id, name;