Recap
In the previous article, I used CDH 5.16.2, which ships Hive 1.1.0 (no CDH 5.x release carries a Hive newer than 1.1.0, hard as that is to believe). The Flink source code itself does not play well with Hive 1.1.0 and has quite a few compatibility problems, so for that environment I patched the Flink code, rebuilt it, and redeployed.
In practice, experience with other open-source projects such as Apache Atlas and Apache Spark shows that Hive 1.2.x and Hive 1.1.x are close enough that swapping a few jars resolves most compatibility issues. In my environment, the Hive 1.1.0 jars can be replaced with their Hive 1.2.1 counterparts. This article first walks through that fix, then fills in the hands-on content the previous article was missing.
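In concrete terms the swap looks roughly like this; a sketch only, where FLINK_HOME and the Hive 1.2.1 download path are placeholder assumptions, while the jar names match the lib listings shown later in this article:

# Remove the CDH Hive 1.1.0 jars from Flink's lib directory (paths assumed)
cd $FLINK_HOME/lib
rm -f hive-exec-1.1.0-cdh5.16.2.jar hive-metastore-1.1.0-cdh5.16.2.jar libfb303-0.9.3.jar
# Copy in the Hive 1.2.1 counterparts
cp /path/to/apache-hive-1.2.1-bin/lib/hive-exec-1.2.1.jar .
cp /path/to/apache-hive-1.2.1-bin/lib/hive-metastore-1.2.1.jar .
cp /path/to/apache-hive-1.2.1-bin/lib/libfb303-0.9.2.jar .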
A Tangle of Problems
Based on reader feedback, I group all of the reported problems into three categories.
Start with the SQL client configuration: in conf/sql-client-hive.yaml, the Hive catalog is now declared with hive-version: 1.2.1:
catalogs:
  - name: staginghive
    type: hive
    hive-conf-dir: /etc/hive/conf
    hive-version: 1.2.1

execution:
  planner: blink
  type: batch
  time-characteristic: event-time
  periodic-watermarks-interval: 200
  result-mode: table
  max-table-result-rows: 1000000
  parallelism: 1
  max-parallelism: 128
  min-idle-state-retention: 0
  max-idle-state-retention: 0
  current-catalog: staginghive
  current-database: ssb
  restart-strategy:
    type: fallback

deployment:
  response-timeout: 5000
  gateway-address: ""
  gateway-port: 0
  m: yarn-cluster
  yn: 2
  ys: 5
  yjm: 1024
  ytm: 2048
Then start the SQL client:
$ bin/sql-client.sh embedded -d conf/sql-client-hive.yaml
The client also relies on the following environment variables pointing at the CDH configuration and the Hive installation:
export HADOOP_CONF_DIR=/etc/hadoop/conf
export YARN_CONF_DIR=/etc/hadoop/conf
export HIVE_HOME=/opt/cloudera/parcels/CDH/lib/hive
export HIVE_CONF_DIR=/etc/hive/conf
At this point, Flink's lib directory still contains the CDH Hive 1.1.0 jars:
$ tree lib
lib
├── flink-connector-hive_2.11-1.10.0.jar
├── flink-dist_2.11-1.10.0.jar
├── flink-hadoop-compatibility_2.11-1.10.0.jar
├── flink-shaded-hadoop-2-2.6.0-cdh5.16.2-9.0.jar
├── flink-table_2.11-1.10.0.jar
├── flink-table-blink_2.11-1.10.0.jar
├── hive-exec-1.1.0-cdh5.16.2.jar
├── hive-metastore-1.1.0-cdh5.16.2.jar
├── libfb303-0.9.3.jar
├── log4j-1.2.17.jar
└── slf4j-log4j12-1.7.15.jar
Starting the client fails:
$ bin/sql-client.sh embedded -d conf/sql-client-hive.yaml
Caused by: java.lang.ClassNotFoundException: org.apache.commons.logging.LogFactory
The missing commons-logging classes ship with the Hadoop dependencies, so expose them via the Hadoop classpath:
export HADOOP_CLASSPATH=`hadoop classpath`
Then restart the client:
$ bin/sql-client.sh embedded -d conf/sql-client-hive.yaml
This time the failure happens while creating the Hive metastore client:
Caused by: org.apache.flink.table.client.gateway.SqlExecutionException: Could not create execution context.
    at org.apache.flink.table.client.gateway.local.ExecutionContext$Builder.build(ExecutionContext.java:753)
    at org.apache.flink.table.client.gateway.local.LocalExecutor.openSession(LocalExecutor.java:228)
    at org.apache.flink.table.client.SqlClient.start(SqlClient.java:98)
    at org.apache.flink.table.client.SqlClient.main(SqlClient.java:178)
Caused by: org.apache.flink.table.catalog.exceptions.CatalogException: Failed to create Hive Metastore client
|
01
02
03
04
05
06
07
08
09
10
11
12
13
|
$ tree lib
lib
├── flink-connector-hive_2.11-1.10.0.jar
├── flink-dist_2.11-1.10.0.jar
├── flink-hadoop-compatibility_2.11-1.10.0.jar
├── flink-shaded-hadoop-2-2.6.0-cdh5.16.2-9.0.jar
├── flink-table_2.11-1.10.0.jar
├── flink-table-blink_2.11-1.10.0.jar
├── hive-exec-1.2.1.jar
├── hive-metastore-1.2.1.jar
├── libfb303-0.9.2.jar
├── log4j-1.2.17.jar
└── slf4j-log4j12-1.7.15.jar
Restarting the client now succeeds and lands at the SQL prompt:
$ bin/sql-client.sh embedded -d conf/sql-client-hive.yaml
Flink SQL>
|
01
02
03
04
05
06
07
08
09
10
11
|
0: jdbc:hive2://xx.xxx.xxx.xxx:10000> show tables;
+--------------+--+
| tab_name |
+--------------+--+
| customer |
| dates |
| lineorder |
| p_lineorder |
| part |
| supplier |
+--------------+--+
|
1
2
3
4
5
6
|
$ bin/sql-client.sh embedded -d conf/sql-client-hive.yaml
Flink SQL> show catalogs;
default_catalog
staginghive
Flink SQL> use catalog staginghive;
|
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
|
# List databases
Flink SQL> show databases;
...
ssb
tmp
...
Flink SQL> use ssb;
# List tables
Flink SQL> show tables;
customer
dates
lineorder
p_lineorder
part
supplier
# Describe a table
Flink SQL> DESCRIBE customer;
root
|-- c_custkey: INT
|-- c_name: STRING
|-- c_address: STRING
|-- c_city: STRING
|-- c_nation: STRING
|-- c_region: STRING
|-- c_phone: STRING
|-- c_mktsegment: STRING
Running an SSB query against the p_lineorder view, however, fails:
Flink SQL> select sum(v_revenue) as revenue
> from p_lineorder
> left join dates on lo_orderdate = d_datekey
> where d_year = 1993
> and lo_discount between 1 and 3
> and lo_quantity < 25;
[ERROR] Could not execute SQL statement. Reason:
org.apache.calcite.sql.validate.SqlValidatorException: Table 'lineorder' not found; did you mean 'LINEORDER'?
|
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
|
CREATE VIEW P_LINEORDER AS
SELECT LO_ORDERKEY,
LO_LINENUMBER,
LO_CUSTKEY,
LO_PARTKEY,
LO_SUPPKEY,
LO_ORDERDATE,
LO_ORDERPRIOTITY,
LO_SHIPPRIOTITY,
LO_QUANTITY,
LO_EXTENDEDPRICE,
LO_ORDTOTALPRICE,
LO_DISCOUNT,
LO_REVENUE,
LO_SUPPLYCOST,
LO_TAX,
LO_COMMITDATE,
LO_SHIPMODE,
LO_EXTENDEDPRICE*LO_DISCOUNT AS V_REVENUE
FROM ssb.LINEORDER;
Flink SQL, however, does not handle the metadata of Hive view definitions well. So that the SQL below runs smoothly, I drop the view in Hive and recreate it with lowercase identifiers:
0: jdbc:hive2://xx.xxx.xxx.xxx:10000> create view p_lineorder as
select lo_orderkey,
lo_linenumber,
lo_custkey,
lo_partkey,
lo_suppkey,
lo_orderdate,
lo_orderpriotity,
lo_shippriotity,
lo_quantity,
lo_extendedprice,
lo_ordtotalprice,
lo_discount,
lo_revenue,
lo_supplycost,
lo_tax,
lo_commitdate,
lo_shipmode,
lo_extendedprice*lo_discount as v_revenue
from ssb.lineorder;
With the lowercase view in place, the query returns a result:
Flink SQL> select sum(v_revenue) as revenue
> from p_lineorder
> left join dates on lo_orderdate = d_datekey
> where d_year = 1993
> and lo_discount between 1 and 3
> and lo_quantity < 25;
revenue
894280292647
|
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
|
Flink SQL> select sum(lo_revenue) as lo_revenue, d_year, p_brand
> from p_lineorder
> left join dates on lo_orderdate = d_datekey
> left join part on lo_partkey = p_partkey
> left join supplier on lo_suppkey = s_suppkey
> where p_category = 'MFGR#12' and s_region = 'AMERICA'
> group by d_year, p_brand
> order by d_year, p_brand;
lo_revenue d_year p_brand
819634128 1998 MFGR#1206
877651232 1998 MFGR#1207
754489428 1998 MFGR#1208
816369488 1998 MFGR#1209
668482306 1998 MFGR#1210
660366608 1998 MFGR#1211
862902570 1998 MFGR#1212
...
|
01
02
03
04
05
06
07
08
09
10
11
12
13
14
|
Flink SQL> select d_year, s_city, p_brand, sum(lo_revenue) - sum(lo_supplycost) as profit
> from p_lineorder
> left join dates on lo_orderdate = d_datekey
> left join customer on lo_custkey = c_custkey
> left join supplier on lo_suppkey = s_suppkey
> left join part on lo_partkey = p_partkey
> where c_region = 'AMERICA' and s_nation = 'UNITED STATES'
> and (d_year = 1997 or d_year = 1998)
> and p_category = 'MFGR#14'
> group by d_year, s_city, p_brand
> order by d_year, s_city, p_brand;
d_year s_city p_brand profit
1998 UNITED ST9 MFGR#1440 6665681
|
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
21
|
Flink SQL> create view p_lineorder2 as
> select lo_orderkey,
> lo_linenumber,
> lo_custkey,
> lo_partkey,
> lo_suppkey,
> lo_orderdate,
> lo_orderpriotity,
> lo_shippriotity,
> lo_quantity,
> lo_extendedprice,
> lo_ordtotalprice,
> lo_discount,
> lo_revenue,
> lo_supplycost,
> lo_tax,
> lo_commitdate,
> lo_shipmode,
> lo_extendedprice * lo_discount as v_revenue
> from ssb.lineorder;
[INFO] View has been created.
Views living in the Hive catalog, however, cannot be dropped from the CLI session:
Flink SQL> drop view p_lineorder;
[ERROR] Could not execute SQL statement. Reason:
The given view does not exist in the current CLI session. Only views created with a CREATE VIEW statement can be accessed.
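Session-scoped views created inside the CLI, such as p_lineorder2 above, can be dropped. A minimal sketch (the exact INFO message may differ by version):

Flink SQL> drop view p_lineorder2;
[INFO] View has been removed.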
Next, partition handling. Given a table partitioned by day and type:
CREATE TABLE IF NOT EXISTS flink_partition_test (
id int,
name string
) PARTITIONED BY (day string, type string)
stored as textfile;
|
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
|
# Insert into a static partition
Flink SQL> INSERT INTO flink_partition_test PARTITION (`type`='Flink', `day`='2020-02-01') SELECT 100001, 'Flink001';
# Query
Flink SQL> select * from flink_partition_test;
id name day type
100001 Flink001 2020-02-01 Flink
# Insert with dynamic partitions
Flink SQL> INSERT INTO flink_partition_test SELECT 100002, 'Spark', '2020-02-02', 'SparkSQL';
# Query
Flink SQL> select * from flink_partition_test;
id name day type
100002 Spark 2020-02-02 SparkSQL
100001 Flink001 2020-02-01 Flink
# Mixing static and dynamic partitions works much the same way; see the sketch after this block
# Overwrite existing data
Flink SQL> INSERT OVERWRITE flink_partition_test PARTITION (`type`='Flink') SELECT 100002, 'Spark', '2020-02-08', 'SparkSQL-2.4';
id name day type
100002 Spark 2020-02-02 SparkSQL
100001 Flink001 2020-02-01 Flink
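The mixed case skipped in the comment above might look like the following sketch; the values are invented, and it assumes Hive-style semantics where statically specified partition columns must precede dynamic ones, so day is fixed while type comes from the last SELECT column:

# Static day partition, dynamic type partition (hypothetical values)
Flink SQL> INSERT INTO flink_partition_test PARTITION (`day`='2020-02-03') SELECT 100003, 'FlinkSQL', 'Streaming';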
|
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
Flink SQL> set;
deployment.gateway-address=
deployment.gateway-port=0
deployment.m=yarn-cluster
deployment.response-timeout=5000
deployment.yjm=1024
deployment.yn=2
deployment.ys=5
deployment.ytm=2048
execution.current-catalog=staginghive
execution.current-database=ssb
execution.max-idle-state-retention=0
execution.max-parallelism=128
execution.max-table-result-rows=1000000
execution.min-idle-state-retention=0
execution.parallelism=1
execution.periodic-watermarks-interval=200
execution.planner=blink
execution.restart-strategy.type=fallback
execution.result-mode=table
execution.time-characteristic=event-time
execution.type=batch
Flink SQL> set deployment.yjm=2048;
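To confirm the change took effect, set without arguments lists the properties again; a sketch, with the listing abbreviated:

Flink SQL> set;
...
deployment.yjm=2048
...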