The local environment is an Apple M1 (Apple Silicon) machine running macOS 12.5.1.
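As an optional sanity check (standard macOS commands, not part of the original setup), you can confirm the machine is Apple Silicon and on the expected OS version:
# Confirm the host really is Apple Silicon on the expected macOS version
sw_vers
uname -m   # expected output: arm64
The docker-compose file used throughout (docker-compose_mac_aarch64.yml, referenced by the scripts later on) is as follows: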
version: "3.3"
services:

  namenode:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-namenode:arm64
    platform: linux/arm64
    hostname: namenode
    container_name: namenode
    environment:
      - CLUSTER_NAME=hudi_hadoop284_hive232_spark244
    ports:
      - "50070:50070"
      - "8020:8020"
      # JVM debugging port (will be mapped to a random port on host)
      - "5005"
    env_file:
      - ./hadoop.env
    healthcheck:
      test: [ "CMD", "curl", "-f", "http://namenode:50070" ]
      interval: 30s
      timeout: 10s
      retries: 3
    networks:
      mynet:
        ipv4_address: 172.18.0.100

  datanode1:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-datanode:arm64
    platform: linux/arm64
    container_name: datanode1
    hostname: datanode1
    environment:
      - CLUSTER_NAME=hudi_hadoop284_hive232_spark244
    env_file:
      - ./hadoop.env
    ports:
      - "50075:50075"
      - "50010:50010"
      # JVM debugging port (will be mapped to a random port on host)
      - "5005"
    links:
      - "namenode"
      - "historyserver"
    healthcheck:
      test: [ "CMD", "curl", "-f", "http://datanode1:50075" ]
      interval: 30s
      timeout: 10s
      retries: 3
    depends_on:
      - namenode
    networks:
      mynet:
        ipv4_address: 172.18.0.101

  historyserver:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-history:arm64
    hostname: historyserver
    container_name: historyserver
    environment:
      - CLUSTER_NAME=hudi_hadoop284_hive232_spark244
    depends_on:
      - "namenode"
    links:
      - "namenode"
    ports:
      - "58188:8188"
    healthcheck:
      test: [ "CMD", "curl", "-f", "http://historyserver:8188" ]
      interval: 30s
      timeout: 10s
      retries: 3
    env_file:
      - ./hadoop.env
    volumes:
      - historyserver:/hadoop/yarn/timeline
    networks:
      mynet:
        ipv4_address: 172.18.0.102

  hive-metastore-postgresql:
    image: menorah84/hive-metastore-postgresql:2.3.0
    platform: linux/arm64
    environment:
      - POSTGRES_HOST_AUTH_METHOD=trust
    volumes:
      - hive-metastore-postgresql:/var/lib/postgresql
    hostname: hive-metastore-postgresql
    container_name: hive-metastore-postgresql
    networks:
      mynet:
        ipv4_address: 172.18.0.103

  hivemetastore:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3:arm64
    platform: linux/arm64
    hostname: hivemetastore
    container_name: hivemetastore
    links:
      - "hive-metastore-postgresql"
      - "namenode"
    env_file:
      - ./hadoop.env
    command: /opt/hive/bin/hive --service metastore
    environment:
      SERVICE_PRECONDITION: "namenode:50070 hive-metastore-postgresql:5432"
    ports:
      - "9083:9083"
      # JVM debugging port (will be mapped to a random port on host)
      - "5005"
    healthcheck:
      test: [ "CMD", "nc", "-z", "hivemetastore", "9083" ]
      interval: 30s
      timeout: 10s
      retries: 3
    depends_on:
      - "hive-metastore-postgresql"
      - "namenode"
    networks:
      mynet:
        ipv4_address: 172.18.0.104

  hiveserver:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3:arm64
    platform: linux/arm64
    hostname: hiveserver
    container_name: hiveserver
    env_file:
      - ./hadoop.env
    environment:
      SERVICE_PRECONDITION: "hivemetastore:9083"
    ports:
      - "10000:10000"
      # JVM debugging port (will be mapped to a random port on host)
      - "5005"
    depends_on:
      - "hivemetastore"
    links:
      - "hivemetastore"
      - "hive-metastore-postgresql"
      - "namenode"
    volumes:
      - ${HUDI_WS}:/var/hoodie/ws
    networks:
      mynet:
        ipv4_address: 172.18.0.105

  sparkmaster:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkmaster_2.4.4:arm64
    platform: linux/arm64
    hostname: sparkmaster
    container_name: sparkmaster
    env_file:
      - ./hadoop.env
    ports:
      - "8080:8080"
      - "7077:7077"
      # JVM debugging port (will be mapped to a random port on host)
      - "5005"
    environment:
      - INIT_DAEMON_STEP=setup_spark
    links:
      - "hivemetastore"
      - "hiveserver"
      - "hive-metastore-postgresql"
      - "namenode"
    networks:
      mynet:
        ipv4_address: 172.18.0.106

  spark-worker-1:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkworker_2.4.4:arm64
    platform: linux/arm64
    hostname: spark-worker-1
    container_name: spark-worker-1
    env_file:
      - ./hadoop.env
    depends_on:
      - sparkmaster
    ports:
      - "8081:8081"
      # JVM debugging port (will be mapped to a random port on host)
      - "5005"
    environment:
      - "SPARK_MASTER=spark://sparkmaster:7077"
    links:
      - "hivemetastore"
      - "hiveserver"
      - "hive-metastore-postgresql"
      - "namenode"
    networks:
      mynet:
        ipv4_address: 172.18.0.107

  zookeeper:
    image: 'registry.cn-hangzhou.aliyuncs.com/jensenchen/zookeeper:3.4.12'
    platform: linux/arm64
    hostname: zookeeper
    container_name: zookeeper
    ports:
      - "2181:2181"
    environment:
      - ALLOW_ANONYMOUS_LOGIN=yes
    networks:
      mynet:
        ipv4_address: 172.18.0.108

  kafka:
    image: 'registry.cn-hangzhou.aliyuncs.com/jensenchen/kafka:2.13-2.8.1'
    platform: linux/arm64
    hostname: kafkabroker
    container_name: kafkabroker
    ports:
      - "9092:9092"
    environment:
      - KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181
      - ALLOW_PLAINTEXT_LISTENER=yes
      - KAFKA_ADVERTISED_HOST_NAME=kafkabroker
    networks:
      mynet:
        ipv4_address: 172.18.0.109

  adhoc-1:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:arm64
    platform: linux/arm64
    hostname: adhoc-1
    container_name: adhoc-1
    env_file:
      - ./hadoop.env
    depends_on:
      - sparkmaster
    ports:
      - '4040:4040'
      # JVM debugging port (mapped to 5006 on the host)
      - "5006:5005"
    environment:
      - "SPARK_MASTER=spark://sparkmaster:7077"
    links:
      - "hivemetastore"
      - "hiveserver"
      - "hive-metastore-postgresql"
      - "namenode"
    volumes:
      - ${HUDI_WS}:/var/hoodie/ws
    networks:
      mynet:
        ipv4_address: 172.18.0.110

  adhoc-2:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:linux-arm64-0.10.1
    platform: linux/arm64
    hostname: adhoc-2
    container_name: adhoc-2
    env_file:
      - ./hadoop.env
    ports:
      # JVM debugging port (mapped to 5005 on the host)
      - "5005:5005"
    depends_on:
      - sparkmaster
    environment:
      - "SPARK_MASTER=spark://sparkmaster:7077"
    links:
      - "hivemetastore"
      - "hiveserver"
      - "hive-metastore-postgresql"
      - "namenode"
    volumes:
      - ${HUDI_WS}:/var/hoodie/ws
    networks:
      mynet:
        ipv4_address: 172.18.0.111

  doris-fe:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/doris:1.2.2-fe-arm
    hostname: doris-fe
    container_name: doris-fe
    environment:
      - FE_SERVERS=fe1:172.18.0.112:9010
      - FE_ID=1
    networks:
      mynet:
        ipv4_address: 172.18.0.112

  doris-be:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/doris:1.2.2-be-arm
    hostname: doris-be
    container_name: doris-be
    environment:
      - FE_SERVERS=fe1:172.18.0.112:9010
      - BE_ADDR=172.18.0.113:9050
    depends_on:
      - doris-fe
    networks:
      mynet:
        ipv4_address: 172.18.0.113

volumes:
  namenode:
  historyserver:
  hive-metastore-postgresql:

networks:
  mynet:
    external: true
The Hudi docker-compose above follows the one in the Hudi repository. In Hudi's version, however, the hiveserver and adhoc containers both need to mount the local Hudi directory, and Hudi has to be compiled in advance. Here the hiveserver and adhoc images have been rebuilt with the required jars and configuration files baked into the image, so the later steps no longer depend on mounting an external directory.
The networks section must use a custom, pre-created network, because the network that docker-compose creates by default is named {name of the directory containing docker-compose.yml}_default, and "_" is not a legal character in hostnames, so later access fails with errors like:
ERROR 1105 (HY000): HMSClientException, msg: org.apache.hadoop.hive.metastore.api.MetaException: Got exception: java.net.URISyntaxException Illegal character in hostname at index 30: thrift://hivemetastore.compose_default:9083
You can create mynet in advance with:
docker network create mynet --driver bridge --subnet 172.18.0.0/16 --gateway 172.18.0.1
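After creating it, a quick optional check with the standard Docker CLI confirms the subnet and gateway match what the compose file expects:
# Verify the custom network's subnet/gateway before bringing up the cluster
docker network inspect mynet --format '{{json .IPAM.Config}}'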
Because macOS implements containers differently (they run inside a Linux VM), changing max_map_count directly on the host may not succeed during deployment; you first need to create the following container:
docker run -it --privileged --pid=host --name=change_count debian nsenter -t 1 -m -u -n -i sh
Once the container is up, run the following command inside it:
sysctl -w vm.max_map_count=2000000
Then type exit to leave the container, and go on to create the Doris Docker cluster.
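To confirm the change took effect inside the Docker Desktop VM, you can re-enter it the same way and read the value back (a sketch using the same nsenter trick as above):
# Should print vm.max_map_count = 2000000 once the change has been applied
docker run --rm -it --privileged --pid=host debian nsenter -t 1 -m -u -n -i sysctl vm.max_map_count
The hadoop.env file that the services above reference via env_file contains the following: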
HIVE_SITE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore-postgresql/metastore
HIVE_SITE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver
HIVE_SITE_CONF_javax_jdo_option_ConnectionUserName=hive
HIVE_SITE_CONF_javax_jdo_option_ConnectionPassword=hive
HIVE_SITE_CONF_datanucleus_autoCreateSchema=false
HIVE_SITE_CONF_hive_metastore_uris=thrift://hivemetastore:9083
HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false
HDFS_CONF_dfs_webhdfs_enabled=true
HDFS_CONF_dfs_permissions_enabled=false
#HDFS_CONF_dfs_client_use_datanode_hostname=true
#HDFS_CONF_dfs_namenode_use_datanode_hostname=true
HDFS_CONF_dfs_replication=1
CORE_CONF_fs_defaultFS=hdfs://namenode:8020
CORE_CONF_hadoop_http_staticuser_user=root
CORE_CONF_hadoop_proxyuser_hue_hosts=*
CORE_CONF_hadoop_proxyuser_hue_groups=*
YARN_CONF_yarn_log___aggregation___enable=true
YARN_CONF_yarn_resourcemanager_recovery_enabled=true
YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
YARN_CONF_yarn_timeline___service_enabled=true
YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
YARN_CONF_yarn_timeline___service_hostname=historyserver
YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
YARN_CONF_yarn_resourcemanager_resource___tracker_address=resourcemanager:8031
YARN_CONF_yarn_nodemanager_vmem___check___enabled=false
{"volume": 483951, "symbol": "MSFT", "ts": "2018-08-31 09:30:00", "month": "08", "high": 111.74, "low": 111.55, "key": "MSFT_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 111.72, "open": 111.55, "day": "31"}
{"volume": 1533226, "symbol": "AAPL", "ts": "2018-08-31 09:30:00", "month": "08", "high": 227.3101, "low": 226.23, "key": "AAPL_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 227.3101, "open": 226.53, "day": "31"}
{"volume": 36179, "symbol": "GOOG", "ts": "2018-08-31 09:30:00", "month": "08", "high": 1236.0, "low": 1234.36, "key": "GOOG_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 1234.54, "open": 1236.0, "day": "31"}
{"volume": 456506, "symbol": "FB", "ts": "2018-08-31 09:30:00", "month": "08", "high": 177.5, "low": 176.465, "key": "FB_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 176.83, "open": 177.29, "day": "31"}
{"volume": 142747, "symbol": "NFLX", "ts": "2018-08-31 09:30:00", "month": "08", "high": 372.0, "low": 370.49, "key": "NFLX_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 371.9, "open": 370.49, "day": "31"}
{"volume": 126884, "symbol": "TSLA", "ts": "2018-08-31 09:30:00", "month": "08", "high": 301.81, "low": 300.11, "key": "TSLA_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 300.61, "open": 301.81, "day": "31"}
{"volume": 1201915, "symbol": "F", "ts": "2018-08-31 09:30:00", "month": "08", "high": 9.63, "low": 9.6, "key": "F_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 9.61, "open": 9.63, "day": "31"}
{"volume": 176474, "symbol": "AMZN", "ts": "2018-08-31 09:30:00", "month": "08", "high": 2010.8101, "low": 2007.0, "key": "AMZN_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 2010.5, "open": 2009.8199, "day": "31"}
{"volume": 142523, "symbol": "NVDA", "ts": "2018-08-31 09:30:00", "month": "08", "high": 277.1899, "low": 276.64, "key": "NVDA_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 277.1899, "open": 276.875, "day": "31"}
{"volume": 351118, "symbol": "INTC", "ts": "2018-08-31 09:30:00", "month": "08", "high": 48.06, "low": 47.96, "key": "INTC_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 48.03, "open": 47.961, "day": "31"}
The batch_1.json data here has been trimmed down; the full dataset can be downloaded from https://github.com/apache/hudi/blob/master/docker/demo/data/batch_1.json
Running
cat batch_1.json | kcat -b kafkabroker -t stock_ticks -P
imports the data into Kafka.
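To verify the load (optional; this uses kcat's consumer mode), you can read the topic back and count the records:
# Consume the topic to the end and count how many records were produced
kcat -b kafkabroker -t stock_ticks -C -e -q | wc -l
The two scripts below bring the cluster up (and run Hudi's setup_demo_container.sh inside the adhoc containers) and tear it down again, respectively.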
#!/bin/bash
SCRIPT_PATH=$(cd `dirname $0`; pwd)
WS_ROOT=`dirname $SCRIPT_PATH`
COMPOSE_FILE_NAME="docker-compose_mac_aarch64.yml"
# restart cluster
HUDI_WS=${WS_ROOT} docker-compose -f ${SCRIPT_PATH}/${COMPOSE_FILE_NAME} down
sleep 5
HUDI_WS=${WS_ROOT} docker-compose -f ${SCRIPT_PATH}/${COMPOSE_FILE_NAME} up -d
sleep 15
docker exec -it adhoc-1 /bin/bash /var/hoodie/ws/docker/demo/setup_demo_container.sh
docker exec -it adhoc-2 /bin/bash /var/hoodie/ws/docker/demo/setup_demo_container.sh
#!/bin/bash
SCRIPT_PATH=$(cd `dirname $0`; pwd)
HUDI_DEMO_ENV=$1
# set up root directory
WS_ROOT=`dirname $SCRIPT_PATH`
COMPOSE_FILE_NAME="docker-compose_mac_aarch64.yml"
# shut down cluster
HUDI_WS=${WS_ROOT} docker-compose -f ${SCRIPT_PATH}/${COMPOSE_FILE_NAME} down
# remove host mount directories
rm -rf /tmp/hadoop_data
rm -rf /tmp/hadoop_name
All the images involved are:
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkworker_2.4.4:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkmaster_2.4.4:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-datanode:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-namenode:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-history:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/kafka:2.13-2.8.1
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/zookeeper:3.4.12
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/doris:1.2.2-fe-arm
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/doris:1.2.2-be-arm
For convenience, you can pull all of the images first and then start the docker-compose cluster with the script.
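Instead of pulling image by image, a single docker-compose command can pre-pull everything referenced by the compose file (run it from the directory containing the file; HUDI_WS only needs to point at an existing directory so that variable substitution succeeds):
# Pre-pull all images referenced by the compose file in one go
HUDI_WS=$(pwd) docker-compose -f docker-compose_mac_aarch64.yml pull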
Hudi provides the DeltaStreamer tool, which can connect to a variety of sources (including Kafka) to pull in changes and apply them to a Hudi table using upsert/insert primitives. Here the tool is used to read the JSON records from Kafka and ingest them into a COW table and a MOR table. If the tables do not yet exist in the file system, the tool initializes them automatically.
docker exec -it adhoc-2 /bin/bash
# Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_cow table in HDFS
spark-submit \
--class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE \
--table-type COPY_ON_WRITE \
--source-class org.apache.hudi.utilities.sources.JsonKafkaSource \
--source-ordering-field ts \
--target-base-path /user/hive/warehouse/stock_ticks_cow \
--target-table stock_ticks_cow --props /var/demo/config/kafka-source.properties \
--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider
# Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_mor table in HDFS
spark-submit \
--class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE \
--table-type MERGE_ON_READ \
--source-class org.apache.hudi.utilities.sources.JsonKafkaSource \
--source-ordering-field ts \
--target-base-path /user/hive/warehouse/stock_ticks_mor \
--target-table stock_ticks_mor \
--props /var/demo/config/kafka-source.properties \
--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
--disable-compaction
# As part of the setup (look at setup_demo.sh), the configs needed by DeltaStreamer are uploaded to HDFS. The configs
# mostly contain Kafka connectivity settings and the Avro schema to be used for ingestion, along with the key and partitioning fields.
exit
After the ingestion completes, you can open the following links on the local machine to browse the data (namenode needs to resolve locally, e.g. via an /etc/hosts entry; since port 50070 is published in the compose file, http://localhost:50070 works as well):
http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_cow
http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_mor
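If you prefer the command line, the same check can be done from one of the adhoc containers (this assumes the hdfs CLI is on the PATH inside the container, as in the Hudi demo images):
# List the ingested Hudi table directories directly on HDFS
docker exec -it adhoc-1 hdfs dfs -ls /user/hive/warehouse/stock_ticks_cow
docker exec -it adhoc-1 hdfs dfs -ls /user/hive/warehouse/stock_ticks_mor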
Next, sync the tables on HDFS into Hive, which includes creating the Hive tables, adding partitions, and so on.
docker exec -it adhoc-2 /bin/bash
# This command takes the HiveServer URL and the HDFS location of the COW Hudi table and syncs the HDFS state to Hive
/var/hoodie/ws/hudi-sync/hudi-hive-sync/run_sync_tool.sh \
--jdbc-url jdbc:hive2://hiveserver:10000 \
--user hive \
--pass hive \
--partitioned-by dt \
--base-path /user/hive/warehouse/stock_ticks_cow \
--database default \
--table stock_ticks_cow \
--partition-value-extractor org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor
.....
2020-01-25 19:51:28,953 INFO [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(129)) - Sync complete for stock_ticks_cow
.....
# Now run hive-sync for the second data-set in HDFS using Merge-On-Read (MOR table type)
/var/hoodie/ws/hudi-sync/hudi-hive-sync/run_sync_tool.sh \
--jdbc-url jdbc:hive2://hiveserver:10000 \
--user hive \
--pass hive \
--partitioned-by dt \
--base-path /user/hive/warehouse/stock_ticks_mor \
--database default \
--table stock_ticks_mor \
--partition-value-extractor org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor
...
2020-01-25 19:51:51,066 INFO [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(129)) - Sync complete for stock_ticks_mor_ro
...
2020-01-25 19:51:51,569 INFO [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(129)) - Sync complete for stock_ticks_mor_rt
....
exit
docker exec -it adhoc-2 /bin/bash
beeline -u jdbc:hive2://hiveserver:10000 \
--hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat \
--hiveconf hive.stats.autogather=false
# List Tables
0: jdbc:hive2://hiveserver:10000> show tables;
+---------------------+--+
| tab_name |
+---------------------+--+
| stock_ticks_cow |
| stock_ticks_mor_ro |
| stock_ticks_mor_rt |
+---------------------+--+
3 rows selected (1.199 seconds)
0: jdbc:hive2://hiveserver:10000>
# Look at partitions that were added
0: jdbc:hive2://hiveserver:10000> show partitions stock_ticks_mor_rt;
+----------------+--+
| partition |
+----------------+--+
| dt=2018-08-31 |
+----------------+--+
1 row selected (0.24 seconds)
# COPY-ON-WRITE Queries:
=========================
0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';
+---------+----------------------+--+
| symbol | _c1 |
+---------+----------------------+--+
| GOOG | 2018-08-31 10:29:00 |
+---------+----------------------+--+
Now, run a projection query:
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG';
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time | symbol | ts | volume | open | close |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20230621074927233 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
| 20230621074927233 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |
+----------------------+---------+----------------------+---------+------------+-----------+--+
# Merge-On-Read Queries:
==========================
Let's run similar queries against the M-O-R table, looking at both the
Read Optimized and Snapshot (real-time) queries supported by the M-O-R table.
# Run ReadOptimized Query. Notice that the latest timestamp is 10:29
0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG';
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
+---------+----------------------+--+
| symbol | _c1 |
+---------+----------------------+--+
| GOOG | 2018-08-31 10:29:00 |
+---------+----------------------+--+
1 row selected (6.326 seconds)
# Run Snapshot Query. Notice that the latest timestamp is again 10:29
0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG';
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
+---------+----------------------+--+
| symbol | _c1 |
+---------+----------------------+--+
| GOOG | 2018-08-31 10:29:00 |
+---------+----------------------+--+
1 row selected (1.606 seconds)
# Run Read Optimized and Snapshot project queries
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG';
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time | symbol | ts | volume | open | close |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |
+----------------------+---------+----------------------+---------+------------+-----------+--+
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG';
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time | symbol | ts | volume | open | close |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |
+----------------------+---------+----------------------+---------+------------+-----------+--+
exit
docker exec -ti doris-fe /bin/bash
# once inside the container, log in to the FE
mysql -h127.0.0.1 -uroot -P9030
CREATE CATALOG hudi PROPERTIES (
'type'='hms',
'hive.metastore.uris' = 'thrift://172.18.0.104:9083',
'hive.version' = '2.3.3'
);
CREATE CATALOG hudi2 PROPERTIES (
'type'='hms',
'hive.metastore.uris' = 'thrift://hivemetastore:9083',
'hive.version' = '2.3.3'
);
CREATE CATALOG hudi3 PROPERTIES (
'type'='hms',
'hive.metastore.uris' = 'thrift://172.18.0.104:9083',
'hadoop.username' = 'hive',
'dfs.nameservices'='namenode',
'dfs.ha.namenodes.namenode'='nn1',
'dfs.namenode.rpc-address.namenode.nn1'='172.18.0.100:8020',
'dfs.client.failover.proxy.provider.namenode'='org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider'
);
Because the hivemetastore IP above is 172.18.0.104, it is specified this way here.
MySQL [(none)]> show catalogs;
+-----------+-------------+----------+-----------+
| CatalogId | CatalogName | Type | IsCurrent |
+-----------+-------------+----------+-----------+
| 10004 | hudi | hms | |
| 0 | internal | internal | yes |
+-----------+-------------+----------+-----------+
MySQL [(none)]> switch hudi;
MySQL [(none)]> use default;
Reading table information for completion of table and column names
You can turn off this feature to get a quicker startup with -A
Database changed
MySQL [default]> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';
+--------+---------------------+
| symbol | max(`ts`) |
+--------+---------------------+
| GOOG | 2018-08-31 10:29:00 |
+--------+---------------------+
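As a final cross-check (a sketch, assuming the hudi catalog created above), the projection query from the Hive section can also be issued through the Doris FE from the host; the rows should line up with the Hive results shown earlier:
# Same projection query as in the Hive section, run through Doris via the mysql client
docker exec doris-fe mysql -h127.0.0.1 -P9030 -uroot \
  -e 'switch hudi; use default; select `_hoodie_commit_time`, symbol, ts, volume, `open`, `close` from stock_ticks_cow where symbol = "GOOG";'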