其实这一步在实际生产中分两类,一类是结构型,一类是日志型,我们今天解决的日志型数据,这类数据很多时候都是通过埋点的方式收集的,这里就不再阐述,直接把采集的样例数据拿出来进行操作。
# Shared definitions for the Superset services; referenced below via *aliases.
x-superset-image: &superset-image apache/superset:latest
x-superset-depends-on: &superset-depends-on
  - db
  - redis
x-superset-volumes: &superset-volumes
  - ./superset/docker:/app/docker
  - superset_home:/app/superset_home

version: "3.6"

services:
  # mysql1/mysql2: master/slave pair holding the raw matomo log data.
  mysql1:
    container_name: mysql1
    image: mysql:5.6
    ports:
      # Port mappings are quoted: an unquoted HOST:CONTAINER pair of digits is
      # parsed as a base-60 (sexagesimal) integer by YAML 1.1 parsers.
      - "3306:3306"
    environment:
      - MYSQL_ROOT_PASSWORD=root
      - MYSQL_USER=huxiang
      - MYSQL_PASSWORD=huxiang
    volumes:
      - /usr/local/bigdata/mysql/db1:/var/lib/mysql
      - /usr/local/bigdata/mysql/conf1:/etc/mysql/mysql.conf.d
  mysql2:
    container_name: mysql2
    image: mysql:5.6
    ports:
      - "3307:3306"
    environment:
      - MYSQL_ROOT_PASSWORD=root
      - MYSQL_USER=huxiang
      - MYSQL_PASSWORD=huxiang
    volumes:
      - /usr/local/bigdata/mysql/db2:/var/lib/mysql
      - /usr/local/bigdata/mysql/conf2:/etc/mysql/mysql.conf.d
  # mysql3/mysql4: dbt target / view databases (no host volumes in the original;
  # data does not survive container removal -- presumably intentional for a demo).
  mysql3:
    container_name: mysql3
    image: mysql:5.6
    ports:
      - "3308:3306"
    environment:
      - MYSQL_ROOT_PASSWORD=root
      - MYSQL_USER=huxiang
      - MYSQL_PASSWORD=huxiang
  mysql4:
    container_name: mysql4
    image: mysql:5.6
    ports:
      - "3309:3306"
    environment:
      - MYSQL_ROOT_PASSWORD=root
      - MYSQL_USER=huxiang
      - MYSQL_PASSWORD=huxiang
  # dbt transformation container; crontab mount drives scheduled dbt runs.
  dbt:
    image: dbt:v1.2.0
    container_name: dbt
    volumes:
      - /usr/local/bigdata/dbt/app:/usr/app
      - /usr/local/bigdata/dbt/dbt-profiles:/root/.dbt
      - /usr/local/bigdata/dbt/crontab-conf/crontab:/etc/crontab
    tty: true
    stdin_open: true
  # trino1 is the coordinator (10081); trino2/trino3 are workers.
  trino1:
    image: trinodb/trino:359
    container_name: trino1
    volumes:
      - /usr/local/bigdata/trino/etc1/:/etc/trino/
    ports:
      - "10081:10081"
    tty: true
    stdin_open: true
  trino2:
    image: trinodb/trino:359
    container_name: trino2
    volumes:
      - /usr/local/bigdata/trino/etc2/:/etc/trino/
    ports:
      - "10082:10082"
    tty: true
    stdin_open: true
  trino3:
    image: trinodb/trino:359
    container_name: trino3
    volumes:
      - /usr/local/bigdata/trino/etc3/:/etc/trino/
    ports:
      - "10083:10083"
    tty: true
    stdin_open: true
  # Superset stack: redis cache + postgres metadata DB + app + one-shot init.
  redis:
    image: redis:latest
    container_name: superset_cache
    restart: unless-stopped
    volumes:
      - redis:/data
  db:
    env_file: docker/.env-non-dev
    image: postgres:10
    container_name: superset_db
    restart: unless-stopped
    volumes:
      - db_home:/var/lib/postgresql/data
  superset:
    env_file: docker/.env-non-dev
    image: *superset-image
    container_name: superset_app
    command: ["/app/docker/docker-bootstrap.sh", "app-gunicorn"]
    user: "root"
    restart: unless-stopped
    ports:
      - "8088:8088"
    depends_on: *superset-depends-on
    volumes: *superset-volumes
  superset-init:
    image: *superset-image
    container_name: superset_init
    command: ["/app/docker/docker-init.sh"]
    env_file: docker/.env-non-dev
    depends_on: *superset-depends-on
    user: "root"
    volumes: *superset-volumes

volumes:
  superset_home:
    external: false
  db_home:
    external: false
  redis:
    external: false
# 1. Edit the mysqld.cnf mounted into the mysql1/mysql2 containers; enable binlog on mysql1 (the master).
# mysql1 (master) -- binlog in ROW format so the slave replays exact row changes
[mysqld]
server-id=1
binlog_format=ROW
log-bin=master-bin
log-bin-index=master-bin.index
# mysql2 (slave) -- older MySQL versions required enabling the relay log by hand; it is on by default since 5.6
[mysqld]
server-id=2
relay-log-index=relay-log.index
relay-log=relay-log
#2、启动mysql1 mysql2容器
docker-compose up -d --build mysql1 mysql2
# 3. On mysql1 (the master): create and authorize the replication account, then read the binlog coordinates
# create the master/slave connection account "repl"
CREATE USER repl;
# grant the REPLICATION SLAVE privilege on everything to repl with password 123456,
# restricted to connections from the slave's address
GRANT replication slave ON *.* TO 'repl'@'192.168.137.128' identified by '123456';
# flush so the grant takes effect
FLUSH PRIVILEGES;
# read the master's binlog coordinates; needed below when the slave creates its link
show MASTER status;
# 4. On mysql2 (the slave): create the replication link
# stop replication first
STOP SLAVE;
# connect with the repl account created above (the original used root/root, which
# bypasses the dedicated replication account); master_log_file/master_log_pos come
# from "show MASTER status" on the master
CHANGE MASTER TO
master_host='192.168.137.128',
master_port=3306,
master_user='repl',
master_password='123456',
master_log_file='master-bin.000006',
master_log_pos=587;# must not exceed the current master-bin binlog position; check with "show MASTER status" on the master
# start replication
START SLAVE;
# check slave state (Slave_IO_Running / Slave_SQL_Running)
show slave status;
-- Create the source database holding the raw matomo visit log, then the log table.
CREATE DATABASE matomo default character set utf8mb4 collate utf8mb4_general_ci;
USE matomo;
SET NAMES utf8mb4;
-- FK checks disabled for bulk DDL/import; re-enable after loading if needed.
SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
-- Table structure for matomo_log_visit
-- ----------------------------
DROP TABLE IF EXISTS `matomo_log_visit`;
CREATE TABLE `matomo_log_visit` (
`idvisit` bigint(10) UNSIGNED NOT NULL AUTO_INCREMENT,
`idsite` int(10) UNSIGNED NOT NULL,
`idvisitor` binary(8) NOT NULL,
`visit_last_action_time` datetime NOT NULL,
`config_id` binary(8) NOT NULL,
`location_ip` varbinary(16) NOT NULL,
`profilable` tinyint(1) NULL DEFAULT NULL,
`user_id` varchar(200) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`visit_first_action_time` datetime NOT NULL,
`visit_goal_buyer` tinyint(1) NULL DEFAULT NULL,
`visit_goal_converted` tinyint(1) NULL DEFAULT NULL,
`visitor_returning` tinyint(1) NULL DEFAULT NULL,
`visitor_seconds_since_first` int(11) UNSIGNED NULL DEFAULT NULL,
`visitor_seconds_since_order` int(11) UNSIGNED NULL DEFAULT NULL,
`visitor_count_visits` int(11) UNSIGNED NOT NULL DEFAULT 0,
`visit_entry_idaction_name` int(10) UNSIGNED NULL DEFAULT NULL,
`visit_entry_idaction_url` int(11) UNSIGNED NULL DEFAULT NULL,
`visit_exit_idaction_name` int(10) UNSIGNED NULL DEFAULT NULL,
`visit_exit_idaction_url` int(10) UNSIGNED NULL DEFAULT 0,
`visit_total_actions` int(11) UNSIGNED NULL DEFAULT NULL,
`visit_total_interactions` mediumint(8) UNSIGNED NULL DEFAULT 0,
`visit_total_searches` smallint(5) UNSIGNED NULL DEFAULT NULL,
`referer_keyword` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`referer_name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`referer_type` tinyint(1) UNSIGNED NULL DEFAULT NULL,
`referer_url` varchar(1500) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`location_browser_lang` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`config_browser_engine` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`config_browser_name` varchar(40) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`config_browser_version` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`config_client_type` tinyint(1) NULL DEFAULT NULL,
`config_device_brand` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`config_device_model` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
-- NOTE(review): tinyint(100) display width is unusual (other flags use tinyint(1)) -- confirm against the upstream matomo schema
`config_device_type` tinyint(100) NULL DEFAULT NULL,
`config_os` char(3) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`config_os_version` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`visit_total_events` int(11) UNSIGNED NULL DEFAULT NULL,
`visitor_localtime` time NULL DEFAULT NULL,
`visitor_seconds_since_last` int(11) UNSIGNED NULL DEFAULT NULL,
`config_resolution` varchar(18) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`config_cookie` tinyint(1) NULL DEFAULT NULL,
`config_flash` tinyint(1) NULL DEFAULT NULL,
`config_java` tinyint(1) NULL DEFAULT NULL,
`config_pdf` tinyint(1) NULL DEFAULT NULL,
`config_quicktime` tinyint(1) NULL DEFAULT NULL,
`config_realplayer` tinyint(1) NULL DEFAULT NULL,
`config_silverlight` tinyint(1) NULL DEFAULT NULL,
`config_windowsmedia` tinyint(1) NULL DEFAULT NULL,
`visit_total_time` int(11) UNSIGNED NOT NULL,
`location_city` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`location_country` char(3) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`location_latitude` decimal(9, 6) NULL DEFAULT NULL,
`location_longitude` decimal(9, 6) NULL DEFAULT NULL,
`location_region` char(3) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`last_idlink_va` bigint(20) UNSIGNED NULL DEFAULT NULL,
`custom_dimension_1` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`custom_dimension_2` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`custom_dimension_3` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`custom_dimension_4` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`custom_dimension_5` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
PRIMARY KEY (`idvisit`) USING BTREE,
INDEX `index_idsite_config_datetime`(`idsite`, `config_id`, `visit_last_action_time`) USING BTREE,
INDEX `index_idsite_datetime`(`idsite`, `visit_last_action_time`) USING BTREE,
INDEX `index_idsite_idvisitor`(`idsite`, `idvisitor`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 15834 CHARACTER SET = utf8mb4 COLLATE = utf8mb4_general_ci ROW_FORMAT = Compact;
3308、3309主从复制同上!并且创建matomotj数据库
-- database name must match the dbt target schema "matomotj" (was misspelled "motomotj";
-- the dbt profile, trino prompt and dbt run output all use matomotj)
CREATE DATABASE matomotj default character set utf8mb4 collate utf8mb4_general_ci;
我们trino使用1+2模式,一台低配机器作为协调者,存储元数据,两台高配机器作为执行者,使得查询引擎最优化
#/etc/node.properties
#- node.environment: all trino nodes in the cluster must share the same environment name.
#  It must start with a lowercase letter and contain only lowercase letters, digits and underscores.
#- node.id: unique identifier of this trino node. Every node must have a unique id.
#  It must start with an alphanumeric character and contain only alphanumerics, - or _.
#- node.data-dir: trino data directory, where trino stores logs and other data.
#trino1(10081)
node.environment=trino_dev
node.id=ffffffff-ffff-ffff-ffff-ffffffffffff
node.data-dir=/opt/trino/trino-server-359/data
#trino2(10082)
node.environment=trino_dev
node.id=ffffffff-ffff-ffff-ffff-fffffffffffg
node.data-dir=/opt/trino/trino-server-359/data
#trino3(10083)
node.environment=trino_dev
node.id=ffffffff-ffff-ffff-ffff-fffffffffffh
node.data-dir=/opt/trino/trino-server-359/data
# /etc/jvm.config -- JVM command-line options for the trino launcher, one per line
-server
-Xmx3G
-XX:-UseBiasedLocking
-XX:+UseG1GC
-XX:G1HeapRegionSize=32M
-XX:+ExplicitGCInvokesConcurrent
-XX:+ExitOnOutOfMemoryError
-XX:+HeapDumpOnOutOfMemoryError
-XX:-OmitStackTraceInFastThrow
-XX:ReservedCodeCacheSize=512M
-XX:PerMethodRecompilationCutoff=10000
-XX:PerBytecodeRecompilationCutoff=10000
-Djdk.attach.allowAttachSelf=true
-Djdk.nio.maxCachedBufferSize=2000000
#/etc/config.properties -- size the memory settings to your budget
#trino1(10081) -- coordinator; does not schedule work on itself
coordinator=true
node-scheduler.include-coordinator=false
http-server.http.port=10081
query.max-memory=1GB
query.max-memory-per-node=1GB
query.max-total-memory-per-node=1GB
# discovery URI (the coordinator's address). Java .properties files have no
# inline comments: a trailing " #..." on the same line becomes part of the
# value, so the comment must sit on its own line.
discovery.uri=http://paratera128:10081
#trino2(10082) -- worker
coordinator=false
http-server.http.port=10082
query.max-memory=2GB
query.max-memory-per-node=2GB
query.max-total-memory-per-node=2GB
# discovery URI (the coordinator's address)
discovery.uri=http://paratera128:10081
#trino3(10083) -- worker
coordinator=false
http-server.http.port=10083
query.max-memory=2GB
query.max-memory-per-node=2GB
query.max-total-memory-per-node=2GB
# discovery URI (the coordinator's address)
discovery.uri=http://paratera128:10081
#/etc/log.properties
io.trino=INFO
#/etc/catalog/source_dw_mysql.properties
# source catalog -- port 3307 (mysql2, the replica holding the raw matomo data)
connector.name=mysql
connection-url=jdbc:mysql://192.168.137.128:3307?enabledTLSProtocols=TLSv1.2&useSSL=false
connection-user=root
connection-password=root
#/etc/catalog/target_dw_mysql.properties
# target catalog -- port 3308 (mysql3, where dbt materializes its models)
connector.name=mysql
connection-url=jdbc:mysql://192.168.137.128:3308?enabledTLSProtocols=TLSv1.2&useSSL=false
connection-user=root
connection-password=root
#/etc/catalog/view_dw_mysql.properties
# view catalog -- port 3309 (mysql4)
# NOTE(review): root credentials stored in plain text; use a dedicated, least-privilege account in production
connector.name=mysql
connection-url=jdbc:mysql://192.168.137.128:3309?enabledTLSProtocols=TLSv1.2&useSSL=false
connection-user=root
connection-password=root
docker-compose up -d --build trino1 trino2 trino3
#通过客户端查看
[root@paratera128 bin]# ./trino-cli --server paratera128:10081
trino> show catalogs;
Catalog
---------
source_dw_mysql
target_dw_mysql
system
(3 rows)
Query 20220909_083221_00002_4y9gb, FINISHED, 1 node
Splits: 19 total, 19 done (100.00%)
0.60 [0 rows, 0B] [0 rows/s, 0B/s]
trino> show schemas from source_dw_mysql;
Schema
--------------------
information_schema
matomo
performance_schema
(3 rows)
Query 20220909_083250_00004_4y9gb, FINISHED, 1 node
Splits: 19 total, 19 done (100.00%)
5.21 [3 rows, 59B] [0 rows/s, 11B/s]
trino> use source_dw_mysql.matomo;
USE
trino:matomotj> select idvisit, idsite,user_id,visit_first_action_time,visit_total_time,visit_goal_buyer,referer_type,location_ip from matomo_log_visit;
idvisit | idsite | user_id | visit_first_action_time | visit_total_time | visit_goal_buyer | referer_type | location_ip
--------------------+--------+---------+-------------------------+------------------+------------------+--------------+-------------
752921921632342041 | 3 | 2 | 2011-12-12 08:32:32 | 2 | 2 | 2 | NULL
752921920193695823 | 1 | 2 | 2019-04-12 08:32:32 | 2 | 2 | 2 | NULL
752921921472958559 | 3 | 1 | 2014-09-12 08:32:13 | 1 | 1 | 1 | NULL
752921919732322378 | 1 | 1 | 2022-01-12 08:32:13 | 1 | 1 | 1 | NULL
752921920290164800 | 3 | 2 | 2017-06-12 08:32:32 | 2 | 2 | 2 | NULL
752921920147558414 | 3 | 1 | 2020-03-12 08:32:13 | 1 | 1 | 1 | NULL
752921921565233177 | 2 | 2 | 2012-11-12 08:32:32 | 2 | 2 | 2 | NULL
752921921414238230 | 2 | 2 | 2015-08-12 08:32:32 | 2 | 2 | 2 | NULL
752921920256610361 | 2 | 1 | 2018-05-12 08:32:13 | 1 | 1 | 1 | NULL
752921921519095896 | 1 | 2 | 2013-10-12 08:32:32 | 2 | 2 | 2 | NULL
752921920038506590 | 2 | 2 | 2021-02-12 08:32:32 | 2 | 2 | 2 | NULL
752921920336302089 | 1 | 1 | 2016-07-12 08:32:13 | 1 | 1 | 1 | NULL
(12 rows)
Query 20220909_083340_00008_4y9gb, FINISHED, 1 node
Splits: 17 total, 17 done (100.00%)
1.32 [12 rows, 0B] [9 rows/s, 0B/s]
镜像打包、容器部署见《大数据工具之dbt》
# vim dbt-profiles/profiles.yml
# dbt connection profile: "dev" runs against trino (coordinator on 10081),
# writing models into catalog target_dw_mysql, schema matomotj.
dbt_project:
  target: dev
  outputs:
    dev:
      type: trino
      host: 192.168.137.128
      user: root
      port: 10081
      database: target_dw_mysql
      schema: matomotj
      threads: 4
      session_properties:
        query_max_run_time: 5d
        # canonical lowercase YAML boolean (was "True")
        exchange_compression: true
    prod:
      type: mysql
      # bracketed values are placeholders to fill in; quoted so YAML does not
      # parse "[...]" as a flow sequence
      server: "[server/host]"
      port: "[port]" # optional
      database: "[schema]" # optional, should be same as schema
      schema: "[schema]"
      username: "[username]"
      password: "[password]"
      driver: MySQL ODBC 8.0 ANSI Driver
配置动态源数据库,增加如下配置
#vim /app/dbt_project/dbt_project.yml
vars:
matomo_catalog: source_dw_mysql
matomo_schema: matomo
数据转换models脚本编写,使用初始化项目后的默认脚本加载路径,直接将已编写的sql脚本放到app/dbt_project/models/example目录下
matomo_log_visit_material.sql
accumulated_visit_model.sql
visit_statics_model.sql
new_user_model.sql
配置加载model
# vim app/dbt_project/models/example/schema.yml
# Model declarations + column tests for the four transformation models.
version: 2

models:
  # name must match the model file matomo_log_visit_material.sql
  # (was "matomo_log_visit_material_view", which dbt would not find)
  - name: matomo_log_visit_material
    description: "A starter dbt model"
    columns:
      - name: idvisit
        description: "The primary key for this table"
        tests:
          - unique
          - not_null
  - name: accumulated_visit_model
    description: "An accumulated visit statistics group by day."
    columns:
      - name: visit_first_day
        description: "This is a unique day for statistics"
        tests:
          - unique
          - not_null
  - name: visit_statics_model
    description: "User visit statics as of the due date."
    columns:
      - name: deadline
        description: "This is the deadline of the statistics"
        tests:
          - unique
          - not_null
  - name: new_user_model
    description: "New user and old user statics everyday."
    columns:
      - name: queryDate
        description: "This is the date"
        tests:
          - unique
          - not_null
#启动dbt容器服务
docker-compose up -d --build dbt
#dbt run 执行转换脚本
[root@paratera128 dbt]# docker exec -it dbt /bin/bash
root@19ccd0529f4a:/usr/app# cd dbt_project/
root@19ccd0529f4a:/usr/app/dbt_project# dbt run
06:00:33 Running with dbt=1.0.0
08:52:08 Found 4 models, 6 tests, 0 snapshots, 0 analyses, 167 macros, 0 operations, 0 seed files, 0 sources, 0 exposures, 0 metrics
08:52:08
08:52:09 Concurrency: 4 threads (target='trino')
08:52:09
08:52:09 1 of 4 START incremental model matomotj.matomo_log_visit_material............... [RUN]
08:52:14 1 of 4 OK created incremental model matomotj.matomo_log_visit_material.......... [SUCCESS in 4.52s]
08:52:14 2 of 4 START incremental model matomotj.accumulated_visit_model................. [RUN]
08:52:14 3 of 4 START incremental model matomotj.new_user_model.......................... [RUN]
08:52:14 4 of 4 START incremental model matomotj.visit_statics_model..................... [RUN]
08:52:18 2 of 4 OK created incremental model matomotj.accumulated_visit_model............ [SUCCESS in 3.97s]
08:52:22 4 of 4 OK created incremental model matomotj.visit_statics_model................ [SUCCESS in 8.72s]
08:52:23 3 of 4 OK created incremental model matomotj.new_user_model..................... [SUCCESS in 8.95s]
08:52:23
08:52:23 Finished running 4 incremental models in 14.91s.
08:52:23
08:52:23 Completed successfully
08:52:23
08:52:23 Done. PASS=4 WARN=0 ERROR=0 SKIP=0 TOTAL=4
这个在打包dbt镜像时已经加入,容器启动后会自动执行,如需修改cron表达式可以通过dbt挂载卷crontab文件修改!
#需要先将源码中相关脚本拷贝到挂载卷目录下
#构建启动容器
docker-compose up -d --build superset
#创建管理员
[root@paratera128 bigdata]# docker exec -it superset /bin/bash
superset@879629924d4a:/app$ export FLASK_APP=superset
superset@879629924d4a:/app$ flask fab create-admin
Username [admin]: admin
User first name [admin]: admin
User last name [user]: admin
Email [[email protected]]: admin
Password:
Repeat for confirmation:
--------------------------------------------------------------------------------
WARNING
--------------------------------------------------------------------------------
A Default SECRET_KEY was detected, please use superset_config.py to override it.
Use a strong complex alphanumeric string and use a tool to help you generate
a sufficiently random sequence, ex: openssl rand -base64 42
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
logging was configured successfully
2022-09-09 01:54:51,493:INFO:superset.utils.logging_configurator:logging was configured successfully
2022-09-09 01:54:51,505:INFO:root:Configured event logger of type <class 'superset.utils.log.DBEventLogger'>
Falling back to the built-in cache, that stores data in the metadata database, for the following cache: `FILTER_STATE_CACHE_CONFIG`. It is recommended to use `RedisCache`, `MemcachedCache` or another dedicated caching backend for production deployments
2022-09-09 01:54:51,510:WARNING:superset.utils.cache_manager:Falling back to the built-in cache, that stores data in the metadata database, for the following cache: `FILTER_STATE_CACHE_CONFIG`. It is recommended to use `RedisCache`, `MemcachedCache` or another dedicated caching backend for production deployments
Falling back to the built-in cache, that stores data in the metadata database, for the following cache: `EXPLORE_FORM_DATA_CACHE_CONFIG`. It is recommended to use `RedisCache`, `MemcachedCache` or another dedicated caching backend for production deployments
2022-09-09 01:54:51,518:WARNING:superset.utils.cache_manager:Falling back to the built-in cache, that stores data in the metadata database, for the following cache: `EXPLORE_FORM_DATA_CACHE_CONFIG`. It is recommended to use `RedisCache`, `MemcachedCache` or another dedicated caching backend for production deployments
/usr/local/lib/python3.8/site-packages/flask_appbuilder/models/sqla/interface.py:68: SAWarning: relationship 'SqlaTable.slices' will copy column tables.id to column slices.datasource_id, which conflicts with relationship(s): 'Slice.table' (copies tables.id to slices.datasource_id). If this is not the intention, consider if these relationships should be linked with back_populates, or if viewonly=True should be applied to one or more if they are read-only. For the less common case that foreign key constraints are partially overlapping, the orm.foreign() annotation can be used to isolate the columns that should be written towards. To silence this warning, add the parameter 'overlaps="table"' to the 'SqlaTable.slices' relationship. (Background on this error at: https://sqlalche.me/e/14/qzyx)
for prop in class_mapper(obj).iterate_properties:
Recognized Database Authentications.
Admin User admin created.
#初始化数据库
#Superset说到底其实就是一个Web应用程序,自带数据库,需要初始化
#更新dataclasses,初始化 superset 数据库
pip install dataclasses
superset db upgrade
#若提示:UserWarning: Flask-Caching: CACHE_TYPE is set to null, caching is effectively disabled.
#找到python3.7/site-packages/superset/config.py打开编辑:
#搜索:"CACHE_TYPE",全部改成"simple"
#基础数据初始化
superset init