【Setting up Airflow with Docker】

Reference: https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#docker-compose-env-variables

Steps:

(1) Pull a docker-compose YAML file from the official site
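If the host has Internet access, the official quick-start compose file can be fetched directly with curl (a sketch; the version segment in the URL is an assumption and should match the Airflow release you intend to run):

curl -LfO 'https://airflow.apache.org/docs/apache-airflow/2.2.0/docker-compose.yaml'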

The docker-compose.yml used in this guide is shown below for reference:

version: '2.1'
services:
    redis:
        image: redis:latest
        ports:
            - "61379:6379"
        # command: redis-server --requirepass redispass
    postgres:
        image: postgres:13
        ports:
            - "5432:5432"
        environment:
            - POSTGRES_USER=airflow
            - POSTGRES_PASSWORD=airflow
            - POSTGRES_DB=airflow
        # Uncomment these lines to persist data on the local filesystem.
        #     - PGDATA=/var/lib/postgresql/data/pgdata
        # volumes:
        #     - ./pgdata:/var/lib/postgresql/data/pgdata
    webserver:
        # image: puckel/docker-airflow:1.9.0-2
        # image: ${AIRFLOW_IMAGE}
        # image: apache/airflow:2.2.0
        image: metrodata.hub.com/airflow_centos_py3:1.10
        restart: always
        depends_on:
            # - postgres
            - redis
        volumes:
            - ./airflow.cfg:/root/airflow/airflow.cfg
            - ./dags:/root/airflow/dags
            - ./logs:/root/airflow/logs
            - /opt:/opt:ro
        dns:
            - 223.5.5.5
        ports:
            - "61080:8080"
        environment:
            - C_FORCE_ROOT=true
        command: airflow webserver
        healthcheck:
            test: ["CMD-SHELL", "[ -f /root/airflow/airflow-webserver.pid ]"]
            interval: 30s
            timeout: 30s
            retries: 3
    flower:
        # image: puckel/docker-airflow:1.9.0-2
        # image: ${AIRFLOW_IMAGE}
        # image: apache/airflow:2.2.0
        image: metrodata.hub.com/airflow_centos_py3:1.10
        restart: always
        depends_on:
            - redis
        dns:
            - 223.5.5.5
        ports:
            - "64555:5555"
        environment:
            - C_FORCE_ROOT=true
        command: airflow flower
    scheduler:
        # image: puckel/docker-airflow:1.9.0-2
        # image: ${AIRFLOW_IMAGE}
        # image: apache/airflow:2.2.0
        image: metrodata.hub.com/airflow_centos_py3:1.10
        restart: always
        depends_on:
            - webserver
        dns:
            - 223.5.5.5
        environment:
            - C_FORCE_ROOT=true
        volumes:
            - ./airflow.cfg:/root/airflow/airflow.cfg
            - ./dags:/root/airflow/dags
            - ./logs:/root/airflow/logs
            - /opt:/opt
        command: airflow scheduler
    worker:
        # image: ${AIRFLOW_IMAGE}
        # image: apache/airflow:2.2.0
        # image: puckel/docker-airflow:1.9.0-2
        image: metrodata.hub.com/airflow_centos_py3:1.10
        restart: always
        # network_mode: host
        depends_on:
            - scheduler
        dns:
            - 223.5.5.5
        environment:
            - C_FORCE_ROOT=true
        volumes:
            - ./airflow.cfg:/root/airflow/airflow.cfg
            - ./dags:/root/airflow/dags
            - ./logs:/root/airflow/logs
            - /opt:/opt
        command: airflow worker

Note: the image used in the compose file above (metrodata.hub.com/airflow_centos_py3:1.10) can be downloaded from Baidu Netdisk:

https://pan.baidu.com/s/1oxUiNYxcU_FinhHhXbkuYQ (extraction code: yzws)

After downloading, load it into Docker:

docker load -i aa.tar
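To confirm the image was loaded, you can list the local images:

docker images | grep airflow_centos_py3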

(2) Create an airflow.cfg configuration file; a reference example follows:

[core]
airflow_home = /root/airflow
dags_folder = /root/airflow/dags
base_log_folder = /root/airflow/logs
remote_log_conn_id =
encrypt_s3_logs = False
logging_level = INFO
logging_config_class =
log_format = [%%(asctime)s] {{%%(filename)s:%%(lineno)d}} %%(levelname)s - %%(message)s
simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s
# executor = SequentialExecutor
executor = CeleryExecutor
sql_alchemy_conn = postgresql+psycopg2://airflow:[email protected]:5432/airflow
## Note: 10.253.14.131 above is the IP address of the Airflow host itself
sql_alchemy_pool_size = 5
sql_alchemy_pool_recycle = 3600
parallelism = 32
dag_concurrency = 16
dags_are_paused_at_creation = True
non_pooled_task_slot_count = 128
max_active_runs_per_dag = 16
load_examples = False
plugins_folder = /root/airflow/plugins
fernet_key = RiTu0XwbUAWsGzRbkgFeZ1aC4oZ4JRRqBLs6LdGFSho=
donot_pickle = False
dagbag_import_timeout = 30
task_runner = BashTaskRunner
default_impersonation =
security =
unit_test_mode = False
task_log_reader = task
enable_xcom_pickling = True
killed_task_cleanup_time = 60

[cli]
api_client = airflow.api.client.local_client
endpoint_url = http://10.253.14.131:61080

[api]
auth_backend = airflow.api.auth.backend.default

[operators]
default_owner = Airflow
default_cpus = 1
default_ram = 512
default_disk = 512
default_gpus = 0

[webserver]
authenticate = True
auth_backend = airflow.contrib.auth.backends.password_auth
base_url = http://10.253.14.131:61080
web_server_host = 0.0.0.0
web_server_port = 8080
web_server_ssl_cert =
web_server_ssl_key =
web_server_worker_timeout = 120
worker_refresh_batch_size = 1
worker_refresh_interval = 30
secret_key = temporary_key
workers = 4
worker_class = sync
access_logfile = -
error_logfile = -
expose_config = True
filter_by_owner = False
owner_mode = user
dag_default_view = tree
dag_orientation = LR
demo_mode = False
log_fetch_timeout_sec = 5
hide_paused_dags_by_default = False
page_size = 100

[email]
email_backend = airflow.utils.email.send_email_smtp

[smtp]
smtp_host = smtp.exmail.qq.com
smtp_ssl = True
smtp_starttls = False
smtp_user = [email protected]
smtp_port = 465
smtp_password = 1q2w3eRDZRDZ
smtp_mail_from = [email protected]

[celery]
celery_app_name = airflow.executors.celery_executor
worker_concurrency = 16
worker_log_server_port = 61793
broker_url = redis://10.253.14.131:61379/1
result_backend = db+postgresql://airflow:[email protected]:5432/airflow
flower_host = 0.0.0.0
flower_port = 64555
default_queue = default
celery_config_options = airflow.config_templates.default_celery.DEFAULT_CELERY_CONFIG
ssl_active = False

[dask]
cluster_address = 127.0.0.1:8786

[scheduler]
job_heartbeat_sec = 5
scheduler_heartbeat_sec = 5
run_duration = -1
min_file_process_interval = 0
dag_dir_list_interval = 300
print_stats_interval = 30
child_process_log_directory = /root/airflow/logs/scheduler
scheduler_zombie_task_threshold = 300
catchup_by_default = True
max_tis_per_query = 0
statsd_on = False
statsd_host = localhost
statsd_port = 8125
statsd_prefix = airflow
max_threads = 2
authenticate = False

[ldap]
uri =
user_filter = objectClass=*
user_name_attr = uid
group_member_attr = memberOf
superuser_filter =
data_profiler_filter =
bind_user = cn=Manager,dc=example,dc=com
bind_password = insecure
basedn = dc=example,dc=com
cacert = /etc/ca/ldap_ca.crt
search_scope = LEVEL

[mesos]
master = localhost:5050
framework_name = Airflow
task_cpu = 1
task_memory = 256
checkpoint = False
authenticate = False

[kerberos]
ccache = /tmp/airflow_krb5_ccache
principal = airflow
reinit_frequency = 3600
kinit_path = kinit
keytab = airflow.keytab

[github_enterprise]
api_rev = v3

[admin]
hide_sensitive_variable_fields = True
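The fernet_key above is only a sample value; for your own deployment you would normally generate a fresh one. A minimal sketch using the cryptography package (which Airflow itself depends on):

from cryptography.fernet import Fernet

# Print a fresh Fernet key to paste into the fernet_key setting of airflow.cfg
print(Fernet.generate_key().decode())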

(3) In the directory where you want to install Airflow, create the required subdirectories

mkdir -p ./dags ./logs ./plugins
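The ./dags directory is mounted into the webserver, scheduler and worker containers (see the volumes entries in docker-compose.yml above), so any DAG file dropped there is picked up automatically. A minimal test DAG for this Airflow 1.10 image (a sketch; the dag_id, schedule and command are arbitrary choices):

# ./dags/hello_airflow.py
from datetime import datetime

from airflow import DAG
from airflow.operators.bash_operator import BashOperator

default_args = {
    'owner': 'airflow',
    'start_date': datetime(2021, 1, 1),
}

# A single-task DAG that simply echoes a message once a day
dag = DAG(
    dag_id='hello_airflow',
    default_args=default_args,
    schedule_interval='@daily',
    catchup=False,
)

say_hello = BashOperator(
    task_id='say_hello',
    bash_command="echo 'hello from airflow'",
    dag=dag,
)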

(4) Install docker-compose

curl -L "https://github.com/docker/compose/releases/download/1.25.0/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose

Make it executable:

chmod +x /usr/local/bin/docker-compose
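Verify the installation:

docker-compose --version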

(5) Bring the stack up (from the directory containing docker-compose.yml)

docker-compose up
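The first run is easiest to debug in the foreground; once everything starts cleanly, the stack can be run detached and inspected with the usual docker-compose commands:

docker-compose up -d

docker-compose ps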

(6) Enter the webserver container and initialize the database

docker exec -it airflow_webserver_1 bash

airflow initdb

(7) Create a login user; see the following script, add_user.py:

import airflow
from airflow import models, settings
from airflow.contrib.auth.backends.password_auth import PasswordUser

user = PasswordUser(models.User())
user.username = 'xxxxxx'
user.email = 'xxxx'
user.password = 'xxxxxxxx'

session = settings.Session()
session.add(user)
session.commit()
session.close()
exit()

Enter the container first: docker exec -it airflow_webserver_1 bash

Then run the script above with the Python interpreter inside the container.
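For example, assuming the script was saved as add_user.py in the current directory on the host (the file name and path are arbitrary), it can also be copied in and executed in one go:

docker cp add_user.py airflow_webserver_1:/root/add_user.py

docker exec -it airflow_webserver_1 python /root/add_user.py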

(8) Access from the Internet

If the server is directly reachable from the Internet, simply open the base_url configured under [webserver] in your airflow.cfg.

Otherwise, put an Nginx reverse proxy in front of the webserver, for example:
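A minimal reverse-proxy sketch (the server_name is a placeholder; 10.253.14.131:61080 is the webserver address used in the configuration above):

server {
    listen 80;
    server_name airflow.example.com;   # placeholder domain

    location / {
        proxy_pass http://10.253.14.131:61080;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
    }
}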
