Airflow安装和使用介绍

Airflow安装和使用介绍

1.Airflow安装和启动

python3 -m venv /pyenv/airflow    # create a dedicated Python virtualenv for airflow
. /pyenv/airflow/bin/activate    # activate the virtualenv

export AIRFLOW_HOME=~/airflow     # directory where airflow keeps its config, db and logs
pip install apache-airflow    # install apache-airflow
airflow initdb    # initialize the metadata database (SQLite by default; renamed "airflow db init" in Airflow 2.x)
airflow webserver -p 8080    # start the webserver; browse to http://ip:8080

2.Airflow配置文件~/airflow/airflow.cfg的常用介绍

# Web server settings
[webserver]
# IP address the web server binds to
web_server_host = 0.0.0.0
# Port the web server listens on
web_server_port = 8080

# Authentication (default is False: no login required).
# NOTE(review): comments moved onto their own lines -- ConfigParser does not
# strip inline "#" comments, so trailing text would become part of the value.
authenticate = True
# Password-based authentication:
auth_backend = airflow.contrib.auth.backends.password_auth
# LDAP authentication (requires a matching [ldap] section):
# auth_backend = airflow.contrib.auth.backends.ldap_auth

# Required only when auth_backend is set to ldap_auth.
# NOTE(review): comments are on their own lines because ConfigParser does not
# strip inline "#" comments from values.
[ldap]
# set this to ldaps://host:port for SSL
uri = ldap://ip:389
user_filter = objectClass=*
# attribute used as the web UI login name
user_name_attr = sAMAccountName
group_member_attr = memberOf
# may be left empty
superuser_filter =
# may be left empty
data_profiler_filter =
bind_user = cn=Manager,dc=example,dc=com
# your AD/LDAP bind password
bind_password = your_password
basedn = DC=dpbg,DC=lh,DC=com
cacert = /etc/ca/ldap_ca.crt
# must be SUBTREE when authenticating against an AD domain
search_scope = SUBTREE

# Mail service.
# NOTE(review): comments are on their own lines because ConfigParser does not
# strip inline "#" comments from values.
[smtp]
# SMTP host IP
smtp_host =
smtp_starttls = True
smtp_ssl = False
# SMTP port
smtp_port =
# "From" address for notification mail
smtp_mail_from = [email protected]

3.密码认证方式需手动添加用户。我写了添加用户的脚本,可通过执行命令 python insert_user.py 添加Airflow用户

import getpass
import airflow
from airflow import models, settings
from airflow.contrib.auth.backends.password_auth import PasswordUser


def inser_user(username, email, password, is_superuser):
    """Create an Airflow password-auth user and persist it to the metadata db.

    NOTE(review): the name is a typo of ``insert_user``; kept unchanged for
    backward compatibility with existing callers.

    Args:
        username: Login name for the web UI account.
        email: E-mail address stored on the account.
        password: Plain-text password for the account.
        is_superuser: 1 to grant superuser rights, 0 otherwise.
    """
    user = PasswordUser(models.User())
    user.username = username
    user.email = email
    user.password = password
    user.superuser = is_superuser
    session = settings.Session()
    try:
        session.add(user)
        session.commit()
    finally:
        # Always release the session, even if commit() raises
        # (the original leaked the session on failure).
        session.close()


if __name__ == '__main__':
    # Gather account details interactively; getpass reads without echoing.
    username = input("Enter your username:")
    email = input("Enter your email:")
    is_superuser = input("Is super user(Y/N)?")
    is_superuser = int(is_superuser.upper() == 'Y')
    password1 = getpass.getpass("Enter your password:")
    password2 = getpass.getpass("Please re-enter your password:")
    if password1 == password2:
        # Both entries match -- create the account.
        inser_user(username=username, email=email, password=password1, is_superuser=is_superuser)
        print("Success!")
    else:
        # Mismatched entries: refuse to create the user.
        print("Passwords entered twice are inconsistent!")

Airflow安装和使用介绍_第1张图片

4.ldap认证方式,登录WEB UI时报错ldap_ca.crt文件不存在。解决方法:修改源码 vim /pyenv/airflow/lib/python3.6/site-packages/airflow/contrib/auth/backends/ldap_auth.py

# Original code (comment it out):
# tls_configuration = Tls(validate=ssl.CERT_REQUIRED,
#                         ca_certs_file=cacert)

# server = Server(conf.get("ldap", "uri"),
#                 use_ssl=True,
#                 tls=tls_configuration)


# Modified code: disable SSL/TLS so the missing ldap_ca.crt is not required.
# NOTE(review): this sends LDAP bind credentials unencrypted -- acceptable
# only on a trusted network; providing a valid CA cert is the safer fix.
server = Server(conf.get("ldap", "uri"),
                use_ssl=False,
                tls=None)

5.Airflow启动和关闭比较麻烦,我自己写了脚本,大家可以参考下。也可使用systemd运行Airflow(感兴趣的可自己尝试)

#!/bin/bash
# Startup script: start_airflow.sh
# The shebang must be the first line of the file. Report failure instead of
# always printing success (the original echoed "Start success" even when
# either daemon failed to start).
if airflow webserver -D && airflow scheduler -D; then
    echo "Start success"
else
    echo "Start failed" >&2
    exit 1
fi


#!/bin/bash
# Shutdown script: stop_airflow.sh
# Match only airflow processes: the original bare 'scheduler' pattern could
# kill any unrelated process whose command line contains that word.
# xargs -r skips kill entirely when no PIDs were found.
ps -ef | egrep 'airflow scheduler|airflow-webserver' | grep -v grep | awk '{print $2}' | xargs -r kill -9
# pid files are plain files, so -f is sufficient (original used rm -rf)
rm -f ~/airflow/*.pid
echo "Stop success"

6.Airflow使用,本文以远程执行python脚本为例。通过WEB UI中Admin-->Connections设置ssh连接方式(见下图)。在~/airflow/dags目录下配置python脚本,并重启Airflow。

from datetime import datetime
from datetime import timedelta

import airflow
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow.operators.email_operator import EmailOperator

from airflow.utils.trigger_rule import TriggerRule


# These args will get passed on to each operator
# You can override them on a per-task basis during operator initialization
# Default arguments applied to every operator in the DAG; individual tasks
# may override them at construction time.
default_args = {
    'owner': 'Airflow',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(0),  # first execution date of the DAG
    'email': ['[email protected]'],
    'email_on_failure': True,   # e-mail notification when a task fails
    'email_on_retry': True,     # e-mail notification when a task retries
    # 'retries': 1,
    # 'retry_delay': timedelta(seconds=30),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2019, 12, 4, 17, 30),
    # NOTE(review): max_active_runs is a DAG-level argument, not an operator
    # default; BaseOperator rejects/ignores it here. Pass it to DAG(...) instead.
    # 'max_active_runs': 2,
    # 'wait_for_downstream': True,
    # 'dag': dag,
    # 'sla': timedelta(hours=2),
    # 'execution_timeout': timedelta(seconds=300),
    # 'on_failure_callback': some_function,
    # 'on_success_callback': some_other_function,
    # 'on_retry_callback': another_function,
    # 'trigger_rule': u'all_success'
}


# The DAG itself: scheduled daily at midnight (cron expression "0 0 * * *").
dag = DAG(
    'dag_name',
    description='dag描述信息',
    default_args=default_args,
    schedule_interval='0 0 * * *',
)



# Remote task: run a Python script over SSH, using the connection configured
# under Admin --> Connections in the web UI; output is appended to a log file.
task = SSHOperator(
    task_id='task_id',
    ssh_conn_id='ssh_id',
    command='python python_script.py >> /var/log/dag_name.log',
    dag=dag,
)


# Failure notification: ONE_FAILED makes this task fire only when an
# upstream task fails.
task_failed = EmailOperator(
    task_id="task_failed_id",
    trigger_rule=TriggerRule.ONE_FAILED,
    to=["[email protected]"],
    subject="邮件主旨",
    html_content='邮件内容',
    dag=dag,
)


# Dependency: the e-mail task runs downstream of the main task
# (equivalent to task_failed.set_upstream([task])).
task >> task_failed

你可能感兴趣的:(其他工具)