Airflow安装和使用介绍
1.Airflow安装和启动
python3 -m venv /pyenv/airflow # create a dedicated Python virtualenv for Airflow
. /pyenv/airflow/bin/activate # activate the virtualenv
export AIRFLOW_HOME=~/airflow # set Airflow's home (installation) directory
pip install apache-airflow # install apache-airflow
airflow initdb # initialize the metadata database (Airflow defaults to SQLite)
airflow webserver -p 8080 # start the webserver; browse to http://ip:8080
2.Airflow配置文件 ~/airflow/airflow.cfg 常用配置介绍
# web server
[webserver]
# IP address the web server binds to (0.0.0.0 = all interfaces)
web_server_host = 0.0.0.0
# port the web server listens on
web_server_port = 8080
# authentication method
authenticate = True # defaults to False (no authentication required)
auth_backend = airflow.contrib.auth.backends.password_auth # password authentication
# auth_backend = airflow.contrib.auth.backends.ldap_auth # LDAP authentication; requires a matching [ldap] section
# the [ldap] section below is required when using LDAP authentication
[ldap]
# set this to ldaps://:
uri = ldap://ip:389
user_filter = objectClass=*
user_name_attr = sAMAccountName # attribute used as the web UI login name
group_member_attr = memberOf
superuser_filter = # may be left empty
data_profiler_filter = # may be left empty
bind_user = cn=Manager,dc=example,dc=com
bind_password = your_password # your AD password
basedn = DC=dpbg,DC=lh,DC=com
cacert = /etc/ca/ldap_ca.crt
search_scope = SUBTREE # must be SUBTREE when logging in with an AD domain account
# mail service
[smtp]
smtp_host = # smtp host ip
smtp_starttls = True
smtp_ssl = False
smtp_port = # smtp port
smtp_mail_from = [email protected] # sender (From) address
3.密码认证方式需手动添加用户。我写了添加用户的脚本,可通过执行命令 python insert_user.py 添加Airflow用户。
import getpass

import airflow
from airflow import models, settings
from airflow.contrib.auth.backends.password_auth import PasswordUser


def insert_user(username, email, password, is_superuser):
    """Create an Airflow web UI user for the password_auth backend.

    Args:
        username: login name for the web UI.
        email: user's e-mail address.
        password: plaintext password (hashed by PasswordUser on assignment).
        is_superuser: truthy to grant superuser (admin) rights.
    """
    user = PasswordUser(models.User())
    user.username = username
    user.email = email
    user.password = password
    user.superuser = is_superuser
    session = settings.Session()
    try:
        session.add(user)
        session.commit()
    finally:
        # Always release the DB session, even if commit raises.
        session.close()


if __name__ == '__main__':
    username = input("Enter your username:")
    email = input("Enter your email:")
    is_superuser = input("Is super user(Y/N)?")
    is_superuser = 1 if is_superuser.upper() == 'Y' else 0
    # Read the password twice without echoing it to the terminal.
    password1 = getpass.getpass("Enter your password:")
    password2 = getpass.getpass("Please re-enter your password:")
    if password1 != password2:  # the two entries must match
        print("Passwords entered twice are inconsistent!")
    else:  # create the account
        insert_user(username=username, email=email,
                    password=password1, is_superuser=is_superuser)
        print("Success!")
4.ldap认证方式,登录WEB UI时报错ldap_ca.crt文件不存在。解决方法:修改源码 vim /pyenv/airflow/lib/python3.6/site-packages/airflow/contrib/auth/backends/ldap_auth.py
# Original code, commented out:
# tls_configuration = Tls(validate=ssl.CERT_REQUIRED,
# ca_certs_file=cacert)
# server = Server(conf.get("ldap", "uri"),
# use_ssl=True,
# tls=tls_configuration)
# Modified code (skips the CA certificate so the missing-file error goes away):
# NOTE(review): disabling SSL/TLS sends LDAP credentials unencrypted;
# prefer providing a valid CA cert at the configured `cacert` path instead.
server = Server(conf.get("ldap", "uri"),
use_ssl=False,
tls=None)
5.Airflow启动和关闭比较麻烦,我自己写了脚本,大家可以参考下。也可使用systemd运行Airflow(感兴趣的可自己尝试)
# Startup script start_airflow.sh
#!/bin/bash
# Daemonize the webserver first; the scheduler is started only if that succeeds.
if airflow webserver -D; then
    airflow scheduler -D
fi
echo "Start success"
# Shutdown script stop_airflow.sh
#!/bin/bash
# Find scheduler/webserver PIDs (excluding the grep process itself) and force-kill them.
# NOTE(review): the 'scheduler' pattern matches ANY process whose command line contains
# that word, and kill -9 skips graceful shutdown — consider the stricter pattern
# 'airflow scheduler' and a plain kill (SIGTERM) first.
ps -ef|egrep 'scheduler|airflow-webserver'|grep -v grep|awk '{print $2}'|xargs kill -9
# Remove stale PID files so the next start is not blocked.
rm -rf ~/airflow/*.pid
echo "Stop success"
6.Airflow使用,本文以远程执行python脚本为例。通过WEB UI中Admin–>Connections设置ssh连接方式(见下图)。在 ~/airflow/dags 目录下配置python脚本,并重启Airflow。
from datetime import datetime
from datetime import timedelta

import airflow
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow.operators.email_operator import EmailOperator
from airflow.utils.trigger_rule import TriggerRule

# These args will get passed on to each operator.
# You can override them on a per-task basis during operator initialization.
default_args = {
    'owner': 'Airflow',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(0),  # first schedulable date for the DAG
    'email': ['[email protected]'],
    'email_on_failure': True,
    'email_on_retry': True,
    # 'retries': 1,
    # 'retry_delay': timedelta(seconds=30),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2019, 12, 4, 17, 30),
    # 'wait_for_downstream': True,
    # 'dag': dag,
    # 'sla': timedelta(hours=2),
    # 'execution_timeout': timedelta(seconds=300),
    # 'on_failure_callback': some_function,
    # 'on_success_callback': some_other_function,
    # 'on_retry_callback': another_function,
    # 'trigger_rule': u'all_success'
}

dag = DAG(
    'dag_name',
    default_args=default_args,
    description='dag描述信息',
    schedule_interval='0 0 * * *',  # run daily at midnight
    # max_active_runs is a DAG-level parameter, not a task default:
    # leaving it inside default_args has no effect on run concurrency.
    max_active_runs=2,
)

task = SSHOperator(
    ssh_conn_id='ssh_id',  # SSH connection ID configured under Admin-->Connections
    task_id='task_id',
    command='python python_script.py >> /var/log/dag_name.log',  # remote command to run
    dag=dag,
)

# Notification task: e-mails the team when the upstream task fails.
task_failed = EmailOperator(
    dag=dag,
    trigger_rule=TriggerRule.ONE_FAILED,  # fire only when an upstream task failed
    task_id="task_failed_id",
    to=["[email protected]"],
    subject="邮件主旨",
    html_content='邮件内容',
)

task_failed.set_upstream([task])  # task_failed runs downstream of task