#!/usr/bin/python3
# coding=UTF-8
# @Desc: 处理Azkaban project执行流水线
import sys
import time
import pymysql.cursors
import requests
import click
import json
# Base URL of the Azkaban web server; "/executor" and "/manager" are appended to it.
AZKABANURL = "http://ip:port"
# Credentials sent with the Azkaban login request.
USERNAME = "username"
PASSWORD = "password"

# Connection settings for the MySQL database that execute() queries.
mysql_db = "azkaban"
mysql_host = "host"
mysql_port = 3306
mysql_user = "user"
mysql_pass = "password"
def init_door():
    """Run every dependency layer described in ``list.conf``.

    ``list.conf`` is read from the current working directory. Each non-blank
    line must be a Python dict literal with the keys ``depend_projects`` and
    ``target_flows``; the layers are handed to ``exec_flow`` one at a time,
    in file order.
    """
    # Local import keeps this block self-contained.
    import ast

    with open("list.conf", "r") as f:
        exec_flow_lists = f.readlines()
    for raw_line in exec_flow_lists:
        flow = raw_line.strip()
        if not flow:
            # A blank line (e.g. a trailing newline at the end of the file)
            # previously crashed the parser; skip it instead.
            continue
        # NOTE(security): this used to be eval(flow), which executes any code
        # placed in the config file. ast.literal_eval accepts the same
        # dict/list/tuple/string literals but nothing executable.
        flow_dict = ast.literal_eval(flow)
        print("\033[0;36;40m"+"*" * 10 + "开始执行FLOW" + "*" * 10+"\033[0m")
        exec_flow(depend_projects=flow_dict["depend_projects"], target_flows=flow_dict["target_flows"])
def exec_flow(depend_projects, target_flows):
    """Start the target flows of one layer once its dependencies have run.

    Args:
        depend_projects: names of the projects that must have executed today
            before any target is started.
        target_flows: sequence of ``(project_name, flow_name)`` entries.

    Exits the process with status 1 when a referenced project is not found in
    the database, and with status 2 when the Azkaban login or a flow
    submission fails.
    """
    # Every dependency must exist before we start waiting on it.
    for depend_project in depend_projects:
        if not judge_online(depend_project):
            print(f"\033[0;31;40m依赖的项目:{depend_project}不存在,请核对!!!\033[0m")
            sys.exit(1)
    # Every project we are about to trigger must exist as well.
    for target_flow in target_flows:
        if not judge_online(target_flow[0]):
            print(f"\033[0;31;40m要执行的项目:{target_flow[0]}不存在,请核对!!!\033[0m")
            sys.exit(1)
    # Wait for the dependencies; do nothing if the check reports failure.
    if not check_project_exec_result(depend_projects):
        return
    print("\033[0;32;40m所有依赖项目已执行,开始执行目标flow\033[0m")
    session_id = get_session_id()
    if not session_id:
        # get_session_id() returns None on a failed login; submitting flows
        # without a session can only fail, so stop here with a clear message.
        print("\033[0;31;40m登录Azkaban失败,无法获取session_id!!!\033[0m")
        sys.exit(2)
    for target_flow in target_flows:
        project_name, flow_name = target_flow[0], target_flow[1]
        # Targets that already ran today are skipped.
        if not check_target_exec_result(project_name):
            continue
        print(f"开始执行project:{project_name},flow:{flow_name}")
        exec_id = exec_flows(session_id, project_name, flow_name)
        if not exec_id:
            print(f"执行project:{project_name},flow:{flow_name}失败!!!")
            sys.exit(2)
        else:
            print(f"执行project:{project_name},flow:{flow_name}成功!!!")
def execute(sql):
    """Run ``sql`` on a fresh MySQL connection and return all fetched rows.

    Rows are returned as dicts because the connection uses
    ``pymysql.cursors.DictCursor``. The connection is always closed, even
    when the statement raises.
    """
    config = {
        'host': mysql_host,
        'port': mysql_port,
        'user': mysql_user,
        'password': mysql_pass,
        'db': mysql_db,
        'charset': 'utf8',
        'cursorclass': pymysql.cursors.DictCursor,
    }
    connection = pymysql.connect(**config)
    try:
        # The with-block closes the cursor; no explicit close() is needed.
        with connection.cursor() as cursor1:
            cursor1.execute(sql)
            result = cursor1.fetchall()
        connection.commit()
    finally:
        # Previously the connection leaked whenever execute() raised.
        connection.close()
    return result
def judge_online(project_name):
    """Report whether a row named ``project_name`` exists in the ``projects`` table."""
    query = "select * from projects where name='{}'".format(project_name)
    rows = execute(query)
    return len(rows) != 0
def check_project_exec_result(depend_projects, poll_interval=5, timeout=None):
    """Block until every dependency has a status-50 execution that ended today.

    Args:
        depend_projects: project names to wait for, checked in order.
        poll_interval: seconds to sleep between two database polls.
        timeout: maximum total number of seconds to wait across all
            dependencies; ``None`` (the default) waits indefinitely.

    Returns:
        True once every dependency has executed, False if ``timeout``
        expired first.
    """
    deadline = None if timeout is None else time.monotonic() + timeout
    for depend_project in depend_projects:
        print(f"检查{depend_project}是否已经执行............")
        sql = """SELECT *
FROM (
SELECT t2.name AS project_name, t1.*
FROM (
SELECT project_id, flow_id, status
, substr(FROM_UNIXTIME(start_time / 1000), 1, 19) AS start_time
, substr(FROM_UNIXTIME(end_time / 1000), 1, 19) AS end_time
, enc_type
FROM azkaban.execution_flows
WHERE status = 50 AND substr(FROM_UNIXTIME(end_time/1000), 1, 19)>=DATE_FORMAT(CURDATE(),'%Y-%m-%d %H:%i:%s')
) t1
INNER JOIN (
SELECT *
FROM projects
WHERE name = '{}'
) t2
ON t1.project_id = t2.id
ORDER BY end_time DESC
) t
LIMIT 1""".format(depend_project)
        while True:
            exec_result = get_latest_record(sql)
            if len(exec_result) != 0:
                print(f"{depend_project}项目已执行")
                break
            print(f"{depend_project}项目还没有执行,请等待...")
            if deadline is not None and time.monotonic() >= deadline:
                # Give the caller a chance to react instead of polling forever.
                print(f"\033[0;31;40m等待{depend_project}项目执行超时\033[0m")
                return False
            time.sleep(poll_interval)
    # Every dependency has finished.
    return True
def check_target_exec_result(project_name):
    """Tell the caller whether ``project_name`` still needs to run today.

    Returns True when no status-50 execution that ended today is found, and
    False (after printing a skip notice) when one exists.
    """
    print(f"检查{project_name}是否已经执行............")
    query = """SELECT *
FROM (
SELECT t2.name AS project_name, t1.*
FROM (
SELECT project_id, flow_id, status
, substr(FROM_UNIXTIME(start_time / 1000), 1, 19) AS start_time
, substr(FROM_UNIXTIME(end_time / 1000), 1, 19) AS end_time
, enc_type
FROM azkaban.execution_flows
WHERE status = 50 AND substr(FROM_UNIXTIME(end_time/1000), 1, 19)>=DATE_FORMAT(CURDATE(),'%Y-%m-%d %H:%i:%s')
) t1
INNER JOIN (
SELECT *
FROM projects
WHERE name = '{}'
) t2
ON t1.project_id = t2.id
ORDER BY end_time DESC
) t
LIMIT 1""".format(project_name)
    rows = get_latest_record(query)
    if len(rows) == 0:
        return True
    print(f"\033[0;33;40m{project_name}今天已经执行,跳过.........\033[0m")
    return False
def get_latest_record(sql):
    """Run ``sql`` through ``execute`` and return the fetched rows unchanged."""
    return execute(sql)
def get_session_id(timeout=30):
    """Log in to Azkaban and return the session id.

    Args:
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        The ``session.id`` value from the login response, or None when the
        request fails or the response carries no session id.
    """
    params = {
        'action': 'login',
        'username': USERNAME,
        'password': PASSWORD
    }
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
        'X-Requested-With': 'XMLHttpRequest'
    }
    try:
        # Without a timeout a hung server would block the pipeline forever.
        r = requests.post(AZKABANURL, data=params, headers=headers, timeout=timeout)
        if r.status_code != 200:
            print('[FAIL] getsession_id: status_code(%d)' % r.status_code)
            return None
        payload = r.json()
    except Exception as e:
        print('[FAIL] getsession_id: %s' % str(e))
        return None
    if isinstance(payload, dict) and 'session.id' in payload:
        return payload['session.id']
    # A rejected login used to return None silently; show what came back.
    print('[FAIL] getsession_id: %s' % (payload,))
    return None
def exec_flows(session_id, project_name, flow_name, timeout=30):
    """Submit one flow for execution through the ``executeFlow`` AJAX call.

    Args:
        session_id: session id obtained from the Azkaban login.
        project_name: project that owns the flow.
        flow_name: flow to start.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        The ``execid`` from the response, or None when the submission failed
        (the reason is printed).
    """
    params_list = {
        'session.id': session_id,
        'ajax': 'executeFlow',
        'project': project_name,
        'flow': flow_name,
    }
    try:
        r = requests.post(AZKABANURL + "/executor", data=params_list, timeout=timeout)
        if r.status_code != 200:
            # Non-200 answers used to fall through without any message.
            print('Flow执行失败:status_code(%d)' % r.status_code)
            return None
        payload = r.json()
    except Exception as e:
        print('Flow执行失败:%s' % str(e))
        return None
    if not isinstance(payload, dict) or 'execid' not in payload:
        # A response without "execid" previously surfaced as a bare
        # KeyError message; print the server's "error" text when present.
        detail = payload.get('error', payload) if isinstance(payload, dict) else payload
        print('Flow执行失败:%s' % (detail,))
        return None
    execid = payload['execid']
    print('%s项目%s流开始执行,execId:%s' % (project_name, flow_name, execid))
    return execid
def get_exec_id(session_id, project, flow, timeout=30):
    """Fetch execution ids of a flow via the ``fetchFlowExecutions`` AJAX call.

    The request asks for ``start=0`` and ``length=3``, so at most three ids
    are returned.

    Args:
        session_id: session id obtained from the Azkaban login.
        project: project name.
        flow: flow name.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        List of ``execId`` values. On any failure the reason is printed and
        the process exits with status -1.
    """
    query = {
        'session.id': session_id,
        'ajax': 'fetchFlowExecutions',
        'project': project,
        'flow': flow,
        'start': 0,
        'length': 3,
    }
    try:
        # params= lets requests URL-encode the values instead of pasting them
        # into the query string by hand.
        r = requests.get('%s/manager' % AZKABANURL, params=query, verify=False, timeout=timeout)
        if r.status_code != 200:
            raise RuntimeError('status_code(%d), %s' % (r.status_code, r.text))
        payload = r.json()
        if 'executions' not in payload:
            raise RuntimeError('status_code(%d), %s' % (r.status_code, payload))
        exec_ids = [execution['execId'] for execution in payload['executions']]
    except Exception as e:
        # The prefix is added only here; raising it too printed it twice.
        print('[FAIL] getExecId: %s' % str(e))
        sys.exit(-1)
    print('[INFO] execIds:', exec_ids)
    return exec_ids
def get_running_status(session_id, exec_id, timeout=30):
    """Fetch the state of one execution via the ``fetchexecflow`` AJAX call.

    Args:
        session_id: session id obtained from the Azkaban login.
        exec_id: id of the execution to inspect.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        The decoded JSON response, which is guaranteed to contain a
        ``nodes`` key. On any failure the reason is printed and the process
        exits with status -1.
    """
    form = {
        'session.id': session_id,
        'ajax': 'fetchexecflow',
        'execid': exec_id,
    }
    try:
        r = requests.post('%s/executor' % AZKABANURL, verify=False, data=form, timeout=timeout)
        if r.status_code != 200:
            # Use the raw text: an error page is not necessarily JSON.
            raise RuntimeError('status_code(%d), %s' % (r.status_code, r.text))
        payload = r.json()
        if 'nodes' not in payload:
            raise RuntimeError('status_code(%d), %s' % (r.status_code, payload))
        return payload
    except Exception as e:
        # The prefix is added only here; raising it too printed it twice.
        print('[FAIL] getRunningStatus: %s' % str(e))
        sys.exit(-1)
def check_status(session_id, project, flow):
    """Print id, attempt and status of every node of a flow's fetched executions.

    Args:
        session_id: session id obtained from the Azkaban login.
        project: project name.
        flow: flow name.
    """
    exec_ids = get_exec_id(session_id, project, flow)
    for exec_id in exec_ids:
        # The first argument must be the session id; the project name was
        # passed here by mistake.
        running_status = get_running_status(session_id, exec_id)
        # The per-job entries live under "nodes"; iterating the response dict
        # itself only yields its keys.
        for node in running_status['nodes']:
            node_id, attempt, status = node['id'], node['attempt'], node['status']
            print('execId: %s, id: %s, attempt: %d, status: %s' % (exec_id, node_id, attempt, status))
# Script entry point: process list.conf only when run directly, not on import.
if __name__ == "__main__":
    init_door()
{"depend_projects":["P1","P2"],"target_flows":[("P3","p3_flow"),("P4","p4_flow")]}
{"depend_projects":["P3","P4"],"target_flows":[("P5","p5_flow")]}
{"depend_projects":["P5"],"target_flows":[("P6","p6_flow")]}
一行代表一层依赖(注意:该文件内容中间不可有空行),上述文件所表示的依赖图例为:
3. 执行:python3 azkaban_project_auto.py
4. 将第3步中执行该脚本的命令写成一个job文件,上传Azkaban并部署为定时任务即可
附件:python2版本的检测脚本如下
#!/usr/bin/python
# coding=UTF-8
# @Author: mark.zheng
# @Time: 2021-05-13
# @Desc: 处理Azkaban project执行流水线
import sys
import time
import MySQLdb
import requests
import json
# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# the process-wide default string encoding can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')
# Base URL of the Azkaban web server; "/executor" and "/manager" are appended to it.
AZKABANURL = "http://ip:port"
# Credentials sent with the Azkaban login request.
USERNAME = "username"
PASSWORD = "password"

# Connection settings for the MySQL database that execute() queries.
mysql_db = "azkaban"
mysql_host = "host"
mysql_port = 3306
mysql_user = "user"
mysql_pass = "password"
def init_door():
with open("list.conf", "r") as f:
exec_flow_lists = f.readlines()
for flow in exec_flow_lists:
flow = flow.replace("\n", "")
flow_dict = eval(flow)
print "\033[0;36;40m"+"*" * 10 + "开始执行FLOW" + "*" * 10+"\033[0m"
exec_flow(depend_projects=flow_dict["depend_projects"], target_flows=flow_dict["target_flows"])
def exec_flow(depend_projects, target_flows):
# 首先判断传入依赖project是否都存在
for depend_project in depend_projects:
if not judge_online(depend_project):
print "\033[0;31;40m依赖的项目:",depend_project,"不存在,请核对!!!\033[0m"
sys.exit(1)
# 判断要执行的project是否存在
for target_flow in target_flows:
if not judge_online(target_flow[0]):
print "\033[0;31;40m要执行的项目:",target_flow[0],"不存在,请核对!!!\033[0m"
sys.exit(1)
# 判断所有依赖项目今天是否执行成功
if check_project_exec_result(depend_projects):
print "\033[0;32;40m所有依赖项目已执行,开始执行目标flow\033[0m"
session_id = get_session_id()
# 执行目标flow
for target_flow in target_flows:
if check_target_exec_result(target_flow[0]):
print "开始执行project:",target_flow[0],",flow:",target_flow[1]
exec_id = exec_flows(session_id, target_flow[0], target_flow[1])
if not exec_id:
print "执行project:",target_flow[0],",flow:",target_flow[1],"失败!!!"
sys.exit(2)
else:
print "执行project:",target_flow[0],",flow:",target_flow[1],"成功!!!"
def execute(sql):
    """Run ``sql`` on a fresh MySQL connection and return all fetched rows.

    The cursor and the connection are always closed, even when the
    statement raises.
    """
    config = {
        'host': mysql_host,
        'port': mysql_port,
        'user': mysql_user,
        'passwd': mysql_pass,
        'db': mysql_db,
        'charset': 'utf8',
    }
    connection = MySQLdb.connect(**config)
    try:
        cursor1 = connection.cursor()
        try:
            cursor1.execute(sql)
            result = cursor1.fetchall()
        finally:
            cursor1.close()
        connection.commit()
    finally:
        # Previously the connection leaked whenever execute() raised.
        connection.close()
    return result
def judge_online(project_name):
    """Report whether a row named ``project_name`` exists in the ``projects`` table."""
    query = "select * from projects where name='{}'".format(project_name)
    rows = execute(query)
    return len(rows) != 0
def check_project_exec_result(depend_projects):
"""
检查关联项目是否执行完毕
"""
for depend_project in depend_projects:
print "检查",depend_project,"是否已经执行............"
sql = """SELECT *
FROM (
SELECT t2.name AS project_name, t1.*
FROM (
SELECT project_id, flow_id, status
, substr(FROM_UNIXTIME(start_time / 1000), 1, 19) AS start_time
, substr(FROM_UNIXTIME(end_time / 1000), 1, 19) AS end_time
, enc_type
FROM azkaban.execution_flows
WHERE status = 50 AND substr(FROM_UNIXTIME(end_time/1000), 1, 19)>=DATE_FORMAT(CURDATE(),'%Y-%m-%d %H:%i:%s')
) t1
INNER JOIN (
SELECT *
FROM projects
WHERE name = '{}'
) t2
ON t1.project_id = t2.id
ORDER BY end_time DESC
) t
LIMIT 1""".format(depend_project)
while True:
exec_result = get_latest_record(sql)
if len(exec_result) == 0:
print depend_project,"项目还没有执行,请等待..."
time.sleep(5)
else:
print depend_project,"项目已执行"
break
# 全部执行完毕
return True
def check_target_exec_result(project_name):
"""
检查目标project今天是否执行过
"""
print "检查",project_name,"是否已经执行............"
sql = """SELECT *
FROM (
SELECT t2.name AS project_name, t1.*
FROM (
SELECT project_id, flow_id, status
, substr(FROM_UNIXTIME(start_time / 1000), 1, 19) AS start_time
, substr(FROM_UNIXTIME(end_time / 1000), 1, 19) AS end_time
, enc_type
FROM azkaban.execution_flows
WHERE status = 50 AND substr(FROM_UNIXTIME(end_time/1000), 1, 19)>=DATE_FORMAT(CURDATE(),'%Y-%m-%d %H:%i:%s')
) t1
INNER JOIN (
SELECT *
FROM projects
WHERE name = '{}'
) t2
ON t1.project_id = t2.id
ORDER BY end_time DESC
) t
LIMIT 1""".format(project_name)
exec_result = get_latest_record(sql)
if len(exec_result) != 0:
print "\033[0;33;40m",project_name,"今天已经执行,跳过.........\033[0m"
return False
return True
def get_latest_record(sql):
    """Run ``sql`` through ``execute`` and return the fetched rows unchanged."""
    return execute(sql)
def get_session_id():
"""
获取AZkaban登录session_id
"""
try:
params = {
'action': 'login',
'username': USERNAME,
'password': PASSWORD
}
r = requests.post(AZKABANURL, data=params, headers={
'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
'X-Requested-With': 'XMLHttpRequest'
})
if r.status_code == 200 and 'session.id' in r.json():
session_id = r.json()['session.id']
# print('[INFO] session_id:', session_id)
return session_id
except Exception as e:
print '[FAIL] getsession_id: %s' % str(e)
return None
def exec_flows(session_id, project_name, flow_name):
"""
执行Flow
"""
try:
params_list = {}
params_list['session.id'] = session_id
params_list['ajax'] = 'executeFlow'
params_list['project'] = project_name
params_list['flow'] = flow_name
r = requests.post(AZKABANURL + "/executor", data=params_list)
if r.status_code == 200:
execid = r.json()['execid']
print '%s项目%s流开始执行,execId:%s' % (project_name, flow_name, execid)
return execid
except Exception as e:
print 'Flow执行失败:%s' % str(e)
return None
def get_exec_id(session_id, project, flow):
"""
获取Azkaban任务的运行id
"""
try:
r = requests.get('%s/manager?session.id=%s&ajax=fetchFlowExecutions&project=%s&flow=%s&start=0&length=3' % (
AZKABANURL, session_id, project, flow), verify=False)
if r.status_code == 200 and 'executions' in r.json():
execIds = []
for execution in r.json()['executions']:
execIds.append(execution['execId'])
print '[INFO] execIds:', execIds
return execIds
else:
raise Exception('[FAIL] getExecId: status_code(%d), %s' % (r.status_code, r.json()))
except Exception as e:
print '[FAIL] getExecId: %s' % str(e)
sys.exit(-1)
def get_running_status(session_id, exec_id):
"""
获取AZkaban任务的运行状态
"""
try:
r = requests.post('%s/executor' % AZKABANURL, verify=False, data={'session.id': session_id,
'ajax': 'fetchexecflow',
'execid': exec_id})
if r.status_code == 200 and 'nodes' in r.json():
return r.json()
else:
raise Exception('[FAIL] getRunningStatus: status_code(%d), %s' % (r.status_code, r.json()))
except Exception as e:
print '[FAIL] getRunningStatus: %s' % str(e)
sys.exit(-1)
def check_status(session_id, project, flow):
"""
检查AZkaban project任务的所有运行状态
"""
exec_ids = get_exec_id(session_id, project, flow)
for exec_id in exec_ids:
runningStatus = get_running_status(project, exec_id)
for rs in runningStatus:
id, attempt, status = rs['id'], rs['attempt'], rs['status']
print 'execId: %s, id: %s, attempt: %d, status: %s' % (exec_id, id, attempt, status)
# Script entry point: process list.conf only when run directly, not on import.
if __name__ == "__main__":
    init_door()