azkaban多项目之间依赖检查执行

azkaban多项目之间依赖检查执行

  1. 编写检测脚本(python3),azkaban_project_auto.py,内容如下:
#!/usr/bin/python3
# coding=UTF-8
# @Desc: 处理Azkaban project执行流水线

import sys
import time
import pymysql.cursors
import requests
import click
import json

# Base URL of the Azkaban web server. Placeholder value: replace ip:port before use.
AZKABANURL = 'http://ip:port'
# Credentials that get_session_id() posts to Azkaban to log in (placeholders).
USERNAME = 'username'
PASSWORD = 'password'

# Connection settings that execute() passes to pymysql.connect() (placeholders).
mysql_host = 'host'
mysql_port = 3306
mysql_user = 'user'
mysql_pass = 'password'
mysql_db = 'azkaban'


def init_door():
    """Process every dependency layer listed in ``list.conf``, in file order.

    ``list.conf`` is read from the current working directory. Each non-blank
    line must be a Python dict literal with the keys ``depend_projects``
    (a list of project names) and ``target_flows`` (a list of
    ``(project, flow)`` pairs). Every line is handed to ``exec_flow``.

    Raises:
        ValueError / SyntaxError: if a line is not a valid Python literal.
        KeyError: if a line lacks one of the two required keys.
    """
    import ast  # local import: only this function needs it

    with open("list.conf", "r") as f:
        exec_flow_lists = f.readlines()
    for flow in exec_flow_lists:
        flow = flow.strip()
        if not flow:
            # Blank lines (e.g. a trailing newline) used to crash the parser; skip them.
            continue
        # literal_eval only accepts literals, so a config line cannot run
        # arbitrary code the way eval() would.
        flow_dict = ast.literal_eval(flow)
        print("\033[0;36;40m"+"*" * 10 + "开始执行FLOW" + "*" * 10+"\033[0m")
        exec_flow(depend_projects=flow_dict["depend_projects"], target_flows=flow_dict["target_flows"])


def exec_flow(depend_projects, target_flows):
    """Trigger the target flows once all projects they depend on have run today.

    Args:
        depend_projects: names of the projects that must have run today.
        target_flows: sequence of ``(project_name, flow_name)`` pairs to trigger.

    Exits the process with status 1 when a referenced project does not exist,
    and with status 2 when the Azkaban login or a flow submission fails.
    """
    # First make sure every dependency project exists.
    for depend_project in depend_projects:
        if not judge_online(depend_project):
            print(f"\033[0;31;40m依赖的项目:{depend_project}不存在,请核对!!!\033[0m")
            sys.exit(1)
    # Then make sure every project we are asked to run exists.
    for target_flow in target_flows:
        if not judge_online(target_flow[0]):
            print(f"\033[0;31;40m要执行的项目:{target_flow[0]}不存在,请核对!!!\033[0m")
            sys.exit(1)
    # Wait until all dependency projects have run today.
    if not check_project_exec_result(depend_projects):
        return
    print("\033[0;32;40m所有依赖项目已执行,开始执行目标flow\033[0m")
    session_id = get_session_id()
    if not session_id:
        # Without a session every submission below would fail anyway; stop
        # here with a message that names the real cause.
        print("\033[0;31;40m登录Azkaban失败,无法执行目标flow!!!\033[0m")
        sys.exit(2)
    # Trigger each target flow whose project has not already run today.
    for target_flow in target_flows:
        project_name, flow_name = target_flow[0], target_flow[1]
        if not check_target_exec_result(project_name):
            continue
        print(f"开始执行project:{project_name},flow:{flow_name}")
        exec_id = exec_flows(session_id, project_name, flow_name)
        if not exec_id:
            print(f"执行project:{project_name},flow:{flow_name}失败!!!")
            sys.exit(2)
        print(f"执行project:{project_name},flow:{flow_name}成功!!!")


def execute(sql):
    """Run one SQL statement against the configured MySQL database.

    A new connection is opened for every call, using the module-level
    ``mysql_*`` settings, and is always closed again, even when the
    statement raises.

    Args:
        sql: the complete SQL text to execute.

    Returns:
        Whatever ``cursor.fetchall()`` yields; with ``DictCursor`` configured,
        each row is a dict keyed by column name.
    """
    config = {'host': mysql_host, 'port': mysql_port, 'user': mysql_user, 'password': mysql_pass,
              'db': mysql_db,
              'charset': 'utf8', 'cursorclass': pymysql.cursors.DictCursor}
    connection = pymysql.connect(**config)
    try:
        # The with-block closes the cursor; no second explicit close is needed.
        with connection.cursor() as cursor1:
            cursor1.execute(sql)
            result = cursor1.fetchall()
            connection.commit()
        return result
    finally:
        # Release the connection even if execute()/fetchall() raised.
        connection.close()


def judge_online(project_name):
    """Report whether a project with this name exists in the ``projects`` table.

    Returns True when the lookup yields at least one row, otherwise False.
    """
    # NOTE(review): the name is interpolated straight into the SQL text, so it
    # must come from a trusted source (here: list.conf).
    rows = execute("select * from projects where name='{}'".format(project_name))
    return len(rows) != 0


def check_project_exec_result(depend_projects, poll_interval=5, timeout=None):
    """Block until every dependency project has a matching execution today.

    For each project, the ``execution_flows`` table is polled for a row with
    ``status = 50`` whose end time is on or after today's midnight, joined to
    ``projects`` by name.

    Args:
        depend_projects: names of the projects to wait for, checked in order.
        poll_interval: seconds to sleep between two polls (default 5).
        timeout: overall limit in seconds for the whole wait; ``None``
            (the default) waits indefinitely, as before.

    Returns:
        True once every project has a matching row; False if ``timeout``
        elapsed first.
    """
    deadline = None if timeout is None else time.monotonic() + timeout
    for depend_project in depend_projects:
        print(f"检查{depend_project}是否已经执行............")
        sql = """SELECT *
                         FROM (
                            SELECT t2.name AS project_name, t1.*
                            FROM (
                                SELECT project_id, flow_id, status
                                    , substr(FROM_UNIXTIME(start_time / 1000), 1, 19) AS start_time
                                    , substr(FROM_UNIXTIME(end_time / 1000), 1, 19) AS end_time
                                    , enc_type
                                FROM azkaban.execution_flows
                                WHERE status = 50 AND substr(FROM_UNIXTIME(end_time/1000), 1, 19)>=DATE_FORMAT(CURDATE(),'%Y-%m-%d %H:%i:%s')
                            ) t1
                                INNER JOIN (
                                    SELECT *
                                    FROM projects
                                    WHERE name = '{}'
                                ) t2
                                ON t1.project_id = t2.id
                            ORDER BY end_time DESC
                         ) t
                         LIMIT 1""".format(depend_project)
        # Poll until a matching row appears or the optional deadline passes.
        while len(get_latest_record(sql)) == 0:
            if deadline is not None and time.monotonic() >= deadline:
                print(f"\033[0;31;40m等待{depend_project}项目执行超时\033[0m")
                return False
            print(f"{depend_project}项目还没有执行,请等待...")
            time.sleep(poll_interval)
        print(f"{depend_project}项目已执行")
    # Every dependency has run.
    return True


def check_target_exec_result(project_name):
    """Decide whether the target project still needs to be run today.

    Returns True when no ``status = 50`` execution ending today is found for
    the project, and False (after printing a skip notice) when one exists.
    """
    print(f"检查{project_name}是否已经执行............")
    query = """SELECT *
                 FROM (
                    SELECT t2.name AS project_name, t1.*
                    FROM (
                        SELECT project_id, flow_id, status
                            , substr(FROM_UNIXTIME(start_time / 1000), 1, 19) AS start_time
                            , substr(FROM_UNIXTIME(end_time / 1000), 1, 19) AS end_time
                            , enc_type
                        FROM azkaban.execution_flows
                        WHERE status = 50 AND substr(FROM_UNIXTIME(end_time/1000), 1, 19)>=DATE_FORMAT(CURDATE(),'%Y-%m-%d %H:%i:%s')
                    ) t1
                        INNER JOIN (
                            SELECT *
                            FROM projects
                            WHERE name = '{}'
                        ) t2
                        ON t1.project_id = t2.id
                    ORDER BY end_time DESC
                 ) t
                 LIMIT 1""".format(project_name)
    if len(get_latest_record(query)) == 0:
        # Nothing recorded for today, so the caller should run it.
        return True
    print(f"\033[0;33;40m{project_name}今天已经执行,跳过.........\033[0m")
    return False


def get_latest_record(sql):
    """Run ``sql`` through ``execute`` and return its rows unchanged."""
    return execute(sql)


def get_session_id():
    """Log in to Azkaban and return the session id.

    Posts ``USERNAME``/``PASSWORD`` to ``AZKABANURL`` with ``action=login``.

    Returns:
        The value of ``session.id`` from the JSON response, or ``None`` when
        the request fails, the response is not JSON, or the login is rejected.
        Every failure path prints a ``[FAIL]`` line.
    """
    params = {
        'action': 'login',
        'username': USERNAME,
        'password': PASSWORD
    }
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
        'X-Requested-With': 'XMLHttpRequest'
    }
    try:
        r = requests.post(AZKABANURL, data=params, headers=headers)
        # Parse the body once, and only for a 200 response.
        body = r.json() if r.status_code == 200 else None
    except (requests.RequestException, ValueError) as e:
        # Network failure, or a 200 response whose body is not JSON.
        print('[FAIL] getsession_id: %s' % str(e))
        return None

    if isinstance(body, dict) and 'session.id' in body:
        # print('[INFO] session_id:', body['session.id'])
        return body['session.id']

    # A rejected login used to return None without saying why.
    print('[FAIL] getsession_id: status_code(%d), %s' % (r.status_code, r.text))
    return None


def exec_flows(session_id, project_name, flow_name):
    """Submit one flow for execution through Azkaban's ``executeFlow`` call.

    Args:
        session_id: session id obtained from ``get_session_id``.
        project_name: name of the project that owns the flow.
        flow_name: name of the flow to run.

    Returns:
        The ``execid`` reported by Azkaban, or ``None`` on any failure
        (request error, non-200 status, non-JSON body, or a response that
        carries no ``execid``). Every failure path prints the reason.
    """
    params_list = {
        'session.id': session_id,
        'ajax': 'executeFlow',
        'project': project_name,
        'flow': flow_name,
    }
    try:
        r = requests.post(AZKABANURL + "/executor", data=params_list)
        if r.status_code != 200:
            # This case used to return None without any message.
            print('Flow执行失败:status_code(%d)' % r.status_code)
            return None
        body = r.json()
    except (requests.RequestException, ValueError) as e:
        print('Flow执行失败:%s' % str(e))
        return None

    if not isinstance(body, dict) or 'execid' not in body:
        # Print the server's own error text instead of a bare KeyError('execid').
        reason = body.get('error', body) if isinstance(body, dict) else body
        print('Flow执行失败:%s' % reason)
        return None

    execid = body['execid']
    print('%s项目%s流开始执行,execId:%s' % (project_name, flow_name, execid))
    return execid


def get_exec_id(session_id, project, flow):
    """Fetch the ids of up to three executions of a flow (``start=0``, ``length=3``).

    Args:
        session_id: session id obtained from ``get_session_id``.
        project: project name.
        flow: flow name.

    Returns:
        A list with the ``execId`` of every entry in the response's
        ``executions`` array.

    Exits the process with status -1 (after printing the reason) when the
    request fails or the response has no ``executions`` key.
    """
    try:
        # Pass the query as params= so requests URL-encodes project and flow
        # names; the old %-built URL broke on spaces, '&' and similar.
        r = requests.get('%s/manager' % AZKABANURL, params={
            'session.id': session_id,
            'ajax': 'fetchFlowExecutions',
            'project': project,
            'flow': flow,
            'start': 0,
            'length': 3,
        }, verify=False)
        body = r.json()

        if r.status_code == 200 and 'executions' in body:
            execIds = [execution['execId'] for execution in body['executions']]
            print('[INFO] execIds:', execIds)
            return execIds
        else:
            raise Exception('[FAIL] getExecId: status_code(%d), %s' % (r.status_code, body))
    except Exception as e:
        print('[FAIL] getExecId: %s' % str(e))
        sys.exit(-1)


def get_running_status(session_id, exec_id):
    """Fetch one execution's details through Azkaban's ``fetchexecflow`` call.

    Returns the parsed JSON response when the status is 200 and the body has
    a ``nodes`` key. Any other outcome is printed and ends the process with
    status -1.
    """
    payload = {'session.id': session_id,
               'ajax': 'fetchexecflow',
               'execid': exec_id}
    try:
        resp = requests.post('%s/executor' % AZKABANURL, verify=False, data=payload)
        if not (resp.status_code == 200 and 'nodes' in resp.json()):
            raise Exception('[FAIL] getRunningStatus: status_code(%d), %s' % (resp.status_code, resp.json()))
        return resp.json()
    except Exception as err:
        print('[FAIL] getRunningStatus: %s' % str(err))
        sys.exit(-1)


def check_status(session_id, project, flow):
    """Print the status of every node in the flow's recent executions.

    Args:
        session_id: session id obtained from ``get_session_id``.
        project: project name.
        flow: flow name.
    """
    exec_ids = get_exec_id(session_id, project, flow)

    for exec_id in exec_ids:
        # Fix: the first argument must be the session id, not the project name.
        running_status = get_running_status(session_id, exec_id)
        # Fix: iterate the node entries. Looping over the response dict itself
        # yields its keys (strings), so rs['id'] raised TypeError.
        for node in running_status['nodes']:
            node_id, attempt, status = node['id'], node['attempt'], node['status']
            print('execId: %s, id: %s, attempt: %d, status: %s' % (exec_id, node_id, attempt, status))


# Script entry point: process list.conf only when run directly, not on import.
if __name__ == '__main__':
    init_door()
  2. 准备依赖执行的项目文件 list.conf,内容如下:
{"depend_projects":["P1","P2"],"target_flows":[("P3","p3_flow"),("P4","p4_flow")]}
{"depend_projects":["P3","P4"],"target_flows":[("P5","p5_flow")]}
{"depend_projects":["P5"],"target_flows":[("P6","p6_flow")]}

一行代表一层依赖(注意:该文件内容中间不可有空行),上述文件所表示的依赖图例为:
azkaban多项目之间依赖检查执行_第1张图片
3. 执行:python3 azkaban_project_auto.py
4. 将3中执行该脚本的命令写成一个job文件,上传azkaban部署为定时任务即可

附件:python2版本的检测脚本如下

#!/usr/bin/python
# coding=UTF-8
# @Author: mark.zheng
# @Time: 2021-05-13
# @Desc: 处理Azkaban project执行流水线

import sys
import time
import MySQLdb
import requests
import json

# Python 2 only: set the process-wide default string encoding to UTF-8
# (presumably so the Chinese messages can be mixed with other string values
# without UnicodeDecodeError; verify before porting).
reload(sys)
sys.setdefaultencoding('utf8')
# Base URL of the Azkaban web server. Placeholder value: replace ip:port before use.
AZKABANURL = 'http://ip:port'
# Credentials that get_session_id() posts to Azkaban to log in (placeholders).
USERNAME = 'username'
PASSWORD = 'password'

# Connection settings that execute() passes to MySQLdb.connect() (placeholders).
mysql_host = 'host'
mysql_port = 3306
mysql_user = 'user'
mysql_pass = 'password'
mysql_db = 'azkaban'

def init_door():
    with open("list.conf", "r") as f:
        exec_flow_lists = f.readlines()
    for flow in exec_flow_lists:
        flow = flow.replace("\n", "")
        flow_dict = eval(flow)
        print "\033[0;36;40m"+"*" * 10 + "开始执行FLOW" + "*" * 10+"\033[0m"
        exec_flow(depend_projects=flow_dict["depend_projects"], target_flows=flow_dict["target_flows"])


def exec_flow(depend_projects, target_flows):
    # 首先判断传入依赖project是否都存在
    for depend_project in depend_projects:
        if not judge_online(depend_project):
            print "\033[0;31;40m依赖的项目:",depend_project,"不存在,请核对!!!\033[0m"
            sys.exit(1)
    # 判断要执行的project是否存在
    for target_flow in target_flows:
        if not judge_online(target_flow[0]):
            print "\033[0;31;40m要执行的项目:",target_flow[0],"不存在,请核对!!!\033[0m"
            sys.exit(1)
    # 判断所有依赖项目今天是否执行成功
    if check_project_exec_result(depend_projects):
        print "\033[0;32;40m所有依赖项目已执行,开始执行目标flow\033[0m"
        session_id = get_session_id()
        # 执行目标flow
        for target_flow in target_flows:
            if check_target_exec_result(target_flow[0]):
                print "开始执行project:",target_flow[0],",flow:",target_flow[1]
                exec_id = exec_flows(session_id, target_flow[0], target_flow[1])
                if not exec_id:
                    print "执行project:",target_flow[0],",flow:",target_flow[1],"失败!!!"
                    sys.exit(2)
                else:
                    print "执行project:",target_flow[0],",flow:",target_flow[1],"成功!!!"


def execute(sql):
    """Run one SQL statement against the configured MySQL database.

    A new connection is opened for every call, using the module-level
    ``mysql_*`` settings, and both cursor and connection are always closed
    again, even when the statement raises.

    Args:
        sql: the complete SQL text to execute.

    Returns:
        Whatever ``cursor.fetchall()`` yields for the statement.
    """
    config = {'host': mysql_host, 'port': mysql_port, 'user': mysql_user, 'passwd': mysql_pass,
              'db': mysql_db,
              'charset': 'utf8'}
    connection = MySQLdb.connect(**config)
    try:
        cursor1 = connection.cursor()
        try:
            cursor1.execute(sql)
            result = cursor1.fetchall()
            connection.commit()
        finally:
            cursor1.close()
    finally:
        # Release the connection even if execute()/fetchall() raised.
        connection.close()
    return result


def judge_online(project_name):
    """Report whether a project with this name exists in the ``projects`` table.

    Returns True when the lookup yields at least one row, otherwise False.
    """
    # NOTE(review): the name is interpolated straight into the SQL text, so it
    # must come from a trusted source (here: list.conf).
    rows = execute("select * from projects where name='{}'".format(project_name))
    return len(rows) != 0


def check_project_exec_result(depend_projects):
    """
    检查关联项目是否执行完毕
    """
    for depend_project in depend_projects:
        print "检查",depend_project,"是否已经执行............"
        sql = """SELECT *
                         FROM (
                            SELECT t2.name AS project_name, t1.*
                            FROM (
                                SELECT project_id, flow_id, status
                                    , substr(FROM_UNIXTIME(start_time / 1000), 1, 19) AS start_time
                                    , substr(FROM_UNIXTIME(end_time / 1000), 1, 19) AS end_time
                                    , enc_type
                                FROM azkaban.execution_flows
                                WHERE status = 50 AND substr(FROM_UNIXTIME(end_time/1000), 1, 19)>=DATE_FORMAT(CURDATE(),'%Y-%m-%d %H:%i:%s')
                            ) t1
                                INNER JOIN (
                                    SELECT *
                                    FROM projects
                                    WHERE name = '{}'
                                ) t2
                                ON t1.project_id = t2.id
                            ORDER BY end_time DESC
                         ) t
                         LIMIT 1""".format(depend_project)
        while True:
            exec_result = get_latest_record(sql)
            if len(exec_result) == 0:
                print depend_project,"项目还没有执行,请等待..."
                time.sleep(5)
            else:
                print depend_project,"项目已执行"
                break
    # 全部执行完毕
    return True


def check_target_exec_result(project_name):
    """
    检查目标project今天是否执行过
    """
    print "检查",project_name,"是否已经执行............"
    sql = """SELECT *
                 FROM (
                    SELECT t2.name AS project_name, t1.*
                    FROM (
                        SELECT project_id, flow_id, status
                            , substr(FROM_UNIXTIME(start_time / 1000), 1, 19) AS start_time
                            , substr(FROM_UNIXTIME(end_time / 1000), 1, 19) AS end_time
                            , enc_type
                        FROM azkaban.execution_flows
                        WHERE status = 50 AND substr(FROM_UNIXTIME(end_time/1000), 1, 19)>=DATE_FORMAT(CURDATE(),'%Y-%m-%d %H:%i:%s')
                    ) t1
                        INNER JOIN (
                            SELECT *
                            FROM projects
                            WHERE name = '{}'
                        ) t2
                        ON t1.project_id = t2.id
                    ORDER BY end_time DESC
                 ) t
                 LIMIT 1""".format(project_name)
    exec_result = get_latest_record(sql)
    if len(exec_result) != 0:
        print "\033[0;33;40m",project_name,"今天已经执行,跳过.........\033[0m"
        return False
    return True


def get_latest_record(sql):
    """Run ``sql`` through ``execute`` and return its rows unchanged."""
    return execute(sql)


def get_session_id():
    """
    获取AZkaban登录session_id
    """
    try:
        params = {
            'action': 'login',
            'username': USERNAME,
            'password': PASSWORD
        }
        r = requests.post(AZKABANURL, data=params, headers={
            'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
            'X-Requested-With': 'XMLHttpRequest'
        })

        if r.status_code == 200 and 'session.id' in r.json():
            session_id = r.json()['session.id']
            # print('[INFO] session_id:', session_id)
            return session_id
    except Exception as e:
        print '[FAIL] getsession_id: %s' % str(e)
        return None


def exec_flows(session_id, project_name, flow_name):
    """
    执行Flow
    """
    try:
        params_list = {}
        params_list['session.id'] = session_id
        params_list['ajax'] = 'executeFlow'
        params_list['project'] = project_name
        params_list['flow'] = flow_name

        r = requests.post(AZKABANURL + "/executor", data=params_list)

        if r.status_code == 200:
            execid = r.json()['execid']
            print '%s项目%s流开始执行,execId:%s' % (project_name, flow_name, execid)
            return execid
    except Exception as e:
        print 'Flow执行失败:%s' % str(e)
        return None


def get_exec_id(session_id, project, flow):
    """
    获取Azkaban任务的运行id
    """
    try:
        r = requests.get('%s/manager?session.id=%s&ajax=fetchFlowExecutions&project=%s&flow=%s&start=0&length=3' % (
            AZKABANURL, session_id, project, flow), verify=False)

        if r.status_code == 200 and 'executions' in r.json():
            execIds = []
            for execution in r.json()['executions']:
                execIds.append(execution['execId'])
            print '[INFO] execIds:', execIds
            return execIds
        else:
            raise Exception('[FAIL] getExecId: status_code(%d), %s' % (r.status_code, r.json()))
    except Exception as e:
        print '[FAIL] getExecId: %s' % str(e)
        sys.exit(-1)


def get_running_status(session_id, exec_id):
    """
    获取AZkaban任务的运行状态
    """
    try:
        r = requests.post('%s/executor' % AZKABANURL, verify=False, data={'session.id': session_id,
                                                                          'ajax': 'fetchexecflow',
                                                                          'execid': exec_id})
        if r.status_code == 200 and 'nodes' in r.json():
            return r.json()
        else:
            raise Exception('[FAIL] getRunningStatus: status_code(%d), %s' % (r.status_code, r.json()))
    except Exception as e:
        print '[FAIL] getRunningStatus: %s' % str(e)
        sys.exit(-1)


def check_status(session_id, project, flow):
    """
    检查AZkaban project任务的所有运行状态
    """
    exec_ids = get_exec_id(session_id, project, flow)

    for exec_id in exec_ids:
        runningStatus = get_running_status(project, exec_id)
        for rs in runningStatus:
            id, attempt, status = rs['id'], rs['attempt'], rs['status']
            print 'execId: %s, id: %s, attempt: %d, status: %s' % (exec_id, id, attempt, status)


# Script entry point: process list.conf only when run directly, not on import.
if __name__ == '__main__':
    init_door()


你可能感兴趣的:(随记)