Python - 基于nameko任务管理

Python - 基于nameko任务管理

Max.Bai

2019-05

 

功能

1. 任务管理:开始任务,停止任务(排队中,在执行),查看队列(排队中,执行中,slave)

2. 执行端(slave):按运行环境区分类型(ONLINE / PRE-RELEASE / TEST),并每 5 秒向 master 发送一次心跳

需要的库:

amqp==2.2.2
amqplib==1.0.2
Jinja2==2.10
nameko==2.12.0
peewee==3.9.4
six==1.11.0
threadpool==1.3.2
click

Master端代码

# _*_ coding:utf-8 _*_

import queue
import threading
import time
import traceback

from nameko.dependency_providers import Config
from nameko.rpc import rpc
from nameko.standalone.rpc import ServiceRpcProxy
from nameko.timer import timer


'''
Task manager

1. start task 
2. stop task when running (send signal to slave)
3. start task with env

author: Max.Bai
date: 2019-04
'''


# Slave status values: sent by slaves with each heartbeat and compared
# against in TaskManager.check_runner.
SLAVE_STATUS_IDLE = "slave_idle"
SLAVE_STATUS_RUNNING = "slave_running"

# Task signal carried in the payload that stop_task sends to the slaves.
TASK_SIGNAL_STOP = "stop_signal"

# Task execution environments; a job is only dispatched to a runner whose
# env equals the job's env.
ENV_ONLINE = "ONLINE"
ENV_TEST = "TEST"
ENV_PRE = "PRE-RELEASE"

# CONFIG = {'AMQP_URI': "amqp://guest:guest@localhost"}
# Held by TaskManager.stop_task while it scans the ready queue.
QUEUE_LOCK = threading.Lock()


class TaskManager:
    """Master-side nameko service that queues tasks and dispatches them.

    Tasks submitted through ``start_task`` wait in ``task_ready_queue`` until
    ``check_runner`` finds an idle runner with a matching environment.  They
    then move to ``task_runing_queue`` until ``check_running_task`` sees them
    marked done or cancelled.
    """
    name = "task_master"
    # Shared state lives on the class rather than on instances; the lists are
    # only ever mutated in place so every instance sees the same objects.
    task_runing_queue = []   # Job objects already sent to a slave
    task_ready_queue = []    # Job objects waiting for an idle slave
    runner_list = []         # Runner objects built from slave heartbeats
    _priority = 0            # last value handed out by the priority counter

    CONFIG = Config()

    @classmethod
    def _next_priority(cls):
        """Advance the shared priority counter and return its new value.

        The counter counts upwards and wraps back to 0 once it exceeds 1000.
        """
        TaskManager._priority += 1
        if TaskManager._priority > 1000:
            TaskManager._priority = 0
        return TaskManager._priority

    @property
    def priority(self):
        """Next priority value (each read advances the shared counter).

        Only meaningful on an instance: ``TaskManager.priority`` on the class
        is the property object itself, so class-level code must call
        ``_next_priority()`` instead.
        """
        return self._next_priority()

    @rpc
    def get_runner_list(self):
        """Return the registered runners as ``{"name", "status"}`` dicts."""
        return [{"name": r.name, "status": r.status} for r in self.runner_list]

    @rpc
    def get_queue_list(self):
        """Return the queued (not yet dispatched) jobs as ``{"id", "name"}`` dicts."""
        return [{"id": j.id, "name": j.name} for j in TaskManager.task_ready_queue]

    @rpc
    def get_running_list(self):
        """Return the dispatched jobs as ``{"id", "name"}`` dicts."""
        return [{"id": j.id, "name": j.name} for j in TaskManager.task_runing_queue]

    @rpc
    def start_task(self, task):
        """Validate ``task`` and append it to the ready queue.

        :param task: dict with the keys ``id``, ``name`` and ``env``.
        :returns: ``{"code": 0, "msg": ""}`` on success, otherwise
            ``{"code": 1, "msg": <error text>}``.
        """
        result = {
            "code": 0,
            "msg": ""
        }
        try:
            self.check_task_data(task)
            # Read the priority through the instance so the property runs.
            job = Job(task["id"], task["name"], task["env"], self.priority)
            TaskManager.task_ready_queue.append(job)
        except Exception as e:
            result["code"] = 1
            result["msg"] = "Start task failed. ERROR:{}".format(str(e))
        return result

    @rpc
    def stop_task(self, task):
        """Cancel a task, whether it is still queued or already running.

        A queued task is simply removed from the ready queue.  Otherwise a
        stop signal is sent asynchronously to the ``runner_listener`` service.

        :param task: dict that must contain ``id``; ``name`` is used for
            logging when present.
        :returns: ``{"code": 0, "msg": ""}`` on success, otherwise
            ``{"code": 1, "msg": <error text>}``.
        """
        stoped = False
        result = {
            "code": 0,
            "msg": ""
        }
        try:
            # Fail before taking the lock if the payload has no id.
            task_id = task["id"]
            # The context manager releases the lock on every exit path.
            with QUEUE_LOCK:
                for j in TaskManager.task_ready_queue:
                    if j.id == task_id:
                        # Safe to remove while iterating: we break right away.
                        TaskManager.task_ready_queue.remove(j)
                        stoped = True
                        print("Stop task [{}] in queue success.".format(j.name))
                        break

            # task in running
            if not stoped:
                payload = {
                    "signal": TASK_SIGNAL_STOP,
                    "task": task
                }
                with ServiceRpcProxy("runner_listener", self.CONFIG) as slave:
                    slave.send_signal.call_async(payload)
                print("Stop task [{}] in running status success.".format(task.get("name", task_id)))
        except Exception as e:
            result["code"] = 1
            result["msg"] = "Stop task failed. ERROR:{}".format(str(e))
        return result

    @rpc
    def register_runner(self, name, env, status, heart_beat):
        """Record a heartbeat from a slave, registering it on first contact.

        Heartbeats whose timestamp is 10 or more seconds old are ignored.

        :param name: unique slave name.
        :param env: environment the slave serves.
        :param status: slave status string.
        :param heart_beat: ``time.time()`` value taken by the slave.
        """
        if time.time() - heart_beat >= 10:
            return
        new_runner = True
        for r in self.runner_list:
            if r.name == name:
                r.refresh(status, heart_beat)
                new_runner = False
                break
        if new_runner:
            self.runner_list.append(Runner(name, env, status))
            print("New slave [{}] <{}> registed, total {} runners registed.".format(name, env, len(self.runner_list)))

    @timer(interval=3)
    def check_runner(self):
        '''
        1. check the runner and remove the died runner
        2. start new task when there is a runner idle
        '''
        # refresh runner list; walk backwards so pop() keeps indexes valid
        for i in range(len(self.runner_list)-1, -1, -1):
            if time.time() - self.runner_list[i].heart_beat >= 10:
                name = self.runner_list[i].name
                self.runner_list.pop(i)
                print("Remove slave [{}], total {} runners registed.".format(name, len(self.runner_list)))

        for r in self.runner_list:
            if r.status == SLAVE_STATUS_IDLE:
                for j in TaskManager.task_ready_queue:
                    if j.env == r.env:
                        # start run
                        result = self.send_task_to_salve(j)
                        time.sleep(0.5) # wait slave update status
                        if result["status"]:
                            # Safe to remove while iterating: we break right away.
                            TaskManager.task_ready_queue.remove(j)
                            TaskManager.task_runing_queue.append(j)
                            break

    @classmethod
    def load_queue_task(cls):
        """Refill the ready queue from the database at start-up."""
        print("Starting loading queue tasks from database ...")
        queue_tasks = getfromdb()  # todo: get task from db
        for task in queue_tasks:
            print("Load Queue Task:{} - {}".format(task.id, task.name))
            # Class-level code cannot use the ``priority`` property.
            TaskManager.task_ready_queue.append(Job(task.id, task.name, 'TEST', cls._next_priority()))
        print("Completed load queue tasks from database.")

    @timer(interval=5)
    def check_running_task(self):
        """Drop jobs from the running list once the database marks them finished."""
        # clear done or cancled task; walk backwards so pop() keeps indexes valid
        for i in range(len(TaskManager.task_runing_queue)-1, -1, -1):
            db_task = get_task_info_byid(TaskManager.task_runing_queue[i].id) #todo get task info
            if db_task.status == 'CANCLE' or db_task.status == 'DONE':
                TaskManager.task_runing_queue.pop(i)
                print("Remove complete task from running list task:{} with {}.".format(db_task.name, db_task.status))

    def send_task_to_salve(self, job):
        """Send ``job`` asynchronously to the slave service of its environment.

        :param job: the Job to dispatch.
        :returns: ``{"status": bool, "msg": str}``; ``status`` is False when
            sending raised an exception.
        """
        result = {
            "status": True,
            "msg": "Send task to slave success."
        }
        try:
            print("Send task [{}] to [{}] slave".format(job.name, job.env))
            with ServiceRpcProxy("runner_slave_{}".format(job.env), self.CONFIG) as slave:
                slave.run_task.call_async(job.to_dict())
        except Exception as e:
            result["status"] = False
            result["msg"] = "Send task to slave failed. ERROR:{}".format(str(e))
            traceback.print_exc()
        return result

    def check_task_data(self, task):
        """Validate the ``env`` field of an incoming task dict.

        :raises ValueError: if ``env`` is missing or not a known environment.
        """
        if task.get('env', None) not in [ENV_ONLINE, ENV_PRE, ENV_TEST]:
            raise ValueError('Task data  value invalid!')


class Runner:
    """A slave known to the master, with its latest status and heartbeat.

    ``name`` and ``heart_beat`` are plain attributes, ``status`` is a
    read/write property and ``env`` is read-only.
    """

    def __init__(self, name, env, status):
        # The heartbeat starts at creation time; refresh() replaces it later.
        self.heart_beat = time.time()
        self.name = name
        self._env = env
        self._status = status

    @property
    def env(self):
        """Environment this runner serves (read-only)."""
        return self._env

    @property
    def status(self):
        """Most recently reported status."""
        return self._status

    @status.setter
    def status(self, value):
        self._status = value

    def refresh(self, status, heart_beat):
        """Store the status and timestamp carried by a new heartbeat."""
        self.status = status
        self.heart_beat = heart_beat

class Job:
    """A task waiting for, or handed to, a slave.

    All fields are exposed as read-only properties.  Jobs compare by
    ``priority`` through ``__lt__``.
    """

    def __init__(self, id, name, env, priority):
        self._priority = priority
        self._env = env
        self._name = name
        self._id = id

    @property
    def id(self):
        """Task identifier."""
        return self._id

    @property
    def name(self):
        """Task name."""
        return self._name

    @property
    def env(self):
        """Environment the task must run in."""
        return self._env

    @property
    def priority(self):
        """Priority assigned at creation; there is deliberately no setter."""
        return self._priority

    def __lt__(self, other):
        mine = self.priority
        theirs = other.priority
        return mine < theirs

    def to_dict(self):
        """Return the ``id``/``name``/``env`` fields as a plain dict."""
        payload = {}
        payload["id"] = self.id
        payload["name"] = self.name
        payload["env"] = self.env
        return payload

Slave 代码:

# _*_ coding:utf-8 _*_

import datetime
import json
import logging
import os
import socket
import time
import traceback
import uuid

from nameko.dependency_providers import Config
from nameko.events import BROADCAST, EventDispatcher, event_handler
from nameko.rpc import rpc
from nameko.standalone.rpc import ServiceRpcProxy
from nameko.timer import timer

import pts.ptsconfig as PTSConfig
from pts.pts_master import (ENV_ONLINE, ENV_PRE, ENV_TEST, SLAVE_STATUS_IDLE,
                            SLAVE_STATUS_RUNNING, TASK_SIGNAL_STOP)



'''
Task slave

author: Max.Bai
date: 2019-04

1. start slave with diffrent enviroment
2. handle task signal
3. heartbeat every 5s
'''


logging.getLogger('LocustExecutor').setLevel(logging.INFO)

# Environment served by this slave process: the 'slave_env' config entry,
# upper-cased, falling back to ENV_TEST when the entry is absent.
SLAVE_ENV = PTSConfig.get_config().get('slave_env', ENV_TEST).upper()
if SLAVE_ENV not in (ENV_ONLINE, ENV_PRE, ENV_TEST):
    print('Env value error!')
    # Raise SystemExit directly: the bare exit() helper is injected by the
    # `site` module and is not guaranteed to exist in every interpreter.
    raise SystemExit(-1)
# Unique per-process name: environment, host name and the first 8 characters
# of a uuid1.
SLAVE_NAME = "runner_slave_{}_{}_{}".format(SLAVE_ENV, socket.gethostname(), str(uuid.uuid1())[:8])

class TestTask():
    """Mutable record of the task this slave process is currently handling."""
    # id of the task being run; set by TaskRunnerSlave.run_task.
    id = 0
    # Not assigned anywhere in the code visible here.
    name = ""
    # Set to TASK_SIGNAL_STOP by TaskRunnerListener.handle_stop_task and
    # cleared by run_task; polled by TaskRunnerSlave.check_running.
    signal = ""
    # Status reported by the listener's periodic heartbeat.
    status = SLAVE_STATUS_IDLE

# Single module-level instance shared by TaskRunnerSlave and TaskRunnerListener.
slave_task = TestTask()

class TaskRunnerSlave(object):
    """Slave-side nameko service that executes one task sent by the master.

    The service name embeds the slave environment, so the master reaches it
    as ``runner_slave_<ENV>``.  Progress is mirrored into the module-level
    ``slave_task`` record so the listener can report status and deliver stop
    signals.
    """
    name = "runner_slave_{}".format(SLAVE_ENV)
    status = SLAVE_STATUS_IDLE
    CONFIG = Config()

    @rpc
    def run_task(self, task: dict):
        """Run ``task`` to completion and restore the idle state afterwards.

        :param task: dict with at least the keys ``id`` and ``name``.
        """
        print("run task...", task)
        self.status = SLAVE_STATUS_RUNNING
        slave_task.id = task["id"]
        slave_task.signal = ""  # forget any stop signal from a previous task
        slave_task.status = self.status
        self.task_duration_timeout = False
        self.cancel_task = False
        # Report the running status now instead of waiting for the timer.
        self.heart_beat()
        try:
            self.do_task(task)
        except Exception as e:
            traceback.print_exc()
            print("Running task failed with ERROR:{}".format(str(e)))
        finally:
            self.reset_slave_status()

    def reset_slave_status(self):
        """Return this service and the shared ``slave_task`` record to idle."""
        # complete task initial slave status
        self.status = SLAVE_STATUS_IDLE
        self.task_duration_timeout = False
        self.cancel_task = False
        slave_task.status = self.status

    def do_task(self, task):
        """Wait until the task times out or is cancelled, then log the outcome.

        :param task: the task dict received by ``run_task``.
        """
        # ``task`` is a dict (see run_task), so the name is read by key.
        task_name = task["name"]
        print("start task...", task_name)
        duration = 20
        start_time = time.time()
        while not self.check_running(start_time, duration):
            time.sleep(1)
        if self.cancel_task:
            # save cancle task
            print("cancel task:", task_name)
        else:
            # save complete task
            print("task done:", task_name)
        print("task running complete.", task_name, "cost:", time.time()-start_time)

    def check_running(self, start_time, duration):
        """Update the timeout/cancel flags and tell whether the task must end.

        :param start_time: ``time.time()`` value taken when the task started.
        :param duration: maximum run time in seconds.
        :returns: True once the duration is exceeded or a stop signal arrived.
        """
        if (time.time() - start_time) > duration:
            self.task_duration_timeout = True
        if slave_task.signal == TASK_SIGNAL_STOP:
            self.cancel_task = True
        return self.task_duration_timeout or self.cancel_task

    def heart_beat(self):
        """Send this slave's name, env, status and the current time to the master."""
        with ServiceRpcProxy("task_master", self.CONFIG) as master:
            master.register_runner.call_async(SLAVE_NAME, SLAVE_ENV, self.status, time.time())



class TaskRunnerListener(object):
    """Slave-side service that receives control signals and sends heartbeats.

    ``send_signal`` turns a stop request into a ``stop_task`` event, and
    ``handle_stop_task`` flags the shared ``slave_task`` record when the
    event names the task this process is running.
    """
    name = "runner_listener"
    dispatch = EventDispatcher()
    CONFIG = Config()

    # Printed once, when this class body is executed.
    print(" SLAVE ID:  {}".format(SLAVE_NAME))
    print(" SLAVE ENV:{: >10}".format(SLAVE_ENV))

    @rpc
    def send_signal(self, payload):
        """Re-publish a stop request as a ``stop_task`` event; ignore other signals."""
        if payload["signal"] != TASK_SIGNAL_STOP:
            return
        self.dispatch("stop_task", payload)

    # NOTE(review): BROADCAST is presumably used so that every slave process
    # sees the stop request -- confirm against the nameko events documentation.
    @event_handler("runner_listener", "stop_task", handler_type=BROADCAST, reliable_delivery=False)
    def handle_stop_task(self, payload):
        """Flag the current task for cancellation when the event targets it."""
        targets_current_task = slave_task.id == payload["task"]["id"]
        if not targets_current_task:
            return
        slave_task.signal = TASK_SIGNAL_STOP
        print("stopping task ...:", payload)

    @timer(interval=5)
    def heart_beat(self):
        """Report this slave's name, env and current status to the master."""
        with ServiceRpcProxy("task_master", self.CONFIG) as master:
            register = master.register_runner
            register.call_async(SLAVE_NAME, SLAVE_ENV, slave_task.status, time.time())

main入口

#!/usr/bin/env python3
# _*_ coding:utf-8 _*_

import json

import click
import eventlet
from nameko.runners import ServiceRunner

import pts.ptsconfig as PTSConfig
# Patch the standard library so blocking calls cooperate with eventlet.
# NOTE(review): this runs after nameko.runners has been imported above;
# eventlet's documentation advises patching as early as possible -- confirm
# that the current ordering is intentional.
eventlet.monkey_patch()


@click.command()
@click.option('--master', 'master', flag_value='master', help="start master")
@click.option('--slave', 'master', flag_value='slave', help="start slave")
@click.option("-c", "--config_file", "config_file", type=click.Path(), required=True, help="config file")
def main(master, config_file):
    """Start pts master or slave
    """
    # The docstring above is left untouched because click uses it as the
    # command's help text.
    #
    # master      -- 'master' or 'slave' depending on the flag given, or None
    #                when neither --master nor --slave was passed.
    # config_file -- path of the JSON configuration file.

    # Banner lines are raw strings: the backslashes are literal characters,
    # not escape sequences.
    print(" "*70)
    print("           ___    _____    ___   ")
    print(r"          |  _ \ /__   \  / __\   _____                         ")
    print(r"          | (_) |  / /\/ | (__   /__   \  ___   __ _  _ __ ___  ")
    print(r"          |  __/  / /     \__ \    / /\/ / _ \ / _` || '_ ` _ \ ")
    print("          | |    / /       __) |  / /   |  __/| (_| || | | | | |")
    print(r"          |_|    |/       \___/   \/     \___| \__,_||_| |_| |_| @v1.0")
    print(" "*70)
    if config_file:
        config = read_config(config_file)
    else:
        click.echo("config file need!")
        return -1
    valied_list = ['master', 'slave']
    if master and master.lower() in valied_list:
        if master.lower() == 'master':
            start_master(config)
        else:
            start_slave(config)
    else:
        click.echo('Only {} argument accepted!\r\nTry "Python HetPTS_runner.py --help" for help.'.format(','.join(valied_list)))


def start_master(config_items):
    """Print the master banner and config, then run the TaskManager service.

    :param config_items: configuration dict; its ``AMQP_URI`` entry is passed
        on to the nameko runner.
    """
    rule = "-" * 28
    for banner_line in (rule, "|        Task Master       |", rule):
        print(banner_line)
    print_config(config_items)

    # Imported here rather than at module level, after the banner and the
    # configuration have been printed.
    from pts.pts_master import TaskManager

    service_config = dict(
        AMQP_URI=config_items.get('AMQP_URI'),
        max_workers=10,
        parent_calls_tracked=10,
    )
    service_runner = ServiceRunner(config=service_config)
    service_runner.add_service(TaskManager)

    # Refill the ready queue before the service starts.
    TaskManager.load_queue_task()
    service_runner.start()
    service_runner.wait()

def start_slave(config_items):
    """Print the slave banner and config, then run both slave services.

    :param config_items: configuration dict; its ``AMQP_URI`` entry is passed
        on to the nameko runner.
    """
    rule = "-" * 28
    for banner_line in (rule, "|        Task Slave        |", rule):
        print(banner_line)
    print_config(config_items)

    # Imported here rather than at module level, after the banner and the
    # configuration have been printed.
    from pts.pts_slave import TaskRunnerListener, TaskRunnerSlave

    service_config = dict(
        AMQP_URI=config_items.get('AMQP_URI'),
        max_workers=1,
        parent_calls_tracked=10,
    )
    service_runner = ServiceRunner(config=service_config)
    for service_cls in (TaskRunnerSlave, TaskRunnerListener):
        service_runner.add_service(service_cls)

    service_runner.start()
    service_runner.wait()

def print_config(config_items):
    """Print every config entry as ``[key]:value``, key left-aligned to 10 columns."""
    for option, value in config_items.items():
        print("[{: <10}]:{}".format(option, value))

def read_config(config_file):
    """Load the JSON config file, check its keys and store it in PTSConfig.

    :param config_file: path of a UTF-8 encoded JSON file.
    :returns: the parsed configuration.
    :raises Exception: if the file cannot be read or parsed, or if any of the
        required keys is missing; the original error text is appended to the
        message.
    """
    required_keys = ("AMQP_URI", "db_host", "db_port", "db_name", "db_user", "db_pwd")
    try:
        with open(config_file, 'r', encoding='utf-8') as handle:
            loaded = json.load(handle)
            present = loaded.keys()
            if not all(key in present for key in required_keys):
                raise Exception('Please check the config file with config_demo.json!')
            PTSConfig.set_config(loaded)
            return loaded
    except Exception as err:
        raise Exception('Read config file ERROR! {}'.format(str(err)))

# Script entry point. main is wrapped by click.command(), which is why it is
# called here without arguments.
if __name__ == "__main__":
    main()

配置类

#!/usr/bin/env python3
# _*_ coding:utf-8 _*_

class PTSConfig():
    """Holder for the loaded configuration, kept as a class attribute."""
    # Value stored by set_config(); None until set_config() has been called.
    settings = None

def set_config(config):
    """Store ``config`` on PTSConfig, replacing any previous value."""
    PTSConfig.settings = config


def get_config():
    """Return the value stored by set_config(), or None if it was never called."""
    return PTSConfig.settings

如何启动


# start master
python3 runner --master -c ./config.json

# start slave
python3 runner --slave -c ./config.json

 

你可能感兴趣的:(Python)