Max.Bai
2019-05
1. 任务管理:开始任务,停止任务(排队中,在执行),查看队列(排队中,执行中,slave)
2. 执行端:分不同类型,心跳
amqp==2.2.2
amqplib==1.0.2
Jinja2==2.10
nameko==2.12.0
peewee==3.9.4
six==1.11.0
threadpool==1.3.2
click
# _*_ coding:utf-8 _*_
import queue
import threading
import time
import traceback
from nameko.dependency_providers import Config
from nameko.rpc import rpc
from nameko.standalone.rpc import ServiceRpcProxy
from nameko.timer import timer
'''
Task manager
1. start task
2. stop task when running (send signal to slave)
3. start task with env
author: Max.Bai
date: 2019-04
'''
# Slave status values (a runner reports one of these when it registers)
SLAVE_STATUS_IDLE = "slave_idle"
SLAVE_STATUS_RUNNING = "slave_running"
# Task signal sent to a slave to stop the task it is running
TASK_SIGNAL_STOP = "stop_signal"
# Task execution environments accepted by TaskManager.check_task_data
ENV_ONLINE = "ONLINE"
ENV_TEST = "TEST"
ENV_PRE = "PRE-RELEASE"
# CONFIG = {'AMQP_URI': "amqp://guest:guest@localhost"}
# Lock taken by TaskManager.stop_task while it removes a job from the ready queue
QUEUE_LOCK = threading.Lock()
class TaskManager:
    """Task master service: queues tasks and hands them to idle runners.

    All queue/runner state lives in class attributes, so it is shared by
    every instance of the service (presumably because nameko builds a new
    service instance for each entrypoint call -- confirm against the
    nameko documentation).

    RPC entrypoints:
        get_runner_list / get_queue_list / get_running_list -- inspection
        start_task / stop_task                              -- task control
        register_runner                                     -- slave heartbeat

    Timers:
        check_runner        (every 3s) drop dead runners, dispatch jobs
        check_running_task  (every 5s) forget finished/cancelled jobs
    """

    name = "task_master"

    # Shared state (class level on purpose, see class docstring).
    task_runing_queue = []   # jobs already sent to a slave
    task_ready_queue = []    # jobs waiting for an idle runner
    runner_list = []         # Runner objects built from heartbeats
    _priority = 0            # rolling counter used to number new jobs
    CONFIG = Config()

    @classmethod
    def _next_priority(cls):
        """Advance the shared priority counter and return its new value.

        The counter wraps back to 0 once it exceeds 1000.
        """
        TaskManager._priority += 1
        if TaskManager._priority > 1000:
            TaskManager._priority = 0
        return TaskManager._priority

    @property
    def priority(self):
        """Next priority value (each read advances the shared counter).

        Only usable on an instance; ``TaskManager.priority`` on the class
        is the property object itself, which is why class-level callers
        use ``_next_priority()`` instead.
        """
        return self._next_priority()

    @rpc
    def get_runner_list(self):
        """Return ``[{"name", "status"}, ...]`` for every known runner."""
        return [{"name": r.name, "status": r.status} for r in self.runner_list]

    @rpc
    def get_queue_list(self):
        """Return ``[{"id", "name"}, ...]`` for jobs waiting in the queue."""
        return [{"id": j.id, "name": j.name} for j in TaskManager.task_ready_queue]

    @rpc
    def get_running_list(self):
        """Return ``[{"id", "name"}, ...]`` for jobs already dispatched."""
        return [{"id": j.id, "name": j.name} for j in TaskManager.task_runing_queue]

    @rpc
    def start_task(self, task):
        """Validate ``task`` and append it to the ready queue.

        Args:
            task: dict with the keys ``id``, ``name`` and ``env``.

        Returns:
            ``{"code": 0, "msg": ""}`` on success; ``code`` 1 and an error
            message in ``msg`` when validation or queuing fails.
        """
        result = {
            "code": 0,
            "msg": ""
        }
        try:
            self.check_task_data(task)
            # Use the classmethod: TaskManager.priority on the class would
            # hand the Job a property object instead of a number.
            job = Job(task["id"], task["name"], task["env"], self._next_priority())
            TaskManager.task_ready_queue.append(job)
        except Exception as e:
            result["code"] = 1
            result["msg"] = "Start task failed. ERROR:{}".format(str(e))
        return result

    @rpc
    def stop_task(self, task):
        """Stop a task, whether it is still queued or already running.

        A queued job is removed from the ready queue. Otherwise a stop
        signal is sent asynchronously to the ``runner_listener`` service.

        Args:
            task: dict with at least ``id``; ``name`` is used for logging
                when present.

        Returns:
            ``{"code": 0, "msg": ""}`` on success; ``code`` 1 and an error
            message in ``msg`` on failure.
        """
        result = {
            "code": 0,
            "msg": ""
        }
        try:
            task_id = task["id"]
            stopped = False
            # The context manager guarantees the lock is released even if
            # the scan raises.
            with QUEUE_LOCK:
                for j in TaskManager.task_ready_queue:
                    if j.id == task_id:
                        # Safe: we leave the loop right after mutating.
                        TaskManager.task_ready_queue.remove(j)
                        stopped = True
                        print("Stop task [{}] in queue success.".format(j.name))
                        break
            # Not queued, so assume it is running on a slave.
            if not stopped:
                payload = {
                    "signal": TASK_SIGNAL_STOP,
                    "task": task
                }
                with ServiceRpcProxy("runner_listener", self.CONFIG) as slave:
                    slave.send_signal.call_async(payload)
                # A missing "name" must not turn a sent signal into an error.
                print("Stop task [{}] in running status success.".format(task.get("name", task_id)))
        except Exception as e:
            result["code"] = 1
            result["msg"] = "Stop task failed. ERROR:{}".format(str(e))
        return result

    @rpc
    def register_runner(self, name, env, status, heart_beat):
        """Record a heartbeat from a slave.

        Heartbeats whose timestamp is 10 or more seconds old are dropped.
        A known runner (matched by ``name``) is refreshed; an unknown one
        is added to the runner list.
        """
        if time.time() - heart_beat >= 10:
            return
        for r in self.runner_list:
            if r.name == name:
                r.refresh(status, heart_beat)
                break
        else:
            # No runner with this name yet.
            self.runner_list.append(Runner(name, env, status))
            print("New slave [{}] <{}> registed, total {} runners registed.".format(name, env, len(self.runner_list)))

    @timer(interval=3)
    def check_runner(self):
        '''
        1. check the runners and remove the dead ones
        2. start a new task when there is an idle runner
        '''
        # Walk backwards so popping does not shift the indexes still to visit.
        for i in range(len(self.runner_list) - 1, -1, -1):
            if time.time() - self.runner_list[i].heart_beat >= 10:
                name = self.runner_list[i].name
                self.runner_list.pop(i)
                print("Remove slave [{}], total {} runners registed.".format(name, len(self.runner_list)))
        for r in self.runner_list:
            if r.status != SLAVE_STATUS_IDLE:
                continue
            for j in TaskManager.task_ready_queue:
                if j.env != r.env:
                    continue
                # start run
                result = self.send_task_to_salve(j)
                time.sleep(0.5)  # wait slave update status
                if result["status"]:
                    TaskManager.task_ready_queue.remove(j)
                    TaskManager.task_runing_queue.append(j)
                    # Stop iterating the queue we just mutated; this runner
                    # has its job.
                    break

    @classmethod
    def load_queue_task(cls):
        """Fill the ready queue from the database at start-up.

        NOTE(review): ``getfromdb`` is not defined in the visible source
        (marked "todo" by the author); it must be provided before this
        method can run.
        """
        print("Starting loading queue tasks from database ...")
        queue_tasks = getfromdb()  # todo: get task from db
        for task in queue_tasks:
            print("Load Queue Task:{} - {}".format(task.id, task.name))
            TaskManager.task_ready_queue.append(Job(task.id, task.name, ENV_TEST, cls._next_priority()))
        print("Completed load queue tasks from database.")

    @timer(interval=5)
    def check_running_task(self):
        """Drop jobs whose database status is 'CANCLE' or 'DONE'.

        NOTE(review): ``get_task_info_byid`` is not defined in the visible
        source (marked "todo" by the author).
        """
        # Walk backwards so popping does not shift the indexes still to visit.
        for i in range(len(TaskManager.task_runing_queue) - 1, -1, -1):
            db_task = get_task_info_byid(TaskManager.task_runing_queue[i].id)  # todo get task info
            if db_task.status == 'CANCLE' or db_task.status == 'DONE':
                TaskManager.task_runing_queue.pop(i)
                print("Remove complete task from running list task:{} with {}.".format(db_task.name, db_task.status))

    def send_task_to_salve(self, job):
        """Send ``job`` asynchronously to the slave service for its env.

        Returns:
            ``{"status": bool, "msg": str}``; ``status`` is False only when
            issuing the call raised. The slave's reply is not awaited.
        """
        result = {
            "status": True,
            "msg": "Send task to slave success."
        }
        try:
            print("Send task [{}] to [{}] slave".format(job.name, job.env))
            with ServiceRpcProxy("runner_slave_{}".format(job.env), self.CONFIG) as slave:
                slave.run_task.call_async(job.to_dict())
        except Exception as e:
            result["status"] = False
            result["msg"] = "Send task to slave failed. ERROR:{}".format(str(e))
            traceback.print_exc()
        return result

    def check_task_data(self, task):
        """Validate the ``env`` field of ``task``.

        Raises:
            ValueError: if ``env`` is missing or not a known environment.
        """
        if task.get('env', None) not in (ENV_ONLINE, ENV_PRE, ENV_TEST):
            # Raise a real exception; raising a bare string is a TypeError.
            raise ValueError('Task data value invalid!')
class Runner:
    """A slave known to the master, with its last status and heartbeat.

    ``name`` and ``heart_beat`` are plain attributes, ``status`` is a
    read/write property and ``env`` is read-only.
    """

    def __init__(self, name, env, status):
        # The heartbeat starts at creation time; refresh() overwrites it.
        self.heart_beat = time.time()
        self._env = env
        self._status = status
        self.name = name

    def refresh(self, status, heart_beat):
        """Store the status and timestamp carried by a new heartbeat."""
        self._status = status
        self.heart_beat = heart_beat

    def _read_status(self):
        return self._status

    def _write_status(self, value):
        self._status = value

    # Current slave status (readable and writable).
    status = property(_read_status, _write_status)

    # Environment the runner serves (read-only).
    env = property(lambda self: self._env)
class Job:
    """Immutable description of a queued task.

    Args:
        id: task identifier.
        name: human-readable task name.
        env: execution environment the task must run in.
        priority: ordering key; a lower value sorts first.
    """

    def __init__(self, id, name, env, priority):
        self._id = id
        self._name = name
        self._env = env
        self._priority = priority

    def __lt__(self, other):
        """Order jobs by priority.

        Returns ``NotImplemented`` for non-Job operands so that Python
        raises the usual ``TypeError`` instead of an ``AttributeError``.
        """
        if not isinstance(other, Job):
            return NotImplemented
        return self.priority < other.priority

    def __repr__(self):
        return "Job(id={!r}, name={!r}, env={!r}, priority={!r})".format(
            self._id, self._name, self._env, self._priority)

    @property
    def priority(self):
        """Ordering key given at construction (read-only)."""
        return self._priority

    @property
    def name(self):
        """Task name (read-only)."""
        return self._name

    @property
    def id(self):
        """Task identifier (read-only)."""
        return self._id

    @property
    def env(self):
        """Execution environment (read-only)."""
        return self._env

    def to_dict(self):
        """Return the payload sent to a slave: ``id``, ``name``, ``env``."""
        return {
            "id": self.id,
            "name": self.name,
            "env": self.env
        }
# _*_ coding:utf-8 _*_
import datetime
import json
import logging
import os
import socket
import time
import traceback
import uuid
from nameko.dependency_providers import Config
from nameko.events import BROADCAST, EventDispatcher, event_handler
from nameko.rpc import rpc
from nameko.standalone.rpc import ServiceRpcProxy
from nameko.timer import timer
import pts.ptsconfig as PTSConfig
from pts.pts_master import (ENV_ONLINE, ENV_PRE, ENV_TEST, SLAVE_STATUS_IDLE,
SLAVE_STATUS_RUNNING, TASK_SIGNAL_STOP)
'''
Task slave
author: Max.Bai
date: 2019-04
1. start slave with different environment
2. handle task signal
3. heartbeat every 5s
'''
# Module-level setup, executed once when the slave module is imported.
logging.getLogger('LocustExecutor').setLevel(logging.INFO)

# Environment this slave serves: 'slave_env' from the loaded config,
# defaulting to ENV_TEST, compared in upper case.
SLAVE_ENV = PTSConfig.get_config().get('slave_env', ENV_TEST).upper()
if SLAVE_ENV not in (ENV_ONLINE, ENV_PRE, ENV_TEST):
    print('Env value error!')
    # Raise SystemExit directly: the bare exit() helper is injected by the
    # site module for interactive use and is not guaranteed to exist.
    raise SystemExit(-1)

# Identifier reported in heartbeats: env + host name + first 8 chars of a uuid1.
SLAVE_NAME = "runner_slave_{}_{}_{}".format(SLAVE_ENV, socket.gethostname(), str(uuid.uuid1())[:8])
class TestTask():
    # Mutable record of the task this slave process is handling.
    # id: set by TaskRunnerSlave.run_task, compared by the stop-signal handler
    id = 0
    name = ""
    # signal: set to TASK_SIGNAL_STOP by TaskRunnerListener.handle_stop_task
    signal = ""
    # status: mirrored from TaskRunnerSlave and reported in heartbeats
    status = SLAVE_STATUS_IDLE


# Single module-level instance shared by TaskRunnerSlave and TaskRunnerListener.
slave_task = TestTask()
class TaskRunnerSlave(object):
    """Slave service that executes one task at a time.

    The service name is ``runner_slave_<ENV>``. Progress is mirrored into
    the module-level ``slave_task`` so the listener service can report it
    in heartbeats and deliver stop signals.
    """

    name = "runner_slave_{}".format(SLAVE_ENV)
    status = SLAVE_STATUS_IDLE
    CONFIG = Config()

    @rpc
    def run_task(self, task: dict):
        """Run ``task`` (a dict with ``id`` and ``name``) to completion.

        Marks the slave as running, sends an immediate heartbeat, executes
        the task, and always resets the slave to idle afterwards. Task
        errors are printed, not raised.
        """
        print("run task...", task)
        self.status = SLAVE_STATUS_RUNNING
        slave_task.id = task["id"]
        slave_task.name = task.get("name", "")
        slave_task.signal = ""  # forget any stop signal left from a previous task
        slave_task.status = self.status
        self.task_duration_timeout = False
        self.cancel_task = False
        # Report the running status right away instead of waiting for the
        # listener's periodic heartbeat.
        self.heart_beat()
        try:
            self.do_task(task)
        except Exception as e:
            traceback.print_exc()
            print("Running task failed with ERROR:{}".format(str(e)))
        finally:
            self.reset_slave_status()

    def reset_slave_status(self):
        """Return the slave (and the shared ``slave_task``) to idle."""
        self.status = SLAVE_STATUS_IDLE
        self.task_duration_timeout = False
        self.cancel_task = False
        slave_task.status = self.status

    def do_task(self, task):
        """Placeholder task body: poll once a second until 20s pass or a stop arrives."""
        # ``task`` is a dict (see run_task), so use key access; attribute
        # access (task.name) raised AttributeError on every run.
        task_name = task["name"]
        print("start task...", task_name)
        duration = 20
        start_time = time.time()
        while not self.check_running(start_time, duration):
            time.sleep(1)
        if self.cancel_task:
            # save cancle task
            print("cancel task:", task_name)
        else:
            # save complete task
            print("task done:", task_name)
        print("task running complete.", task_name, "cost:", time.time() - start_time)

    def check_running(self, start_time, duration):
        """Return True when the task should stop (timed out or cancelled).

        Updates ``task_duration_timeout`` and ``cancel_task`` as a side effect.
        """
        if (time.time() - start_time) > duration:
            self.task_duration_timeout = True
        if slave_task.signal == TASK_SIGNAL_STOP:
            self.cancel_task = True
        return self.task_duration_timeout or self.cancel_task

    def heart_beat(self):
        """Send this slave's name, env, status and timestamp to the master (async)."""
        with ServiceRpcProxy("task_master", self.CONFIG) as master:
            master.register_runner.call_async(SLAVE_NAME, SLAVE_ENV, self.status, time.time())
class TaskRunnerListener(object):
    """Slave-side listener that handles signals (such as stop) and sends heartbeats."""

    name = "runner_listener"
    dispatch = EventDispatcher()
    CONFIG = Config()

    # Start-up banner, printed once when the class body is evaluated.
    print(" SLAVE ID: {}".format(SLAVE_NAME))
    print(" SLAVE ENV:{: >10}".format(SLAVE_ENV))

    @rpc
    def send_signal(self, payload):
        """Re-publish a stop request as a ``stop_task`` event; other signals are ignored."""
        if payload["signal"] != TASK_SIGNAL_STOP:
            return
        self.dispatch("stop_task", payload)

    @event_handler("runner_listener", "stop_task", handler_type=BROADCAST, reliable_delivery=False)
    def handle_stop_task(self, payload):
        """Flag the local task for cancellation when the event names its id."""
        target_id = payload["task"]["id"]
        if slave_task.id != target_id:
            return
        slave_task.signal = TASK_SIGNAL_STOP
        print("stopping task ...:", payload)

    @timer(interval=5)
    def heart_beat(self):
        """Every 5 seconds, report this slave's name, env and status to the master."""
        with ServiceRpcProxy("task_master", self.CONFIG) as master:
            now = time.time()
            master.register_runner.call_async(SLAVE_NAME, SLAVE_ENV, slave_task.status, now)
#!/usr/bin/env python3
# _*_ coding:utf-8 _*_
import json
import click
import eventlet
from nameko.runners import ServiceRunner
import pts.ptsconfig as PTSConfig
eventlet.monkey_patch()
@click.command()
@click.option('--master', 'master', flag_value='master', help="start master")
@click.option('--slave', 'master', flag_value='slave', help="start slave")
@click.option("-c", "--config_file", "config_file", type=click.Path(), required=True, help="config file")
def main(master, config_file):
    """Start pts master or slave
    """
    # NOTE: the docstring above is shown by click as the --help text, so it
    # is kept as written.
    #
    # master:      'master' or 'slave' depending on which flag was given,
    #              None when neither flag is present.
    # config_file: path to the JSON config handed to read_config().
    #
    # The banner uses raw strings: its backslashes are literal characters,
    # and sequences such as "\ " or "\_" are invalid escapes in a normal
    # string literal.
    print(" "*70)
    print(r" ___ _____ ___ ")
    print(r" | _ \ /__ \ / __\ _____ ")
    print(r" | (_) | / /\/ | (__ /__ \ ___ __ _ _ __ ___ ")
    print(r" | __/ / / \__ \ / /\/ / _ \ / _` || '_ ` _ \ ")
    print(r" | | / / __) | / / | __/| (_| || | | | | |")
    print(r" |_| |/ \___/ \/ \___| \__,_||_| |_| |_| @v1.0")
    print(" "*70)
    if not config_file:
        # Only reachable with an empty path, because the option is required.
        # Raising gives a non-zero exit status; a return value from a click
        # command is ignored.
        raise click.UsageError("config file need!")
    config = read_config(config_file)
    valied_list = ['master', 'slave']
    mode = master.lower() if master else None
    if mode == 'master':
        start_master(config)
    elif mode == 'slave':
        start_slave(config)
    else:
        # Neither --master nor --slave was given.
        raise click.UsageError(
            'Only {} argument accepted!\r\nTry "Python HetPTS_runner.py --help" for help.'.format(','.join(valied_list)))
def start_master(config_items):
    """Print the master banner and config, then run the TaskManager service.

    Blocks in ``runner.wait()`` until the service runner stops.
    """
    rule = "-"*28
    for banner_line in (rule, "| Task Master |", rule):
        print(banner_line)
    print_config(config_items)

    # Imported here, after the config has been printed, as in the original flow.
    from pts.pts_master import TaskManager

    amqp_uri = config_items.get('AMQP_URI')
    service_runner = ServiceRunner(config={
        "AMQP_URI": amqp_uri,
        "max_workers": 10,
        "parent_calls_tracked": 10,
    })
    service_runner.add_service(TaskManager)
    # Queued tasks are loaded before the service starts.
    TaskManager.load_queue_task()
    service_runner.start()
    service_runner.wait()
def start_slave(config_items):
    """Print the slave banner and config, then run both slave services.

    Blocks in ``runner.wait()`` until the service runner stops.
    """
    rule = "-"*28
    for banner_line in (rule, "| Task Slave |", rule):
        print(banner_line)
    print_config(config_items)

    # Imported here because pts.pts_slave reads the loaded config at import time.
    from pts.pts_slave import TaskRunnerListener, TaskRunnerSlave

    amqp_uri = config_items.get('AMQP_URI')
    service_runner = ServiceRunner(config={
        "AMQP_URI": amqp_uri,
        "max_workers": 1,
        "parent_calls_tracked": 10,
    })
    for service_cls in (TaskRunnerSlave, TaskRunnerListener):
        service_runner.add_service(service_cls)
    service_runner.start()
    service_runner.wait()
def print_config(config_items):
    """Print every config entry as ``[key       ]:value``, key left-aligned to 10 columns."""
    for option, value in config_items.items():
        print("[{: <10}]:{}".format(option, value))
def read_config(config_file):
    """Load the JSON config file, validate it and publish it via PTSConfig.

    Args:
        config_file: path to a UTF-8 JSON file whose top level is an object
            containing AMQP_URI, db_host, db_port, db_name, db_user and db_pwd.

    Returns:
        The parsed config dict (also stored with ``PTSConfig.set_config``).

    Raises:
        Exception: with the prefix 'Read config file ERROR! ' when the file
            cannot be read or parsed, or a required key is missing. The
            underlying error is attached as ``__cause__``.
    """
    required_keys = ("AMQP_URI", "db_host", "db_port", "db_name", "db_user", "db_pwd")
    try:
        with open(config_file, 'r', encoding='utf-8') as f:
            config = json.load(f)
        # Valid JSON that is not an object (e.g. a list) is a config error
        # too; report it with the same hint rather than an AttributeError.
        if not isinstance(config, dict) or any(key not in config for key in required_keys):
            raise Exception('Please check the config file with config_demo.json!')
        PTSConfig.set_config(config)
        return config
    except Exception as e:
        raise Exception('Read config file ERROR! {}'.format(str(e))) from e
# Script entry point: click parses the command line and calls main().
if __name__ == "__main__":
    main()
#!/usr/bin/env python3
# _*_ coding:utf-8 _*_
class PTSConfig():
    """Process-wide holder for the loaded configuration."""

    # The config object passed to set_config(); None until then.
    settings = None


def set_config(config):
    """Store ``config`` as the process-wide settings."""
    setattr(PTSConfig, "settings", config)


def get_config():
    """Return the stored settings (None if set_config was never called)."""
    current = PTSConfig.settings
    return current
# start master
python3 runner --master -c ./config.json
# start slave
python3 runner --slave -c ./config.json