对于批量处理服务器的任务, 通常的解决方案是在服务器内网部署一台中转机: 中转机通过一些途径获得所有单服的信息, 再使用ssh、rsync等指令和具体服务器交互完成工作。
简略代码如下:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import time
import sys
def SwitchOption(ipInfo, option, server, ext_info):
    """Simulate running one operation against one server.

    Prints a "begin" line, sleeps one second in place of the real work
    (elided in the article), then checks that the task finished.

    Args:
        ipInfo: Address of the target server; only used in the log line.
        option: Name of the operation, e.g. 'update' or 'restart'.
        server: Label for the kind of server, e.g. 'group'.
        ext_info: Extra data for the operation; unused by this simulation.

    Raises:
        AssertionError: If the task did not finish. This cannot happen in
            the stub as written because ``taskFinish`` is hard-coded True.
    """
    # A parenthesised single argument prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('begin %s %s server(%s) ' % (option, server, ipInfo))
    taskFinish = True
    # ....... (real work elided)
    time.sleep(1)
    # Raise explicitly: a bare ``assert`` is stripped when running with -O.
    if not taskFinish:
        raise AssertionError(
            '%s %s server(%s) did not finish' % (option, server, ipInfo))
class UpdateHandler(object):
    """Handler that runs one operation on one server in the calling thread."""

    def Option(self, ipInfo, option, name, ext_info):
        """Forward the operation to SwitchOption.

        Args:
            ipInfo: Address of the target server.
            option: Name of the operation to perform.
            name: Label for the kind of server, passed through as
                SwitchOption's ``server`` argument.
            ext_info: Extra data passed through unchanged.
        """
        SwitchOption(ipInfo, option, name, ext_info)
def GetGroupList():
    """Return the hard-coded addresses of the servers to operate on.

    Returns:
        A tuple of five IP address strings.
    """
    return (
        '10.0.128.1',
        '10.0.128.2',
        '10.0.128.3',
        '10.0.128.4',
        '10.0.128.5',
    )
def option_group_list(op, ext_info):
    """Apply ``op`` to every server from GetGroupList, one after another.

    Args:
        op: Name of the operation, e.g. 'update' or 'restart'.
        ext_info: Extra data handed to every per-server call.
    """
    groups = GetGroupList()
    updatehandler = UpdateHandler()
    # Strictly sequential: each server is handled only after the previous
    # call has returned.
    for ipTup in groups:
        updatehandler.Option(ipTup, op, 'group', ext_info)
# Driver: update every server first, then restart every server.
for _op in ('update', 'restart'):
    option_group_list(_op, '')
上述代码模拟更新并重启了5个服务器, 假设每步操作需要1分钟, 那么总耗时就是10分钟。
而且由于各服务器是串行处理的, 每增加一台服务器, 总耗时就会再增加2分钟, 即耗时随服务器数量线性增长。
考虑到更新/重启多个服务器不是计算密集型任务,
且每次操作都属于独立的子任务,并不存在相互影响相互依赖关系, 可以使用不需要同步的多线程完成具体的子任务。
当然有几个细节值得注意:
1. 子线程抛出异常/中断不会打断主线程, 需要用try/except记录各个子线程任务的完成情况
2. print语句打印log并不是多线程安全的(多个线程的输出可能相互交错), 可以用sys.stdout.write代替
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import threading
import time
import sys
def SwitchOption(ipInfo, option, server, ext_info):
    """Simulate running one operation against one server.

    Writes a "begin" line to stdout, sleeps one second in place of the
    real work (elided in the article), then checks that the task finished.

    Args:
        ipInfo: Address of the target server; only used in the log line.
        option: Name of the operation, e.g. 'update' or 'restart'.
        server: Label for the kind of server, e.g. 'group'.
        ext_info: Extra data for the operation; unused by this simulation.

    Raises:
        AssertionError: If the task did not finish. This cannot happen in
            the stub as written because ``taskFinish`` is hard-coded True.
    """
    # One write call carries the whole line, newline included.
    sys.stdout.write('begin %s %s server(%s) \n' % (option, server, ipInfo))
    taskFinish = True
    # ....... (real work elided)
    time.sleep(1)
    # Raise explicitly: a bare ``assert`` is stripped when running with -O,
    # which would hide failures from the worker threads' error handling.
    if not taskFinish:
        raise AssertionError(
            '%s %s server(%s) did not finish' % (option, server, ipInfo))
class threadUpdateHandler(threading.Thread):
    """Worker thread that runs one SwitchOption call and records the outcome.

    ``result`` is 0 until the call fails and 1 afterwards, so the parent
    thread can inspect it after ``join()``.
    """

    def __init__(self):
        super(threadUpdateHandler, self).__init__()
        self.result = 0

    def parepare_args(self, name, ipInfo, option, ext_info):
        """Store the arguments that run() hands to SwitchOption.

        Must be called before start(). Note that ``name`` is stored in
        ``self.name``, which is also the thread's name attribute.
        """
        self.name = name
        self.ipInfo = ipInfo
        self.option = option
        self.ext_info = ext_info

    def run(self):
        """Execute the operation; on any Exception log it and set result."""
        try:
            SwitchOption(self.ipInfo, self.option, self.name, self.ext_info)
        # Catch every exception: an error in a child thread would not
        # interrupt the main thread, so it is recorded here instead.
        except Exception as e:
            sys.stdout.write("!!! error exception !!!!: %s server[%s] option %s \n"%(e, (self.ipInfo), self.option) )
            self.result = 1

    def get_result(self):
        """Return 0 if no failure was recorded, 1 otherwise."""
        return self.result
def GetGroupList():
    """Return the hard-coded addresses of the servers to operate on.

    Returns:
        A tuple of ten IP address strings: the five base addresses,
        repeated twice to simulate ten servers.
    """
    base = (
        '10.0.128.1',
        '10.0.128.2',
        '10.0.128.3',
        '10.0.128.4',
        '10.0.128.5',
    )
    return base * 2
def option_group_list(op, ext_info):
    """Apply ``op`` to every server from GetGroupList.

    'update', 'start', 'stop' and 'restart' run with one worker thread per
    server; any other operation runs sequentially in the calling thread.

    Args:
        op: Name of the operation.
        ext_info: Extra data handed to every per-server call.

    Raises:
        AssertionError: If at least one worker thread reported a failure.
    """
    groups = GetGroupList()

    if op not in ('update', 'start', 'stop', 'restart'):
        # Sequential fallback. The original referenced an undefined
        # ``updateHandler`` here; calling SwitchOption directly keeps this
        # block self-contained within the script.
        for ipTup in groups:
            SwitchOption(ipTup, op, 'group', ext_info)
        return

    # Start and stop the server list with one thread per server.
    threadList = []
    for ipTup in groups:
        t = threadUpdateHandler()
        t.parepare_args('group', ipTup, op, ext_info)
        t.start()
        threadList.append(t)

    # The main thread blocks until every child thread has returned.
    for thread in threadList:
        thread.join()

    # Check how each child thread's task went; threadList is in the same
    # order as groups, so the two can be paired up directly.
    failed = [ip for ip, thread in zip(groups, threadList)
              if thread.get_result()]
    for ip in failed:
        sys.stdout.write("%s server[%s] failed\n" % (op, ip))
    # Raise explicitly: a bare ``assert`` is stripped when running with -O.
    if failed:
        raise AssertionError(
            '%s failed on %d of %d servers' % (op, len(failed), len(groups)))
# Driver: update every server first, then restart every server.
for _op in ('update', 'restart'):
    option_group_list(_op, '')
上述多线程代码模拟更新并重启了10个服务器. 如果每步操作1分钟,则总共用时两分钟。由此可见多线程大大减少了批量处理服务器所花费的时间。
但上段代码还有一个很严重的问题, 即没有对并发执行的子线程进行数量上的限制。设想如下情况: 有100个服务器需要更新, 每个子线程需要启动一个相关进程(如rsync), 那么中转机整体的压力就会变得很大, 网络带宽也会被占满。
因此我们需要一个线程队列来控制并发执行的线程数量, 保证无论机器数量如何扩展, 都不会对中转机的性能造成影响。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import Queue
import threading
import time
import sys
def SwitchOption(ipInfo, option, server, ext_info):
    """Simulate running one operation against one server.

    Writes a "begin" line to stdout, sleeps one second in place of the
    real work (elided in the article), then checks that the task finished.

    Args:
        ipInfo: Address of the target server; only used in the log line.
        option: Name of the operation, e.g. 'update' or 'restart'.
        server: Label for the kind of server, e.g. 'group'.
        ext_info: Extra data for the operation; unused by this simulation.

    Raises:
        AssertionError: If the task did not finish. This cannot happen in
            the stub as written because ``taskFinish`` is hard-coded True.
    """
    # One write call carries the whole line, newline included.
    sys.stdout.write('begin %s %s server(%s) \n' % (option, server, ipInfo))
    taskFinish = True
    # ....... (real work elided)
    time.sleep(1)
    # Raise explicitly: a bare ``assert`` is stripped when running with -O,
    # which would hide failures from the worker threads' error handling.
    if not taskFinish:
        raise AssertionError(
            '%s %s server(%s) did not finish' % (option, server, ipInfo))
class ThreadPool(object):
    """Cap the number of worker threads that may run at the same time.

    A queue is pre-filled with ``max_num`` unstarted threadUpdateHandler
    objects. Taking one with get_thread() uses up a slot and blocks while
    the queue is empty; add_thread() puts a fresh handler back, which frees
    a slot again.
    """

    def __init__(self, max_num=15):
        """Create the pool.

        Args:
            max_num: Maximum number of handlers handed out at once.

        Raises:
            ValueError: If ``max_num`` is less than 1. Such a pool would
                start empty, so get_thread() would block forever.
        """
        if max_num < 1:
            raise ValueError('max_num must be at least 1, got %r' % (max_num,))
        # A thread-safe queue stands in for a thread pool.
        self.queue = Queue.Queue(max_num)
        for _ in range(max_num):
            self.queue.put(threadUpdateHandler())

    def get_thread(self):
        """Take one unstarted handler, blocking while none is available."""
        return self.queue.get()

    def add_thread(self):
        """Put a fresh unstarted handler into the pool, freeing one slot."""
        self.queue.put(threadUpdateHandler())
class threadUpdateHandler(threading.Thread):
    """Worker thread that runs one SwitchOption call and records the outcome.

    ``result`` is 0 until the call fails and 1 afterwards. If a pool was
    supplied, it is notified once the work is over so that it can hand out
    another handler.
    """

    def __init__(self):
        super(threadUpdateHandler, self).__init__()
        self.result = 0
        # Default to "no pool" so the attribute always exists.
        self.pool = None

    def parepare_args(self, name, ipInfo, option, ext_info, pool = None):
        """Store the arguments that run() hands to SwitchOption.

        Must be called before start(). Note that ``name`` is stored in
        ``self.name``, which is also the thread's name attribute.

        Args:
            name: Label for the kind of server.
            ipInfo: Address of the target server.
            option: Name of the operation to perform.
            ext_info: Extra data passed through unchanged.
            pool: Optional object with an ``add_thread()`` method that is
                called when this thread finishes.
        """
        self.name = name
        self.ipInfo = ipInfo
        self.option = option
        self.ext_info = ext_info
        self.pool = pool

    def run(self):
        """Execute the operation; on any Exception log it and set result."""
        try:
            SwitchOption(self.ipInfo, self.option, self.name, self.ext_info)
        except Exception as e:
            sys.stdout.write("!!! error exception !!!!: %s server[%s] option %s \n"%(e, (self.ipInfo), self.option) )
            self.result = 1
        finally:
            # The thread is done: notify the pool. Doing this in ``finally``
            # returns the slot even when something that is not an Exception
            # subclass escapes; otherwise the main thread could wait on the
            # pool forever.
            if self.pool is not None:
                self.pool.add_thread()

    def get_result(self):
        """Return 0 if no failure was recorded, 1 otherwise."""
        return self.result
def GetGroupList():
    """Return the hard-coded addresses of the servers to operate on.

    Returns:
        A tuple of one hundred IP address strings: the five base
        addresses, repeated twenty times to simulate a large fleet.
    """
    base = (
        '10.0.128.1',
        '10.0.128.2',
        '10.0.128.3',
        '10.0.128.4',
        '10.0.128.5',
    )
    return base * 20
def option_group_list(op, ext_info, max_update_threads=10):
    """Apply ``op`` to every server from GetGroupList.

    'update' runs on worker threads drawn from a ThreadPool, so at most
    ``max_update_threads`` run at once. 'start', 'stop' and 'restart' use
    one unthrottled thread per server. Any other operation runs
    sequentially in the calling thread.

    Args:
        op: Name of the operation.
        ext_info: Extra data handed to every per-server call.
        max_update_threads: Concurrency cap for 'update'. Defaults to 10,
            the value that was previously hard-coded.

    Raises:
        AssertionError: If at least one worker thread reported a failure.
    """
    groups = GetGroupList()

    if op not in ('update', 'start', 'stop', 'restart'):
        # Sequential fallback. The original referenced an undefined
        # ``updateHandler`` and a ``CarryNameAddrOption`` method that does
        # not exist in this script; calling SwitchOption directly keeps
        # this block self-contained.
        for ipTup in groups:
            SwitchOption(ipTup, op, 'group', ext_info)
        return

    # Only 'update' is throttled; the article singles out its rsync step
    # as the one that loads the relay machine.
    pool = ThreadPool(max_update_threads) if op == 'update' else None

    threadList = []
    for ipTup in groups:
        # With a pool, get_thread() blocks until a slot is free, which is
        # what limits the number of threads running at once.
        t = pool.get_thread() if pool is not None else threadUpdateHandler()
        t.parepare_args('group', ipTup, op, ext_info, pool)
        t.start()
        threadList.append(t)

    for thread in threadList:
        thread.join()

    # threadList is in the same order as groups, so pair them up directly.
    failed = [ip for ip, thread in zip(groups, threadList)
              if thread.get_result()]
    for ip in failed:
        sys.stdout.write("%s server[%s] failed\n" % (op, ip))
    # Raise explicitly: a bare ``assert`` is stripped when running with -O.
    if failed:
        raise AssertionError(
            '%s failed on %d of %d servers' % (op, len(failed), len(groups)))
# Driver: update every server first, then restart every server.
for _op in ('update', 'restart'):
    option_group_list(_op, '')