HA测试【服务/网络/电源/磁盘】4个维度的命令操作
服务异常
# 获取该机器上运行的所有pod
kubectl get pod --all-namespaces -o wide | grep `hostname`
# 根据ns和pod_name去找到pid,然后杀掉服务
kubectl describe pods --namespace=<namespace> <pod_name> | grep "Container ID" | cut -d \/ -f 3 | xargs docker inspect -f "{{.State.Pid}}" | xargs kill -9
网络异常
# 网卡的下线和上线
ifdown eth0
ifup eth0
机器异常
# 重启,关机
sudo reboot
/usr/sbin/shutdown -H now
# ipmitool去进行控制卡操作,yum install ipmitool
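ipmitool -I lanplus -H <bmc_ip> -U <user> -P <password> chassis power off
ipmitool -I lanplus -H <bmc_ip> -U <user> -P <password> chassis power on
ipmitool -I lanplus -H <bmc_ip> -U <user> -P <password> chassis power reset
ipmitool -I lanplus -H <bmc_ip> -U <user> -P <password> chassis power status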
磁盘异常
# 查看磁盘盘符,编号,状态
/opt/MegaRAID/MegaCli/MegaCli64 -LdPdInfo -aALL | grep -E "Virtual Drive|Enclosure Device ID|Slot Number|Firmware state"
# 磁盘下线
/opt/MegaRAID/MegaCli/MegaCli64 -PDOffline -PhysDrv[<enclosure_id>:<slot_id>] -aALL
# 磁盘上线
/opt/MegaRAID/MegaCli/MegaCli64 -PDOnline -PhysDrv[<enclosure_id>:<slot_id>] -aALL
# 磁盘raid重建的进度查询
/opt/MegaRAID/MegaCli/MegaCli64 -pdrbld -showprog -physdrv[<enclosure_id>:<slot_id>] -aALL
# 磁盘上线后修复
umount /dev/sdx /xx
xfs_repair -L /dev/sdx
mount /dev/sdx /xx
# MegaCli64的安装
rpm -ivh /opt/storcli-1.16.06-1.noarch.rpm
rpm -ivh /opt/Lib_Utils-1.00-09.noarch.rpm
rpm -ivh /opt/MegaCli-8.02.21-1.noarch.rpm
python代码实现
import functools
import os
import subprocess
import time

import paramiko
class SSH:
    """Small wrapper around ``paramiko.SSHClient`` for running remote commands.

    The connection is opened lazily by :meth:`conn`; ``self.ssh`` is ``None``
    whenever no usable connection exists, and callers test that attribute to
    find out whether the connection attempt succeeded.
    """

    # Per-command timeout (seconds) handed to paramiko's exec_command.
    _CMD_TIMEOUT = 60 * 10

    def __init__(self, ip, user, password):
        """Store the login details; no network activity happens here.

        :param ip: address of the remote host
        :param user: login user name
        :param password: login password
        """
        self.ip = ip
        self.user = user
        self.password = password
        # paramiko client, or None when not connected / connect failed.
        self.ssh = None
        # Streams of the most recently executed command.
        self.stdin = None
        self.stderr = None
        self.stdout = None

    def conn(self):
        """Open the SSH connection.

        On success ``self.ssh`` holds the connected client. On any connection
        error ``self.ssh`` is set to ``None`` instead of raising, so callers
        must check ``self.ssh`` afterwards.
        """
        client = paramiko.SSHClient()
        # NOTE: unknown host keys are accepted automatically.
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        try:
            client.connect(self.ip, username=self.user, password=self.password)
        except Exception:
            # Best-effort connect: report failure through self.ssh, but do not
            # swallow KeyboardInterrupt/SystemExit, and release the client.
            client.close()
            self.ssh = None
        else:
            self.ssh = client

    def _exec(self, cmd):
        """Start ``cmd`` on the remote host, remember its streams, return stdout."""
        stdin, stdout, stderr = self.ssh.exec_command(cmd, timeout=self._CMD_TIMEOUT)
        self.stdin = stdin
        self.stderr = stderr
        self.stdout = stdout
        return stdout

    def run(self, cmd):
        """Start ``cmd`` remotely without reading its output.

        :param cmd: shell command line to execute
        :return: the command's stdout stream, or a failure message string when
                 there is no connection (the password is deliberately left
                 out of that message)
        """
        if self.ssh:
            return self._exec(cmd)
        return '%s@%s ssh connect failed!' % (self.user, self.ip)

    def run_cmd(self, cmd):
        """Run ``cmd`` remotely and return its stdout as a list of lines.

        :param cmd: shell command line to execute
        :return: stdout lines with the trailing newline removed; an empty list
                 when there is no connection
        """
        if not self.ssh:
            return []
        stdout = self._exec(cmd)
        # rstrip instead of [:-1]: a final line without a newline must not
        # lose its last character.
        return [line.rstrip('\n') for line in stdout.readlines()]

    def close(self):
        """Close the connection if one was opened."""
        if self.ssh:
            self.ssh.close()
def single_ssh_operation(f):
    """Decorator: run ``f`` inside a one-shot SSH session.

    The wrapper calls ``ssh.conn()`` on its first argument, invokes ``f`` and
    then always calls ``ssh.close()`` - also when ``f`` raises, so a failing
    operation no longer leaks the connection.

    :param f: function whose first positional parameter is the ssh object
    :return: wrapped function; it returns ``None`` without calling ``f`` when
             the connection could not be established (``ssh.ssh is None``),
             otherwise whatever ``f`` returns
    """
    @functools.wraps(f)
    def operation(ssh, *args, **kwargs):
        ssh.conn()
        if ssh.ssh is None:
            return None
        try:
            return f(ssh, *args, **kwargs)
        finally:
            ssh.close()
    return operation
def _chassis_power(action, expected, bmc_ip, user, password, springboard, log=None):
    """Issue ``ipmitool ... chassis power <action>`` and check the reply.

    :param action: ipmitool chassis power sub-command (``off``/``on``/``reset``)
    :param expected: the single stdout line that signals success
    :param bmc_ip: address of the BMC
    :param user: BMC user name
    :param password: BMC password
    :param springboard: ``None`` to run ipmitool on the local machine, or an
                        ``'ip,user,password'`` string naming a jump host on
                        which ipmitool is run over SSH
    :param log: optional logger; when given, the command line is logged with
                ``log.critical`` after it has been executed
    :return: ``True`` if the command output is exactly ``[expected]``
    """
    cmd = ('ipmitool -I lanplus -H %s -U %s -P %s chassis power %s' %
           (bmc_ip, user, password, action))

    if springboard is None:
        # The output has to be captured: os.system() only returns the exit
        # status, so comparing its result with the expected line never matched.
        # An argument list (no shell) also keeps special characters in the
        # password intact.
        argv = ['ipmitool', '-I', 'lanplus', '-H', str(bmc_ip), '-U', str(user),
                '-P', str(password), 'chassis', 'power', action]
        try:
            raw = subprocess.check_output(argv)
        except (OSError, subprocess.CalledProcessError):
            # ipmitool missing or exited non-zero: report failure.
            reply = None
        else:
            reply = raw.decode('utf-8', 'replace').splitlines()
    else:
        ssh = SSH(*springboard.split(','))
        ssh.conn()
        if ssh.ssh is None:
            return False
        try:
            reply = ssh.run_cmd(cmd)
        finally:
            # Always release the jump-host connection.
            ssh.close()

    if log:
        # NOTE(review): this logs the BMC password in clear text (unchanged
        # from the previous behaviour) - consider masking it.
        log.critical(cmd)
    return reply == [expected]


def power_off(bmc_ip, user='ADMIN', password='ADMIN',
              springboard=None):
    """Power a machine off through its BMC.

    :param bmc_ip: address of the BMC
    :param user: BMC user name
    :param password: BMC password
    :param springboard: optional ``'ip,user,password'`` jump host; when
                        ``None`` ipmitool is run locally
    :return: ``True`` if ipmitool answered ``Chassis Power Control: Down/Off``,
             otherwise ``False``
    """
    return _chassis_power('off', 'Chassis Power Control: Down/Off',
                          bmc_ip, user, password, springboard)


def power_on(bmc_ip, user='ADMIN', password='ADMIN',
             springboard=None):
    """Power a machine on through its BMC.

    :param bmc_ip: address of the BMC
    :param user: BMC user name
    :param password: BMC password
    :param springboard: optional ``'ip,user,password'`` jump host; when
                        ``None`` ipmitool is run locally
    :return: ``True`` if ipmitool answered ``Chassis Power Control: Up/On``,
             otherwise ``False``
    """
    return _chassis_power('on', 'Chassis Power Control: Up/On',
                          bmc_ip, user, password, springboard)


def power_reset(bmc_ip, user='ADMIN', password='ADMIN',
                springboard=None, log=None):
    """Hard-reset a machine through its BMC.

    :param bmc_ip: address of the BMC
    :param user: BMC user name
    :param password: BMC password
    :param springboard: optional ``'ip,user,password'`` jump host; when
                        ``None`` ipmitool is run locally
    :param log: optional logger; the executed command line is written to it
                with ``log.critical``
    :return: ``True`` if ipmitool answered ``Chassis Power Control: Reset``,
             otherwise ``False``
    """
    return _chassis_power('reset', 'Chassis Power Control: Reset',
                          bmc_ip, user, password, springboard, log)
@single_ssh_operation
def get_disk_info(ssh):
    """Collect the physical-drive layout reported by MegaCli.

    If ``/opt`` has no ``MegaRAID`` entry, the MegaCli packages are first
    copied from a file server and installed.

    :param ssh: ssh connection object
    :return: dict mapping a drive letter name to its physical drives, e.g.
             ``{'sda': [[enclosure_id, slot_number, firmware_state], ...]}``;
             all values are strings
    """
    if 'MegaRAID' not in ssh.run_cmd('ls /opt'):
        # NOTE(review): hard-coded credentials and server address; the address
        # below looks redacted/garbled in this copy - confirm before use.
        ssh.run_cmd('yum install -y sshpass')
        ssh.run_cmd('sshpass -p Changeme_123 scp -o StrictHostKeyChecking=no '
                    '[email protected]:/platform/storage/material/Viper/Business/ha/* /opt'
                    ' && rpm -ivh /opt/storcli-1.16.06-1.noarch.rpm'
                    ' && rpm -ivh /opt/Lib_Utils-1.00-09.noarch.rpm'
                    ' && rpm -ivh /opt/MegaCli-8.02.21-1.noarch.rpm'
                    ' && rm -rf /opt/*.rpm')
    disk_info = ssh.run_cmd('/opt/MegaRAID/MegaCli/MegaCli64 -LdPdInfo -aALL | '
                            'grep -E "Virtual Drive|Enclosure Device ID|Slot Number|Firmware state"')
    disk_dict = {}
    disk_key = None
    for elem in disk_info:
        fields = elem.split()
        if 'Virtual Drive' in elem:
            # Virtual drive index N is mapped to 'sd' + N-th letter.
            # NOTE(review): assumes virtual drive order matches sdX naming.
            disk_key = 'sd' + chr(97 + int(fields[2]))
            disk_dict[disk_key] = []
            continue
        if disk_key is None or not fields:
            # Detail line before any "Virtual Drive" header (or a blank line):
            # nothing to attach it to, so skip it instead of raising KeyError.
            continue
        drives = disk_dict[disk_key]
        # Lines arrive in the grep order enclosure -> slot -> firmware state;
        # the length of the last record tells which one this is.
        if drives and len(drives[-1]) == 1:
            drives[-1].append(fields[-1])
        elif drives and len(drives[-1]) == 2:
            # Take the text between ':' and the first ',' so that both
            # "Online, Spun Up" and a bare "Online" yield "Online".
            drives[-1].append(elem.split(':', 1)[-1].split(',')[0].strip())
        else:
            drives.append([fields[-1]])
    return disk_dict
@single_ssh_operation
def disk_off(ssh, enclosure_id, slot_id):
    """Take one physical drive offline with MegaCli.

    :param ssh: ssh connection object
    :param enclosure_id: enclosure device id of the drive
    :param slot_id: slot number of the drive
    :return: None
    """
    offline_cmd = ('/opt/MegaRAID/MegaCli/MegaCli64 -PDOffline -PhysDrv[%s:%s] -aALL'
                   % (enclosure_id, slot_id))
    ssh.run_cmd(offline_cmd)
@single_ssh_operation
def disk_on(ssh, enclosure_id, slot_id, raid=True, disk_num=None, log=None):
    """Bring a physical drive back online and, for non-RAID disks, repair it.

    With ``raid=True`` only the MegaCli online command is issued. With
    ``raid=False`` the mounts of ``disk_num`` are recorded first, the drive is
    set online, then the file systems are unmounted, repaired with
    ``xfs_repair -L`` and mounted again.

    :param ssh: ssh connection object
    :param enclosure_id: enclosure device id of the drive
    :param slot_id: slot number of the drive
    :param raid: whether the drive is part of a RAID set
    :param disk_num: device name fragment (e.g. ``'sdb'``) used to select the
                     ``df`` entries; required when ``raid`` is false
    :param log: optional logger; every executed command is written to it with
                ``log.critical``
    :return: None
    :raises ValueError: if ``raid`` is false and ``disk_num`` is not given
    """
    online_cmd = ('/opt/MegaRAID/MegaCli/MegaCli64 -PDOnline -PhysDrv[%s:%s] -aALL'
                  % (enclosure_id, slot_id))
    if raid:
        ssh.run_cmd(online_cmd)
        if log:
            log.critical(online_cmd)
    else:
        if disk_num is None:
            # Previously this surfaced as an opaque TypeError from the
            # `disk_num in ...` test below.
            raise ValueError('disk_num is required when raid is False')
        disk_mount = []
        disk_umount = []
        disk_num_list = []
        # The mount table is only needed on this path.
        for line in ssh.run_cmd('df -h | grep sd'):
            # split() already drops empty fields; the result is a real list,
            # so indexing works on Python 3 as well.
            fields = line.split()
            if not fields:
                continue
            device, mount_point = fields[0], fields[-1]
            if disk_num in device:
                disk_umount.append('umount %s %s' % (device, mount_point))
                # NOTE(review): original indentation was lost; mounts under
                # /var/lib/ are assumed to be unmounted only, not repaired or
                # remounted here - confirm against the intended behaviour.
                if not str(mount_point).startswith('/var/lib/'):
                    disk_num_list.append(device)
                    disk_mount.append('mount %s %s' % (device, mount_point))
        # ssh.run() does not read the command output, hence the fixed sleeps
        # after each step.
        ssh.run(online_cmd)
        if log:
            log.critical(online_cmd)
        time.sleep(10)
        for elem in disk_umount:
            ssh.run(elem)
            if log:
                log.critical(elem)
            time.sleep(5)
        for elem in disk_num_list:
            ssh.run('xfs_repair -L %s' % elem)
            if log:
                log.critical('xfs_repair -L %s' % elem)
            time.sleep(10)
        for elem in disk_mount:
            ssh.run(elem)
            if log:
                log.critical(elem)
            time.sleep(5)
    # NOTE(review): original indentation was lost; this final wait is assumed
    # to apply to both the RAID and the non-RAID path - confirm.
    time.sleep(60)
@single_ssh_operation
def network_disconnect_long(ssh, eth='eth100', lang=60*30):
    """Simulate a long network outage on one interface.

    A small script (``ifdown``, ``sleep``, ``ifup``) is written to ``tmp.sh``
    on the remote host, executed and removed afterwards.

    :param ssh: ssh connection object
    :param eth: interface name, ``eth100`` by default
    :param lang: outage duration in seconds, 30 minutes by default
    :return: always ``True``
    """
    # '>' instead of '>>': tmp.sh is only deleted when the script succeeds, so
    # appending would re-run the contents left behind by a failed earlier run.
    ssh.run('echo "ifdown %s;sleep %s;ifup %s" > tmp.sh'
            ' && chmod +x tmp.sh'
            ' && ./tmp.sh'
            ' && rm -f tmp.sh'
            % (eth, lang, eth))
    return True
@single_ssh_operation
def network_disconnect_interval(ssh, eth='eth100', i_num=5, c_time=3, i_time=5):
    """Simulate intermittent network drops on one interface.

    By default the interface goes down for 3 seconds, comes back for
    5 seconds, and this is repeated 5 times. The sequence is written to
    ``tmp.sh`` on the remote host, executed and removed afterwards.

    :param ssh: ssh connection object
    :param eth: interface name, ``eth100`` by default
    :param i_num: number of drops
    :param c_time: duration of each drop in seconds
    :param i_time: pause between drops in seconds
    :return: always ``True``
    """
    cycle = 'ifdown %s;sleep %s;ifup %s;sleep %s;' % (eth, c_time, eth, i_time)
    # '>' instead of '>>': tmp.sh is only deleted when the script succeeds, so
    # appending would re-run the contents left behind by a failed earlier run.
    ssh.run('echo "%s" > tmp.sh'
            ' && chmod +x tmp.sh'
            ' && ./tmp.sh'
            ' && rm -f tmp.sh'
            % (cycle * i_num))
    return True
@single_ssh_operation
def get_host_name(ssh):
    """Return the remote machine's host name.

    :param ssh: ssh connection object
    :return: first line printed by ``hostname``, or ``None`` if the command
             produced no output or failed
    """
    try:
        return ssh.run_cmd('hostname')[0]
    except Exception:
        # Best-effort lookup; unlike a bare ``except`` this lets
        # KeyboardInterrupt and SystemExit propagate.
        return None
@single_ssh_operation
def get_all_pod_info(ssh, hostname=None):
    """List the pods known to kubectl, optionally only those matching a host.

    :param ssh: ssh connection object
    :param hostname: when given, the kubectl output is filtered with
                     ``grep <hostname>``; otherwise every line (including the
                     header line) is returned
    :return: list of rows, each row being the list of non-empty
             space-separated columns of one ``kubectl get pod
             --all-namespaces -o wide`` output line
    """
    if hostname is None:
        pod_info = ssh.run_cmd('kubectl get pod --all-namespaces -o wide')
    else:
        pod_info = ssh.run_cmd('kubectl get pod --all-namespaces -o wide | grep %s' % hostname)
    # Build real lists: on Python 3 filter() returns a lazy iterator, which
    # callers could neither index nor iterate twice.
    return [[col for col in line.split(' ') if col] for line in pod_info]
@single_ssh_operation
def get_pod_not_running(ssh, _filter=None):
    """List the pods whose kubectl line does not contain ``Running``.

    :param ssh: ssh connection object
    :param _filter: optional iterable of substrings; a pod is dropped from the
                    result when any of them occurs in its second column (the
                    pod name)
    :return: list of rows, each row being the list of non-empty
             space-separated columns of one kubectl output line
    """
    pod_info = ssh.run_cmd('kubectl get pod --all-namespaces -o wide | grep -v Running| grep -v STATUS')
    ignored = [] if _filter is None else _filter
    # Real lists instead of filter(): rows are indexed below, which fails on
    # Python 3 filter iterators.
    rows = [[col for col in line.split(' ') if col] for line in pod_info]
    return [
        row for row in rows
        # Rows too short to carry a pod name are kept rather than raising.
        if not (len(row) > 1 and any(item in row[1] for item in ignored))
    ]
@single_ssh_operation
def get_single_pod_status(ssh, pod_name, namespace):
    """Return the status column of one pod.

    :param ssh: ssh connection object
    :param pod_name: name of the pod
    :param namespace: namespace the pod lives in
    :return: third column of the second ``kubectl get pod`` output line (the
             line after the header), or ``'ERROR'`` when that line or column
             is missing
    """
    pod_info = ssh.run_cmd('kubectl get pod %s -n %s' % (pod_name, namespace))
    try:
        # A list comprehension keeps the columns indexable on Python 3, where
        # indexing a filter() result raised and was silently turned into
        # 'ERROR' by the former bare ``except``.
        return [col for col in pod_info[1].split(' ') if col][2]
    except IndexError:
        return 'ERROR'
@single_ssh_operation
def kill_pod(ssh, pid):
    """Kill the main process of each given container.

    For every entry the process id is looked up with ``docker inspect`` and
    passed to ``kill -9`` on the remote host.

    :param ssh: ssh connection object
    :param pid: iterable of identifiers accepted by ``docker inspect``
    :return: None
    """
    kill_template = 'docker inspect -f "{{.State.Pid}}" %s | xargs kill -9'
    for container in pid:
        ssh.run_cmd(kill_template % container)
@single_ssh_operation
def get_pod_pid(ssh, pod_name, namespace='default'):
    """Return the container ids listed by ``kubectl describe`` for a pod.

    :param ssh: ssh connection object
    :param pod_name: name of the pod
    :param namespace: namespace the pod lives in
    :return: list with the third ``/``-separated field of every
             "Container ID" line of the ``kubectl describe pods`` output
    """
    # Raw string: '\/' in a normal literal is an invalid escape sequence
    # (a warning on recent Python versions); the command text is unchanged.
    return ssh.run_cmd(r'kubectl describe pods --namespace=%s %s | grep "Container ID" | cut -d \/ -f 3 '
                       % (namespace, pod_name))
@single_ssh_operation
def shutdown(ssh):
    """Halt the machine under test immediately.

    :param ssh: ssh connection object
    :return: always ``True``
    """
    halt_cmd = '/usr/sbin/shutdown -H now'
    ssh.run_cmd(halt_cmd)
    return True
@single_ssh_operation
def reboot(ssh):
    """Reboot the machine under test.

    :param ssh: ssh connection object
    :return: always ``True``
    """
    reboot_cmd = 'sudo reboot'
    ssh.run_cmd(reboot_cmd)
    return True
if __name__ == '__main__':
    # Manual smoke run: bring the drive at enclosure 252, slot 6 back online
    # on a fixed test host.
    target = SSH('172.20.2.161', 'root', 'V1p3r1@#$%')
    disk_on(target, 252, 6)