k8s集群的高可靠性测试的自动化实现

HA测试【服务/网络/电源/磁盘】4个维度的命令操作

服务异常

# 获取该机器上运行的所有pod
kubectl get pod --all-namespaces -o wide | grep `hostname`
# 根据ns和pod_name去找到pid,然后杀掉服务
kubectl describe pods --namespace=$NAMESPACE $POD_NAME | grep "Container ID" | cut -d \/ -f 3 | xargs docker inspect -f "{{.State.Pid}}" | xargs kill -9

网络异常

# 网卡的下线和上线
ifdown eth0
ifup eth0

机器异常

# 重启,关机
sudo reboot
/usr/sbin/shutdown -H now

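# 通过ipmitool操作BMC控制卡(安装: yum install ipmitool);$BMC_IP/$BMC_USER/$BMC_PASSWORD 替换为实际的BMC地址、用户名、密码
ipmitool -I lanplus -H $BMC_IP -U $BMC_USER -P $BMC_PASSWORD chassis power off
ipmitool -I lanplus -H $BMC_IP -U $BMC_USER -P $BMC_PASSWORD chassis power on
ipmitool -I lanplus -H $BMC_IP -U $BMC_USER -P $BMC_PASSWORD chassis power reset
ipmitool -I lanplus -H $BMC_IP -U $BMC_USER -P $BMC_PASSWORD chassis power status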

磁盘异常

# 查看磁盘盘符,编号,状态
/opt/MegaRAID/MegaCli/MegaCli64 -LdPdInfo -aALL | grep -E "Virtual Drive|Enclosure Device ID|Slot Number|Firmware state"

# 磁盘下线
/opt/MegaRAID/MegaCli/MegaCli64 -PDOffline -PhysDrv[$ENCLOSURE_ID:$SLOT_ID] -aALL

# 磁盘上线
/opt/MegaRAID/MegaCli/MegaCli64 -PDOnline -PhysDrv[$ENCLOSURE_ID:$SLOT_ID] -aALL

# 磁盘raid重建的进度查询
/opt/MegaRAID/MegaCli/MegaCli64 -pdrbld -showprog -physdrv[$ENCLOSURE_ID:$SLOT_ID] -aALL

# 磁盘上线后修复
umount /dev/sdx /xx
xfs_repair -L /dev/sdx
mount /dev/sdx /xx

# MegaCli64的安装
rpm -ivh /opt/storcli-1.16.06-1.noarch.rpm
rpm -ivh /opt/Lib_Utils-1.00-09.noarch.rpm
rpm -ivh /opt/MegaCli-8.02.21-1.noarch.rpm

python代码实现

# -*- coding: utf-8 -*-


import functools
import os
import time
import paramiko


class SSH:
    """Small wrapper around a paramiko SSH client for running remote commands.

    The connection parameters are stored on construction; the connection
    itself is only opened by :meth:`conn`.  The stdin/stdout/stderr streams
    of the most recently executed command are kept on the instance.
    """

    # Timeout in seconds handed to exec_command for every command.
    _CMD_TIMEOUT = 60 * 10

    def __init__(self, ip, user, password):
        """Remember the target host and credentials; no connection is made.

        :param ip: address of the remote host
        :param user: login user name
        :param password: login password
        """
        self.ip = ip
        self.user = user
        self.password = password
        self.ssh = None      # paramiko client, or None while not connected
        self.stdin = None    # streams of the last executed command
        self.stderr = None
        self.stdout = None

    def conn(self):
        """Open the SSH connection.

        On success ``self.ssh`` holds the connected client; on any connection
        error ``self.ssh`` is left as ``None`` (callers test for that).
        """
        client = paramiko.SSHClient()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        try:
            client.connect(self.ip, username=self.user, password=self.password)
        except Exception:
            # Still best-effort, but no longer swallows KeyboardInterrupt /
            # SystemExit, and the half-initialised client is released.
            client.close()
            self.ssh = None
        else:
            self.ssh = client

    def _exec(self, cmd):
        """Execute ``cmd`` on the open connection and remember its streams.

        :return: the stdout stream of the command
        """
        stdin, stdout, stderr = self.ssh.exec_command(cmd, timeout=self._CMD_TIMEOUT)
        self.stdin = stdin
        self.stderr = stderr
        self.stdout = stdout
        return stdout

    def run(self, cmd):
        """Start ``cmd`` remotely and return its stdout stream without reading it.

        :param cmd: shell command line to execute
        :return: the stdout stream, or an error message string when there is
                 no open connection
        """
        if self.ssh:
            return self._exec(cmd)
        # NOTE(review): this message embeds the password; kept unchanged for
        # backward compatibility.
        return '%s@%s:%s ssh connect failed!' % (self.ip, self.user, self.password)

    def run_cmd(self, cmd):
        """Run ``cmd`` remotely and return its stdout as a list of lines.

        Newer variant of :meth:`run`: each list element is one output line
        with its trailing newline removed.

        :param cmd: shell command line to execute
        :return: list of output lines; ``[]`` when there is no open connection
        """
        if not self.ssh:
            return []
        # rstrip('\n') instead of [:-1]: a final line without a trailing
        # newline must not lose its last character.
        return [line.rstrip('\n') for line in self._exec(cmd).readlines()]

    def close(self):
        """Close the connection if one is open."""
        if self.ssh:
            self.ssh.close()


def single_ssh_operation(f):
    """
    Decorator for a one-shot remote SSH operation.

    The wrapped function receives the ssh object as its first argument.  The
    wrapper opens the connection before the call and closes it afterwards,
    also when the wrapped function raises.

    :param f: function to decorate; called as ``f(ssh, *args, **kwargs)``
    :return: the wrapper; it returns ``None`` without calling ``f`` when the
             connection could not be opened, otherwise whatever ``f`` returns
    """
    @functools.wraps(f)
    def operation(ssh, *args, **kwargs):
        ssh.conn()
        if ssh.ssh is None:
            return None
        try:
            return f(ssh, *args, **kwargs)
        finally:
            # Release the connection even if f raised.
            ssh.close()
    return operation


def _chassis_power(action, expected, bmc_ip, user, password,
                   springboard=None, log=None):
    """
    Run ``ipmitool ... chassis power ACTION`` and check ipmitool's reply.

    :param action: chassis power sub-command ('off', 'on' or 'reset')
    :param expected: the single stdout line that signals success
    :param bmc_ip: address of the BMC
    :param user: BMC user name
    :param password: BMC password
    :param springboard: ``None`` to run ipmitool on the local machine, or a
                        comma-separated 'ip,user,password' string naming a
                        jump host on which ipmitool is run over SSH
    :param log: optional logger; the command line is written with
                ``log.critical`` after it has been run
    :return: True when the command's stdout is exactly ``[expected]``
    """
    import subprocess  # local import: not part of the module's import header

    argv = ['ipmitool', '-I', 'lanplus', '-H', str(bmc_ip), '-U', str(user),
            '-P', str(password), 'chassis', 'power', action]
    cmd = ' '.join(argv)

    if springboard is None:
        # Capture stdout: the previous os.system() call only yielded the exit
        # status, which could never equal the expected list of output lines.
        try:
            proc = subprocess.Popen(argv, stdout=subprocess.PIPE,
                                    universal_newlines=True)
            output = proc.communicate()[0]
        except OSError:
            # ipmitool is not installed / not executable on this machine.
            output = ''
        reply = output.splitlines()
    else:
        ssh = SSH(*springboard.split(','))
        ssh.conn()
        if ssh.ssh is None:
            return False
        try:
            reply = ssh.run_cmd(cmd)
        finally:
            ssh.close()

    if log:
        # NOTE(review): the logged command line contains the BMC password.
        log.critical(cmd)
    return reply == [expected]


def power_off(bmc_ip, user='ADMIN', password='ADMIN',
              springboard=None):
    """
    Power a machine off through its BMC.

    :param bmc_ip: address of the BMC
    :param user: BMC user name
    :param password: BMC password
    :param springboard: optional 'ip,user,password' jump host to run ipmitool on
    :return: True when ipmitool reports 'Chassis Power Control: Down/Off'
    """
    return _chassis_power('off', 'Chassis Power Control: Down/Off',
                          bmc_ip, user, password, springboard=springboard)


def power_on(bmc_ip, user='ADMIN', password='ADMIN',
             springboard=None):
    """
    Power a machine on through its BMC.

    :param bmc_ip: address of the BMC
    :param user: BMC user name
    :param password: BMC password
    :param springboard: optional 'ip,user,password' jump host to run ipmitool on
    :return: True when ipmitool reports 'Chassis Power Control: Up/On'
    """
    return _chassis_power('on', 'Chassis Power Control: Up/On',
                          bmc_ip, user, password, springboard=springboard)


def power_reset(bmc_ip, user='ADMIN', password='ADMIN',
                springboard=None, log=None):
    """
    Reset (power-cycle) a machine through its BMC.

    :param bmc_ip: address of the BMC
    :param user: BMC user name
    :param password: BMC password
    :param springboard: optional 'ip,user,password' jump host to run ipmitool on
    :param log: optional logger; the executed command is logged via ``critical``
    :return: True when ipmitool reports 'Chassis Power Control: Reset'
    """
    return _chassis_power('reset', 'Chassis Power Control: Reset',
                          bmc_ip, user, password,
                          springboard=springboard, log=log)


def _parse_megacli_drives(lines):
    """Group filtered ``MegaCli64 -LdPdInfo`` lines per virtual drive.

    Every 'Virtual Drive' line starts a new entry keyed 'sda', 'sdb', ...
    (letter derived from the virtual drive number).  The lines that follow are
    consumed positionally in groups of three, taken to be enclosure id, slot
    number and firmware state (the order of the grep pattern - assumes
    MegaCli prints them in that order per physical drive).

    :param lines: output lines already filtered to the four relevant kinds
    :return: dict mapping drive name to a list of
             ``[enclosure_id, slot_number, firmware_state]`` string triples
    """
    disk_dict = {}
    drives = None
    for elem in lines:
        if 'Virtual Drive' in elem:
            disk_key = 'sd' + chr(97 + int(elem.split()[2]))
            drives = disk_dict[disk_key] = []
            continue
        if drives is None:
            # Physical-drive line before any 'Virtual Drive' header: there is
            # no entry to attach it to (previously a KeyError on key None).
            continue
        if not drives or len(drives[-1]) >= 3:
            # First field of a new physical drive.
            drives.append([elem.split()[-1]])
        elif len(drives[-1]) == 1:
            drives[-1].append(elem.split()[-1])
        else:
            # Third field: the state is the text after the colon up to the
            # first comma ("Online, Spun Up" -> "Online").  Taking the third
            # token from the end broke on single-word states such as
            # "Firmware state: Offline".
            drives[-1].append(elem.split(':', 1)[-1].split(',')[0].strip())
    return disk_dict


@single_ssh_operation
def get_disk_info(ssh):
    """
    Collect per-drive RAID information from the remote machine via MegaCli64.

    If /opt contains no 'MegaRAID' entry, MegaCli is first installed from
    RPMs copied onto the machine.

    :param ssh: SSH connection object
    :return: ``{'sda': [[enclosure_id, slot_number, firmware_state], ...], ...}``
             with all values as strings
    """
    if 'MegaRAID' not in ssh.run_cmd('ls /opt'):
        # NOTE(review): the install step embeds a hard-coded password and a
        # fixed source host in the command line; left unchanged here.
        ssh.run_cmd('yum install -y sshpass')
        ssh.run_cmd('sshpass -p Changeme_123 scp -o StrictHostKeyChecking=no '
                    '[email protected]:/platform/storage/material/Viper/Business/ha/* /opt'
                    ' && rpm -ivh /opt/storcli-1.16.06-1.noarch.rpm'
                    ' && rpm -ivh /opt/Lib_Utils-1.00-09.noarch.rpm'
                    ' && rpm -ivh /opt/MegaCli-8.02.21-1.noarch.rpm'
                    ' && rm -rf /opt/*.rpm')
    disk_info = ssh.run_cmd('/opt/MegaRAID/MegaCli/MegaCli64 -LdPdInfo -aALL | '
                            'grep -E "Virtual Drive|Enclosure Device ID|Slot Number|Firmware state"')
    return _parse_megacli_drives(disk_info)


@single_ssh_operation
def disk_off(ssh, enclosure_id, slot_id):
    """
    Take a physical drive offline with ``MegaCli64 -PDOffline``.

    :param ssh: SSH connection object
    :param enclosure_id: enclosure device id of the drive
    :param slot_id: slot number of the drive
    :return: None
    """
    physical_drive = '[%s:%s]' % (enclosure_id, slot_id)
    offline_cmd = ('/opt/MegaRAID/MegaCli/MegaCli64 -PDOffline -PhysDrv'
                   + physical_drive + ' -aALL')
    ssh.run_cmd(offline_cmd)


@single_ssh_operation
def disk_on(ssh, enclosure_id, slot_id, raid=True, disk_num=None, log=None):
    """
    Bring a physical drive back online.

    With ``raid=True`` only ``MegaCli64 -PDOnline`` is issued.  Otherwise the
    drive is brought online and every mounted partition whose device name
    contains ``disk_num`` is unmounted; partitions not mounted below
    /var/lib/ are then repaired with ``xfs_repair -L`` and mounted again.

    :param ssh: SSH connection object
    :param enclosure_id: enclosure device id of the drive
    :param slot_id: slot number of the drive
    :param raid: whether the drive is part of a RAID
    :param disk_num: drive name to match in the ``df`` device column; must be
                     given when ``raid`` is False
    :param log: optional logger; each executed command is logged via ``critical``
    :return: None
    """
    online_cmd = ('/opt/MegaRAID/MegaCli/MegaCli64 -PDOnline -PhysDrv[%s:%s] -aALL'
                  % (enclosure_id, slot_id))

    if raid:
        ssh.run_cmd(online_cmd)
        if log:
            log.critical(online_cmd)
        return

    def run_step(cmd, pause):
        # ssh.run() hands back the stream without reading it, so the pause is
        # presumably what gives the remote command time to finish.
        ssh.run(cmd)
        if log:
            log.critical(cmd)
        time.sleep(pause)

    # Collect the mount information before touching the drive.
    disk_mount = []
    disk_umount = []
    disk_num_list = []
    for line in ssh.run_cmd('df -h | grep sd'):
        # split() never yields empty fields, so the result is an indexable
        # list on both Python 2 and 3 (a filter() object is not on Python 3).
        fields = line.split()
        if not fields or disk_num not in fields[0]:
            continue
        device, mount_point = fields[0], fields[-1]
        disk_umount.append('umount %s %s' % (device, mount_point))
        if not str(mount_point).startswith('/var/lib/'):
            disk_num_list.append(device)
            disk_mount.append('mount %s %s' % (device, mount_point))

    # Bring the drive online.
    run_step(online_cmd, 10)

    # Unmount the affected partitions.
    for cmd in disk_umount:
        run_step(cmd, 5)

    # Repair the file systems.
    for device in disk_num_list:
        run_step('xfs_repair -L %s' % device, 10)

    # Mount the partitions again.
    for cmd in disk_mount:
        run_step(cmd, 5)
    time.sleep(60)


@single_ssh_operation
def network_disconnect_long(ssh, eth='eth100', lang=60*30):
    """
    Simulate a long network outage: take the interface down, wait, bring it up.

    The commands are written to ``tmp.sh`` on the remote machine, which is
    made executable, run and then removed.

    :param ssh: SSH connection object
    :param eth: network interface, default eth100
    :param lang: outage duration in seconds, default half an hour
    :return: True
    """
    # '>' rather than '>>': a tmp.sh left behind by an earlier failed run must
    # be overwritten, not extended, or its old commands would run again.
    # NOTE(review): ssh.run() does not read the command's output before the
    # connection is closed again - confirm the remote script keeps running.
    ssh.run('echo "ifdown %s;sleep %s;ifup %s" > tmp.sh'
            ' && chmod +x tmp.sh'
            ' && ./tmp.sh'
            ' && rm -f tmp.sh'
            % (eth, lang, eth))
    return True


@single_ssh_operation
def network_disconnect_interval(ssh, eth='eth100', i_num=5, c_time=3, i_time=5):
    """
    Simulate intermittent network drops.

    By default the interface goes down for 3 seconds, comes back for 5
    seconds, and this is repeated 5 times.  The commands are written to
    ``tmp.sh`` on the remote machine, which is made executable, run and then
    removed.

    :param ssh: SSH connection object
    :param eth: network interface, default eth100
    :param i_num: number of drops
    :param c_time: duration of each drop in seconds
    :param i_time: pause between drops in seconds
    :return: True
    """
    one_cycle = 'ifdown %s;sleep %s;ifup %s;sleep %s;' % (eth, c_time, eth, i_time)
    # '>' rather than '>>': a tmp.sh left behind by an earlier failed run must
    # be overwritten, not extended, or its old commands would run again.
    ssh.run('echo "%s" > tmp.sh'
            ' && chmod +x tmp.sh'
            ' && ./tmp.sh'
            ' && rm -f tmp.sh'
            % (one_cycle * i_num))
    return True


@single_ssh_operation
def get_host_name(ssh):
    """
    Return the host name of the remote machine.

    :param ssh: SSH connection object
    :return: first output line of ``hostname``, or None when the command
             produced no output or failed
    """
    try:
        return ssh.run_cmd('hostname')[0]
    except Exception:
        # Best effort as before, but KeyboardInterrupt / SystemExit are no
        # longer swallowed.
        return None


@single_ssh_operation
def get_all_pod_info(ssh, hostname=None):
    """
    List the pods of all namespaces, optionally only those on one host.

    :param ssh: SSH connection object
    :param hostname: when given, only output lines containing it are kept
                     (via ``grep``); when None the full ``kubectl`` output,
                     header line included, is returned
    :return: list of rows, each a list of the whitespace-separated columns of
             ``kubectl get pod --all-namespaces -o wide`` (namespace, pod
             name, ready count, status, restarts, age, ip, node)
    """
    cmd = 'kubectl get pod --all-namespaces -o wide'
    if hostname is not None:
        cmd += ' | grep %s' % hostname
    # split() drops empty fields itself and returns real lists on Python 2
    # and 3 alike (filter() returns an iterator on Python 3).
    return [line.split() for line in ssh.run_cmd(cmd)]


@single_ssh_operation
def get_pod_not_running(ssh, _filter=None):
    """
    List the pods that are not in Running state.

    :param ssh: SSH connection object
    :param _filter: optional iterable of substrings; pods whose name contains
                    any of them are left out of the result
    :return: list of rows, each a list of the whitespace-separated columns of
             ``kubectl get pod --all-namespaces -o wide`` (namespace, pod
             name, ready count, status, restarts, age, ip, node)
    """
    lines = ssh.run_cmd('kubectl get pod --all-namespaces -o wide | grep -v Running| grep -v STATUS')
    excluded = [] if _filter is None else _filter
    # split() yields indexable lists on Python 2 and 3 alike; indexing a
    # filter() object fails on Python 3.
    rows = [line.split() for line in lines]
    return [row for row in rows
            if not (len(row) > 1 and any(item in row[1] for item in excluded))]


@single_ssh_operation
def get_single_pod_status(ssh, pod_name, namespace):
    """
    Return the status of one pod.

    :param ssh: SSH connection object
    :param pod_name: name of the pod
    :param namespace: namespace the pod lives in
    :return: third column of the second output line of ``kubectl get pod``,
             or 'ERROR' when that line or column is missing
    """
    pod_info = ssh.run_cmd('kubectl get pod %s -n %s' % (pod_name, namespace))
    try:
        # Line 0 is skipped (the column header); line 1 describes the pod.
        # split() gives an indexable list - indexing a filter() object raised
        # on Python 3, which the bare except turned into a constant 'ERROR'.
        return pod_info[1].split()[2]
    except IndexError:
        return 'ERROR'


@single_ssh_operation
def kill_pod(ssh, pid):
    """
    Kill the main process of each given container.

    For every entry, ``docker inspect`` resolves the container's process id,
    which is then sent ``kill -9``.

    :param ssh: SSH connection object
    :param pid: iterable of identifiers handed to ``docker inspect``
    :return: None
    """
    kill_template = 'docker inspect -f "{{.State.Pid}}" %s | xargs kill -9'
    for container in pid:
        kill_cmd = kill_template % container
        ssh.run_cmd(kill_cmd)


@single_ssh_operation
def get_pod_pid(ssh, pod_name, namespace='default'):
    """
    Look up the container ids of a pod.

    Takes the "Container ID" lines of ``kubectl describe pods`` and keeps the
    third '/'-separated field of each (presumably the bare id after a
    ``docker://`` prefix - confirm against real output).

    :param ssh: SSH connection object
    :param pod_name: name of the pod
    :param namespace: namespace the pod lives in
    :return: list with one extracted field per "Container ID" line
    """
    # '\\/' produces the same runtime text as the former '\/' without relying
    # on an invalid string escape sequence.
    describe_cmd = ('kubectl describe pods --namespace=%s %s | grep "Container ID" | cut -d \\/ -f 3 '
                    % (namespace, pod_name))
    return ssh.run_cmd(describe_cmd)

@single_ssh_operation
def shutdown(ssh):
    """
    Halt the machine under test immediately.

    :param ssh: SSH connection object
    :return: True
    """
    halt_cmd = '/usr/sbin/shutdown -H now'
    ssh.run_cmd(halt_cmd)
    return True

@single_ssh_operation
def reboot(ssh):
    """
    Reboot the machine under test.

    :param ssh: SSH connection object
    :return: True
    """
    reboot_cmd = 'sudo reboot'
    ssh.run_cmd(reboot_cmd)
    return True


if __name__ == '__main__':
    # Manual run: bring the drive at enclosure 252, slot 6 back online on a
    # fixed test host.
    target = SSH(ip='172.20.2.161', user='root', password='V1p3r1@#$%')
    disk_on(target, enclosure_id=252, slot_id=6)

你可能感兴趣的:(自动化测试)