hadoop滚动重启脚本

背景
在使用开源版本hadoop集群时,有配置参数修改时,需要人工修改并重启节点让配置生效,因此这里写了个脚本实现自动滚动重启。

脚本:
注意:不同版本的hadoop命令参数和安装路径可能不同,使用前请根据实际版本调整脚本中的命令与路径。

#!/usr/bin/env python
# -*-coding:utf-8 -*-
import logging
import subprocess
import sys
import time

"""
 滚动重启DataNode服务
 通过如下命令关闭现有服务,然后通过远程命令启动当前机器服务
 hdfs dfsadmin -shutdownDatanode  upgrade
 hdfs dfsadmin -getDatanodeInfo 

"""

"""
初始化日志
"""
logger = logging.getLogger("Rolling Restart Hadoop Datanode")
logger.setLevel(logging.INFO)
stdout = logging.StreamHandler()
stdout.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
stdout.setFormatter(formatter)
logger.addHandler(stdout)


class RollingStartException(Exception):
    """Raised when any step of the rolling restart fails."""
    pass


class CheckDataNodeDeadStateException(Exception):
    """Raised when a DataNode expected to be stopped is still answering."""
    pass


def runShellCommand(cmd):
    """
    Run a shell command and return its exit code and stdout.

    :param cmd: shell command string (executed with shell=True)
    :return: (exit_code, stdout) when the command exits 0
    :raises RollingStartException: if the command exits non-zero
    """
    process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() waits for the child to finish while draining both
    # pipes.  The original poll()/sleep busy-loop ran *before* reading the
    # pipes, so a command producing more output than the pipe buffer would
    # block forever (classic subprocess deadlock).
    out, err = process.communicate()
    code = process.returncode
    if code != 0:
        raise RollingStartException("run shell command failed,msg:{msg}".format(msg=err))
    return code, out


def shutDownDatanode(host):
    """
    Ask the DataNode on *host* (IPC port 8010) to shut down via dfsadmin.

    :param host: DataNode hostname
    :return: None
    """
    logger.info("start shutdown datanode,current info: {host}".format(host=host))
    shutdownCmd = "su hdfs -c 'hdfs dfsadmin -shutdownDatanode {host}:8010'".format(host=host)
    code, out = runShellCommand(shutdownCmd)
    logger.info("send shutDownDatanode msg successful,msg:{msg}".format(msg=out))


def chekDatanodeState(host, targetState="active"):
    """
    Query the DataNode IPC endpoint (port 8010) and check its state.

    :param host: DataNode hostname
    :param targetState: "active" to expect a running DataNode, "dead" to
        expect a stopped one
    :return: (code, message) from the underlying shell command, or
        (0, "datanode is dead") once the node stops answering
    :raises CheckDataNodeDeadStateException: targetState is "dead" but the
        DataNode is still answering
    :raises RollingStartException: the command failed for any other reason
    """
    try:
        execCommand = "su hdfs -c 'hdfs dfsadmin -getDatanodeInfo {host}:8010'".format(host=host)
        result = runShellCommand(execCommand)
        # Command succeeded => the DataNode answered, i.e. it is still up.
        # NOTE: the original returned *before* this check, making the
        # "dead" branch unreachable and the retry loop in
        # checkDataNodeIsDead ineffective.
        if targetState == "dead":
            logger.warning("datanode already running ,wait shutdown...")
            raise CheckDataNodeDeadStateException("datanode already running...")
        return result
    except RollingStartException as e:
        # "Datanode unreachable" while waiting for "dead" is the success case.
        if targetState == "dead" and "getDatanodeInfo: Datanode unreachable" in str(e):
            logger.info("current datanode state became dead,dninfo: {host}".format(host=host))
            return 0, "datanode is dead"
        else:
            raise RollingStartException(e.args)
    # todo: probing the TCP port directly might detect state changes faster


def checkDataNodeIsDead(host, retries=5):
    """
    Poll until the DataNode on *host* reports dead, retrying up to
    *retries* times with a 1 second pause between attempts.

    :param host: DataNode hostname
    :param retries: maximum number of state checks before giving up
        (default 5, matching the original hard-coded behavior)
    :raises RollingStartException: the DataNode is still alive after all
        retries are exhausted
    """
    logger.info("wait datanode state to dead...")
    retrycount = retries
    while retrycount > 0:
        try:
            chekDatanodeState(host, targetState="dead")
            return
        except CheckDataNodeDeadStateException as e:
            logger.warning("check datanode dead state with retry 5 times,current count {count}".format(count=retrycount))
            retrycount -= 1
            time.sleep(1)
            # Still not dead after the final attempt: abort the restart.
            if retrycount <= 0:
                raise RollingStartException(e.args)


def startDatanode(host):
    """
    Start the DataNode service on *host* via ssh, then wait until it
    answers getDatanodeInfo again.

    :param host: DataNode hostname
    :return: None
    """
    logger.info("start datanode.......")
    # todo: switch to the matching version's install path during an upgrade
    daemonScript = "/opt/app/hadoop/sbin/hadoop-daemon.sh"
    remoteCmd = "ssh {host} -C 'su hdfs -c \"{command} start datanode\"'".format(
        host=host, command=daemonScript)
    logger.info(remoteCmd)
    runShellCommand(remoteCmd)
    # Verify the freshly started service is reachable before moving on.
    logger.info("wait datanode state to active...")
    code, out = chekDatanodeState(host)
    logger.info("restart datanode {host} successful,msg:{msg}".format(host=host, msg=out))


def rollingRestart():
    """
    Read the DataNode host list from ./dnhost (one hostname per line) and,
    for each host in turn: shut the DataNode down, wait for it to die,
    start it again, then give it time to re-register with the NameNode.

    Exits the process with status 255 on the first failure so a partially
    restarted cluster is noticed immediately.
    """
    # with-statement closes the host list even on error; the original
    # leaked the file handle.
    with open("./dnhost", "r") as dnlist:
        for line in dnlist:
            try:
                host = line.replace("\n", "")
                shutDownDatanode(host)
                checkDataNodeIsDead(host)
                # wait for the DataNode to fully stop and its pid file to be
                # cleaned up before restarting
                time.sleep(5)
                startDatanode(host)
                # give the DataNode time to finish reporting its blocks
                time.sleep(30)
            except RollingStartException as e:
                logger.error(e.args)
                # abort the whole rolling restart on the first failure
                sys.exit(255)


# Script entry point: kick off the rolling restart when run directly.
if __name__ == '__main__':
    rollingRestart()

你可能感兴趣的:(hadoop,大数据,分布式)