# Process/service monitoring and self-healing (进程服务监测与自愈)

#!/usr/bin/python3
# encoding: utf-8
#filename: service-detection-repair.py
#author: gaohaixiang
#writetime:202403041043

"""
# 定时任务监测示例
*/5 * * * * python3 /data/processlog/service-detection-repair.py systemctlCheck nginx
*/5 * * * * python3 /data/processlog/service-detection-repair.py processCheck /data/nginx/sbin/nginx /data/nginx/sbin/nginx

# 脚本使用示例:
# systemctl is-active nginx ,检测不是 active 以后就 systemctl restart nginx
python3 service-detection-repair.py systemctlCheck nginx

# pgrep -f /data/nginx/sbin/nginx,该路径为nginx启动的绝对路径,用于检测这个nginx是否存活,
# 若是不存在该nginx,则使用 /data/nginx/sbin/nginx 启动 nginx
# 第一个路径为检测nginx是否存活,第二个路径是绝对路径启动nginx
python3 service-detection-repair.py processCheck /data/nginx/sbin/nginx /data/nginx/sbin/nginx
"""

import time
import subprocess
import sys
import os

# Current local date/time as a compact string
def timestamp_time():
    """Return the current local time formatted as ``YYYYmmddHHMMSS``.

    The epoch time is truncated to whole seconds before it is converted to
    local time, so the result looks like ``20240304104300``.
    """
    whole_seconds = int(time.time())
    return time.strftime("%Y%m%d%H%M%S", time.localtime(whole_seconds))

# Section header written ahead of every log entry
def fileWriteLine(getdatetime,filewrite):
    """Write a separator rule followed by the timestamp to the log.

    Args:
        getdatetime: Timestamp string to record under the separator.
        filewrite: Writable text file object that receives the log output.
    """
    separator = "\n-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_\n"
    filewrite.write(separator)
    filewrite.write(getdatetime + "\n")

# Check a service's state with systemctl
def systemctlCheckService(getdatetime, filewrite,checkCommand):
    """Run *checkCommand* and report whether its output is ``active``.

    Args:
        getdatetime: Timestamp string written into the log section header.
        filewrite: Writable text file object that receives the log output.
        checkCommand: Shell command line to execute (the caller in this file
            passes ``systemctl is-active <service>``).

    Returns:
        True only when the command exits with status 0 and its stripped
        standard output equals ``active``; False otherwise.
    """
    fileWriteLine(getdatetime, filewrite)
    filewrite.writelines("systemctlCheckService\n")
    filewrite.writelines(checkCommand+"\n")
    try:
        # Keep the try body to the one call that can raise CalledProcessError.
        raw_output = subprocess.check_output(checkCommand, shell=True)
    except subprocess.CalledProcessError as e:
        # A non-zero exit status is treated as "service not running".
        print(f"错误,服务检测的状态是: {e}")
        filewrite.writelines("------服务关闭------\n")
        return False

    is_active = raw_output.decode('utf-8', errors='replace').strip() == 'active'
    # Log the verdict only after inspecting the output, so the log can never
    # claim the service is healthy while the function returns False.
    if is_active:
        filewrite.writelines("------服务正常------\n")
    else:
        filewrite.writelines("------服务关闭------\n")
    return is_active

# Restart a service with systemctl
def systemctlRestartService(getdatetime, filewrite,restartCommand,serviceName):
    """Execute *restartCommand* through the shell and log the outcome.

    Args:
        getdatetime: Timestamp string written into the log section header.
        filewrite: Writable text file object that receives the log output.
        restartCommand: Shell command line that restarts the service.
        serviceName: Service name, used only in the console message.

    A non-zero exit status is reported to the log and console; it is not
    re-raised.
    """
    fileWriteLine(getdatetime, filewrite)
    filewrite.write("systemctlRestartService\n")
    filewrite.write(restartCommand + "\n")

    try:
        subprocess.check_call(restartCommand, shell=True)
    except subprocess.CalledProcessError as restart_error:
        filewrite.write("------服务重启错误------\n")
        print(f"服务重启错误: {restart_error}")
    else:
        # Reached only when the restart command exited with status 0.
        filewrite.write("------服务已经被重启------\n")
        print(f"服务 {serviceName} 已经被重启.")

# Monitor a systemctl-managed service and restart it when it is not active
def systemctlCheck(getdatetime, filewrite,serviceName):
    """Check *serviceName* with ``systemctl is-active`` and restart it if needed.

    Args:
        getdatetime: Timestamp string written into the log section headers.
        filewrite: Writable text file object that receives the log output.
        serviceName: Name of the systemd unit to check.

    When the check reports the service is not active, the restart command
    ``sudo systemctl restart <service>`` is run.
    """
    # Local import: shlex is not among this file's top-level imports.
    import shlex

    fileWriteLine(getdatetime, filewrite)
    filewrite.writelines("systemctlCheck\n")
    filewrite.writelines(serviceName+"\n")

    # Both commands are executed with shell=True, so the name must be quoted:
    # unit names may contain backslashes or other shell-significant characters,
    # and the restart command runs under sudo. Plain names such as "nginx"
    # are left unchanged by shlex.quote.
    quoted_name = shlex.quote(serviceName)
    # Command that checks the service state
    checkCommand = f'systemctl is-active {quoted_name}'
    # Command that restarts the service
    restartCommand = f'sudo systemctl restart {quoted_name}'

    # Restart only when the check does not report an active service
    if not systemctlCheckService(getdatetime, filewrite,checkCommand):
        filewrite.writelines("------服务停止. 准备进行重启.------\n")
        print(f"服务 {serviceName} 停止. 准备进行重启.")
        systemctlRestartService(getdatetime, filewrite,restartCommand,serviceName)
    else:
        filewrite.writelines("------服务正在运行中.------\n")
        print(f"服务 {serviceName} 正在运行中.")

def processIsRunning(getdatetime, filewrite,servicePath):
    """Report whether any process other than this monitor matches *servicePath*.

    The lookup uses ``pgrep -f``, which matches the pattern against full
    command lines.

    Args:
        getdatetime: Timestamp string written into the log section header.
        filewrite: Writable text file object that receives the log output.
        servicePath: Pattern handed to ``pgrep -f`` (the usage notes in this
            file pass the absolute path of the service executable).

    Returns:
        True when at least one matching PID remains after this script and its
        parent process are excluded; False otherwise, including when pgrep
        exits with a non-zero status.
    """
    fileWriteLine(getdatetime, filewrite)
    filewrite.writelines("processIsRunning\n")
    filewrite.writelines(servicePath+"\n")

    # This script receives the pattern as a command-line argument, so its own
    # command line matches it. The same holds for a wrapper that launched it
    # with that command line (e.g. an `sh -c "..."` or `sudo` parent), so the
    # parent PID is excluded as well to avoid a false "running" result.
    ignored_pids = {str(os.getpid()), str(os.getppid())}

    try:
        # Options come before the pattern and `--` ends option parsing, so a
        # pattern starting with "-" is not mistaken for an option.
        raw_output = subprocess.check_output(
            ['pgrep', '-f', '-d', '\n', '--', servicePath]
        )
    except subprocess.CalledProcessError:
        # pgrep exits with a non-zero status when nothing matches
        print(f"{servicePath} 已经停止")
        filewrite.writelines("------服务已经停止.------\n")
        return False

    # split() with no argument also discards empty fields
    pids = [
        pid for pid in raw_output.decode('utf-8').split()
        if pid not in ignored_pids
    ]

    if pids:
        filewrite.writelines("------服务正在运行.------\n")
        print(f"{servicePath} 正在运行")
        return True

    filewrite.writelines("------服务已经停止.------\n")
    print(f"{servicePath} 已经停止")
    return False

def processStartService(getdatetime,filewrite,startCommand):
    """Run *startCommand* through the shell to start a service, logging the result.

    Args:
        getdatetime: Timestamp string written into the log section header.
        filewrite: Writable text file object that receives the log output.
        startCommand: Shell command line that starts the service.

    A non-zero exit status is reported to the log and console; it is not
    re-raised.
    """
    fileWriteLine(getdatetime, filewrite)
    filewrite.writelines("processStartService\n")
    filewrite.writelines(startCommand+"\n")
    try:
        # The command's output is never used, so discard it instead of
        # capturing it. Capturing goes through a pipe, and a started daemon
        # that keeps the inherited pipe open would make the call wait until
        # that daemon exits.
        subprocess.check_call(startCommand, shell=True, stdout=subprocess.DEVNULL)
    except subprocess.CalledProcessError as e:
        filewrite.writelines("------使用命令进行启动发生错误.------\n")
        print(f"错误,使用这个命令进行启动发生错误 {startCommand}: {e}")
    else:
        filewrite.writelines("------已经开始使用这个命令进行启动.------\n")
        print(f"已经开始使用这个命令进行启动: {startCommand}")

def processCheck(getdatetime, filewrite,serviceMaster):
    """Check every configured process and start the ones that are not running.

    Args:
        getdatetime: Timestamp string written into the log section headers.
        filewrite: Writable text file object that receives the log output.
        serviceMaster: Mapping of a display name to a dict holding
            ``servicePath`` (pattern used to detect the process) and
            ``startCommand`` (command used to start it).
    """
    fileWriteLine(getdatetime, filewrite)
    filewrite.write("processCheck\n")

    for label, spec in serviceMaster.items():
        # Read both settings up front, before any check is performed.
        detect_pattern = spec['servicePath']
        launch_command = spec['startCommand']

        alive = processIsRunning(getdatetime, filewrite, detect_pattern)
        if alive:
            filewrite.write("------服务正在运行.------\n")
            print(f"{label} 正在运行。")
            continue

        # Not running: record it and attempt a start.
        filewrite.write("------服务已关闭。正在尝试启动.------\n")
        print(f"{label} 已关闭。正在尝试启动。")
        processStartService(getdatetime, filewrite, launch_command)

def main(getdatetime,filewrite):
    """Dispatch on the command-line arguments in ``sys.argv``.

    Supported invocations:
        ``systemctlCheck <service>``: exactly one extra argument.
        ``processCheck <servicePath> <startCommand>``: exactly two extra
        arguments.

    Any other argument combination, or no arguments at all, is reported to
    the log and the console.

    Args:
        getdatetime: Timestamp string written into the log section headers.
        filewrite: Writable text file object that receives the log output.
    """
    fileWriteLine(getdatetime, filewrite)
    filewrite.write("main\n")

    argv = sys.argv
    argc = len(argv)

    # No arguments beyond the script name.
    if argc <= 1:
        filewrite.write("------脚本没有携带参数,请携带正确的参数再运行脚本.------\n")
        print("脚本没有携带参数,请携带正确的参数再运行脚本")
        return

    mode = argv[1]

    if mode == "systemctlCheck" and argc == 3:
        systemctlCheck(getdatetime, filewrite, argv[2])
        return

    if mode == "processCheck" and argc == 4:
        serviceMaster = {
            'serviceMaster': {
                'servicePath': argv[2],
                'startCommand': argv[3],
            },
        }
        processCheck(getdatetime, filewrite, serviceMaster)
        return

    # Unknown mode or wrong number of arguments for the mode.
    filewrite.write("------脚本携带参数有误.------\n")
    print("脚本携带参数有误")


if __name__ == '__main__':
    # Directory where the script and its log are stored
    logdir = "/data/processlog/"
    # Log file name
    processChecklog = "processChecklog.log"
    processfile = logdir + processChecklog

    # Timestamp shared by every log section written during this run
    getdatetime = timestamp_time()

    # The context manager closes (and flushes) the log file even when main()
    # raises, so the handle is never leaked.
    with open(processfile, "a+", encoding="UTF8") as filewrite:
        # Main entry point
        main(getdatetime, filewrite)


# Blog page footer (not part of the script): 你可能感兴趣的:(python,运维,python,运维)