自动监控GPU温度并报警

实验室电脑原来的1080在训练coco数据集时烧坏了,折腾了2周终于放弃治疗,返厂维修。同时换了一块1080ti继续跑,为了随时监控1080ti的温度,我用Python写了一个实时监控GPU温度的脚本,分享如下。

#!/usr/bin/python
# -*- coding: utf-8 -*-
import time
import os, shutil
import smtplib
from email.mime.text import MIMEText
import datetime

# Seconds to wait between two consecutive temperature checks.
pause = 100

# Outgoing mail account used to deliver the warnings.
mail_host = "smtp.163.com"
mail_postfix = "163.com"
mail_user = "GPU_Monitor"  # mailbox that sends the alerts
# Not the mailbox login password: this is the separate SMTP password.
mail_pass = "自己注册一个吧老哥们,简单的一比吊糟,记得开启STMP服务,设置STMP密码"

# Addresses that receive the alerts.
mailto_list = ['[email protected]']

def send_email(to_list, sub, content):
    """Send a plain-text e-mail through the configured SMTP account.

    Uses the module-level settings ``mail_host``, ``mail_user``,
    ``mail_pass`` and ``mail_postfix``.

    Args:
        to_list: List of recipient addresses.
        sub: Subject line of the message.
        content: Plain-text body of the message.

    Returns:
        True if connecting, logging in and sending all completed without
        raising; False if any of these steps failed (the error is printed).
    """
    me = "GPU Auto Monitor" + "<" + mail_user + "@" + mail_postfix + ">"
    msg = MIMEText(content, _subtype='plain')
    msg['Subject'] = sub
    msg['From'] = me
    # Address lists in mail headers are comma-separated; ';' is not a valid
    # separator and some clients show it as one malformed recipient.
    msg['To'] = ", ".join(to_list)
    try:
        server = smtplib.SMTP()
        try:
            server.connect(mail_host)            # connect to the server
            server.login(mail_user, mail_pass)   # authenticate
            server.sendmail(me, to_list, msg.as_string())
        finally:
            # Release the socket even when connect/login/sendmail fails.
            server.close()
        return True
    except Exception as exc:
        # Best-effort delivery: report the reason instead of hiding it, but
        # never let a mail failure propagate to the caller.
        print("send error!!! %s" % exc)
        return False

def get_gpu_tem(command=("nvidia-smi",
                         "--query-gpu=temperature.gpu",
                         "--format=csv,noheader,nounits")):
    """Return the highest GPU temperature reported by ``nvidia-smi``.

    The default command asks ``nvidia-smi`` for its machine-readable output
    (one bare number per GPU, one GPU per line) instead of scraping the
    human-readable table, whose layout differs between driver versions and
    which has no ``%`` in front of the temperature when the fan column
    shows ``N/A``.

    Args:
        command: Argument sequence of the program to run. It must print one
            temperature per line; lines that are not numbers (for example
            ``[N/A]``) are ignored. Override it to use a different
            ``nvidia-smi`` path or to select a single GPU.

    Returns:
        The highest temperature found, as a float, so that one overheating
        card is not hidden behind a cooler first card.

    Raises:
        ValueError: If the output contains no numeric line.
        OSError: If the program cannot be started.
        subprocess.CalledProcessError: If the program exits with a
            non-zero status.
    """
    # Local import keeps this function usable without touching the
    # file-level import block.
    import subprocess

    # An argument list (no shell) avoids quoting issues.
    output = subprocess.check_output(list(command), universal_newlines=True)

    temperatures = []
    for line in output.splitlines():
        try:
            temperatures.append(float(line.strip()))
        except ValueError:
            # Blank line or a GPU that does not report a temperature.
            continue

    if not temperatures:
        raise ValueError(
            "no GPU temperature found in output of %r" % (tuple(command),))
    return max(temperatures)

# Temperature above which a warning e-mail is sent. The value is the one
# hard-coded in the original script.
# NOTE(review): 20 looks like a test value (presumably degrees Celsius) and
# will trigger on almost every check - confirm the intended limit.
temperature_limit = 20

# Poll the GPU temperature forever, mailing a warning whenever it exceeds
# the limit, and wait `pause` seconds between checks.
while True:
    try:
        tem_num = get_gpu_tem()
    except Exception as exc:
        # A failed reading must not stop the monitor: report it and retry
        # on the next cycle. KeyboardInterrupt is not an Exception, so
        # Ctrl-C still stops the script immediately.
        print("read GPU temperature failed: %s" % exc)
    else:
        if tem_num > temperature_limit:
            nowTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            warning_str = nowTime + "  Current temperature is " + str(tem_num) + "!!!"
            print(warning_str)
            # send_email reports failure through its return value; only
            # claim success when the mail actually went out.
            if send_email(mailto_list, "GPU Warning!!!", warning_str):
                print("send over")

    time.sleep(pause)

自动监控GPU温度并报警_第1张图片

你可能感兴趣的:(Linux&Windows系统,奇技淫巧)