深夜,学长学姐已然入睡,深度学习进程已然结束
可你不愿意爬起来,因为梦乡的香甜
于是你白天通过ps -xal
查找到了学长学姐训练的pid进程号
悄悄地把这个进程号填到了下面代码 wait_pid = 的后面
让下面的python代码持续运行 记为wait.py
等你醒来,你的训练已然完成
import os
import psutil
import time
# PID of the process to wait for.
wait_pid = 59900
# Poll every 10 seconds until that process no longer exists.
# psutil.pid_exists() checks this single PID instead of enumerating every
# process on the machine on each poll.
while psutil.pid_exists(wait_pid):
    print('still working!' + str(wait_pid))
    time.sleep(10)
# The watched process is gone: start our own training immediately.
# 'train.py' is resolved relative to the current working directory.
os.system('python train.py')
注意:这个 py 文件要和 train.py 放在同一目录下,否则需要自己在命令里写上 train.py 的路径。
原理就是psutil监视pid,一旦进程消失,马上命令开始自己的训练
当然,要是学长学姐知道你这么抢资源被说了,可怪不了我哟 调皮
补充:下面的代码借鉴自 CSDN 下载资源《scramble4gpu:抢占显卡》的源码,
并修复了其中的 bug。
然后你可以把以下代码保存为 train.py,设置好默认参数,放到和 wait.py 同一目录下
那么你可以一口气抢多张卡,还可以确定空闲比(达到多少就抢)
# -*- coding: utf-8 -*-
import os
import time
import argparse
import numpy as np
try:
import torch
except ImportError:
try:
import tensorflow as tf
except ImportError:
print("No pytorch and tensorflow module")
def set_parser():
    """Build the command-line parser and parse the process arguments.

    Options:
        -p / --proportion: float, default 0.7.
        -n / --gpu_nums: int, default 2.
        -t / --times: int, default 180000.

    Returns:
        argparse.Namespace with ``proportion``, ``gpu_nums`` and ``times``.
    """
    cli = argparse.ArgumentParser(description='..')
    # One (flags, keyword-arguments) entry per option, registered in order.
    option_table = (
        (('-p', '--proportion'),
         {'type': float, 'default': 0.7,
          'help': 'The ratio of gpu free memory to total memory'}),
        (('-n', '--gpu_nums'),
         {'type': int, 'default': 2,
          'help': 'The numbers of GPU to scramble'}),
        (('-t', '--times'),
         {'type': int, 'default': 180000,
          'help': 'Sleep time if scramble gpu'}),
    )
    for flags, spec in option_table:
        cli.add_argument(*flags, **spec)
    return cli.parse_args()
def parse(qargs, results):
    """Convert raw CSV output lines into a 2-D array of digit strings.

    Each line is split on commas and every field is reduced to the digit
    characters it contains (so ``' 1000 MiB'`` becomes ``'1000'``).

    Lines in which any field contains no digit at all -- a CSV header row,
    a blank line, or a row with a non-numeric value -- are skipped. Every
    data row is kept, including the first one, so the result is correct
    whether or not the output starts with a header line.

    Args:
        qargs: list of the queried field names; its length gives the column
            count of the empty result.
        results: list of text lines to parse.

    Returns:
        numpy array of strings with one row per kept line. When no line is
        kept, an empty array of shape ``(0, len(qargs))`` is returned so that
        callers can still index columns.
    """
    rows = []
    for line in results:
        fields = [''.join(filter(str.isdigit, word)) for word in line.split(',')]
        # An empty field means this is not a purely numeric data row.
        if all(fields):
            rows.append(fields)
    if not rows:
        return np.empty((0, len(qargs)), dtype=str)
    return np.array(rows)
def query_gpu():
    """Run ``nvidia-smi`` and return its parsed output.

    The command queries ``index``, ``memory.free`` and ``memory.total`` in
    CSV format without a header line.

    Returns:
        A tuple ``(table, first_line)``: ``table`` is whatever ``parse``
        returns for the output lines, and ``first_line`` is the first output
        line with surrounding whitespace stripped, or ``''`` when the command
        produced no output.
    """
    qargs = ['index', 'memory.free', 'memory.total']
    cmd = 'nvidia-smi --query-gpu={} --format=csv,noheader'.format(','.join(qargs))
    # Close the pipe deterministically instead of leaving it to the GC.
    with os.popen(cmd) as pipe:
        results = pipe.readlines()
    # No output (e.g. the command is unavailable) must not raise IndexError.
    first_line = results[0].strip() if results else ''
    return parse(qargs, results), first_line
class GPUManager(object):
    """Select GPUs by free memory using the options given at construction."""

    def __init__(self, args):
        """Store the options object; ``args.proportion`` is read when choosing."""
        self._args = args

    def choose_free_gpu(self, num=1, query_fn=None):
        """Return the GPUs with the most free memory that pass the ratio test.

        The stats table is sorted by its second column (free memory) in
        descending order, the first ``num`` rows are taken, and of those only
        rows whose free/total ratio is strictly greater than
        ``self._args.proportion`` are kept.

        Args:
            num: how many GPUs to consider.
            query_fn: optional zero-argument callable returning a
                ``(table, extra)`` pair, where ``table`` is a 2-D array with
                columns (index, free, total) convertible to int. Defaults to
                the module-level ``query_gpu``.

        Returns:
            A tuple ``(gpus_index, gpus_memory)`` of integer numpy arrays.
            Both are empty when fewer than ``num`` GPUs are reported (a
            message is printed in that case) or when no candidate passes the
            ratio test.
        """
        qresult, _ = (query_fn or query_gpu)()
        qresult = qresult.astype('int')
        if qresult.ndim != 2 or qresult.shape[0] < num:
            print('The number GPU {} < num {}'.format(len(qresult), num))
            # Return empty selections instead of falling through to unbound
            # local names.
            return np.array([], dtype=int), np.array([], dtype=int)
        by_free_desc = np.argsort(-qresult[:, 1])
        threshold = self._args.proportion
        chosen = np.array(
            [i for i in by_free_desc[:num]
             if qresult[i][1] / qresult[i][2] > threshold],
            dtype=int,
        )
        return qresult[:, 0][chosen], qresult[:, 1][chosen]
def compute_storage_size(memory):
    """Return 0.9 times the cube root of ``memory * 1024 * 1024 / 8``.

    Args:
        memory: a numeric amount of memory.
            NOTE(review): presumably MiB, with the division by 8 matching
            8-byte elements -- confirm against the caller.

    Returns:
        The scaled cube root as a float (not rounded).
    """
    element_count = memory * 1024 * 1024 / 8
    cube_edge = element_count ** (1 / 3)
    return cube_edge * 0.9
# if __name__ == '__main__':
def main():
    """Make one attempt to occupy the requested number of GPUs.

    Parses the command-line options, asks ``GPUManager`` for candidate GPUs,
    and allocates a cubic float64 tensor on each of them, sized by
    ``compute_storage_size`` from that GPU's reported free memory. torch is
    tried first; if that fails and a module-level ``tf`` exists, tensorflow
    is used instead.

    If every requested GPU was occupied, the function sleeps for
    ``args.times`` seconds while keeping the allocations referenced.

    Returns:
        The list of GPU ids occupied in this attempt; ``[]`` when no
        candidate GPU was available.
    """
    args = set_parser()
    gpu_manager = GPUManager(args)
    gpus_free, gpus_memory = gpu_manager.choose_free_gpu(num=args.gpu_nums)
    if len(gpus_free) == 0:
        return []

    ids = []
    # Keep every allocation referenced so it cannot be released before the
    # sleep below finishes.
    held = []
    for gpus_id, free_memory in zip(gpus_free, gpus_memory):
        size = int(compute_storage_size(free_memory))
        print("Scramble GPU {}".format(gpus_id))
        try:
            # int(): the id comes from a numpy array; pass a plain Python int
            # as the device ordinal.
            block = torch.zeros([size, size, size], dtype=torch.double,
                                device=int(gpus_id))
        except Exception as err:
            # Exception (not a bare except) so Ctrl-C is not swallowed.
            if 'tf' not in globals():
                # No tensorflow fallback available: skip this GPU so the
                # caller can retry, rather than dying with a NameError.
                print("Failed to scramble GPU {}: {}".format(gpus_id, err))
                continue
            # NOTE(review): CUDA_VISIBLE_DEVICES may have no effect once
            # tensorflow has initialised its devices -- confirm.
            os.environ["CUDA_VISIBLE_DEVICES"] = str(gpus_id)
            block = tf.zeros([size, size, size], dtype=tf.dtypes.float64)
        held.append(block)
        ids.append(gpus_id)

    if len(ids) == args.gpu_nums:
        time.sleep(args.times)
    return ids
if __name__ == '__main__':
    # Retry until a single attempt occupies exactly the requested number of
    # GPUs; report after every attempt.
    cli_args = set_parser()
    satisfied = False
    while not satisfied:
        grabbed = main()
        print("start one try!")
        print(f"now finshed epochs:{len(grabbed)}")
        satisfied = bool(grabbed) and len(grabbed) == cli_args.gpu_nums
其中的ids的bug已经被我修复了,可以正常使用。
可怜的抢卡人,晚安!