import torch
from torchvision import models
# Build a ResNet-50 with pretrained weights from torchvision.
model = models.resnet50(pretrained=True)
save_dir = './resnet50.pth'
# Save the entire model (structure + weights).
torch.save(model, save_dir)
# Save only the model weights. `state_dict` has to be *called*: passing the
# bound method itself stores a method object instead of the weight dictionary.
# NOTE: this reuses save_dir, so it overwrites the full-model file written above.
torch.save(model.state_dict(), save_dir)
# The .pt, .pth and .pkl formats all support storing either the weights alone
# or the entire model.
![image.png](attachment:image.png)
注:因此一般情况下,第一张显卡的显存占用会更多。
import os
import torch
from torchvision import models
# Single GPU
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # for multiple GPUs use something like '0,1,2'
# An unconditional .cuda() raises "RuntimeError: Found no NVIDIA driver on your
# system" on a machine without a usable GPU, so fall back to the CPU there.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)  # equivalent to model.cuda() when CUDA is available
#print(model)
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_7460/77570021.py in
1 import os
2 os.environ['CUDA_VISIBLE_DEVICES'] = '0' # 如果是多卡改成类似0,1,2
----> 3 model = model.cuda() # 单卡
D:\Users\xulele\Anaconda3\lib\site-packages\torch\nn\modules\module.py in cuda(self, device)
903 Module: self
904 """
--> 905 return self._apply(lambda t: t.cuda(device))
906
907 def ipu(self: T, device: Optional[Union[int, device]] = None) -> T:
D:\Users\xulele\Anaconda3\lib\site-packages\torch\nn\modules\module.py in _apply(self, fn)
795 def _apply(self, fn):
796 for module in self.children():
--> 797 module._apply(fn)
798
799 def compute_should_use_set_data(tensor, tensor_applied):
D:\Users\xulele\Anaconda3\lib\site-packages\torch\nn\modules\module.py in _apply(self, fn)
818 # `with torch.no_grad():`
819 with torch.no_grad():
--> 820 param_applied = fn(param)
821 should_use_set_data = compute_should_use_set_data(param, param_applied)
822 if should_use_set_data:
D:\Users\xulele\Anaconda3\lib\site-packages\torch\nn\modules\module.py in (t)
903 Module: self
904 """
--> 905 return self._apply(lambda t: t.cuda(device))
906
907 def ipu(self: T, device: Optional[Union[int, device]] = None) -> T:
D:\Users\xulele\Anaconda3\lib\site-packages\torch\cuda\__init__.py in _lazy_init()
245 if 'CUDA_MODULE_LOADING' not in os.environ:
246 os.environ['CUDA_MODULE_LOADING'] = 'LAZY'
--> 247 torch._C._cuda_init()
248 # Some of the queued calls may reentrantly call _lazy_init();
249 # we need to just return without initializing in that case.
RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx
![ed8eb711294e4c6e3e43690ddb2bf66.png](attachment:ed8eb711294e4c6e3e43690ddb2bf66.png)
# Multiple GPUs
# NOTE: CUDA_VISIBLE_DEVICES is only honoured if it is set before CUDA is
# initialised in this process; if an earlier cell already touched the GPU,
# restart the kernel for this assignment to take effect.
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
# Fall back to the CPU when no GPU is usable instead of crashing in .cuda();
# DataParallel simply forwards to the wrapped module when there are no GPUs.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = torch.nn.DataParallel(model).to(device)  # multi-GPU wrapper
#print(model)
![image.png](attachment:image.png)
# Single-GPU save, single-GPU load.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # replace with the index of the GPU you want to use
# An unconditional .cuda() raises "RuntimeError: Found no NVIDIA driver on your
# system" on a machine without a usable GPU, so fall back to the CPU there.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = models.resnet50(pretrained=True)
model.to(device)  # nn.Module.to moves the parameters in place

save_dir = 'resnet50.pt'  # output path

# Save + load the entire model.
torch.save(model, save_dir)
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True
# and may refuse a fully pickled model - confirm against the installed version.
loaded_model = torch.load(save_dir)
loaded_model.to(device)

# Save + load only the model weights (overwrites the file written above).
torch.save(model.state_dict(), save_dir)
# First build the model structure ...
loaded_model = models.resnet50()
# ... then load the weights into it.
loaded_model.load_state_dict(torch.load(save_dir))
loaded_model.to(device)
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_7460/585340704.py in
5 os.environ['CUDA_VISIBLE_DEVICES'] = '0' #这里替换成希望使用的GPU编号
6 model = models.resnet50(pretrained=True)
----> 7 model.cuda()
8
9 save_dir = 'resnet50.pt' #保存路径
D:\Users\xulele\Anaconda3\lib\site-packages\torch\nn\modules\module.py in cuda(self, device)
903 Module: self
904 """
--> 905 return self._apply(lambda t: t.cuda(device))
906
907 def ipu(self: T, device: Optional[Union[int, device]] = None) -> T:
D:\Users\xulele\Anaconda3\lib\site-packages\torch\nn\modules\module.py in _apply(self, fn)
795 def _apply(self, fn):
796 for module in self.children():
--> 797 module._apply(fn)
798
799 def compute_should_use_set_data(tensor, tensor_applied):
D:\Users\xulele\Anaconda3\lib\site-packages\torch\nn\modules\module.py in _apply(self, fn)
818 # `with torch.no_grad():`
819 with torch.no_grad():
--> 820 param_applied = fn(param)
821 should_use_set_data = compute_should_use_set_data(param, param_applied)
822 if should_use_set_data:
D:\Users\xulele\Anaconda3\lib\site-packages\torch\nn\modules\module.py in (t)
903 Module: self
904 """
--> 905 return self._apply(lambda t: t.cuda(device))
906
907 def ipu(self: T, device: Optional[Union[int, device]] = None) -> T:
D:\Users\xulele\Anaconda3\lib\site-packages\torch\cuda\__init__.py in _lazy_init()
245 if 'CUDA_MODULE_LOADING' not in os.environ:
246 os.environ['CUDA_MODULE_LOADING'] = 'LAZY'
--> 247 torch._C._cuda_init()
248 # Some of the queued calls may reentrantly call _lazy_init();
249 # we need to just return without initializing in that case.
RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx
# Single-GPU save, multi-GPU load.
# NOTE: CUDA_VISIBLE_DEVICES is only honoured if it is set before CUDA is
# initialised in this process; the re-assignments further down illustrate the
# "load" environment and need a fresh process to actually take effect.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # replace with the index of the GPU you want to use
model = models.resnet50(pretrained=True)
model.cuda()

# Save + load the entire model.
torch.save(model, save_dir)

os.environ['CUDA_VISIBLE_DEVICES'] = '1,2'  # replace with the indices of the GPUs you want to use
loaded_model = torch.load(save_dir)
# `nn` is never imported in this file, so reference DataParallel through torch.nn.
loaded_model = torch.nn.DataParallel(loaded_model).cuda()

# Save + load only the model weights.
torch.save(model.state_dict(), save_dir)

os.environ['CUDA_VISIBLE_DEVICES'] = '1,2'  # replace with the indices of the GPUs you want to use
loaded_model = models.resnet50()  # the model structure has to be defined here
# The weights came from an unwrapped model, so load them before wrapping.
loaded_model.load_state_dict(torch.load(save_dir))
loaded_model = torch.nn.DataParallel(loaded_model).cuda()
核心问题:如何去掉权重字典键名中的 "module." 前缀,以保证模型的统一性
# Multi-GPU save, single-GPU load (entire model).
# NOTE: CUDA_VISIBLE_DEVICES is only honoured if it is set before CUDA is
# initialised in this process; the re-assignment below needs a fresh process
# to actually take effect.
os.environ['CUDA_VISIBLE_DEVICES'] = '1,2'  # replace with the indices of the GPUs you want to use
model = models.resnet50(pretrained=True)
# `nn` is never imported in this file, so reference DataParallel through torch.nn.
model = torch.nn.DataParallel(model).cuda()

# Save + load the entire model.
torch.save(model, save_dir)

os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # replace with the index of the GPU you want to use
# The pickled object is the DataParallel wrapper; `.module` is the wrapped model.
loaded_model = torch.load(save_dir).module
# Multi-GPU save, single-GPU load (weights only).
# NOTE: CUDA_VISIBLE_DEVICES is only honoured if it is set before CUDA is
# initialised in this process; the re-assignment below needs a fresh process
# to actually take effect.
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2'  # replace with the indices of the GPUs you want to use
model = models.resnet50(pretrained=True)
# `nn` is never imported in this file, so reference DataParallel through torch.nn.
model = torch.nn.DataParallel(model).cuda()

# Save the weights of the wrapped model: going through `.module` keeps the
# "module." prefix out of the state-dict keys.
torch.save(model.module.state_dict(), save_dir)

# Load the weights.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # replace with the index of the GPU you want to use
loaded_model = models.resnet50()  # the model structure has to be defined here
loaded_model.load_state_dict(torch.load(save_dir))
loaded_model.cuda()
保存整个模型时会同时保存所使用的GPU id等信息,读取时若这些信息和当前使用的GPU信息不符则可能会报错或者程序不按预定状态运行。可能出现以下2个问题:
# Multi-GPU save, multi-GPU load (weights only).
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2'  # replace with the indices of the GPUs you want to use
model = models.resnet50(pretrained=True)
# `nn` is never imported in this file, so reference DataParallel through torch.nn.
model = torch.nn.DataParallel(model).cuda()

# Save + load only the model weights - strongly recommended!
torch.save(model.state_dict(), save_dir)

# Load the weights.
loaded_model = models.resnet50()  # the model structure has to be defined here
# The checkpoint was taken from the DataParallel wrapper, so every key carries
# the "module." prefix. Wrap the fresh model first so the keys match; loading
# into the bare model would fail the strict key check.
loaded_model = torch.nn.DataParallel(loaded_model).cuda()
loaded_model.load_state_dict(torch.load(save_dir))
# Usage example (excerpt from a larger training script).
# NOTE(review): the original indentation was lost; the structure below is a
# reconstruction. My_model, test_loader, device, criterion, i, epoch, save_dir
# and the train_* counters are defined outside this excerpt.
My_model.eval()
test_total_loss = 0
test_total_correct = 0
test_total_num = 0

# Loss of the previous round. Starting from +inf (instead of 0) lets the first
# evaluation count as an improvement; with 0 the "improved" branch below could
# never fire for a non-negative loss.
# NOTE(review): presumably this and save_model_step should be initialised once,
# before the epoch loop - confirm against the full script.
past_test_loss = float('inf')
save_model_step = 10  # save the model every 10 steps

# Gradients are not needed while evaluating.
with torch.no_grad():
    for batch_idx, (images, labels) in enumerate(test_loader):
        images = images.to(device)
        labels = labels.to(device)

        outputs = My_model(images)
        loss = criterion(outputs, labels)
        test_total_correct += (outputs.argmax(1) == labels).sum().item()
        test_total_loss += loss.item()
        test_total_num += labels.shape[0]

test_loss = test_total_loss / test_total_num
print("Epoch [{}/{}], train_loss:{:.4f}, train_acc:{:.4f}%, test_loss:{:.4f}, test_acc:{:.4f}%".format(
    i+1, epoch, train_total_loss / train_total_num, train_total_correct / train_total_num * 100, test_total_loss / test_total_num, test_total_correct / test_total_num * 100
))

# Model saving: checkpoint the model that was just evaluated (My_model).
if test_loss < past_test_loss:
    # Save the model weights.
    torch.save(My_model.state_dict(), save_dir)
    # Alternative: save weights + structure with torch.save(My_model, save_dir).

# NOTE(review): batch_idx is the index of the last test batch here, so this
# condition does not change between epochs; the epoch counter may have been
# intended - confirm against the full script.
if batch_idx % save_model_step == 0:
    # Save the model weights.
    torch.save(My_model.state_dict(), save_dir)
    # Alternative: save weights + structure with torch.save(My_model, save_dir).

past_test_loss = test_loss
Google Colab:https://colab.research.google.com/drive/1hEOeqXYm4BfulY6d30QCI4HrFmCmmTQu?usp=sharing