报错如下
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:2!
原因分析
模型较大、需要多卡切分时,自动生成的 device_map 可能把本应位于同一设备上的层(或同一模块内部的子层)分到了不同的 GPU 上,前向传播时跨设备的张量相遇就会触发上述错误
解决方案
首先打印看看自己的模型有哪些模块,比如WizardCoder-15B有这些主要模块
# Module classes that accelerate must keep on a single device.
# Fix: "GELUActivation" was listed twice; duplicates are harmless to
# accelerate but misleading to the reader, so the list is deduplicated.
no_split_module_classes = [
    "Dropout", "LayerNorm", "Linear",
    "GPTBigCodeAttention", "GPTBigCodeMLP", "GELUActivation", "GPTBigCodeBlock",
]
我通过下面的代码打印出的信息得到以上列表
# Dump every module's qualified name, class, and device to a text file so
# the no_split_module_classes list can be assembled by inspection.
print(f"✨记录模型包含的模块,及所在设备")
with open("model_no_split.txt", "w") as f:
    # Fix: the original zipped named_modules() with named_parameters(),
    # which pairs unrelated entries (there are far more modules than
    # parameter tensors, and the orders differ) and also shadowed `name`
    # in the tuple unpack. Instead, report each module's device from its
    # own (non-recursive) first parameter; parameter-less modules such as
    # Dropout/GELU have no device of their own.
    for name, module in model.named_modules():
        param = next(module.parameters(recurse=False), None)
        device = param.device if param is not None else "no params"
        f.write(name + " | ")
        f.write(str(type(module)) + " | ")
        f.write(str(device))
        f.write("\n")
单机多卡推理代码
from transformers import AutoTokenizer, AutoModelForCausalLM
from accelerate import dispatch_model, infer_auto_device_map
from accelerate.utils import get_balanced_memory
from torch.cuda.amp import autocast
import torch
# Load tokenizer and model. device_map='auto' lets accelerate shard the
# bf16 checkpoint across all visible GPUs at load time.
checkpoint = "WizardCoder-15B-V1.0"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
# Module classes that must not be split across devices when building the
# device map. Fix: "GELUActivation" was listed twice; deduplicated.
no_split_module_classes = [
    "Dropout", "LayerNorm", "Linear",
    "GPTBigCodeAttention", "GPTBigCodeMLP", "GELUActivation", "GPTBigCodeBlock",
]

# Balance the model's footprint across all visible GPUs.
# Fix: dtype now matches the torch_dtype the weights were loaded in
# (bfloat16); the original said 'float16'. Both are 2 bytes per element,
# so the size estimate is the same, but the mismatch was misleading.
max_memory = get_balanced_memory(
    model,
    max_memory=None,
    no_split_module_classes=no_split_module_classes,
    dtype="bfloat16",
    low_zero=False,
)

# Build a placement that never cuts one of the listed classes in half,
# then move each submodule to its assigned device.
device_map = infer_auto_device_map(
    model,
    max_memory=max_memory,
    no_split_module_classes=no_split_module_classes,
    dtype="bfloat16",
)
model = dispatch_model(model, device_map=device_map)
# Sampling configuration for model.generate().
generation_kwargs = {
    "min_length": -1,      # disable the absolute-length floor
    "top_k": 0,            # no top-k truncation; rely on top_p only
    "top_p": 0.85,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "min_new_tokens": 10,
    "max_new_tokens": 50,
    "eos_token_id": tokenizer.eos_token_id,
}

# Fix: torch.cuda.amp.autocast is deprecated in favor of
# torch.amp.autocast("cuda", ...). Also pin the autocast dtype to
# bfloat16 to match the checkpoint; the old default (float16) mixed
# dtypes with the bf16 weights. Inputs go to cuda:0, where the first
# shard of the dispatched model lives.
with torch.amp.autocast("cuda", dtype=torch.bfloat16):
    prompt_ids = tokenizer.encode("Hello World!", return_tensors="pt").to("cuda:0")
    output_ids = model.generate(prompt_ids, **generation_kwargs)[0]
    print(tokenizer.decode(output_ids))