如果kernel-devel与当前运行的kernel版本不一致,需要卸载高版本的kernel-devel包或者升级kernel,这里建议降低kernel-devel的版本。
如下所示,通过yum安装的kernel-devel是3.10.0-1160版本,而kernel是3.10.0-1127版本。
###安装 kernel-devel
[jenkins_user@chemical_122 gpu_related]$ sudo yum install epel-release -y
###这里gcc和gcc-c++的版本比较低,对后面编译有影响,如果要做流水线编译环境,得编译安装高版本gcc,一般部署环境不需要
[jenkins_user@chemical_122 gpu_related]$ sudo yum install -y net-tools telnet autoconf libtool make gcc gcc-c++ kernel-devel
[root@chemical_122 ~]# rpm -qa | grep kernel-
kernel-tools-libs-3.10.0-1127.el7.x86_64
kernel-devel-3.10.0-1160.11.1.el7.x86_64
kernel-headers-3.10.0-1160.11.1.el7.x86_64
kernel-tools-3.10.0-1127.el7.x86_64
kernel-3.10.0-1127.el7.x86_64
[root@chemical_122 ~]# uname -a
Linux chemical_122 3.10.0-1127.el7.x86_64 #1 SMP Tue Mar 31 23:36:51 UTC 2020 x86_64 x86_64 x86_64 GNU/Linux
###卸载高版本的kernel-devel,下载与当前kernel版本一致的kernel-devel和kernel-headers的rpm包,并安装
[root@chemical_122 ~]# yum remove -y kernel-devel
[root@chemical_122 software]# wget https://mirrors.tuna.tsinghua.edu.cn/centos-vault/7.8.2003/os/x86_64/Packages/kernel-devel-3.10.0-1127.el7.x86_64.rpm
[root@chemical_122 software]# wget https://mirrors.tuna.tsinghua.edu.cn/centos-vault/7.8.2003/os/x86_64/Packages/kernel-headers-3.10.0-1127.el7.x86_64.rpm
[root@chemical_122 software]# rpm -ivh kernel-devel-3.10.0-1127.el7.x86_64.rpm
[root@chemical_122 software]# rpm -ivh kernel-headers-3.10.0-1127.el7.x86_64.rpm
[jenkins_user@chemical_122 gpu_related]$ wget https://cn.download.nvidia.cn/XFree86/Linux-x86_64/430.14/NVIDIA-Linux-x86_64-430.14.run
[jenkins_user@chemical_122 gpu_related]$ chmod ug+x ./NVIDIA-Linux-x86_64-430.14.run
[jenkins_user@chemical_122 gpu_related]$ sudo ./NVIDIA-Linux-x86_64-430.14.run
Verifying archive integrity... OK
Uncompressing NVIDIA Accelerated Graphics Driver for Linux-x86_64 430.14..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
###在安装程序的交互界面中选择禁用默认的nouveau驱动,安装程序会生成如下配置文件
[jenkins_user@chemical_122 gpu_related]$ cd /etc/modprobe.d/
[jenkins_user@chemical_122 modprobe.d]$ cat nvidia-installer-disable-nouveau.conf
# generated by nvidia-installer
blacklist nouveau
options nouveau modeset=0
[root@chemical_122 ~]# ls -altr /boot/initramfs-$(uname -r).img
-rw-------. 1 root root 52200101 Dec 21 13:54 /boot/initramfs-3.10.0-1127.el7.x86_64.img
[root@chemical_122 ~]# mv /boot/initramfs-$(uname -r).img /boot/initramfs-$(uname -r).img.bak
[root@chemical_122 ~]# dracut -v /boot/initramfs-$(uname -r).img $(uname -r)
[root@chemical_122 ~]# reboot
[jenkins_user@chemical_122 gpu_related]$ sudo ./NVIDIA-Linux-x86_64-430.14.run
Verifying archive integrity... OK
Uncompressing NVIDIA Accelerated Graphics Driver for Linux-x86_64 430.14..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
[root@chemical_122 gpu_related]# wget https://developer.download.nvidia.com/compute/cuda/11.0.3/local_installers/cuda_11.0.3_450.51.06_linux.run
[root@chemical_122 gpu_related]# chmod u+x ./cuda_11.0.3_450.51.06_linux.run
[root@chemical_122 gpu_related]# ./cuda_11.0.3_450.51.06_linux.run
===========
= Summary =
===========
Driver: Installed
Toolkit: Installed in /usr/local/cuda-11.0/
Samples: Installed in /root/, but missing recommended libraries
Please make sure that
- PATH includes /usr/local/cuda-11.0/bin
- LD_LIBRARY_PATH includes /usr/local/cuda-11.0/lib64, or, add /usr/local/cuda-11.0/lib64 to /etc/ld.so.conf and run ldconfig as root
To uninstall the CUDA Toolkit, run cuda-uninstaller in /usr/local/cuda-11.0/bin
To uninstall the NVIDIA Driver, run nvidia-uninstall
Logfile is /var/log/cuda-installer.log
vim /etc/profile
export PATH=/usr/local/cuda-11.0/bin:$PATH
export CUDA_HOME=/usr/local/cuda-11.0
export CUDA_TOOLKIT_ROOT_DIR=$CUDA_HOME
export LD_LIBRARY_PATH=/usr/local/cuda-11.0/lib64:/usr/local/lib:$LD_LIBRARY_PATH
[root@chemical_122 harbor]# nvidia-smi
Tue Jan 12 16:01:46 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.06 Driver Version: 450.51.06 CUDA Version: 11.0 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 Quadro P4000 Off | 00000000:3B:00.0 Off | N/A |
| 42% 31C P0 31W / 105W | 0MiB / 8119MiB | 4% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-container-runtime/$distribution/nvidia-container-runtime.repo | sudo tee /etc/yum.repos.d/nvidia-container-runtime.repo
yum install nvidia-container-runtime -y
由于DNS污染,某些URL可能无法访问(被墙)
解决办法:修改DNS服务器
vim /etc/resolv.conf
# nameserver 192.168.0.112
nameserver 114.114.114.114
vim /etc/docker/daemon.json
{
"log-level": "warn",
"selinux-enabled": false,
"runtimes": {
"nvidia": {
"path": "nvidia-container-runtime",
"runtimeArgs": []
}
},
"registry-mirrors": [
"https://pqbap4ya.mirror.aliyuncs.com",
"https://1nj0zren.mirror.aliyuncs.com",
"https://docker.mirrors.ustc.edu.cn",
"http://f1361db2.m.daocloud.io",
"https://registry.docker-cn.com"
],
"default-shm-size": "128M",
"max-concurrent-downloads": 10,
"max-concurrent-uploads": 5,
"oom-score-adjust": -1000,
"debug": false,
"live-restore": true,
"exec-opts": [
"native.cgroupdriver=systemd"
],
"log-driver": "json-file",
"log-opts": {
"max-size": "100m",
"max-file": "10"
},
"storage-driver": "overlay2",
"storage-opts": [
"overlay2.override_kernel_check=true"
]
}
systemctl restart docker
docker info
应能看到如下关键信息(Runtimes中包含nvidia):
Runtimes: io.containerd.runc.v2 io.containerd.runtime.v1.linux nvidia runc
docker pull nvidia/cuda:11.0-base-ubuntu18.04
docker run -it --gpus all nvidia/cuda:11.0-base-ubuntu18.04 bash