# --- NVIDIA driver 525.105.17 installation on CentOS 7 ---
# Install a newer GCC (devtoolset-9) first; the stock CentOS 7 gcc 4.8
# can be too old to build the NVIDIA kernel modules.
yum update -y
yum install -y centos-release-scl
yum install -y devtoolset-9
# Enable devtoolset-9 for the current shell only (not persistent across logins)
source /opt/rh/devtoolset-9/enable
gcc -v
chmod +x NVIDIA-Linux-x86_64-525.105.17.run
# -no-x-check skips the "X server is running" check during installation
sh NVIDIA-Linux-x86_64-525.105.17.run -no-x-check
# Verify the driver loaded and all GPUs are visible
nvidia-smi
GPUDirect 通信矩阵:
> nvidia-smi topo --matrix
GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 NIC0 NIC1 NIC2 NIC3 CPU Affinity NUMA Affinity
GPU0 X NV8 NV8 NV8 NV8 NV8 NV8 NV8 NODE NODE SYS SYS 0-31,64-95 0
GPU1 NV8 X NV8 NV8 NV8 NV8 NV8 NV8 PIX NODE SYS SYS 0-31,64-95 0
GPU2 NV8 NV8 X NV8 NV8 NV8 NV8 NV8 NODE NODE SYS SYS 0-31,64-95 0
GPU3 NV8 NV8 NV8 X NV8 NV8 NV8 NV8 NODE PIX SYS SYS 0-31,64-95 0
GPU4 NV8 NV8 NV8 NV8 X NV8 NV8 NV8 SYS SYS NODE NODE 32-63,96-127 1
GPU5 NV8 NV8 NV8 NV8 NV8 X NV8 NV8 SYS SYS NODE NODE 32-63,96-127 1
GPU6 NV8 NV8 NV8 NV8 NV8 NV8 X NV8 SYS SYS NODE NODE 32-63,96-127 1
GPU7 NV8 NV8 NV8 NV8 NV8 NV8 NV8 X SYS SYS PIX PIX 32-63,96-127 1
NIC0 NODE PIX NODE NODE SYS SYS SYS SYS X NODE SYS SYS
NIC1 NODE NODE NODE PIX SYS SYS SYS SYS NODE X SYS SYS
NIC2 SYS SYS SYS SYS NODE NODE NODE PIX SYS SYS X PIX
NIC3 SYS SYS SYS SYS NODE NODE NODE PIX SYS SYS PIX X
Legend:
X = Self
SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
PIX = Connection traversing at most a single PCIe bridge
NV# = Connection traversing a bonded set of # NVLinks
NIC Legend:
NIC0: mlx5_0
NIC1: mlx5_1
NIC2: mlx5_2
NIC3: mlx5_3
开启持久模式:
如果未开启持久模式(Persistence Mode),每次用nvidia-smi查询显卡资源的时候,会等到较长时间才有结果。
# Enable persistence mode so the driver stays initialized; without it each
# nvidia-smi query may take a long time while the GPUs re-initialize.
nvidia-smi -pm ENABLED
# Check whether Persistence Mode is enabled
nvidia-smi -a
nvidia-smi
技术要学会分享、交流,不建议闭门造车。一个人走的很快、一堆人可以走的更远。
完整代码、数据、技术交流提升, 均可加入知识星球交流群获取,群友已超过2000人,添加时切记备注方式为:来源+兴趣方向,方便找到志同道合的朋友。
方式①、添加微信号:pythoner666,备注:来自 CSDN 技术交流
方式②、微信搜索公众号:Python学习与数据挖掘,后台回复:资料
# Install dkms to rebuild/repair driver kernel modules after kernel updates
> sudo yum install dkms
# Check which driver source versions are present
> ls /usr/src
debug
kernels
nvidia-525.105.17
# Rebuild and reinstall the matching nvidia driver module via dkms
> dkms install -m nvidia -v 525.105.17
NVIDIA-Fabric Manager服务可以使多A100显卡间通过NVSwitch互联。
要通过NVSwitch互联必须安装与GPU驱动版本对应的NVIDIA-Fabric Manager软件包,否则将无法正常使用实例。
参考:https://www.volcengine.com/docs/6419/73634
# Install NVIDIA Fabric Manager — its version MUST exactly match the GPU
# driver version (525.105.17) or NVSwitch interconnect will not work.
wget -c https://developer.download.nvidia.cn/compute/cuda/repos/rhel7/x86_64/nvidia-fabric-manager-525.105.17-1.x86_64.rpm
rpm -ivh nvidia-fabric-manager-525.105.17-1.x86_64.rpm
# FIX: fetch the -devel package from the same rhel7 repo as the main package;
# the original pulled it from a fedora37 repo, which targets the wrong distro
# and can drag in mismatched dependencies on CentOS 7.
wget -c https://developer.download.nvidia.cn/compute/cuda/repos/rhel7/x86_64/nvidia-fabric-manager-devel-525.105.17-1.x86_64.rpm
rpm -ivh nvidia-fabric-manager-devel-525.105.17-1.x86_64.rpm
启动NVIDIA-Fabric Manager:
# Start the Fabric Manager service to enable NVSwitch interconnect between GPUs
sudo systemctl restart nvidia-fabricmanager
# Check the service status; "active (running)" in the output means success
sudo systemctl status nvidia-fabricmanager
# Enable the Fabric Manager service to start automatically at boot
sudo systemctl enable nvidia-fabricmanager
# Install CUDA 11.7 under /home (larger disk) and symlink it to /usr/local
mkdir -p /home/local/cuda-11.7
sudo ln -s /home/local/cuda-11.7 /usr/local/cuda-11.7
wget https://developer.download.nvidia.com/compute/cuda/11.7.0/local_installers/cuda_11.7.0_515.43.04_linux.run
# NOTE(review): this runfile bundles driver 515.43.04, but driver 525.105.17
# was already installed above — deselect the driver in the installer menu.
sudo sh cuda_11.7.0_515.43.04_linux.run
# Configure environment variables (current shell only; append to ~/.bashrc
# or /etc/profile to persist)
export PATH="/usr/local/cuda-11.7/bin:$PATH"
export LD_LIBRARY_PATH="/usr/local/cuda-11.7/lib64:$LD_LIBRARY_PATH"
# Verify the CUDA toolkit version
nvcc -V
# --- Build OpenSSL 1.1.1t from source (needed for Python 3.10's ssl module) ---
yum install -y zlib-devel.x86_64
mkdir -p /home/local/openssl
ln -s /home/local/openssl /usr/local/openssl
tar -zxvf openssl-1.1.1t.tar.gz
cd openssl-1.1.1t/
# Build shared libraries with zlib support, installed under /usr/local/openssl
./config --prefix=/usr/local/openssl shared zlib
make depend
sudo make
sudo make install
sudo ln -s /usr/local/openssl/bin/openssl /usr/bin/openssl
# Register the new library path with the dynamic linker.
# NOTE(review): appending to /etc/ld.so.conf requires root — the transcript
# mixes sudo and non-sudo commands, so presumably it runs as root; confirm.
echo "/usr/local/openssl/lib" >> /etc/ld.so.conf
ldconfig -v
# Symlink the versioned libraries where old consumers expect them
ln -s /usr/local/openssl/lib/libssl.so.1.1 /usr/lib/libssl.so.1.1
ln -s /usr/local/openssl/lib/libcrypto.so.1.1 /usr/lib/libcrypto.so.1.1
# --- Build Python 3.10.10 from source against the custom OpenSSL ---
yum install -y libffi-devel bzip2-devel
yum install sqlite-devel -y
mkdir -p /home/local/py310
ln -s /home/local/py310 /usr/local/py310
tar -xvf Python-3.10.10.tgz
# BUG FIX: the original ran ./configure without entering the extracted source
# tree, so the configure script would not be found. Enter it first.
cd Python-3.10.10/
./configure --prefix=/usr/local/py310 --with-openssl=/usr/local/openssl --with-openssl-rpath=no
make
sudo make install
# Expose the new interpreter and pip on PATH
sudo ln -s /usr/local/py310/bin/python3.10 /usr/bin/python3.10
sudo ln -s /usr/local/py310/bin/pip3.10 /usr/bin/pip3.10
# sudo ln -s /usr/local/py310/bin/pip3.10 /usr/bin/pip
pip3.10 install virtualenv -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
sudo ln -s /usr/local/py310/bin/virtualenv /usr/bin/virtualenv
# Create and activate a dedicated virtualenv for the LLaMA environment
mkdir -p /home/guodong.li/virtual-venv
cd /home/guodong.li/virtual-venv
virtualenv -p /usr/bin/python3.10 llama-venv-py310-cu117
source /home/guodong.li/virtual-venv/llama-venv-py310-cu117/bin/activate
配置pip源:
mkdir ~/.pip
cd ~/.pip
vim ~/.pip/pip.conf
[global]
index-url = https://pypi.tuna.tsinghua.edu.cn/simple
[install]
trusted-host = pypi.tuna.tsinghua.edu.cn
文档:https://docs.conda.io/en/latest/miniconda.html#linux-installers
安装:
# Download and run the Miniconda (Python 3.10) installer
wget -c https://repo.anaconda.com/miniconda/Miniconda3-py310_23.3.1-0-Linux-x86_64.sh
bash Miniconda3-py310_23.3.1-0-Linux-x86_64.sh
配置清华源:
# Add Tsinghua mirror channels to speed up package downloads inside China
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main
安装包:
# conda-forge is the most common community channel for installing packages
conda install nodejs=16.6.1 -c conda-forge
# --- Install NCCL 2.14.3 for CUDA 11.7 (copy headers/libs into the toolkit) ---
tar xvf nccl_2.14.3-1+cuda11.7_x86_64.txz
cd nccl_2.14.3-1+cuda11.7_x86_64/
sudo cp -r include/* /usr/local/cuda-11.7/include/
sudo cp -r lib/* /usr/local/cuda-11.7/lib64/
export LD_LIBRARY_PATH="/usr/local/cuda-11.7/lib64:$LD_LIBRARY_PATH"
# --- Install cuDNN 8.8.1 for CUDA 11 the same way ---
tar -xvf cudnn-linux-x86_64-8.8.1.3_cuda11-archive.tar.xz
cd cudnn-linux-x86_64-8.8.1.3_cuda11-archive
sudo cp include/cudnn*.h /usr/local/cuda-11.7/include
# -P preserves symlinks so libcudnn.so -> libcudnn.so.8 stays a link
sudo cp -P lib/libcudnn* /usr/local/cuda-11.7/lib64/
# Make headers and libraries world-readable
sudo chmod a+r /usr/local/cuda-11.7/include/cudnn*.h /usr/local/cuda-11.7/lib64/libcudnn*
# --- Install PyTorch 1.13.1 + cu117 wheels (pre-downloaded locally) ---
pip install torch-1.13.1+cu117-cp310-cp310-linux_x86_64.whl
pip install torchvision-0.14.1+cu117-cp310-cp310-linux_x86_64.whl
# pip install torch-scatter torch-sparse torch-cluster torch-geometric
# Verify CUDA is visible to PyTorch (should print True)
python -c "import torch; print(torch.cuda.is_available())"
# Install transformers from a local checkout pinned to commit 0041be5
cd transformers-20230327
git checkout 0041be5
pip install .
pip install deepspeed==0.8.0
pip install accelerate
pip install tensorboardX
# --- Build NVIDIA apex with C++ and CUDA extensions ---
git clone https://github.com/NVIDIA/apex.git
cd apex
git checkout 22.04-dev
# NOTE(review): --global-option is deprecated in newer pip releases; this form
# works with pip versions contemporary with this setup — confirm before reuse.
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
# --- Build mpi4py against the system OpenMPI compiler wrapper ---
yum -y install openmpi-devel
export CC=/usr/lib64/openmpi/bin/mpicc
pip install mpi4py
# --- Build pdsh (parallel distributed shell, used for multi-node launches) ---
tar xf pdsh-2.31.tar.gz
# NOTE(review): the tarball name (pdsh-2.31) and this directory
# (pdsh-pdsh-2.31) differ — a GitHub source tarball extracts to
# pdsh-pdsh-<version>; confirm the extracted path before cd'ing.
cd /data/nfs/llm/pkg/pdsh-pdsh-2.31
./configure \
--prefix=/home/local/pdsh \
--with-ssh \
--with-machines=/home/local/pdsh/machines \
--with-dshgroups=/home/local/pdsh/group \
--with-rcmd-rank-list=ssh \
--with-exec && \
make && \
make install
ln -s /home/local/pdsh /usr/local/pdsh
ll /usr/local/pdsh/bin/
# Append pdsh's bin directory to PATH for all users.
# BUG FIX: single quotes keep $PATH literal so it expands at login time;
# the original double quotes baked the current session's PATH snapshot
# into /etc/profile permanently.
echo 'export PATH=/home/local/pdsh/bin:$PATH' >> /etc/profile
source /etc/profile
pdsh -V
# --- Install Docker CE + nvidia-docker2 on CentOS 7 ---
wget https://download.docker.com/linux/centos/docker-ce.repo -O /etc/yum.repos.d/docker-ce.repo
wget https://nvidia.github.io/nvidia-docker/centos7/x86_64/nvidia-docker.repo -O /etc/yum.repos.d/nvidia-docker.repo
yum install -y epel-release
# Install docker and the NVIDIA container runtime
yum install -y docker-ce nvidia-docker2
systemctl enable docker
# Restart docker
sudo systemctl restart docker
docker info
修改Docker配置(/etc/docker/daemon.json):
{
"data-root": "/home/docker",
"runtimes": {
"nvidia": {
"path": "nvidia-container-runtime",
"runtimeArgs": []
}
}
}
重启docker服务:
# Reload unit files and restart docker so the daemon.json changes take effect
systemctl daemon-reload
systemctl restart docker
关闭 selinux 安全系统:
1. 临时关闭(setenforce 0),系统重启后,恢复启动。
setenforce 0
查看:
getenforce
2. 永久关闭,修改文件 /etc/selinux/config
SELINUX=disabled
保存后,重启 reboot
# Export an image to a tarball and load it back (e.g. for offline transfer).
# NOTE(review): the image name runoob/ubuntu:v3 looks like a copied example
# and does not match the tritonserver.tar filename — confirm the real image.
docker save -o tritonserver.tar runoob/ubuntu:v3
docker load --input tritonserver.tar