DCGM
datacenter-gpu-manager_1.7.2_amd64.deb
# dcgmi --version
dcgmi version: 1.7.2
# git clone https://github.com/NVIDIA/gpu-monitoring-tools.git
# cd gpu-monitoring-tools/
# make binary
go build -o dcgm-exporter github.com/NVIDIA/gpu-monitoring-tools/pkg
# make install
go build -o dcgm-exporter github.com/NVIDIA/gpu-monitoring-tools/pkg
install -m 557 dcgm-exporter /usr/bin/dcgm-exporter
install -m 557 -D ./etc/dcgm-exporter/default-counters.csv /etc/dcgm-exporter/default-counters.csv
install -m 557 -D ./etc/dcgm-exporter/dcp-metrics-included.csv /etc/dcgm-exporter/dcp-metrics-included.csv
dcgm-exporter
# which dcgm-exporter
/usr/bin/dcgm-exporter
# dcgm-exporter
INFO[0000] Starting dcgm-exporter
INFO[0000] DCGM successfully initialized!
INFO[0000] Pipeline starting
INFO[0000] Starting webserver
# curl 192.168.1.2:9400/metrics
dcgm-exporter
开机启动
# vim /lib/systemd/system/dcgm-exporter.service
新建服务
[Unit]
Description=dcgm-exporter service
[Service]
User=root
ExecStart=/usr/bin/dcgm-exporter
TimeoutStopSec=10
Restart=on-failure
RestartSec=5
[Install]
WantedBy=multi-user.target
# systemctl daemon-reload
# systemctl enable dcgm-exporter.service
# systemctl start dcgm-exporter.service
# systemctl status dcgm-exporter.service
dcgm-exporter
  # dcgm-exporter
  - job_name: 'gpu'
    static_configs:
      - targets: ['192.168.1.2:9400']
# cat prometheus.yml
  - job_name: 'prometheus'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ['localhost:9090']
  # node_exporter
  - job_name: 'node'
    static_configs:
      - targets: ['127.0.0.1:9100','192.168.1.2:9100']
  # dcgm-exporter
  - job_name: 'gpu'
    static_configs:
      - targets: ['192.168.1.2:9400']
prometheus
systemctl restart prometheus.service
9957
可以切换节点,instance 为节点的 ip 地址:
DCGM_FI_DEV_POWER_USAGE{instance="192.168.1.101:9400"}
DCGM_FI_DEV_GPU_UTIL{instance="192.168.1.101:9400"}
12027
  # dcgm-exporter
  - job_name: 'gpu-metrics'
    static_configs:
      - targets: ['127.0.0.1:9400','192.168.1.101:9400','192.168.1.102:9400']
curl http://127.0.0.1:9400/metrics
DCGM_FI_DEV_POWER_USAGE{instance="127.0.0.1:9400"}
DCGM_FI_DEV_FB_USED{instance="127.0.0.1:9400"}
DCGM_FI_DEV_FB_USED{instance="127.0.0.1:9400"}+DCGM_FI_DEV_FB_FREE{instance="127.0.0.1:9400"}
DCGM_FI_DEV_GPU_UTIL{instance="127.0.0.1:9400"}
DCGM_FI_DEV_MEM_COPY_UTIL{instance="192.168.0.114:9400"}
参考: