Prometheus监控Docker Swarm集群

cAdvisor+Prometheus+Grafana监控Docker Swarm集群

参考:https://docs.docker.com/config/daemon/prometheus/

           https://github.com/cyancow/swarmprom

按照github开源项目来进行配置:https://github.com/cyancow/swarmprom

github项目clone到本地

$ git clone https://github.com/stefanprodan/swarmprom.git
$ cd swarmprom

修改docker配置文件

修改docker默认配置文件,增加以下两项内容,以便Prometheus能够采集docker守护进程的指标数据

Linux修改: /etc/docker/daemon.json
Windows Server修改: C:\ProgramData\docker\config\daemon.json

增加以下两行内容,如果已经有其他配置则往原有大括号内追加以下两项内容,如果文件不存在则创建文件,内容如下:


{
  "metrics-addr" : "0.0.0.0:9323",
  "experimental" : true
}

 

修改docker-compose.yml配置文件

修改网络及相关端口映射:将原配置文件中的网络net修改为自有集群中的网络,这样现有集群中处于该网络内的服务就可以被检测到;

version: "3.3"

# Original network definition from the upstream project (kept for reference):
# networks:
#   net:
#     driver: overlay
#     attachable: true

# Join the cluster's existing overlay network instead of creating a new one.
# Without `external: true`, `docker stack deploy` creates a fresh stack-scoped
# network (named "<stack>_mon_net"), so this stack would not share a network
# with services that are already running in the cluster.
# The network must exist before deploying, for example:
#   docker network create --driver overlay --attachable mon_net
networks:
  mon_net:
    external: true

# Named volumes mounted by the alertmanager, grafana and prometheus services.
volumes:
  alertmanager: {}
  grafana: {}
  prometheus: {}

# Local files made available to services as configs.
configs:
  # Caddyfile for the caddy service.
  caddy_config:
    file: './caddy/Caddyfile'
  # Caddyfile for the dockerd-exporter service.
  dockerd_config:
    file: './dockerd-exporter/Caddyfile'
  # Rule files mounted into the prometheus service.
  node_rules:
    file: './prometheus/rules/swarm_node.rules.yml'
  task_rules:
    file: './prometheus/rules/swarm_task.rules.yml'

services:
  # Docker daemon metrics collector: one task per node (global mode), running
  # the Caddy image with the dockerd_config Caddyfile mounted.
  dockerd-exporter:
    image: 'stefanprodan/caddy'
    deploy:
      mode: global
      resources:
        reservations:
          memory: 64M
        limits:
          memory: 128M
    configs:
      - source: dockerd_config
        target: '/etc/caddy/Caddyfile'
    environment:
      # NOTE(review): presumably the host address on the docker_gwbridge
      # network; confirm it matches the cluster's nodes.
      - 'DOCKER_GWBRIDGE_IP=172.18.0.1'
    networks:
      - mon_net

  # Container metrics collector: one task per node (global mode).
  cadvisor:
    image: 'google/cadvisor'
    command: '-logtostderr -docker_only'
    deploy:
      mode: global
      resources:
        reservations:
          memory: 64M
        limits:
          memory: 128M
    # Host paths mounted into the container; all read-only except /var/run.
    volumes:
      - '/var/run/docker.sock:/var/run/docker.sock:ro'
      - '/:/rootfs:ro'
      - '/var/run:/var/run'
      - '/sys:/sys:ro'
      - '/var/lib/docker/:/var/lib/docker:ro'
    networks:
      - mon_net

  # Metrics visualisation: a single replica pinned to a manager node, with its
  # data kept in the `grafana` named volume.
  grafana:
    image: stefanprodan/swarmprom-grafana:5.3.4
    networks:
      - mon_net
    environment:
      # Admin credentials come from the deploying shell and fall back to
      # admin/admin when ADMIN_USER / ADMIN_PASSWORD are not set.
      - GF_SECURITY_ADMIN_USER=${ADMIN_USER:-admin}
      - GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin}
      - GF_USERS_ALLOW_SIGN_UP=false
      # Optional settings, disabled by default. The SMTP from-address default
      # had been mangled by e-mail obfuscation and is restored here.
      #- GF_SERVER_ROOT_URL=${GF_SERVER_ROOT_URL:-localhost}
      #- GF_SMTP_ENABLED=${GF_SMTP_ENABLED:-false}
      #- GF_SMTP_FROM_ADDRESS=${GF_SMTP_FROM_ADDRESS:-grafana@test.com}
      #- GF_SMTP_FROM_NAME=${GF_SMTP_FROM_NAME:-Grafana}
      #- GF_SMTP_HOST=${GF_SMTP_HOST:-smtp:25}
      #- GF_SMTP_USER=${GF_SMTP_USER}
      #- GF_SMTP_PASSWORD=${GF_SMTP_PASSWORD}
    volumes:
      - grafana:/var/lib/grafana
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.role == manager
      resources:
        limits:
          memory: 128M
        reservations:
          memory: 64M

  # Alerts dispatcher: a single replica pinned to a manager node, with its
  # storage kept in the `alertmanager` named volume.
  alertmanager:
    image: 'stefanprodan/swarmprom-alertmanager:v0.14.0'
    command:
      - "--config.file=/etc/alertmanager/alertmanager.yml"
      - "--storage.path=/alertmanager"
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.role == manager
      resources:
        reservations:
          memory: 64M
        limits:
          memory: 128M
    # Slack settings are read from the deploying shell, with the defaults below.
    environment:
      - 'SLACK_URL=${SLACK_URL:-https://hooks.slack.com/services/TOKEN}'
      - 'SLACK_CHANNEL=${SLACK_CHANNEL:-general}'
      - 'SLACK_USER=${SLACK_USER:-alertmanager}'
    volumes:
      - 'alertmanager:/alertmanager'
    networks:
      - mon_net

  # Alertmanager dashboard: a single replica pointed at the alertmanager
  # service on port 9093.
  unsee:
    image: 'cloudflare/unsee:v0.8.0'
    deploy:
      mode: replicated
      replicas: 1
    environment:
      - 'ALERTMANAGER_URIS=default:http://alertmanager:9093'
    networks:
      - mon_net

  # Host metrics collector: one task per node (global mode).
  node-exporter:
    image: 'stefanprodan/swarmprom-node-exporter:v0.16.0'
    command:
      - '--path.sysfs=/host/sys'
      - '--path.procfs=/host/proc'
      - '--collector.textfile.directory=/etc/node-exporter/'
      # `$$` is the Compose escape for a literal `$` in the regular expression.
      - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
      - '--no-collector.ipvs'
    deploy:
      mode: global
      resources:
        reservations:
          memory: 64M
        limits:
          memory: 128M
    environment:
      # Swarm service template; resolved per task to the ID of the node it runs on.
      - 'NODE_ID={{.Node.ID}}'
    # Host paths exposed to the exporter; the procfs/sysfs mounts match the
    # --path.* flags above.
    volumes:
      - '/proc:/host/proc:ro'
      - '/sys:/host/sys:ro'
      - '/:/rootfs:ro'
      - '/etc/hostname:/etc/nodename'
    networks:
      - mon_net

  # Metrics database: a single replica pinned to a manager node, with its TSDB
  # kept in the `prometheus` named volume.
  prometheus:
    image: 'stefanprodan/swarmprom-prometheus:v2.5.0'
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      # Retention defaults to 24h unless PROMETHEUS_RETENTION is set at deploy time.
      - "--storage.tsdb.retention=${PROMETHEUS_RETENTION:-24h}"
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.role == manager
      resources:
        reservations:
          memory: 128M
        limits:
          memory: 2048M
    # Rule files mounted next to the main configuration.
    configs:
      - source: node_rules
        target: '/etc/prometheus/swarm_node.rules.yml'
      - source: task_rules
        target: '/etc/prometheus/swarm_task.rules.yml'
    volumes:
      - 'prometheus:/prometheus'
    networks:
      - mon_net

  # Reverse proxy and basic-auth provider: the only service that publishes
  # ports. A single replica pinned to a manager node.
  caddy:
    image: 'stefanprodan/caddy'
    # Published ports: 3000 grafana, 9090 prometheus, 9093 alertmanager,
    # 9094 unsee.
    ports:
      - '3000:3000'
      - '9090:9090'
      - '9093:9093'
      - '9094:9094'
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.role == manager
      resources:
        reservations:
          memory: 64M
        limits:
          memory: 128M
    healthcheck:
      test:
        - 'CMD'
        - 'curl'
        - '-f'
        - 'http://localhost:3000'
      interval: 5s
      timeout: 1s
      retries: 5
    configs:
      - source: caddy_config
        target: '/etc/caddy/Caddyfile'
    # Credentials come from the deploying shell and fall back to admin/admin.
    environment:
      - 'ADMIN_USER=${ADMIN_USER:-admin}'
      - 'ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin}'
    networks:
      - mon_net

启动Prometheus服务

启动命令这里比github改了一下,暂时不需要添加slack报警通知。可以后续再加。且报警方式有很多种,不一定选择slack;这里启动命令修改为:

ADMIN_USER=admin ADMIN_PASSWORD=admin docker stack deploy -c docker-compose.yml mon

查看服务

[root@elcndc2zhda01 swarmprom]# docker service ls | grep mon
cympto586gtr        mon_alertmanager       replicated          1/1                 stefanprodan/swarmprom-alertmanager:v0.14.0                 
ksg0ysst4zm6        mon_caddy              replicated          1/1                 stefanprodan/caddy:latest                                   *:9090->9090/tcp, *:9093-9094->9093-9094/tcp, *:10009->3000/tcp
ym4kz7ske9fy        mon_cadvisor           global              2/3                 google/cadvisor:latest                                      
ylfwmwttjek7        mon_dockerd-exporter   global              3/3                 stefanprodan/caddy:latest                                   
mqpoup34kjn9        mon_grafana            replicated          1/1                 stefanprodan/swarmprom-grafana:5.3.4                        
inu71tv3lp0o        mon_node-exporter      global              3/3                 stefanprodan/swarmprom-node-exporter:v0.16.0                
sisf76m0vnr7        mon_prometheus         replicated          1/1                 stefanprodan/swarmprom-prometheus:v2.5.0                    
pkww94r8rtm1        mon_unsee              replicated          1/1                 cloudflare/unsee:v0.8.0                                     
[root@elcndc2zhda01 swarmprom]# 

检查一下Prometheus、caddy、grafana的日志,确认都没有报错,则说明启动成功;

服务启动正常后,通过caddy服务的容器所在node节点的ip加端口号来访问服务;(这里没有专门配置域名)

相关服务对应访问端口对应关系:

Services:

prometheus (metrics database) http://<swarm-ip>:9090
grafana (visualize metrics) http://<swarm-ip>:3000
node-exporter (host metrics collector)
cadvisor (containers metrics collector)
dockerd-exporter (Docker daemon metrics collector, requires Docker experimental metrics-addr to be enabled)
alertmanager (alerts dispatcher) http://<swarm-ip>:9093
unsee (alert manager dashboard) http://<swarm-ip>:9094
caddy (reverse proxy and basic auth provider for prometheus, alertmanager and unsee)

主要用到的是grafana:用来查看图表

http://<swarm-ip>:3000

Prometheus的服务不常用,如果熟悉使用也可以登录:

http://<swarm-ip>:9090

上述服务的登录账号和密码是启动集群时命令中指定的admin:admin

登录grafana:

首页:

Prometheus监控Docker Swarm集群_第1张图片

 

自带报表

以上就已经完成了Prometheus监控组合的基础搭建;

创建账号:

打开设置->用户->邀请用户

Prometheus监控Docker Swarm集群_第2张图片

 填写用户信息,点invite邀请

Prometheus监控Docker Swarm集群_第3张图片

查看邀请用户,并复制邀请链接

Prometheus监控Docker Swarm集群_第4张图片

邀请链接会复制到剪贴板,直接粘贴出来如下,是一个邀请地址,这里需要把localhost和端口替换为我们grafana的ip和端口后再进行访问

http://localhost:3000/invite/aZRgdHbJudW3RnOiZzcFQLi0kwOWKv

修改为

http://<swarm-ip>:3000/invite/aZRgdHbJudW3RnOiZzcFQLi0kwOWKv

打开链接后输入邮箱/密码点击注册,账号就可以用了

Prometheus监控Docker Swarm集群_第5张图片

 

 

你可能感兴趣的:(docker,运维,promethus,运维,swarm,docker)