1. Docker base environment installation
yum -y install docker                                  # CentOS
apt-get -y update; apt-get -y install docker-compose  # Debian/Ubuntu
systemctl enable docker
systemctl start docker
timedatectl
timedatectl set-timezone Asia/Shanghai   # set the host timezone
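A quick sanity check that the Docker daemon is running before pulling any images (assuming a systemd-managed install):
systemctl status docker --no-pager
docker info | grep -i 'server version'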
2. Prometheus installation
Edit the configuration file prometheus.yml:
cat prometheus.yml
global:
  scrape_interval: 15s     # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
  - job_name: 'kafka'
    static_configs:
      - targets: ['localhost:9308']
        labels:
          instance: kafka
  - job_name: elasticsearch
    scrape_interval: 5s
    metrics_path: "/_prometheus/metrics"
    file_sd_configs:
      - files:
          - es.yml

sudo docker stop prometheus
sudo docker rm prometheus
sudo docker run -d --restart=always \
-v /etc/localtime:/etc/localtime \
-v /data/monitor/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml \
--name prometheus \
--net=host docker.io/wang049718/prometheus --web.enable-lifecycle --config.file=/etc/prometheus/prometheus.yml
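Because --web.enable-lifecycle is passed above, later edits to prometheus.yml can be applied without recreating the container, using the standard reload endpoint:
curl -X POST http://localhost:9090/-/reload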

3. Grafana deployment
docker run -d --name grafana \
--restart=always \
-v /etc/localtime:/etc/localtime \
-p 3000:3000 \
docker.io/wang049718/grafana
Default login: admin/admin
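To confirm Grafana came up cleanly, its health endpoint can be queried before logging in:
curl -s http://localhost:3000/api/health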
4. node_exporter deployment (host monitoring)
Parameter notes:
--web.listen-address=":9200"
# Port node_exporter listens on; the default is 9100, use this flag to change it.
--web.telemetry-path="/metrics"
# URL path the metrics are exposed on; the default is /metrics, use this flag to change it.
--log.level="info"
# Log level.
--log.format="logger:stderr"
# Log output format.

docker run -d --restart=always \
-v /etc/localtime:/etc/localtime \
--name node-exporter \
--net=host docker.io/wang049718/node-exporter:0.18
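Before adding the host to Prometheus, verify the exporter answers on its default port 9100 (no --web.listen-address override is passed above):
curl -s http://localhost:9100/metrics | grep '^node_load'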

Import Grafana dashboard template ID 10262.

5. Alerting rules
Add the alerting configuration to prometheus.yml:

alerting:              # Alertmanager endpoint(s)
  alertmanagers:
    - static_configs:
        - targets: ['1.1.1.5:9093']

rule_files:            # alerting rule files
  - "rules.yml"

Create the rules file rules.yml:
groups:
  - name: example                                  # rule group
    rules:
      - alert: InstanceDown                        # alert name
        expr: up == 0                              # PromQL expression that triggers the alert
        for: 1m                                    # must hold for one minute before firing
        labels:                                    # labels carrying severity and target info
          name: instance
          severity: Critical
        annotations:                               # annotations
          summary: "{{ $labels.instance }}"        # alert summary
          description: "Service is down"           # alert message
          value: "{{ $value }}%"                   # current value when the alert fired
  - name: Host
    rules:
      - alert: HostMemoryUsage
        expr: 1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.8
        for: 1m
        labels:
          name: Memory
          severity: Warning
        annotations:
          summary: "{{ $labels.appname }}"
          description: "Host memory usage is above 80%."
          value: "{{ $value }}"
      - alert: HostCPUUsage
        expr: sum(avg without (cpu)(irate(node_cpu_seconds_total{mode!='idle'}[5m]))) by (instance,appname) > 0.65
        for: 1m
        labels:
          name: CPU
          severity: Warning
        annotations:
          summary: "{{ $labels.appname }}"
          description: "Host CPU usage is above 65%."
          value: "{{ $value }}"
      - alert: HostLoad
        expr: node_load5 > 4
        for: 1m
        labels:
          name: Load
          severity: Warning
        annotations:
          summary: "{{ $labels.appname }}"
          description: "Host 5-minute load average is above 4."
          value: "{{ $value }}"
      - alert: HostLoad
        expr: node_load1 > 10
        for: 1m
        labels:
          name: Load
          severity: Warning
        annotations:
          summary: "{{ $labels.appname }}"
          description: "Host 1-minute load average is above 10."
          value: "{{ $value }}"
      - alert: HostFilesystemUsage
        expr: 1 - (node_filesystem_free_bytes / node_filesystem_size_bytes) > 0.8
        for: 1m
        labels:
          name: Disk
          severity: Warning
        annotations:
          summary: "{{ $labels.appname }}"
          description: "Host partition [ {{ $labels.mountpoint }} ] usage is above 80%."
          value: "{{ $value }}%"
      - alert: HostDiskio
        expr: irate(node_disk_writes_completed_total{job=~"Host"}[1m]) > 10
        for: 1m
        labels:
          name: Diskio
          severity: Warning
        annotations:
          summary: "{{ $labels.appname }}"
          description: "Host disk [{{ $labels.device }}] 1-minute average write IO is high."
          value: "{{ $value }}iops"
      - alert: Network_receive
        expr: irate(node_network_receive_bytes_total{device!~"lo|bond[0-9]|eth[0-9]|cbr[0-9]|veth.|virbr.|ovs-system"}[5m]) / 1048576 > 3
        for: 1m
        labels:
          name: Network_receive
          severity: Warning
        annotations:
          summary: "{{ $labels.appname }}"
          description: "Host NIC [{{ $labels.device }}] 5-minute average receive rate is above 3MB/s."
          value: "{{ $value }}MB/s"
      - alert: Network_transmit
        expr: irate(node_network_transmit_bytes_total{device!~"lo|bond[0-9]|eth[0-9]|cbr[0-9]|veth.|virbr.|ovs-system"}[5m]) / 1048576 > 3
        for: 1m
        labels:
          name: Network_transmit
          severity: Warning
        annotations:
          summary: "{{ $labels.appname }}"
          description: "Host NIC [{{ $labels.device }}] 5-minute average transmit rate is above 3MB/s."
          value: "{{ $value }}MB/s"
  - name: Container
    rules:
      - alert: ContainerCPUUsage
        expr: (sum by(name,instance) (rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 60
        for: 1m
        labels:
          name: CPU
          severity: Warning
        annotations:
          summary: "{{ $labels.name }}"
          description: "Container CPU usage is above 60%."
          value: "{{ $value }}%"
      - alert: ContainerMemUsage
        expr: container_memory_usage_bytes{name=~".+"} / 1048576 > 1024
        for: 1m
        labels:
          name: Memory
          severity: Warning
        annotations:
          summary: "{{ $labels.name }}"
          description: "Container memory usage is above 1GB."
          value: "{{ $value }}MB"
  - name: Kafka
    rules:
      - alert: kafka_lag
        expr: kafka_consumergroup_lag > 180
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "Consumer group lag is greater than 180."
          value: "{{ $value }}"
  - name: Redis
    rules:
      - alert: rejected_connections
        expr: redis_rejected_connections_total > 0
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "Redis has hit its connection limit and is rejecting connections."
          value: "{{ $value }}"
      - alert: blocked_clients
        expr: irate(redis_blocked_clients[5m]) > 10
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "Redis is single-threaded; more than 10 clients blocked over 5 minutes."
          value: "{{ $value }}"
      - alert: slave
        expr: redis_connected_slaves == 1
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "Redis slave down."
          value: "{{ $value }}"
  - name: ES
    rules:
      - alert: es_cluster_node
        expr: es_cluster_nodes_number < 3
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "ES cluster is missing nodes."
          value: "{{ $value }}"
      - alert: es_cluster_datanodes_number
        expr: es_cluster_datanodes_number < 3
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "ES cluster is missing data nodes."
          value: "{{ $value }}"
      - alert: es_memory_usage
        expr: es_os_mem_used_bytes / es_os_mem_total_bytes * 100 > 80
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "ES memory usage is above 80%."
          value: "{{ $value }}"
      - alert: es_cpu_usage
        expr: es_os_cpu_percent > 0.6
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "ES CPU usage is above 60%."
          value: "{{ $value }}"
  - name: web
    rules:
      - alert: basevisitor
        expr: basevisitor != 200
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "basevisitor check failed."
          value: "{{ $value }}"
      - alert: km
        expr: km != 200
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "km check failed."
          value: "{{ $value }}"
      - alert: gtower
        expr: gtower != 200
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "gtower check failed."
          value: "{{ $value }}"
      - alert: im03
        expr: im03 != 200
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "im03 check failed."
          value: "{{ $value }}"
      - alert: immonitor
        expr: immonitor != 200
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "immonitor check failed."
          value: "{{ $value }}"
      - alert: volcano
        expr: volcano != 200
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "volcano check failed."
          value: "{{ $value }}"
      - alert: kfonline
        expr: kfonline != 302
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "kfonline check failed."
          value: "{{ $value }}"
      - alert: ocs
        expr: ocs != 403
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "ocs check failed."
          value: "{{ $value }}"
      - alert: fliter
        expr: fliter != 200
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "fliter check failed."
          value: "{{ $value }}"
  - name: mongo
    rules:
      - alert: cluster
        expr: mongodb_mongod_replset_my_state != 2
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "MongoDB replica set state abnormal."
          value: "{{ $value }}"

Restart Prometheus so the alerting rules are loaded:

docker run -d --restart=always \
-v /etc/localtime:/etc/localtime \
-v /data/monitor/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml \
-v /data/monitor/prometheus/rules.yml:/etc/prometheus/rules.yml \
--name prometheus \
--net=host docker.io/wang049718/prometheus --web.enable-lifecycle --config.file=/etc/prometheus/prometheus.yml
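Once the container is back up, the rules API can confirm the groups were actually loaded (assuming Prometheus is reachable on localhost:9090):
curl -s http://localhost:9090/api/v1/rules | grep -o '"name":"[^"]*"' | sort -u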

Deploy the Alertmanager alerting service. Configuration file:
cat alertmanager.yml
global:
  resolve_timeout: 2m
  smtp_smarthost: smtp.163.com:25
  smtp_from: [email protected]
  smtp_auth_username: [email protected]
  smtp_auth_password: wang049718

templates:                          # message templates
  - '/etc/alertmanager/template/wechat.tmpl'

route:
  group_by: ['alertname_wechat']
  group_wait: 30s
  group_interval: 60s
  receiver: 'email'                 # default receiver
  repeat_interval: 1h
  routes:                           # sub-route, also delivered via email
    - receiver: email
      match_re:
        severity: email

receivers:
  - name: 'email'
    email_configs:
      - to: '[email protected]'
        send_resolved: true         # also notify when the alert resolves
wechat.tmpl:
[root@localhost wang]# cat wechat.tmpl
{{ define "wechat.default.message" }}
{{ range $i, $alert := .Alerts }}
========== Alert ==========
Status:      {{ .Status }}
Severity:    {{ $alert.Labels.severity }}
Alert name:  {{ $alert.Labels.alertname }}
Application: {{ $alert.Annotations.summary }}
Host:        {{ $alert.Labels.instance }}
Details:     {{ $alert.Annotations.description }}
Value:       {{ $alert.Annotations.value }}
Fired at:    {{ $alert.StartsAt.Format "2006-01-02 15:04:05" }}
========== end ==========
{{ end }}
{{ end }}

docker run -d -p 9093:9093 --name alertmanager \
--restart always \
-v /etc/localtime:/etc/localtime \
-v /home/wang/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
-v /home/wang:/etc/alertmanager/template \
docker.io/wang049718/alertmanager
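To verify mail delivery end to end, a test alert can be injected by hand; this sketch assumes the container is reachable on localhost and uses the v1 alerts API (newer Alertmanager releases use /api/v2/alerts instead):
curl -XPOST http://localhost:9093/api/v1/alerts \
  -d '[{"labels":{"alertname":"ManualTest","severity":"Warning","instance":"test-host"},"annotations":{"summary":"manual test","description":"test alert sent with curl"}}]'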

6. MongoDB monitoring
docker run -itd --net=host -v /etc/localtime:/etc/localtime --name mongo mongo --auth
docker exec -it mongo mongo
use admin
db.createUser({
user: 'admin',
pwd: '123456',
roles: [{ "role": "userAdminAnyDatabase", "db": "admin" },
{ "role": "dbAdminAnyDatabase", "db": "admin" },
{ role: "root", db: "admin" }
]
})

docker exec -it mongo mongo -uadmin -p123456

use col
db.col.insert({title: 'MongoDB 教程',
description: 'MongoDB 是一个 Nosql 数据库',
by: '菜鸟教程',
url: 'http://www.runoob.com',
tags: ['mongodb', 'database', 'NoSQL'],
likes: 100
})
sudo docker run -d --name mongo-explorer \
--restart=always \
-p 9105:9104 \
--cpuset-cpus=2,1 \
-m 300m \
-v /etc/localtime:/etc/localtime \
docker.io/wang049718/mongo --mongodb.uri "mongodb://admin:[email protected]:27017"
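A quick check that the exporter can reach MongoDB; mongodb_up is exposed by common mongodb_exporter builds, and 9105 is the host port mapped above:
curl -s http://localhost:9105/metrics | grep mongodb_up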

The Grafana dashboard template is imported as a JSON file.
7. MySQL monitoring
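The DSN below logs in as monitoring:monitoring; if that account does not exist yet, a minimal grant along the lines recommended for mysqld_exporter looks like this (sketch only; adjust the host pattern and password to your own policy):
mysql -uroot -p -e "CREATE USER 'monitoring'@'%' IDENTIFIED BY 'monitoring'; GRANT PROCESS, REPLICATION CLIENT, SELECT ON *.* TO 'monitoring'@'%'; FLUSH PRIVILEGES;"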
sudo docker run -d --restart=always \
--net=host \
--cpuset-cpus=0,1 \
-m 1200m \
-e DATA_SOURCE_NAME="monitoring:monitoring@(172.21.10.22:3306)/" \
-v /etc/localtime:/etc/localtime \
--name mysql_exporter \
docker.io/wang049718/mysqld-exporter:latest
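mysqld_exporter listens on 9104 by default and, with --net=host, is reachable directly on the host; mysql_up is 1 when the DSN works:
curl -s http://localhost:9104/metrics | grep mysql_up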
8. Redis monitoring
docker stop redis-server
docker rm redis-server
docker run -d --name redis-server -p 6379:6379 \
-v /etc/localtime:/etc/localtime \
-v /home/redis:/data \
--restart always redis \
--requirepass "123456" --appendonly yes
docker run -d --name redis_exporter \
-p 9121:9121 \
-v /etc/localtime:/etc/localtime \
--restart always docker.io/wang049718/redis_exporter \
--redis.addr redis://1.1.1.4:6379 -redis.password 123456

sudo docker run -d --name redis_exporter \
--net=host \
-v /etc/localtime:/etc/localtime \
--cpuset-cpus=0,1 \
-m 1200m \
--restart always \
docker.io/wang049718/redis_exporter \
--redis.addr redis://172.21.10.11:6379
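On the host running the exporter, redis_up should be 1 when the exporter can talk to Redis:
curl -s http://localhost:9121/metrics | grep redis_up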

9. Kafka monitoring
sudo docker stop kafka
sudo docker rm kafka
sudo docker run -d --restart=always \
--cpuset-cpus=3,2 \
-m 1200m \
-p 9308:9308 \
-v /etc/localtime:/etc/localtime \
--name kafka \
docker.io/wang049718/kafka_exporter \
/kafka_exporter-1.2.0.linux-amd64/kafka_exporter --kafka.server=172.21.10.4:9092
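Sanity check that the exporter can reach the broker and that the lag metric used by the kafka_lag rule is present:
curl -s http://localhost:9308/metrics | grep kafka_consumergroup_lag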

The Grafana dashboard template is imported as a JSON file.

10. Pushgateway

sudo docker stop pushgateway
sudo docker rm pushgateway
sudo docker run -d --restart=always \
-v /etc/localtime:/etc/localtime \
--name pushgateway -p 9091:9091 \
docker.io/wang049718/pushgateway:latest

Open port 9091 in a browser (http://pushgatewayIP:9091) to see the Pushgateway web UI.

Add a scrape job for the Pushgateway to the Prometheus configuration file:

  - job_name: 'pushgateway'
    honor_labels: true   # keep the job/instance labels pushed by clients instead of letting Prometheus overwrite them
    static_configs:
      - targets: ['pushgatewayIP:9091']

Write test
Push a single metric:
echo "test 123" | curl --data-binary @- http://localhost:9091/metrics/job/test
This pushes one sample from the monitored machine to the Pushgateway: the metric name is "test", the value is 123, and it is registered under the job "test" (http://pushgatewayIP:9091/metrics/job/test).

The test value can now be queried in the Prometheus graph UI.
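A pushed metric group can be removed again through the same path using the standard delete API:
curl -X DELETE http://localhost:9091/metrics/job/test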

API format:
  http://pushgatewayIP:9091/metrics/job/<job_name>/<label_name>/<label_value>   (the label name is usually instance)
Examples:
  http://pushgatewayIP:9091/metrics/job/sb/instance/si
  http://pushgatewayIP:9091/metrics/job/testjob/abc/pushgateway1
  http://pushgatewayIP:9091/metrics/job/testjob/yyy/pushgateway1
Push to each of the three URLs above, then open the Pushgateway web UI to see the resulting groups.

Multiple metrics can be pushed in one request with a heredoc (the target job/instance path below is illustrative):
cat <<EOF | curl --data-binary @- http://localhost:9091/metrics/job/test/instance/docker
# TYPE docker_runtime counter
docker_runtime{name="cadvisor"} 33
docker_runtime{name="nginx"} 331
docker_runtime{name="abc"} 332
EOF

echo "basevisitor curl -I -m 10 -o /dev/null -s -w %{http_code} http://kf-basevisitor.ziroom.com" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/basevisitor
echo "km curl -I -m 10 -o /dev/null -s -w %{http_code} https://kf-km.ziroom.com/backend/health" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/km
echo "gtower curl -I -m 10 -o /dev/null -s -w %{http_code} http://kf-gtower.ziroom.com:7002" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/gtower
echo "im03 curl -I -m 10 -o /dev/null -s -w %{http_code} http://kf-im03.ziroom.com/health-check" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/im03
echo "fliter curl -I -m 10 -o /dev/null -s -w %{http_code} http://kf-fliter.ziroom.com" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/fliter
echo "immonitor curl -I -m 10 -o /dev/null -s -w %{http_code} http://kf-immonitor.ziroom.com/health-check" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/immonitor
echo "volcano curl -I -m 10 -o /dev/null -s -w %{http_code} http://kf-volcano.ziroom.com/monitor/group_all_agent" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/volcano
echo "kfonline curl -I -m 10 -o /dev/null -s -w %{http_code} http://kfonline.ziroom.com" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/kfonline
echo "push curl -I -m 10 -o /dev/null -s -w %{http_code} http://kf-push.ziroom.com:7002" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/push
echo "ocs curl -I -m 10 -o /dev/null -s -w %{http_code} http://kf-ocs.ziroom.com/minio/login" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/ocs
11. Elasticsearch monitoring

Install the Prometheus exporter plugin on each Elasticsearch node:
/usr/local/services/elasticsearch/bin/elasticsearch-plugin install file:///home/webuser/package/elasticsearch-prometheus-exporter-5.6.4.0.zip

Restart Elasticsearch after installing the plugin.
On the Prometheus server side, add the scrape job to prometheus.yml:

  - job_name: elasticsearch
    scrape_interval: 5s
    metrics_path: "/_prometheus/metrics"
    file_sd_configs:
      - files:
          - es.yml

es.yml (placed in the same directory as prometheus.yml):

- targets:
    - 172.21.10.10:9200
    - 172.21.8.49:9200
    - 172.21.10.12:9200
  labels:
    server: c2-jenkins

Import Grafana dashboard template ID 266.

sudo docker stop prometheus
sudo docker rm prometheus
sudo docker run -d --restart=always \
-v /etc/localtime:/etc/localtime \
-v /data/monitor/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml \
-v /data/monitor/prometheus/es.yml:/etc/prometheus/es.yml \
-v /data/monitor/prometheus/rules.yml:/etc/prometheus/rules.yml \
--name prometheus \
--net=host prom/prometheus --web.enable-lifecycle --config.file=/etc/prometheus/prometheus.yml
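To confirm the plugin endpoint Prometheus will scrape is actually serving data, query one of the nodes listed in es.yml:
curl -s http://172.21.10.10:9200/_prometheus/metrics | head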

12. Nginx monitoring

Nginx 1.10.3 is used here. Install the build dependencies:
apt-get -y update
apt-get -y install libpcre3 libpcre3-dev gcc
apt-get -y install openssl libssl-dev libxslt-dev libgd-dev libgeoip-dev

git clone git://github.com/vozlt/nginx-module-vts.git

wget http://nginx.org/download/nginx-1.10.3.tar.gz
tar xvf nginx-1.10.3.tar.gz
cd nginx-1.10.3
./configure --with-cc-opt='-g -O2 -fPIE -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2' --with-ld-opt='-Wl,-Bsymbolic-functions -fPIE -pie -Wl,-z,relro -Wl,-z,now' --prefix=/usr/share/nginx --conf-path=/etc/nginx/nginx.conf --http-log-path=/var/log/nginx/access.log --error-log-path=/var/log/nginx/error.log --lock-path=/var/lock/nginx.lock --pid-path=/run/nginx.pid --http-client-body-temp-path=/var/lib/nginx/body --http-fastcgi-temp-path=/var/lib/nginx/fastcgi --http-proxy-temp-path=/var/lib/nginx/proxy --http-scgi-temp-path=/var/lib/nginx/scgi --http-uwsgi-temp-path=/var/lib/nginx/uwsgi --with-debug --with-pcre-jit --with-ipv6 --with-http_ssl_module --with-http_stub_status_module --with-http_realip_module --with-http_auth_request_module --with-http_addition_module --with-http_dav_module --with-http_geoip_module --with-http_gunzip_module --with-http_gzip_static_module --with-http_image_filter_module --with-http_v2_module --with-http_sub_module --with-http_xslt_module --with-stream --with-stream_ssl_module --with-mail --with-mail_ssl_module --with-threads --add-module=/home/webuser/nginx-module-vts

make && make install
make upgrade

rm -rf /usr/sbin/nginx
ln -s /usr/share/nginx/sbin/nginx /usr/sbin/
nginx -V

In the http block:
    vhost_traffic_status_zone;
    vhost_traffic_status_filter_by_host on;

In the server block:
    location /status {
        vhost_traffic_status_display;
        vhost_traffic_status_display_format html;
    }
Note: do not use localhost here, otherwise it does not take effect.

wget -c https://github.com/hnlq715/nginx-vts-exporter/releases/download/v0.9.1/nginx-vts-exporter-0.9.1.linux-amd64.tar.gz
tar -xvf nginx-vts-exporter-0.9.1.linux-amd64.tar.gz -C /usr/local/
cd /usr/local/nginx-vts-exporter-0.9.1.linux-amd64/
./nginx-vts-exporter -nginx.scrape_uri http://172.21.10.3:7002/status/format/json &
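A quick check that the exporter is translating the vts JSON into Prometheus metrics:
curl -s http://localhost:9913/metrics | grep '^nginx_'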

The exporter listens on port 9913.

sudo docker stop nginx-vts-exporter
sudo docker rm nginx-vts-exporter
sudo docker run -d --name nginx-vts-exporter \
--restart=always \
-p 9913:9913 \
--cpuset-cpus=2,1 \
-e nginx.scrape_uri='http://172.21.10.3:7002/status/format/json' \
-e NGINX_HOST=http://172.21.10.3:7002 \
-m 300m \
-v /etc/localtime:/etc/localtime \
docker.io/sophos/nginx-vts-exporter

Check the data on port 9913: http://1.1.1.4:9913/metrics

Grafana configuration: import dashboard template ID 2949.