#升级yum
sudo yum update
#卸载旧版本docker
sudo yum remove docker docker-common docker-selinux docker-engine
#安装依赖
sudo yum install -y yum-utils device-mapper-persistent-data lvm2
#设置源
sudo yum-config-manager --add-repo http://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo
sudo yum makecache fast
#安装docker
sudo yum install docker-ce
#启动服务
sudo systemctl start docker
安装成功后查看版本:
查看版本命令:docker -v
结果:Docker version 20.10.12, build e91ed57
在下载镜像前,需要设置一下国内源,用来提高下载速度,
创建文件: sudo vim /etc/docker/daemon.json
添加如下代码:
{
"registry-mirrors": ["https://d7grpode.mirror.aliyuncs.com"]
}
重启docker:
sudo systemctl restart docker
随后拉取Prometheus的Docker镜像:
docker pull prom/prometheus:latest
这里我们以监控Redis数据库为例子,所以还需要拉取redis和redis状态收集器两个镜像:
docker pull redis
docker pull oliver006/redis_exporter:latest
分别启动redis和redis状态收集器:
启动redis:
docker run -d --name redis -p 6379:6379 redis
启动redis状态收集器:
# redis_exporter监听服务器上的redis服务,而redis_exporter运行在9121端口上,注意redis的地址写服务器的公网ip(180.76.231.32)
docker run -d --name redis_exporter -p 9121:9121 oliver006/redis_exporter:latest --redis.addr redis://180.76.231.32:6379
运行docker ps查看服务:
[root@instance-53r3vagg tmp]# docker ps
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
51349113892a redis "docker-entrypoint.s…" 52 minutes ago Up 52 minutes 0.0.0.0:6379->6379/tcp redis
0ffcf81ea7ff oliver006/redis_exporter:latest "/redis_exporter --r…" About an hour ago Up About an hour 0.0.0.0:9121->9121/tcp redis_exporter
随后创建prometheus的配置文件
vim /tmp/prometheus.yml
加入下面代码:
#注意 targets:["服务器公网ip:9121"]
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'redis'
    # Override the global default and scrape targets from this job every 5 seconds.
    scrape_interval: 5s
    static_configs:
      - targets: ['180.76.231.32:9121']
创建文件:
mkdir /etc/prometheus
cd /etc/prometheus
touch prometheus.yml
将下面代码加入该文件中:
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'redis'
    # Override the global default and scrape targets from this job every 5 seconds.
    scrape_interval: 5s
    static_configs:
      - targets: ['180.76.231.32:9121']
这里每隔5秒就获取一下服务运行信息,注意服务器地址要写公网ip,随后启动prometheus服务:
docker run -d -p 9090:9090 -v /tmp/prometheus.yml:/etc/prometheus/prometheus.yml prom/prometheus:latest
此时,prometheus就运行在9090端口上,访问一下:180.76.231.32:9090/targets
监控ok,但是如果redis服务突然停止,那我们如何解决这个问题,这时候可以引入钉钉机器人做一个报警,实时监控redis服务的状态,一旦停止,机器人发送消息,那我们开始创建机器人:
登录钉钉->群聊->设置->智能群助手->添加机器人->自定义->添加->
ip地址输入服务器公网ip:180.76.231.32
点击完成: (#这里有一个access_token一定要记住,后面开启时需要)
接下来利用docker拉取两个镜像:
docker pull prom/alertmanager:latest
docker pull timonwong/prometheus-webhook-dingtalk:v1.4.0
分别是prometheus的告警模块以及钉钉机器人插件,流程是如果prometheus检测到服务器异常,就会通过请求钉钉机器人的webhook地址来发送告警通知。
编写告警配置文件:
vim /tmp/alertmanager.yml
添加代码:
global:
  resolve_timeout: 5m
route:
  receiver: webhook
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 5m
  group_by: [alertname]
  routes:
    - receiver: webhook
      group_wait: 10s
receivers:
  - name: webhook
    webhook_configs:
      - url: http://180.76.231.32:8060/dingtalk/webhook1/send
        send_resolved: true
同时编写告警规则:
vim /tmp/redis.rules
添加代码:
groups:
  - name: redis
    rules:
      - alert: redis
        expr: up{job="redis"} == 0
        for: 15s
        labels:
          severity: 1
          team: node
        annotations:
          summary: "恭喜您,您的redis服务已经挂掉啦"
最后,修改一下prometheus的配置文件,将告警设置配置好:
vim /tmp/prometheus.yml
修改代码:
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'redis'
    # Override the global default and scrape targets from this job every 5 seconds.
    scrape_interval: 5s
    static_configs:
      - targets: ['180.76.231.32:9121']
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 180.76.231.32:9093
rule_files:
  - "/etc/prometheus/redis.rules"
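重启prometheus服务(需要先停止并删除之前启动的prometheus容器,否则9090端口仍被占用,新容器会启动失败):
docker rm -f $(docker ps -q --filter ancestor=prom/prometheus:latest)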
docker run -d -p 9090:9090 -v /tmp/prometheus.yml:/etc/prometheus/prometheus.yml -v /tmp/redis.rules:/etc/prometheus/redis.rules prom/prometheus:latest
启动告警模块:
docker run -d --name alertmanager -p 9093:9093 -v /tmp/alertmanager.yml:/etc/alertmanager/alertmanager.yml prom/alertmanager:latest
启动钉钉插件:#注意将token替换成自己的机器人token。
docker run -d -p 8060:8060 --name webhook timonwong/prometheus-webhook-dingtalk:v1.4.0 --ding.profile="webhook1=https://oapi.dingtalk.com/robot/send?access_token=钉钉token"
重新访问180.76.231.32:9090/rules
看state是否为OK,
OK代表警告配置已经生效了,模拟下redis挂掉
docker stop redis_exporter
再次刷新180.76.231.32:9090/rules,查看prometheus监控立刻发现问题:
状态state是否发出红色警告:DOWN
宕机持续15秒(对应告警规则中的for: 15s),则会立刻触发firing警告:
此时钉钉机器人立刻发送信息: