prometheus_client 基本用法
安装 prometheus_client
# pip install prometheus_client
编写 prom_demo.py 如下
# coding: utf-8
# Minimal prometheus_client demo: expose two gauges on port 5000.
# See https://github.com/prometheus/client_python#gauge
import time

from prometheus_client import Gauge, start_http_server

value = 404
# A metric name (http_code here) may be registered only once; registering it
# again raises "ValueError: Duplicated timeseries in CollectorRegistry".
http_code = Gauge('http_code', 'HTTP CODE')
http_code.set(value)

# Usage: Gauge('metric name', 'metric help text', ['label1', 'label2'])
# Label names must be declared when the Gauge is created (e.g. ['IP', 'HOSTNAME'])
# before they can be used in labels(IP='10.0.0.1', HOSTNAME='foobar').
cpu_usage = Gauge('cpu_usage', 'CPU USAGE', ['IP', 'HOSTNAME'])

start_http_server(5000)

while True:
    for sample in range(10):
        # The value must be numeric (it has to match a Go numeric type).
        cpu_usage.labels(IP='10.0.0.1', HOSTNAME='foobar').set(sample)
        # Pause between updates so the demo does not spin a CPU core at 100%.
        time.sleep(1)
运行 python prom_demo.py,打开浏览器地址 http://127.0.0.1:5000/metrics,可以看到结果
实现站点监控 exporter
安装 prometheus_client pycurl flask pyyaml
pip install prometheus_client pycurl flask pyyaml
编写 site-monitor-exporter.py
# coding: utf-8
import yaml
import os
import pycurl
import time
from StringIO import StringIO
from prometheus_client.core import CollectorRegistry
from prometheus_client import Gauge, generate_latest
from flask import Flask, Response
def get_config(filename):
    """Parse the YAML file at ``filename`` and return the loaded object.

    Args:
        filename: Path of the YAML configuration file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file's content.
    """
    with open(filename, "r") as stream:
        return yaml.safe_load(stream)
def get_site_status(url):
    """Request ``url`` once with pycurl and return timing/transfer statistics.

    Args:
        url: Address to probe; passed unchanged to ``pycurl.URL``.

    Returns:
        dict with the keys ``namelookup_time``, ``connect_time``,
        ``pretransfer_time``, ``starttransfer_time`` and ``total_time``
        (libcurl reports seconds; they are converted to milliseconds here),
        ``http_code``, ``size_download`` and ``header_size`` (bytes) and
        ``speed_download`` (bytes per second). If the request fails, the
        error is printed and every value is 0 except ``http_code``, which
        is 444.
    """
    ms_per_second = 1000
    # Fallback result, returned unchanged when the site cannot be reached.
    data = {'namelookup_time': 0, 'connect_time': 0, 'pretransfer_time': 0,
            'starttransfer_time': 0, 'total_time': 0, 'http_code': 444,
            'size_download': 0, 'header_size': 0, 'speed_download': 0}
    c = pycurl.Curl()
    try:
        c.setopt(pycurl.URL, url)
        # Maximum time to wait for the connection to be established.
        c.setopt(pycurl.CONNECTTIMEOUT, 5)
        # Maximum time allowed for the whole request.
        c.setopt(pycurl.TIMEOUT, 5)
        # Suppress the download progress meter.
        c.setopt(pycurl.NOPROGRESS, 1)
        # Close the connection after the transfer instead of reusing it.
        c.setopt(pycurl.FORBID_REUSE, 1)
        # Follow at most one HTTP redirect.
        c.setopt(pycurl.MAXREDIRS, 1)
        # Keep DNS cache entries for 10 seconds.
        c.setopt(pycurl.DNS_CACHE_TIMEOUT, 10)
        # NOTE(review): certificate verification is disabled, so sites with an
        # invalid or self-signed certificate are still probed; confirm this is
        # intended before pointing the exporter at untrusted networks.
        c.setopt(pycurl.SSL_VERIFYPEER, 0)
        # Only the transfer statistics are needed, so the response body is
        # discarded instead of being buffered in memory. Returning None tells
        # pycurl the whole chunk was consumed.
        c.setopt(pycurl.WRITEFUNCTION, lambda chunk: None)
        try:
            c.perform()
            # Field meanings: https://curl.haxx.se/libcurl/c/curl_easy_getinfo.html
            # `data` is rebound only after every getinfo call has succeeded,
            # so a failure part-way through still returns the fallback values.
            data = {
                # Time until name resolution completed.
                'namelookup_time': c.getinfo(c.NAMELOOKUP_TIME) * ms_per_second,
                # Time until the connection was established.
                'connect_time': c.getinfo(c.CONNECT_TIME) * ms_per_second,
                # Time until the transfer was just about to begin.
                'pretransfer_time': c.getinfo(c.PRETRANSFER_TIME) * ms_per_second,
                # Time until the first byte was received.
                'starttransfer_time': c.getinfo(c.STARTTRANSFER_TIME) * ms_per_second,
                # Total time of the transfer.
                'total_time': c.getinfo(c.TOTAL_TIME) * ms_per_second,
                # HTTP status code of the response.
                'http_code': c.getinfo(c.HTTP_CODE),
                # Number of bytes downloaded.
                'size_download': c.getinfo(c.SIZE_DOWNLOAD),
                # Size of the received headers in bytes.
                'header_size': c.getinfo(c.HEADER_SIZE),
                # Average download speed in bytes per second.
                'speed_download': c.getinfo(c.SPEED_DOWNLOAD),
            }
        except pycurl.error as e:
            # Site unreachable: log it and keep the fallback values in `data`.
            print("{} connection error: {}".format(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), str(e)))
    finally:
        # Release the handle on every path, including a failing setopt.
        c.close()
    return data
# Metric definitions. Every gauge carries a single 'url' label and lives in a
# dedicated registry, which is the one rendered by the /metrics handler.
registry = CollectorRegistry(auto_describe=False)
# Timing gauges are fed with millisecond values by get_site_status().
namelookup_time = Gauge('namelookup_time', 'namelookup time', ['url'], registry=registry)
connect_time = Gauge('connect_time', 'connect time', ['url'], registry=registry)
pretransfer_time = Gauge('pretransfer_time', 'pretransfer time', ['url'], registry=registry)
starttransfer_time = Gauge('starttransfer_time', 'starttransfer time', ['url'], registry=registry)
total_time = Gauge('total_time', 'total time', ['url'], registry=registry)
# Sizes are in bytes, the download speed in bytes per second.
size_download = Gauge('size_download', 'size download', ['url'], registry=registry)
header_size = Gauge('header_size', 'header size', ['url'], registry=registry)
speed_download = Gauge('speed_download', 'speed download', ['url'], registry=registry)
# HTTP status code of the last probe (444 when the probe failed).
http_code = Gauge('http_code', 'http code', ['url'], registry=registry)

app = Flask(__name__)
@app.route("/metrics")
def main():
    """Probe every configured URL and return the metrics in Prometheus text format.

    ``config.yml`` is read from the directory of this script on every
    request, and each entry under its ``urls`` key is probed in turn with
    ``get_site_status``. An empty config file or a missing/empty ``urls``
    key yields a response without new samples instead of an error.

    Returns:
        A Flask ``Response`` holding ``generate_latest(registry)`` with the
        ``text/plain`` mimetype.
    """
    filename = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.yml")
    # yaml.safe_load returns None for an empty file; treat that as "no URLs".
    res = get_config(filename) or {}
    # Result key -> gauge that stores it; replaces a nine-branch if/elif chain.
    gauges = {
        'namelookup_time': namelookup_time,
        'connect_time': connect_time,
        'pretransfer_time': pretransfer_time,
        'starttransfer_time': starttransfer_time,
        'total_time': total_time,
        'size_download': size_download,
        'header_size': header_size,
        'speed_download': speed_download,
        'http_code': http_code,
    }
    for url in res.get('urls') or []:
        data = get_site_status(url)
        print(data)
        for key, value in data.items():
            gauge = gauges.get(key)
            # Keys without a matching gauge are skipped.
            if gauge is not None:
                gauge.labels(url=url).set(float(value))
    return Response(generate_latest(registry), mimetype="text/plain")
if __name__ == '__main__':
    # Start Flask's built-in server on all interfaces, port 5000, when the
    # file is executed directly.
    listen_on = dict(host='0.0.0.0', port=5000)
    app.run(**listen_on)
要监控的地址加到 config.yml 即可
urls:
- https://www.qq.com
- http://api.map.baidu.com/
最终采集到的数据
部署到服务器上(CentOS 7)
# ls /data/site-monitor-exporter/
config.yml site-monitor-exporter.py
# vim /etc/systemd/system/site-monitor-exporter.service
[Unit]
Description=site-monitor-exporter
After=network.target
[Service]
Type=simple
ExecStart=/usr/bin/python /data/site-monitor-exporter/site-monitor-exporter.py
Restart=on-failure
[Install]
WantedBy=multi-user.target
# systemctl start site-monitor-exporter
# systemctl status site-monitor-exporter
prometheus.yml 中添加配置(这里 site-monitor-exporter 跟 prometheus 在同一台服务器上)
  - job_name: 'site'
    static_configs:
      - targets: ['localhost:5000']
热加载 prometheus 配置
curl -X POST http://127.0.0.1:9090/-/reload
grafana 展示
新建 dashboard
设置 url 为变量,用于匹配不同的 url
配置每个 metric 的查询语句
配置每个 metric 的相应的单位
最终的效果图
参考:
Python自动化运维 : 技术与最佳实践 2.4 探测Web服务质量方法
jenkins_exporter:https://github.com/lovoo/jenkins_exporter