1、需求说明:
采用cadvisor+influxdb+grafana进行容器监控并数据展示,同时收集tcpstats相关数据。
启动时将默认的-disable_metrics置空(即不禁用任何metric),启动参数如下:
[program:cadvisor]
command=/root/go/src/cadvisor/cadvisor -port=18080 -logtostderr=true -v=5 -enable_load_reader=true -storage_duration=15s -disable_metrics="" -docker_only=true -storage_driver=influxdb -storage_driver_db=influxdb -storage_driver_user=influxdb -storage_driver_password=influxdb -storage_driver_host="127.0.0.1:18086"
numprocs=1
autostart=true
autorestart=true
startsecs=3
startretries=5
stopasgroup=true
killasgroup=true
stdout_logfile=/var/log/supervisor/cadvisor_out.log
stderr_logfile=/var/log/supervisor/cadvisor_err.log
2、问题定位
启动后发现写入influxdb的数据中并没有tcpstats相关的值,于是查看源码如下:
cat storage/influxdb/influxdb.go
...
50 // Series names
51 const (
52 // Cumulative CPU usage
53 serCpuUsageTotal string = "cpu_usage_total"
54 serCpuUsageSystem string = "cpu_usage_system"
55 serCpuUsageUser string = "cpu_usage_user"
56 serCpuUsagePerCpu string = "cpu_usage_per_cpu"
57 // Smoothed average of number of runnable threads x 1000.
58 serLoadAverage string = "load_average"
59 // Memory Usage
60 serMemoryUsage string = "memory_usage"
61 // Working set size
62 serMemoryWorkingSet string = "memory_working_set"
63 // Cumulative count of bytes received.
64 serRxBytes string = "rx_bytes"
65 // Cumulative count of receive errors encountered.
66 serRxErrors string = "rx_errors"
67 // Cumulative count of bytes transmitted.
68 serTxBytes string = "tx_bytes"
69 // Cumulative count of transmit errors encountered.
70 serTxErrors string = "tx_errors"
71 // Filesystem device.
72 serFsDevice string = "fs_device"
73 // Filesystem limit.
74 serFsLimit string = "fs_limit"
75 // Filesystem usage.
76 serFsUsage string = "fs_usage"
77 )
...
...
180 func (self *influxdbStorage) containerStatsToPoints(
181 cInfo *info.ContainerInfo,
182 stats *info.ContainerStats,
183 ) (points []*influxdb.Point) {
184 // CPU usage: Total usage in nanoseconds
185 points = append(points, makePoint(serCpuUsageTotal, stats.Cpu.Usage.Total))
186
187 // CPU usage: Time spend in system space (in nanoseconds)
188 points = append(points, makePoint(serCpuUsageSystem, stats.Cpu.Usage.System))
189
190 // CPU usage: Time spent in user space (in nanoseconds)
191 points = append(points, makePoint(serCpuUsageUser, stats.Cpu.Usage.User))
192
193 // CPU usage per CPU
194 for i := 0; i < len(stats.Cpu.Usage.PerCpu); i++ {
195 point := makePoint(serCpuUsagePerCpu, stats.Cpu.Usage.PerCpu[i])
196 tags := map[string]string{"instance": fmt.Sprintf("%v", i)}
197 addTagsToPoint(point, tags)
198
199 points = append(points, point)
200 }
201
202 // Load Average
203 points = append(points, makePoint(serLoadAverage, stats.Cpu.LoadAverage))
204
205 // Memory Usage
206 points = append(points, makePoint(serMemoryUsage, stats.Memory.Usage))
207
208 // Working Set Size
209 points = append(points, makePoint(serMemoryWorkingSet, stats.Memory.WorkingSet))
210
211 // Network Stats
212 points = append(points, makePoint(serRxBytes, stats.Network.RxBytes))
213 points = append(points, makePoint(serRxErrors, stats.Network.RxErrors))
214 points = append(points, makePoint(serTxBytes, stats.Network.TxBytes))
215 points = append(points, makePoint(serTxErrors, stats.Network.TxErrors))
216 self.tagPoints(cInfo, stats, points)
217
218 return points
219 }
...
源码中并未将tcpstats相关的metric写入influxdb,所以需要手动添加。
3、问题解决
修改源代码,将需要的数据添加到points中。
vim storage/influxdb/influxdb.go
...
50 // Series names
51 const (
52 // Cumulative CPU usage
53 serCpuUsageTotal string = "cpu_usage_total"
54 serCpuUsageSystem string = "cpu_usage_system"
55 serCpuUsageUser string = "cpu_usage_user"
56 serCpuUsagePerCpu string = "cpu_usage_per_cpu"
57 // Smoothed average of number of runnable threads x 1000.
58 serLoadAverage string = "load_average"
59 // Memory Usage
60 serMemoryUsage string = "memory_usage"
61 // Working set size
62 serMemoryWorkingSet string = "memory_working_set"
63 // Cumulative count of bytes received.
64 serRxBytes string = "rx_bytes"
65 // Cumulative count of receive errors encountered.
66 serRxErrors string = "rx_errors"
67 // Cumulative count of bytes transmitted.
68 serTxBytes string = "tx_bytes"
69 // Cumulative count of transmit errors encountered.
70 serTxErrors string = "tx_errors"
71 // Filesystem device.
72 serFsDevice string = "fs_device"
73 // Filesystem limit.
74 serFsLimit string = "fs_limit"
75 // Filesystem usage.
76 serFsUsage string = "fs_usage"
77 // Tcp Established count.
78 serEsTabs string = "tcp_estab"
79 // Tcp TimeWait count.
80 serTimeWait string = "tcp_timewait"
81 // Tcp CloseWait count.
82 serCloseWait string = "tcp_closewait"
83 )
...
...
180 func (self *influxdbStorage) containerStatsToPoints(
181 cInfo *info.ContainerInfo,
182 stats *info.ContainerStats,
183 ) (points []*influxdb.Point) {
184 // CPU usage: Total usage in nanoseconds
185 points = append(points, makePoint(serCpuUsageTotal, stats.Cpu.Usage.Total))
186
187 // CPU usage: Time spend in system space (in nanoseconds)
188 points = append(points, makePoint(serCpuUsageSystem, stats.Cpu.Usage.System))
189
190 // CPU usage: Time spent in user space (in nanoseconds)
191 points = append(points, makePoint(serCpuUsageUser, stats.Cpu.Usage.User))
192
193 // CPU usage per CPU
194 for i := 0; i < len(stats.Cpu.Usage.PerCpu); i++ {
195 point := makePoint(serCpuUsagePerCpu, stats.Cpu.Usage.PerCpu[i])
196 tags := map[string]string{"instance": fmt.Sprintf("%v", i)}
197 addTagsToPoint(point, tags)
198
199 points = append(points, point)
200 }
201
202 // Load Average
203 points = append(points, makePoint(serLoadAverage, stats.Cpu.LoadAverage))
204
205 // Memory Usage
206 points = append(points, makePoint(serMemoryUsage, stats.Memory.Usage))
207
208 // Working Set Size
209 points = append(points, makePoint(serMemoryWorkingSet, stats.Memory.WorkingSet))
210
211 // Network Stats
212 points = append(points, makePoint(serRxBytes, stats.Network.RxBytes))
213 points = append(points, makePoint(serRxErrors, stats.Network.RxErrors))
214 points = append(points, makePoint(serTxBytes, stats.Network.TxBytes))
215 points = append(points, makePoint(serTxErrors, stats.Network.TxErrors))
216 points = append(points, makePoint(serEsTabs, stats.Network.Tcp.Established))
217 points = append(points, makePoint(serTimeWait, stats.Network.Tcp.TimeWait))
218 points = append(points, makePoint(serCloseWait, stats.Network.Tcp.CloseWait))
219
220 self.tagPoints(cInfo, stats, points)
221
222 return points
223 }
...
4、数据验证:
相关代码已经修改完成并提交到我的个人仓库中,需要的可以直接拉取进行make build,也可以直接下载cadvisor_with_tcp这个二进制文件执行。
代码仓库:https://github.com/mmjl/cadvisor.git
至此,cadvisor+influxdb无法收集TCP连接状态(如tcp_estab)数量的问题已经解决。