python实现系统指标采集

前言

这周公司新上的项目需要压测,根据各个压测场景,需要拿到linux服务器不同的系统消耗指标。

思来想去觉得还是使用python更轻量,也更容易被后续的第三方agent来执行,就写了这样的一个指标采集工具。

指标采集

指标包括cpu、内存、io、网卡等一系列常见的性能指标,具体的指标以及计算也可以参考github上的淘宝开源项目tsar

整体的采集思路非常简单,分为两种:

  • 读取特定的文件,解析文件,格式化数据;
  • 执行指定命令,获取输出,格式化数据

所有的指标都乘以了一个系数,我贪快,所以全都直接写的10000 :(

具体的数据解析可以自行cat输出对应的文件,结合命令输出来对比

1.负载

从/proc/loadavg文件中读取

def collector_load(path="/proc/loadavg"):
    """Read the 1-, 5- and 15-minute load averages.

    Args:
        path: File to parse; defaults to ``/proc/loadavg``. The first three
            whitespace-separated fields are taken as the load values.

    Returns:
        dict with the keys ``load1``, ``load5`` and ``load15``; each value is
        the load average multiplied by 10000, as an int.

    Raises:
        IOError/OSError: if the file cannot be opened.
        IndexError, ValueError: if the file does not start with three numbers.
    """
    # ``with`` closes the file even when read() raises.
    with open(path) as load_file:
        fields = load_file.read().split()

    def scaled(text):
        # float() replaces string.atof(), which only exists on Python 2.
        # round() before int() so that binary float error cannot turn
        # e.g. "0.57" into 5699 instead of 5700.
        return int(round(float(text) * 10000))

    return {
        "load1": scaled(fields[0]),
        "load5": scaled(fields[1]),
        "load15": scaled(fields[2]),
    }

2. 内存

从/proc/meminfo中读取

def collect_memory_info(path="/proc/meminfo"):
    """Collect memory usage ratios from a meminfo-style file.

    Args:
        path: File to parse; defaults to ``/proc/meminfo``. Every line is
            expected to look like ``Name: <integer> [unit]``.

    Returns:
        dict with the keys ``mem_buff``, ``mem_util`` and ``mem_cache``.
        Each value is a fraction of ``MemTotal`` multiplied by 10000 and
        truncated to an int.

    Raises:
        KeyError: if MemTotal, MemFree, Buffers or Cached is missing.
        ZeroDivisionError: if MemTotal is 0.
    """
    memory_buffer = {}
    with open(path) as mem_file:
        for line in mem_file:
            name, sep, rest = line.partition(":")
            values = rest.split()
            if not sep or not values:
                # Not a "Name: value" line; skip it instead of crashing.
                continue
            # int() replaces string.atoi(), which only exists on Python 2.
            memory_buffer[name] = int(values[0])

    # Keep only the metrics we care about.
    mem_total = memory_buffer["MemTotal"]
    buffers = memory_buffer["Buffers"]
    cached = memory_buffer["Cached"]
    # Buffers and Cached are counted as free, so mem_util excludes them.
    mem_free = memory_buffer["MemFree"] + buffers + cached

    return {
        "mem_buff": int(float(buffers) / float(mem_total) * 10000),
        "mem_util": int((float(mem_total - mem_free) / float(mem_total)) * 10000),
        "mem_cache": int(float(cached) / float(mem_total) * 10000),
    }

3. cpu信息

从/proc/stat中获取

def collect_cpu_info(path="/proc/stat"):
    """Read the cumulative CPU counters from the aggregate ``cpu`` line.

    Args:
        path: File to parse; defaults to ``/proc/stat``.

    Returns:
        dict with the keys ``User``, ``Sys``, ``Idle``, ``Steal``, ``Wait``
        and ``Total``. ``Total`` is the sum of every numeric column on the
        ``cpu`` line. An empty dict is returned when no line whose first
        field is exactly ``cpu`` exists.

    Raises:
        IndexError: if the ``cpu`` line has fewer than eight counters.
    """
    with open(path) as cpu_file:
        for line in cpu_file:
            line_fields = line.split()
            # Skip blank lines and per-core lines ("cpu0", "cpu1", ...).
            if not line_fields or line_fields[0] != "cpu":
                continue
            # int() replaces string.atoi(), which only exists on Python 2.
            counters = [int(field) for field in line_fields[1:]]
            return {
                "User": counters[0],
                "Sys": counters[2],
                "Idle": counters[3],
                "Steal": counters[7],
                "Wait": counters[4],
                "Total": sum(counters),
            }
    return {}

这个指标在系统中是累加的,因此需要再次进行计算,即本次结果与上次结果的差值才是本段时间内的指标值:

def calculate_cpu_info():
    """Turn the cumulative CPU counters into ratios for the last interval.

    Takes a new sample via ``collect_cpu_info()`` and diffs it against the
    sample stored in the module-level ``last_cpu_info``, which is then
    replaced by the new sample.

    Returns:
        dict with the keys ``cpu_user``, ``cpu_sys``, ``cpu_wait``,
        ``cpu_steal``, ``cpu_idle`` and ``cpu_util``, each a fraction of the
        total delta multiplied by 10000 and truncated to an int. An empty
        dict is returned on the first call, when either sample is empty, or
        when the total did not increase between the two samples.
    """
    global last_cpu_info
    cpu_info = collect_cpu_info()
    previous = last_cpu_info
    last_cpu_info = cpu_info

    # First call (previous is None) or a sample without a "cpu" line.
    if not previous or not cpu_info:
        return {}

    delta_total = cpu_info["Total"] - previous["Total"]
    if delta_total <= 0:
        # Two samples taken back to back would otherwise divide by zero.
        return {}

    delta_user = cpu_info["User"] - previous["User"]
    delta_sys = cpu_info["Sys"] - previous["Sys"]
    delta_idle = cpu_info["Idle"] - previous["Idle"]
    delta_wait = cpu_info["Wait"] - previous["Wait"]
    delta_steal = cpu_info["Steal"] - previous["Steal"]

    def scaled(part):
        return int(float(part) / float(delta_total) * 10000)

    return {
        "cpu_user": scaled(delta_user),
        "cpu_sys": scaled(delta_sys),
        "cpu_wait": scaled(delta_wait),
        "cpu_steal": scaled(delta_steal),
        "cpu_idle": scaled(delta_idle),
        # Everything that is not idle, iowait or steal counts as utilisation.
        "cpu_util": scaled(delta_total - delta_idle - delta_wait - delta_steal),
    }

4. IO相关

从文件/proc/diskstats中读取

def collect_io_info(path="/proc/diskstats"):
    """Read the cumulative per-device I/O counters.

    Devices whose reads-completed column is ``"0"`` and devices rejected by
    ``should_handle_device()`` are left out.

    Args:
        path: File to parse; defaults to ``/proc/diskstats``.

    Returns:
        dict mapping device name to a dict with the keys ``ReadRequest``
        (column 3), ``WriteRequest`` (column 7), ``MsecRead`` (column 6),
        ``MsecWrite`` (column 10), ``MsecTotal`` (column 12) and
        ``Timestamp`` (sample time in whole seconds). Column numbers are
        zero-based indexes into the whitespace-split line.
    """
    io_buffer = {}
    with open(path) as io_file:
        for line in io_file:
            line_fields = line.split()
            if len(line_fields) < 13:
                # Too short to hold the columns read below; skip it
                # instead of raising IndexError.
                continue
            device_name = line_fields[2]
            if line_fields[3] == "0":
                continue
            if not should_handle_device(device_name):
                continue
            # int() replaces string.atoi(), which only exists on Python 2.
            io_buffer[device_name] = {
                "ReadRequest": int(line_fields[3]),
                "WriteRequest": int(line_fields[7]),
                "MsecRead": int(line_fields[6]),
                "MsecWrite": int(line_fields[10]),
                "MsecTotal": int(line_fields[12]),
                "Timestamp": int(time.time()),
            }
    return io_buffer

def should_handle_device(device):
    """Return True when I/O statistics should be collected for *device*.

    A device name is accepted when any of the following holds:
      * it starts with "vd" or "sda" (any length);
      * it is exactly three characters long and starts with "sd";
      * it is at least four characters long and starts with "xvd".

    NOTE(review): the first rule comes from ``and`` binding tighter than
    ``or`` in the original unparenthesised expressions, so names such as
    "vda1" or "sda1" are accepted too - confirm that this is intended.
    """
    name_length = len(device)
    if device.startswith(("vd", "sda")):
        return True
    if name_length == 3 and device.startswith("sd"):
        return True
    return name_length >= 4 and device.startswith("xvd")

这个指标也是累加的,需要进行求差:

def calculate_io_info():
    """Turn the cumulative I/O counters into metrics for the last interval.

    Takes a new sample via ``collect_io_info()`` and diffs it against the
    sample stored in the module-level ``last_io_info``, which is then
    replaced by the new sample.

    Returns:
        list with one dict per device that is present in both samples, with
        the keys ``io_rs`` (reads per second), ``io_ws`` (writes per second),
        ``io_await`` (read plus write milliseconds per request) and
        ``io_util`` (``MsecTotal`` delta divided by elapsed milliseconds).
        Every value is multiplied by 10000 and truncated to an int. The list
        is empty on the first call.
    """
    global last_io_info
    io_info = collect_io_info()
    previous = last_io_info
    last_io_info = io_info

    result = []
    if previous is None:
        return result

    for key, current in io_info.items():
        before = previous.get(key)
        if before is None:
            # Device appeared after the previous sample: nothing to diff.
            continue
        total_duration = current["Timestamp"] - before["Timestamp"]
        if total_duration <= 0:
            # Timestamps have one-second resolution, so two samples in the
            # same second would otherwise divide by zero.
            continue

        read_use_io = current["MsecRead"] - before["MsecRead"]
        write_use_io = current["MsecWrite"] - before["MsecWrite"]
        read_io = current["ReadRequest"] - before["ReadRequest"]
        write_io = current["WriteRequest"] - before["WriteRequest"]
        busy_msec = current["MsecTotal"] - before["MsecTotal"]

        request_count = read_io + write_io
        io_await = 0
        if request_count > 0:
            io_await = int(float(read_use_io + write_use_io) / float(request_count) * 10000)

        result.append({
            # float() keeps the fractional part that Python 2 integer
            # division would drop before the value is scaled.
            "io_rs": int(float(read_io) / total_duration * 10000),
            "io_ws": int(float(write_io) / total_duration * 10000),
            "io_await": io_await,
            "io_util": int(float(busy_msec) / (total_duration * 1000) * 10000),
        })

    return result

5. 采集网卡

网卡数据从/proc/net/dev中读取

def collect_net_info(path="/proc/net/dev"):
    """Read the cumulative traffic counters of the selected interfaces.

    Lines without a ``:`` are ignored, as are interfaces rejected by
    ``should_collect_card()``.

    Args:
        path: File to parse; defaults to ``/proc/net/dev``.

    Returns:
        dict mapping interface name to a dict with the keys ``InBytes``,
        ``InPackets``, ``InErrors``, ``InDrops`` (columns 0-3 after the
        colon) and ``OutBytes``, ``OutPackets``, ``OutErrors``, ``OutDrops``
        (columns 8-11 after the colon).
    """
    net_buffer = {}
    with open(path) as net_file:
        for line in net_file:
            name_part, sep, counters_part = line.partition(":")
            if not sep:
                continue
            card_name = name_part.strip()
            if not should_collect_card(card_name):
                continue
            line_fields = counters_part.split()
            if len(line_fields) < 12:
                # Too short to hold the columns read below; skip it
                # instead of raising IndexError.
                continue
            # int() replaces string.atoi(), which only exists on Python 2.
            net_buffer[card_name] = {
                "InBytes": int(line_fields[0]),
                "InPackets": int(line_fields[1]),
                "InErrors": int(line_fields[2]),
                "InDrops": int(line_fields[3]),
                "OutBytes": int(line_fields[8]),
                "OutPackets": int(line_fields[9]),
                "OutErrors": int(line_fields[10]),
                "OutDrops": int(line_fields[11]),
            }
    return net_buffer

def should_collect_card(line):
    """Return True when the interface name starts with "eth" or "em"."""
    wanted_prefixes = ("eth", "em")
    return line.startswith(wanted_prefixes)

网卡指标也是一个累加值,需要求差:

def calculate_net_info():
    """Turn the cumulative interface counters into deltas for the interval.

    Takes a new sample via ``collect_net_info()`` and diffs it against the
    sample stored in the module-level ``last_net_info``, which is then
    replaced by the new sample.

    Returns:
        list with one dict per interface that is present in both samples,
        with the keys ``in_bytes``, ``in_packets``, ``in_errors``,
        ``in_drops``, ``out_bytes``, ``out_packets``, ``out_errors`` and
        ``out_drops``. Each value is the counter difference multiplied by
        10000. The list is empty on the first call.
    """
    global last_net_info
    net_info = collect_net_info()
    previous = last_net_info
    last_net_info = net_info

    result = []
    if previous is None:
        return result

    # Output key -> key in the collected sample.
    metric_names = (
        ("in_bytes", "InBytes"),
        ("in_packets", "InPackets"),
        ("in_errors", "InErrors"),
        ("in_drops", "InDrops"),
        ("out_bytes", "OutBytes"),
        ("out_packets", "OutPackets"),
        ("out_errors", "OutErrors"),
        ("out_drops", "OutDrops"),
    )
    for key, current in net_info.items():
        before = previous.get(key)
        if before is None:
            # Interface appeared after the previous sample: nothing to diff.
            continue
        entry = {}
        for out_name, raw_name in metric_names:
            entry[out_name] = (current[raw_name] - before[raw_name]) * 10000
        result.append(entry)
    return result

6. 采集tcp指标

tcp与udp的指标信息都可以从/proc/net/snmp中读取

def collect_tcp_info(path="/proc/net/snmp"):
    """Read the TCP counters from an snmp-style file.

    The file holds two ``Tcp:`` lines: the first one carries the column
    titles and is skipped, the second one carries the values.

    Args:
        path: File to parse; defaults to ``/proc/net/snmp``.

    Returns:
        dict with the keys ``ActiveOpens``, ``PassiveOpens``, ``InSegs``,
        ``OutSegs``, ``RetransSegs`` and ``CurrEstab`` (columns 4, 5, 9, 10,
        11 and 8 of the value line). An empty dict is returned when no
        ``Tcp:`` value line exists.
    """
    title_seen = False
    with open(path) as tcp_file:
        for line in tcp_file:
            protocol_name, _, values_part = line.partition(":")
            if protocol_name.strip() != "Tcp":
                continue
            if not title_seen:
                # The first "Tcp:" line is the header row.
                title_seen = True
                continue
            line_fields = values_part.split()
            # int() replaces string.atoi(), which only exists on Python 2.
            return {
                "ActiveOpens": int(line_fields[4]),
                "PassiveOpens": int(line_fields[5]),
                "InSegs": int(line_fields[9]),
                "OutSegs": int(line_fields[10]),
                "RetransSegs": int(line_fields[11]),
                "CurrEstab": int(line_fields[8]),
            }
    return {}

里面有累加值也有实时值,当前的连接数为实时值:

def calculate_tcp_info():
    """Turn the TCP counters into metrics for the last interval.

    Takes a new sample via ``collect_tcp_info()`` and diffs it against the
    sample stored in the module-level ``last_tcp_info``, which is then
    replaced by the new sample.

    Returns:
        dict with the keys ``tcp_active``, ``tcp_passive``, ``tcp_inseg``
        and ``tcp_outseg`` (counter differences), ``tcp_established`` (the
        current ``CurrEstab`` value, not a difference) and ``tcp_retran``
        (retransmitted segments divided by sent segments). Every value is
        multiplied by 10000. An empty dict is returned on the first call or
        when either sample is empty.
    """
    global last_tcp_info
    tcp_info = collect_tcp_info()
    previous = last_tcp_info
    last_tcp_info = tcp_info

    # First call (previous is None) or a sample without a "Tcp:" value line.
    if not previous or not tcp_info:
        return {}

    out_segs = tcp_info["OutSegs"] - previous["OutSegs"]
    retrans_segs = tcp_info["RetransSegs"] - previous["RetransSegs"]
    # No segment was sent during the interval: report a rate of 0 instead of
    # dividing by zero.
    retrans_rate = float(retrans_segs) / float(out_segs) if out_segs > 0 else 0.0

    return {
        "tcp_active": (tcp_info["ActiveOpens"] - previous["ActiveOpens"]) * 10000,
        "tcp_passive": (tcp_info["PassiveOpens"] - previous["PassiveOpens"]) * 10000,
        "tcp_inseg": (tcp_info["InSegs"] - previous["InSegs"]) * 10000,
        "tcp_outseg": out_segs * 10000,
        "tcp_established": tcp_info["CurrEstab"] * 10000,
        "tcp_retran": int(retrans_rate * 10000),
    }

7. 采集指定进程的cpu与内存

有两种方式:其一是执行ps命令,取到的cpu是该进程启动以来的平均占用,内存是当前的占用比例;其二是读取/proc/<pid>目录下的文件。这里用的是第一种。

要采集的进程名在配置中用逗号分隔,程序通过 ps auxc | grep '进程名1\|进程名2\|...' 过滤出这些进程对应的输出行,再从中解析cpu与内存占用

def collect_process_info():
    """Collect CPU and memory usage of the configured processes via ``ps``.

    The module-level ``processes`` string holds comma-separated names. The
    commas are turned into ``\\|`` so that ``grep`` treats the names as
    alternatives when filtering the output of ``ps auxc``.

    Returns:
        dict mapping column 10 of each matching ``ps`` line to a dict with
        the keys ``process_cpu_util`` (column 2) and ``process_mem_util``
        (column 3), each multiplied by 10000. When several lines share the
        same column-10 value the last one wins. An empty dict is returned
        when ``processes`` is empty or the pipeline exits with a non-zero
        status.

    NOTE(review): ``processes`` is interpolated into a shell command line;
    it must only come from trusted configuration.
    """
    global processes
    process_info = {}
    if processes == "":
        return process_info

    # ``commands`` only exists on Python 2; Python 3 provides the same
    # function in ``subprocess``.
    try:
        from subprocess import getstatusoutput
    except ImportError:
        from commands import getstatusoutput

    # Raw string: "\|" in a normal literal is an invalid escape sequence.
    process_filter = "'" + processes.replace(",", r"\|") + "'"
    commandline = "ps auxc | grep " + process_filter
    status_code, result = getstatusoutput(commandline)
    if status_code != 0:
        return process_info

    for item in result.split("\n"):
        item_fields = item.split()
        if len(item_fields) < 11:
            # Blank or truncated line: skip it instead of raising IndexError.
            continue
        # round() before int() so that binary float error cannot turn a
        # one-decimal percentage into a value that is off by one.
        process_info[item_fields[10]] = {
            "process_cpu_util": int(round(float(item_fields[2]) * 10000)),
            "process_mem_util": int(round(float(item_fields[3]) * 10000)),
        }
    return process_info

如果需要实时的数据,应该从/proc/<pid>目录下的文件中读取,获取pid的方式和上述方式是一样的

你可能感兴趣的:(python实现系统指标采集)