关于ebpf 的co-re

前言

由于 Linux 内核更新很快,内核底层的结构体千变万化,字段时有变更,如何让我们的 eBPF 程序一次编译、就能在不同的 Linux 系统上运行,是我们需要关注的点,这就需要用到 CO-RE。本文不做特别深入的研究:在研发层面,我认为没有必要去关注 CO-RE 的数据结构和算法,只要能用到项目上,保证我们的 eBPF 程序一次编译以后能在各种 Linux 机器上跑起来就行。我和同事经过很长时间的学习,总结了这份笔记。

一、co-re介绍

底层库依靠的是libbpf

https://github.com/libbpf/libbpf

co-re的运行主要依赖btf格式,这个东西还是比较复杂,在这里不需要关注,btf相关介绍:

https://www.kernel.org/doc/html/latest/bpf/btf.html

kernel代码include/uapi/linux/bpf.h解释了`CO-RE`的原理。

clang有内置标记`__attribute__((preserve_access_index))`(等效于`__builtin_preserve_access_index`)。ebpf.c代码这样标记所有它需要访问的结构体。clang在对象ELF文件ebpf.o中为每个这样的访问生成一个`bpf_core_relo`。libbpf将在加载ebpf.o时按照ELF中的bpf_core_relo修改指令段。

/*
 * "struct bpf_core_relo" is used to pass relocation data from LLVM to libbpf
 * and from libbpf to the kernel.
 *
 * CO-RE relocation captures the following data:
 * - insn_off - instruction offset (in bytes) within a BPF program that needs
 *   its insn->imm field to be relocated with actual field info;
 * - type_id - BTF type ID of the "root" (containing) entity of a relocatable
 *   type or field;
 * - access_str_off - offset into corresponding .BTF string section. String
 *   interpretation depends on specific relocation kind:
 *     - for field-based relocations, string encodes an accessed field using
 *       a sequence of field and array indices, separated by colon (:). It's
 *       conceptually very close to LLVM's getelementptr ([0]) instruction's
 *       arguments for identifying offset to a field.
 *     - for type-based relocations, string is expected to be just "0";
 *     - for enum value-based relocations, string contains an index of enum
 *       value within its enum type;
 * - kind - one of enum bpf_core_relo_kind;
 *
 * Example:
 *   struct sample {
 *       int a;
 *       struct {
 *           int b[10];
 *       };
 *   };
 *
 *   struct sample *s = ...;
 *   int *x = &s->a;     // encoded as "0:0" (a is field #0)
 *   int *y = &s->b[5];  // encoded as "0:1:0:5" (anon struct is field #1,
 *                       // b is field #0 inside anon struct, accessing elem #5)
 *   int *z = &s[10].b;  // encoded as "10:1" (ptr is used as an array)
 *
 * type_id for all relocs in this example will capture BTF type id of
 * `struct sample`.
 *
 * Such relocation is emitted when using __builtin_preserve_access_index()
 * Clang built-in, passing expression that captures field address, e.g.:
 *
 * bpf_probe_read(&dst, sizeof(dst),
 *		  __builtin_preserve_access_index(&src->a.b.c));
 *
 * In this case Clang will emit field relocation recording necessary data to
 * be able to find offset of embedded `a.b.c` field within `src` struct.
 *
 * [0] https://llvm.org/docs/LangRef.html#getelementptr-instruction
 */
/* One CO-RE relocation record. */
struct bpf_core_relo {
	/* Byte offset, within the BPF program, of the instruction whose
	 * insn->imm field has to be relocated. */
	__u32 insn_off;
	/* BTF type ID of the "root" (containing) type of the relocatable
	 * type or field. */
	__u32 type_id;
	/* Offset of the access string in the corresponding .BTF string
	 * section; how the string is interpreted depends on `kind`. */
	__u32 access_str_off;
	/* Relocation kind, one of enum bpf_core_relo_kind. */
	enum bpf_core_relo_kind kind;
};

vmlinux.h文件开头的如下片段,为内核所有结构体加上了标记

/*
 * Unless BPF_NO_PRESERVE_ACCESS_INDEX is defined, push a Clang attribute that
 * applies preserve_access_index to every record declared after this point.
 * NOTE(review): the matching `#pragma clang attribute pop` is not part of this
 * excerpt; presumably it sits at the end of vmlinux.h. Confirm.
 */
#ifndef BPF_NO_PRESERVE_ACCESS_INDEX
#pragma clang attribute push (__attribute__((preserve_access_index)), apply_to = record)
#endif

BTF 类型信息:用于获取内核、BPF 程序类型及 BPF 代码的关键信息

__builtin_preserve_access_index

__builtin_preserve_access_index specifies a code section where array subscript access and structure/union member access are relocatable under bpf compile-once run-everywhere framework. Debuginfo (typically with -g) is needed, otherwise, the compiler will exit with an error. The return type for the intrinsic is the same as the type of the argument.

使用案例:

/* Example type: two ints followed by an array of four two-member unions. */
struct t {
  int i;
  int j;
  union {
    int a;
    int b;
  } c[4];
};
/* `...` is a placeholder in the original example, not valid C. */
struct t *v = ...;
/* Address of an array element's union member: both the subscript c[3] and the
 * member access .b are inside the relocatable expression. */
int *pb =__builtin_preserve_access_index(&v->c[3].b);
/* Plain member access wrapped in the builtin; the result has the same type as
 * the argument expression. */
__builtin_preserve_access_index(v->j);

二、c程序需要做的兼容

1.读取内核结构体

C 程序需要使用 bpf_core_read 来读取内核结构体,它内部使用了 clang 支持的 __builtin_preserve_access_index

bpf_core_read:

应用读取结构体:

/*
 * bpf_core_read() - copy sz bytes from src into dst with
 * bpf_probe_read_kernel(). src is wrapped in
 * __builtin_preserve_access_index() so the access is recorded as relocatable.
 */
#define bpf_core_read(dst, sz, src)					    \
	bpf_probe_read_kernel(dst, sz, (const void *)__builtin_preserve_access_index(src))

这样 ELF 文件里就有了 access_index 信息。BPF 程序加载的时候,会读取当前机器的

/sys/kernel/btf/vmlinux

将其中的 BTF 与之前 .o 文件里的 BTF 进行比对,计算出结构体字段在当前内核上的偏移量,然后对指令进行修正

简单来说就是使用bpf_core_read代替bpf_probe_read

实战案例kernel部分:

/*
 * ct_status() - return the `status` field of a conntrack entry.
 *
 * @ct: kernel nf_conn to read from.
 *
 * BPF_CORE_READ() already copies the field into a local value, so the result
 * can be returned directly; a second probe read of that local copy adds
 * nothing.
 */
static __always_inline u32 ct_status(const struct nf_conn *ct) {
    return (u32)BPF_CORE_READ(ct, status);
}

当我们读取内核 netfilter(conntrack)中 nf_conn 结构体的时候,不再使用 bpf_probe_read,而是使用 bpf_core_read;如果需要逐级读取多个成员,可以使用 BPF_CORE_READ 宏

/*
 * BPF_CORE_READ() - read the field reached from src through a, ... and yield
 * its value. Declares a temporary __r whose type comes from ___type(), fills
 * it with BPF_CORE_READ_INTO(), and evaluates to __r (statement expression).
 */
#define BPF_CORE_READ(src, a, ...) ({					    \
	___type((src), a, ##__VA_ARGS__) __r;				    \
	BPF_CORE_READ_INTO(&__r, (src), a, ##__VA_ARGS__);		    \
	__r;								    \
})

读取结构体嵌套:

/*
 * nf_conntrack_tuple_to_conntrack_tuple() - fill t from the kernel tuple ct.
 *
 * @t:  destination; zeroed before anything else is written.
 * @ct: kernel-side tuple. Its fields are only fetched through
 *      BPF_CORE_READ(), never dereferenced directly.
 *
 * Return: 1 on success; 0 when the protocol is neither TCP nor UDP, when
 * either port is zero, or when an IPv4/IPv6 address is unset. An address
 * family that matches no branch leaves the addresses zeroed and still
 * returns 1.
 */
static __always_inline int nf_conntrack_tuple_to_conntrack_tuple(conntrack_tuple_t *t, const struct nf_conntrack_tuple *ct) {
    memset(t, 0, sizeof(conntrack_tuple_t));

    // Read once: the same value drives the switch and the error log, so the
    // log no longer dereferences ct directly.
    unsigned int protonum = BPF_CORE_READ(ct, dst.protonum);
    switch (protonum) {
    case IPPROTO_TCP:
        t->metadata = CONN_TYPE_TCP;
        t->sport = BPF_CORE_READ(ct, src.u.tcp.port);
        t->dport = BPF_CORE_READ(ct, dst.u.tcp.port);
        break;
    case IPPROTO_UDP:
        t->metadata = CONN_TYPE_UDP;
        t->sport = BPF_CORE_READ(ct, src.u.udp.port);
        t->dport = BPF_CORE_READ(ct, dst.u.udp.port);
        break;
    default:
        log_debug("ERR(to_conn_tuple): unknown protocol number: %u\n", protonum);
        return 0;
    }

    // Ports are stored in network byte order in the kernel tuple.
    t->sport = bpf_ntohs(t->sport);
    t->dport = bpf_ntohs(t->dport);
    if (t->sport == 0 || t->dport == 0) {
        log_debug("ERR(to_conn_tuple): src/dst port not set: src: %u, dst: %u\n", t->sport, t->dport);
        return 0;
    }

    // Read the address family once instead of once per branch.
    unsigned int l3num = BPF_CORE_READ(ct, src.l3num);
    if (l3num == AF_INET) {
        t->metadata |= CONN_V4;
        t->saddr_l = BPF_CORE_READ(ct, src.u3.ip);
        t->daddr_l = BPF_CORE_READ(ct, dst.u3.ip);

        if (!t->saddr_l || !t->daddr_l) {
            log_debug("ERR(to_conn_tuple.v4): src/dst addr not set src:%u, dst:%u\n", t->saddr_l, t->daddr_l);
            return 0;
        }
    }
#ifdef FEATURE_IPV6_ENABLED
    else if (l3num == AF_INET6) {
        t->metadata |= CONN_V6;

        // BPF_CORE_READ() is a statement expression and therefore not an
        // lvalue: `&BPF_CORE_READ(...)` does not compile. Copy each address
        // into a local and pass the local's address instead.
        // NOTE(review): read_in6_addr() is defined elsewhere; this assumes it
        // accepts a pointer to a struct in6_addr copy -- confirm.
        struct in6_addr src_in6 = BPF_CORE_READ(ct, src.u3.in6);
        struct in6_addr dst_in6 = BPF_CORE_READ(ct, dst.u3.in6);
        read_in6_addr(&t->saddr_h, &t->saddr_l, &src_in6);
        read_in6_addr(&t->daddr_h, &t->daddr_l, &dst_in6);

        if (!(t->saddr_h || t->saddr_l)) {
            log_debug("ERR(to_conn_tuple.v6): src addr not set: src_l: %llu, src_h: %llu\n",
                t->saddr_l, t->saddr_h);
            return 0;
        }
        if (!(t->daddr_h || t->daddr_l)) {
            log_debug("ERR(to_conn_tuple.v6): dst addr not set: dst_l: %llu, dst_h: %llu\n",
                t->daddr_l, t->daddr_h);
            return 0;
        }
    }
#endif

    return 1;
}

有一篇文章提到,我们不一定总是需要去显式调用 bpf_core_read 一类的函数

https://nakryiko.com/posts/bpf-core-reference-guide/#btf-enabled-bpf-program-types-with-direct-memory-reads

这篇文章里有介绍

2.关于检查字段是否存在

非co-re的读取情况,我们严重依赖头文件

// p_net is either a struct net ** or a possible_net_t *, depending on the
// kernel version. Returns the namespace inode number read through it; the
// result stays 0 when CONFIG_NET_NS is not defined.
static __always_inline u32 get_netns(void *p_net) {
    u32 inum = 0;
#ifdef CONFIG_NET_NS
    struct net *ns_net = NULL;
    bpf_probe_read_kernel_with_telemetry(&ns_net, sizeof(ns_net), p_net);

    // Pick the field that holds the inode number for the headers in use,
    // then issue a single read.
#ifdef _LINUX_NS_COMMON_H
    void *inum_addr = &ns_net->ns.inum;
#else
    void *inum_addr = &ns_net->proc_inum;
#endif
    bpf_probe_read_kernel_with_telemetry(&inum, sizeof(inum), inum_addr);
#endif
    return inum;
}

当我们使用 libbpf 之后,就拥有了强大的 CO-RE 能力

我们先为旧内核的字段布局定义一个结构体,供后面做指针类型转换:

/*
 * Minimal stand-in type declaring only proc_inum, marked
 * preserve_access_index so accesses through it are relocatable.
 * NOTE(review): libbpf matches a "___suffix" flavor against the kernel type
 * whose name precedes the suffix -- here `ct_net`, not `net`. Presumably this
 * should be `struct net___old`; confirm before relying on the fallback path.
 */
struct ct_net___old {
    unsigned int proc_inum;
} __attribute__((preserve_access_index));

co-re后的代码,我们使用bpf_core_field_exists判断字段是否存在,而不去依赖宏

/*
 * Flavor of struct net for kernels that keep the namespace inode number in
 * `proc_inum`. libbpf ignores the "___old" suffix when matching, so this
 * resolves against the running kernel's `struct net`.
 */
struct net___old {
    unsigned int proc_inum;
} __attribute__((preserve_access_index));

/*
 * get_netns() - return the network-namespace inode number reachable via p_net.
 *
 * @p_net: depending on the kernel version, a struct net ** or a
 *         possible_net_t *; the struct net pointer is read through it.
 *
 * The field is chosen at load time with bpf_core_field_exists() rather than
 * with preprocessor checks. BPF_CORE_READ() already yields the value in a
 * local, so it is returned directly; no further probe read of the local copy
 * is made.
 */
static __always_inline u32 get_netns(void *p_net) {
    struct net *ct_net = NULL;
    bpf_probe_read_kernel_with_telemetry(&ct_net, sizeof(ct_net), p_net);

    if (bpf_core_field_exists(ct_net->ns.inum)) {
        return BPF_CORE_READ(ct_net, ns.inum);
    }

    // Fall back to the legacy layout through the flavor type.
    struct net___old *ct_net_old = (void *)ct_net;
    return BPF_CORE_READ(ct_net_old, proc_inum);
}

三、golang加载部分:

读取btf:

// btfData receives the BTF spec returned by GetBTF for the configured BTF
// path and BPF directory.
var btfData *btf.Spec
	// telemetry receives GetBTF's second return value.
	btfData, telemetry = ddebpf.GetBTF(cfg.BTFPath, cfg.BPFDir)

GetBTF实现:

// LoadKernelSpec loads BTF type information for the running kernel.
//
// It first tries the raw BTF blob at /sys/kernel/btf/vmlinux. If that file
// cannot be opened, it looks for a vmlinux ELF via findVMLinux and parses the
// BTF from it; a failure to find one is returned to the caller unchanged.
func LoadKernelSpec() (*Spec, error) {
	if sysfs, openErr := os.Open("/sys/kernel/btf/vmlinux"); openErr == nil {
		defer sysfs.Close()

		return loadRawSpec(sysfs, internal.NativeEndian, nil, nil)
	}

	// The sysfs blob could not be opened: fall back to a vmlinux ELF.
	elfFile, err := findVMLinux()
	if err != nil {
		return nil, err
	}
	defer elfFile.Close()

	return loadSpecFromELF(elfFile)
}

加载进入kernel

       // Pass the kernel BTF spec to the program loader only when btfData is
       // non-nil.
       if btfData != nil {
		opts.VerifierOptions = ebpf.CollectionOptions{
			Programs: ebpf.ProgramOptions{
				KernelTypes: btfData,
			},
		}
	}

	// Initialise the manager from buf with the assembled options; an error
	// is returned to the caller as-is.
	err = mgr.InitWithOptions(buf, opts)
	if err != nil {
		return nil, err
	}
	return mgr, nil

简单来说就是读取当前主机的 vmlinux 的 BTF 信息

然后通过libbpf对结构体偏移量进行修正,然后加载入内核

至此co-re介绍结束.

你可能感兴趣的:(c,linux,c,linux,运维,服务器)