由于Linux内核更新很快,内核底层的结构体千变万化,字段时有变更,如何让我们的eBPF程序一次编译、在不同的Linux系统上到处运行,是我们需要关注的点。因此我们需要了解CO-RE(Compile Once – Run Everywhere)。这里不做特别深入的研究,因为在研发层面去关注CO-RE的数据结构和算法,我认为是没有必要的,只要保证可以用到项目上,保证我们的eBPF程序一次编译以后能够在各种Linux机器上跑起来就行。以下是和同事经过很长时间的学习后总结的笔记。
底层库依靠的是libbpf
https://github.com/libbpf/libbpf
co-re的运行主要依赖btf格式,这个东西还是比较复杂,在这里不需要关注,btf相关介绍:
https://www.kernel.org/doc/html/latest/bpf/btf.html
kernel代码include/uapi/linux/bpf.h解释了`CO-RE`的原理。
clang有内置标记`__attribute__((preserve_access_index))`(等效于`__builtin_preserve_access_index`)。ebpf.c代码这样标记所有它需要访问的结构体。clang在对象ELF文件ebpf.o中为每个这样的访问生成一个`bpf_core_relo`。libbpf将在加载ebpf.o时按照ELF中的bpf_core_relo修改指令段。
/*
* "struct bpf_core_relo" is used to pass relocation data from LLVM to libbpf
* and from libbpf to the kernel.
*
* CO-RE relocation captures the following data:
* - insn_off - instruction offset (in bytes) within a BPF program that needs
* its insn->imm field to be relocated with actual field info;
* - type_id - BTF type ID of the "root" (containing) entity of a relocatable
* type or field;
* - access_str_off - offset into corresponding .BTF string section. String
* interpretation depends on specific relocation kind:
* - for field-based relocations, string encodes an accessed field using
* a sequence of field and array indices, separated by colon (:). It's
* conceptually very close to LLVM's getelementptr ([0]) instruction's
* arguments for identifying offset to a field.
* - for type-based relocations, string is expected to be just "0";
* - for enum value-based relocations, string contains an index of enum
* value within its enum type;
* - kind - one of enum bpf_core_relo_kind;
*
* Example:
* struct sample {
* int a;
* struct {
* int b[10];
* };
* };
*
* struct sample *s = ...;
* int *x = &s->a; // encoded as "0:0" (a is field #0)
* int *y = &s->b[5]; // encoded as "0:1:0:5" (anon struct is field #1,
* // b is field #0 inside anon struct, accessing elem #5)
* int *z = &s[10]->b; // encoded as "10:1" (ptr is used as an array)
*
* type_id for all relocs in this example will capture BTF type id of
* `struct sample`.
*
* Such relocation is emitted when using __builtin_preserve_access_index()
* Clang built-in, passing expression that captures field address, e.g.:
*
* bpf_probe_read(&dst, sizeof(dst),
* __builtin_preserve_access_index(&src->a.b.c));
*
* In this case Clang will emit field relocation recording necessary data to
* be able to find offset of embedded `a.b.c` field within `src` struct.
*
* [0] https://llvm.org/docs/LangRef.html#getelementptr-instruction
*/
struct bpf_core_relo {
__u32 insn_off; /* byte offset of the instruction whose insn->imm is relocated */
__u32 type_id; /* BTF type ID of the root (containing) type */
__u32 access_str_off; /* offset of the access string in the .BTF string section */
enum bpf_core_relo_kind kind; /* relocation kind; selects how the access string is interpreted */
};
vmlinux.h文件开头的如下片段,为内核所有结构体加上了标记
/*
 * Unless BPF_NO_PRESERVE_ACCESS_INDEX is defined, push a Clang attribute so that
 * preserve_access_index is applied to every record type declared afterwards.
 */
#ifndef BPF_NO_PRESERVE_ACCESS_INDEX
#pragma clang attribute push (__attribute__((preserve_access_index)), apply_to = record)
#endif
BTF 类型信息:用于获取内核、BPF 程序类型及 BPF 代码的关键信息
__builtin_preserve_access_index
__builtin_preserve_access_index specifies a code section where array subscript access and structure/union member access are relocatable under bpf compile-once run-everywhere framework. Debuginfo (typically with -g) is needed, otherwise, the compiler will exit with an error. The return type for the intrinsic is the same as the type of the argument.
使用案例:
/* Sample type: two scalar members followed by a four-element array of unions. */
struct t {
int i;
int j;
union {
int a;
int b;
} c[4];
};
struct t *v = ...;
/* Address of member b inside array element c[3], taken through the builtin. */
int *pb =__builtin_preserve_access_index(&v->c[3].b);
/* Access to member j wrapped in the builtin. */
__builtin_preserve_access_index(v->j);
C程序需要使用bpf_core_read,它底层借助clang支持的__builtin_preserve_access_index来实现
bpf_core_read:
应用读取结构体:
/*
 * bpf_core_read(dst, sz, src): calls bpf_probe_read_kernel() to copy sz bytes
 * into dst, with the source pointer expression src wrapped in
 * __builtin_preserve_access_index().
 */
#define bpf_core_read(dst, sz, src) \
bpf_probe_read_kernel(dst, sz, (const void *)__builtin_preserve_access_index(src))
这样ELF文件里就有了access_index信息。bpf程序启动的时候会读取当前机器的
/sys/kernel/btf/vmlinux
将这里的btf与之前.o文件中的btf进行比对,计算结构体字段的实际偏移量,然后对指令进行修订
简单来说就是使用bpf_core_read代替bpf_probe_read
实战案例kernel部分:
/*
 * Return the `status` member of a kernel nf_conn, read through CO-RE.
 *
 * BPF_CORE_READ already copies the member into a local value, so no further
 * probe-read of that local is needed. The result is narrowed to u32, matching
 * the function's return type.
 */
static __always_inline u32 ct_status(const struct nf_conn *ct) {
    return (u32)BPF_CORE_READ(ct, status);
}
当我们读取内核netlink相关的nf_conn结构体成员时,不再使用bpf_probe_read,而是使用bpf_core_read;如果需要读取多级嵌套的成员,可以使用BPF_CORE_READ宏:
/*
 * BPF_CORE_READ(src, a, ...): statement expression that declares a temporary
 * __r whose type is given by ___type() for the accessed member, fills it via
 * BPF_CORE_READ_INTO(), and evaluates to __r.
 */
#define BPF_CORE_READ(src, a, ...) ({ \
___type((src), a, ##__VA_ARGS__) __r; \
BPF_CORE_READ_INTO(&__r, (src), a, ##__VA_ARGS__); \
__r; \
})
读取结构体嵌套:
/*
 * Fill the tracer-side tuple `t` from the kernel conntrack tuple `ct`.
 *
 * `t` is zeroed first. Every field of `ct` is read through CO-RE accessors
 * (BPF_CORE_READ / BPF_CORE_READ_INTO); `ct` is never dereferenced directly.
 * Ports are converted with bpf_ntohs().
 *
 * Returns 1 on success. Returns 0 when the protocol is neither TCP nor UDP,
 * when either port is zero, or when an address is unset.
 */
static __always_inline int nf_conntrack_tuple_to_conntrack_tuple(conntrack_tuple_t *t, const struct nf_conntrack_tuple *ct) {
    memset(t, 0, sizeof(conntrack_tuple_t));

    // Read the protocol once so the switch and the log message use the same
    // CO-RE-read value instead of a raw ct-> dereference.
    u8 protonum = BPF_CORE_READ(ct, dst.protonum);
    switch (protonum) {
    case IPPROTO_TCP:
        t->metadata = CONN_TYPE_TCP;
        t->sport = BPF_CORE_READ(ct, src.u.tcp.port);
        t->dport = BPF_CORE_READ(ct, dst.u.tcp.port);
        break;
    case IPPROTO_UDP:
        t->metadata = CONN_TYPE_UDP;
        t->sport = BPF_CORE_READ(ct, src.u.udp.port);
        t->dport = BPF_CORE_READ(ct, dst.u.udp.port);
        break;
    default:
        log_debug("ERR(to_conn_tuple): unknown protocol number: %u\n", protonum);
        return 0;
    }

    t->sport = bpf_ntohs(t->sport);
    t->dport = bpf_ntohs(t->dport);
    if (t->sport == 0 || t->dport == 0) {
        log_debug("ERR(to_conn_tuple): src/dst port not set: src: %u, dst: %u\n", t->sport, t->dport);
        return 0;
    }

    // The address family is read once and reused by both branches.
    u16 l3num = BPF_CORE_READ(ct, src.l3num);
    if (l3num == AF_INET) {
        t->metadata |= CONN_V4;
        t->saddr_l = BPF_CORE_READ(ct, src.u3.ip);
        t->daddr_l = BPF_CORE_READ(ct, dst.u3.ip);
        if (!t->saddr_l || !t->daddr_l) {
            log_debug("ERR(to_conn_tuple.v4): src/dst addr not set src:%u, dst:%u\n", t->saddr_l, t->daddr_l);
            return 0;
        }
    }
#ifdef FEATURE_IPV6_ENABLED
    else if (l3num == AF_INET6) {
        // BPF_CORE_READ() is a statement expression (an rvalue), so its address
        // cannot be taken. Copy each address into a local and pass that.
        // NOTE(review): assumes read_in6_addr() accepts a pointer to a
        // stack-resident in6_addr - confirm against its definition.
        struct in6_addr saddr6;
        struct in6_addr daddr6;
        memset(&saddr6, 0, sizeof(saddr6));
        memset(&daddr6, 0, sizeof(daddr6));
        BPF_CORE_READ_INTO(&saddr6, ct, src.u3.in6);
        BPF_CORE_READ_INTO(&daddr6, ct, dst.u3.in6);

        t->metadata |= CONN_V6;
        read_in6_addr(&t->saddr_h, &t->saddr_l, &saddr6);
        read_in6_addr(&t->daddr_h, &t->daddr_l, &daddr6);
        if (!(t->saddr_h || t->saddr_l)) {
            log_debug("ERR(to_conn_tuple.v6): src addr not set: src_l: %llu, src_h: %llu\n",
                t->saddr_l, t->saddr_h);
            return 0;
        }
        if (!(t->daddr_h || t->daddr_l)) {
            log_debug("ERR(to_conn_tuple.v6): dst addr not set: dst_l: %llu, dst_h: %llu\n",
                t->daddr_l, t->daddr_h);
            return 0;
        }
    }
#endif
    return 1;
}
有一篇文章提到,我们不一定总是需要去显式调用bpf_core_read一类的函数
https://nakryiko.com/posts/bpf-core-reference-guide/#btf-enabled-bpf-program-types-with-direct-memory-reads
这篇文章里有介绍
非co-re的读取情况,我们严重依赖头文件
// Look up the network-namespace inode number behind p_net, choosing the member
// to read at compile time from the kernel headers in use.
// depending on the kernel version p_net may be a struct net** or possible_net_t*
static __always_inline u32 get_netns(void *p_net) {
#ifndef CONFIG_NET_NS
    // Without network-namespace support nothing is read.
    return 0;
#else
    struct net *ns_ptr = NULL;
    u32 inum = 0;

    // p_net holds a pointer to the struct net; fetch that pointer first.
    bpf_probe_read_kernel_with_telemetry(&ns_ptr, sizeof(ns_ptr), p_net);
#ifdef _LINUX_NS_COMMON_H
    bpf_probe_read_kernel_with_telemetry(&inum, sizeof(inum), &ns_ptr->ns.inum);
#else
    bpf_probe_read_kernel_with_telemetry(&inum, sizeof(inum), &ns_ptr->proc_inum);
#endif
    return inum;
#endif
}
当我们使用libbpf之后,就拥有了强大的co-re能力
我们重新定义一个只包含旧字段的结构体,用于转换指针类型:
/*
 * Minimal local definition holding only the proc_inum member, marked
 * preserve_access_index so that accesses through it are CO-RE relocatable.
 *
 * NOTE(review): libbpf ignores the "___suffix" part of a local type name when
 * looking for the matching kernel type, so this would be matched against a
 * kernel type named `ct_net`. If the intent is to match `struct net`, the name
 * probably needs to be `net___old` - confirm.
 */
struct ct_net___old {
unsigned int proc_inum;
} __attribute__((preserve_access_index));
co-re后的代码,我们使用bpf_core_field_exists判断字段是否存在,而不去依赖宏
// Look up the network-namespace inode number behind p_net using CO-RE.
//
// The member to read is chosen with bpf_core_field_exists() rather than
// preprocessor macros: ns.inum when that field exists, otherwise proc_inum via
// the ct_net___old layout. BPF_CORE_READ already yields the value as a local,
// so it is returned directly without a further probe-read of the stack copy.
//
// depending on the kernel version p_net may be a struct net** or possible_net_t*
static __always_inline u32 get_netns(void *p_net) {
    struct net *ct_net = NULL;

    // p_net holds a pointer to the struct net; fetch that pointer first.
    bpf_probe_read_kernel_with_telemetry(&ct_net, sizeof(ct_net), p_net);

    if (bpf_core_field_exists(ct_net->ns.inum)) {
        return BPF_CORE_READ(ct_net, ns.inum);
    }

    // Fall back to the legacy layout that only declares proc_inum.
    struct ct_net___old *ct_net_old = (void *)ct_net;
    return BPF_CORE_READ(ct_net_old, proc_inum);
}
读取btf:
// btfData receives the BTF spec returned by ddebpf.GetBTF for the configured
// BTF path and BPF directory; telemetry receives the second return value.
var btfData *btf.Spec
btfData, telemetry = ddebpf.GetBTF(cfg.BTFPath, cfg.BPFDir)
GetBTF实现:
// LoadKernelSpec loads BTF type information for the running kernel.
//
// The raw BTF blob at /sys/kernel/btf/vmlinux is tried first. If it cannot be
// opened, a vmlinux ELF located by findVMLinux is parsed instead, and any error
// from that search is returned as-is.
func LoadKernelSpec() (*Spec, error) {
	// Preferred source: raw BTF exposed by the kernel through sysfs.
	if raw, openErr := os.Open("/sys/kernel/btf/vmlinux"); openErr == nil {
		defer raw.Close()
		return loadRawSpec(raw, internal.NativeEndian, nil, nil)
	}

	// Fallback: a vmlinux ELF found on the file system.
	elfFile, err := findVMLinux()
	if err != nil {
		return nil, err
	}
	defer elfFile.Close()

	return loadSpecFromELF(elfFile)
}
加载进入kernel
// When BTF data was obtained, hand it to the program options as KernelTypes.
if btfData != nil {
opts.VerifierOptions = ebpf.CollectionOptions{
Programs: ebpf.ProgramOptions{
KernelTypes: btfData,
},
}
}
// Call mgr.InitWithOptions with buf and opts; propagate any error to the caller.
err = mgr.InitWithOptions(buf, opts)
if err != nil {
return nil, err
}
return mgr, nil
简单来说就是读取当前主机的vmlinux的btf信息
然后通过libbpf对结构体偏移量进行修正,然后加载入内核
至此co-re介绍结束.