本文重点讲解samples/bpf/sockex1_kern.c
和samples/bpf/sockex1_user.c
这两个文件,并剖析了它们调用的其他外部函数。
sockex1_kern.c
包含eBPF数据结构和eBPF程序的定义,sockex1_kern.c
会被编译为sockex1_kern.o
,这是一个ELF格式的文件。sockex1_user.c
中会实现加载器和用户空间逻辑,其中加载器会解析sockex1_kern.o
文件,创建map并将其中的eBPF代码挂载到对应的hook点上。sockex1_kern.c是关于eBPF map和eBPF程序的定义,SEC
对应的宏定义是:
#define SEC(NAME) __attribute__((section(NAME), used))
被SEC(NAME)修饰的定义会被放到elf文件名为"NAME"的section中,有了"used"字段,即便这个定义未被使用过,也不会被编译器移除。
执行readelf -e sockex1_kern.o
命令,可以看到我们这个程序自定义的3个section字段,分别是"socket1"、"maps"和"license"。
[root@localhost bpf]# readelf -e sockex1_kern.o -W
ELF Header:
Magic: 7f 45 4c 46 02 01 01 00 00 00 00 00 00 00 00 00
Class: ELF64
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: REL (Relocatable file)
Machine: <unknown>: 0xf7
Version: 0x1
Entry point address: 0x0
Start of program headers: 0 (bytes into file)
Start of section headers: 528 (bytes into file)
Flags: 0x0
Size of this header: 64 (bytes)
Size of program headers: 0 (bytes)
Number of program headers: 0
Size of section headers: 64 (bytes)
Number of section headers: 10
Section header string table index: 1
Section Headers:
[Nr] Name Type Address Off Size ES Flg Lk Inf Al
[ 0] NULL 0000000000000000 000000 000000 00 0 0 0
[ 1] .strtab STRTAB 0000000000000000 0001b8 000057 00 0 0 1
[ 2] .text PROGBITS 0000000000000000 000040 000000 00 AX 0 0 4
[ 3] socket1 PROGBITS 0000000000000000 000040 000078 00 AX 0 0 8
[ 4] .relsocket1 REL 0000000000000000 000198 000010 10 9 3 8
[ 5] maps PROGBITS 0000000000000000 0000b8 00001c 00 WA 0 0 4
[ 6] license PROGBITS 0000000000000000 0000d4 000004 00 WA 0 0 1
[ 7] .eh_frame PROGBITS 0000000000000000 0000d8 000030 00 A 0 0 8
[ 8] .rel.eh_frame REL 0000000000000000 0001a8 000010 10 9 7 8
[ 9] .symtab SYMTAB 0000000000000000 000108 000090 18 1 3 8
Key to Flags:
W (write), A (alloc), X (execute), M (merge), S (strings), I (info),
L (link order), O (extra OS processing required), G (group), T (TLS),
C (compressed), x (unknown), o (OS specific), E (exclude),
p (processor specific)
There are no program headers in this file.
其中:
readelf -p .strtab sockex1_kern.o -W
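命令查看。sockex1_kern.c的源码如下:
#include <uapi/linux/bpf.h>
#include <uapi/linux/if_ether.h>
#include <uapi/linux/if_packet.h>
#include <uapi/linux/ip.h>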
#include "bpf_helpers.h"
/*
 * Array map with 256 long-sized counters keyed by a u32.  bpf_prog1 uses the
 * one-byte IP protocol number as the key, so 256 entries cover every value.
 * SEC("maps") places the definition in the ELF section named "maps".
 */
struct bpf_map_def SEC("maps") my_map = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(u32),
.value_size = sizeof(long),
.max_entries = 256,
};
/*
 * eBPF program placed in the "socket1" section.  For every outgoing packet
 * it adds the packet length to the my_map counter indexed by the packet's
 * IP protocol number.  Always returns 0.
 */
SEC("socket1")
int bpf_prog1(struct __sk_buff *skb)
{
/* Protocol byte of the IP header that follows the Ethernet header. */
int index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
long *value;
/* Anything that is not an outgoing packet is ignored. */
if (skb->pkt_type != PACKET_OUTGOING)
return 0;
value = bpf_map_lookup_elem(&my_map, &index);
/* Lookup may return NULL; only then is the counter left untouched. */
if (value)
__sync_fetch_and_add(value, skb->len);
return 0;
}
/* License string stored in the ELF section named "license". */
char _license[] SEC("license") = "GPL";
可以看到,sockex1_user.c一上来就是去解析sockex1_kern.o文件,使用了load_bpf_file(filename)
函数,这个函数会去调用do_load_bpf_file()
,这个函数会解析elf文件并进行创建、加载工作。
/*
 * Excerpt of the loader entry point: only the declarations and the common
 * exit path are shown here, the body in between is elided ("//... ...").
 *
 * path:      ELF object file to load.
 * fixup_map: callback handed through to the map-creation step.
 *
 * Returns the value held in ret when the "done" label is reached.
 */
static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
{
int fd, i, ret, maps_shndx = -1, strtabidx = -1;
Elf *elf;
GElf_Ehdr ehdr;
GElf_Shdr shdr, shdr_prog;
Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL;
char *shname, *shname_prog;
int nr_maps = 0;
//... ...
done:
/* Single exit path: close the object file and report the result. */
close(fd);
return ret;
}
这其中涉及了几个结构体定义:
- struct Elf:对于ELF文件的整体描述
- struct GElf_Ehdr:ELF header,记录了ELF的元数据信息,在每个ELF文件最前面定义,包含magic number(标识文件类型)、section的数量等
- struct GElf_Shdr:ELF section的header,记录section的元数据信息,包含section名称(为整型,对应.strtab中的索引)、类型等
- struct Elf_Data:具体section中的数据
{
//... ...
/* reset global variables */
kern_version = 0;
memset(license, 0, sizeof(license));
memset(processed_sec, 0, sizeof(processed_sec));
if (elf_version(EV_CURRENT) == EV_NONE)
return 1;
fd = open(path, O_RDONLY, 0);
if (fd < 0)
return 1;
elf = elf_begin(fd, ELF_C_READ, NULL);
if (!elf)
return 1;
if (gelf_getehdr(elf, &ehdr) != &ehdr)
return 1;
/* clear all kprobes */
i = system("echo \"\" > /sys/kernel/debug/tracing/kprobe_events");
//... ...
}
接下来是一些初始化的工作,读取ELF文件,并从中提取了ELF header信息,然后清除掉了"/sys/kernel/debug/tracing/kprobe_events"下的kprobes探测点
{
//... ...
/* scan over all elf sections to get license and map info */
for (i = 1; i < ehdr.e_shnum; i++) { //ehdr.e_shnum就是section的数量
if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
continue;
if (1) /* helpful for llvm debugging */
printf("section %d:%s data %p size %zd link %d flags %d\n",
i, shname, data->d_buf, data->d_size,
shdr.sh_link, (int) shdr.sh_flags);
if (strcmp(shname, "license") == 0) {
processed_sec[i] = true;
memcpy(license, data->d_buf, data->d_size);
} else if (strcmp(shname, "version") == 0) {
processed_sec[i] = true;
if (data->d_size != sizeof(int)) {
printf("invalid size of version section %zd\n",
data->d_size);
return 1;
}
memcpy(&kern_version, data->d_buf, sizeof(int));
} else if (strcmp(shname, "maps") == 0) {
int j;
maps_shndx = i;
data_maps = data;
for (j = 0; j < MAX_MAPS; j++)
map_data[j].fd = -1;
} else if (shdr.sh_type == SHT_SYMTAB) {
strtabidx = shdr.sh_link;
symbols = data;
}
}
//... ...
}
接下来的工作就是扫描这个ELF文件,获取每个section的具体信息。get_sec(elf, i, &ehdr, &shname, &shdr, &data)
获取具体section的信息,包括section名字、header和数据,具体细节见注释:
/*
 * Look up section number i of an ELF object and hand back its header,
 * its name and its data.
 *
 * elf:    open ELF descriptor.
 * i:      index of the section to fetch.
 * ehdr:   ELF header; e_shstrndx selects the string table holding names.
 * shname: out, pointer to the section name inside that string table.
 * shdr:   out, filled with the section header.
 * data:   out, the section's data descriptor.
 *
 * Returns 0 on success, otherwise a small positive code telling which step
 * failed: 1 no such section, 2 header unreadable, 3 name missing or section
 * empty, 4 data missing or a further data descriptor follows the first one.
 */
static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname,
		   GElf_Shdr *shdr, Elf_Data **data)
{
	Elf_Scn *section = elf_getscn(elf, i);

	if (section == NULL)
		return 1;

	if (gelf_getshdr(section, shdr) != shdr)
		return 2;

	/*
	 * sh_name is an integer offset into the string table whose section
	 * index is e_shstrndx; elf_strptr() turns that pair into a string.
	 */
	*shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
	if (*shname == NULL || shdr->sh_size == 0)
		return 3;

	*data = elf_getdata(section, 0);
	if (*data == NULL)
		return 4;

	/* Asking for the descriptor after *data must yield nothing. */
	if (elf_getdata(section, *data) != NULL)
		return 4;

	return 0;
}
获取到以上每一个section的数据,对license
、version
、maps
以及.symtab
这几个section做了处理。其中,我们可以看到,在处理maps
这个section时,初始化了该section的index值以及section的data,并将map数组的fd都初始化为-1。
接下来,开始具体处理maps的内容,这里的核心函数是load_elf_maps_section(map_data, maps_shndx, elf, symbols, strtabidx)
,首先,明确一下传入的函数参数的含义:
{
//... ...
if (data_maps) {
nr_maps = load_elf_maps_section(map_data, maps_shndx,
elf, symbols, strtabidx);
if (nr_maps < 0) {
printf("Error: Failed loading ELF maps (errno:%d):%s\n",
nr_maps, strerror(-nr_maps));
goto done;
}
if (load_maps(map_data, nr_maps, fixup_map))
goto done;
map_data_count = nr_maps;
processed_sec[maps_shndx] = true;
}
//... ...
}
在load_elf_maps_section
中,我们遍历了这个symbols(符号表),用readelf
命令可以查看这个符号表,符号表的Ndx列表示这个符号所在的section的索引值,Name是该符号的名字:
[root@localhost bpf]# readelf -s sockex1_kern.o -W
Symbol table '.symtab' contains 6 entries:
Num: Value Size Type Bind Vis Ndx Name
0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND
1: 0000000000000068 0 NOTYPE LOCAL DEFAULT 3 LBB0_3
2: 0000000000000000 0 SECTION LOCAL DEFAULT 3
3: 0000000000000000 0 NOTYPE GLOBAL DEFAULT 6 _license
4: 0000000000000000 0 NOTYPE GLOBAL DEFAULT 3 bpf_prog1
5: 0000000000000000 0 NOTYPE GLOBAL DEFAULT 5 my_map
结合我们得到的符号表,不难理解这段代码的含义:即遍历符号表,将所有创建的map找到(这个例子中只有一个my_map)。每一次将找到的这个map的symbol entry加入到sym数组中,并对nr_map计数值加1。
接着,我们调用qsort
对这个sym数组按照sym entry的st_value值进行升序排序,这里应该就是为了保证在你创建多个map的时候,那个fd的大小顺序就是和你创建map的顺序是一致的(从小到大)。在这个例子里面只有一个map,如果有多个map,你就会发现,你代码里面定义的顺序和这个符号表出现的先后顺序可能不一致,但是和st_value的大小顺序是一致的。举个例子,我们查看sysdig
的map定义,并查看其编译产物的elf:
[root@localhost bpf]# readelf -s probe.o -W | grep map
11718: 0000000000000000 0 NOTYPE LOCAL DEFAULT 31 bpf_sys_brk_munmap_mmap_x
11739: 0000000000000000 0 NOTYPE LOCAL DEFAULT 33 bpf_sys_mmap_e
11937: 00000000000000a8 0 NOTYPE GLOBAL DEFAULT 227 frame_scratch_map
11939: 00000000000000fc 0 NOTYPE GLOBAL DEFAULT 227 local_state_map
11940: 0000000000000000 0 NOTYPE GLOBAL DEFAULT 227 perf_map
11941: 0000000000000150 0 NOTYPE GLOBAL DEFAULT 227 pgft_major_map
11943: 0000000000000118 0 NOTYPE GLOBAL DEFAULT 227 rtt_static_map
11944: 00000000000000e0 0 NOTYPE GLOBAL DEFAULT 227 settings_map
11945: 000000000000016c 0 NOTYPE GLOBAL DEFAULT 227 stash_map
11946: 0000000000000134 0 NOTYPE GLOBAL DEFAULT 227 stash_tuple_map
11949: 000000000000001c 0 NOTYPE GLOBAL DEFAULT 227 tail_map
11950: 00000000000000c4 0 NOTYPE GLOBAL DEFAULT 227 tmp_scratch_map
可以看出来,这个st_value(第二列的值)的大小顺序是乱的,因此必须要排序,才能保证和我们代码定义的顺序一致。
接下来,我们要从maps section中读取map数据,这里首先有一个兼容问题。这里我们假设每一个map定义所占用的空间是相同的,直接用data_maps->d_size / nr_maps
作为每一个map定义所占用的空间。这里有两种情况:
1. ELF文件中的map定义比struct bpf_load_map_def所占空间小:这时候只读取ELF中实际存在的那部分长度就可以,其余字节保持为0;
2. ELF文件中的map定义比struct bpf_load_map_def所占空间大:这时候读取struct bpf_load_map_def
所需字节,然后判断剩下没读的elf文件中有没有有效值(非零字节),没有即忽略,否则返回错误(-EFBIG),说明有比struct bpf_load_map_def
更多的有效信息。
/*
 * Read every map definition out of the ELF "maps" section into maps[].
 *
 * maps:       output array, one entry per map found (name, elf_offset, def).
 *             Presumably sized MAX_MAPS by the caller -- verify.
 * maps_shndx: index of the maps section inside the ELF file.
 * elf:        open ELF descriptor.
 * symbols:    data of the symbol table section.
 * strtabidx:  index of the string table that holds the symbol names.
 *
 * Returns the number of maps on success, or a negative errno value:
 *   -EINVAL  bad arguments, unreadable maps section, no map symbol at all,
 *            or a symbol whose name cannot be resolved
 *   -ENOMEM  scratch allocation failed
 *   -E2BIG   more than MAX_MAPS map symbols
 *   -EFBIG   the ELF definition is larger than struct bpf_load_map_def and
 *            the extra bytes are not all zero
 *   -errno   strdup() of a map name failed
 */
static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx,
				 Elf *elf, Elf_Data *symbols, int strtabidx)
{
	int map_sz_elf, map_sz_copy;
	bool validate_zero = false;
	Elf_Data *data_maps = NULL;
	int i, nr_maps;
	GElf_Sym *sym;
	Elf_Scn *scn;

	if (maps_shndx < 0)
		return -EINVAL;
	if (!symbols)
		return -EINVAL;

	/* Get data for maps section via elf index */
	scn = elf_getscn(elf, maps_shndx);
	if (scn)
		data_maps = elf_getdata(scn, NULL);
	if (!scn || !data_maps) {
		printf("Failed to get Elf_Data from maps section %d\n",
		       maps_shndx);
		return -EINVAL;
	}

	/*
	 * For each map get the corresponding symbol table entry.  One spare
	 * slot is allocated because sym[nr_maps] doubles as the scratch
	 * entry gelf_getsym() writes into before the symbol is accepted.
	 */
	sym = calloc(MAX_MAPS + 1, sizeof(GElf_Sym));
	if (!sym)
		return -ENOMEM;

	/* Walk the symbol table and keep the symbols living in the maps section. */
	for (i = 0, nr_maps = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) {
		if (!gelf_getsym(symbols, i, &sym[nr_maps]))
			continue;
		if (sym[nr_maps].st_shndx != maps_shndx)
			continue;
		/* Refuse the (MAX_MAPS + 1)-th map instead of overrunning. */
		if (nr_maps >= MAX_MAPS) {
			free(sym);
			return -E2BIG;
		}
		/* Only increment iff maps section */
		nr_maps++;
	}

	/* A maps section nobody points into cannot be split into entries. */
	if (nr_maps == 0) {
		printf("No map symbols found for maps section %d\n",
		       maps_shndx);
		free(sym);
		return -EINVAL;
	}

	/*
	 * Align to map_fd[] order, via sort on offset in sym.st_value, so
	 * that entry k of maps[] is the k-th definition in the section.
	 */
	qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols);

	/* Keeping compatible with ELF maps section changes
	 * ------------------------------------------------
	 * The program size of struct bpf_load_map_def is known by loader
	 * code, but struct stored in ELF file can be different.
	 *
	 * Unfortunately sym[i].st_size is zero.  To calculate the
	 * struct size stored in the ELF file, assume all struct have
	 * the same size, and simply divide with number of map
	 * symbols.
	 */
	map_sz_elf = data_maps->d_size / nr_maps;
	map_sz_copy = sizeof(struct bpf_load_map_def);
	if (map_sz_elf < map_sz_copy) {
		/*
		 * Backward compat, loading older ELF file with
		 * smaller struct, keeping remaining bytes zero.
		 */
		map_sz_copy = map_sz_elf;
	} else if (map_sz_elf > map_sz_copy) {
		/*
		 * Forward compat, loading newer ELF file with larger
		 * struct with unknown features.  Assume zero means
		 * feature not used.  Thus, validate rest of struct
		 * data is zero.
		 */
		validate_zero = true;
	}

	/* Memcpy relevant part of ELF maps data to loader maps */
	for (i = 0; i < nr_maps; i++) {
		struct bpf_load_map_def *def;
		unsigned char *addr, *end;
		const char *map_name;
		size_t offset;

		/* The map name is the symbol name, found via st_name. */
		map_name = elf_strptr(elf, strtabidx, sym[i].st_name);
		if (!map_name) {
			printf("Failed to get name of map symbol %d\n", i);
			free(sym);
			return -EINVAL;
		}
		maps[i].name = strdup(map_name);
		if (!maps[i].name) {
			/* Save errno first: printf() may overwrite it. */
			int saved_errno = errno;

			printf("strdup(%s): %s(%d)\n", map_name,
			       strerror(saved_errno), saved_errno);
			free(sym);
			return -saved_errno;
		}

		/* Symbol value is offset into ELF maps section data area */
		offset = sym[i].st_value;
		def = (struct bpf_load_map_def *)
			((unsigned char *)data_maps->d_buf + offset);
		maps[i].elf_offset = offset;
		memset(&maps[i].def, 0, sizeof(struct bpf_load_map_def));
		memcpy(&maps[i].def, def, map_sz_copy);

		/*
		 * Verify no newer features were requested: every byte of the
		 * ELF definition that was not copied must be zero.
		 */
		if (validate_zero) {
			addr = (unsigned char *) def + map_sz_copy;
			end = (unsigned char *) def + map_sz_elf;
			for (; addr < end; addr++) {
				if (*addr != 0) {
					free(sym);
					return -EFBIG;
				}
			}
		}
	}

	free(sym);
	return nr_maps;
}
load_elf_maps_section
函数从maps section中获取了所有map的定义信息,并存储到了maps数组中,然后返回map的数量。接下来我们关注load_maps(map_data, nr_maps, fixup_map)
函数,这个函数负责利用刚才获得的maps数组信息,真正创建map。这个函数逻辑很简单:fixup_map回调允许用户在创建之前自行指定map的fd(回调返回后若fd不为-1,则直接使用该fd并跳过创建),这里不过多阐述。我们往下可以看到,BPF Map分为两大类:
1. map-in-map类型(BPF_MAP_TYPE_ARRAY_OF_MAPS、BPF_MAP_TYPE_HASH_OF_MAPS):使用bpf_create_map_in_map_node进行创建;
2. 其他类型的map:使用bpf_create_map_node进行创建。
这两个创建函数最终都会调用bpf()系统调用去创建map,这里没有什么特别的地方,就不贴源码了。创建的map,会返回fd,这个fd会存入map_fd[]这个全局数组变量中,以供用户空间对Map进行操作。如果是BPF_MAP_TYPE_PROG_ARRAY,还会单独赋值给prog_array_fd这个变量。这里有一个细节,用户空间可以通过fd获取这个map,那么内核空间呢?答案是eBPF程序在源码里直接通过map定义时的符号(例如&my_map)来引用它:编译之后,这个引用在ELF文件中表现为一条针对该符号的重定位项,加载器在重定位阶段先把它替换为map的fd,之后内核验证器再把fd替换为真正的map指针(见下文)。
/*
 * Create every map described in maps[] and record its descriptor.
 *
 * maps:      map descriptions filled in from the ELF file.
 * nr_maps:   number of valid entries in maps[].
 * fixup_map: optional callback run on each entry before creation; if it
 *            leaves a descriptor other than -1 in the entry, that
 *            descriptor is used and nothing is created for the entry.
 *
 * The descriptor of map k is stored in both map_fd[k] and maps[k].fd, and a
 * BPF_MAP_TYPE_PROG_ARRAY map is additionally remembered in prog_array_fd.
 *
 * Returns 0 on success, 1 as soon as one map cannot be created.
 */
static int load_maps(struct bpf_map_data *maps, int nr_maps,
		     fixup_map_cb fixup_map)
{
	int idx;

	for (idx = 0; idx < nr_maps; idx++) {
		struct bpf_map_data *map = &maps[idx];
		int node = -1;

		if (fixup_map) {
			fixup_map(map, idx);
			/* Allow userspace to assign map FD prior to creation */
			if (map->fd != -1) {
				map_fd[idx] = map->fd;
				continue;
			}
		}

		/* A NUMA node is only passed on when the flag asks for it. */
		if (map->def.map_flags & BPF_F_NUMA_NODE)
			node = map->def.numa_node;

		if (map->def.type != BPF_MAP_TYPE_ARRAY_OF_MAPS &&
		    map->def.type != BPF_MAP_TYPE_HASH_OF_MAPS) {
			map_fd[idx] = bpf_create_map_node(map->def.type,
							  map->name,
							  map->def.key_size,
							  map->def.value_size,
							  map->def.max_entries,
							  map->def.map_flags,
							  node);
		} else {
			/* Map-in-map: the inner map is named by its index. */
			int inner_map_fd = map_fd[map->def.inner_map_idx];

			map_fd[idx] = bpf_create_map_in_map_node(map->def.type,
								 map->name,
								 map->def.key_size,
								 inner_map_fd,
								 map->def.max_entries,
								 map->def.map_flags,
								 node);
		}

		if (map_fd[idx] < 0) {
			printf("failed to create a map: %d %s\n",
			       errno, strerror(errno));
			return 1;
		}

		map->fd = map_fd[idx];
		if (map->def.type == BPF_MAP_TYPE_PROG_ARRAY)
			prog_array_fd = map_fd[idx];
	}

	return 0;
}
行文至此,我们把创建map这一段讲解完了。
接下来do_load_bpf_file
将开始解析可重定位section部分。在此之前,我们先看一下socket1这个section的内容:一条bpf字节码指令占用8个字节,这个section的大小是0x78(120)字节,也就是共有15个8字节的指令槽;其中加载64位立即数的指令(opcode为0x18)要占用两个槽,所以下面反汇编出来的是14条指令。
[root@localhost bpf]# readelf -x socket1 sockex1_kern.o -W
Hex dump of section 'socket1':
NOTE: This section has relocations against it, but these have NOT been applied to this dump.
0x00000000 bf160000 00000000 30000000 17000000 ........0.......
0x00000010 630afcff 00000000 61610400 00000000 c.......aa......
0x00000020 55010800 04000000 bfa20000 00000000 U...............
0x00000030 07020000 fcffffff 18010000 00000000 ................
0x00000040 00000000 00000000 85000000 01000000 ................
0x00000050 15000200 00000000 61610000 00000000 ........aa......
0x00000060 db100000 00000000 b7000000 00000000 ................
0x00000070 95000000 00000000 ........
利用llvm-objdump工具先看一下"socket1"这个section中的BPF指令:
[root@localhost bpf]# llvm-objdump -disassemble-all sockex1_kern.o
sockex1_kern.o: file format ELF64-BPF
#... ...省略一些
Disassembly of section socket1:
bpf_prog1:
0: bf 16 00 00 00 00 00 00 r6 = r1
1: 30 00 00 00 17 00 00 00 r0 = *(u8 *)skb[23]
2: 63 0a fc ff 00 00 00 00 *(u32 *)(r10 - 4) = r0
3: 61 61 04 00 00 00 00 00 r1 = *(u32 *)(r6 + 4)
4: 55 01 08 00 04 00 00 00 if r1 != 4 goto +8 <LBB0_3>
5: bf a2 00 00 00 00 00 00 r2 = r10
6: 07 02 00 00 fc ff ff ff r2 += -4
7: 18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0 ll
9: 85 00 00 00 01 00 00 00 call 1
10: 15 00 02 00 00 00 00 00 if r0 == 0 goto +2 <LBB0_3>
11: 61 61 00 00 00 00 00 00 r1 = *(u32 *)(r6 + 0)
12: db 10 00 00 00 00 00 00 lock *(u64 *)(r0 + 0) += r1
LBB0_3:
13: b7 00 00 00 00 00 00 00 r0 = 0
14: 95 00 00 00 00 00 00 00 exit
回到我们处理重定位section的代码部分,可以看到,这里解析.rel的section实际上就是为了解析bpf字节码指令(过滤掉了其他无关.rel类型的section,比如.eh_frame)
/*
 * Excerpt of the loader entry point showing the relocation pass: every
 * not-yet-processed SHT_REL section that targets an executable PROGBITS
 * section has its relocations applied to that section's instructions.
 * Code before and after the loop is elided ("//... ...").
 */
static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
{
//... ...
/* process all relo sections, and rewrite bpf insns for maps */
for (i = 1; i < ehdr.e_shnum; i++) {
if (processed_sec[i])
continue;
if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
continue;
if (shdr.sh_type == SHT_REL) { // relocation (.rel) section
struct bpf_insn *insns;
/* locate prog sec that need map fixup (relocations) */
// shdr.sh_info is the index of the section these relocations apply to; fetch that section to get the eBPF instructions
if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog,
&shdr_prog, &data_prog))
continue;
if (shdr_prog.sh_type != SHT_PROGBITS ||
!(shdr_prog.sh_flags & SHF_EXECINSTR)) // skip targets that are not executable code
continue;
insns = (struct bpf_insn *) data_prog->d_buf; // start of the eBPF instruction array
processed_sec[i] = true; /* relo section */
if (parse_relo_and_apply(data, symbols, &shdr, insns,
map_data, nr_maps))
continue;
}
}
//... ...
done:
close(fd);
return ret;
}
在获取bpf字节码程序后,调用了parse_relo_and_apply
函数,该函数定义如下:这个函数的主要功能就是从可重定位section提供的信息中(包括在bpf字节码指令的哪个偏移位置重定位以及重定位的符号)去完成map_fd的替换。由于map是在用户态创建的,bpf程序被编译时并不知道fd的存在,因此会在操作map的地方留下一个可重定位的信息,在map被创建后,再利用这个可重定位信息,先定位到对应指令(利用rel.r_offset),再获取对应的符号表表项(利用rel.r_info的高32位)。然后我们判断这一条指令的操作码是否是BPF_LD | BPF_IMM | BPF_DW
,这个操作码表示要将立即数加载到目标寄存器中,此时立即数为0,因为还没有为它填充fd。之后再去判断这个符号表表项是否和已经创建的map可以匹配关联起来,若可以,则将该map的fd赋值给这条指令的立即数字段。这里还有一个细节,在匹配map之前,src_reg被置为1(BPF_PSEUDO_MAP_FD),我们知道当src_reg为0时,表示立即数就在指令内,那么为什么还要多此一举把它置为1呢,这样看起来好像也不对?原因是,这个fd也不是最终 BPF指令执行时用来标志map的方式,之后,BPF验证器会使用replace_map_fd_with_map_ptr()
函数将fd更改为最终的map指针,然后convert_pseudo_ld_imm64()
函数会将BPF_PSEUDO_MAP_FD
重置为0,这样表示完成了从map_fd到最终map指针的替换。网上有一个图比较清晰的展示了这个过程:
/*
 * Apply the relocations of one SHT_REL section to a BPF instruction array,
 * patching each referenced instruction with the fd of the map it uses.
 *
 * data:    data of the relocation section.
 * symbols: data of the symbol table.
 * shdr:    header of the relocation section (sh_size / sh_entsize entries).
 * insn:    instructions of the program section being relocated.
 * maps:    maps already created, with their fd and ELF offset.
 * nr_maps: number of valid entries in maps[].
 *
 * Returns 0 on success, 1 if a relocation entry or its symbol cannot be
 * read, if it targets an instruction that is not a 64-bit immediate load,
 * or if no map matches the symbol.
 */
static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols,
				GElf_Shdr *shdr, struct bpf_insn *insn,
				struct bpf_map_data *maps, int nr_maps)
{
	int i, nrels;

	/* A zero entry size would make the division below trap. */
	if (shdr->sh_entsize == 0) {
		printf("invalid relo section: zero entry size\n");
		return 1;
	}
	nrels = shdr->sh_size / shdr->sh_entsize;

	for (i = 0; i < nrels; i++) {
		GElf_Sym sym;
		GElf_Rel rel;
		unsigned int insn_idx;
		bool match = false;
		int map_idx;

		if (!gelf_getrel(data, i, &rel)) {
			printf("invalid relo entry %d\n", i);
			return 1;
		}

		/* r_offset is a byte offset; turn it into an insn index. */
		insn_idx = rel.r_offset / sizeof(struct bpf_insn);

		/* GELF_R_SYM() yields the symbol table index kept in r_info. */
		if (!gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym)) {
			printf("invalid symbol for relo entry %d\n", i);
			return 1;
		}

		/*
		 * Only a 64-bit immediate load (opcode 0x18) can carry a map
		 * reference; anything else is a relocation we cannot handle.
		 */
		if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) {
			printf("invalid relo for insn[%d].code 0x%x\n",
			       insn_idx, insn[insn_idx].code);
			return 1;
		}

		/* Flag the immediate as a map fd rather than a plain value. */
		insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD;

		/* Match FD relocation against recorded map_data[] offset */
		for (map_idx = 0; map_idx < nr_maps; map_idx++) {
			if (maps[map_idx].elf_offset == sym.st_value) {
				match = true;
				break;
			}
		}

		if (match) {
			insn[insn_idx].imm = maps[map_idx].fd;
		} else {
			printf("invalid relo for insn[%d] no map_data match\n",
			       insn_idx);
			return 1;
		}
	}

	return 0;
}
接下来,就是具体调用load_and_attach()
,load_and_attach()
会去调用bpf_load_program()
(用户态库libbpf提供的封装函数,并不是内核函数),把eBPF程序加载进内核(包括验证等),最终就是调用bpf()系统调用,这个系统调用包括了验证bpf指令字节码的实现;对于kprobe、tracepoint等类型,load_and_attach()随后还会把程序挂载到对应的hook点上。内核侧的调用链大致如下所示:
sys_bpf()
--> bpf_prog_load()
--> bpf_check()
--> replace_map_fd_with_map_ptr()
--> do_check()
--> check_ld_imm()
==> check_func_arg()
--> convert_pseudo_ld_imm64()
加载eBPF程序部分的源码实现:
{
//... ...
/* load programs */
for (i = 1; i < ehdr.e_shnum; i++) {
if (processed_sec[i])
continue;
if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
continue;
if (memcmp(shname, "kprobe/", 7) == 0 ||
memcmp(shname, "kretprobe/", 10) == 0 ||
memcmp(shname, "tracepoint/", 11) == 0 ||
memcmp(shname, "raw_tracepoint/", 15) == 0 ||
memcmp(shname, "xdp", 3) == 0 ||
memcmp(shname, "perf_event", 10) == 0 ||
memcmp(shname, "socket", 6) == 0 ||
memcmp(shname, "cgroup/", 7) == 0 ||
memcmp(shname, "sockops", 7) == 0 ||
memcmp(shname, "sk_skb", 6) == 0 ||
memcmp(shname, "sk_msg", 6) == 0) {
ret = load_and_attach(shname, data->d_buf,
data->d_size);
if (ret != 0)
goto done;
}
}
//... ...
}
load_and_attach()
除了实现eBPF程序的加载验证,还有一些其他工作,这里简要分析一下:这里sys_perf_event_open
在PMU(Performance Monitoring Unit)上初始化一个硬件性能计数器(PMC: Performance Monitoring Counter)。ioctl
开启PMC计数。PMC随着指定硬件事件的发生而自动累加。在PMC 溢出时,PMU 触发一个PMI(Performance Monitoring Interrupt)中断。内核在PMI 中断的处理函数中保存PMC 的计数值,触发中断时的指令地址,当前时间戳以及当前进程的PID,TID,comm 等信息。我们把这些信息统称为一个采样(sample)。
注意区分本函数中的PMU初始化和perf map通信的那一套是不一样的。本处只初始化PMU即可保证eBPF正常运行,我们并没有从其中的ring buffer读取数据。而eBPF中的perf map通信则是如下过程:
(1)内核空间
- 定义BPF_MAP_TYPE_PERF_EVENT_ARRAY类型的map,key和value的类型均为整型,分别表示CPU核的编号以及在该核上PMU初始化的PMC的pmu_fd值;
- 调用bpf_perf_event_output(ctx, &perf_map, BPF_F_CURRENT_CPU, data, size)将数据发送至指定CPU开启的PMU的ring buffer中(在内核5.8之前更准确的说法是叫perf buffer,实际上是环形缓冲区)。
(2)用户空间
通过pmu_fd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
为指定CPU创建PMC,并返回pmu_fd值,同时,在perf_map中记录该CPU对应的pmu_fd,调用ioctl
开启PMC计数。此步骤完成后,内核eBPF即可正常向ring buffer中写入数据。
用户空间通过perf_event_mmap(int pmu_fd)
函数(实际上是封装了mmap
函数)将该pmu_fd对应的ring buffer的内存映射到用户空间中。这段内存的首页是元数据,包含这段内存的data->head,data->tail,data->size等信息,后面跟着的是数据。如下方内核官方文档所述。data->head会随着内核空间写入的数据一直增加,data->tail往往指向用户空间上一次读入的最后位置,这两个变量往往都需要%data->size
后再访问,在逻辑上构成一个环。
The mmap size should be 1+2^n pages, where the first page is a metadata page (struct perf_event_mmap_page) that contains various bits of information such as where the ring-buffer head is.
可参考内核官方文档:https://man7.org/linux/man-pages/man2/perf_event_open.2.html
/*
 * Load one eBPF program into the kernel and, for tracing program types,
 * attach it to its event.
 *
 * event: name of the ELF section the program came from; its prefix selects
 *        the program type and the remainder names the hook.
 * prog:  the program's instructions.
 * size:  size of prog in bytes.
 *
 * The program fd is appended to prog_fd[].  What happens next depends on
 * the type:
 *   - xdp, perf_event, cgroup/skb, cgroup/sock: nothing more is done here;
 *   - socket, sockops, sk_skb, sk_msg: a "/<digit>" suffix registers the
 *     program through populate_prog_array(), otherwise nothing more;
 *   - raw_tracepoint: opened with bpf_raw_tracepoint_open();
 *   - kprobe, kretprobe, tracepoint: the event id is read from DEBUGFS, a
 *     perf event is opened for it, enabled, and the program is set on it.
 * Event fds are stored in event_fd[].
 *
 * Returns 0 on success and -1 on failure, or whatever
 * populate_prog_array() returns when that path is taken.
 */
static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
{
	bool is_socket = strncmp(event, "socket", 6) == 0;
	bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
	bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
	bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
	bool is_raw_tracepoint = strncmp(event, "raw_tracepoint/", 15) == 0;
	bool is_xdp = strncmp(event, "xdp", 3) == 0;
	bool is_perf_event = strncmp(event, "perf_event", 10) == 0;
	bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0;
	bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0;
	bool is_sockops = strncmp(event, "sockops", 7) == 0;
	bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0;
	bool is_sk_msg = strncmp(event, "sk_msg", 6) == 0;
	size_t insns_cnt = size / sizeof(struct bpf_insn);
	enum bpf_prog_type prog_type;
	char buf[256];
	int fd, efd, err, id, len;
	struct perf_event_attr attr = {};

	attr.type = PERF_TYPE_TRACEPOINT;
	attr.sample_type = PERF_SAMPLE_RAW;
	attr.sample_period = 1;
	attr.wakeup_events = 1;

	if (is_socket) {
		prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
	} else if (is_kprobe || is_kretprobe) {
		prog_type = BPF_PROG_TYPE_KPROBE;
	} else if (is_tracepoint) {
		prog_type = BPF_PROG_TYPE_TRACEPOINT;
	} else if (is_raw_tracepoint) {
		prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT;
	} else if (is_xdp) {
		prog_type = BPF_PROG_TYPE_XDP;
	} else if (is_perf_event) {
		prog_type = BPF_PROG_TYPE_PERF_EVENT;
	} else if (is_cgroup_skb) {
		prog_type = BPF_PROG_TYPE_CGROUP_SKB;
	} else if (is_cgroup_sk) {
		prog_type = BPF_PROG_TYPE_CGROUP_SOCK;
	} else if (is_sockops) {
		prog_type = BPF_PROG_TYPE_SOCK_OPS;
	} else if (is_sk_skb) {
		prog_type = BPF_PROG_TYPE_SK_SKB;
	} else if (is_sk_msg) {
		prog_type = BPF_PROG_TYPE_SK_MSG;
	} else {
		printf("Unknown event '%s'\n", event);
		return -1;
	}

	if (prog_cnt == MAX_PROGS)
		return -1;

	fd = bpf_load_program(prog_type, prog, insns_cnt, license, kern_version,
			      bpf_log_buf, BPF_LOG_BUF_SIZE);
	if (fd < 0) {
		printf("bpf_load_program() err=%d\n%s", errno, bpf_log_buf);
		return -1;
	}

	prog_fd[prog_cnt++] = fd;

	if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk)
		return 0;

	if (is_socket || is_sockops || is_sk_skb || is_sk_msg) {
		/*
		 * NOTE(review): "sk_skb" and "sk_msg" are 6 characters long but
		 * 7 are skipped here, so a section named exactly "sk_skb" is
		 * read one byte past its terminator.  Kept as is because
		 * changing the skip alters which names reach
		 * populate_prog_array() -- confirm the intended naming.
		 */
		if (is_socket)
			event += 6;
		else
			event += 7;
		if (*event != '/')
			return 0;
		event++;
		/* isdigit() requires a value representable as unsigned char. */
		if (!isdigit((unsigned char)*event)) {
			printf("invalid prog number\n");
			return -1;
		}
		return populate_prog_array(event, fd);
	}

	if (is_raw_tracepoint) {
		efd = bpf_raw_tracepoint_open(event + 15, fd);
		if (efd < 0) {
			printf("tracepoint %s %s\n", event + 15, strerror(errno));
			return -1;
		}
		event_fd[prog_cnt - 1] = efd;
		return 0;
	}

	if (is_kprobe || is_kretprobe) {
		bool need_normal_check = true;
		const char *event_prefix = "";

		if (is_kprobe)
			event += 7;
		else
			event += 10;
		if (*event == 0) {
			printf("event name cannot be empty\n");
			return -1;
		}
		if (isdigit((unsigned char)*event))
			return populate_prog_array(event, fd);
#ifdef __x86_64__
		/* Try the "__x64_"-prefixed symbol first for sys_* functions. */
		if (strncmp(event, "sys_", 4) == 0) {
			snprintf(buf, sizeof(buf),
				 "echo '%c:__x64_%s __x64_%s' >> /sys/kernel/debug/tracing/kprobe_events",
				 is_kprobe ? 'p' : 'r', event, event);
			err = system(buf);
			if (err >= 0) {
				need_normal_check = false;
				event_prefix = "__x64_";
			}
		}
#endif
		if (need_normal_check) {
			snprintf(buf, sizeof(buf),
				 "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events",
				 is_kprobe ? 'p' : 'r', event, event);
			err = system(buf);
			if (err < 0) {
				printf("failed to create kprobe '%s' error '%s'\n",
				       event, strerror(errno));
				return -1;
			}
		}

		/* Bounded formatting: the event name comes from the ELF file. */
		len = snprintf(buf, sizeof(buf), "%sevents/kprobes/%s%s/id",
			       DEBUGFS, event_prefix, event);
		if (len < 0 || (size_t)len >= sizeof(buf)) {
			printf("event name '%s' is too long\n", event);
			return -1;
		}
	} else if (is_tracepoint) {
		event += 11;
		if (*event == 0) {
			printf("event name cannot be empty\n");
			return -1;
		}
		len = snprintf(buf, sizeof(buf), "%sevents/%s/id",
			       DEBUGFS, event);
		if (len < 0 || (size_t)len >= sizeof(buf)) {
			printf("event name '%s' is too long\n", event);
			return -1;
		}
	}

	/* buf now names the file holding the numeric id of the event. */
	efd = open(buf, O_RDONLY, 0);
	if (efd < 0) {
		printf("failed to open event %s\n", event);
		return -1;
	}

	err = read(efd, buf, sizeof(buf));
	if (err < 0 || (size_t)err >= sizeof(buf)) {
		printf("read from '%s' failed '%s'\n", event, strerror(errno));
		/* Do not leak the id file descriptor on the error path. */
		close(efd);
		return -1;
	}

	close(efd);

	buf[err] = 0;
	id = atoi(buf);
	attr.config = id;

	efd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
	if (efd < 0) {
		printf("event %d fd %d err %s\n", id, efd, strerror(errno));
		return -1;
	}
	event_fd[prog_cnt - 1] = efd;
	err = ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
	if (err < 0) {
		printf("ioctl PERF_EVENT_IOC_ENABLE failed err %s\n",
		       strerror(errno));
		return -1;
	}
	err = ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
	if (err < 0) {
		printf("ioctl PERF_EVENT_IOC_SET_BPF failed err %s\n",
		       strerror(errno));
		return -1;
	}

	return 0;
}
https://blog.csdn.net/hubingsong/article/details/126776029
https://zhuanlan.zhihu.com/p/141694060
// SPDX-License-Identifier: GPL-2.0
#include <stdio.h>
#include <assert.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>
#include "bpf_load.h"
#include "sock_example.h"
#include <unistd.h>
#include <arpa/inet.h>
/*
 * User-space side of the example: loads "<argv[0]>_kern.o", attaches its
 * first program to a raw socket on "lo", starts a ping in the background
 * and prints the per-protocol byte counters from map_fd[0] once a second,
 * five times.
 *
 * Returns 0 on success and 1 if loading, attaching or a map lookup fails.
 */
int main(int ac, char **argv)
{
	char filename[256];
	FILE *f;
	int i, sock;

	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
	if (load_bpf_file(filename)) {
		printf("%s", bpf_log_buf);
		return 1;
	}

	sock = open_raw_sock("lo");

	/*
	 * The calls are made outside of assert() so that they still run when
	 * the program is built with NDEBUG.
	 */
	if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd,
		       sizeof(prog_fd[0])) != 0) {
		perror("setsockopt(SO_ATTACH_BPF)");
		return 1;
	}

	/* Generate some traffic; the output of ping is not read. */
	f = popen("ping -c5 localhost", "r");
	(void) f;

	for (i = 0; i < 5; i++) {
		long long tcp_cnt, udp_cnt, icmp_cnt;
		int key;

		key = IPPROTO_TCP;
		if (bpf_map_lookup_elem(map_fd[0], &key, &tcp_cnt) != 0) {
			perror("bpf_map_lookup_elem(IPPROTO_TCP)");
			return 1;
		}

		key = IPPROTO_UDP;
		if (bpf_map_lookup_elem(map_fd[0], &key, &udp_cnt) != 0) {
			perror("bpf_map_lookup_elem(IPPROTO_UDP)");
			return 1;
		}

		key = IPPROTO_ICMP;
		if (bpf_map_lookup_elem(map_fd[0], &key, &icmp_cnt) != 0) {
			perror("bpf_map_lookup_elem(IPPROTO_ICMP)");
			return 1;
		}

		printf("TCP %lld UDP %lld ICMP %lld bytes\n",
		       tcp_cnt, udp_cnt, icmp_cnt);
		sleep(1);
	}

	return 0;
}