xdp部分的代码分为Kernel层和User层两部分,Kernel层代码由User层代码调用linux bpf接口加载到网卡内核驱动,对进入的数据包进行拦截,对于Knot关注的DNS数据包(包括UDP/TCP,甚至QUIC)以Zero Copy的方式穿透到User层的代码进行处理,而Knot不关注的网络数据包则直接PASS到内核网络协议栈进行常规处理。
Kernel层代码存放在src/libknot/xdp目录下面,包括以下几个文件:bpf-kernel.c(XDP程序源码)、bpf-kernel-obj.c(由编译产物生成的字节码数组)和bpf-kernel-obj.h(字节码数组的声明)。
以上kernel层的代码需要用clang 编译成ebpf字节码并加载到ebpf内核虚拟机中运行。使用以下命令生成:
clang -target bpf -Wall -O2 -g -DNDEBUG -c -o bpf-kernel.o -I/usr/include/x86_64-linux-gnu -include ../../config.h bpf-kernel.c
llvm-strip -S bpf-kernel.o
clang 设置了“-target bpf ”选项,用以生成ebpf字节码。
llvm-strip -S 命令用来删除目标文件中的调试信息(debug节),以减小字节码体积;符号表仍然保留,加载ebpf程序时需要用它来定位程序和map。
为了方便User层代码将生成的ebpf字节码加载到ebpf内核虚拟机中,将以上命令生成的bpf-kernel.o的原始字节码,生成为一个c语言数组,以便直接链接编译到User层代码中,这样无需在启动User层代码的时候再额外提供bpf-kernel.o文件,便于安装部署。命令如下:
xxd -i bpf-kernel.o > bpf-kernel-obj.c
输出为bpf-kernel-obj.c文件,另外需要编写一个bpf-kernel-obj.h的头文件,便于User层代码进行include。如下:
extern unsigned char bpf_kernel_o[];
extern unsigned int bpf_kernel_o_len;
2.1 首先定义了两个ebpf的map,如下:
/* A map of configuration options. */
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, QUEUE_MAX);
__uint(key_size, sizeof(__u32)); /* Must be 4 bytes. */
__uint(value_size, sizeof(knot_xdp_opts_t));
} opts_map SEC(".maps");
/* A map of AF_XDP sockets. */
struct {
__uint(type, BPF_MAP_TYPE_XSKMAP);
__uint(max_entries, QUEUE_MAX);
__uint(key_size, sizeof(__u32)); /* Must be 4 bytes. */
__uint(value_size, sizeof(int));
} xsks_map SEC(".maps");
map是ebpf的一个非常重要的kernel和user层代码的通讯机制,user层代码可以将数据写入map,然后kernel层代码从map中读取,反之亦然。
第一个map为opts_map,其作用是user层借此将配置信息传递到kernel层代码,其key为网卡接收队列id,value为knot_xdp_opts_t,定义如下:
/*! \brief XDP map item for the filter configuration. */
typedef struct knot_xdp_opts knot_xdp_opts_t;
struct knot_xdp_opts {
__u16 flags; /*!< XDP filter flags \a knot_xdp_filter_flag_t. */
__u16 udp_port; /*!< UDP/TCP port to listen on. */
__u16 quic_port; /*!< QUIC/UDP port to listen on. */
} __attribute__((packed));
第二个map为xsks_map,其作用是user层将其创建的AF_XDP的socket传递到kernel层代码,其key为网卡接收队列id, value为AF_XDP的socket句柄,kernel层代码在需要的时候将接收到的数据传递到该socket句柄对应的rx ring环形队列中,如:
/* Forward the packet to user space. */
bpf_redirect_map(&xsks_map, ctx->rx_queue_index, 0);
2.2 接着是主函数xdp_redirect_dns_func,下面进行逐行分析,以下是它的原型定义:
SEC("xdp")
int xdp_redirect_dns_func(struct xdp_md *ctx)
SEC("xdp")将xdp_redirect_dns_func的二进制字节码存放到elf文件的xdp节中,这是xdp程序的约定,加载器据此识别程序类型。
ctx是内核ebpf调用xdp_redirect_dns_func的时候传入的上下文参数,定义如下:
/* user accessible metadata for XDP packet hook
* new fields must be added to the end of this structure
*/
struct xdp_md {
__u32 data; /* pkt data starting position */
__u32 data_end; /* end of pkt data */
__u32 data_meta; /* meta data for the skb */
/* Below access go through struct xdp_rxq_info */
__u32 ingress_ifindex; /* rxq->dev->ifindex */
__u32 rx_queue_index; /* rxq->queue_index */
__u32 egress_ifindex; /* txq->dev->ifindex */
};
2.2.1 获取配置选项并检查是否启用xdp
/* Get the queue options. */
__u32 index = ctx->rx_queue_index;
struct knot_xdp_opts *opts_ptr = bpf_map_lookup_elem(&opts_map, &index);
if (!opts_ptr) {
return XDP_ABORTED;
}
/* save the opts_ptr value into opts */
knot_xdp_opts_t opts = *opts_ptr;
/* Check if the filter is disabled. */
if (!(opts.flags & KNOT_XDP_FILTER_ON)) {
return XDP_PASS;
}
2.2.2 在报文前部预留(VLAN)附加数据的空间,并初始化data、data_end和meta对应的指针
/* Try to reserve space in front of the packet for additional (VLAN) data. */
(void)bpf_xdp_adjust_meta(ctx, - (int)sizeof(struct knot_xdp_info) - KNOT_XDP_PKT_ALIGNMENT);
void *data = (void *)(long)ctx->data;
const void *data_end = (void *)(long)ctx->data_end;
struct knot_xdp_info *meta = (void *)(long)ctx->data_meta;
/* Check if the meta data pointer is usable (e.g. not `tap` interface). */
if ((void *)meta + sizeof(*meta) > data) {
meta = 0;
}
最后的if判断的含义是:如果预留附加数据空间的操作失败(meta区域不足以容纳struct knot_xdp_info,例如tap接口),那么设置meta = 0,表示不能支持vlan报文的处理。
2.2.3 设置ethernet协议头的指针地址,并调整data指向三层头
struct ethhdr *eth_hdr = data;
const void *ip_hdr;
const struct iphdr *ip4;
const struct ipv6hdr *ip6;
const void *l4_hdr;
__u8 ipv4;
__u8 ip_proto;
__u8 fragmented = 0;
__u16 eth_type; /* In big endian. */
/* Parse Ethernet header. */
if ((void *)eth_hdr + sizeof(*eth_hdr) > data_end) {
return XDP_DROP;
}
data += sizeof(*eth_hdr);
2.2.4 解析获取三层头
/* Parse possible VLAN (802.1Q) header. */
if (eth_hdr->h_proto == __constant_htons(ETH_P_8021Q)) {
if (data + sizeof(__u16) + sizeof(eth_type) > data_end) {
return XDP_DROP;
} else if (meta == 0) { /* VLAN not supported. */
return XDP_PASS;
}
__builtin_memcpy(&eth_type, data + sizeof(__u16), sizeof(eth_type));
data += sizeof(__u16) + sizeof(eth_type);
} else {
eth_type = eth_hdr->h_proto;
}
ip_hdr = data;
如果是VLAN(802.1Q)报文,则跳过4字节的VLAN标签,并从中取出真正的以太网类型(若meta不可用则直接返回XDP_PASS交给内核处理);最后将ip_hdr指向三层头的开始位置。
2.2.5 解析三层报文,并得到四层头
/* Parse IPv4 or IPv6 header. */
switch (eth_type) {
case __constant_htons(ETH_P_IP):
ip4 = ip_hdr;
if ((void *)ip4 + sizeof(*ip4) > data_end) {
return XDP_DROP;
}
if (ip4->version != 4) {
return XDP_DROP;
}
/* Check the IP length. Cannot use strict
* equality due to Ethernet padding applied to
* frames shorter than 64 octects. */
if (data_end - data < __bpf_ntohs(ip4->tot_len))
{
return XDP_DROP;
}
/* check if the pkt is fragmented */
if (ip4->frag_off != 0 &&
ip4->frag_off != __constant_htons(IP_DF)) {
fragmented = 1;
}
ip_proto = ip4->protocol;
l4_hdr = data + ip4->ihl * 4;
ipv4 = 1;
break;
case __constant_htons(ETH_P_IPV6):
ip6 = ip_hdr;
if ((void *)ip6 + sizeof(*ip6) > data_end) {
return XDP_DROP;
}
if (ip6->version != 6) {
return XDP_DROP;
}
/* Check the IP length. Cannot use strict
* equality due to Ethernet padding applied
* to frames shorter than 64 octects. */
if (data_end - data < __bpf_ntohs(ip6->payload_len) + sizeof(*ip6)) {
return XDP_DROP;
}
ip_proto = ip6->nexthdr;
data += sizeof(*ip6);
if (ip_proto == IPPROTO_FRAGMENT) {
fragmented = 1;
const struct ipv6_frag_hdr *frag = data;
if ((void *)frag + sizeof(*frag) > data_end)
{
return XDP_DROP;
}
ip_proto = frag->nexthdr;
data += sizeof(*frag);
}
l4_hdr = data;
ipv4 = 0;
break;
default:
/* Pass packets of possible other protocols. */
return XDP_PASS;
}
分别对ipv4和ipv6进行报文解析,如果报文有分片则设置分片标记(fragmented),最后将l4_hdr指向四层头地址。如果不是ipv4或者ipv6的报文,直接返回XDP_PASS交给内核进行处理。
2.2.6 解析四层头
const struct tcphdr *tcp;
const struct udphdr *udp;
__u16 port_dest;
__u8 match = 0;
/* Check the transport protocol. */
switch (ip_proto) {
case IPPROTO_TCP:
/* Parse TCP header. */
tcp = l4_hdr;
if (l4_hdr + sizeof(*tcp) > data_end) {
return XDP_DROP;
}
port_dest = __bpf_ntohs(tcp->dest);
if ((opts.flags & KNOT_XDP_FILTER_TCP) &&
(port_dest == opts.udp_port ||
((opts.flags & (KNOT_XDP_FILTER_PASS | KNOT_XDP_FILTER_DROP)) &&
port_dest >= opts.udp_port))) {
match = 1;
}
break;
case IPPROTO_UDP:
/* Parse UDP header. */
udp = l4_hdr;
if (l4_hdr + sizeof(*udp) > data_end) {
return XDP_DROP;
}
/* Check the UDP length. */
if (data_end - (void *)udp < __bpf_ntohs(udp->len)) {
return XDP_DROP;
}
port_dest = __bpf_ntohs(udp->dest);
if ((opts.flags & KNOT_XDP_FILTER_UDP) &&
(port_dest == opts.udp_port ||
((opts.flags & (KNOT_XDP_FILTER_PASS | KNOT_XDP_FILTER_DROP)) &&
port_dest >= opts.udp_port))) {
match = 1;
} else if ((opts.flags & KNOT_XDP_FILTER_QUIC) &&
(port_dest == opts.quic_port ||
((opts.flags & (KNOT_XDP_FILTER_PASS | KNOT_XDP_FILTER_DROP)) &&
port_dest >= opts.quic_port))) {
match = 1;
}
break;
default:
/* Pass packets of possible other protocols. */
return XDP_PASS;
}
对TCP/UDP协议分别进行解析,检查请求的目标端口是否是knot server监听的端口,如果是则设置match = 1标记。如果不是TCP/UDP协议,则直接返回XDP_PASS交给内核协议栈进行处理。
2.2.7 根据四层的端口匹配结果执行相应的处理
if (!match) {
/* Pass non-matching packet. */
return XDP_PASS;
} else if (opts.flags & KNOT_XDP_FILTER_DROP) {
/* Drop matching packet if requested. */
return XDP_DROP;
} else if (fragmented) {
/* Drop fragmented packet. */
return XDP_DROP;
}
2.2.8 查找路由表进行路由处理
/* Take into account routing information. */
if (opts.flags & KNOT_XDP_FILTER_ROUTE) {
struct bpf_fib_lookup fib = {
.ifindex = 1 /* Loopback. */
};
if (ipv4) {
fib.family = AF_INET;
fib.ipv4_src = ip4->daddr;
fib.ipv4_dst = ip4->saddr;
} else {
struct in6_addr *ipv6_src = (struct in6_addr *)fib.ipv6_src;
struct in6_addr *ipv6_dst = (struct in6_addr *)fib.ipv6_dst;
fib.family = AF_INET6;
*ipv6_src = ip6->daddr;
*ipv6_dst = ip6->saddr;
}
const __u16 *mac_in = (const __u16 *)eth_hdr->h_dest;
const __u16 *mac_out = (const __u16 *)fib.smac;
int ret = bpf_fib_lookup(ctx, &fib, sizeof(fib), BPF_FIB_LOOKUP_DIRECT);
switch (ret) {
case BPF_FIB_LKUP_RET_SUCCESS:
/* Cross-interface answers are handled
* through normal stack. */
if (mac_in[0] != mac_out[0] ||
mac_in[1] != mac_out[1] ||
mac_in[2] != mac_out[2]) {
return XDP_PASS;
}
/* Store output interface index for later use
* with VLAN in user space. */
if (meta != 0) {
meta->out_if_index = fib.ifindex;
}
/* Update destination MAC for responding. */
__builtin_memcpy(eth_hdr->h_source, fib.dmac, ETH_ALEN);
break;
case BPF_FIB_LKUP_RET_FWD_DISABLED:
/* Disabled forwarding on loopback. */
return XDP_ABORTED;
case BPF_FIB_LKUP_RET_NO_NEIGH:
/* Use normal stack to obtain MAC. */
return XDP_PASS;
default:
return XDP_DROP;
}
}
如果经过路由选择后的DNS响应包的发送接口与接收接口不同,则直接返回XDP_PASS交由内核协议栈处理。
如果经过路由选择后发现目的地址被配置为黑洞,不可达,或者禁止状态,DNS请求报文则直接被丢弃。
响应报文的目的mac地址和可能的vlan标记都从路由系统中获取。
2.2.9 将报文传递到user层代码进行处理
/* Forward the packet to user space. */
return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, 0);
将接收到的报文传递到对应的AF_XDP socket的rx ring队列。
2.3 文件最后声明了许可证信息
char _license[] SEC("license") = "GPL";
linux内核中有一部分bpf helper函数被标记为GPL-only,只有声明了GPL兼容许可证的ebpf程序才允许调用它们,否则程序在加载时会被内核拒绝,所以这里将许可证声明为GPL。
【待后续完善】