既然是对协议栈的优化,那么肯定是要对比和传统网络协议栈的区别。
首先看看传统网络数据包从客户端到服务器的走向吧
再稍微详细看看数据包是怎么一步步从物理网卡到达上层应用的吧
通过上图,可以清楚地看到一个网络数据包到来之后的流向:网卡收到数据包之后,通过驱动将数据包送到内核中的 sk_buff(skb),之后 skb 一路经过内核 netfilter 框架流程,再通过 socket 送到上层应用中。如果要仔细分析这个流程,怕是没有个三天三夜根本讲不清楚。这个专栏主要记录 DPDK 相关内容,所以关于内核协议栈就放到以后有时间再详述了。
可以看到传统的网络协议栈层级分明,每层处理的逻辑也很清晰。但是之前我们写的 DPDK 程序,不管是收包还是发包,或者是对不同数据包的处理,全都在一个 main 函数中。当功能单一、不复杂时,这样是可以满足需求的;但是随着后面不断开发与完善,我们的程序无疑会变得非常繁杂。所以 DPDK 模仿传统网络协议栈衍生出一套自己的体系。
传统网络架构与 DPDK(Data Plane Development Kit)网络架构之间存在许多区别,而 DPDK 的优势主要体现在以下几个方面:
数据包处理性能:传统网络架构中,网络数据包的处理通常由操作系统的网络协议栈负责,涉及多次内核态和用户态的切换,以及复杂的协议处理。这种方式对于高速数据包处理来说会产生较大的性能开销。而 DPDK 提供了一个用户空间的数据平面库,绕过了操作系统的网络协议栈,直接操作硬件和内存,实现了零拷贝和零中断的高效数据包处理,从而显著提升了数据包处理性能。
硬件抽象:传统网络架构中,网络设备的驱动程序是与特定硬件和操作系统紧密耦合的,不同的硬件需要编写不同的驱动程序。而 DPDK 提供了通用的抽象层,使得网络设备驱动可以更容易地在不同的硬件和操作系统上移植和使用,降低了硬件的依赖性。
高性能队列:传统网络架构中,操作系统提供的网络队列通常具有较高的延迟和较低的吞吐量,限制了数据包处理的性能。而 DPDK 借鉴 Linux 内核的无锁环形缓冲 kfifo,实现了优化后的无锁环形队列,针对单个或多个数据包生产者、单个数据包消费者的出入队列提供无锁机制,有效减少系统开销。
多核支持:传统网络架构中,由于操作系统的网络协议栈通常在单个核心上运行,无法充分利用多核处理器的性能。而 DPDK 支持多核并行处理,能够将数据包处理任务分配到多个核心上并行执行,从而充分利用多核处理器的性能优势。
采用HugePage,减少TLB Miss,降低访存开销;
采用精巧的内存池技术,创建 Mbuf 直接映射到实际报文,内核空间和用户空间的内存交互不进行拷贝,只做控制权转移,避免拷贝开销;
利用 CPU 亲和性,将线程绑定到指定 CPU 上,一方面减少了 CPU 线程间切换的开销,另一方面避免了 CPU 缓存的局部失效性,增加了 CPU 缓存的命中率;
总体而言,DPDK 的优势在于其高性能、低延迟、硬件抽象和多核支持等特点,使得它成为高性能网络应用的理想选择。它被广泛应用于网络功能虚拟化、数据中心网络、云计算等场景,能够实现高速数据包的处理和转发,满足现代网络对性能和效率的要求。
因为暂时的功能不多,先实现上图中的框架,之后有新功能再不断完善。
这次代码太多先上伪代码
// Pseudo-code outline of the protocol-stack thread (not compilable as written).
int pkt_process{
udp_process(); // handle a received UDP packet
rte_ring_mp_enqueue(); // move the data taken from ring->in into the new ring host->rcvbuf
pthread_cond_signal(&host->cond);// condition variable: tell the udp server that data is waiting
udp_out();// take data from host->sndbuf, build the UDP response and put it on the ring ring->out
}
// Pseudo-code outline of the UDP server thread, built on the socket-like n* calls.
int udp_server_entry(){
nsocket();
nbind();
nrecvfrom();// take data from host->rcvbuf
nsendto();// write the data to be sent into host->sndbuf
}
// Pseudo-code outline of the main lcore: create the rings, launch the workers, then poll the NIC.
int main{
// create the rings and get ready to receive packets
ring->in = rte_ring_create("in ring",RING_SIZE,rte_socket_id(),RING_F_SP_ENQ | RING_F_SC_DEQ);
ring->out = rte_ring_create("out ring",RING_SIZE,rte_socket_id(),RING_F_SP_ENQ | RING_F_SC_DEQ);
// launch the packet-processing thread of the user-space protocol stack
rte_eal_remote_launch(pkt_process,mbuf_pool,lcore_id);
// launch the udp server thread
rte_eal_remote_launch(udp_server_entry,mbuf_pool,lcore_id);
while(1){
// push received packets straight into the ring ring->in
rte_eth_rx_burst();
rte_ring_sp_enqueue_burst();
// take packets from the ring ring->out and transmit them
rte_ring_sc_dequeue_burst();
rte_eth_tx_burst();
}
}
然后是完整代码
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include <arpa/inet.h>
#include <rte_eal.h>
#include <rte_ethdev.h>
#include <rte_mbuf.h>
#include <rte_malloc.h>
#include <rte_timer.h>
#include <rte_ring.h>
// Non-zero enables the extra debug code guarded by `#if DEBUG_LEVEL` below.
#define DEBUG_LEVEL 0
#include "arp.h"
// Compile-time feature switches (1 = enabled); each one guards `#if` sections further down.
#define ENABLE_SEND 1
#define ENABLE_ARP 1
#define ENABLE_ICMP 1
#define ENABLE_ARP_REPLY 1
#define ENABLE_DEBUG 1
#define ENABLE_TIMER 1
#define ENABLE_RINGBUFFER 1
#define ENABLE_MULTHREAD 1
#define ENABLE_UDP_APP 1
#define NUM_MBUFS (4096-1) // number of mbufs requested for the packet pool in main()
#define BURST_SIZE 32 // maximum number of packets moved per rx/tx/ring burst
#define RING_SIZE 1024 // size passed to every rte_ring_create() call in this file
// TSC cycles main() lets pass between rte_timer_manage() calls. Original note:
// 10ms * 1000 = 10s * 6 (about 60 s; presumably assumes a ~2 GHz TSC - TODO confirm).
#define TIMER_RESOLUTION_CYCLES 120000000000ULL
#if ENABLE_SEND
/*
 * Pack the dotted-quad octets a.b.c.d into a uint32_t with `a` in the least
 * significant byte (the in-memory layout of a network-order address on a
 * little-endian host).
 * Every argument is parenthesised and widened to uint32_t before shifting, so
 * expression arguments group correctly and octets >= 128 never shift into the
 * sign bit of an int.
 */
#define MAKE_IPV4_ADDR(a, b, c, d) \
    ((uint32_t)(a) + ((uint32_t)(b) << 8) + ((uint32_t)(c) << 16) + ((uint32_t)(d) << 24))
// IPv4 address this stack answers for (compared against ARP targets, used as ARP sender IP).
static uint32_t gLocalIp = MAKE_IPV4_ADDR(192, 168, 101, 83);
// Addressing used by ng_encode_udp_pkt(); nothing in this file assigns
// gSrcIp, gDstIp, gDstMac, gSrcPort or gDstPort.
static uint32_t gSrcIp;
static uint32_t gDstIp;
// MAC address of the DPDK port; main() fills it through rte_eth_macaddr_get().
static uint8_t gSrcMac[RTE_ETHER_ADDR_LEN];
static uint8_t gDstMac[RTE_ETHER_ADDR_LEN];
static uint16_t gSrcPort;
static uint16_t gDstPort;
#endif
#if ENABLE_ARP_REPLY
// Placeholder MAC handed to ng_send_arp() when the ARP table has no entry for the target.
static uint8_t gDefaultArpMac[RTE_ETHER_ADDR_LEN] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
#endif
#if ENABLE_RINGBUFFER
/* Pair of rings that connects the NIC polling loop in main() with the stack. */
struct inout_ring {
    struct rte_ring *in;   /* packets received by main(), consumed by pkt_process() */
    struct rte_ring *out;  /* packets queued for transmission, drained by main() */
};

static struct inout_ring *rInst = NULL;

/*
 * Return the single ring pair, allocating and zero-filling it on first use.
 *
 * Returns NULL when the allocation fails; the pointer is cleared only after
 * the NULL check, so a failed rte_malloc() is no longer dereferenced.
 * No locking is done here.
 */
static struct inout_ring *ringInstance(void)
{
    if (rInst == NULL) {
        rInst = rte_malloc("in/out ring", sizeof(*rInst), 0);
        if (rInst == NULL) {
            return NULL;
        }
        memset(rInst, 0, sizeof(*rInst));
    }
    return rInst;
}
#endif
#if ENABLE_UDP_APP
// Forward declarations: pkt_process() calls both functions before their definitions below.
static int udp_process(struct rte_mbuf *udpmbuf);
static int udp_out(struct rte_mempool *mbuf_pool);
#endif
int gDpdkPortId = 0;// DPDK port id used for every rx/tx call in this file (the original note calls it eth0)
// Default port configuration copied by ng_init_port(); only the maximum RX packet length is set.
static const struct rte_eth_conf port_conf_default = {
.rxmode = {.max_rx_pkt_len = RTE_ETHER_MAX_LEN} // RTE_ETHER_MAX_LEN: maximum Ethernet frame length, normally 1518
};
/*
 * Configure and start the Ethernet port gDpdkPortId with one RX queue and,
 * when ENABLE_SEND is set, one TX queue (1024 descriptors each).
 *
 * mbuf_pool: pool the RX queue takes its receive buffers from.
 *
 * Every failure ends the process through rte_exit().
 */
static void ng_init_port(struct rte_mempool *mbuf_pool)
{
    /* Number of Ethernet devices usable by this process. */
    uint16_t nb_sys_ports = rte_eth_dev_count_avail();
    if (nb_sys_ports == 0) {
        rte_exit(EXIT_FAILURE, "No Supported eth found\n");
    }

    /* Query the device's properties; default_txconf is used for the TX queue below. */
    struct rte_eth_dev_info dev_info;
    rte_eth_dev_info_get(gDpdkPortId, &dev_info);

    const int num_rx_queues = 1; /* number of RX queues, not their depth */
    const int num_tx_queues = 1;
    struct rte_eth_conf port_conf = port_conf_default;

    /* A port that failed to configure cannot have queues set up; stop here. */
    if (rte_eth_dev_configure(gDpdkPortId, num_rx_queues, num_tx_queues, &port_conf) < 0) {
        rte_exit(EXIT_FAILURE, "Could not configure port\n");
    }

    if (rte_eth_rx_queue_setup(gDpdkPortId, 0, 1024,
            rte_eth_dev_socket_id(gDpdkPortId), NULL, mbuf_pool) < 0) {
        rte_exit(EXIT_FAILURE, "Could not setup RX queue\n");
    }

#if ENABLE_SEND
    struct rte_eth_txconf txq_conf = dev_info.default_txconf;
    /* TX queue offloads follow the port's TX mode (the RX mode was used by mistake). */
    txq_conf.offloads = port_conf.txmode.offloads;
    if (rte_eth_tx_queue_setup(gDpdkPortId, 0, 1024,
            rte_eth_dev_socket_id(gDpdkPortId), &txq_conf) < 0) {
        rte_exit(EXIT_FAILURE, "Could not setup TX queue\n");
    }
#endif

    /* Start the port so it can receive and transmit packets. */
    if (rte_eth_dev_start(gDpdkPortId) < 0) {
        rte_exit(EXIT_FAILURE, "Could not start\n");
    }
}
static int ng_encode_udp_pkt(uint8_t *msg,uint8_t *data,uint16_t total_len){
//构造以太网头部(Ethernet Header),并将源MAC地址、目的MAC地址以及以太网类型(Ethernet Type)进行填充
struct rte_ether_hdr *eth = (struct rte_ether_hdr *)msg;
rte_memcpy(eth->s_addr.addr_bytes,gSrcMac,RTE_ETHER_ADDR_LEN);
rte_memcpy(eth->d_addr.addr_bytes, gDstMac, RTE_ETHER_ADDR_LEN);
eth->ether_type = htons(RTE_ETHER_TYPE_IPV4);
//构造IPv4头部(IPv4 Header)
struct rte_ipv4_hdr *ip = (struct rte_ipv4_hdr *)(msg +sizeof(struct rte_ether_hdr));
ip->version_ihl = 0x45;
ip->type_of_service = 0;
ip->total_length = htons(total_len- sizeof(struct rte_ether_hdr));
ip->packet_id = 0;
ip->fragment_offset = 0;//fragment_offset 被设置为0,表示数据包不进行分片。
ip->time_to_live = 64; //ttl = 64
ip->next_proto_id = IPPROTO_UDP;
ip->src_addr = gSrcIp;
ip->dst_addr = gDstIp;
ip->hdr_checksum = 0;
ip->hdr_checksum = rte_ipv4_cksum(ip);
//构造UDP头部(UDP Header)
struct rte_udp_hdr *udp = (struct rte_udp_hdr *)(msg +sizeof(struct rte_ether_hdr) + sizeof(struct rte_ipv4_hdr));
udp->src_port = gSrcPort;
udp->dst_port = gDstPort;
uint16_t udplen = total_len - sizeof(struct rte_ether_hdr) - sizeof(struct rte_ipv4_hdr);
udp->dgram_len = htons(udplen);
const char *source_str = "send day 2 by zxk";
strcpy((char *)data, source_str);
rte_memcpy((uint8_t *)(udp+1),data,udplen);
udp->dgram_cksum = 0;
udp->dgram_cksum = rte_ipv4_udptcp_cksum(ip,udp);
struct in_addr addr;
addr.s_addr = gSrcIp;
printf(" zxk_send--> src: %s:%d, ", inet_ntoa(addr), ntohs(gSrcPort));
addr.s_addr = gDstIp;
printf("zxk_send dst: %s:%d\n", inet_ntoa(addr), ntohs(gDstPort));
return 0;
}
/*
 * Allocate an mbuf from mbuf_pool and fill it with a UDP frame carrying
 * `length` payload bytes, encoded by ng_encode_udp_pkt().
 * Exits the process when no mbuf can be allocated; returns the mbuf otherwise.
 */
static struct rte_mbuf *ng_send_udp(struct rte_mempool *mbuf_pool,uint8_t *data,uint16_t length)
{
    /* 42 = Ethernet header (14) + IPv4 header (20) + UDP header (8). */
    const unsigned frame_len = 42 + length;

    struct rte_mbuf *pkt = rte_pktmbuf_alloc(mbuf_pool);
    if (pkt == NULL) {
        rte_exit(EXIT_FAILURE,"rte_pktmbuf_alloc fail\n");
    }

    /* The whole frame lives in this single segment. */
    pkt->data_len = frame_len;
    pkt->pkt_len = frame_len;

    ng_encode_udp_pkt(rte_pktmbuf_mtod(pkt, uint8_t *), data, frame_len);
    return pkt;
}
#if ENABLE_ARP
/*
 * Write an Ethernet + ARP frame into msg.
 *
 * msg:     destination buffer, large enough for both headers.
 * opcode:  ARP operation in host byte order (e.g. RTE_ARP_OP_REQUEST).
 * dst_mac: MAC of the peer, or gDefaultArpMac (ff:ff:ff:ff:ff:ff) when the
 *          peer's MAC is not known yet.
 * sip:     sender IPv4 address, stored as given.
 * dip:     target IPv4 address, stored as given.
 *
 * When the MAC is unknown the frame goes to the Ethernet broadcast address and
 * the ARP target hardware address is zeroed. The previous code did the
 * opposite (all-zero Ethernet destination, ff:.. as target hardware address).
 *
 * Always returns 0.
 */
static int ng_encode_arp_pkt(uint8_t *msg, uint16_t opcode, uint8_t *dst_mac, uint32_t sip, uint32_t dip)
{
    /* MAC addresses are raw bytes that may contain 0x00, so compare with memcmp. */
    const int mac_unknown = (memcmp(dst_mac, gDefaultArpMac, RTE_ETHER_ADDR_LEN) == 0);

    /* Ethernet header. The placeholder equals the broadcast address, so
     * copying dst_mac yields a broadcast frame when the MAC is unknown. */
    struct rte_ether_hdr *eth = (struct rte_ether_hdr *)msg;
    rte_memcpy(eth->s_addr.addr_bytes, gSrcMac, RTE_ETHER_ADDR_LEN);
    rte_memcpy(eth->d_addr.addr_bytes, dst_mac, RTE_ETHER_ADDR_LEN);
    eth->ether_type = htons(RTE_ETHER_TYPE_ARP);

    /* ARP header */
    struct rte_arp_hdr *arp = (struct rte_arp_hdr *)(eth + 1);
    arp->arp_hardware = htons(1);                   /* 1 = Ethernet */
    arp->arp_protocol = htons(RTE_ETHER_TYPE_IPV4);
    arp->arp_hlen = RTE_ETHER_ADDR_LEN;             /* hardware address length */
    arp->arp_plen = sizeof(uint32_t);               /* IPv4 address length */
    arp->arp_opcode = htons(opcode);

#if DEBUG_LEVEL
    /* Debug only: replace the source MAC with a fixed test address. */
    const char* mac_address = "00:11:22:33:44:55";
    sscanf(mac_address, "%2hhx:%2hhx:%2hhx:%2hhx:%2hhx:%2hhx",
        &gSrcMac[0], &gSrcMac[1], &gSrcMac[2], &gSrcMac[3], &gSrcMac[4], &gSrcMac[5]);
#endif
    rte_memcpy(arp->arp_data.arp_sha.addr_bytes, gSrcMac, RTE_ETHER_ADDR_LEN);

    if (mac_unknown) {
        memset(arp->arp_data.arp_tha.addr_bytes, 0, RTE_ETHER_ADDR_LEN);
    } else {
        rte_memcpy(arp->arp_data.arp_tha.addr_bytes, dst_mac, RTE_ETHER_ADDR_LEN);
    }
    arp->arp_data.arp_sip = sip;
    arp->arp_data.arp_tip = dip;
    return 0;
}
/*
 * Allocate an mbuf from mbuf_pool and fill it with an ARP frame built by
 * ng_encode_arp_pkt(). Exits the process when no mbuf can be allocated.
 */
static struct rte_mbuf *ng_send_arp(struct rte_mempool *mbuf_pool, uint16_t opcode, uint8_t *dst_mac, uint32_t sip, uint32_t dip)
{
    /* An ARP frame is just the Ethernet header followed by the ARP header. */
    const unsigned frame_len = sizeof(struct rte_arp_hdr) + sizeof(struct rte_ether_hdr);

    struct rte_mbuf *frame = rte_pktmbuf_alloc(mbuf_pool);
    if (frame == NULL) {
        rte_exit(EXIT_FAILURE,"rte_pktmbuf_alloc fail\n");
    }

    frame->data_len = frame_len;
    frame->pkt_len = frame_len;

    ng_encode_arp_pkt(rte_pktmbuf_mtod(frame, uint8_t *), opcode, dst_mac, sip, dip);
    return frame;
}
#endif
#if ENABLE_ICMP
/*
 * Ones'-complement checksum over `count` bytes starting at addr.
 *
 * The data is summed as 16-bit words read in host order; a trailing odd byte
 * is added as-is. Carries above bit 15 are folded back in and the
 * complemented low 16 bits are returned.
 */
static uint16_t ng_checksum(uint16_t *addr, int count)
{
    long acc = 0;
    const uint16_t *word = addr;
    int remaining;

    /* Whole 16-bit words first. */
    for (remaining = count; remaining > 1; remaining -= 2) {
        acc += *word++;
    }

    /* Then the single leftover byte, if the length was odd. */
    if (remaining > 0) {
        acc += *(const unsigned char *)word;
    }

    /* Fold the carries back into the low 16 bits. */
    while ((acc >> 16) != 0) {
        acc = (acc >> 16) + (acc & 0xffff);
    }

    return (uint16_t)~acc;
}
/*
 * Write an Ethernet + IPv4 + ICMP echo-reply frame into msg.
 *
 * dst_mac:   destination MAC address.
 * sip, dip:  source and destination IPv4 addresses, stored as given.
 * id, seqnb: ICMP identifier and sequence number, stored as given.
 *
 * Only the ICMP header is emitted (no echo data). Always returns 0.
 */
static int ng_encode_icmp_pkt(uint8_t *msg,uint8_t *dst_mac,
        uint32_t sip,uint32_t dip,uint16_t id,uint16_t seqnb)
{
    /* The three headers sit back to back in the buffer. */
    struct rte_ether_hdr *ether = (struct rte_ether_hdr *)msg;
    struct rte_ipv4_hdr *ipv4 = (struct rte_ipv4_hdr *)(ether + 1);
    struct rte_icmp_hdr *echo = (struct rte_icmp_hdr *)(ipv4 + 1);

    /* 1. Ethernet header */
    rte_memcpy(ether->s_addr.addr_bytes, gSrcMac, RTE_ETHER_ADDR_LEN);
    rte_memcpy(ether->d_addr.addr_bytes, dst_mac, RTE_ETHER_ADDR_LEN);
    ether->ether_type = htons(RTE_ETHER_TYPE_IPV4);

    /* 2. IPv4 header */
    ipv4->version_ihl = 0x45;
    ipv4->type_of_service = 0;
    ipv4->total_length = htons(sizeof(struct rte_ipv4_hdr) + sizeof(struct rte_icmp_hdr));
    ipv4->packet_id = 0;
    ipv4->fragment_offset = 0;
    ipv4->time_to_live = 64;
    ipv4->next_proto_id = IPPROTO_ICMP;
    ipv4->src_addr = sip;
    ipv4->dst_addr = dip;
    ipv4->hdr_checksum = 0; /* must be zero while the checksum is computed */
    ipv4->hdr_checksum = rte_ipv4_cksum(ipv4);

    /* 3. ICMP header */
    echo->icmp_type = RTE_IP_ICMP_ECHO_REPLY;
    echo->icmp_code = 0;
    echo->icmp_ident = id;
    echo->icmp_seq_nb = seqnb;
    echo->icmp_cksum = 0;
    echo->icmp_cksum = ng_checksum((uint16_t *)echo, sizeof(struct rte_icmp_hdr));

    return 0;
}
/*
 * Allocate an mbuf from mbuf_pool and fill it with an ICMP echo reply built by
 * ng_encode_icmp_pkt(). Exits the process when no mbuf can be allocated.
 */
static struct rte_mbuf *ng_send_icmp(struct rte_mempool *mbuf_pool,uint8_t *dst_mac,
        uint32_t sip, uint32_t dip, uint16_t id, uint16_t seqnb)
{
    const unsigned frame_len = sizeof(struct rte_icmp_hdr) + sizeof(struct rte_ipv4_hdr)
            + sizeof(struct rte_ether_hdr);

    struct rte_mbuf *reply = rte_pktmbuf_alloc(mbuf_pool);
    if (reply == NULL) {
        rte_exit(EXIT_FAILURE, "rte_pktmbuf_alloc fail\n");
    }

    reply->data_len = frame_len;
    reply->pkt_len = frame_len;

    ng_encode_icmp_pkt(rte_pktmbuf_mtod(reply, uint8_t *), dst_mac, sip, dip, id, seqnb);
    return reply;
}
#endif
/* Print `name` immediately followed by eth_addr in its textual form (no newline). */
static void
print_ethaddr(const char *name, const struct rte_ether_addr *eth_addr)
{
    char text[RTE_ETHER_ADDR_FMT_SIZE];

    rte_ether_format_addr(text, sizeof(text), eth_addr);
    printf("%s%s", name, text);
}
#if ENABLE_TIMER
/*
 * Timer callback: queue one ARP request for every host .1 to .254 of the
 * local address's /24 on the out ring.
 *
 * tim: unused (the attribute silences the unused-parameter warning).
 * arg: the struct rte_mempool * the request mbufs are allocated from.
 *
 * A request that does not fit on the ring is freed instead of leaked.
 */
static void
arp_request_timer_cb(__attribute__((unused)) struct rte_timer *tim,void *arg)
{
    struct rte_mempool *mbuf_pool = (struct rte_mempool *)arg;
    struct inout_ring *ring = ringInstance();
    if (ring == NULL || ring->out == NULL) {
        return;
    }

    /* Unsigned counter: shifting an int value >= 128 left by 24 overflows. */
    uint32_t host;
    for (host = 1; host <= 254; host++) {
        /* Keep the three low bytes of the local address, replace the top byte. */
        uint32_t dstip = (gLocalIp & 0x00FFFFFF) | (host << 24);
#if DEBUG_LEVEL
        struct in_addr addr;
        addr.s_addr = dstip;
        printf("arp ---> src: %s \n", inet_ntoa(addr));
#endif
        /* Use the MAC from the ARP table when there is one, else the placeholder. */
        uint8_t *dstmac = ng_get_dst_macaddr(dstip);
        struct rte_mbuf *arpbuf = ng_send_arp(mbuf_pool, RTE_ARP_OP_REQUEST,
                dstmac != NULL ? dstmac : gDefaultArpMac, gLocalIp, dstip);

        if (rte_ring_mp_enqueue_burst(ring->out, (void **)&arpbuf, 1, NULL) == 0) {
            rte_pktmbuf_free(arpbuf);
        }
    }
}
#endif
#if ENABLE_MULTHREAD
#if ENABLE_ARP
/*
 * Handle one received ARP frame.
 *
 * Requests for gLocalIp are answered with an ARP reply queued on ring->out;
 * replies addressed to gLocalIp add the sender to the ARP table when it has
 * no entry yet. Everything else is ignored.
 *
 * The caller keeps ownership of mbuf and frees it afterwards.
 */
static void pkt_handle_arp(struct rte_mempool *mbuf_pool, struct inout_ring *ring,
        struct rte_mbuf *mbuf)
{
    /* The ARP header follows the Ethernet header directly. */
    struct rte_arp_hdr *ahdr = rte_pktmbuf_mtod_offset(mbuf,
            struct rte_arp_hdr *, sizeof(struct rte_ether_hdr));

    struct in_addr addr;
    addr.s_addr = ahdr->arp_data.arp_tip;
    printf("zxk arp ---> src: %s ", inet_ntoa(addr));
    addr.s_addr = gLocalIp;
    printf("zxk local: %s \n", inet_ntoa(addr));

    /* Only ARP traffic targeting our own address is processed. */
    if (ahdr->arp_data.arp_tip != gLocalIp) {
        return;
    }

    if (ahdr->arp_opcode == rte_cpu_to_be_16(RTE_ARP_OP_REQUEST)) {
        printf("arp --> request\n");
        struct rte_mbuf *arpbuf = ng_send_arp(mbuf_pool, RTE_ARP_OP_REPLY,
                ahdr->arp_data.arp_sha.addr_bytes,
                ahdr->arp_data.arp_tip, ahdr->arp_data.arp_sip);
        if (rte_ring_mp_enqueue_burst(ring->out, (void **)&arpbuf, 1, NULL) == 0) {
            rte_pktmbuf_free(arpbuf); /* ring full: drop the reply, do not leak it */
        }
        return;
    }

    if (ahdr->arp_opcode != rte_cpu_to_be_16(RTE_ARP_OP_REPLY)) {
        return;
    }

    printf("arp --> reply\n");
    struct arp_table *table = arp_table_instance();

    /* Learn the sender's MAC unless the table already has an entry for its IP. */
    if (ng_get_dst_macaddr(ahdr->arp_data.arp_sip) == NULL) {
        struct arp_entry *entry = rte_malloc("arp_entry", sizeof(struct arp_entry), 0);
        if (entry) {
            memset(entry, 0, sizeof(struct arp_entry));
            entry->ip = ahdr->arp_data.arp_sip;
            rte_memcpy(entry->hwaddr, ahdr->arp_data.arp_sha.addr_bytes, RTE_ETHER_ADDR_LEN);
            entry->type = 0;
            LL_ADD(entry, table->entries);
            table->count ++;
        }
    }
#if ENABLE_DEBUG
    /* Dump every ARP table entry (IP and MAC). */
    struct arp_entry *iter;
    for (iter = table->entries; iter != NULL; iter = iter->next) {
        struct in_addr entry_ip;
        entry_ip.s_addr = iter->ip;
        print_ethaddr("arp table --> mac: ", (struct rte_ether_addr *)iter->hwaddr);
        printf(" ip: %s \n", inet_ntoa(entry_ip));
    }
#endif
}
#endif

#if ENABLE_ICMP
/*
 * Handle one received ICMP packet: answer echo requests with an echo reply
 * queued on ring->out and ignore every other ICMP type.
 *
 * The caller keeps ownership of the received mbuf that ehdr/iphdr point into.
 */
static void pkt_handle_icmp(struct rte_mempool *mbuf_pool, struct inout_ring *ring,
        struct rte_ether_hdr *ehdr, struct rte_ipv4_hdr *iphdr)
{
    struct rte_icmp_hdr *icmphdr = (struct rte_icmp_hdr *)(iphdr + 1);

    struct in_addr addr;
    addr.s_addr = iphdr->src_addr;
    printf("zxk_icmp ---> src: %s ", inet_ntoa(addr));

    if (icmphdr->icmp_type != RTE_IP_ICMP_ECHO_REQUEST) {
        return;
    }

    addr.s_addr = iphdr->dst_addr;
    printf("zxk local: %s , type : %d\n", inet_ntoa(addr), icmphdr->icmp_type);

    /* Swap source and destination for the reply. */
    struct rte_mbuf *txbuf = ng_send_icmp(mbuf_pool, ehdr->s_addr.addr_bytes,
            iphdr->dst_addr, iphdr->src_addr, icmphdr->icmp_ident, icmphdr->icmp_seq_nb);
    if (rte_ring_mp_enqueue_burst(ring->out, (void **)&txbuf, 1, NULL) == 0) {
        rte_pktmbuf_free(txbuf); /* ring full: drop the reply, do not leak it */
    }
}
#endif

/*
 * Packet-processing thread of the user-space protocol stack.
 *
 * arg: the struct rte_mempool * that reply packets are allocated from.
 *
 * Loops forever: takes up to BURST_SIZE packets from ring->in, dispatches them
 * by EtherType and IP protocol, then calls udp_out() (when ENABLE_UDP_APP).
 *
 * mbuf ownership: udp_process() frees the mbuf it is given on every path; all
 * other packets are freed here exactly once. Earlier, most non-UDP packets
 * were never freed, and headers were read after udp_process() had freed them.
 */
static int pkt_process(void *arg)
{
    struct rte_mempool *mbuf_pool = (struct rte_mempool *)arg;
    struct inout_ring *ring = ringInstance();

    while (1) {
        struct rte_mbuf *mbufs[BURST_SIZE];
        unsigned num_recvd = rte_ring_mc_dequeue_burst(ring->in, (void **)mbufs, BURST_SIZE, NULL);
        unsigned i = 0;
        for (i = 0; i < num_recvd; i++) {
            struct rte_mbuf *mbuf = mbufs[i];
            /* The frame starts with the Ethernet header. */
            struct rte_ether_hdr *ehdr = rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *);

#if ENABLE_ARP
            if (ehdr->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_ARP)) {
                pkt_handle_arp(mbuf_pool, ring, mbuf);
                rte_pktmbuf_free(mbuf);
                continue;
            }
#endif
            /* ether_type is big-endian on the wire, so convert the constant. */
            if (ehdr->ether_type != rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4)) {
                rte_pktmbuf_free(mbuf);
                continue;
            }

            /* The IPv4 header follows the Ethernet header. */
            struct rte_ipv4_hdr *iphdr = rte_pktmbuf_mtod_offset(mbuf, struct rte_ipv4_hdr *,
                    sizeof(struct rte_ether_hdr));

            if (iphdr->next_proto_id == IPPROTO_UDP) {
                udp_process(mbuf); /* takes ownership of mbuf */
                continue;
            }
#if ENABLE_ICMP
            if (iphdr->next_proto_id == IPPROTO_ICMP) {
                pkt_handle_icmp(mbuf_pool, ring, ehdr, iphdr);
            }
#endif
            rte_pktmbuf_free(mbuf);
        }
#if ENABLE_UDP_APP
        udp_out(mbuf_pool);
#endif
    }
    return 0;
}
#endif
#if ENABLE_UDP_APP
// Per-socket state of the user-space UDP stack; instances are chained in the list headed by lhost.
struct localhost{
int fd; // descriptor handed out by nsocket()
uint32_t localip; // bound IPv4 address, copied from sin_addr in nbind()
uint8_t localmac[RTE_ETHER_ADDR_LEN]; // copy of gSrcMac taken in nbind()
uint16_t localport; // bound port, copied from sin_port in nbind()
uint8_t protocol; // IPPROTO_UDP for SOCK_DGRAM sockets, otherwise left 0
struct rte_ring *sndbuf; // struct offload items queued by nsendto(), drained by udp_out()
struct rte_ring *rcvbuf; // struct offload items queued by udp_process(), drained by nrecvfrom()
struct localhost *prev; // list links; presumably maintained by LL_ADD/LL_REMOVE - TODO confirm in arp.h
struct localhost *next;
pthread_cond_t cond; // signalled by udp_process(), waited on in nrecvfrom()
pthread_mutex_t mutex; // held around the cond wait/signal
};
// Head of the socket list; NULL while no socket exists.
static struct localhost *lhost = NULL;
#define DEFAULT_FD_NUM 3

/*
 * Hand out a socket descriptor. There is no bitmap yet: every call returns
 * the constant DEFAULT_FD_NUM.
 */
static int get_fd_frombitmap(void)
{
    return DEFAULT_FD_NUM;
}
static struct localhost *get_hostinfo_fromfd(int sockfd){
struct localhost *host;
for(host = lhost; host!=NULL;host = host->next){
if(sockfd == host->fd){
return host;
}
}
return NULL;
}
static struct localhost *get_hostinfo_fromip_port(uint32_t dip, uint16_t port, uint8_t proto){
struct localhost *host;
for (host = lhost; host != NULL;host = host->next) {
if (dip == host->localip && port == host->localport && proto == host->protocol) {
return host;
}
}
return NULL;
}
// One datagram handed between the stack and the application through a socket's rcvbuf/sndbuf ring.
struct offload{
uint32_t sip; // source IPv4 address, copied as-is from the packet or the socket
uint32_t dip; // destination IPv4 address
uint16_t sport; // source port, copied as-is from the UDP header or the socket
uint16_t dport; // destination port
int protocol; // IP protocol number; udp_process() stores IPPROTO_UDP
unsigned char *data; // rte_malloc'ed copy of the payload
uint16_t length; // byte count recorded for data by udp_process() / nsendto()
};
/*
用于处理接收到的UDP数据包。函数的主要功能是解析UDP数据包的头部信息,并将相关信息存储在一个名为offload的结构体中,
然后将结构体放入接收缓冲区进行后续处理。
*/
static int udp_process(struct rte_mbuf *udpmbuf){
struct rte_ipv4_hdr *iphdr = rte_pktmbuf_mtod_offset(udpmbuf, struct rte_ipv4_hdr *,
sizeof(struct rte_ether_hdr));
//(iphdr + 1) +1指的是偏移rte_ipv4_hdr(iphdr类型)大小
struct rte_udp_hdr *udphdr = (struct rte_udp_hdr *)(iphdr + 1);
struct in_addr addr;
addr.s_addr = iphdr->src_addr;
printf("udp_process ---> src: %s:%d \n", inet_ntoa(addr), ntohs(udphdr->src_port));
printf("zxk udp data: %s\n",(unsigned char *)(udphdr+1));
struct localhost *host = get_hostinfo_fromip_port(iphdr->dst_addr, udphdr->dst_port, iphdr->next_proto_id);
if (host == NULL) {
rte_pktmbuf_free(udpmbuf);
return -3;
}
struct offload *ol = rte_malloc("offload", sizeof(struct offload), 0);
if (ol == NULL) {
rte_pktmbuf_free(udpmbuf);
return -1;
}
ol->dip = iphdr->dst_addr;
ol->sip = iphdr->src_addr;
ol->sport = udphdr->src_port;
ol->dport = udphdr->dst_port;
ol->protocol = IPPROTO_UDP;
ol->length = ntohs(udphdr->dgram_len);
ol->data = rte_malloc("unsigned char*", ol->length - sizeof(struct rte_udp_hdr), 0);
if (ol->data == NULL) {
rte_pktmbuf_free(udpmbuf);
rte_free(ol);
return -2;
}
//这里返回的数据只是简单拷贝收到的数据内容,后续可以根据需求完善
rte_memcpy(ol->data, (unsigned char *)(udphdr+1), ol->length - sizeof(struct rte_udp_hdr));
rte_ring_mp_enqueue(host->rcvbuf, ol); // recv buffer
//通过互斥锁和条件变量通知相关线程有新数据可处理
pthread_mutex_lock(&host->mutex);
pthread_cond_signal(&host->cond);
pthread_mutex_unlock(&host->mutex);
rte_pktmbuf_free(udpmbuf);
return 0;
}
static int ng_encode_udp_apppkt(uint8_t *msg, uint32_t sip, uint32_t dip,
uint16_t sport, uint16_t dport, uint8_t *srcmac, uint8_t *dstmac,
unsigned char *data, uint16_t total_len) {
// encode
// 1 ethhdr
struct rte_ether_hdr *eth = (struct rte_ether_hdr *)msg;
rte_memcpy(eth->s_addr.addr_bytes, srcmac, RTE_ETHER_ADDR_LEN);
rte_memcpy(eth->d_addr.addr_bytes, dstmac, RTE_ETHER_ADDR_LEN);
eth->ether_type = htons(RTE_ETHER_TYPE_IPV4);
// 2 iphdr
struct rte_ipv4_hdr *ip = (struct rte_ipv4_hdr *)(msg + sizeof(struct rte_ether_hdr));
ip->version_ihl = 0x45;
ip->type_of_service = 0;
ip->total_length = htons(total_len - sizeof(struct rte_ether_hdr));
ip->packet_id = 0;
ip->fragment_offset = 0;
ip->time_to_live = 64; // ttl = 64
ip->next_proto_id = IPPROTO_UDP;
ip->src_addr = sip;
ip->dst_addr = dip;
ip->hdr_checksum = 0;
ip->hdr_checksum = rte_ipv4_cksum(ip);
// 3 udphdr
struct rte_udp_hdr *udp = (struct rte_udp_hdr *)(msg + sizeof(struct rte_ether_hdr) + sizeof(struct rte_ipv4_hdr));
udp->src_port = sport;
udp->dst_port = dport;
uint16_t udplen = total_len - sizeof(struct rte_ether_hdr) - sizeof(struct rte_ipv4_hdr);
udp->dgram_len = htons(udplen);
rte_memcpy((uint8_t*)(udp+1), data, udplen);
udp->dgram_cksum = 0;
udp->dgram_cksum = rte_ipv4_udptcp_cksum(ip, udp);
return 0;
}
/*
 * Allocate an mbuf from mbuf_pool and fill it with a UDP frame carrying
 * `length` bytes of data, encoded by ng_encode_udp_apppkt().
 * Exits the process when no mbuf can be allocated.
 */
static struct rte_mbuf * ng_udp_pkt(struct rte_mempool *mbuf_pool, uint32_t sip, uint32_t dip,
        uint16_t sport, uint16_t dport, uint8_t *srcmac, uint8_t *dstmac,
        uint8_t *data, uint16_t length)
{
    /* 42 bytes of headers are added in front of the payload. */
    const unsigned frame_len = 42 + length;

    struct rte_mbuf *pkt = rte_pktmbuf_alloc(mbuf_pool);
    if (pkt == NULL) {
        rte_exit(EXIT_FAILURE, "rte_pktmbuf_alloc\n");
    }

    pkt->data_len = frame_len;
    pkt->pkt_len = frame_len;

    ng_encode_udp_apppkt(rte_pktmbuf_mtod(pkt, uint8_t*), sip, dip, sport, dport,
            srcmac, dstmac, data, frame_len);
    return pkt;
}
/*
 * Transmit side of the UDP stack: for every socket, take at most one pending
 * struct offload from its sndbuf ring and turn it into a packet on ring->out.
 *
 * When the destination MAC is not in the ARP table yet, an ARP request is
 * queued instead and the offload is put back on sndbuf for a later attempt.
 * Once the UDP packet has been built, the offload and its data are freed
 * (they used to be leaked). Anything that cannot be queued is freed as well.
 *
 * Always returns 0.
 */
static int udp_out(struct rte_mempool *mbuf_pool)
{
    struct inout_ring *ring = ringInstance();
    struct localhost *host;

    for (host = lhost; host != NULL; host = host->next) {
        struct offload *ol = NULL;
        if (rte_ring_mc_dequeue(host->sndbuf, (void **)&ol) < 0) {
            continue; /* nothing to send on this socket */
        }

        struct in_addr addr;
        addr.s_addr = ol->dip;
        printf("udp_out ---> src: %s:%d \n", inet_ntoa(addr), ntohs(ol->dport));

        uint8_t *dstmac = ng_get_dst_macaddr(ol->dip);
        if (dstmac == NULL) {
            /* Peer MAC unknown: ask for it first and retry this datagram later. */
            struct rte_mbuf *arpbuf = ng_send_arp(mbuf_pool, RTE_ARP_OP_REQUEST, gDefaultArpMac,
                    ol->sip, ol->dip);
            if (rte_ring_mp_enqueue_burst(ring->out, (void **)&arpbuf, 1, NULL) == 0) {
                rte_pktmbuf_free(arpbuf);
            }
            if (rte_ring_mp_enqueue(host->sndbuf, ol) < 0) {
                /* Cannot put it back: drop the datagram rather than leak it. */
                rte_free(ol->data);
                rte_free(ol);
            }
            continue;
        }

        struct rte_mbuf *udpbuf = ng_udp_pkt(mbuf_pool, ol->sip, ol->dip, ol->sport, ol->dport,
                host->localmac, dstmac, ol->data, ol->length);
        if (rte_ring_mp_enqueue_burst(ring->out, (void **)&udpbuf, 1, NULL) == 0) {
            rte_pktmbuf_free(udpbuf);
        }

        /* The payload was copied into the mbuf; the offload is no longer needed. */
        rte_free(ol->data);
        rte_free(ol);
    }
    return 0;
}
/*
 * Create a socket of the user-space stack, modelled on socket(2).
 *
 * domain, protocol: unused.
 * type:             SOCK_DGRAM selects UDP; for other values the protocol
 *                   field stays 0.
 *
 * Allocates the per-socket state, its receive and send rings and its
 * mutex/condition variable, then links it into the lhost list.
 *
 * Returns the new descriptor, or -1 on failure (everything already allocated
 * is released again).
 */
static int nsocket(__attribute__((unused)) int domain, int type, __attribute__((unused)) int protocol)
{
    int fd = get_fd_frombitmap();

    struct localhost *host = rte_malloc("localhost", sizeof(struct localhost), 0);
    if (host == NULL) {
        return -1;
    }
    memset(host, 0, sizeof(struct localhost));

    host->fd = fd;
    if (type == SOCK_DGRAM)
        host->protocol = IPPROTO_UDP;

    /* Ring that receives struct offload items from the stack. */
    host->rcvbuf = rte_ring_create("recv buffer", RING_SIZE, rte_socket_id(), RING_F_SP_ENQ | RING_F_SC_DEQ);
    if (host->rcvbuf == NULL) {
        goto fail_host;
    }

    /* Ring that carries struct offload items to be sent. (This comment lacked
     * its comment marker before and broke the build.) */
    host->sndbuf = rte_ring_create("send buffer", RING_SIZE, rte_socket_id(), RING_F_SP_ENQ | RING_F_SC_DEQ);
    if (host->sndbuf == NULL) {
        goto fail_rcvbuf;
    }

    /* Initialise the synchronisation objects in place rather than by copying
     * statically initialised ones into the structure. */
    if (pthread_cond_init(&host->cond, NULL) != 0) {
        goto fail_sndbuf;
    }
    if (pthread_mutex_init(&host->mutex, NULL) != 0) {
        pthread_cond_destroy(&host->cond);
        goto fail_sndbuf;
    }

    LL_ADD(host, lhost);
    return fd;

fail_sndbuf:
    rte_ring_free(host->sndbuf);
fail_rcvbuf:
    rte_ring_free(host->rcvbuf);
fail_host:
    rte_free(host);
    return -1;
}
/*
 * Bind a socket of the user-space stack, modelled on bind(2).
 *
 * Stores the port and IPv4 address from addr (read as a struct sockaddr_in)
 * together with a copy of gSrcMac in the socket that owns sockfd.
 * addrlen is unused.
 *
 * Returns 0 on success, -1 when sockfd is unknown.
 */
static int nbind(int sockfd, const struct sockaddr *addr,
        __attribute__((unused)) socklen_t addrlen)
{
    struct localhost *sock = get_hostinfo_fromfd(sockfd);
    if (sock == NULL) {
        return -1;
    }

    const struct sockaddr_in *local = (const struct sockaddr_in *)addr;

    sock->localport = local->sin_port;
    rte_memcpy(&sock->localip, &local->sin_addr.s_addr, sizeof(uint32_t));
    rte_memcpy(sock->localmac, gSrcMac, RTE_ETHER_ADDR_LEN);

    return 0;
}
/*
 * Receive one datagram from a socket of the user-space stack, modelled on
 * recvfrom(2). The stack puts received data on rcvbuf; this call blocks until
 * an item is available there.
 *
 * sockfd:   descriptor returned by nsocket().
 * buf, len: destination buffer and its size.
 * flags, addrlen: unused.
 * src_addr: when not NULL, receives the sender's address and port
 *           (written as a struct sockaddr_in).
 *
 * When the datagram is larger than len, the first len bytes are returned and
 * the remainder is queued again on rcvbuf.
 *
 * Returns the number of bytes copied into buf, or -1 when sockfd is unknown.
 */
static ssize_t nrecvfrom(int sockfd, void *buf, size_t len, __attribute__((unused)) int flags,
        struct sockaddr *src_addr, __attribute__((unused)) socklen_t *addrlen)
{
    struct localhost *host = get_hostinfo_fromfd(sockfd);
    if (host == NULL) return -1;

    struct offload *ol = NULL;
    struct sockaddr_in *saddr = (struct sockaddr_in *)src_addr;

    /* Block until udp_process() signals that something was queued. */
    pthread_mutex_lock(&host->mutex);
    while (rte_ring_mc_dequeue(host->rcvbuf, (void **)&ol) < 0) {
        pthread_cond_wait(&host->cond, &host->mutex);
    }
    pthread_mutex_unlock(&host->mutex);

    if (saddr != NULL) {
        saddr->sin_port = ol->sport;
        rte_memcpy(&saddr->sin_addr.s_addr, &ol->sip, sizeof(uint32_t));
    }

    if (len < ol->length) {
        /* Caller's buffer is too small: return the head, keep the tail queued. */
        const uint16_t remaining = ol->length - len;
        rte_memcpy(buf, ol->data, len);

        unsigned char *rest = rte_malloc("unsigned char *", remaining, 0);
        if (rest == NULL) {
            /* No memory to keep the tail: drop it instead of dereferencing NULL. */
            rte_free(ol->data);
            rte_free(ol);
            return len;
        }
        rte_memcpy(rest, ol->data + len, remaining);
        ol->length = remaining;
        rte_free(ol->data);
        ol->data = rest;

        if (rte_ring_mp_enqueue(host->rcvbuf, ol) < 0) {
            rte_free(ol->data);
            rte_free(ol);
        }
        return len;
    }

    /* Read the length before freeing ol; it used to be read after the free. */
    const ssize_t copied = ol->length;
    rte_memcpy(buf, ol->data, ol->length);
    rte_free(ol->data);
    rte_free(ol);
    return copied;
}
/*
 * Queue one datagram for transmission, modelled on sendto(2). The data is
 * copied into a struct offload and put on the socket's sndbuf ring, from
 * which udp_out() sends it later.
 *
 * sockfd:    descriptor returned by nsocket().
 * buf, len:  payload to send; len must fit in 16 bits.
 * flags, addrlen: unused.
 * dest_addr: destination, read as a struct sockaddr_in.
 *
 * Returns len on success, -1 when sockfd is unknown, len is too large, an
 * allocation fails or the send ring is full.
 */
static ssize_t nsendto(int sockfd, const void *buf, size_t len, __attribute__((unused)) int flags,
        const struct sockaddr *dest_addr, __attribute__((unused)) socklen_t addrlen)
{
    struct localhost *host = get_hostinfo_fromfd(sockfd);
    if (host == NULL) return -1;

    /* offload.length is a uint16_t; refuse sizes it cannot represent. */
    if (len > UINT16_MAX) return -1;

    const struct sockaddr_in *daddr = (const struct sockaddr_in *)dest_addr;

    struct offload *ol = rte_malloc("offload", sizeof(struct offload), 0);
    if (ol == NULL) return -1;

    ol->dip = daddr->sin_addr.s_addr;
    ol->dport = daddr->sin_port;
    ol->sip = host->localip;
    ol->sport = host->localport;
    ol->protocol = host->protocol; /* was left uninitialised */
    ol->length = len;

    struct in_addr addr;
    addr.s_addr = ol->dip;
    printf("nsendto ---> src: %s:%d \n", inet_ntoa(addr), ntohs(ol->dport));

    ol->data = rte_malloc("unsigned char *", len, 0);
    if (ol->data == NULL) {
        rte_free(ol);
        return -1;
    }
    rte_memcpy(ol->data, buf, len);

    /* Hand the datagram to the stack; on a full ring release it again. */
    if (rte_ring_mp_enqueue(host->sndbuf, ol) < 0) {
        rte_free(ol->data);
        rte_free(ol);
        return -1;
    }
    return len;
}
/*
 * Close a socket of the user-space stack: unlink it from the lhost list and
 * release its rings and its state.
 *
 * Returns 0 on success, -1 when fd is unknown. (The success path previously
 * fell off the end of this non-void function.)
 */
static int nclose(int fd)
{
    struct localhost *host = get_hostinfo_fromfd(fd);
    if (host == NULL) return -1;

    LL_REMOVE(host, lhost);

    if (host->rcvbuf) {
        rte_ring_free(host->rcvbuf);
    }
    if (host->sndbuf) {
        rte_ring_free(host->sndbuf);
    }
    rte_free(host);
    return 0;
}
#define UDP_APP_RECV_BUFFER_SIZE 128

/*
 * UDP echo server running on top of the n* socket calls.
 *
 * Binds 192.168.101.83:8889, then loops forever: receives a datagram, prints
 * it and sends the received text back to the sender.
 *
 * arg: unused.
 *
 * Returns -1 when the socket cannot be created or bound; otherwise it does
 * not return.
 */
static int udp_server_entry(__attribute__((unused)) void *arg)
{
    int connfd = nsocket(AF_INET, SOCK_DGRAM, 0);
    if (connfd == -1) {
        printf("sockfd failed\n");
        return -1;
    }

    struct sockaddr_in localaddr, clientaddr;
    memset(&localaddr, 0, sizeof(struct sockaddr_in));
    memset(&clientaddr, 0, sizeof(struct sockaddr_in));
    localaddr.sin_port = htons(8889);
    localaddr.sin_family = AF_INET;
    localaddr.sin_addr.s_addr = inet_addr("192.168.101.83");

    if (nbind(connfd, (struct sockaddr*)&localaddr, sizeof(localaddr)) < 0) {
        printf("bind failed\n");
        nclose(connfd);
        return -1;
    }

    char buffer[UDP_APP_RECV_BUFFER_SIZE] = {0};
    socklen_t addrlen = sizeof(clientaddr);

    while (1) {
        /* Leave room for a terminator so the data can be handled as a string. */
        ssize_t n = nrecvfrom(connfd, buffer, UDP_APP_RECV_BUFFER_SIZE - 1, 0,
                (struct sockaddr*)&clientaddr, &addrlen);
        if (n < 0) {
            continue;
        }
        if (n >= UDP_APP_RECV_BUFFER_SIZE) {
            n = UDP_APP_RECV_BUFFER_SIZE - 1; /* defensive clamp */
        }
        /* Terminate at the received length: without this, strlen() could run
         * past the buffer or pick up bytes left over from a longer datagram. */
        buffer[n] = '\0';

        printf("recv from %s:%d, data:%s\n", inet_ntoa(clientaddr.sin_addr),
                ntohs(clientaddr.sin_port), buffer);
        /* Text echo: a payload with an embedded NUL is echoed up to that byte. */
        nsendto(connfd, buffer, strlen(buffer), 0,
                (struct sockaddr*)&clientaddr, sizeof(clientaddr));
    }

    nclose(connfd);
    return 0;
}
#endif
/*
 * Entry point: initialise EAL, the mbuf pool, the port, the ARP timer and the
 * in/out rings, launch the protocol-stack and UDP-server threads on other
 * lcores, then poll the NIC forever on the current lcore:
 *   RX: NIC -> ring->in        TX: ring->out -> NIC
 *
 * Fixes compared with the previous version:
 *  - the explanatory line in the ENABLE_MULTHREAD section is a real comment
 *    again (it broke the build);
 *  - mbufs accepted by rte_eth_tx_burst() are no longer freed here, only the
 *    ones it did not accept;
 *  - received mbufs that do not fit on ring->in are freed instead of leaked;
 *  - ring creation and lcore selection are checked.
 */
int main(int argc,char *argv[]){
    /* Initialise the EAL environment. */
    if (rte_eal_init(argc, argv) < 0) {
        rte_exit(EXIT_FAILURE,"Error with EAL init\n");
    }

    /* Packet buffer pool, used by the RX queue and for every packet built here. */
    struct rte_mempool *mbuf_pool = rte_pktmbuf_pool_create("mbuf pool", NUM_MBUFS,
            0, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
    if (mbuf_pool == NULL) {
        rte_exit(EXIT_FAILURE,"Could not create mbuf pool\n");
    }

    ng_init_port(mbuf_pool);
    rte_eth_macaddr_get(gDpdkPortId, (struct rte_ether_addr *)gSrcMac);

    /* lcore running this function; needed by the timer and the launches below. */
    unsigned lcore_id = rte_lcore_id();

#if ENABLE_TIMER
    rte_timer_subsystem_init();
    struct rte_timer arp_timer;
    rte_timer_init(&arp_timer);
    uint64_t hz = rte_get_timer_hz(); /* timer ticks per second */
    /* arp_request_timer_cb(mbuf_pool) runs when rte_timer_manage() finds the timer expired. */
    rte_timer_reset(&arp_timer, hz, PERIODICAL, lcore_id, arp_request_timer_cb, mbuf_pool);
#endif

#if ENABLE_RINGBUFFER
    struct inout_ring *ring = ringInstance();
    if (ring == NULL) {
        rte_exit(EXIT_FAILURE,"ring buffer init failed\n");
    }
    if (ring->in == NULL) {
        ring->in = rte_ring_create("in ring", RING_SIZE, rte_socket_id(), RING_F_SP_ENQ | RING_F_SC_DEQ);
    }
    if (ring->out == NULL) {
        ring->out = rte_ring_create("out ring", RING_SIZE, rte_socket_id(), RING_F_SP_ENQ | RING_F_SC_DEQ);
    }
    if (ring->in == NULL || ring->out == NULL) {
        rte_exit(EXIT_FAILURE,"ring buffer init failed\n");
    }
#endif

#if ENABLE_MULTHREAD
    /* Each thread gets its own lcore, so the work is spread over several CPUs. */
    lcore_id = rte_get_next_lcore(lcore_id, 1, 0);
    if (lcore_id >= RTE_MAX_LCORE) {
        rte_exit(EXIT_FAILURE,"Not enough lcores\n");
    }
    /* Packet-processing thread of the user-space protocol stack. */
    rte_eal_remote_launch(pkt_process, mbuf_pool, lcore_id);
#endif

#if ENABLE_UDP_APP
    lcore_id = rte_get_next_lcore(lcore_id, 1, 0);
    if (lcore_id >= RTE_MAX_LCORE) {
        rte_exit(EXIT_FAILURE,"Not enough lcores\n");
    }
    /* UDP server thread. */
    rte_eal_remote_launch(udp_server_entry, mbuf_pool, lcore_id);
#endif

    while (1) {
        unsigned k;

        /* RX: read up to BURST_SIZE packets from the NIC and pass them to the stack. */
        struct rte_mbuf *rx[BURST_SIZE];
        unsigned num_recvd = rte_eth_rx_burst(gDpdkPortId, 0, rx, BURST_SIZE);
        if (num_recvd > BURST_SIZE) {
            rte_exit(EXIT_FAILURE,"Error receving from eth\n");
        } else if (num_recvd > 0) {
            unsigned queued = rte_ring_sp_enqueue_burst(ring->in, (void **)rx, num_recvd, NULL);
            /* Whatever did not fit on the ring is dropped, not leaked. */
            for (k = queued; k < num_recvd; k++) {
                rte_pktmbuf_free(rx[k]);
            }
        }

        /* TX: take packets from ring->out and hand them to the NIC. */
        struct rte_mbuf *tx[BURST_SIZE];
        unsigned nb_tx = rte_ring_sc_dequeue_burst(ring->out, (void **)tx, BURST_SIZE, NULL);
        if (nb_tx > 0) {
            /* The driver owns every mbuf it accepted and frees it after
             * transmission; only the rejected tail is still ours to free. */
            unsigned nb_sent = rte_eth_tx_burst(gDpdkPortId, 0, tx, nb_tx);
            for (k = nb_sent; k < nb_tx; k++) {
                rte_pktmbuf_free(tx[k]);
            }
        }

#if ENABLE_TIMER
        static uint64_t prev_tsc = 0, cur_tsc;
        uint64_t diff_tsc;
        cur_tsc = rte_rdtsc();
        diff_tsc = cur_tsc - prev_tsc;
        /* Run expired timers once per TIMER_RESOLUTION_CYCLES (about 60 s per the original note). */
        if (diff_tsc > TIMER_RESOLUTION_CYCLES) {
            /* Fires the callbacks of expired timers, here arp_request_timer_cb. */
            rte_timer_manage();
            prev_tsc = cur_tsc;
        }
#endif
    }
    return 0;
}
最后编译运行后,优化后的协议栈可以正常处理UDP数据包