l2fwd的命令行参数分两部分:EAL和程序本身的参数,中间以–分隔开。例如:
./l2fwd -c 0x3 -n 4 -- -p 3 -q 1
其中-c和-n就是EAL的参数,-p和-q就是程序配置参数。在解析参数的时候也是分两部分解析的。
ret = rte_eal_init(argc, argv);//有时间应该好好看一下,比较重要
if (ret < 0)
rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
argc -= ret;
argv += ret;
rte_eal_init()不仅有解析命令行参数的作用,以及一系列很复杂的环境的初始化。当解析完了EAL的参数之后,argc减去EAL参数的个数同时argv后移这么多位,这样就能保证后面解析程序参数的时候跳过了前面的EAL参数。
rte_eal_init()进行一系列很复杂的初始化工作,在官方文档上写的这些初始化工作包括:
/* parse application arguments (after the EAL ones) */
ret = l2fwd_parse_args(argc, argv);
if (ret < 0)
rte_exit(EXIT_FAILURE, "Invalid L2FWD arguments\n");
创建内存池的接口是:
struct rte_mempool *
rte_pktmbuf_pool_create(const char *name, unsigned int n,
unsigned int cache_size, uint16_t priv_size, uint16_t data_room_size,
int socket_id)
/*返回所有以太网端口*/
nb_ports = rte_eth_dev_count();
if (nb_ports == 0)
rte_exit(EXIT_FAILURE, "No Ethernet ports - bye\n");
/*检查可用的端口掩码*/
/* check port mask to possible port mask */
if (l2fwd_enabled_port_mask & ~((1 << nb_ports) - 1))
rte_exit(EXIT_FAILURE, "Invalid portmask; possible (0x%x)\n",
(1 << nb_ports) - 1);
/*目的端口都先置0*/
/* reset l2fwd_dst_ports */
for (portid = 0; portid < RTE_MAX_ETHPORTS; portid++)
l2fwd_dst_ports[portid] = 0;
last_port = 0;
RTE_ETH_FOREACH_DEV(portid) {
/* skip ports that are not enabled */
if ((l2fwd_enabled_port_mask & (1 << portid)) == 0)
continue;
if (nb_ports_in_mask % 2) {
l2fwd_dst_ports[portid] = last_port;
l2fwd_dst_ports[last_port] = portid;
}
else
last_port = portid;
nb_ports_in_mask++;
}
if (nb_ports_in_mask % 2) {
printf("Notice: odd number of ports in portmask.\n");
l2fwd_dst_ports[last_port] = last_port;
}
此处是设为相邻端口互相转发。例如:0号端口的目的端口是1,1号端口的目的端口是0;2号端口的目的端口是3,3号端口的目的端口是2……这里我们也可以修改成我们想要的转发规则。
/* Initialize the port/queue configuration of each logical core */
RTE_ETH_FOREACH_DEV(portid) {
/* skip ports that are not enabled */
if ((l2fwd_enabled_port_mask & (1 << portid)) == 0)
continue;
/*得到一个收取队列未分配满且可用的逻辑内核*/
/* 为每个端口分配到相应的逻辑内核
每个端口只对应一个逻辑内核
每个逻辑内核对应l2fwd_rx_queue_per_lcore个端口*/
/* get the lcore_id for this port */
while (rte_lcore_is_enabled(rx_lcore_id) == 0 || lcore_queue_conf[rx_lcore_id].n_rx_port == l2fwd_rx_queue_per_lcore) {
rx_lcore_id++;
if (rx_lcore_id >= RTE_MAX_LCORE)
rte_exit(EXIT_FAILURE, "Not enough cores\n");
}
if (qconf != &lcore_queue_conf[rx_lcore_id]) {
/* Assigned a new logical core in the loop above. */
qconf = &lcore_queue_conf[rx_lcore_id];
nb_lcores++;
}
qconf->rx_port_list[qconf->n_rx_port] = portid;
qconf->n_rx_port++;
printf("Lcore %u: RX port %u\n", rx_lcore_id, portid);
}
//内存池中存储mbuf的数量
nb_mbufs = RTE_MAX(nb_ports * (nb_rxd + nb_txd + MAX_PKT_BURST +
nb_lcores * MEMPOOL_CACHE_SIZE), 8192U);
/* create the mbuf pool */
l2fwd_pktmbuf_pool = rte_pktmbuf_pool_create("mbuf_pool", nb_mbufs,
MEMPOOL_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE,
rte_socket_id());
if (l2fwd_pktmbuf_pool == NULL)
rte_exit(EXIT_FAILURE, "Cannot init mbuf pool\n");
其中fflush函数是清除缓冲区的作用,会强迫未写入磁盘的内容立即写入。
/* Initialise each port */
RTE_ETH_FOREACH_DEV(portid) {
struct rte_eth_rxconf rxq_conf;
struct rte_eth_txconf txq_conf;
struct rte_eth_conf local_port_conf = port_conf;
struct rte_eth_dev_info dev_info;
/* skip ports that are not enabled */
if ((l2fwd_enabled_port_mask & (1 << portid)) == 0) {
printf("Skipping disabled port %u\n", portid);
continue;
}
nb_ports_available++;
/* init port */
printf("Initializing port %u... ", portid);
fflush(stdout); //清除读写缓存区
rte_eth_dev_info_get(portid, &dev_info);
if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
local_port_conf.txmode.offloads |=
DEV_TX_OFFLOAD_MBUF_FAST_FREE;
/*配置端口,将一些配置写进设备dev的一些字段,以及检查设备支持什么类型的中断、支持的包大小*/
ret = rte_eth_dev_configure(portid, 1, 1, &local_port_conf);
if (ret < 0)
rte_exit(EXIT_FAILURE, "Cannot configure device: err=%d, port=%u\n",
ret, portid);
ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd,
&nb_txd);
if (ret < 0)
rte_exit(EXIT_FAILURE,
"Cannot adjust number of descriptors: err=%d, port=%u\n",
ret, portid);
/*获取设备的MAC地址,写在后一个参数里*/
rte_eth_macaddr_get(portid,&l2fwd_ports_eth_addr[portid]);
/* init one RX queue */
fflush(stdout);/*清除缓存区*/
/*设置接收队列*/
rxq_conf = dev_info.default_rxconf;
rxq_conf.offloads = local_port_conf.rxmode.offloads;
ret = rte_eth_rx_queue_setup(portid, 0, nb_rxd,
rte_eth_dev_socket_id(portid),
&rxq_conf,
l2fwd_pktmbuf_pool);
if (ret < 0)
rte_exit(EXIT_FAILURE, "rte_eth_rx_queue_setup:err=%d, port=%u\n",
ret, portid);
/* init one TX queue on each port */
fflush(stdout);/*清除缓存区*/
/*设置发送队列*/
txq_conf = dev_info.default_txconf;
txq_conf.txq_flags = ETH_TXQ_FLAGS_IGNORE;
txq_conf.offloads = local_port_conf.txmode.offloads;
ret = rte_eth_tx_queue_setup(portid, 0, nb_txd,
rte_eth_dev_socket_id(portid),
&txq_conf);
if (ret < 0)
rte_exit(EXIT_FAILURE, "rte_eth_tx_queue_setup:err=%d, port=%u\n",
ret, portid);
/* Initialize TX buffers */
/*每个端口分配发送缓存区,根据numa架构的socket就近分配*/
tx_buffer[portid] = rte_zmalloc_socket("tx_buffer",
RTE_ETH_TX_BUFFER_SIZE(MAX_PKT_BURST), 0,
rte_eth_dev_socket_id(portid));
if (tx_buffer[portid] == NULL)
rte_exit(EXIT_FAILURE, "Cannot allocate buffer for tx on port %u\n",
portid);
/*发送缓存区初始化*/
rte_eth_tx_buffer_init(tx_buffer[portid], MAX_PKT_BURST);
/*设置发送缓存区的err_callback*/
ret = rte_eth_tx_buffer_set_err_callback(tx_buffer[portid],
rte_eth_tx_buffer_count_callback,
&port_statistics[portid].dropped);
if (ret < 0)
rte_exit(EXIT_FAILURE,
"Cannot set error callback for tx buffer on port %u\n",
portid);
/* Start device */
/*开启端口*/
ret = rte_eth_dev_start(portid);
if (ret < 0)
rte_exit(EXIT_FAILURE, "rte_eth_dev_start:err=%d, port=%u\n",
ret, portid);
printf("done: \n");
rte_eth_promiscuous_enable(portid);
/*打印端口MAC地址*/
printf("Port %u, MAC address: %02X:%02X:%02X:%02X:%02X:%02X\n\n",
portid,
l2fwd_ports_eth_addr[portid].addr_bytes[0],
l2fwd_ports_eth_addr[portid].addr_bytes[1],
l2fwd_ports_eth_addr[portid].addr_bytes[2],
l2fwd_ports_eth_addr[portid].addr_bytes[3],
l2fwd_ports_eth_addr[portid].addr_bytes[4],
l2fwd_ports_eth_addr[portid].addr_bytes[5]);
/* initialize port stats */
/*初始化端口数据,就是后面要打印的,接收、发送、drop的包数*/
memset(&port_statistics, 0, sizeof(port_statistics));
}
check_all_ports_link_status(l2fwd_enabled_port_mask);
这里就是DPDK的任务分发函数,每个slave从线程启动后运行的函数是l2fwd_launch_one_lcore:
/* launch per-lcore init on every lcore */
rte_eal_mp_remote_launch(l2fwd_launch_one_lcore, NULL, CALL_MASTER);
RTE_LCORE_FOREACH_SLAVE(lcore_id) {
if (rte_eal_wait_lcore(lcore_id) < 0) {
ret = -1;
break;
}
}
而l2fwd_launch_one_lcore实际上运行的是l2fwd_main_loop,这就是上面说的从线程循环了
static int
l2fwd_launch_one_lcore(__attribute__((unused)) void *dummy)
{
l2fwd_main_loop();
return 0;
}
/* main processing loop */
static void
l2fwd_main_loop(void)
{
struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
struct rte_mbuf *m;
int sent;
unsigned lcore_id;
uint64_t prev_tsc, diff_tsc, cur_tsc, timer_tsc;
unsigned i, j, portid, nb_rx;
struct lcore_queue_conf *qconf;
const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S *
BURST_TX_DRAIN_US;
struct rte_eth_dev_tx_buffer *buffer;
prev_tsc = 0;
timer_tsc = 0;
//获取自己的lcore_id
lcore_id = rte_lcore_id();
qconf = &lcore_queue_conf[lcore_id];
//分配后多余的lcore
if (qconf->n_rx_port == 0) {
RTE_LOG(INFO, L2FWD, "lcore %u has nothing to do\n", lcore_id);
return;
}
//有事可做的核进入了主循环
RTE_LOG(INFO, L2FWD, "entering main loop on lcore %u\n", lcore_id);
for (i = 0; i < qconf->n_rx_port; i++) {
portid = qconf->rx_port_list[i];
RTE_LOG(INFO, L2FWD, " -- lcoreid=%u portid=%u\n", lcore_id,
portid);
}
//直到发生了强制退出,ctrl+c或者kill该进程
while (!force_quit) {
cur_tsc = rte_rdtsc();
/*
* TX burst queue drain
*/
//计算时间片,上次收包时间和这次收包时间差
diff_tsc = cur_tsc - prev_tsc;
//过了100us,把buffer里面的报文发送出去
if (unlikely(diff_tsc > drain_tsc)) {
for (i = 0; i < qconf->n_rx_port; i++) {
portid = l2fwd_dst_ports[qconf->rx_port_list[i]];
buffer = tx_buffer[portid];
sent = rte_eth_tx_buffer_flush(portid, 0, buffer);
if (sent)
port_statistics[portid].tx += sent;
}
//到了时间片,打印各端口的数据
/* if timer is enabled */
if (timer_period > 0) {
/* advance the timer */
timer_tsc += diff_tsc;
/* if timer has reached its timeout */
//如果累积时间超过我们设定的阈值,就打印出统计数据,默认是10s
if (unlikely(timer_tsc >= timer_period)) {
/* do this only on master core */
//打印让master主线程来做
if (lcore_id == rte_get_master_lcore()) {
//打印统计数据
print_stats();
/* reset the timer积累时间置0 */
timer_tsc = 0;
}
}
}
prev_tsc = cur_tsc;
}
/*
* Read packet from RX queues
*/
//没有到发送时间片的话,读接收队列里的报文
for (i = 0; i < qconf->n_rx_port; i++) {
portid = qconf->rx_port_list[i];
nb_rx = rte_eth_rx_burst(portid, 0,
pkts_burst, MAX_PKT_BURST);
//计数,收到的报文数
port_statistics[portid].rx += nb_rx;
for (j = 0; j < nb_rx; j++) {
m = pkts_burst[j];
rte_prefetch0(rte_pktmbuf_mtod(m, void *));
//updating mac地址以及目的端口发送buffer满了的话,尝试发送
l2fwd_simple_forward(m, portid);
}
}
}
}
程序中强制退出,是自己写的一个信号量,包括两种操作,即在ctrl+c或者kill了这个进程的时候,会触发:
static void
signal_handler(int signum)
{
if (signum == SIGINT || signum == SIGTERM) {
printf("\n\nSignal %d received, preparing to exit...\n",
signum);
force_quit = true;
}
}
force_quit = false;
signal(SIGINT, signal_handler);
signal(SIGTERM, signal_handler);
signal_handler函数第一个参数signum:指明了所要处理的信号类型,它可以取除了SIGKILL和SIGSTOP外的任何一种信号。
第二个参数handler:描述了与信号关联的动作,它可以取以下三种值:
这个符号表示忽略该信号。
这个符号表示恢复对信号的系统默认处理。不写此处理函数默认也是执行系统默认操作。
此函数必须在signal()被调用前申明,handler中为这个函数的名字。当接收到一个类型为sig的信号时,就执行handler 所指定的函数。(int)signum是传递给它的唯一参数。执行了signal()调用后,进程只要接收到类型为sig的信号,不管其正在执行程序的哪一部分,就立即执行func()函数。当func()函数执行结束后,控制权返回进程被中断的那一点继续执行。
在该函数中就是第三种,所以当我们退出是ctrl+c不是直接将进程杀死,而是会将force_quit置为true,让程序自然退出,这样程序就来得及完成最后退出之前的操作。
内容参考来自: