As a Linux kernel developer, I am often asked how the kernel receives network packets. Most of the time I can sketch a rough outline, but the focus is usually on how the protocol stack processes a packet; the path a packet takes from arriving at the NIC to being handed up to the protocol stack has always been fuzzy to me. I decided to study that path properly, and this post is the result.
Packet reception breaks down into several steps:
1) Hardware reception: the NIC receives a data frame at the physical/data-link layer.
2) DMA transfer: the NIC's DMA engine copies the frame into the ring buffer and raises a hardware interrupt to notify the CPU that a packet has arrived.
3) Hardware interrupt handling on the CPU: the received data is wrapped in kernel sk_buff structures and queued for reception.
4) The receive softirq NET_RX_SOFTIRQ is raised.
5) Softirq processing: packets are taken off the receive queue and handed up to the kernel protocol stack.
6) Protocol stack processing.
Note that the Linux kernel covers only L2 through L4; L1 is the job of the physical hardware, and everything above L4 belongs to the application layer.
Packet reception is inseparable from the NIC driver. Taking Ubuntu as an example, a common Intel NIC driver is the e100 module (for the 8255x 10/100 family; the gigabit e1000 follows the same pattern), and it is used as the example here. Before looking at reception itself we need a rough picture of driver loading, device probing, IRQ requesting, and so on.
In the Linux kernel a driver exists as a module; that is, a driver is in essence a kernel module, loaded and registered during module initialization at kernel startup.
The registration entry point is:
static int __init e100_init_module(void)
{
    if (((1 << debug) - 1) & NETIF_MSG_DRV) {
        pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION);
        pr_info("%s\n", DRV_COPYRIGHT);
    }
    return pci_register_driver(&e100_driver);
}
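For completeness, the matching unload path simply unregisters the driver; in e100.c the module hooks are:

static void __exit e100_cleanup_module(void)
{
    pci_unregister_driver(&e100_driver);
}

module_init(e100_init_module);
module_exit(e100_cleanup_module);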
So the driver object is e100_driver, registered with the kernel via pci_register_driver. e100_driver is defined as follows:
static struct pci_driver e100_driver = {
    .name = DRV_NAME,            /* driver name */
    .id_table = e100_id_table,
    .probe = e100_probe,         /* called when a matching NIC is found */
    .remove = e100_remove,       /* called on removal; frees the NIC's resources */
#ifdef CONFIG_PM
    /* Power Management hooks */
    .suspend = e100_suspend,
    .resume = e100_resume,
#endif
    .shutdown = e100_shutdown,   /* called on system shutdown */
    .err_handler = &e100_err_handler,
};
What pci_register_driver really does is hang the driver off the PCI bus. When a NIC appears on the bus, the bus core scans the registered drivers and matches the device's identity (vendor ID, device ID, and so on) against each driver's id_table; on a match it calls the driver's probe function, which requests an IRQ for the NIC, registers the interrupt handler, and performs a series of other setup steps.
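As an illustration of what such a table looks like (the real e100_id_table is generated by a macro covering dozens of 8255x device IDs, so this entry is for demonstration only; 8086:1229 is the 82557/8/9 "Ethernet Pro 100"):

/* illustrative sketch only; not the actual e100_id_table */
static const struct pci_device_id demo_id_table[] = {
    { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x1229) },
    { 0, }  /* terminating entry */
};
MODULE_DEVICE_TABLE(pci, demo_id_table);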
From the definition of e100_driver we can see the driver's core entry points: e100_probe, e100_remove, and e100_shutdown. Since we care about packet reception, we will walk through e100_probe.
The probe logic is as follows:
static int e100_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
    struct net_device *netdev;
    struct nic *nic;
    int err;

    /* allocate the net_device structure (with struct nic as private data) */
    if (!(netdev = alloc_etherdev(sizeof(struct nic))))
        return -ENOMEM;

    /* set the device's hardware features */
    netdev->hw_features |= NETIF_F_RXFCS;
    netdev->priv_flags |= IFF_SUPP_NOFCS;
    netdev->hw_features |= NETIF_F_RXALL;

    /* install the device operations: open, close, start_xmit, do_ioctl, ... */
    netdev->netdev_ops = &e100_netdev_ops;
    SET_ETHTOOL_OPS(netdev, &e100_ethtool_ops);  /* ethtool configuration hooks */
    netdev->watchdog_timeo = E100_WATCHDOG_PERIOD;
    strncpy(netdev->name, pci_name(pdev), sizeof(netdev->name) - 1);

    nic = netdev_priv(netdev);
    /* register the NAPI poll function, which drains the packets the NIC
       has DMA'd into the receive ring */
    netif_napi_add(netdev, &nic->napi, e100_poll, E100_NAPI_WEIGHT);
    nic->netdev = netdev;
    nic->pdev = pdev;
    nic->msg_enable = (1 << debug) - 1;
    nic->mdio_ctrl = mdio_ctrl_hw;
    pci_set_drvdata(pdev, netdev);

    /* enable the PCI device */
    if ((err = pci_enable_device(pdev))) {
        netif_err(nic, probe, nic->netdev, "Cannot enable PCI device, aborting\n");
        goto err_out_free_dev;
    }

    if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) {
        netif_err(nic, probe, nic->netdev, "Cannot find proper PCI device base address, aborting\n");
        err = -ENODEV;
        goto err_out_disable_pdev;
    }

    /* claim the device's I/O and memory regions */
    if ((err = pci_request_regions(pdev, DRV_NAME))) {
        netif_err(nic, probe, nic->netdev, "Cannot obtain PCI resources, aborting\n");
        goto err_out_disable_pdev;
    }

    /* set the DMA mask, declaring the device's DMA addressing capability */
    if ((err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)))) {
        netif_err(nic, probe, nic->netdev, "No usable DMA configuration, aborting\n");
        goto err_out_free_res;
    }

    SET_NETDEV_DEV(netdev, &pdev->dev);

    if (use_io)
        netif_info(nic, probe, nic->netdev, "using i/o access mode\n");

    /* map the control/status registers */
    nic->csr = pci_iomap(pdev, (use_io ? 1 : 0), sizeof(struct csr));
    if (!nic->csr) {
        netif_err(nic, probe, nic->netdev, "Cannot map device registers, aborting\n");
        err = -ENOMEM;
        goto err_out_free_res;
    }

    if (ent->driver_data)
        nic->flags |= ich;
    else
        nic->flags &= ~ich;

    e100_get_defaults(nic);

    /* D100 MAC doesn't allow rx of vlan packets with normal MTU */
    if (nic->mac < mac_82558_D101_A4)
        netdev->features |= NETIF_F_VLAN_CHALLENGED;

    /* locks must be initialized before calling hw_reset */
    spin_lock_init(&nic->cb_lock);
    spin_lock_init(&nic->cmd_lock);
    spin_lock_init(&nic->mdio_lock);

    /* Reset the device before pci_set_master() in case device is in some
     * funky state and has an interrupt pending - hint: we don't have the
     * interrupt handler registered yet. */
    e100_hw_reset(nic);

    pci_set_master(pdev);

    init_timer(&nic->watchdog);
    nic->watchdog.function = e100_watchdog;
    nic->watchdog.data = (unsigned long)nic;

    INIT_WORK(&nic->tx_timeout_task, e100_tx_timeout_task);

    if ((err = e100_alloc(nic))) {
        netif_err(nic, probe, nic->netdev, "Cannot alloc driver memory, aborting\n");
        goto err_out_iounmap;
    }

    if ((err = e100_eeprom_load(nic)))
        goto err_out_free;

    /* reset the PHY */
    e100_phy_init(nic);

    memcpy(netdev->dev_addr, nic->eeprom, ETH_ALEN);
    if (!is_valid_ether_addr(netdev->dev_addr)) {
        if (!eeprom_bad_csum_allow) {
            netif_err(nic, probe, nic->netdev, "Invalid MAC address from EEPROM, aborting\n");
            err = -EAGAIN;
            goto err_out_free;
        } else {
            netif_err(nic, probe, nic->netdev, "Invalid MAC address from EEPROM, you MUST configure one.\n");
        }
    }

    /* Wol magic packet can be enabled from eeprom */
    if ((nic->mac >= mac_82558_D101_A4) &&
        (nic->eeprom[eeprom_id] & eeprom_id_wol)) {
        nic->flags |= wol_magic;
        device_set_wakeup_enable(&pdev->dev, true);
    }

    /* ack any pending wake events, disable PME */
    pci_pme_active(pdev, false);

    strcpy(netdev->name, "eth%d");
    /* register the network device */
    if ((err = register_netdev(netdev))) {
        netif_err(nic, probe, nic->netdev, "Cannot register net device, aborting\n");
        goto err_out_free;
    }

    /* create the DMA pool for command blocks */
    nic->cbs_pool = pci_pool_create(netdev->name,
                                    nic->pdev,
                                    nic->params.cbs.max * sizeof(struct cb),
                                    sizeof(u32),
                                    0);
    netif_info(nic, probe, nic->netdev,
               "addr 0x%llx, irq %d, MAC addr %pM\n",
               (unsigned long long)pci_resource_start(pdev, use_io ? 1 : 0),
               pdev->irq, netdev->dev_addr);

    return 0;

err_out_free:
    e100_free(nic);
err_out_iounmap:
    pci_iounmap(pdev, nic->csr);
err_out_free_res:
    pci_release_regions(pdev);
err_out_disable_pdev:
    pci_disable_device(pdev);
err_out_free_dev:
    pci_set_drvdata(pdev, NULL);
    free_netdev(netdev);
    return err;
}
From the code above, probe mainly allocates, initializes, and registers the network device, and claims and maps the I/O regions, getting the NIC ready to send and receive. One point worth highlighting: the NIC's poll function is also registered here (via netif_napi_add), so that once packets arrive and the softirq runs, the kernel knows which poll function to use to drain them. The device operations table e100_netdev_ops is defined as follows:
static const struct net_device_ops e100_netdev_ops = {
    .ndo_open = e100_open,
    .ndo_stop = e100_close,
    .ndo_start_xmit = e100_xmit_frame,
    .ndo_validate_addr = eth_validate_addr,
    .ndo_set_rx_mode = e100_set_multicast_list,
    .ndo_set_mac_address = e100_set_mac_address,
    .ndo_change_mtu = e100_change_mtu,
    .ndo_do_ioctl = e100_do_ioctl,
    .ndo_tx_timeout = e100_tx_timeout,
#ifdef CONFIG_NET_POLL_CONTROLLER
    .ndo_poll_controller = e100_netpoll,
#endif
    .ndo_set_features = e100_set_features,
};
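These ndo_* hooks are not called by the driver itself; the networking core invokes them. For example, bringing the interface up (e.g. ip link set eth0 up) lands in __dev_open() in net/core/dev.c, which ends up calling the driver's ndo_open. A condensed sketch (validity checks, notifier chains, and feature refresh elided):

/* Condensed sketch of __dev_open(); checks and notifiers omitted. */
static int __dev_open(struct net_device *dev)
{
    const struct net_device_ops *ops = dev->netdev_ops;
    int ret = 0;

    /* ... state and address validation elided ... */
    if (ops->ndo_open)
        ret = ops->ndo_open(dev);  /* e100_open for this driver */
    if (!ret) {
        dev->flags |= IFF_UP;      /* mark the interface up */
        dev_activate(dev);         /* attach qdiscs so traffic can flow */
    }
    return ret;
}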
Let's look at e100_open, because the parts we care about - requesting the NIC's IRQ and registering the interrupt handler - all happen on this path.
The core logic of e100_open is simply to call e100_up, so e100_up is where we will focus; e100_open itself is shown first for reference.
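e100_open is little more than a thin wrapper, essentially as it appears in e100.c:

static int e100_open(struct net_device *netdev)
{
    struct nic *nic = netdev_priv(netdev);
    int err = 0;

    netif_carrier_off(netdev);
    if ((err = e100_up(nic)))
        netif_err(nic, ifup, nic->netdev, "Cannot open interface, aborting\n");
    return err;
}

e100_up does the real work: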
static int e100_up(struct nic *nic)
{
    int err;

    /* allocate the receive buffer list (the RX ring) */
    if ((err = e100_rx_alloc_list(nic)))
        return err;
    /* allocate and initialize the command blocks and their DMA addresses */
    if ((err = e100_alloc_cbs(nic)))
        goto err_rx_clean_list;
    /* initialize the hardware */
    if ((err = e100_hw_init(nic)))
        goto err_clean_cbs;
    e100_set_multicast_list(nic->netdev);
    /* start the receive unit */
    e100_start_receiver(nic, NULL);
    mod_timer(&nic->watchdog, jiffies);
    /* request the IRQ and register the interrupt handler */
    if ((err = request_irq(nic->pdev->irq, e100_intr, IRQF_SHARED,
                           nic->netdev->name, nic->netdev)))
        goto err_no_irq;
    /* wake the transmit queue */
    netif_wake_queue(nic->netdev);
    /* enable NAPI scheduling */
    napi_enable(&nic->napi);
    /* enable ints _after_ enabling poll, preventing a race between
     * disable ints+schedule */
    e100_enable_irq(nic);
    return 0;

err_no_irq:
    del_timer_sync(&nic->watchdog);
err_clean_cbs:
    e100_clean_cbs(nic);
err_rx_clean_list:
    e100_rx_clean_list(nic);
    return err;
}
So the e100 interrupt handler is e100_intr. Hardware interrupt handling is the key link between the NIC receiving a packet and the packet reaching the protocol stack, and it is the focus of this post.
static irqreturn_t e100_intr(int irq, void *dev_id)
{
    struct net_device *netdev = dev_id;
    struct nic *nic = netdev_priv(netdev);
    u8 stat_ack = ioread8(&nic->csr->scb.stat_ack);

    netif_printk(nic, intr, KERN_DEBUG, nic->netdev,
                 "stat_ack = 0x%02X\n", stat_ack);

    /* check whether this interrupt is really ours */
    if (stat_ack == stat_ack_not_ours ||   /* Not our interrupt */
        stat_ack == stat_ack_not_present)  /* Hardware is ejected */
        return IRQ_NONE;

    /* Ack interrupt(s) */
    iowrite8(stat_ack, &nic->csr->scb.stat_ack);  /* write the status back to acknowledge */

    /* We hit Receive No Resource (RNR); restart RU after cleaning */
    if (stat_ack & stat_ack_rnr)  /* receive unit out of resources: suspend it */
        nic->ru_running = RU_SUSPENDED;

    /* if NAPI can be scheduled: disable the hardware interrupt and schedule it */
    if (likely(napi_schedule_prep(&nic->napi))) {
        /* disable the NIC's interrupt */
        e100_disable_irq(nic);
        /* schedule NAPI: put this NIC's poll function on the current CPU's
           poll list and raise the RX softirq */
        __napi_schedule(&nic->napi);
    }

    return IRQ_HANDLED;
}
So e100_intr does only a few things:
1) Confirm the interrupt status and write it back to acknowledge it.
2) Schedule NAPI: hang the NIC's poll function on the current CPU's pending poll list and raise the softirq via __raise_softirq_irqoff(NET_RX_SOFTIRQ); a simplified sketch of this step follows.
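The scheduling step bottoms out in ____napi_schedule(); in kernels before threaded NAPI it is essentially just these two lines:

/* Simplified core of ____napi_schedule() in net/core/dev.c;
 * newer kernels add a threaded-NAPI branch ahead of this. */
static inline void ____napi_schedule(struct softnet_data *sd,
                                     struct napi_struct *napi)
{
    /* queue this device's NAPI context on the per-CPU poll list */
    list_add_tail(&napi->poll_list, &sd->poll_list);
    /* raise the RX softirq; net_rx_action will soon run on this CPU */
    __raise_softirq_irqoff(NET_RX_SOFTIRQ);
}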
The receive softirq is NET_RX_SOFTIRQ, and its handler is registered as follows:
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
So the handler for NET_RX_SOFTIRQ is net_rx_action.
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
    struct softnet_data *sd = this_cpu_ptr(&softnet_data);
    unsigned long time_limit = jiffies +
        usecs_to_jiffies(READ_ONCE(netdev_budget_usecs));
    int budget = READ_ONCE(netdev_budget);
    LIST_HEAD(list);
    LIST_HEAD(repoll);

start:
    sd->in_net_rx_action = true; /* mark this CPU as running net_rx_action;
                                  * code about to raise the RX softirq checks
                                  * this flag first - if we are already in
                                  * here, the poll loop below will pick up any
                                  * new work without another softirq */
    local_irq_disable();
    list_splice_init(&sd->poll_list, &list);
    local_irq_enable();

    for (;;) {
        struct napi_struct *n;

        skb_defer_free_flush(sd);

        if (list_empty(&list)) {
            if (list_empty(&repoll)) {
                sd->in_net_rx_action = false;
                barrier();
                /* We need to check if ____napi_schedule()
                 * had refilled poll_list while
                 * sd->in_net_rx_action was true.
                 */
                if (!list_empty(&sd->poll_list))
                    goto start;
                if (!sd_has_rps_ipi_waiting(sd))
                    goto end;
            }
            break;
        }

        /* walk every NAPI device queued on sd->poll_list */
        n = list_first_entry(&list, struct napi_struct, poll_list);
        budget -= napi_poll(n, &repoll); /* invoke the driver's poll function
                                          * to pull a batch of skbs */

        /* If softirq window is exhausted then punt.
         * Allow this to run for 2 jiffies since which will allow
         * an average latency of 1.5/HZ.
         */
        if (unlikely(budget <= 0 ||
                     time_after_eq(jiffies, time_limit))) {
            sd->time_squeeze++;
            break;
        }
    }

    local_irq_disable();

    list_splice_tail_init(&sd->poll_list, &list);
    list_splice_tail(&repoll, &list);
    list_splice(&list, &sd->poll_list);
    if (!list_empty(&sd->poll_list)) /* devices still pending: re-raise the softirq */
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
    else
        sd->in_net_rx_action = false;

    /* re-enable local hard interrupts (and send RPS IPIs if needed) */
    net_rps_action_and_irq_enable(sd);
end:;
}
The heart of the softirq handler is napi_poll(), which polls each queued device to pull in packets and push them up the stack. napi_poll() is a thin wrapper (sketched below) around __napi_poll(), where the real logic lives.
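A lightly simplified napi_poll(), as in recent kernels where __napi_poll() was split out into its own function:

static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
    bool do_repoll = false;
    void *have;
    int work;

    list_del_init(&n->poll_list);       /* detach from the local list */
    have = netpoll_poll_lock(n);        /* serialize against netpoll */
    work = __napi_poll(n, &do_repoll);  /* run the driver's poll function */
    if (do_repoll)                      /* budget used up: poll again later */
        list_add_tail(&n->poll_list, repoll);
    netpoll_poll_unlock(have);
    return work;
}

And __napi_poll() itself: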
static int __napi_poll(struct napi_struct *n, bool *repoll)
{
    int work, weight;

    weight = n->weight; /* upper bound on packets processed in one poll call */

    /* This NAPI_STATE_SCHED test is for avoiding a race
     * with netpoll's poll_napi(). Only the entity which
     * obtains the lock and sees NAPI_STATE_SCHED set will
     * actually make the ->poll() call. Therefore we avoid
     * accidentally calling ->poll() when NAPI is not scheduled.
     */
    work = 0;
    /* if NAPI_STATE_SCHED is set, call the poll function: for a NAPI device
       this is the driver's own poll (e100_poll here); for non-NAPI (legacy)
       devices it is process_backlog */
    if (test_bit(NAPI_STATE_SCHED, &n->state)) {
        work = n->poll(n, weight);
        trace_napi_poll(n, work, weight);
    }

    if (unlikely(work > weight))
        netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
                        n->poll, work, weight);

    if (likely(work < weight))
        return work;

    /* Drivers must not modify the NAPI state if they
     * consume the entire weight. In such cases this code
     * still "owns" the NAPI instance and therefore can
     * move the instance around on the list at-will.
     */
    if (unlikely(napi_disable_pending(n))) {
        napi_complete(n);
        return work;
    }

    /* The NAPI context has more processing work, but busy-polling
     * is preferred. Exit early.
     */
    if (napi_prefer_busy_poll(n)) {
        if (napi_complete_done(n, work)) {
            /* If timeout is not set, we need to make sure
             * that the NAPI is re-scheduled.
             */
            napi_schedule(n);
        }
        return work;
    }

    if (n->gro_bitmask) {
        /* flush too old packets
         * If HZ < 1000, flush all packets.
         */
        napi_gro_flush(n, HZ >= 1000);
    }

    /* push the received packets up to the protocol stack */
    gro_normal_list(n);

    /* Some drivers may have called napi_schedule
     * prior to exhausting their budget.
     */
    if (unlikely(!list_empty(&n->poll_list))) {
        pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
                     n->dev ? n->dev->name : "backlog");
        return work;
    }

    *repoll = true;
    return work;
}
For e100 the poll function is e100_poll:
static int e100_poll(struct napi_struct *napi, int budget)
{
    struct nic *nic = container_of(napi, struct nic, napi);
    unsigned int work_done = 0;

    e100_rx_clean(nic, &work_done, budget);
    e100_tx_clean(nic);

    /* If budget fully consumed, continue polling */
    if (work_done == budget)
        return budget;

    /* only re-enable interrupt if stack agrees polling is really done */
    if (likely(napi_complete_done(napi, work_done)))
        e100_enable_irq(nic);

    return work_done;
}
The receive-side work is done by e100_rx_clean: it walks rx->list, the pre-allocated receive ring (the "ring buffer") that the NIC's DMA engine has already filled with frames, and indicates each completed frame to the kernel via e100_rx_indicate (a trimmed sketch follows).
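The key per-packet steps inside e100_rx_indicate() are roughly the following (heavily trimmed; error handling, status checks, and ring bookkeeping omitted):

/* one completed receive frame descriptor (RFD) -> protocol stack */
struct rfd *rfd = (struct rfd *)skb->data;
u16 actual_size = le16_to_cpu(rfd->actual_size) & 0x3FFF;

pci_unmap_single(nic->pdev, rx->dma_addr,
                 RFD_BUF_LEN, PCI_DMA_FROMDEVICE);   /* return the buffer to the CPU */
skb_reserve(skb, sizeof(struct rfd));                /* strip the hardware RFD header */
skb_put(skb, actual_size);                           /* set the payload length */
skb->protocol = eth_type_trans(skb, nic->netdev);    /* classify the EtherType */
netif_receive_skb(skb);                              /* deliver to the protocol stack */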
The call chain that carries a received frame into the protocol stack is therefore roughly: e100_poll() -> e100_rx_clean() -> e100_rx_indicate() -> netif_receive_skb(), after which the stack dispatches on the EtherType (ip_rcv() for IPv4, arp_rcv() for ARP, and so on).
That wraps up the NIC side of packet reception. Plenty of details are still thin; I will keep refining this post, and I would be glad to discuss and learn together.