这里以3c501网卡为例,每个设备对应一个device的结构体,下面代码即对3c501网卡的数据结构进行初始化,包括发送函数,注册中断回调,mac头长度等。
/*
 * The actual probe.
 *
 * Reads the six station-address bytes from the PROM behind `ioaddr` and
 * accepts the board only when the first three octets are 02:60:8c ("3c501")
 * or 00:80:c8 ("NP943").  If dev->irq is below 2 the IRQ line is
 * auto-detected.  On success the device structure is filled in: I/O base,
 * MAC address, IRQ, zeroed private state, the driver entry points
 * (open / hard_start_xmit / stop / get_stats / set_multicast_list) and the
 * generic Ethernet fields set by ether_setup().
 *
 * Returns 0 on success or a positive errno value:
 *   ENODEV - station address prefix not recognised
 *   EAGAIN - IRQ auto-detection found no line
 *   ENOMEM - the private net_local state could not be allocated
 *
 * NOTE(review): everything except the final `return 0` is compiled only
 * when MODULE is not defined.
 * NOTE(review): RX_STATUS, TX_STATUS and AX_CMD presumably expand to port
 * numbers relative to the `ioaddr` parameter - confirm against the header.
 */
static int
el1_probe1(struct device *dev, int ioaddr)
{
#ifndef MODULE
	char *mname;			/* Vendor name */
	unsigned char station_addr[6];
	int autoirq = 0;
	int i;

	/* Read the station address PROM data from the special port. */
	for (i = 0; i < 6; i++) {
		outw(i, ioaddr + EL1_DATAPTR);
		station_addr[i] = inb(ioaddr + EL1_SAPROM);
	}

	/* Check the first three octets of the S.A. for 3Com's prefix, or
	   for the Sager NP943 prefix. */
	if (station_addr[0] == 0x02 && station_addr[1] == 0x60
	    && station_addr[2] == 0x8c) {
		mname = "3c501";
	} else if (station_addr[0] == 0x00 && station_addr[1] == 0x80
		   && station_addr[2] == 0xC8) {
		mname = "NP943";
	} else
		return ENODEV;

	/* Grab the region so we can find the another board if autoIRQ fails. */
	request_region(ioaddr, EL1_IO_EXTENT,"3c501");

	/* We auto-IRQ by shutting off the interrupt line and letting it float
	   high. */
	if (dev->irq < 2) {
		autoirq_setup(2);
		inb(RX_STATUS);		/* Clear pending interrupts. */
		inb(TX_STATUS);
		outb(AX_LOOP + 1, AX_CMD);
		outb(0x00, AX_CMD);
		autoirq = autoirq_report(1);
		if (autoirq == 0) {
			printk("%s probe at %#x failed to detect IRQ line.\n",
			       mname, ioaddr);
			return EAGAIN;
		}
	}

	outb(AX_RESET+AX_LOOP, AX_CMD);	/* Loopback mode. */

	dev->base_addr = ioaddr;
	memcpy(dev->dev_addr, station_addr, ETH_ALEN);

	/* The low bits of mem_start can override the debug level. */
	if (dev->mem_start & 0xf)
		el_debug = dev->mem_start & 0x7;
	if (autoirq)
		dev->irq = autoirq;

	printk("%s: %s EtherLink at %#lx, using %sIRQ %d.\n",
	       dev->name, mname, dev->base_addr,
	       autoirq ? "auto":"assigned ", dev->irq);
#ifdef CONFIG_IP_MULTICAST
	printk("WARNING: Use of the 3c501 in a multicast kernel is NOT recommended.\n");
#endif
	if (el_debug)
		printk("%s", version);

	/*
	 * Initialize the device structure.  The allocation can fail; bail
	 * out instead of handing a NULL pointer to memset().  The I/O region
	 * stays reserved, exactly as on the EAGAIN path above.
	 */
	if (dev->priv == NULL) {
		dev->priv = kmalloc(sizeof(struct net_local), GFP_KERNEL);
		if (dev->priv == NULL)
			return ENOMEM;
	}
	memset(dev->priv, 0, sizeof(struct net_local));

	/* The EL1-specific entries in the device structure. */
	dev->open = &el_open;
	dev->hard_start_xmit = &el_start_xmit;	/* transmit routine */
	dev->stop = &el1_close;
	dev->get_stats = &el1_get_stats;
	dev->set_multicast_list = &set_multicast_list;

	/* Setup the generic properties */
	ether_setup(dev);
#endif /* !MODULE */
	return 0;
}
/*
 * Fill in a device structure with the Ethernet-generic defaults.
 *
 * Initialises every per-device buffer queue, records devices named
 * "eth<N>" in ethdev_index[N] (complaining when the slot already belongs
 * to a different device), installs the Ethernet header helpers and sets
 * the link-level constants (type, header length, MTU, address length,
 * all-ones broadcast address), the interface flags and a cleared
 * protocol address.
 *
 * This should be in a common file instead of per-driver.
 */
void ether_setup(struct device *dev)
{
	int n;

	/* Every buffer queue starts out empty. */
	n = 0;
	while (n < DEV_NUMBUFFS) {
		skb_queue_head_init(&dev->buffs[n]);
		n++;
	}

	/* register boot-defined "eth" devices */
	if (dev->name && strncmp(dev->name, "eth", 3) == 0) {
		/* The digits after "eth" select the slot. */
		int unit = simple_strtoul(dev->name + 3, NULL, 0);

		if (ethdev_index[unit] != NULL) {
			if (ethdev_index[unit] != dev) {
				/* Really shouldn't happen! */
				printk("ether_setup: Ouch! Someone else took %s\n",
				       dev->name);
			}
		} else {
			ethdev_index[unit] = dev;
		}
	}

	/* Link-level constants. */
	dev->type = ARPHRD_ETHER;
	dev->hard_header_len = ETH_HLEN;
	dev->addr_len = ETH_ALEN;
	dev->mtu = 1500;		/* eth_mtu */

	/* Header build / rebuild / protocol-type helpers. */
	dev->hard_header = eth_header;
	dev->rebuild_header = eth_rebuild_header;
	dev->type_trans = eth_type_trans;

	/* Broadcast address: all ones. */
	for (n = ETH_ALEN - 1; n >= 0; n--)
		dev->broadcast[n] = 0xff;

	/* New-style flags. */
	dev->flags = IFF_BROADCAST|IFF_MULTICAST;

	/* No protocol address assigned yet. */
	dev->family = AF_INET;
	dev->pa_alen = sizeof(unsigned long);
	dev->pa_addr = 0;
	dev->pa_brdaddr = 0;
	dev->pa_mask = 0;
}
/*
 * Open/initialize the board.
 *
 * Claims the device's IRQ with el_interrupt() as the handler, records the
 * device in irq2dev_map[] so the handler can find it, resets the board,
 * marks the interface started and enables receive + interrupts.
 *
 * Returns 0 on success, -EAGAIN when the IRQ cannot be obtained.
 */
static int el_open(struct device *dev)
{
	/* NOTE(review): presumably referenced by the AX_CMD macro - confirm. */
	int ioaddr = dev->base_addr;
	int irq_err;

	if (el_debug > 2)
		printk("%s: Doing el_open()...", dev->name);

	/* From here on el_interrupt() runs whenever the card raises its IRQ. */
	irq_err = request_irq(dev->irq, &el_interrupt, 0, "3c501");
	if (irq_err != 0)
		return -EAGAIN;

	irq2dev_map[dev->irq] = dev;

	el_reset(dev);
	dev->start = 1;

	/* Aux control, irq and receive enabled */
	outb(AX_RX, AX_CMD);

	MOD_INC_USE_COUNT;
	return 0;
}
设置完网卡对应的数据结构后,如果有数据包到达,由驱动程序中的这两个函数处理。
/* The typical workload of the driver:
Handle the ether interface interrupts.

The device is looked up through irq2dev_map[irq]; an unknown or mismatched
entry is reported and ignored. dev->tbusy selects the path: when set the
interrupt is treated as a transmit completion (error, 16 collisions, single
collision with retransmit, or success), otherwise as a receive event. Except
for the collision-retransmit path, which returns early, the handler finishes
by putting the board back into receive mode and reading both status
registers.

NOTE(review): AX_STATUS, TX_STATUS, RX_STATUS, GP_LOW, RX_LOW, AX_CMD and
RX_BUF_CLR presumably expand to ports relative to the local `ioaddr` -
confirm against the driver header. */
static void
el_interrupt(int irq, struct pt_regs *regs)
{
struct device *dev = (struct device *)(irq2dev_map[irq]);
struct net_local *lp;
int ioaddr;
int axsr; /* Aux. status reg. */
/* Ignore interrupts that do not map back to a device using this irq. */
if (dev == NULL || dev->irq != irq) {
printk ("3c501 driver: irq %d for unknown device.\n", irq);
return;
}
ioaddr = dev->base_addr;
lp = (struct net_local *)dev->priv;
axsr = inb(AX_STATUS);
if (el_debug > 3)
printk("%s: el_interrupt() aux=%#02x", dev->name, axsr);
/* Re-entry is only reported; processing continues regardless. */
if (dev->interrupt)
printk("%s: Reentering the interrupt driver!\n", dev->name);
dev->interrupt = 1;
if (dev->tbusy) {
/*
* Board in transmit mode.
*/
int txsr = inb(TX_STATUS);
if (el_debug > 6)
printk(" txsr=%02x gp=%04x rp=%04x", txsr, inw(GP_LOW),
inw(RX_LOW));
if ((axsr & 0x80) && (txsr & TX_READY) == 0) {
/*
* FIXME: is there a logic to whether to keep on trying or
* reset immediately ?
*/
printk("%s: Unusual interrupt during Tx, txsr=%02x axsr=%02x"
" gp=%03x rp=%03x.\n", dev->name, txsr, axsr,
inw(ioaddr + EL1_DATAPTR), inw(ioaddr + EL1_RXPTR));
/* Release the transmitter and schedule the network bottom half. */
dev->tbusy = 0;
mark_bh(NET_BH);
} else if (txsr & TX_16COLLISIONS) {
/*
* Timed out
*/
if (el_debug)
printk("%s: Transmit failed 16 times, ethernet jammed?\n",
dev->name);
outb(AX_SYS, AX_CMD);
lp->stats.tx_aborted_errors++;
} else if (txsr & TX_COLLISION) {
/* Retrigger xmit. */
if (el_debug > 6)
printk(" retransmitting after a collision.\n");
/*
* Poor little chip can't reset its own start pointer
*/
outb(AX_SYS, AX_CMD);
outw(lp->tx_pkt_start, GP_LOW);
outb(AX_XMIT, AX_CMD);
lp->stats.collisions++;
/* Early exit: the common tail that re-enters receive mode is skipped. */
dev->interrupt = 0;
return;
} else {
/*
* It worked.. we will now fall through and receive
*/
lp->stats.tx_packets++;
if (el_debug > 6)
printk(" Tx succeeded %s\n",
(txsr & TX_RDY) ? "." : "but tx is busy!");
/*
* This is safe the interrupt is atomic WRT itself.
*/
dev->tbusy = 0;
mark_bh(NET_BH); /* In case more to transmit */
}
} else {
/*
* In receive mode.
*/
int rxsr = inb(RX_STATUS);
if (el_debug > 5)
printk(" rxsr=%02x txsr=%02x rp=%04x", rxsr, inb(TX_STATUS),
inw(RX_LOW));
/*
* Just reading rx_status fixes most errors.
*/
if (rxsr & RX_MISSED)
lp->stats.rx_missed_errors++;
if (rxsr & RX_RUNT) {
/* Handled to avoid board lock-up. */
lp->stats.rx_length_errors++;
if (el_debug > 5) printk(" runt.\n");
} else if (rxsr & RX_GOOD) {
/*
* Receive worked.
*/
/* A packet was received successfully: hand it to el_receive(). */
el_receive(dev);
} else {
/* Nothing? Something is broken! */
if (el_debug > 2)
printk("%s: No packet seen, rxsr=%02x **resetting 3c501***\n",
dev->name, rxsr);
el_reset(dev);
}
if (el_debug > 3)
printk(".\n");
}
/*
* Move into receive mode
*/
outb(AX_RX, AX_CMD);
outw(0x00, RX_BUF_CLR);
inb(RX_STATUS); /* Be certain that interrupts are cleared. */
inb(TX_STATUS);
dev->interrupt = 0;
return;
}
/*
 * We have a good packet. Well, not really "good", just mostly not broken.
 * We must check everything to see if it is good.
 *
 * Reads the frame length from the card, rejects lengths outside 60..1536
 * (counted as rx_over_errors), copies the frame into a freshly allocated
 * sk_buff and passes it up with netif_rx().  An allocation failure is
 * counted in rx_dropped and the frame is abandoned.
 */
static void
el_receive(struct device *dev)
{
	struct net_local *lp = (struct net_local *)dev->priv;
	int ioaddr = dev->base_addr;
	struct sk_buff *skb;
	int pkt_len;

	/* Length of the received frame. */
	pkt_len = inw(RX_LOW);

	if (el_debug > 4)
		printk(" el_receive %d.\n", pkt_len);

	/* Too short or too long to be a valid frame. */
	if (!(pkt_len >= 60 && pkt_len <= 1536)) {
		if (el_debug)
			printk("%s: bogus packet, length=%d\n", dev->name, pkt_len);
		lp->stats.rx_over_errors++;
		return;
	}

	/*
	 *	Command mode so we can empty the buffer
	 */
	outb(AX_SYS, AX_CMD);

	/* Buffer that will carry the frame up the stack. */
	skb = alloc_skb(pkt_len, GFP_ATOMIC);

	/*
	 *	Start of frame
	 */
	outw(0x00, GP_LOW);

	if (skb == NULL) {
		printk("%s: Memory squeeze, dropping packet.\n", dev->name);
		lp->stats.rx_dropped++;
		return;
	}

	/* Record the frame length and the device it arrived on. */
	skb->len = pkt_len;
	skb->dev = dev;

	/*
	 *	The read increments through the bytes. The interrupt
	 *	handler will fix the pointer when it returns to
	 *	receive mode.
	 */
	insb(DATAPORT, skb->data, pkt_len);

	/* Hand the frame to the link layer. */
	netif_rx(skb);
	lp->stats.rx_packets++;
}
驱动层处理生成一个skb结构体,然后通过netif_rx函数传给链路层。netif_rx直接把skb挂载到backlog队列中,然后结束中断处理,等下半部分再进行数据包的具体处理。由sock_init函数的代码我们知道,下半部分的处理函数是net_bh。
/*
 *	Receive a packet from a device driver and queue it for the upper
 *	(protocol) levels.  This is the recommended interface to use.
 *
 *	The caller never sees a failure, but the buffer is silently freed
 *	while the backlog is overloaded: dropping starts once backlog_size
 *	exceeds 300 and only stops again when the backlog is empty.
 */
void netif_rx(struct sk_buff *skb)
{
	static int dropping = 0;

	/*
	 *	Any received buffers are un-owned and should be discarded
	 *	when freed. These will be updated later as the frames get
	 *	owners.
	 */
	skb->free = 1;
	skb->sk = NULL;

	/* Timestamp the buffer unless it already carries one. */
	if (skb->stamp.tv_sec == 0)
		skb->stamp = xtime;

	/*
	 *	Check that we aren't overdoing things.
	 */
	if (backlog_size == 0)
		dropping = 0;
	else if (backlog_size > 300)
		dropping = 1;

	if (!dropping) {
		/*
		 *	Add it to the "backlog" queue.
		 */
#ifdef CONFIG_SKB_CHECK
		IS_SKB(skb);
#endif
		skb_queue_tail(&backlog,skb);
		backlog_size++;

		/*
		 *	If any packet arrived, mark it for processing after the
		 *	hardware interrupt returns.
		 */
		mark_bh(NET_BH);
		return;
	}

	/* Overloaded: discard the buffer. */
	kfree_skb(skb, FREE_READ);
}
/*
 *	Network bottom half, run after an interrupt handler has done
 *	mark_bh(NET_BH).
 *
 *	Drains the backlog queue filled by netif_rx().  For every buffer
 *	the link-level header is skipped, the device's type_trans() hook
 *	yields the protocol ID, and the buffer is delivered to each matching
 *	packet_type handler.  Interrupts are disabled only around the queue
 *	access, so the hardware can keep queueing while a packet is being
 *	processed.  dev_transmit() is called before, between and after the
 *	packets to flush pending output.
 *
 *	`tmp` is not used.
 */
void net_bh(void *tmp)
{
	struct sk_buff *skb;
	struct packet_type *ptype;
	struct packet_type *pt_prev;
	unsigned short type;

	/*
	 *	Atomically check and mark our BUSY state, so that the
	 *	bottom half is never entered twice.
	 */
	if (set_bit(1, (void*)&in_bh))
		return;

	/*
	 *	Can we send anything now? We want to clear the
	 *	decks for any more sends that get done as we
	 *	process the input.
	 */
	dev_transmit();

	/*
	 *	Any data left to process. This may occur because a
	 *	mark_bh() is done after we empty the queue including
	 *	that from the device which does a mark_bh() just after
	 */
	cli();

	/*
	 *	While the backlog (packets received by the drivers) is
	 *	not empty
	 */
	for (;;)
	{
		skb = skb_dequeue(&backlog);
		if (skb == NULL)
			break;

		/*
		 *	We have a packet. Therefore the queue has shrunk
		 */
		backlog_size--;
		sti();

		/*
		 *	Bump the pointer to the next structure.
		 *	This assumes that the basic 'skb' pointer points to
		 *	the MAC header, if any (as indicated by its "length"
		 *	field).  Take care now!
		 *
		 *	h.raw now addresses the data after the link-level
		 *	header, and len no longer counts that header.
		 */
		skb->h.raw = skb->data + skb->dev->hard_header_len;
		skb->len -= skb->dev->hard_header_len;

		/*
		 *	Fetch the packet protocol ID.  This is also quite ugly, as
		 *	it depends on the protocol driver (the interface itself) to
		 *	know what the type is, or where to get it from.  The Ethernet
		 *	interfaces fetch the ID from the two bytes in the Ethernet MAC
		 *	header (the h_proto field in struct ethhdr), but other drivers
		 *	may either use the ethernet ID's or extra ones that do not
		 *	clash (eg ETH_P_AX25). We could set this before we queue the
		 *	frame. In fact I may change this when I have time.
		 */
		type = skb->dev->type_trans(skb, skb->dev);

		/*
		 *	We got a packet ID.  Now loop over the "known protocols"
		 *	table (which is actually a linked list, but this will
		 *	change soon if I get my way- FvK), and forward the packet
		 *	to anyone who wants it.
		 *
		 *	[FvK didn't get his way but he is right this ought to be
		 *	hashed so we typically get a single hit. The speed cost
		 *	here is minimal but no doubt adds up at the 4,000+ pkts/second
		 *	rate we can hit flat out]
		 */
		pt_prev = NULL;
		ptype = ptype_base;
		while (ptype != NULL)
		{
			int wanted;

			/* The protocol matches, or the entry takes every type... */
			wanted = (ptype->type == type || ptype->type == htons(ETH_P_ALL));

			/* ...and it is not bound to some other device. */
			if (wanted && (!ptype->dev || ptype->dev == skb->dev))
			{
				/*
				 *	We already have a match queued. Deliver
				 *	a clone to it and then remember the new
				 *	match, so only the last match gets the
				 *	original buffer.
				 */
				if (pt_prev)
				{
					struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);

					/*
					 *	Kick the protocol handler. This should be fast
					 *	and efficient code.
					 */
					if (skb2)
						pt_prev->func(skb2, skb->dev, pt_prev);
				}
				/* Remember the current last to do */
				pt_prev = ptype;
			}
			ptype = ptype->next;
		} /* End of protocol list loop */

		/*
		 *	No handler wanted it: an unknown packet, free it.
		 *	Otherwise the last match receives the original buffer.
		 */
		if (pt_prev == NULL)
			kfree_skb(skb, FREE_WRITE);
		else
			pt_prev->func(skb, skb->dev, pt_prev);

		/*
		 *	Again, see if we can transmit anything now.
		 *	[Ought to take this out judging by tests it slows
		 *	 us down not speeds us up]
		 */
		dev_transmit();
		cli();
	}	/* End of queue loop */

	/*
	 *	We have emptied the queue; the busy mark is cleared while
	 *	interrupts are still disabled.
	 */
	in_bh = 0;
	sti();

	/*
	 *	One last output flush.
	 */
	dev_transmit();
}
这里假设上层协议是ip,ip层的处理函数是ip_rcv,代码如下。
/*
 *	This function receives all incoming IP datagrams.
 *
 *	skb - received buffer; skb->h.iph must already point at the IP header.
 *	dev - device the datagram arrived on.
 *	pt  - packet_type entry that matched (not used here).
 *
 *	The datagram is validated, optionally filtered by the firewall,
 *	forwarded or dropped when it is not addressed to this host,
 *	reassembled when it is a fragment, and finally delivered to the
 *	matching raw sockets and to every inet_protocol registered for
 *	iph->protocol.
 *
 *	Always returns 0.  Every early exit in this function frees the
 *	buffer itself, except after ip_defrag() returns NULL.
 *	NOTE(review): presumably ip_defrag() has then kept or freed the
 *	buffer - confirm.
 *
 *	NOTE(review): `opt` is a single static object shared by all calls.
 */
int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
{
	struct iphdr *iph = skb->h.iph;
	struct sock *raw_sk=NULL;
	unsigned char hash;
	unsigned char flag = 0;
	unsigned char opts_p = 0;	/* Set iff the packet has options. */
	struct inet_protocol *ipprot;
	static struct options opt; /* since we don't use these yet, and they
				take up stack space. */
	int brd=IS_MYADDR;
	int is_frag=0;
#ifdef CONFIG_IP_FIREWALL
	int err;
#endif

	ip_statistics.IpInReceives++;

	/*
	 *	Tag the ip header of this packet so we can find it
	 */
	skb->ip_hdr = iph;

	/*
	 *	Is the datagram acceptable?
	 *
	 *	1.	Length at least the size of an ip header
	 *	2.	Version of 4
	 *	3.	Total length fits in the buffer and covers at least the
	 *		header (ihl words), so the checksum below and the
	 *		payload length handed to the protocols stay in bounds
	 *	4.	Checksums correctly. [Speed optimisation for later, skip loopback checksums]
	 *	(5.	We ought to check for IP multicast addresses and undefined types.. does this matter ?)
	 */
	if (skb->len<sizeof(struct iphdr) || iph->ihl<5 || iph->version != 4 ||
		skb->len<ntohs(iph->tot_len) || ntohs(iph->tot_len)<iph->ihl*4 ||
		ip_fast_csum((unsigned char *)iph, iph->ihl) !=0)
	{
		ip_statistics.IpInHdrErrors++;
		kfree_skb(skb, FREE_WRITE);
		return(0);
	}

	/*
	 *	See if the firewall wants to dispose of the packet: anything
	 *	other than a verdict of 1 drops it, and -1 additionally sends
	 *	an ICMP port-unreachable back.
	 */
#ifdef CONFIG_IP_FIREWALL
	if ((err=ip_fw_chk(iph,dev,ip_fw_blk_chain,ip_fw_blk_policy, 0))!=1)
	{
		if(err==-1)
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0, dev);
		kfree_skb(skb, FREE_WRITE);
		return 0;
	}
#endif

	/*
	 *	Our transport medium may have padded the buffer out. Now we know it
	 *	is IP we can trim to the true length of the frame.
	 */
	skb->len=ntohs(iph->tot_len);

	/*
	 *	Next analyse the packet for options. Studies show under one packet in
	 *	a thousand have options....  (ihl == 5, i.e. a 20 byte header, is
	 *	the fast path for the typical optionless IP packet.)
	 */
	if (iph->ihl != 5)
	{
		memset((char *) &opt, 0, sizeof(opt));
		if (do_options(iph, &opt) != 0)
		{
			/* do_options() never sees the buffer, so drop it here. */
			kfree_skb(skb, FREE_WRITE);
			return 0;
		}
		opts_p = 1;
	}

	/*
	 *	Remember if the frame is fragmented.  A non-zero frag_off
	 *	word means some flag or offset bit is set.
	 */
	if(iph->frag_off)
	{
		/*
		 *	More Fragments flag (0x2000 in host order): further
		 *	fragments follow this one.
		 */
		if (ntohs(iph->frag_off) & 0x2000)
			is_frag|=1;
		/*
		 *	Non-zero offset: this is not the first fragment.
		 */
		if (ntohs(iph->frag_off) & 0x1fff)
			is_frag|=2;
	}

	/*
	 *	Do any IP forwarding required.  chk_addr() is expensive -- avoid it someday.
	 *
	 *	This is inefficient. While finding out if it is for us we could also compute
	 *	the routing table entry. This is where the great unified cache theory comes
	 *	in as and when someone implements it
	 *
	 *	For most hosts over 99% of packets match the first conditional
	 *	and don't go via ip_chk_addr. Note: brd is set to IS_MYADDR at
	 *	function entry.
	 */
	if ( iph->daddr != skb->dev->pa_addr && (brd = ip_chk_addr(iph->daddr)) == 0)
	{
		/*
		 *	Don't forward multicast or broadcast frames.
		 */
		if(skb->pkt_type!=PACKET_HOST || brd==IS_BROADCAST)
		{
			kfree_skb(skb,FREE_WRITE);
			return 0;
		}

		/*
		 *	The packet is for another target. Forward the frame
		 */
#ifdef CONFIG_IP_FORWARD
		ip_forward(skb, dev, is_frag);
#else
		/* Forwarding is disabled: just account for the stray datagram. */
		ip_statistics.IpInAddrErrors++;
#endif
		/*
		 *	The forwarder is inefficient and copies the packet. We
		 *	free the original now.
		 */
		kfree_skb(skb, FREE_WRITE);
		return(0);
	}

#ifdef CONFIG_IP_MULTICAST
	if(brd==IS_MULTICAST && iph->daddr!=IGMP_ALL_HOSTS && !(dev->flags&IFF_LOOPBACK))
	{
		/*
		 *	Check it is for one of our groups; reaching the end of
		 *	the device's list without a match drops the datagram.
		 */
		struct ip_mc_list *ip_mc=dev->ip_mc_list;
		do
		{
			if(ip_mc==NULL)
			{
				kfree_skb(skb, FREE_WRITE);
				return 0;
			}
			if(ip_mc->multiaddr==iph->daddr)
				break;
			ip_mc=ip_mc->next;
		}
		while(1);
	}
#endif

	/*
	 *	Account for the packet
	 */
#ifdef CONFIG_IP_ACCT
	ip_acct_cnt(iph,dev, ip_acct_chain);
#endif

	/*
	 *	Reassemble IP fragments.
	 */
	if(is_frag)
	{
		/* Defragment. Obtain the complete packet if there is one */
		skb=ip_defrag(iph,skb,dev);
		if(skb==NULL)
			return 0;
		skb->dev = dev;
		iph=skb->h.iph;
	}

	/*
	 *	Point into the IP datagram, just past the header, so the
	 *	upper layer finds its own header at skb->h.raw.
	 */
	skb->ip_hdr = iph;
	skb->h.raw += iph->ihl*4;

	/*
	 *	Deliver to raw sockets. This is fun as to avoid copies we want to make no surplus copies.
	 */
	hash = iph->protocol & (SOCK_ARRAY_SIZE-1);

	/* If there maybe a raw socket we must check - if not we don't care less */
	if((raw_sk=raw_prot.sock_array[hash])!=NULL)
	{
		struct sock *sknext=NULL;
		struct sk_buff *skb1;

		/* First raw socket on the chain that wants this datagram. */
		raw_sk=get_sock_raw(raw_sk, hash, iph->saddr, iph->daddr);
		if(raw_sk)	/* Any raw sockets */
		{
			do
			{
				/*
				 *	Find the next match, searching from the
				 *	entry after raw_sk.
				 */
				sknext=get_sock_raw(raw_sk->next, hash, iph->saddr, iph->daddr);
				/*
				 *	Another socket follows: raw_sk gets a
				 *	clone.  Otherwise raw_sk is the last one
				 *	and is served later with the original.
				 */
				if(sknext)
					skb1=skb_clone(skb, GFP_ATOMIC);
				else
					break;	/* One pending raw socket left */
				if(skb1)
					raw_rcv(raw_sk, skb1, dev, iph->saddr,iph->daddr);
				/* Move on to the match just found. */
				raw_sk=sknext;
			}
			while(raw_sk!=NULL);
			/* Here either raw_sk is the last raw socket, or NULL if none */
			/* We deliver to the last raw socket AFTER the protocol checks as it avoids a surplus copy */
		}
	}

	/*
	 *	skb->h.raw now points at the protocol beyond the IP header.
	 *	Walk the inet_protos hash chain for this protocol number and
	 *	hand the datagram to every matching entry.
	 */
	hash = iph->protocol & (MAX_INET_PROTOS -1);
	for (ipprot = (struct inet_protocol *)inet_protos[hash];ipprot != NULL;ipprot=(struct inet_protocol *)ipprot->next)
	{
		struct sk_buff *skb2;

		if (ipprot->protocol != iph->protocol)
			continue;

		/*
		 *	See if we need to make a copy of it.  This will
		 *	only be set if more than one protocol wants it.
		 *	and then not for the last one. If there is a pending
		 *	raw delivery wait for that
		 *
		 *	A pending raw socket (raw_sk, decided above) also
		 *	forces a clone, because the original is reserved for
		 *	it.
		 */
		if (ipprot->copy || raw_sk)
		{
			skb2 = skb_clone(skb, GFP_ATOMIC);
			if(skb2==NULL)
				continue;
		}
		else
		{
			skb2 = skb;
		}

		/* An upper-layer protocol accepted the datagram. */
		flag = 1;

		/*
		 *	Pass on the datagram to each protocol that wants it,
		 *	based on the datagram protocol.  We should really
		 *	check the protocol handler's return values here...
		 */
		ipprot->handler(skb2, dev, opts_p ? &opt : 0, iph->daddr,
				(ntohs(iph->tot_len) - (iph->ihl * 4)),
				iph->saddr, 0, ipprot);
	}

	/*
	 *	All protocols checked.
	 *	If this packet was a broadcast, we may *not* reply to it, since that
	 *	causes (proven, grin) ARP storms and a leakage of memory (i.e. all
	 *	ICMP reply messages get queued up for transmission...)
	 */
	if(raw_sk!=NULL)	/* Shift to last raw user */
		raw_rcv(raw_sk, skb, dev, iph->saddr, iph->daddr);
	else if (!flag)		/* Free and report errors */
	{
		/*
		 *	Nobody handles this protocol: answer with an ICMP
		 *	protocol-unreachable unless it was a broadcast or
		 *	multicast.
		 */
		if (brd != IS_BROADCAST && brd!=IS_MULTICAST)
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0, dev);
		kfree_skb(skb, FREE_WRITE);
	}

	return(0);
}
ip层根据ip头中的协议号在inet_protos哈希表中找到对应的链表并遍历,找到协议号相等的节点,把数据包交给该节点的处理函数。比如tcp协议对应的处理函数是tcp_rcv,该函数把skb挂载到socket的接收队列等待读取,或者建立一个连接等。应用层使用read函数进行读取的时候,就从接收队列摘下一个skb。至此,一个数据包从网卡到应用层的过程就结束了。