linux网络协议栈源码分析 - 邻居子系统邻居状态转移

1、邻居项状态转移图

        邻居项主要的状态转移如下(省略邻居项垃圾回收及转移原因,更权威详细的状态转移图参看《深入理解LINUX网络技术内幕》P648 "图26-13: NUD状态间的转换"):

linux网络协议栈源码分析 - 邻居子系统邻居状态转移_第1张图片

2、进入NONE状态并初始化邻居项

2.1、创建邻居表项进入NONE状态(neigh_alloc)

        邻居子系统调用neigh_alloc创建邻居项的时候,邻居项初始化为NONE状态;网络层发送报文时,需要查找目的IP地址对应的邻居项,ip_finish_output2调用__ipv4_neigh_lookup_noref查找邻居项,如果没有找到,就调用__neigh_create、neigh_alloc创建新的邻居项,如下是ping发送报文的调用栈:

         neigh_alloc实现代码如下:

/*
 * neigh_alloc - allocate a neighbour entry and leave it in NUD_NONE.
 * @tbl: neighbour table the new entry is accounted to
 * @dev: device whose neigh_priv_len is added to the allocation size
 *
 * Returns the new entry (refcnt 1, dead = 1, state NUD_NONE) or NULL when
 * the table is over its limit or the allocation fails. On every failure
 * path the increment of tbl->entries is rolled back.
 */
static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)
{
	struct neighbour *n = NULL;
	unsigned long now = jiffies;
	int entries;

	entries = atomic_inc_return(&tbl->entries) - 1; // entry count before this allocation
	/* Garbage-collection handling: at/above gc_thresh3, or at/above
	 * gc_thresh2 with tbl->last_flush more than 5*HZ in the past.
	 */
	if (entries >= tbl->gc_thresh3 ||
	    (entries >= tbl->gc_thresh2 &&
	     time_after(now, tbl->last_flush + 5 * HZ))) {
		/* Refuse the allocation only if neigh_forced_gc() returned 0
		 * and the table is still at/above the hard limit gc_thresh3.
		 */
		if (!neigh_forced_gc(tbl) &&
		    entries >= tbl->gc_thresh3) {
			net_info_ratelimited("%s: neighbor table overflow!\n",
					     tbl->id);
			NEIGH_CACHE_STAT_INC(tbl, table_fulls);
			goto out_entries;
		}
	}

	n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
	if (!n)
		goto out_entries;

	__skb_queue_head_init(&n->arp_queue);
	rwlock_init(&n->lock);
	seqlock_init(&n->ha_lock);
	n->updated	  = n->used = now; // last-updated / last-used timestamps; later state transitions compare against them
	n->nud_state	  = NUD_NONE; // a fresh entry starts in NUD_NONE
	/* Output hook while in NUD_NONE. NOTE(review): neigh_blackhole is not
	 * shown here; per the article it frees the packet and returns, i.e.
	 * nothing can be transmitted through an entry in this state.
	 */
	n->output	  = neigh_blackhole;
	seqlock_init(&n->hh.hh_lock);
	n->parms	  = neigh_parms_clone(&tbl->parms);
	/* Bind the per-entry timer: on expiry neigh_timer_handler is called
	 * with the entry pointer (cast to unsigned long) as its argument.
	 */
	setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n);

	NEIGH_CACHE_STAT_INC(tbl, allocs);
	n->tbl		  = tbl;
	atomic_set(&n->refcnt, 1);
	n->dead		  = 1; // stays 1 until __neigh_create links the entry into the hash table
out:
	return n;

out_entries:
	/* Undo the atomic_inc_return() above; n is still NULL here. */
	atomic_dec(&tbl->entries);
	goto out;
}

2.2、初始化协议相关字段(__neigh_create)

        neigh_alloc主要是创建一个邻居项,设置邻居项状态为NONE,初始化更新/使用时间,初始化邻居项定时器的回调函数及回调函数的参数;neigh_alloc只初始化了部分通用的数据,更多的设置在__neigh_create函数里面,__neigh_create设置邻居项的输出网卡设备,调用arp_constructor初始化邻居项ARP相关数据(设置邻居项输出函数指针为neigh_resolve_output,不同的链路使用不同的协议,输出函数指针也不同,并初始化其他参数),__neigh_create实现代码如下:

/*
 * __neigh_create - allocate, initialise and hash a neighbour entry.
 * @tbl:      neighbour table to insert into
 * @pkey:     protocol key (tbl->key_len bytes), copied into primary_key
 * @dev:      output device for the neighbour
 * @want_ref: when true, an extra reference is taken on the returned entry
 *
 * Returns the inserted entry, or an already-hashed entry with the same
 * (dev, key) if one is found under the table lock (the freshly allocated
 * one is then released), or an ERR_PTR() on failure.
 */
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
				 struct net_device *dev, bool want_ref)
{
	u32 hash_val;
	int key_len = tbl->key_len;
	int error;
	struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);
	struct neigh_hash_table *nht;

	if (!n) {
		rc = ERR_PTR(-ENOBUFS);
		goto out;
	}

	memcpy(n->primary_key, pkey, key_len);
	n->dev = dev; // output device of this neighbour (per the article: taken from the route to the destination)
	dev_hold(dev);

	/* Protocol specific setup. */
	/* For the ARP table this is arp_constructor (per the article), which
	 * installs the neigh ops and the output hook (neigh_resolve_output).
	 */
	if (tbl->constructor &&	(error = tbl->constructor(n)) < 0) {
		rc = ERR_PTR(error);
		goto out_neigh_release;
	}

	if (dev->netdev_ops->ndo_neigh_construct) {
		error = dev->netdev_ops->ndo_neigh_construct(n);
		if (error < 0) {
			rc = ERR_PTR(error);
			goto out_neigh_release;
		}
	}

	/* Device specific setup. */
	if (n->parms->neigh_setup &&
	    (error = n->parms->neigh_setup(n)) < 0) {
		rc = ERR_PTR(error);
		goto out_neigh_release;
	}

	/* Back-date the confirmation time by twice BASE_REACHABLE_TIME
	 * (note the << 1); neigh_timer_handler compares against confirmed.
	 */
	n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1);

	write_lock_bh(&tbl->lock);
	nht = rcu_dereference_protected(tbl->nht,
					lockdep_is_held(&tbl->lock));

	/* Grow the hash table once entries exceed the bucket count. */
	if (atomic_read(&tbl->entries) > (1 << nht->hash_shift))
		nht = neigh_hash_grow(tbl, nht->hash_shift + 1);

	hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);

	if (n->parms->dead) {
		rc = ERR_PTR(-EINVAL);
		goto out_tbl_unlock;
	}

	/* Look for an existing entry with the same device and key; if found,
	 * return it instead and drop the new one via out_tbl_unlock.
	 */
	for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val],
					    lockdep_is_held(&tbl->lock));
	     n1 != NULL;
	     n1 = rcu_dereference_protected(n1->next,
			lockdep_is_held(&tbl->lock))) {
		if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) {
			if (want_ref)
				neigh_hold(n1);
			rc = n1;
			goto out_tbl_unlock;
		}
	}

	n->dead = 0; // entry is about to become visible in the table
	if (want_ref)
		neigh_hold(n);
	/* Link at the head of the bucket: set n->next first, then publish n. */
	rcu_assign_pointer(n->next,
			   rcu_dereference_protected(nht->hash_buckets[hash_val],
						     lockdep_is_held(&tbl->lock)));
	rcu_assign_pointer(nht->hash_buckets[hash_val], n);
	write_unlock_bh(&tbl->lock);
	neigh_dbg(2, "neigh %p is created\n", n);
	rc = n;
out:
	return rc;
out_tbl_unlock:
	write_unlock_bh(&tbl->lock);
out_neigh_release:
	/* Drop the reference neigh_alloc() gave us on the unused entry. */
	neigh_release(n);
	goto out;
}

        

3、进入INCOMPLETE状态及INCOMPLETE状态超时

3.1、进入INCOMPLETE并发送ARP请求(neigh_probe)

        ip_finish_output2查找到或者创建邻居项之后,调用dst_neigh_output,发送数据到链路层,dst_neigh_output在邻居项非连接状态,调用n->output发送报文,n->output在创建邻居项时设置为neigh_resolve_output,dst_neigh_output实现代码如下:

/*
 * dst_neigh_output - hand @skb to the neighbour layer for transmission.
 *
 * Consumes a pending confirmation on @dst by refreshing n->confirmed, then
 * uses the cached hardware header when the neighbour is in a connected
 * state and a header is cached; in every other case (INCOMPLETE and the
 * other non-connected states) the packet goes through n->output, which per
 * the article is neigh_resolve_output for ARP neighbours.
 */
static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
				   struct sk_buff *skb)
{
	const struct hh_cache *cached_hdr = &n->hh;

	if (dst->pending_confirm) {
		unsigned long stamp = jiffies;

		dst->pending_confirm = 0;
		/* write only on change, to avoid dirtying the neighbour */
		if (n->confirmed != stamp)
			n->confirmed = stamp;
	}

	/* Slow path: not connected, or no cached header yet. */
	if (!(n->nud_state & NUD_CONNECTED) || !cached_hdr->hh_len)
		return n->output(n, skb);

	return neigh_hh_output(cached_hdr, skb);
}

        neigh_resolve_output首先调用neigh_event_send,neigh_event_send更新邻居项的使用时间;如果邻居项不处于连接、延迟、探测状态,就调用__neigh_event_send处理,否则直接返回0。连接、延迟、探测状态下没有立即发送ARP请求的必要:连接状态下邻居是可达的,已经获取到了目的IP地址的MAC地址,或者链路层不需要MAC地址;延迟状态下,定时器超时后才会进入探测状态,由探测状态发送ARP请求去获取目的IP地址的MAC地址;探测状态下已经发送了ARP请求,正在等待ARP应答。因此延迟、探测状态都不需要立即发送ARP请求,避免发送过多的ARP请求到网络上。neigh_event_send实现代码如下:

/*
 * neigh_event_send - note that the neighbour is being used for output.
 *
 * Refreshes neigh->used and returns 0 straight away when the entry is in a
 * connected, DELAY or PROBE state; every other state is delegated to
 * __neigh_event_send(), whose return value is passed through.
 */
static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
	const unsigned long stamp = jiffies;

	/* write only on change */
	if (neigh->used != stamp)
		neigh->used = stamp;

	if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
		return 0;

	return __neigh_event_send(neigh, skb);
}

        __neigh_event_send检查邻居项当前状态,判断是否需要发送探测报文,并根据当前的状态设置相关的定时器:NONE、FAILED状态转移到INCOMPLETE状态并启动超时定时器,STALE状态转移到DELAY状态并启动延迟定时器;INCOMPLETE状态下如果有数据要发送,就把当前要发送的数据缓存到ARP缓存队列(这样邻居可达之后就可以立即发送数据,不需要等待TCP等上层协议超时重传;缓存不够的话,丢弃最早缓存的数据);需要立即探测时,调用neigh_probe发送相关的探测报文,__neigh_event_send实现代码如下:

/*
 * __neigh_event_send - slow path of neigh_event_send().
 * @neigh: neighbour the packet is being sent to
 * @skb:   packet to send, may be NULL
 *
 * Drives the NONE/FAILED -> INCOMPLETE and STALE -> DELAY transitions and
 * queues @skb on arp_queue while the address is unresolved.
 *
 * Returns 0 when the caller may transmit @skb itself, 1 when @skb has been
 * queued or freed here.
 */
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
	int rc;
	bool immediate_probe = false;

	write_lock_bh(&neigh->lock);

	rc = 0;
	/* CONNECTED, DELAY and PROBE need no ARP request: just unlock. */
	if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
		goto out_unlock_bh;
	if (neigh->dead)
		goto out_dead;

	/* Neither STALE nor INCOMPLETE: given the states filtered out above,
	 * what is left here is essentially NONE or FAILED.
	 */
	if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) {
		/* Probe-count parameters; see "Understanding Linux Network
		 * Internals", p. 771.
		 */
		if (NEIGH_VAR(neigh->parms, MCAST_PROBES) +
		    NEIGH_VAR(neigh->parms, APP_PROBES)) {
			unsigned long next, now = jiffies;

			/* Start the probe counter at UCAST_PROBES. Unanswered
			 * requests are retransmitted from the timer until the
			 * maximum is reached, then the neighbour is declared
			 * unreachable.
			 */
			atomic_set(&neigh->probes,
				   NEIGH_VAR(neigh->parms, UCAST_PROBES));
			neigh->nud_state     = NUD_INCOMPLETE; // enter INCOMPLETE
			neigh->updated = now; // record the time of the state change
			/* First retransmit deadline: RETRANS_TIME, but at
			 * least HZ/2 from now.
			 */
			next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
					 HZ/2);
			neigh_add_timer(neigh, next); // arm the entry timer for that deadline
			immediate_probe = true; // send the first probe right away (see out_unlock_bh)
		} else {
			/* No probes configured at all: fail immediately and
			 * drop the packet.
			 */
			neigh->nud_state = NUD_FAILED;
			neigh->updated = jiffies;
			write_unlock_bh(&neigh->lock);

			kfree_skb(skb);
			return 1;
		}
	} else if (neigh->nud_state & NUD_STALE) {
		/* STALE entry used for output: move to DELAY and arm the
		 * DELAY_PROBE_TIME timer; rc stays 0 so the caller transmits.
		 */
		neigh_dbg(2, "neigh %p is delayed\n", neigh);
		neigh->nud_state = NUD_DELAY;
		neigh->updated = jiffies;
		neigh_add_timer(neigh, jiffies +
				NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME));
	}

	/* INCOMPLETE: either just entered above, or still unresolved from an
	 * earlier call.
	 */
	if (neigh->nud_state == NUD_INCOMPLETE) {
		/* Queue the packet rather than dropping it: the ARP reply may
		 * arrive quickly, whereas a dropped packet (e.g. TCP) would
		 * only be resent after the upper layer's retransmit timeout.
		 */
		if (skb) {
			/* Make room: while queued bytes plus this packet exceed
			 * QUEUE_LEN_BYTES, drop from the head (oldest first).
			 */
			while (neigh->arp_queue_len_bytes + skb->truesize >
			       NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) {
				struct sk_buff *buff;

				buff = __skb_dequeue(&neigh->arp_queue); // oldest queued packet
				if (!buff) // queue already empty: stop
					break;
				neigh->arp_queue_len_bytes -= buff->truesize; // account for the dropped packet
				kfree_skb(buff); // free the oldest packet
				NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
			}
			skb_dst_force(skb);
			__skb_queue_tail(&neigh->arp_queue, skb); // append this packet to the ARP queue
			neigh->arp_queue_len_bytes += skb->truesize; // update queued byte count
		}
		rc = 1;
	}
out_unlock_bh:
	/* Only the NONE/FAILED -> INCOMPLETE transition probes immediately;
	 * other states wait for their timer. neigh_probe() is entered with
	 * neigh->lock held and is expected to release it (the else branch is
	 * the only explicit unlock on this path).
	 */
	if (immediate_probe)
		neigh_probe(neigh); // neighbour probe: sends the ARP request for ARP entries
	else
		write_unlock(&neigh->lock);
	local_bh_enable();
	return rc;

out_dead:
	/* Dead entry: a STALE one still lets the caller transmit (rc == 0);
	 * otherwise drop the packet.
	 */
	if (neigh->nud_state & NUD_STALE)
		goto out_unlock_bh;
	write_unlock_bh(&neigh->lock);
	kfree_skb(skb);
	return 1;
}

        对于ARP协议,neigh_probe函数调用arp_solicit构建ARP请求报文并发送请求到邻居项输出网卡设备,arp_solicit计算源IP地址、增加发送次数,调用arp_send_dst构建ARP报文发送到网卡设备,实现代码如下:


/*
 * arp_solicit - build and send an ARP request for @neigh.
 * @neigh: neighbour being resolved; primary_key holds the target IPv4 address
 * @skb:   packet that triggered the probe, may be NULL; used to pick the
 *         source IP and (optionally) the dst passed to arp_send_dst()
 *
 * The source address is chosen according to the device's arp_announce
 * setting. Depending on the probe counter the request is sent to the cached
 * hardware address (dst_hw set), sent with dst_hw == NULL, or handed to
 * neigh_app_ns() instead of being sent.
 */
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
	__be32 saddr = 0;
	u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL;
	struct net_device *dev = neigh->dev; // output device of the neighbour
	__be32 target = *(__be32 *)neigh->primary_key; // target IP address
	int probes = atomic_read(&neigh->probes); // current probe count (read only, not incremented here)
	struct in_device *in_dev;
	struct dst_entry *dst = NULL;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev) {
		rcu_read_unlock();
		return;
	}
	switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
	default:
	case 0:		/* By default announce any local IP */
		/* Use the packet's source address if it is a local address. */
		if (skb && inet_addr_type_dev_table(dev_net(dev), dev,
					  ip_hdr(skb)->saddr) == RTN_LOCAL)
			saddr = ip_hdr(skb)->saddr; // becomes the ARP sender IP
		break;
	case 1:		/* Restrict announcements of saddr in same subnet */
		if (!skb)
			break;
		saddr = ip_hdr(skb)->saddr;
		if (inet_addr_type_dev_table(dev_net(dev), dev,
					     saddr) == RTN_LOCAL) {
			/* saddr should be known to target */
			if (inet_addr_onlink(in_dev, target, saddr))
				break;
		}
		saddr = 0;
		break;
	case 2:		/* Avoid secondary IPs, get a primary/preferred one */
		break;
	}
	rcu_read_unlock();

	/* Nothing suitable from the packet: let the stack select one. */
	if (!saddr)
		saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);

	/* The first UCAST_PROBES probes go to the cached hardware address. */
	probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
	if (probes < 0) {
		if (!(neigh->nud_state & NUD_VALID))
			pr_debug("trying to ucast probe in NUD_INVALID\n");
		neigh_ha_snapshot(dst_ha, neigh, dev);
		dst_hw = dst_ha;
	} else {
		/* Past the unicast budget: the next APP_PROBES probes are
		 * delegated to neigh_app_ns() and nothing is sent from here.
		 */
		probes -= NEIGH_VAR(neigh->parms, APP_PROBES);
		if (probes < 0) {
			neigh_app_ns(neigh);
			return;
		}
	}

	if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE))
		dst = skb_dst(skb);
	/* Send the request: op ARPOP_REQUEST, type ETH_P_ARP, target IP
	 * `target`, sender IP `saddr`, out of device `dev`.
	 */
	arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
		     dst_hw, dev->dev_addr, NULL, dst);
}

3.2、ARP请求发送调用栈(arp_send_dst)

linux网络协议栈源码分析 - 邻居子系统邻居状态转移_第2张图片

 3.3、超时重传(neigh_timer_handler)

        内核定时器调用栈如下:

         硬件tick定时器超时,产生IRQ中断,定时器相关函数检查定时器的wheel,如果有定时器超时,那么触发一个软中断,中断返回时,检查到有触发软中断,那么内核就去处理相关的软中断,run_timer_softirq处理所有超时的定时器,调用对应的定时器回调函数,定时器回调调用代码如下:

  1177         trace_timer_expire_entry(timer);
  1178         fn(data);
  1179         trace_timer_expire_exit(timer);

        对于邻居项超时定时器,上面的fn就是neigh_timer_handler,data就是邻居项指针,前面邻居创建的时候已经设置好了这几个参数。

        neigh_timer_handler检查邻居项当前的状态,对于INCOMPLETE状态的邻居项,计算下一次超时重传的时间,检查重传次数是否超过了最大重传次数,如果超过了最大重传次数就进入FAILED状态并清理发送缓存等,如果没有超过最大重传次数,那么再次设置超时定时器,调用邻居探测函数接口发送探测报文并增加发送次数,neigh_timer_handler实现代码如下:

/*
 * neigh_timer_handler - per-neighbour timer callback.
 * @arg: the struct neighbour pointer, cast to unsigned long (as registered
 *       with setup_timer() in neigh_alloc)
 *
 * Runs the timed part of the NUD state machine: REACHABLE ages to DELAY or
 * STALE, DELAY resolves to REACHABLE or PROBE, and INCOMPLETE/PROBE either
 * retransmit a probe or give up and move to FAILED.
 */
static void neigh_timer_handler(unsigned long arg)
{
	unsigned long now, next;
	struct neighbour *neigh = (struct neighbour *)arg;
	unsigned int state;
	int notify = 0;

	write_lock(&neigh->lock);

	state = neigh->nud_state;
	now = jiffies;
	next = now + HZ;

	/* The entry may have left the timed states since the timer was armed
	 * (a state change does not necessarily cancel the timer); such states
	 * have no timeout, so there is nothing to do.
	 */
	if (!(state & NUD_IN_TIMER))
		goto out;

	if (state & NUD_REACHABLE) {
		if (time_before_eq(now,
				   neigh->confirmed + neigh->parms->reachable_time)) {
			neigh_dbg(2, "neigh %p is still alive\n", neigh);
			next = neigh->confirmed + neigh->parms->reachable_time;
		} else if (time_before_eq(now,
					  neigh->used +
					  NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
			neigh_dbg(2, "neigh %p is delayed\n", neigh);
			neigh->nud_state = NUD_DELAY;
			neigh->updated = jiffies;
			neigh_suspect(neigh);
			next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME);
		} else {
			neigh_dbg(2, "neigh %p is suspected\n", neigh);
			neigh->nud_state = NUD_STALE;
			neigh->updated = jiffies;
			neigh_suspect(neigh);
			notify = 1;
		}
	} else if (state & NUD_DELAY) {
		if (time_before_eq(now,
				   neigh->confirmed +
				   NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
			neigh_dbg(2, "neigh %p is now reachable\n", neigh);
			neigh->nud_state = NUD_REACHABLE;
			neigh->updated = jiffies;
			neigh_connect(neigh);
			notify = 1;
			next = neigh->confirmed + neigh->parms->reachable_time;
		} else {
			neigh_dbg(2, "neigh %p is probed\n", neigh);
			neigh->nud_state = NUD_PROBE;
			neigh->updated = jiffies;
			atomic_set(&neigh->probes, 0);
			notify = 1;
			next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
		}
	} else {
		/* NUD_PROBE|NUD_INCOMPLETE */
		/* Both states retransmit on timeout; the interval is the
		 * fixed RETRANS_TIME (subject to the HZ/2 floor below).
		 */
		next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
	}

	/* INCOMPLETE/PROBE timed out with the probe budget exhausted: give up
	 * and move to FAILED.
	 */
	if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
	    atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) {
		neigh->nud_state = NUD_FAILED; // probe limit reached
		notify = 1;
		neigh_invalidate(neigh); // FAILED handling (per the article: releases the queued ARP packets, etc.)
		goto out;
	}

	/* Still in a timed state: re-arm the timer, no sooner than HZ/2 from
	 * now. NOTE(review): the neigh_hold() on a zero return from mod_timer()
	 * presumably takes the reference owned by a newly armed timer - confirm.
	 */
	if (neigh->nud_state & NUD_IN_TIMER) {
		if (time_before(next, jiffies + HZ/2))
			next = jiffies + HZ/2;
		if (!mod_timer(&neigh->timer, next))
			neigh_hold(neigh);
	}
	/* INCOMPLETE/PROBE: send another probe. neigh_probe() is entered with
	 * neigh->lock held and is expected to release it; per the article the
	 * probe counter is incremented inside neigh_probe().
	 */
	if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
		neigh_probe(neigh);
	} else { // other states send no probe; just unlock
out:
		write_unlock(&neigh->lock);
	}

	if (notify)
		neigh_update_notify(neigh);

	neigh_release(neigh);
}

4、INCOMPLETE转换REACHABLE状态

4.1、ARP报文处理并更新邻居状态(arp_process)

        INCOMPLETE状态只有在收到邻居的ARP报文的时候,才会转换为REACHABLE状态,状态转换在arp_process函数里面完成;arp_process检查ARP报文的硬件类型、协议类型、操作码等是否有效,无效就释放报文,有效则更新相应的邻居项:如果收到的是单播发给本机的ARP应答报文,就把邻居项的状态更新为REACHABLE,否则(ARP请求或者广播的应答)更新为STALE。更具体的解释可以参考机械工业出版社《Linux内核源码剖析:TCP/IP实现(上册)》"18.1.2 arp_process",虽然版本比较旧,但是实现基本一样。arp_process实现代码如下:

/*
 * arp_process - handle one received ARP packet.
 * @net: network namespace
 * @sk:  unused in this function
 * @skb: the received ARP packet; skb->dev is the receiving device
 *
 * Validates the ARP header against the device type, answers requests that
 * are for a local or proxied address, and updates the neighbour entry of
 * the sender. Always returns 0. The skb is consumed on the `out` path; on
 * the `out_free_dst` path it has been handed to pneigh_enqueue() instead.
 */
static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb->dev; // device the packet was received on
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct arphdr *arp;
	unsigned char *arp_ptr;
	struct rtable *rt;
	unsigned char *sha;
	__be32 sip, tip;
	u16 dev_type = dev->type;
	int addr_type;
	struct neighbour *n;
	struct dst_entry *reply_dst = NULL;
	bool is_garp = false;

	/* arp_rcv below verifies the ARP header and verifies the device
	 * is ARP'able.
	 */

	if (!in_dev)
		goto out;

	arp = arp_hdr(skb); // ARP header

	switch (dev_type) {
	default:
		if (arp->ar_pro != htons(ETH_P_IP) ||
		    htons(dev_type) != arp->ar_hrd)
			goto out;
		break;
	case ARPHRD_ETHER:
	case ARPHRD_FDDI:
	case ARPHRD_IEEE802:
		/*
		 * ETHERNET, and Fibre Channel (which are IEEE 802
		 * devices, according to RFC 2625) devices will accept ARP
		 * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2).
		 * This is the case also of FDDI, where the RFC 1390 says that
		 * FDDI devices should accept ARP hardware of (1) Ethernet,
		 * however, to be more robust, we'll accept both 1 (Ethernet)
		 * or 6 (IEEE 802.2)
		 */
		/* Hardware type must be Ethernet or IEEE 802 and the protocol
		 * type must be IP; anything else is not handled here.
		 */
		if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
		     arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
		    arp->ar_pro != htons(ETH_P_IP))
			goto out;
		break;
	case ARPHRD_AX25:
		if (arp->ar_pro != htons(AX25_P_IP) ||
		    arp->ar_hrd != htons(ARPHRD_AX25))
			goto out;
		break;
	case ARPHRD_NETROM:
		if (arp->ar_pro != htons(AX25_P_IP) ||
		    arp->ar_hrd != htons(ARPHRD_NETROM))
			goto out;
		break;
	}

	/* Understand only these message types */

	/* Only ARPOP_REQUEST and ARPOP_REPLY are processed. */
	if (arp->ar_op != htons(ARPOP_REPLY) &&
	    arp->ar_op != htons(ARPOP_REQUEST))
		goto out;

/*
 *	Extract fields
 */
	/* arp + 1 steps over the fixed struct arphdr, so arp_ptr points at
	 * the first variable-length field: the sender hardware address.
	 */
	arp_ptr = (unsigned char *)(arp + 1);
	sha	= arp_ptr; // sender hardware address
	arp_ptr += dev->addr_len; // step over it: now at the sender IP address
	memcpy(&sip, arp_ptr, 4); // copy the sender IP into sip
	arp_ptr += 4; // now at the target hardware address (not needed by the receiver)
	switch (dev_type) {
#if IS_ENABLED(CONFIG_FIREWIRE_NET)
	case ARPHRD_IEEE1394:
		/* No target hardware address is skipped for this type. */
		break;
#endif
	default:
		arp_ptr += dev->addr_len; // skip the target hardware address: now at the target IP
	}
	memcpy(&tip, arp_ptr, 4); // copy the target IP into tip
/*
 *	Check for bad requests for 127.x.x.x and requests for multicast
 *	addresses.  If this is one such, delete it.
 */
	/* Drop packets whose target IP is multicast, or is loopback while
	 * ROUTE_LOCALNET is not enabled on the receiving device.
	 */
	if (ipv4_is_multicast(tip) ||
	    (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
		goto out;

/*
 *     Special case: We must set Frame Relay source Q.922 address
 */
	if (dev_type == ARPHRD_DLCI)
		sha = dev->broadcast;

/*
 *  Process entry.  The idea here is we want to send a reply if it is a
 *  request for us or if it is a request for someone else that we hold
 *  a proxy for.  We want to add an entry to our cache if it is a reply
 *  to us or if it is a request for our address.
 *  (The assumption for this last is that if someone is requesting our
 *  address, they are probably intending to talk to us, so it saves time
 *  if we cache their address.  Their address is also probably not in
 *  our cache, since ours is not in their cache.)
 *
 *  Putting this another way, we only care about replies if they are to
 *  us, in which case we add them to the cache.  For requests, we care
 *  about those for us and those for our proxies.  We reply to both,
 *  and in the case of requests for us we add the requester to the arp
 *  cache.
 */

	if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb))
		reply_dst = (struct dst_entry *)
			    iptunnel_metadata_reply(skb_metadata_dst(skb),
						    GFP_ATOMIC);

	/* Special case: IPv4 duplicate address detection packet (RFC2131) */
	if (sip == 0) { // address-conflict detection probe (sender IP is zero)
		/* The probed address is one of ours and arp_ignore allows
		 * answering: reply so the prober sees the conflict.
		 */
		if (arp->ar_op == htons(ARPOP_REQUEST) &&
		    inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL &&
		    !arp_ignore(in_dev, sip, tip))
			arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip,
				     sha, dev->dev_addr, sha, reply_dst); // send the ARP reply
		goto out; // nothing to cache for a zero sender IP
	}

	/* Request with a successful input-route lookup (tip as destination,
	 * sip as source).
	 */
	if (arp->ar_op == htons(ARPOP_REQUEST) &&
	    ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {

		rt = skb_rtable(skb);
		addr_type = rt->rt_type;

		if (addr_type == RTN_LOCAL) { // the target address is local
			int dont_send;

			dont_send = arp_ignore(in_dev, sip, tip); // arp_ignore policy: suppress the reply?
			if (!dont_send && IN_DEV_ARPFILTER(in_dev))
				dont_send = arp_filter(sip, tip, dev); // arp_filter policy: suppress the reply?
			if (!dont_send) { // neither ignored nor filtered
				n = neigh_event_ns(&arp_tbl, sha, &sip, dev); // update the requester's neighbour entry
				if (n) {
					arp_send_dst(ARPOP_REPLY, ETH_P_ARP,
						     sip, dev, tip, sha,
						     dev->dev_addr, sha,
						     reply_dst); // send the ARP reply
					neigh_release(n);
				}
			}
			goto out;
		} else if (IN_DEV_FORWARD(in_dev)) { // proxy ARP
			if (addr_type == RTN_UNICAST  &&
			    (arp_fwd_proxy(in_dev, dev, rt) ||
			     arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
			     (rt->dst.dev != dev &&
			      pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {
				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
				if (n)
					neigh_release(n);

				/* Reply now, or queue the request for a
				 * delayed proxy reply.
				 */
				if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
				    skb->pkt_type == PACKET_HOST ||
				    NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) {
					arp_send_dst(ARPOP_REPLY, ETH_P_ARP,
						     sip, dev, tip, sha,
						     dev->dev_addr, sha,
						     reply_dst);
				} else {
					pneigh_enqueue(&arp_tbl,
						       in_dev->arp_parms, skb);
					/* skb now belongs to the proxy queue:
					 * skip consume_skb().
					 */
					goto out_free_dst;
				}
				goto out;
			}
		}
	}

	/* Update our ARP tables */

	/* Reached for ARP replies, for requests without an input route, and
	 * for requests that are neither local nor proxied. Look up the
	 * sender's entry (last argument 0: per the article, do not create).
	 */
	n = __neigh_lookup(&arp_tbl, &sip, dev, 0);

	if (IN_DEV_ARP_ACCEPT(in_dev)) {
		unsigned int addr_type = inet_addr_type_dev_table(net, dev, sip);

		/* Unsolicited ARP is not accepted by default.
		   It is possible, that this option should be enabled for some
		   devices (strip is candidate)
		 */
		is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip &&
			  addr_type == RTN_UNICAST; // Gratuitous ARP

		/* No entry exists (we did not solicit this sender), but
		 * arp_accept is enabled: create one for a unicast reply or a
		 * gratuitous ARP (last argument 1: per the article, create).
		 */
		if (!n &&
		    ((arp->ar_op == htons(ARPOP_REPLY)  &&
				addr_type == RTN_UNICAST) || is_garp))
			n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
	}

	if (n) { // we have an entry for the sender
		int state = NUD_REACHABLE; // tentative new state
		int override;

		/* If several different ARP replies follows back-to-back,
		   use the FIRST one. It is possible, if several proxy
		   agents are active. Taking the first reply prevents
		   arp trashing and chooses the fastest router.
		 */
		/* Allow overriding the cached address only if the entry has
		 * not been updated within LOCKTIME, or for gratuitous ARP.
		 */
		override = time_after(jiffies,
				      n->updated +
				      NEIGH_VAR(n->parms, LOCKTIME)) ||
			   is_garp;

		/* Broadcast replies and request packets
		   do not assert neighbour reachability.
		 */
		/* Anything other than a reply addressed to this host only
		 * yields STALE.
		 */
		if (arp->ar_op != htons(ARPOP_REPLY) ||
		    skb->pkt_type != PACKET_HOST)
			state = NUD_STALE;
		neigh_update(n, sha, state,
			     override ? NEIGH_UPDATE_F_OVERRIDE : 0); // apply the new state / address
		neigh_release(n);
	}

out:
	consume_skb(skb);
out_free_dst:
	dst_release(reply_dst);
	return 0;
}

 4.2、邻居项更新(neigh_update)

        收到ARP报文,arp_process调用neigh_update更新邻居项的状态,如果更新为连接状态,那么更新最后一次确认邻居可达的时间confirmed,更新邻居项的更新时间,如果新收到的地址不一样,需要判断是否更新邻居的缓存的地址,状态更新之后,新的状态如果要启动超时定时器,那么启动超时定时器,新状态为连接状态时,更新邻居输出函数指针(ARP协议,更新前后都是neigh_resolve_output),如果邻居项状态从不可用状态变更为可用状态,那么把ARP发送缓存队列的数据都发送出去。

neigh_update实现代码如下:

/*
 * neigh_update - apply a new state and/or link-layer address to an entry.
 * @neigh:  neighbour entry to update
 * @lladdr: proposed link-layer address, or NULL to keep the cached one
 * @new:    requested NUD state
 * @flags:  NEIGH_UPDATE_F_* flags (ADMIN, OVERRIDE, WEAK_OVERRIDE,
 *          OVERRIDE_ISROUTER, ISROUTER are tested here)
 *
 * Returns 0 on success, -EPERM when the entry is NOARP/PERMANENT and the
 * caller is not ADMIN or the entry is dead, and -EINVAL when no address is
 * supplied and none is cached. When a previously invalid entry becomes
 * valid, the packets queued on arp_queue are transmitted.
 */
int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
		 u32 flags)
{
	u8 old;
	int err;
	int notify = 0;
	struct net_device *dev;
	int update_isrouter = 0;

	write_lock_bh(&neigh->lock);

	dev    = neigh->dev; // output device of the neighbour
	old    = neigh->nud_state; // state before this update
	err    = -EPERM;

	if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
	    (old & (NUD_NOARP | NUD_PERMANENT)))
		goto out;
	if (neigh->dead)
		goto out;

	if (!(new & NUD_VALID)) { // requested state is not one of the VALID states
		neigh_del_timer(neigh);
		if (old & NUD_CONNECTED)
			neigh_suspect(neigh);
		neigh->nud_state = new;
		err = 0;
		notify = old & NUD_VALID;
		if ((old & (NUD_INCOMPLETE | NUD_PROBE)) &&
		    (new & NUD_FAILED)) {
			neigh_invalidate(neigh);
			notify = 1;
		}
		goto out;
	}

	/* Compare new lladdr with cached one */
	if (!dev->addr_len) {
		/* First case: device needs no address. */
		lladdr = neigh->ha;
	} else if (lladdr) {
		/* The second case: if something is already cached
		   and a new address is proposed:
		   - compare new & old
		   - if they are different, check override flag
		 */
		if ((old & NUD_VALID) &&
		    !memcmp(lladdr, neigh->ha, dev->addr_len))
			lladdr = neigh->ha;
	} else {
		/* No address is supplied; if we know something,
		   use it, otherwise discard the request.
		 */
		err = -EINVAL;
		if (!(old & NUD_VALID))
			goto out;
		lladdr = neigh->ha;
	}

	/* From here on, lladdr == neigh->ha means "address unchanged". */

	if (new & NUD_CONNECTED) // new state is a connected one
		neigh->confirmed = jiffies; // record when reachability was last confirmed
	neigh->updated = jiffies; // record the time of this update

	/* If entry was valid and address is not changed,
	   do not change entry state, if new one is STALE.
	 */
	err = 0;
	update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER;
	/* The old state was VALID, so an address is already cached. The
	 * address in the new packet is not necessarily the same as the cached
	 * one, and we cannot tell a genuine address change from a bogus
	 * packet.
	 */
	if (old & NUD_VALID) {
		/* Address differs and the caller did not force an override:
		 * decide whether the cached address may be replaced.
		 */
		if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) {
			update_isrouter = 0;
			/* Weak override on a CONNECTED entry: keep the cached
			 * address but demote the entry to STALE. If the entry
			 * is then used it goes to DELAY and on to PROBE, which
			 * re-verifies the neighbour.
			 */
			if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) &&
			    (old & NUD_CONNECTED)) {
				lladdr = neigh->ha; // keep using the cached address
				new = NUD_STALE; // demote to STALE
			} else
				goto out;
		} else {
			/* Same address, requested state STALE, and either weak
			 * override or the entry was CONNECTED: keep the old
			 * state.
			 */
			if (lladdr == neigh->ha && new == NUD_STALE &&
			    ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) ||
			     (old & NUD_CONNECTED))
			    )
				new = old;
		}
	}

	if (new != old) { // the state actually changes
		neigh_del_timer(neigh); // cancel the old state's timer
		if (new & NUD_PROBE) // entering PROBE
			atomic_set(&neigh->probes, 0); // restart the probe counter
		/* States in NUD_IN_TIMER have a timeout: REACHABLE expires
		 * after reachable_time, the others are armed for "now".
		 */
		if (new & NUD_IN_TIMER)
			neigh_add_timer(neigh, (jiffies +
						((new & NUD_REACHABLE) ?
						 neigh->parms->reachable_time :
						 0)));
		neigh->nud_state = new; // commit the new state
		notify = 1;
	}

	/* Address changed (where a differing address must not be applied,
	 * lladdr was already replaced by neigh->ha above).
	 */
	if (lladdr != neigh->ha) {
		write_seqlock(&neigh->ha_lock);
		memcpy(&neigh->ha, lladdr, dev->addr_len); // store the new link-layer address
		write_sequnlock(&neigh->ha_lock);
		neigh_update_hhs(neigh);
		/* Not connected: back-date confirmed by twice
		 * BASE_REACHABLE_TIME so the timer handler does not mistake
		 * the entry for a recently confirmed one.
		 */
		if (!(new & NUD_CONNECTED))
			neigh->confirmed = jiffies -
				      (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1);
		notify = 1;
	}
	if (new == old) // state unchanged: done
		goto out;
	if (new & NUD_CONNECTED) // entry is now connected
		neigh_connect(neigh); // switch the output hook (per the article: neigh_resolve_output either way for ARP)
	else
		neigh_suspect(neigh);
	if (!(old & NUD_VALID)) { // entry was not valid before: packets may be waiting in arp_queue
		struct sk_buff *skb;

		/* Again: avoid dead loop if something went wrong */

		/* Drain the queue while the entry stays VALID; the lock is
		 * dropped around each transmit and re-taken afterwards.
		 */
		while (neigh->nud_state & NUD_VALID &&
		       (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
			struct dst_entry *dst = skb_dst(skb);
			struct neighbour *n2, *n1 = neigh;
			write_unlock_bh(&neigh->lock);

			rcu_read_lock();

			/* Why not just use 'neigh' as-is?  The problem is that
			 * things such as shaper, eql, and sch_teql can end up
			 * using alternative, different, neigh objects to output
			 * the packet in the output path.  So what we need to do
			 * here is re-lookup the top-level neigh in the path so
			 * we can reinject the packet there.
			 */
			n2 = NULL;
			if (dst) {
				n2 = dst_neigh_lookup_skb(dst, skb);
				if (n2)
					n1 = n2;
			}
			/* Send through the output hook (per the article still
			 * neigh_resolve_output for ARP, which decides by state
			 * whether to probe or to transmit).
			 */
			n1->output(n1, skb);
			if (n2)
				neigh_release(n2);
			rcu_read_unlock();

			write_lock_bh(&neigh->lock);
		}
		__skb_queue_purge(&neigh->arp_queue); // free whatever is still queued
		neigh->arp_queue_len_bytes = 0; // queue is empty now
	}
out:
	if (update_isrouter) {
		neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ?
			(neigh->flags | NTF_ROUTER) :
			(neigh->flags & ~NTF_ROUTER);
	}
	write_unlock_bh(&neigh->lock);

	if (notify)
		neigh_update_notify(neigh);

	return err;
}

        邻居可用时,neigh_update调用neigh_resolve_output发送ARP缓存队列数据,neigh_resolve_output调用neigh_event_send,非连接等状态调用发送ARP请求的函数,连接状态则直接返回0,也就是可以直接发送数据报文,neigh_resolve_output就调用dev_queue_xmit发送数据报文到网卡里面。neigh_resolve_output实现代码如下:

/*
 * neigh_resolve_output - output hook for neighbours needing resolution.
 *
 * Asks neigh_event_send() whether the packet may go out now. A non-zero
 * answer means the packet was taken over there (queued or dropped) and 0
 * is returned. Otherwise the link-layer header is built from the cached
 * address and the packet is handed to dev_queue_xmit(), whose result is
 * returned. If the header cannot be built the packet is freed and -EINVAL
 * is returned.
 */
int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
{
	struct net_device *dev;
	unsigned int seq;
	int err;

	/* Non-zero: neigh_event_send() kept the skb; nothing to transmit. */
	if (neigh_event_send(neigh, skb))
		return 0;

	dev = neigh->dev;

	if (dev->header_ops->cache && !neigh->hh.hh_len)
		neigh_hh_init(neigh);

	/* Rebuild the header until neigh->ha was stable across the build. */
	do {
		__skb_pull(skb, skb_network_offset(skb));
		seq = read_seqbegin(&neigh->ha_lock);
		err = dev_hard_header(skb, dev, ntohs(skb->protocol),
				      neigh->ha, NULL, skb->len);
	} while (read_seqretry(&neigh->ha_lock, seq));

	if (err < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	/* Hand the fully-formed frame to the device queue. */
	return dev_queue_xmit(skb);
}

4.3、ARP报文接收调用栈

linux网络协议栈源码分析 - 邻居子系统邻居状态转移_第3张图片

        网卡收到数据后产生中断,内核处理网卡中断,smsc911x_poll调用网卡驱动接收网卡数据,调用netif_receive_skb处理收到的报文,__netif_receive_skb_core解析报文的类型,一级一级调用,最后调用ARP协议的处理函数arp_process,更新状态之后,调用neigh_resolve_output、dev_queue_xmit、__dev_queue_xmit发送ARP缓存队列中的数据。

4.4、REACHABLE状态更新

        REACHABLE状态会启动一个定时器,在定时器超时时间内,都认为邻居是可达的;超过一定时间没有确认邻居是否可达的情况下,邻居项缓存的数据就不再可靠了。arp_process收到ARP报文并更新邻居项状态时,对于新状态为REACHABLE的邻居项都会更新最后一次确认时间confirmed,那么从confirmed时间开始的一段时间内,邻居也被认为是可达的。因此REACHABLE定时器超时之后,并不能直接把邻居项更新为过期状态,还得检查启动定时器之后是否更新过确认时间;REACHABLE超时实现代码如下:

/*
 * neigh_timer_handler - per-neighbour timer callback (REACHABLE timeout
 * path annotated).
 * @arg: the struct neighbour pointer, cast to unsigned long
 *
 * For a REACHABLE entry the handler keeps it REACHABLE while the last
 * confirmation is recent enough, moves it to DELAY when it was used
 * recently, and to STALE otherwise.
 */
static void neigh_timer_handler(unsigned long arg)
{
	unsigned long now, next;
	struct neighbour *neigh = (struct neighbour *)arg;
	unsigned int state;
	int notify = 0;

	write_lock(&neigh->lock);

	state = neigh->nud_state;
	now = jiffies;
	next = now + HZ;

	if (!(state & NUD_IN_TIMER))
		goto out;

	if (state & NUD_REACHABLE) {
		/* confirmed is not the time the timer was started: it moves
		 * forward whenever reachability is confirmed again. The
		 * neighbour counts as reachable until
		 * confirmed + reachable_time, so if now is not past that
		 * point the entry is still alive.
		 */
		if (time_before_eq(now,
				   neigh->confirmed + neigh->parms->reachable_time)) {
			neigh_dbg(2, "neigh %p is still alive\n", neigh);
			/* Re-arm for the real expiry. Example: with a 2 h
			 * timeout, confirmations at 9:00 and 10:00 and a timer
			 * started at 9:00, the timer fires at 11:00 but the
			 * entry is good until 12:00, so arm a new timer for
			 * 12:00.
			 */
			next = neigh->confirmed + neigh->parms->reachable_time;
		} else if (time_before_eq(now,
					  neigh->used +
					  NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
			/* Reachability expired, but the entry was used within
			 * DELAY_PROBE_TIME. It may be used again and a recent
			 * packet may still produce a confirmation, so neither
			 * probe immediately nor discard it: go to DELAY.
			 */
			neigh_dbg(2, "neigh %p is delayed\n", neigh);
			/* In DELAY (as in PROBE) neigh_event_send() returns 0,
			 * so neigh_resolve_output keeps transmitting with the
			 * cached address; if the address turns out to be wrong
			 * the upper layer retransmits after the new one is
			 * learnt.
			 */
			neigh->nud_state = NUD_DELAY;
			neigh->updated = jiffies; // record the time of the state change
			neigh_suspect(neigh);
			next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME); // DELAY timeout; PROBE follows if still unconfirmed
		} else {
			/* Expired and not used recently: move to STALE. A later
			 * send on a STALE entry moves it to DELAY in
			 * __neigh_event_send() and the packet is still
			 * transmitted with the cached address.
			 */
			neigh_dbg(2, "neigh %p is suspected\n", neigh);
			neigh->nud_state = NUD_STALE; // reachability is no longer confirmed
			neigh->updated = jiffies;
			neigh_suspect(neigh);
			notify = 1;
		}
	} else if (state & NUD_DELAY) {
		if (time_before_eq(now,
				   neigh->confirmed +
				   NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
			neigh_dbg(2, "neigh %p is now reachable\n", neigh);
			neigh->nud_state = NUD_REACHABLE;
			neigh->updated = jiffies;
			neigh_connect(neigh);
			notify = 1;
			next = neigh->confirmed + neigh->parms->reachable_time;
		} else {
			neigh_dbg(2, "neigh %p is probed\n", neigh);
			neigh->nud_state = NUD_PROBE;
			neigh->updated = jiffies;
			atomic_set(&neigh->probes, 0);
			notify = 1;
			next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
		}
	} else {
		/* NUD_PROBE|NUD_INCOMPLETE */
		next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
	}

	if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
	    atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) {
		neigh->nud_state = NUD_FAILED;
		notify = 1;
		neigh_invalidate(neigh);
		goto out;
	}

	if (neigh->nud_state & NUD_IN_TIMER) { // the (possibly new) state needs a timer
		if (time_before(next, jiffies + HZ/2))
			next = jiffies + HZ/2;
		if (!mod_timer(&neigh->timer, next)) // (re)arm the timeout
			neigh_hold(neigh);
	}
	/* neigh_probe() is entered with neigh->lock held and is expected to
	 * release it; the else branch is the only explicit unlock.
	 */
	if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
		neigh_probe(neigh);
	} else {
out:
		write_unlock(&neigh->lock);
	}

	if (notify)
		neigh_update_notify(neigh);

	neigh_release(neigh);
}

5、DELAY状态转换

        REACHABLE超时并且最近有使用邻居项的情况下,邻居项会转换为DELAY状态,DELAY时间内不会发送探测邻居状态报文,这段时间内可能有应答报文等可以确认邻居可达,更新confirmed时间;DELAY状态超时处理比较简单,neigh_timer_handler检查当前时间是否小于等于最后一次确认邻居可达时间+DELAY_PROBE_TIME,如果小于等于,也就是最近不久前就确认过邻居可达了,那么直接转换为REACHABLE状态,如果大于,那么就转换为PROBE状态,主动去探测邻居的状态,更新邻居项的相关时间;DELAY状态超时回调处理函数实现代码如下:

/*
 * Timer callback for a neighbour entry.
 *
 * @arg: the struct neighbour pointer, passed as an unsigned long.
 *
 * Runs the time-driven part of the NUD state machine under neigh->lock:
 *   REACHABLE        -> stays REACHABLE, or moves to DELAY or STALE
 *   DELAY            -> REACHABLE or PROBE
 *   PROBE/INCOMPLETE -> FAILED once the probe limit is reached,
 *                       otherwise another probe is sent
 * While the resulting state is in NUD_IN_TIMER the timer is re-armed.
 * The function ends by calling neigh_release() on the entry.
 */
static void neigh_timer_handler(unsigned long arg)
{
	unsigned long now, next;
	struct neighbour *neigh = (struct neighbour *)arg;
	unsigned int state;
	int notify = 0; /* set to 1 when neigh_update_notify() must be called */

	write_lock(&neigh->lock);

	/* Snapshot the state and time; default next expiry is one second away. */
	state = neigh->nud_state;
	now = jiffies;
	next = now + HZ;

	/* Entry is no longer in a timer-driven state: just unlock and release. */
	if (!(state & NUD_IN_TIMER))
		goto out;

	if (state & NUD_REACHABLE) {
		if (time_before_eq(now,
				   neigh->confirmed + neigh->parms->reachable_time)) {
			/* Confirmed recently enough: stay REACHABLE until that deadline. */
			neigh_dbg(2, "neigh %p is still alive\n", neigh);
			next = neigh->confirmed + neigh->parms->reachable_time;
		} else if (time_before_eq(now,
					  neigh->used +
					  NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
			/*
			 * Confirmation expired but the entry was used within
			 * DELAY_PROBE_TIME: move to DELAY. Note that notify is
			 * not set on this transition.
			 */
			neigh_dbg(2, "neigh %p is delayed\n", neigh);
			neigh->nud_state = NUD_DELAY;
			neigh->updated = jiffies;
			neigh_suspect(neigh);
			next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME);
		} else {
			/* Confirmation expired and the entry was not used recently: STALE. */
			neigh_dbg(2, "neigh %p is suspected\n", neigh);
			neigh->nud_state = NUD_STALE;
			neigh->updated = jiffies;
			neigh_suspect(neigh);
			notify = 1;
		}
	} else if (state & NUD_DELAY) { // DELAY state timed out
		if (time_before_eq(now,
				   neigh->confirmed +
				   NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { // now <= last confirmation + DELAY_PROBE_TIME: reachability was confirmed recently, so treat the neighbour as reachable
			neigh_dbg(2, "neigh %p is now reachable\n", neigh);
			neigh->nud_state = NUD_REACHABLE; // back to REACHABLE
			neigh->updated = jiffies;
			neigh_connect(neigh); // NOTE(review): the article states this has no practical effect for ARP because the output function picks ARP request vs. data by state - confirm
			notify = 1;
			next = neigh->confirmed + neigh->parms->reachable_time; // expiry of the REACHABLE state
		} else { // no confirmation for a while: start probing actively
			neigh_dbg(2, "neigh %p is probed\n", neigh);
			neigh->nud_state = NUD_PROBE; // move to PROBE
			neigh->updated = jiffies;
			atomic_set(&neigh->probes, 0); // restart the probe counter
			notify = 1;
			next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME); // PROBE expiry, i.e. when the next probe is retransmitted
		}
	} else {
		/* NUD_PROBE|NUD_INCOMPLETE */
		next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
	}

	/* Probe budget exhausted while resolving/probing: mark the entry FAILED. */
	if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
	    atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) {
		neigh->nud_state = NUD_FAILED;
		notify = 1;
		neigh_invalidate(neigh);
		goto out;
	}

	if (neigh->nud_state & NUD_IN_TIMER) {
		/* Never re-arm sooner than half a second from now. */
		if (time_before(next, jiffies + HZ/2))
			next = jiffies + HZ/2;
		/*
		 * Take a reference when mod_timer() returns 0.
		 * NOTE(review): presumably 0 means the timer was not pending, so
		 * the new reference is the one owned by the armed timer - confirm.
		 */
		if (!mod_timer(&neigh->timer, next))
			neigh_hold(neigh);
	}
	if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
		/*
		 * This branch does not unlock here.
		 * NOTE(review): presumably neigh_probe() drops neigh->lock itself - confirm.
		 */
		neigh_probe(neigh);
	} else {
		/* "goto out" lands inside this else branch so both paths share the unlock. */
out:
		write_unlock(&neigh->lock);
	}

	if (notify)
		neigh_update_notify(neigh);

	/* NOTE(review): presumably drops the reference held for the timer that just fired - confirm. */
	neigh_release(neigh);
}

6、PROBE状态转换

        DELAY状态超时并且最近没有确认邻居可达时,邻居项会转换为PROBE状态,同时发送邻居探测报文;PROBE状态只可能转换为FAILED或者REACHABLE状态:如前面所述,arp_process收到ARP请求的应答时,会将邻居项状态转换为REACHABLE状态;PROBE状态的定时器超时用于重发ARP请求,探测次数达到上限(neigh_max_probes)时则转换为FAILED状态;PROBE定时器超时处理与INCOMPLETE状态走同一段代码,实现代码如下:


/*
 * Called when a timer expires for a neighbour entry.
 *
 * @arg: the struct neighbour pointer, passed as an unsigned long.
 *
 * Under neigh->lock, advances the entry's NUD state according to its
 * timestamps (confirmed, used) and probe counter, re-arms the timer while
 * the resulting state is in NUD_IN_TIMER, sends a probe in the
 * INCOMPLETE/PROBE states, calls neigh_update_notify() when notify was set,
 * and finally calls neigh_release() on the entry.
 */

static void neigh_timer_handler(unsigned long arg)
{
	unsigned long now, next;
	struct neighbour *neigh = (struct neighbour *)arg;
	unsigned int state;
	int notify = 0; /* set to 1 when neigh_update_notify() must be called */

	write_lock(&neigh->lock);

	/* Snapshot the state and time; default next expiry is one second away. */
	state = neigh->nud_state;
	now = jiffies;
	next = now + HZ;

	/* Entry is no longer in a timer-driven state: just unlock and release. */
	if (!(state & NUD_IN_TIMER))
		goto out;

	if (state & NUD_REACHABLE) {
		if (time_before_eq(now,
				   neigh->confirmed + neigh->parms->reachable_time)) {
			/* Confirmed recently enough: stay REACHABLE until that deadline. */
			neigh_dbg(2, "neigh %p is still alive\n", neigh);
			next = neigh->confirmed + neigh->parms->reachable_time;
		} else if (time_before_eq(now,
					  neigh->used +
					  NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
			/* Used within DELAY_PROBE_TIME: move to DELAY (notify is not set here). */
			neigh_dbg(2, "neigh %p is delayed\n", neigh);
			neigh->nud_state = NUD_DELAY;
			neigh->updated = jiffies;
			neigh_suspect(neigh);
			next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME);
		} else {
			/* Neither confirmed nor used recently: move to STALE. */
			neigh_dbg(2, "neigh %p is suspected\n", neigh);
			neigh->nud_state = NUD_STALE;
			neigh->updated = jiffies;
			neigh_suspect(neigh);
			notify = 1;
		}
	} else if (state & NUD_DELAY) {
		if (time_before_eq(now,
				   neigh->confirmed +
				   NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
			/* Confirmed within DELAY_PROBE_TIME: back to REACHABLE. */
			neigh_dbg(2, "neigh %p is now reachable\n", neigh);
			neigh->nud_state = NUD_REACHABLE;
			neigh->updated = jiffies;
			neigh_connect(neigh);
			notify = 1;
			next = neigh->confirmed + neigh->parms->reachable_time;
		} else {
			/* No recent confirmation: enter PROBE with a fresh probe counter. */
			neigh_dbg(2, "neigh %p is probed\n", neigh);
			neigh->nud_state = NUD_PROBE;
			neigh->updated = jiffies;
			atomic_set(&neigh->probes, 0);
			notify = 1;
			next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
		}
	} else { // timer expired in the PROBE or INCOMPLETE state
		/* NUD_PROBE|NUD_INCOMPLETE */
		next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME); // time of the next retransmission
	}

	if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
	    atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { // PROBE/INCOMPLETE: once the probe count reaches the maximum, move to FAILED
		neigh->nud_state = NUD_FAILED;
		notify = 1;
		neigh_invalidate(neigh);
		goto out;
	}

	if (neigh->nud_state & NUD_IN_TIMER) {
		/* Never re-arm sooner than half a second from now. */
		if (time_before(next, jiffies + HZ/2))
			next = jiffies + HZ/2;
		if (!mod_timer(&neigh->timer, next)) // arm the next expiry; NOTE(review): a 0 return presumably means the timer was not pending, hence the extra reference - confirm
			neigh_hold(neigh);
	}
	if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) { // PROBE or INCOMPLETE state
		neigh_probe(neigh); // send a neighbour probe; NOTE(review): no unlock on this path, so neigh_probe() presumably drops neigh->lock - confirm
	} else {
		/* "goto out" lands inside this else branch so both paths share the unlock. */
out:
		write_unlock(&neigh->lock);
	}

	if (notify)
		neigh_update_notify(neigh);

	/* NOTE(review): presumably drops the reference held for the timer that just fired - confirm. */
	neigh_release(neigh);
}

7、参考文献

《Linux内核源码剖析:TCP/IP实现(上册)》机械工业出版社

《深入理解LINUX网络技术内幕》中国电力出版社

你可能感兴趣的:(linux,ARP,邻居子系统,linux)