邻居子系统实现了IP层发包不感知MAC,即由邻居子系统实现了MAC头封装。MAC头信息包括:源MAC、目的MAC、协议类型,其中协议类型由上层指定,例如IPV4等等,源MAC地址是出口设备MAC地址(在路由表中确定出口设备),目的MAC是由邻居子系统提供的,大致可以猜到,邻居子系统会主动发起arp请求获取到mac地址,实现MAC封包。IP层发包最后会调用ip_finish_output2函数,我们从该函数入手分析邻居子系统。
ip_finish_output2函数
static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = (struct rtable *)dst; struct net_device *dev = dst->dev; //出口设备 unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; u32 nexthop; if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { struct sk_buff *skb2; skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); if (!skb2) { kfree_skb(skb); return -ENOMEM; } if (skb->sk) skb_set_owner_w(skb2, skb->sk); consume_skb(skb); skb = skb2; } rcu_read_lock_bh(); nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);<span style="white-space:pre"> </span>//目的IP地址 neigh = __ipv4_neigh_lookup_noref(dev, nexthop); //根据目的IP查找邻居项是否存在 if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); //如果不存在,则创建neigh项 if (!IS_ERR(neigh)) { int res = dst_neigh_output(dst, neigh, skb); //调用邻居子系统封装MAC头,并且调用二层发包函数完成报文发送 rcu_read_unlock_bh(); return res; } rcu_read_unlock_bh(); net_dbg_ratelimited("%s: No header cache and no neighbour!\n", __func__); kfree_skb(skb); return -EINVAL; }首先会根据出口设备和目的IP地址,查找是否已经存在邻居项,如果没有则创建邻居项,然后通过dst_neigh_output发包,本文分析假设没有邻居项。 先邻居项的查找函数:
__ipv4_neigh_lookup_noref函数
static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev); //ipv4从arp_tbl中查找 }___neigh_lookup_noref函数
static inline struct neighbour *___neigh_lookup_noref( struct neigh_table *tbl, bool (*key_eq)(const struct neighbour *n, const void *pkey), __u32 (*hash)(const void *pkey, const struct net_device *dev, __u32 *hash_rnd), const void *pkey, struct net_device *dev) { struct neigh_hash_table *nht = rcu_dereference_bh(tbl->nht); //hash表,邻居数量大时加速 struct neighbour *n; u32 hash_val; hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); //计算hash值 for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); n != NULL; n = rcu_dereference_bh(n->next)) { if (n->dev == dev && key_eq(n, pkey)) //dev相同并且pkey相同,这里pkey是IPV4地址 return n; } return NULL; }邻居表项查找比较简单,就是在hash表中查找匹配设备和目的IP地址的邻居表项,该函数支持IPV6, 可扩展性通过参数实现,接下来看下创建邻居表项的实现:
__neigh_create函数
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref) { u32 hash_val; int key_len = tbl->key_len; int error; struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);<span style="white-space:pre"> </span>//创建邻居表项对象 struct neigh_hash_table *nht; if (!n) { rc = ERR_PTR(-ENOBUFS); goto out; } memcpy(n->primary_key, pkey, key_len); n->dev = dev; dev_hold(dev); /* Protocol specific setup. */ if (tbl->constructor && (error = tbl->constructor(n)) < 0) { //IPV4实际调用arp_constructor函数,设置output函数 rc = ERR_PTR(error); goto out_neigh_release; } if (dev->netdev_ops->ndo_neigh_construct) { //一般设备不设置该变量 error = dev->netdev_ops->ndo_neigh_construct(n); if (error < 0) { rc = ERR_PTR(error); goto out_neigh_release; } } /* Device specific setup. */ if (n->parms->neigh_setup && (error = n->parms->neigh_setup(n)) < 0) { //IPV4未定义该函数 rc = ERR_PTR(error); goto out_neigh_release; } n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1); write_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); if (atomic_read(&tbl->entries) > (1 << nht->hash_shift)) nht = neigh_hash_grow(tbl, nht->hash_shift + 1); hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); //计算hash值,计算方式由邻居表定义 if (n->parms->dead) { rc = ERR_PTR(-EINVAL); goto out_tbl_unlock; } for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val], //找到有相同hash值得neighbour链表 lockdep_is_held(&tbl->lock)); n1 != NULL; n1 = rcu_dereference_protected(n1->next, lockdep_is_held(&tbl->lock))) { if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { if (want_ref) neigh_hold(n1); rc = n1; goto out_tbl_unlock; } } n->dead = 0; if (want_ref) neigh_hold(n); rcu_assign_pointer(n->next, rcu_dereference_protected(nht->hash_buckets[hash_val], lockdep_is_held(&tbl->lock))); //插入到链表中 rcu_assign_pointer(nht->hash_buckets[hash_val], n); write_unlock_bh(&tbl->lock); neigh_dbg(2, "neigh %p is created\n", n); rc = n; out: return rc; out_tbl_unlock: write_unlock_bh(&tbl->lock); out_neigh_release: neigh_release(n); goto out; }neigh_alloc函数
static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev) { struct neighbour *n = NULL; unsigned long now = jiffies; int entries; entries = atomic_inc_return(&tbl->entries) - 1; if (entries >= tbl->gc_thresh3 || (entries >= tbl->gc_thresh2 && time_after(now, tbl->last_flush + 5 * HZ))) { if (!neigh_forced_gc(tbl) && entries >= tbl->gc_thresh3) goto out_entries; } n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); if (!n) goto out_entries; __skb_queue_head_init(&n->arp_queue); //初始化arp_queue队列 rwlock_init(&n->lock); seqlock_init(&n->ha_lock); n->updated = n->used = now; n->nud_state = NUD_NONE; //状态为不可用 n->output = neigh_blackhole; //直接丢弃报文 seqlock_init(&n->hh.hh_lock); n->parms = neigh_parms_clone(&tbl->parms); //拷贝neigh_table中的parms setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n); //注册定时器 NEIGH_CACHE_STAT_INC(tbl, allocs); n->tbl = tbl; atomic_set(&n->refcnt, 1); n->dead = 1; out: return n; out_entries: atomic_dec(&tbl->entries); goto out; }arp_constructor函数
static int arp_constructor(struct neighbour *neigh) { __be32 addr = *(__be32 *)neigh->primary_key; struct net_device *dev = neigh->dev; struct in_device *in_dev; struct neigh_parms *parms; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); //通过net_device得到in_device if (!in_dev) { rcu_read_unlock(); return -EINVAL; } neigh->type = inet_addr_type(dev_net(dev), addr); //设置地址类型 parms = in_dev->arp_parms; __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); rcu_read_unlock(); if (!dev->header_ops) { //基本上的网卡都会设置该值 neigh->nud_state = NUD_NOARP; neigh->ops = &arp_direct_ops; neigh->output = neigh_direct_output; } else { /* Good devices (checked by reading texts, but only Ethernet is tested) ARPHRD_ETHER: (ethernet, apfddi) ARPHRD_FDDI: (fddi) ARPHRD_IEEE802: (tr) ARPHRD_METRICOM: (strip) ARPHRD_ARCNET: etc. etc. etc. ARPHRD_IPDDP will also work, if author repairs it. I did not it, because this driver does not work even in old paradigm. */ if (neigh->type == RTN_MULTICAST) { //组播地址不需要arp neigh->nud_state = NUD_NOARP; arp_mc_map(addr, neigh->ha, dev, 1); } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) { //设备明确不需要arp或本地回环设备,不需要arp neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->dev_addr, dev->addr_len); } else if (neigh->type == RTN_BROADCAST || (dev->flags & IFF_POINTOPOINT)) { //广播或点对点,也不需要arp neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->broadcast, dev->addr_len); } if (dev->header_ops->cache) //eth_header_ops包含cache neigh->ops = &arp_hh_ops; else neigh->ops = &arp_generic_ops; if (neigh->nud_state & NUD_VALID) neigh->output = neigh->ops->connected_output; else neigh->output = neigh->ops->output; //初始阶段为该值,即arp_hh_ops的neigh_resolve_output函数 } return 0; }邻居表项创建后,output函数为neigh_resolve_output,此时邻居子系统还不具备发送IP报文的能力,因为目的MAC地址还未获取,我们来看下dst_neigh_output函数实现:
dst_neigh_output函数
static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n, struct sk_buff *skb) { const struct hh_cache *hh; if (dst->pending_confirm) { unsigned long now = jiffies; dst->pending_confirm = 0; /* avoid dirtying neighbour */ if (n->confirmed != now) n->confirmed = now; } hh = &n->hh; if ((n->nud_state & NUD_CONNECTED) && hh->hh_len) //如果neighbour已连接且hh已设置 return neigh_hh_output(hh, skb); else return n->output(n, skb); //初始阶段调用此函数,此时为neigh_resolve_output函数 }neigh_resolve_output函数
int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) { int rc = 0; if (!neigh_event_send(neigh, skb)) { //发送arp请求,第一次返回true int err; struct net_device *dev = neigh->dev; unsigned int seq; if (dev->header_ops->cache && !neigh->hh.hh_len) neigh_hh_init(neigh); //初始化MAC缓存值,目的是加速 do { __skb_pull(skb, skb_network_offset(skb)); //常见情况,skb指向network header seq = read_seqbegin(&neigh->ha_lock); err = dev_hard_header(skb, dev, ntohs(skb->protocol), //封装MAC头 neigh->ha, NULL, skb->len); } while (read_seqretry(&neigh->ha_lock, seq)); if (err >= 0) rc = dev_queue_xmit(skb); //二层发送报文 else goto out_kfree_skb; } out: return rc; out_kfree_skb: rc = -EINVAL; kfree_skb(skb); goto out; }neigh_event_send函数
static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) { unsigned long now = jiffies; if (neigh->used != now) neigh->used = now; if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE))) return __neigh_event_send(neigh, skb); //发送arp请求 return 0; }__neigh_event_send函数
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) { int rc; bool immediate_probe = false; write_lock_bh(&neigh->lock); rc = 0; if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)) goto out_unlock_bh; if (neigh->dead) goto out_dead; if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { //初始阶段进入此分支 if (NEIGH_VAR(neigh->parms, MCAST_PROBES) + NEIGH_VAR(neigh->parms, APP_PROBES)) { unsigned long next, now = jiffies; atomic_set(&neigh->probes, NEIGH_VAR(neigh->parms, UCAST_PROBES)); neigh->nud_state = NUD_INCOMPLETE; //设置表项状态为incomplete neigh->updated = now; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/2); neigh_add_timer(neigh, next); //触发定时器,期望刷新表项状态和output函数,500毫秒后执行 immediate_probe = true; } else { neigh->nud_state = NUD_FAILED; neigh->updated = jiffies; write_unlock_bh(&neigh->lock); kfree_skb(skb); return 1; } } else if (neigh->nud_state & NUD_STALE) { neigh_dbg(2, "neigh %p is delayed\n", neigh); neigh->nud_state = NUD_DELAY; neigh->updated = jiffies; neigh_add_timer(neigh, jiffies + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME)); } if (neigh->nud_state == NUD_INCOMPLETE) { if (skb) { while (neigh->arp_queue_len_bytes + skb->truesize > NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) { //如果等待发送的报文数量超过设定值,丢弃报文 struct sk_buff *buff; buff = __skb_dequeue(&neigh->arp_queue); if (!buff) break; neigh->arp_queue_len_bytes -= buff->truesize; kfree_skb(buff); NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); } skb_dst_force(skb); __skb_queue_tail(&neigh->arp_queue, skb); //报文放入arp_queue队列中 neigh->arp_queue_len_bytes += skb->truesize; } rc = 1; } out_unlock_bh: if (immediate_probe) //初始阶段,邻居项设置状态设置为incomplete,同时设置该变量为true neigh_probe(neigh); //探测邻居表项 else write_unlock(&neigh->lock); local_bh_enable(); return rc; out_dead: if (neigh->nud_state & NUD_STALE) goto out_unlock_bh; write_unlock_bh(&neigh->lock); kfree_skb(skb); return 1; }neigh_probe函数
static void neigh_probe(struct neighbour *neigh) __releases(neigh->lock) { struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue); //取出报文 /* keep skb alive even if arp_queue overflows */ if (skb) skb = skb_copy(skb, GFP_ATOMIC); //拷贝skb write_unlock(&neigh->lock); neigh->ops->solicit(neigh, skb); //实际调用arp_solicit函数,该函数会发送arp请求 atomic_inc(&neigh->probes); kfree_skb(skb); }从上述函数可以看到,报文并没有被发送出去,做了3个事情:1)发送了arp请求, 2)缓存了报文,3)启动定时器500毫秒后执行。 报文被丢弃了? 没有,其实报文是在neigh_update函数中被发送的,该函数的一个调用者是arp处理函数。 调用neigh_update函数后,neigh的output函数被改变,在这个之前,ouput函数仍然是neigh_resolve_output,如果是同一个目的IP,不会再次发送arp请求,仅仅把报文缓存起来,下面我们来看下neigh_update函数:
neigh_update函数
/* Generic update routine. -- lladdr is new lladdr or NULL, if it is not supplied. -- new is new state. -- flags NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr, if it is different. NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected" lladdr instead of overriding it if it is different. It also allows to retain current state if lladdr is unchanged. NEIGH_UPDATE_F_ADMIN means that the change is administrative. NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing NTF_ROUTER flag. NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as a router. Caller MUST hold reference count on the entry. */ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags) { u8 old; int err; int notify = 0; struct net_device *dev; int update_isrouter = 0; write_lock_bh(&neigh->lock); dev = neigh->dev; old = neigh->nud_state; err = -EPERM; if (!(flags & NEIGH_UPDATE_F_ADMIN) && (old & (NUD_NOARP | NUD_PERMANENT))) goto out; if (neigh->dead) goto out; if (!(new & NUD_VALID)) { neigh_del_timer(neigh); if (old & NUD_CONNECTED) neigh_suspect(neigh); neigh->nud_state = new; err = 0; notify = old & NUD_VALID; if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && (new & NUD_FAILED)) { neigh_invalidate(neigh); notify = 1; } goto out; } /* Compare new lladdr with cached one */ if (!dev->addr_len) { /* First case: device needs no address. */ lladdr = neigh->ha; } else if (lladdr) { /* The second case: if something is already cached and a new address is proposed: - compare new & old - if they are different, check override flag */ if ((old & NUD_VALID) && !memcmp(lladdr, neigh->ha, dev->addr_len)) lladdr = neigh->ha; } else { /* No address is supplied; if we know something, use it, otherwise discard the request. */ err = -EINVAL; if (!(old & NUD_VALID)) goto out; lladdr = neigh->ha; } if (new & NUD_CONNECTED) neigh->confirmed = jiffies; neigh->updated = jiffies; /* If entry was valid and address is not changed, do not change entry state, if new one is STALE. */ err = 0; update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER; if (old & NUD_VALID) { if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) { update_isrouter = 0; if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) && (old & NUD_CONNECTED)) { lladdr = neigh->ha; new = NUD_STALE; } else goto out; } else { if (lladdr == neigh->ha && new == NUD_STALE && ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) || (old & NUD_CONNECTED)) ) new = old; } } if (new != old) { neigh_del_timer(neigh); if (new & NUD_IN_TIMER) neigh_add_timer(neigh, (jiffies + ((new & NUD_REACHABLE) ? neigh->parms->reachable_time : 0))); neigh->nud_state = new; notify = 1; } if (lladdr != neigh->ha) { write_seqlock(&neigh->ha_lock); memcpy(&neigh->ha, lladdr, dev->addr_len); write_sequnlock(&neigh->ha_lock); neigh_update_hhs(neigh); if (!(new & NUD_CONNECTED)) neigh->confirmed = jiffies - (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1); notify = 1; } if (new == old) goto out; if (new & NUD_CONNECTED) neigh_connect(neigh); //修改output函数为neigh_connected_output else neigh_suspect(neigh); if (!(old & NUD_VALID)) { //如果源状态不为valid,则发送缓存的skb struct sk_buff *skb; /* Again: avoid dead loop if something went wrong */ while (neigh->nud_state & NUD_VALID && (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { //取出缓冲报文 struct dst_entry *dst = skb_dst(skb); struct neighbour *n2, *n1 = neigh; write_unlock_bh(&neigh->lock); rcu_read_lock(); /* Why not just use 'neigh' as-is? The problem is that * things such as shaper, eql, and sch_teql can end up * using alternative, different, neigh objects to output * the packet in the output path. So what we need to do * here is re-lookup the top-level neigh in the path so * we can reinject the packet there. */ n2 = NULL; if (dst) { n2 = dst_neigh_lookup_skb(dst, skb); if (n2) n1 = n2; } n1->output(n1, skb); //调用neigh的output函数,此时已经改成connect函数 if (n2) neigh_release(n2); rcu_read_unlock(); write_lock_bh(&neigh->lock); } __skb_queue_purge(&neigh->arp_queue); //清空缓存 neigh->arp_queue_len_bytes = 0; } out: if (update_isrouter) { neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ? (neigh->flags | NTF_ROUTER) : (neigh->flags & ~NTF_ROUTER); } write_unlock_bh(&neigh->lock); if (notify) neigh_update_notify(neigh); return err; }至此,arp的整个大流程基本清晰了,有些细节还有待梳理,例如neigh_update中发包时,为什么需要重新查找neigh表项而不用当前的neigh等。