我们进入arp协议,我们看看linux中号称邻居子系统是啥样的
/*
 * ARP's L3 protocol hook: netif_receive_skb() dispatches frames whose
 * ethertype is ETH_P_ARP to arp_rcv() through this packet_type, which
 * arp_init() registers via dev_add_pack().
 */
static struct packet_type arp_packet_type __read_mostly = {
	.type =	cpu_to_be16(ETH_P_ARP),
	.func =	arp_rcv,
};
还记得那个netif_receive_skb吧,选择三层协议处理的地方,arp通过
dev_add_pack注册了上述的packet_type结构
/* Ethertype for the Address Resolution Protocol. */
#define ETH_P_ARP	0x0806		/* Address Resolution packet	*/
/*
 * arp_init - boot-time registration of the ARP protocol.
 * Wires the ARP neighbour table, the L3 packet handler, /proc entries,
 * sysctl knobs and the netdevice notifier. __init: discarded after boot.
 */
void __init arp_init(void)
{
	neigh_table_init(&arp_tbl);		/* initialise the ARP neighbour table */

	dev_add_pack(&arp_packet_type);		/* register ETH_P_ARP as an L3 protocol */
	arp_proc_init();			/* create the ARP part of /proc */
#ifdef CONFIG_SYSCTL
	neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4,
			      NET_IPV4_NEIGH, "ipv4", NULL, NULL);
#endif
	register_netdevice_notifier(&arp_netdev_notifier);	/* register notifier chain */
}
neigh_table_init(&arp_tbl); 其实就是把 arp_tbl 挂到全局的 neigh_tables 链表上,核心片段如下:
for (tmp = neigh_tables; tmp; tmp = tmp->next) {
if (tmp->family == tbl->family) 用协议族在区分 arp类型为AF_INET IPV6的ND协议为AF_INET6
break;
}
tbl->next = neigh_tables;
neigh_tables = tbl;
重要数据结构:
struct neigh_table
{
struct neigh_table *next;
int family;
协议族
int entry_size;
邻居项的大小 为sizeof(struct neighbour) + 4 考虑到有零长数组
int key_len;
hash关键字的长度 为4
__u32 (*hash)(const void *pkey, const struct net_device *);
hash函数
int (*constructor)(struct neighbour *);
int (*pconstructor)(struct pneigh_entry *);
void (*pdestructor)(struct pneigh_entry *);
void (*proxy_redo)(struct sk_buff *skb);
char *id;
struct neigh_parms parms;
/* HACK. gc_* shoul follow parms without a gap! */
int gc_interval;
常规的垃圾回收间隔时间 默认30秒
int gc_thresh1;
门限
int gc_thresh2;
int gc_thresh3;
unsigned long last_flush;
struct delayed_work gc_work;
常规的垃圾回收定时器
struct timer_list proxy_timer;
struct sk_buff_head proxy_queue;
atomic_t entries;
整个表中邻居项的个数 当entries大于hash_mask+1,哈希桶增长为原来的两倍
rwlock_t lock;
unsigned long last_rand;
struct kmem_cache *kmem_cachep;
struct neigh_statistics *stats;
struct neighbour **hash_buckets; 存放邻居项的哈希桶
unsigned int hash_mask;
hash桶大小的掩码
__u32 hash_rnd;
struct pneigh_entry **phash_buckets;
用于代理arp的邻居哈希表
};
struct neigh_table arp_tbl = { arp的邻居表项
.family = AF_INET,
.entry_size = sizeof(struct neighbour) + 4,
.key_len = 4,
.hash = arp_hash,
.constructor = arp_constructor,
.proxy_redo = parp_redo,
.id = "arp_cache",
.parms = {
.tbl = &arp_tbl,
.base_reachable_time = 30 * HZ,
.retrans_time = 1 * HZ,
.gc_staletime = 60 * HZ,
.reachable_time = 30 * HZ,
.delay_probe_time = 5 * HZ,
.queue_len = 3,
.ucast_probes = 3,
.mcast_probes = 3,
.anycast_delay = 1 * HZ,
.proxy_delay = (8 * HZ) / 10,
.proxy_qlen = 64,
.locktime = 1 * HZ,
},
.gc_interval = 30 * HZ,
.gc_thresh1 = 128,
.gc_thresh2 = 512,
.gc_thresh3 = 1024,
};
struct neighbour
{
struct neighbour *next;
struct neigh_table *tbl;
struct neigh_parms *parms;
struct net_device *dev;
unsigned long used;
unsigned long confirmed;
unsigned long updated;
__u8 flags;
__u8 nud_state;
邻居项的对象
__u8 type;
__u8 dead;
存活标志,如果dead为1,那么垃圾回收函数会将此项删除
atomic_t probes;
重发arp请求的次数
rwlock_t lock;
unsigned char ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))];
struct hh_cache *hh;
atomic_t refcnt;
int (*output)(struct sk_buff *skb);
struct sk_buff_head arp_queue;
struct timer_list timer;
定时器
const struct neigh_ops *ops;
邻居项操作函数
u8 primary_key[0];
哈希关键字 这是个零长数组空间大小在分配时+4,就是那个4 其实就是ip地址长度
.entry_size = sizeof(struct neighbour) + 4, 见struct neigh_table arp_tbl ={}
后面在函数的分析过程中会讲到这些数据结构是如何组织的,以及其功能
分析入口:查找到路由后,会调用arp_bind_neighbour绑定一个邻居项
int arp_bind_neighbour(struct dst_entry *dst)
参数是个dst路由表项,路由的东西我们暂时放一边,后续我们会详细分析
姑且认为就是个表项
{
struct net_device *dev = dst->dev;
struct neighbour *n = dst->neighbour;
取得路由项banding的邻居项
if (dev == NULL)
return -EINVAL;
if (n == NULL) {
__be32 nexthop = ((struct rtable *)dst)->rt_gateway;
取下一跳 路由就是找下一跳
if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT))
nexthop = 0;
n =
__neigh_lookup_errno( &arp_tbl, &nexthop, dev);
找下一跳对应的邻居项
if (IS_ERR(n))
return PTR_ERR(n);
dst->neighbour = n;
找到了就赋值
}
return 0;
}
/*
 * __neigh_lookup_errno - look up the neighbour entry for @pkey (here the
 * next-hop IP address; 0 for loopback and point-to-point devices),
 * creating one when it does not exist yet.
 * Returns a referenced entry or an ERR_PTR.
 */
static inline struct neighbour *
__neigh_lookup_errno(struct neigh_table *tbl, const void *pkey,
		     struct net_device *dev)
{
	struct neighbour *n = neigh_lookup(tbl, pkey, dev);	/* existing entry for this IP? */

	if (n)
		return n;

	return neigh_create(tbl, pkey, dev);	/* none found: create a new entry */
}
struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
struct net_device *dev)
{
struct neighbour *n;
int key_len = tbl->key_len; 为4,ip地址长度
u32 hash_val;
NEIGH_CACHE_STAT_INC(tbl, lookups);
read_lock_bh(&tbl->lock);
hash_val = tbl->hash(pkey, dev); 利用注册的arphash函数进行key的计算
arp_hash()
for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) {
在hash桶里匹配
if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
neigh_hold(n);
NEIGH_CACHE_STAT_INC(tbl, hits);
break;
}
}
read_unlock_bh(&tbl->lock);
return n;
}
struct neighbour *
neigh_create
(struct neigh_table *tbl, const void *pkey,
struct net_device *dev)
{
u32 hash_val;
int key_len = tbl->key_len;
int error;
struct neighbour *n1, *rc, *n =
neigh_alloc
(tbl);
申请一个邻居项结构
if (!n) {
rc = ERR_PTR(-ENOBUFS);
goto out;
}
memcpy(n->primary_key, pkey, key_len); 哈希键值就是目的ip地址
n->dev = dev;
dev_hold(dev);
/* Protocol specific setup. */
if (tbl->constructor && (error = tbl->constructor(n)) < 0) {
有构造函数就调用构造函数
arp_constructor()
rc = ERR_PTR(error);
goto out_neigh_release;
}
/* Device specific setup. */
if (n->parms->neigh_setup &&
(error = n->parms->neigh_setup(n)) < 0) {
有setup就调用setup
rc = ERR_PTR(error);
goto out_neigh_release;
}
n->confirmed = jiffies - (n->parms->base_reachable_time << 1);
write_lock_bh(&tbl->lock);
if (atomic_read(&tbl->entries) > (tbl->hash_mask + 1)) 如果总项数超过hash_mask 就增加hash表
neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1);
hash_val = tbl->hash(pkey, dev) & tbl->hash_mask;
if (n->parms->dead) {
rc = ERR_PTR(-EINVAL);
goto out_tbl_unlock;
}
for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) {
插入hash表,有可能已经存在
if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) {
neigh_hold(n1);
rc = n1;
goto out_tbl_unlock;
}
}
n->next = tbl->hash_buckets[hash_val];
tbl->hash_buckets[hash_val] = n;
n->dead = 0;
neigh_hold(n);
write_unlock_bh(&tbl->lock);
NEIGH_PRINTK2("neigh %p is created.\n", n);
rc = n;
out:
return rc;
out_tbl_unlock:
write_unlock_bh(&tbl->lock);
out_neigh_release:
neigh_release(n);
goto out;
}
static struct neighbour *neigh_alloc(struct neigh_table *tbl)
{
struct neighbour *n = NULL;
unsigned long now = jiffies;
int entries;
entries = atomic_inc_return(&tbl->entries) - 1;
将邻居项数+1
if (entries >= tbl->gc_thresh3 ||
(entries >= tbl->gc_thresh2 &&
time_after(now, tbl->last_flush + 5 * HZ))) {
if (!neigh_forced_gc(tbl) &&
entries >= tbl->gc_thresh3)
超过一定的门限值
goto out_entries;
}
n = kmem_cache_zalloc(tbl->kmem_cachep, GFP_ATOMIC);
从高速缓存中申请一个邻居项
if (!n)
goto out_entries;
skb_queue_head_init(&n->arp_queue);
rwlock_init(&n->lock);
n->updated = n->used = now;
n->nud_state =
NUD_NONE
;
初始化状态
n->output = neigh_blackhole;
n->parms = neigh_parms_clone(&tbl->parms);
setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n);
定时器
NEIGH_CACHE_STAT_INC(tbl, allocs);
n->tbl = tbl;
atomic_set(&n->refcnt, 1);
n->dead = 1;
out:
return n;
out_entries:
atomic_dec(&tbl->entries);
goto out;
}
/*
 * arp_constructor - protocol-specific setup of a freshly created IPv4
 * neighbour entry: adopt the device's ARP parms, then choose the
 * neigh_ops set and initial NUD state from the device's L2 capabilities.
 * Returns 0 on success, -EINVAL when the device has no IPv4 config.
 */
static int arp_constructor(struct neighbour *neigh)
{
	__be32 addr = *(__be32 *)neigh->primary_key;
	struct net_device *dev = neigh->dev;
	struct in_device *in_dev;
	struct neigh_parms *parms;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(dev);
	if (in_dev == NULL) {
		rcu_read_unlock();
		return -EINVAL;
	}

	neigh->type = inet_addr_type(dev_net(dev), addr);

	/* swap the table-wide parms for the device's own ARP parms */
	parms = in_dev->arp_parms;
	__neigh_parms_put(neigh->parms);
	neigh->parms = neigh_parms_clone(parms);
	rcu_read_unlock();

	if (!dev->header_ops) {
		/* device builds no L2 header: give it the arp_direct_ops set */
		neigh->nud_state = NUD_NOARP;
		neigh->ops = &arp_direct_ops;
		neigh->output = neigh->ops->queue_xmit;
	} else {
#if 1
		/* So... these "amateur" devices are hopeless.
		   The only thing, that I can say now:
		   It is very sad that we need to keep ugly obsolete
		   code to make them happy.

		   They should be moved to more reasonable state, now
		   they use rebuild_header INSTEAD OF hard_start_xmit!!!
		   Besides that, they are sort of out of date
		   (a lot of redundant clones/copies, useless in 2.1),
		   I wonder why people believe that they work.
		 */
		switch (dev->type) {
		default:
			break;
		case ARPHRD_ROSE:
#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
		case ARPHRD_AX25:
#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
		case ARPHRD_NETROM:
#endif
			neigh->ops = &arp_broken_ops;
			neigh->output = neigh->ops->output;
			return 0;
#endif
		;}
#endif
		/* pick the ops set / state by L2 type and device flags */
		if (neigh->type == RTN_MULTICAST) {
			neigh->nud_state = NUD_NOARP;
			arp_mc_map(addr, neigh->ha, dev, 1);
		} else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) {
			neigh->nud_state = NUD_NOARP;
			memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
		} else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) {
			neigh->nud_state = NUD_NOARP;
			memcpy(neigh->ha, dev->broadcast, dev->addr_len);
		}

		if (dev->header_ops->cache)
			/* ether_setup() sets dev->header_ops = &eth_header_ops,
			 * whose .cache = eth_header_cache — Ethernet takes this path */
			neigh->ops = &arp_hh_ops;
		else
			neigh->ops = &arp_generic_ops;

		if (neigh->nud_state&NUD_VALID)
			neigh->output = neigh->ops->connected_output;
		else
			/* new entries start in NUD_NONE, so this branch is taken */
			neigh->output = neigh->ops->output;
	}
	return 0;
}
/*
 * neigh_ops used when the device supports hardware-header caching
 * (per the analysis above: eth_header_ops has .cache = eth_header_cache,
 * so Ethernet devices get this set).
 */
static const struct neigh_ops arp_hh_ops = {
	.family =		AF_INET,
	.solicit =		arp_solicit,
	.error_report =		arp_error_report,
	.output =		neigh_resolve_output,
	.connected_output =	neigh_resolve_output,
	.hh_output =		dev_queue_xmit,
	.queue_xmit =		dev_queue_xmit,
};
最后来个图对这段分析的总结: 没有ULNI中画的好,只是为了说明结构组织
每个邻居协议都有个table结构,IPV4的arp_table IPV6的nd_tbl
对于每个发送报文查到路由后都会对目的ip进行邻居查找,因此对每个目的ip都有个邻居项来保存信息
邻居项以hash表的形式保存在邻居表中
上面的分析仅仅对邻居项的创建进行说明,而整个子系统的东西还有很多很多...