1) 在Linux内核中, 将IP包的路由称为"目的入口"(dst_entry), 目的入口反映了相邻的外部主机
在主机内部的一种"映象". IP包首先注入到目的入口中, 经过一系列IP包过滤器,
最后注入到目的入口的帧头缓冲入口(hh_cache)或邻居入口(neighbour),
通过ARP缓冲创建硬件帧头后发送到设备驱动程序上.
2) 路由缓冲表就是IP路由入口表, 它是转发表路由规则的实例化. 在查询IP路由时,
系统先在路由缓冲表中查询: 当路由入口已存在时, 将输出包直接绑定到该路由入口;
如果未找到匹配的入口, 则通过转发表查询路由规则, 匹配成功后,
在路由缓冲表中创建相应的目的入口.
3) 路由缓冲表是用散列索引的路由结构(rtable), 路由结构的开始即为目的入口结构,
它们在头部形成联合.
ip_route_output(&rt,daddr,saddr,tos,oif)查询输出设备为oif, 目的地址为daddr,
源地址为saddr, 服务类型为tos的路由入口.
ip_route_input(skb,daddr,saddr,tos,dev)将接收包skb绑定到输入设备为dev,
目的地址为daddr, 源地址为saddr, 服务类型为tos的目的入口.
/*
 * Destination cache entry ("dst_entry"): the object a routed IP packet is
 * bound to.  struct rtable embeds one as the first thing in its layout.
 *
 * Notes marked "presumably" are inferred from field names only and should
 * be confirmed against the code that uses them.
 */
struct dst_entry
{
/* Link to another entry; shares storage with rt_next in struct rtable's union. */
struct dst_entry *next;
atomic_t __refcnt; /* client references */
/* Hit counter: the route cache lookups increment it each time they return this entry. */
int __use;
/* Presumably the network device this destination is reached through - confirm. */
struct net_device *dev;
int obsolete;
int flags;
/* Presumably a bit value for the flags field above - confirm. */
#define DST_HOST 1
/* Set to jiffies by the route cache lookups each time they return this entry. */
unsigned long lastuse;
unsigned long expires;
/*
 * Per-destination metrics.  The names suggest path MTU and TCP parameters
 * (window, RTT, RTT variance, ssthresh, cwnd, advertised MSS, reordering);
 * their exact semantics are not visible in this file.
 */
unsigned mxlock;
unsigned pmtu;
unsigned window;
unsigned rtt;
unsigned rttvar;
unsigned ssthresh;
unsigned cwnd;
unsigned advmss;
unsigned reordering;
unsigned long rate_last; /* rate limiting for ICMP */
/* Presumably the token count paired with rate_last for the same rate limiter - confirm. */
unsigned long rate_tokens;
int error;
/*
 * Neighbour entry and cached hardware (frame) header.  Per the notes that
 * accompany this excerpt, an outgoing packet is finally handed to one of
 * these before reaching the device driver.
 */
struct neighbour *neighbour;
struct hh_cache *hh;
/* Per-entry packet handlers; each takes the sk_buff and returns int. */
int (*input)(struct sk_buff*);
int (*output)(struct sk_buff*);
/* Only present when CONFIG_NET_CLS_ROUTE is defined. */
#ifdef CONFIG_NET_CLS_ROUTE
__u32 tclassid;
#endif
/* Operations table for this entry (see struct dst_ops). */
struct dst_ops *ops;
/* Zero-length trailing array (GNU C extension); occupies no space in the struct. */
char info[0];
};
/*
 * Route cache entry.  The entries hang off the buckets of rt_hash_table and
 * are what ip_route_output_key() and ip_route_input() search for.
 */
struct rtable
{
/*
 * The embedded dst_entry overlaid with the hash-chain link: u.rt_next
 * shares storage with the first member of u.dst (dst.next), so the same
 * pointer can be followed either as a dst_entry list or as an rtable list.
 */
union
{
struct dst_entry dst;
struct rtable *rt_next;
} u;
/* Route flags; ip_route_output_key() tests RTCF_TPROXY here. */
unsigned rt_flags;
unsigned rt_type;
__u32 rt_dst; /* Path destination */
__u32 rt_src; /* Path source */
/* Presumably the input interface index - confirm. */
int rt_iif;
/* Info on neighbour */
__u32 rt_gateway;
/* Cache lookup keys */
struct rt_key key;
/* Miscellaneous cached information */
__u32 rt_spec_dst; /* RFC1122 specific destination */
struct inet_peer *peer; /* long-living peer info */
/* Only present when CONFIG_IP_ROUTE_NAT is defined. */
#ifdef CONFIG_IP_ROUTE_NAT
__u32 rt_src_map;
__u32 rt_dst_map;
#endif
};
/*
 * Operations table pointed to by dst_entry.ops.
 *
 * None of these members is used in the visible code, so the notes below are
 * inferred from names and signatures only - confirm before relying on them.
 */
struct dst_ops
{
/* Presumably the address family and protocol these operations serve. */
unsigned short family;
unsigned short protocol;
/* Presumably a garbage-collection threshold and the collector callback. */
unsigned gc_thresh;
int (*gc)(void);
/* Callbacks taking and returning a dst_entry; check also receives a cookie. */
struct dst_entry * (*check)(struct dst_entry *, __u32 cookie);
struct dst_entry * (*reroute)(struct dst_entry *,
struct sk_buff *);
void (*destroy)(struct dst_entry *);
struct dst_entry * (*negative_advice)(struct dst_entry *);
/* Callback taking the affected packet. */
void (*link_failure)(struct sk_buff *);
/* Presumably the allocation size of one entry, the live-entry count and the backing cache. */
int entry_size;
atomic_t entries;
kmem_cache_t *kmem_cachep;
};
/*
 * Route cache lookup key.  One is stored in every struct rtable (rt->key),
 * and one is passed by the caller to ip_route_output_key().
 */
struct rt_key
{
/* Destination and source addresses; both lookups compare them for equality. */
__u32 dst;
__u32 src;
/* Input interface: input lookups match dev->ifindex, output lookups require 0. */
int iif;
/* Output interface: output lookups match the requested oif, input lookups require 0. */
int oif;
/* Only present, and only compared, when CONFIG_IP_ROUTE_FWMARK is defined. */
#ifdef CONFIG_IP_ROUTE_FWMARK
__u32 fwmark;
#endif
/*
 * Type of service.  Input lookups compare it exactly after masking the
 * packet value with IPTOS_RT_MASK; output lookups compare only the bits in
 * IPTOS_RT_MASK|RTO_ONLINK.
 */
__u8 tos;
/* Not referenced by the lookups in this file. */
__u8 scope;
};
/*
 * One slot of the route cache hash table: the head of a singly linked chain
 * of rtable entries (linked through u.rt_next) and the rwlock the lookups
 * hold for reading while they walk that chain.  Aligned to 8 bytes.
 */
struct rt_hash_bucket {
struct rtable *chain;
rwlock_t lock;
} __attribute__((__aligned__(8)));
/* net/ipv4/route.c */
/* Route cache hash table; indexed by the value returned from rt_hash_code(). */
static struct rt_hash_bucket *rt_hash_table;
/* Mask applied to the folded hash in rt_hash_code() to produce a table index. */
static unsigned rt_hash_mask;
/* Presumably log2 of the table size - not used in the visible code, confirm. */
static int rt_hash_log;
/*
 * Fold a (destination, source, TOS) triple into an index into rt_hash_table.
 *
 * The two nibbles of every byte of daddr are swapped, the result is XORed
 * with saddr and tos, and the upper bits are folded down (first by 16, then
 * by 8) before the value is masked with rt_hash_mask.
 */
static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
	unsigned high_nibbles = (daddr & 0xF0F0F0F0) >> 4;
	unsigned low_nibbles = (daddr & 0x0F0F0F0F) << 4;
	unsigned mixed = (high_nibbles | low_nibbles) ^ (saddr ^ tos);

	mixed ^= mixed >> 16;
	mixed ^= mixed >> 8;

	return mixed & rt_hash_mask;
}
/*
 * Convenience wrapper: pack the arguments into an rt_key and look the output
 * route up through ip_route_output_key().
 *
 * rp    - receives the route found by ip_route_output_key()
 * daddr - destination address
 * saddr - source address
 * tos   - type of service (stored into the narrower __u8 key field)
 * oif   - output device
 *
 * Key members not named in the initializer are zero-initialized.
 * Returns whatever ip_route_output_key() returns.
 */
static inline int ip_route_output(struct rtable **rp,
				  u32 daddr, u32 saddr, u32 tos, int oif)
{
	struct rt_key key = {
		.dst = daddr,
		.src = saddr,
		.oif = oif,
		.tos = tos,
	};

	return ip_route_output_key(rp, &key);
}
int ip_route_output_key(struct rtable **rp, const struct rt_key *key)
{
unsigned hash;
struct rtable *rth;
hash = rt_hash_code(key->dst, key->src^(key->oif<<5), key->tos);
read_lock_bh(&rt_hash_table[hash].lock);
for (rth=rt_hash_table[hash].chain; rth; rth=rth->u.rt_next) {
if (rth->key.dst == key->dst &&
rth->key.src == key->src &&
rth->key.iif == 0 &&
rth->key.oif == key->oif &&
#ifdef CONFIG_IP_ROUTE_FWMARK
rth->key.fwmark == key->fwmark &&
#endif
!((rth->key.tos^key->tos)&(IPTOS_RT_MASK|RTO_ONLINK)) &&
((key->tos&RTO_TPROXY) || !(rth->rt_flags&RTCF_TPROXY))
) {
rth->u.dst.lastuse = jiffies;
dst_hold(&rth->u.dst);
rth->u.dst.__use++;
read_unlock_bh(&rt_hash_table[hash].lock);
*rp = rth;
return 0;
}
}
read_unlock_bh(&rt_hash_table[hash].lock);
return ip_route_output_slow(rp, key);
}
/*
 * Bind an IP packet received on device dev to a destination entry.
 *
 * skb   - the received packet; on a cache hit skb->dst is pointed at the
 *         matching route
 * daddr - destination address
 * saddr - source address
 * tos   - type of service; masked with IPTOS_RT_MASK before use
 * dev   - input device; its ifindex is used as the input-interface key
 *
 * The route cache is searched first, with the bucket lock held for reading.
 * An entry matches when destination, source, input interface and masked TOS
 * are equal, its output interface is 0 and, with CONFIG_IP_ROUTE_FWMARK,
 * its fwmark equals skb->nfmark.
 *
 * Returns 0 on a cache hit.  On a miss, a multicast destination is either
 * handed to ip_route_input_mc() or rejected with -EINVAL; any other
 * destination goes to ip_route_input_slow().  In both delegating cases the
 * callee's result is returned.
 */
int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
		   u8 tos, struct net_device *dev)
{
	struct rtable * rth;
	unsigned hash;
	int iif = dev->ifindex;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash_code(daddr, saddr^(iif<<5), tos);

	read_lock(&rt_hash_table[hash].lock);
	for (rth=rt_hash_table[hash].chain; rth; rth=rth->u.rt_next) {
		if (rth->key.dst == daddr &&
		    rth->key.src == saddr &&
		    rth->key.iif == iif &&
		    rth->key.oif == 0 &&
#ifdef CONFIG_IP_ROUTE_FWMARK
		    rth->key.fwmark == skb->nfmark &&
#endif
		    rth->key.tos == tos) {
			/* Cache hit: account for the use, then attach the route. */
			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rth->u.dst.__use++;
			read_unlock(&rt_hash_table[hash].lock);
			skb->dst = (struct dst_entry*)rth;
			return 0;
		}
	}
	read_unlock(&rt_hash_table[hash].lock);

	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As result the host on multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   comparing with route cache reject entries.
	   Note, that multicast routers are not affected, because
	   route cache entry is created eventually.
	 */
	if (MULTICAST(daddr)) {
		struct in_device *in_dev;

		read_lock(&inetdev_lock);
		if ((in_dev = __in_dev_get(dev)) != NULL) {
			/* Result of ip_check_mc() for this device and group. */
			int our = ip_check_mc(in_dev, daddr);
			if (our
#ifdef CONFIG_IP_MROUTE
			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
#endif
			    ) {
				read_unlock(&inetdev_lock);
				return ip_route_input_mc(skb, daddr, saddr, tos, dev, our);
			}
		}
		read_unlock(&inetdev_lock);
		/* No in_device, or the multicast check above failed. */
		return -EINVAL;
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}