路由Hash表的组织(新的内核使用了不同的组织方式,以后再分析)
为了能使各种操作快速查找到相关信息,Linux中采用了几种不同的Hash表,这些表都指向用于描述路由的相同的数据结构
路由是由多个不同的数据结构的组合来描述的,每个数据结构代表着路由信息的不同部分。定义一条路由所需的信息被分成多个数据结构,这样做的原因是只需通过部分字段就可以区分多条路由。路由子系统不是维护一个庞大臃肿的结构,而是将路由分散为多个片段,这样更容易在相似的路由间共享通用的信息,因而就可以分离出不同的函数,并在这些函数之间定义更加清晰的接口。
两个默认路由表
ip_fib_local_table,内核将到本地地址的路由放在该表中,包括到相关的子网网络地址及子网广播地址的路由。用户不能够直接配置该路由表
ip_fib_main_table,所有其他的路由表项都放在该表内
路由表初始化
/*
 * fib_hash_table - allocate and initialise a routing table whose
 * operations are the fn_hash_* functions.
 * @id: value stored in the new table's tb_id field.
 *
 * Returns the new table, or NULL if kmalloc() fails.
 */
1: struct fib_table *fib_hash_table(u32 id)
2: {
3: struct fib_table *tb;
4:
/* One allocation covers the table header plus a struct fn_hash; the extra
 * bytes are presumably the storage behind tb->tb_data (zeroed below) --
 * confirm against the struct fib_table definition. */
5: tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
6: GFP_KERNEL);
7: if (tb == NULL)
8: return NULL;
9:
10: tb->tb_id = id;
11: tb->tb_default = -1;
/* Install the hash-based implementation of each table operation. */
12: tb->tb_lookup = fn_hash_lookup;
13: tb->tb_insert = fn_hash_insert;
14: tb->tb_delete = fn_hash_delete;
15: tb->tb_flush = fn_hash_flush;
16: tb->tb_select_default = fn_hash_select_default;
17: tb->tb_dump = fn_hash_dump;
/* kmalloc() does not clear memory, so the fn_hash area is zeroed here. */
18: memset(tb->tb_data, 0, sizeof(struct fn_hash));
19: return tb;
20: }
路由的添加和删除
路由表查找
/*
 * fn_hash_lookup - search one routing table for a route matching a flow.
 * @tb:  table to search; its tb_data is used as a struct fn_hash.
 * @flp: flow key; the destination address flp->fl4_dst drives the search.
 * @res: passed through to fib_semantic_match(), which is expected to fill
 *       it in on a match.
 *
 * Walks every zone on the table's fn_zone_list under the fib_hash_lock
 * read lock. Returns the first fib_semantic_match() result that is <= 0,
 * or 1 if no zone yields such a result.
 */
1: static int
2: fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
3: {
4: int err;
5: struct fn_zone *fz;
6: struct fn_hash *t = (struct fn_hash *)tb->tb_data;
7:
8: read_lock(&fib_hash_lock);
/* NOTE(review): the zone list is presumably ordered most-specific prefix
 * first, so the first hit is the longest match -- confirm where
 * fn_zone_list is built. */
9: for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {
10: struct hlist_head *head;
11: struct hlist_node *node;
12: struct fib_node *f;
/* Derive this zone's lookup key from the destination address. */
13: __be32 k = fz_key(flp->fl4_dst, fz);
14:
15: head = &fz->fz_hash[fn_hash(k, fz)];
16: hlist_for_each_entry(f, node, head, fn_hash) {
/* Same bucket but a different key: not a candidate. */
17: if (f->fn_key != k)
18: continue;
19:
20: err = fib_semantic_match(&f->fn_alias,
21: flp, res,
22: fz->fz_order);
/* Zero or a negative value ends the search; a positive value means
 * keep looking. */
23: if (err <= 0)
24: goto out;
25: }
26: }
/* No zone produced a final answer. */
27: err = 1;
28: out:
29: read_unlock(&fib_hash_lock);
30: return err;
31: }
tb,搜索的路由表。因为fn_hash_lookup是一个通用的查找函数,每次只能够查找一张表。调用者根据是否支持策略路由等相关因素,来决定搜索哪些路由表
flp,搜索关键字
res,查找成功时,用路由信息初始化res
可能的返回值:0,成功;1,失败,没有与搜索关键字匹配的路由项;小于0,管理性失败,意味着查找不成功,因为查找到的路由没有价值
拒绝路由的标准
TOS不匹配,注意,当路由项没有配置TOS值时,可以路由任意TOS值的封包
/*
 * fib_props - per-route-type properties table with RTN_MAX + 1 entries.
 * Entries are positional; the trailing comment on each one names the
 * RTN_* type it is meant for, so the order must track those values.
 *   error: 0 or a negative errno associated with the route type.
 *   scope: an RT_SCOPE_* value associated with the route type.
 * NOTE(review): how callers consume these fields is not visible in this
 * listing -- presumably error is what a lookup returns for a matching
 * route of that type; confirm in fib_semantic_match().
 */
1: static const struct
2: {
3: int error;
4: u8 scope;
5: } fib_props[RTN_MAX + 1] = {
6: {
7: .error = 0,
8: .scope = RT_SCOPE_NOWHERE,
9: }, /* RTN_UNSPEC */
10: {
11: .error = 0,
12: .scope = RT_SCOPE_UNIVERSE,
13: }, /* RTN_UNICAST */
14: {
15: .error = 0,
16: .scope = RT_SCOPE_HOST,
17: }, /* RTN_LOCAL */
18: {
19: .error = 0,
20: .scope = RT_SCOPE_LINK,
21: }, /* RTN_BROADCAST */
22: {
23: .error = 0,
24: .scope = RT_SCOPE_LINK,
25: }, /* RTN_ANYCAST */
26: {
27: .error = 0,
28: .scope = RT_SCOPE_UNIVERSE,
29: }, /* RTN_MULTICAST */
/* From here on every type carries a non-zero error code. */
30: {
31: .error = -EINVAL,
32: .scope = RT_SCOPE_UNIVERSE,
33: }, /* RTN_BLACKHOLE */
34: {
35: .error = -EHOSTUNREACH,
36: .scope = RT_SCOPE_UNIVERSE,
37: }, /* RTN_UNREACHABLE */
38: {
39: .error = -EACCES,
40: .scope = RT_SCOPE_UNIVERSE,
41: }, /* RTN_PROHIBIT */
42: {
43: .error = -EAGAIN,
44: .scope = RT_SCOPE_UNIVERSE,
45: }, /* RTN_THROW */
46: {
47: .error = -EINVAL,
48: .scope = RT_SCOPE_NOWHERE,
49: }, /* RTN_NAT */
50: {
51: .error = -EINVAL,
52: .scope = RT_SCOPE_NOWHERE,
53: }, /* RTN_XRESOLVE */
54: };
输入和输出函数
/*
 * ip_route_input_slow - resolve the route for a received packet through
 * fib_lookup() and attach/cache the result.
 * @skb:   received packet; skb->mark and skb->protocol are read here.
 * @daddr: destination address of the packet.
 * @saddr: source address of the packet.
 * @tos:   type-of-service value used in the flow key.
 * @dev:   receiving device; supplies the input interface index.
 *
 * Forwarded unicast results are handed to ip_mkroute_input(). Local,
 * broadcast and "no route" results build a new rtable here and insert it
 * with rt_intern_hash().
 *
 * Returns the result of ip_mkroute_input() or rt_intern_hash(), or one of
 * -EINVAL, -EHOSTUNREACH, -ENOBUFS set on the error labels below.
 */
1: static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2: u8 tos, struct net_device *dev)
3: {
4: struct fib_result res;
5: struct in_device *in_dev = in_dev_get(dev);
6: struct flowi fl = { .nl_u = { .ip4_u =
7: { .daddr = daddr,
8: .saddr = saddr,
9: .tos = tos,
10: .scope = RT_SCOPE_UNIVERSE,
11: } },
12: .mark = skb->mark,
13: .iif = dev->ifindex };
14: unsigned flags = 0;
15: u32 itag = 0;
16: struct rtable * rth;
17: unsigned hash;
18: __be32 spec_dst;
19: int err = -EINVAL;
20: int free_res = 0;
21: struct net * net = dev_net(dev);
22:
23: /* IP on this device is disabled. */
24:
/* No in_dev reference was obtained, so skip the release at "done" and
 * return the initial -EINVAL. */
25: if (!in_dev)
26: goto out;
27:
28: /* Check for the most weird martians, which can be not detected
29: by fib_lookup.
30: */
31:
32: if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
33: ipv4_is_loopback(saddr))
34: goto martian_source;
35:
/* Limited broadcast destination, or both addresses zero: handle as
 * broadcast input without consulting the FIB. */
36: if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
37: goto brd_input;
38:
39: /* Accept zero addresses only to limited broadcast;
40: * I even do not know to fix it or not. Waiting for complains :-)
41: */
42: if (ipv4_is_zeronet(saddr))
43: goto martian_source;
44:
45: if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
46: ipv4_is_loopback(daddr))
47: goto martian_destination;
48:
49: /*
50: * Now we are ready to route packet.
51: */
52: if ((err = fib_lookup(net, &fl, &res)) != 0) {
/* Lookup failed: with forwarding disabled report host unreachable,
 * otherwise build an unreachable entry at no_route. */
53: if (!IN_DEV_FORWARD(in_dev))
54: goto e_hostunreach;
55: goto no_route;
56: }
/* res now holds a lookup result that must be released at "done". */
57: free_res = 1;
58:
59: RT_CACHE_STAT_INC(in_slow_tot);
60:
61: if (res.type == RTN_BROADCAST)
62: goto brd_input;
63:
64: if (res.type == RTN_LOCAL) {
65: int result;
66: result = fib_validate_source(saddr, daddr, tos,
67: net->loopback_dev->ifindex,
68: dev, &spec_dst, &itag, skb->mark);
69: if (result < 0)
70: goto martian_source;
71: if (result)
72: flags |= RTCF_DIRECTSRC;
/* For local delivery spec_dst is overwritten with the destination. */
73: spec_dst = daddr;
74: goto local_input;
75: }
76:
/* Remaining results would be forwarded; refuse if forwarding is off. */
77: if (!IN_DEV_FORWARD(in_dev))
78: goto e_hostunreach;
79: if (res.type != RTN_UNICAST)
80: goto martian_destination;
81:
82: err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
/* Common exit: drop the in_dev reference and, if a lookup succeeded,
 * the lookup result. */
83: done:
84: in_dev_put(in_dev);
85: if (free_res)
86: fib_res_put(&res);
87: out: return err;
88:
89: brd_input:
/* Broadcast input is accepted only when skb->protocol is ETH_P_IP. */
90: if (skb->protocol != htons(ETH_P_IP))
91: goto e_inval;
92:
93: if (ipv4_is_zeronet(saddr))
94: spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
95: else {
96: err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
97: &itag, skb->mark);
98: if (err < 0)
99: goto martian_source;
100: if (err)
101: flags |= RTCF_DIRECTSRC;
102: }
103: flags |= RTCF_BROADCAST;
104: res.type = RTN_BROADCAST;
105: RT_CACHE_STAT_INC(in_brd);
106:
/* brd_input falls through to here. Builds the rtable entry for local,
 * broadcast and (via no_route) unreachable results. */
107: local_input:
108: rth = dst_alloc(&ipv4_dst_ops);
109: if (!rth)
110: goto e_nobufs;
111:
112: rth->u.dst.output= ip_rt_bug;
113: rth->rt_genid = rt_genid(net);
114:
115: atomic_set(&rth->u.dst.__refcnt, 1);
116: rth->u.dst.flags= DST_HOST;
117: if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
118: rth->u.dst.flags |= DST_NOPOLICY;
119: rth->fl.fl4_dst = daddr;
120: rth->rt_dst = daddr;
121: rth->fl.fl4_tos = tos;
122: rth->fl.mark = skb->mark;
123: rth->fl.fl4_src = saddr;
124: rth->rt_src = saddr;
125: #ifdef CONFIG_NET_CLS_ROUTE
126: rth->u.dst.tclassid = itag;
127: #endif
128: rth->rt_iif =
129: rth->fl.iif = dev->ifindex;
/* The entry's device is the loopback device; take references on it and
 * on its in_device. */
130: rth->u.dst.dev = net->loopback_dev;
131: dev_hold(rth->u.dst.dev);
132: rth->idev = in_dev_get(rth->u.dst.dev);
133: rth->rt_gateway = daddr;
134: rth->rt_spec_dst= spec_dst;
135: rth->u.dst.input= ip_local_deliver;
136: rth->rt_flags = flags|RTCF_LOCAL;
/* Unreachable: replace the input handler with ip_error, record the
 * positive error value, and clear the LOCAL flag. */
137: if (res.type == RTN_UNREACHABLE) {
138: rth->u.dst.input= ip_error;
139: rth->u.dst.error= -err;
140: rth->rt_flags &= ~RTCF_LOCAL;
141: }
142: rth->rt_type = res.type;
143: hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
144: err = rt_intern_hash(hash, rth, NULL, skb);
145: goto done;
146:
147: no_route:
148: RT_CACHE_STAT_INC(in_no_route);
149: spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
150: res.type = RTN_UNREACHABLE;
/* Translate the lookup's -ESRCH into -ENETUNREACH before it is stored
 * in the entry at local_input. */
151: if (err == -ESRCH)
152: err = -ENETUNREACH;
153: goto local_input;
154:
155: /*
156: * Do not cache martian addresses: they should be logged (RFC1812)
157: */
158: martian_destination:
159: RT_CACHE_STAT_INC(in_martian_dst);
160: #ifdef CONFIG_IP_ROUTE_VERBOSE
161: if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
162: printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
163: &daddr, &saddr, dev->name);
164: #endif
165:
/* martian_destination falls through to here. */
166: e_hostunreach:
167: err = -EHOSTUNREACH;
168: goto done;
169:
170: e_inval:
171: err = -EINVAL;
172: goto done;
173:
174: e_nobufs:
175: err = -ENOBUFS;
176: goto done;
177:
178: martian_source:
179: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
180: goto e_inval;
181: }
输出路由
将封包送往本地和转发出去二者都需要执行以下任务,虽然这些任务可能以不同的方式来执行:
在一些情况下,不需要任何路由查找就可以路由一个封包。这主要有以下三种情况:
/*
 * ip_route_output_slow - resolve the route for a locally generated flow.
 * @net:    network namespace used for device and FIB lookups.
 * @rp:     passed through to ip_mkroute_output(), which receives the
 *          resulting route.
 * @oldflp: caller's flow description; it is not modified. A local copy
 *          (fl) is filled in as source, destination and oif get resolved.
 *
 * Several cases jump straight to make_route without calling fib_lookup():
 * multicast/limited-broadcast with a given source and no oif, local
 * multicast/limited-broadcast with a given oif, and an unspecified
 * destination (treated as loopback).
 *
 * Returns the result of ip_mkroute_output(), or -EINVAL, -ENODEV or
 * -ENETUNREACH on the early-exit paths. Every path that reaches "out"
 * directly has already dropped any dev_out reference it held.
 */
1: static int ip_route_output_slow(struct net *net, struct rtable **rp,
2: const struct flowi *oldflp)
3: {
4: u32 tos = RT_FL_TOS(oldflp);
5: struct flowi fl = { .nl_u = { .ip4_u =
6: { .daddr = oldflp->fl4_dst,
7: .saddr = oldflp->fl4_src,
8: .tos = tos & IPTOS_RT_MASK,
9: .scope = ((tos & RTO_ONLINK) ?
10: RT_SCOPE_LINK :
11: RT_SCOPE_UNIVERSE),
12: } },
13: .mark = oldflp->mark,
14: .iif = net->loopback_dev->ifindex,
15: .oif = oldflp->oif };
16: struct fib_result res;
17: unsigned flags = 0;
18: struct net_device *dev_out = NULL;
19: int free_res = 0;
20: int err;
21:
22:
23: res.fi = NULL;
24: #ifdef CONFIG_IP_MULTIPLE_TABLES
25: res.r = NULL;
26: #endif
27:
/* A source address was supplied: validate it. err stays -EINVAL for
 * every "goto out" inside this block. */
28: if (oldflp->fl4_src) {
29: err = -EINVAL;
30: if (ipv4_is_multicast(oldflp->fl4_src) ||
31: ipv4_is_lbcast(oldflp->fl4_src) ||
32: ipv4_is_zeronet(oldflp->fl4_src))
33: goto out;
34:
35: /* I removed check for oif == dev_out->oif here.
36: It was wrong for two reasons:
37: 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
38: is assigned to multiple interfaces.
39: 2. Moreover, we are allowed to send packets with saddr
40: of another iface. --ANK
41: */
42:
43: if (oldflp->oif == 0
44: && (ipv4_is_multicast(oldflp->fl4_dst) ||
45: oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
46: /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
47: dev_out = ip_dev_find(net, oldflp->fl4_src);
48: if (dev_out == NULL)
49: goto out;
50:
51: /* Special hack: user can direct multicasts
52: and limited broadcast via necessary interface
53: without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
54: This hack is not just for fun, it allows
55: vic,vat and friends to work.
56: They bind socket to loopback, set ttl to zero
57: and expect that it will work.
58: From the viewpoint of routing cache they are broken,
59: because we are not allowed to build multicast path
60: with loopback source addr (look, routing cache
61: cannot know, that ttl is zero, so that packet
62: will not leave this host and route is valid).
63: Luckily, this hack is good workaround.
64: */
65:
66: fl.oif = dev_out->ifindex;
67: goto make_route;
68: }
69:
70: if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
71: /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
72: dev_out = ip_dev_find(net, oldflp->fl4_src);
73: if (dev_out == NULL)
74: goto out;
/* Only the existence of the device mattered; release it again. */
75: dev_put(dev_out);
76: dev_out = NULL;
77: }
78: }
79:
80:
/* An output interface was requested: look the device up by index. */
81: if (oldflp->oif) {
82: dev_out = dev_get_by_index(net, oldflp->oif);
83: err = -ENODEV;
84: if (dev_out == NULL)
85: goto out;
86:
87: /* RACE: Check return value of inet_select_addr instead. */
88: if (__in_dev_get_rtnl(dev_out) == NULL) {
89: dev_put(dev_out);
90: goto out; /* Wrong error code */
91: }
92:
93: if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
94: oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
95: if (!fl.fl4_src)
96: fl.fl4_src = inet_select_addr(dev_out, 0,
97: RT_SCOPE_LINK);
98: goto make_route;
99: }
100: if (!fl.fl4_src) {
101: if (ipv4_is_multicast(oldflp->fl4_dst))
102: fl.fl4_src = inet_select_addr(dev_out, 0,
103: fl.fl4_scope);
104: else if (!oldflp->fl4_dst)
105: fl.fl4_src = inet_select_addr(dev_out, 0,
106: RT_SCOPE_HOST);
107: }
108: }
109:
/* No destination: use the source address, or loopback if that is also
 * unset, and route through the loopback device as a local route. */
110: if (!fl.fl4_dst) {
111: fl.fl4_dst = fl.fl4_src;
112: if (!fl.fl4_dst)
113: fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
114: if (dev_out)
115: dev_put(dev_out);
116: dev_out = net->loopback_dev;
117: dev_hold(dev_out);
118: fl.oif = net->loopback_dev->ifindex;
119: res.type = RTN_LOCAL;
120: flags |= RTCF_LOCAL;
121: goto make_route;
122: }
123:
/* A non-zero return from fib_lookup() means no route was found. */
124: if (fib_lookup(net, &fl, &res)) {
125: res.fi = NULL;
126: if (oldflp->oif) {
127: /* Apparently, routing tables are wrong. Assume,
128: that the destination is on link.
129:
130: WHY? DW.
131: Because we are allowed to send to iface
132: even if it has NO routes and NO assigned
133: addresses. When oif is specified, routing
134: tables are looked up with only one purpose:
135: to catch if destination is gatewayed, rather than
136: direct. Moreover, if MSG_DONTROUTE is set,
137: we send packet, ignoring both routing tables
138: and ifaddr state. --ANK
139:
140:
141: We could make it even if oif is unknown,
142: likely IPv6, but we do not.
143: */
144:
145: if (fl.fl4_src == 0)
146: fl.fl4_src = inet_select_addr(dev_out, 0,
147: RT_SCOPE_LINK);
148: res.type = RTN_UNICAST;
149: goto make_route;
150: }
151: if (dev_out)
152: dev_put(dev_out);
153: err = -ENETUNREACH;
154: goto out;
155: }
/* Lookup succeeded: res must be released after make_route. */
156: free_res = 1;
157:
/* Destination is local: switch to the loopback device and drop the
 * fib_info reference, leaving res.fi NULL. */
158: if (res.type == RTN_LOCAL) {
159: if (!fl.fl4_src)
160: fl.fl4_src = fl.fl4_dst;
161: if (dev_out)
162: dev_put(dev_out);
163: dev_out = net->loopback_dev;
164: dev_hold(dev_out);
165: fl.oif = dev_out->ifindex;
166: if (res.fi)
167: fib_info_put(res.fi);
168: res.fi = NULL;
169: flags |= RTCF_LOCAL;
170: goto make_route;
171: }
172:
/* With multipath configured, the "else" below binds to the following
 * "if": default selection runs only when multipath selection did not. */
173: #ifdef CONFIG_IP_ROUTE_MULTIPATH
174: if (res.fi->fib_nhs > 1 && fl.oif == 0)
175: fib_select_multipath(&fl, &res);
176: else
177: #endif
178: if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
179: fib_select_default(net, &fl, &res);
180:
181: if (!fl.fl4_src)
182: fl.fl4_src = FIB_RES_PREFSRC(res);
183:
/* Replace any device held so far with the one from the lookup result. */
184: if (dev_out)
185: dev_put(dev_out);
186: dev_out = FIB_RES_DEV(res);
187: dev_hold(dev_out);
188: fl.oif = dev_out->ifindex;
189:
190:
191: make_route:
192: err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
193:
194:
/* Release the lookup result and the device reference held here. */
195: if (free_res)
196: fib_res_put(&res);
197: if (dev_out)
198: dev_put(dev_out);
199: out: return err;
200: }
默认网关
选择正确的默认网关由fib_select_default完成,当以下两个条件都满足时它被ip_route_output_slow函数启用: