路由表和查找

路由Hash表的组织(新的内核使用了不同的组织方式,以后再分析)

为了能使各种操作快速查找到相关信息,Linux中采用了几种不同的Hash表,这些表都指向用于描述路由的相同的数据结构

  • 一组基于网络掩码长度来访问路由的hash表
  • 一组直接搜索fib_info结构的hash表
  • 一个以设备为索引,可以快速搜索配置路由的下一跳的hash表
  • 一个以设备和一条路由为索引,能够快速识别该路由下一跳所用网关的hash表

路由是由多个不同的数据结构的组合来描述的,每个数据结构代表着路由信息的不同部分。之所以把定义一条路由所需的信息拆分到多个数据结构中,是因为多条路由之间往往只有少数几个字段不同。路由子系统不是维护一个庞大臃肿的结构,而是将路由分散为多个片段,这样更容易在相似的路由间共享通用的信息,也便于划分出不同的函数,并在这些函数之间定义更加清晰的接口。

两个默认路由表

ip_fib_local_table,内核将到本地地址的路由放在该表中,包括到相关的子网网络地址及子网广播地址的路由。用户不能够直接配置该路由表

ip_fib_main_table,所有其他的路由表项都放在该表内

路由表初始化

   /*
    * fib_hash_table - allocate and initialise one hash-based routing table.
    * @id: value stored in the new table's tb_id field.
    *
    * A single kmalloc() covers struct fib_table plus one struct fn_hash.
    * The table's operation pointers (lookup, insert, delete, flush,
    * select_default, dump) are set to the fn_hash_* implementations, and
    * sizeof(struct fn_hash) bytes starting at tb->tb_data are zeroed.
    *
    * Returns the new table, or NULL when the allocation fails.
    */
   1:  struct fib_table *fib_hash_table(u32 id)
   2:  {
   3:      struct fib_table *tb;
   4:   
           /* NOTE(review): the extra fn_hash bytes are presumably the storage
            * behind tb->tb_data -- confirm against struct fib_table. */
   5:      tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
   6:               GFP_KERNEL);
   7:      if (tb == NULL)
   8:          return NULL;
   9:   
  10:      tb->tb_id = id;
           /* NOTE(review): -1 looks like "no default route selected yet" -- confirm. */
  11:      tb->tb_default = -1;
  12:      tb->tb_lookup = fn_hash_lookup;
  13:      tb->tb_insert = fn_hash_insert;
  14:      tb->tb_delete = fn_hash_delete;
  15:      tb->tb_flush = fn_hash_flush;
  16:      tb->tb_select_default = fn_hash_select_default;
  17:      tb->tb_dump = fn_hash_dump;
           /* kmalloc() does not clear memory, so zero the fn_hash area explicitly. */
  18:      memset(tb->tb_data, 0, sizeof(struct fn_hash));
  19:      return tb;
  20:  }

路由的添加和删除

  • 当在路由表中添加或删除一条路由时,构造搜索关键字,用它来查找一个fib_node和一个fib_alias。这些查找与路由数据封包所走的流程类似,但目的不同:这里的目的是为了检查待添加的路由是否已经存在,或检查待删除的路由是否还存在
  • 生成(插入时)及清理(删除时)相应的hash表
  • 必要时刷新路由缓存表
  • 生成一条Netlink广播通知,告诉感兴趣的模块:从一个路由表中添加或删除了一条路由

路由表查找

   /*
    * fn_hash_lookup - look up a flow's destination in one routing table.
    * @tb:  table to search; its tb_data area is used as a struct fn_hash.
    * @flp: search key; only flp->fl4_dst is used directly here, the whole
    *       key is passed on to fib_semantic_match().
    * @res: result structure handed to fib_semantic_match().
    *
    * Walks every zone on t->fn_zone_list while holding fib_hash_lock for
    * reading. In each zone the destination is turned into a key with
    * fz_key(), the bucket is chosen with fn_hash(), and every node whose
    * fn_key equals that key is passed to fib_semantic_match().
    *
    * Returns the first fib_semantic_match() result that is <= 0, or 1 when
    * no node produced such a result.
    */
   1:  static int
   2:  fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
   3:  {
   4:      int err;
   5:      struct fn_zone *fz;
   6:      struct fn_hash *t = (struct fn_hash *)tb->tb_data;
   7:   
   8:      read_lock(&fib_hash_lock);
           /* NOTE(review): the zone list is presumably ordered from longest to
            * shortest prefix, so the first hit is the most specific -- confirm. */
   9:      for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {
  10:          struct hlist_head *head;
  11:          struct hlist_node *node;
  12:          struct fib_node *f;
               /* Key derived from the destination address for this zone. */
  13:          __be32 k = fz_key(flp->fl4_dst, fz);
  14:   
  15:          head = &fz->fz_hash[fn_hash(k, fz)];
  16:          hlist_for_each_entry(f, node, head, fn_hash) {
                   /* Same bucket but a different key: not a candidate. */
  17:              if (f->fn_key != k)
  18:                  continue;
  19:   
  20:              err = fib_semantic_match(&f->fn_alias,
  21:                           flp, res,
  22:                           fz->fz_order);
                   /* <= 0 ends the search; a positive value keeps scanning. */
  23:              if (err <= 0)
  24:                  goto out;
  25:          }
  26:      }
           /* Every zone was scanned without a terminating result. */
  27:      err = 1;
  28:  out:
  29:      read_unlock(&fib_hash_lock);
  30:      return err;
  31:  }

tb,搜索的路由表。因为fn_hash_lookup是一个通用的查找函数,每次只能够查找一张表。调用者根据是否支持策略路由及其他相关因素,来决定搜索哪些路由表

flp,搜索关键字

res,查找成功时,用路由信息初始化res

可能的返回值:0,成功;1,失败,没有与搜索关键字匹配的路由项;小于0,管理性失败,即查找没有成功,因为查找到的路由不可用(没有价值)

拒绝路由的标准

  • TOS不匹配,注意,当路由项没有配置TOS值时,可以路由任意TOS值的封包

  • scope比搜索关键字中指定的更窄的路由项
   /*
    * fib_props - constant per-route-type properties.
    *
    * The array has RTN_MAX + 1 entries; the trailing comment on each entry
    * names the RTN_* route type it describes. For each type:
    *   error - 0, or a negative errno value (the entries for RTN_BLACKHOLE
    *           through RTN_XRESOLVE are non-zero);
    *   scope - an RT_SCOPE_* value associated with that type.
    *
    * NOTE(review): presumably indexed by the RTN_* value, with error
    * returned by lookups that hit such a route and scope used to validate
    * routes -- the code that reads this table is not part of this listing.
    */
   1:  static const struct
   2:  {
   3:      int    error;
   4:      u8    scope;
   5:  } fib_props[RTN_MAX + 1] = {
   6:      {
   7:          .error    = 0,
   8:          .scope    = RT_SCOPE_NOWHERE,
   9:      },    /* RTN_UNSPEC */
  10:      {
  11:          .error    = 0,
  12:          .scope    = RT_SCOPE_UNIVERSE,
  13:      },    /* RTN_UNICAST */
  14:      {
  15:          .error    = 0,
  16:          .scope    = RT_SCOPE_HOST,
  17:      },    /* RTN_LOCAL */
  18:      {
  19:          .error    = 0,
  20:          .scope    = RT_SCOPE_LINK,
  21:      },    /* RTN_BROADCAST */
  22:      {
  23:          .error    = 0,
  24:          .scope    = RT_SCOPE_LINK,
  25:      },    /* RTN_ANYCAST */
  26:      {
  27:          .error    = 0,
  28:          .scope    = RT_SCOPE_UNIVERSE,
  29:      },    /* RTN_MULTICAST */
           /* From here on every type carries a negative errno. */
  30:      {
  31:          .error    = -EINVAL,
  32:          .scope    = RT_SCOPE_UNIVERSE,
  33:      },    /* RTN_BLACKHOLE */
  34:      {
  35:          .error    = -EHOSTUNREACH,
  36:          .scope    = RT_SCOPE_UNIVERSE,
  37:      },    /* RTN_UNREACHABLE */
  38:      {
  39:          .error    = -EACCES,
  40:          .scope    = RT_SCOPE_UNIVERSE,
  41:      },    /* RTN_PROHIBIT */
  42:      {
  43:          .error    = -EAGAIN,
  44:          .scope    = RT_SCOPE_UNIVERSE,
  45:      },    /* RTN_THROW */
  46:      {
  47:          .error    = -EINVAL,
  48:          .scope    = RT_SCOPE_NOWHERE,
  49:      },    /* RTN_NAT */
  50:      {
  51:          .error    = -EINVAL,
  52:          .scope    = RT_SCOPE_NOWHERE,
  53:      },    /* RTN_XRESOLVE */
  54:  };

输入和输出函数

   /*
    * ip_route_input_slow - route a received packet via a full FIB lookup.
    * @skb:   received packet; skb->mark and skb->protocol are read here.
    * @daddr: destination address of the packet.
    * @saddr: source address of the packet.
    * @tos:   TOS value used in the lookup key.
    * @dev:   device the packet arrived on.
    *
    * Outline of the visible control flow:
    *   - no in_device on @dev: return -EINVAL;
    *   - multicast, limited-broadcast, loopback or (non-broadcast case)
    *     zero-net source: martian_source, which ends in -EINVAL;
    *   - limited-broadcast destination, or both addresses zero: brd_input;
    *   - limited-broadcast, zero-net or loopback destination:
    *     martian_destination, which ends in -EHOSTUNREACH;
    *   - fib_lookup() failure: -EHOSTUNREACH when forwarding is off on the
    *     input device, otherwise no_route, which inserts an RTN_UNREACHABLE
    *     cache entry whose input handler is ip_error;
    *   - RTN_BROADCAST result: brd_input; RTN_LOCAL result: local_input;
    *   - anything else must be RTN_UNICAST with forwarding enabled and is
    *     handed to ip_mkroute_input().
    *
    * Every exit through done drops the in_device reference and, when the
    * lookup succeeded (free_res set), releases res with fib_res_put().
    *
    * Returns the err value set on the path taken: the result of
    * ip_mkroute_input() or rt_intern_hash(), or one of the negative errno
    * values assigned at the labels below.
    */
   1:  static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
   2:                     u8 tos, struct net_device *dev)
   3:  {
   4:      struct fib_result res;
   5:      struct in_device *in_dev = in_dev_get(dev);
           /* Lookup key: addresses, TOS, universe scope, packet mark, input ifindex. */
   6:      struct flowi fl = { .nl_u = { .ip4_u =
   7:                        { .daddr = daddr,
   8:                      .saddr = saddr,
   9:                      .tos = tos,
  10:                      .scope = RT_SCOPE_UNIVERSE,
  11:                        } },
  12:                  .mark = skb->mark,
  13:                  .iif = dev->ifindex };
  14:      unsigned    flags = 0;
  15:      u32        itag = 0;
  16:      struct rtable * rth;
  17:      unsigned    hash;
  18:      __be32        spec_dst;
  19:      int        err = -EINVAL;
  20:      int        free_res = 0;
  21:      struct net    * net = dev_net(dev);
  22:   
  23:      /* IP on this device is disabled. */
  24:   
  25:      if (!in_dev)
  26:          goto out;
  27:   
  28:      /* Check for the most weird martians, which can be not detected
  29:         by fib_lookup.
  30:       */
  31:   
  32:      if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
  33:          ipv4_is_loopback(saddr))
  34:          goto martian_source;
  35:   
           /* Limited broadcast, or both addresses zero: treat as broadcast input
            * without consulting the FIB. */
  36:      if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
  37:          goto brd_input;
  38:   
  39:      /* Accept zero addresses only to limited broadcast;
  40:       * I even do not know to fix it or not. Waiting for complains :-)
  41:       */
  42:      if (ipv4_is_zeronet(saddr))
  43:          goto martian_source;
  44:   
  45:      if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
  46:          ipv4_is_loopback(daddr))
  47:          goto martian_destination;
  48:   
  49:      /*
  50:       *    Now we are ready to route packet.
  51:       */
  52:      if ((err = fib_lookup(net, &fl, &res)) != 0) {
               /* Lookup failed: err keeps the lookup's code for no_route. */
  53:          if (!IN_DEV_FORWARD(in_dev))
  54:              goto e_hostunreach;
  55:          goto no_route;
  56:      }
           /* From here on res must be released with fib_res_put() at done. */
  57:      free_res = 1;
  58:   
  59:      RT_CACHE_STAT_INC(in_slow_tot);
  60:   
  61:      if (res.type == RTN_BROADCAST)
  62:          goto brd_input;
  63:   
  64:      if (res.type == RTN_LOCAL) {
  65:          int result;
               /* Negative result: martian source; non-zero result: RTCF_DIRECTSRC. */
  66:          result = fib_validate_source(saddr, daddr, tos,
  67:                           net->loopback_dev->ifindex,
  68:                           dev, &spec_dst, &itag, skb->mark);
  69:          if (result < 0)
  70:              goto martian_source;
  71:          if (result)
  72:              flags |= RTCF_DIRECTSRC;
  73:          spec_dst = daddr;
  74:          goto local_input;
  75:      }
  76:   
           /* Remaining case is forwarding: needs forwarding enabled and a unicast route. */
  77:      if (!IN_DEV_FORWARD(in_dev))
  78:          goto e_hostunreach;
  79:      if (res.type != RTN_UNICAST)
  80:          goto martian_destination;
  81:   
  82:      err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
       /* Common exit: drop the in_device reference and, if set, the lookup result. */
  83:  done:
  84:      in_dev_put(in_dev);
  85:      if (free_res)
  86:          fib_res_put(&res);
  87:  out:    return err;
  88:   
  89:  brd_input:
           /* Broadcast input is accepted only for ETH_P_IP frames. */
  90:      if (skb->protocol != htons(ETH_P_IP))
  91:          goto e_inval;
  92:   
  93:      if (ipv4_is_zeronet(saddr))
  94:          spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
  95:      else {
  96:          err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
  97:                        &itag, skb->mark);
  98:          if (err < 0)
  99:              goto martian_source;
 100:          if (err)
 101:              flags |= RTCF_DIRECTSRC;
 102:      }
 103:      flags |= RTCF_BROADCAST;
 104:      res.type = RTN_BROADCAST;
 105:      RT_CACHE_STAT_INC(in_brd);
 106:   
       /* Builds the cache entry; reached by fall-through from brd_input and by
        * goto from the RTN_LOCAL branch and from no_route. */
 107:  local_input:
 108:      rth = dst_alloc(&ipv4_dst_ops);
 109:      if (!rth)
 110:          goto e_nobufs;
 111:   
 112:      rth->u.dst.output= ip_rt_bug;
 113:      rth->rt_genid = rt_genid(net);
 114:   
 115:      atomic_set(&rth->u.dst.__refcnt, 1);
 116:      rth->u.dst.flags= DST_HOST;
 117:      if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
 118:          rth->u.dst.flags |= DST_NOPOLICY;
 119:      rth->fl.fl4_dst    = daddr;
 120:      rth->rt_dst    = daddr;
 121:      rth->fl.fl4_tos    = tos;
 122:      rth->fl.mark    = skb->mark;
 123:      rth->fl.fl4_src    = saddr;
 124:      rth->rt_src    = saddr;
 125:  #ifdef CONFIG_NET_CLS_ROUTE
 126:      rth->u.dst.tclassid = itag;
 127:  #endif
 128:      rth->rt_iif    =
 129:      rth->fl.iif    = dev->ifindex;
           /* The entry's device is the namespace loopback device; take references
            * on it and on its in_device. */
 130:      rth->u.dst.dev    = net->loopback_dev;
 131:      dev_hold(rth->u.dst.dev);
 132:      rth->idev    = in_dev_get(rth->u.dst.dev);
 133:      rth->rt_gateway    = daddr;
 134:      rth->rt_spec_dst= spec_dst;
 135:      rth->u.dst.input= ip_local_deliver;
 136:      rth->rt_flags     = flags|RTCF_LOCAL;
           /* Set only by no_route: packets hitting this entry go to ip_error,
            * and the negated err is stored as the dst error. */
 137:      if (res.type == RTN_UNREACHABLE) {
 138:          rth->u.dst.input= ip_error;
 139:          rth->u.dst.error= -err;
 140:          rth->rt_flags     &= ~RTCF_LOCAL;
 141:      }
 142:      rth->rt_type    = res.type;
 143:      hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
 144:      err = rt_intern_hash(hash, rth, NULL, skb);
 145:      goto done;
 146:   
 147:  no_route:
 148:      RT_CACHE_STAT_INC(in_no_route);
 149:      spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
 150:      res.type = RTN_UNREACHABLE;
           /* -ESRCH from the lookup is reported as -ENETUNREACH. */
 151:      if (err == -ESRCH)
 152:          err = -ENETUNREACH;
 153:      goto local_input;
 154:   
 155:      /*
 156:       *    Do not cache martian addresses: they should be logged (RFC1812)
 157:       */
 158:  martian_destination:
 159:      RT_CACHE_STAT_INC(in_martian_dst);
 160:  #ifdef CONFIG_IP_ROUTE_VERBOSE
 161:      if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
 162:          printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
 163:              &daddr, &saddr, dev->name);
 164:  #endif
 165:   
       /* martian_destination falls through to here. */
 166:  e_hostunreach:
 167:      err = -EHOSTUNREACH;
 168:      goto done;
 169:   
 170:  e_inval:
 171:      err = -EINVAL;
 172:      goto done;
 173:   
 174:  e_nobufs:
 175:      err = -ENOBUFS;
 176:      goto done;
 177:   
 178:  martian_source:
 179:      ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
 180:      goto e_inval;
 181:  }

输出路由

将封包送往本地和转发出去二者都需要执行以下任务,虽然这些任务可能以不同的方式来执行:

  • 从匹配的路由项中选择出使用的出口设备
  • 根据被搜索路由项的scope选择出将要使用的源IP地址
  • 创建及初始化一个新缓存表项,并将它插入到缓存中

在一些情况下,不需要任何路由查找就可以路由一个封包。这主要有以下三种情况:

  • 当搜索关键字没有指定出口设备,且封包目的地为一个多播地址或受限的广播地址时
  • 封包目的地为本地多播地址或受限的广播地址,从一个给定的设备发送出去
  • 封包的目的地址未指定(即目的地址为0)
   /*
    * ip_route_output_slow - resolve an output route via a full FIB lookup.
    * @net:    network namespace; supplies the loopback device and is passed
    *          to the lookup helpers.
    * @rp:     passed through to ip_mkroute_output().
    * @oldflp: caller's search key (addresses, TOS, mark, oif, flags). It is
    *          only read; a local copy fl is built and adjusted instead.
    *
    * Outline of the visible control flow:
    *   - a supplied source address must not be multicast, limited broadcast
    *     or zero-net, and (unless FLOWI_FLAG_ANYSRC is set) must be found by
    *     ip_dev_find();
    *   - a supplied oif must name an existing device that has an in_device;
    *   - three cases skip fib_lookup() and jump straight to make_route:
    *     no oif with a multicast/limited-broadcast destination, a given oif
    *     with a local-multicast/limited-broadcast destination, and a zero
    *     destination (routed to the loopback device);
    *   - otherwise fib_lookup() decides; on failure the packet is still
    *     sent when an oif was given, else -ENETUNREACH is returned.
    *
    * dev_out carries a device reference that is dropped after
    * ip_mkroute_output(); res is released there too when free_res is set.
    *
    * Returns the result of ip_mkroute_output(), or -EINVAL, -ENODEV or
    * -ENETUNREACH on the early-exit paths.
    */
   1:  static int ip_route_output_slow(struct net *net, struct rtable **rp,
   2:                  const struct flowi *oldflp)
   3:  {
   4:      u32 tos    = RT_FL_TOS(oldflp);
           /* Working copy of the key: scope is link when RTO_ONLINK is set in
            * tos, universe otherwise; iif is the loopback ifindex. */
   5:      struct flowi fl = { .nl_u = { .ip4_u =
   6:                        { .daddr = oldflp->fl4_dst,
   7:                      .saddr = oldflp->fl4_src,
   8:                      .tos = tos & IPTOS_RT_MASK,
   9:                      .scope = ((tos & RTO_ONLINK) ?
  10:                            RT_SCOPE_LINK :
  11:                            RT_SCOPE_UNIVERSE),
  12:                        } },
  13:                  .mark = oldflp->mark,
  14:                  .iif = net->loopback_dev->ifindex,
  15:                  .oif = oldflp->oif };
  16:      struct fib_result res;
  17:      unsigned flags = 0;
  18:      struct net_device *dev_out = NULL;
  19:      int free_res = 0;
  20:      int err;
  21:   
  22:   
  23:      res.fi        = NULL;
  24:  #ifdef CONFIG_IP_MULTIPLE_TABLES
  25:      res.r        = NULL;
  26:  #endif
  27:   
           /* The caller supplied a source address: sanity-check it first. */
  28:      if (oldflp->fl4_src) {
  29:          err = -EINVAL;
  30:          if (ipv4_is_multicast(oldflp->fl4_src) ||
  31:              ipv4_is_lbcast(oldflp->fl4_src) ||
  32:              ipv4_is_zeronet(oldflp->fl4_src))
  33:              goto out;
  34:   
  35:          /* I removed check for oif == dev_out->oif here.
  36:             It was wrong for two reasons:
  37:             1. ip_dev_find(net, saddr) can return wrong iface, if saddr
  38:                is assigned to multiple interfaces.
  39:             2. Moreover, we are allowed to send packets with saddr
  40:                of another iface. --ANK
  41:           */
  42:   
  43:          if (oldflp->oif == 0
  44:              && (ipv4_is_multicast(oldflp->fl4_dst) ||
  45:              oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
  46:              /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
  47:              dev_out = ip_dev_find(net, oldflp->fl4_src);
  48:              if (dev_out == NULL)
  49:                  goto out;
  50:   
  51:              /* Special hack: user can direct multicasts
  52:                 and limited broadcast via necessary interface
  53:                 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
  54:                 This hack is not just for fun, it allows
  55:                 vic,vat and friends to work.
  56:                 They bind socket to loopback, set ttl to zero
  57:                 and expect that it will work.
  58:                 From the viewpoint of routing cache they are broken,
  59:                 because we are not allowed to build multicast path
  60:                 with loopback source addr (look, routing cache
  61:                 cannot know, that ttl is zero, so that packet
  62:                 will not leave this host and route is valid).
  63:                 Luckily, this hack is good workaround.
  64:               */
  65:   
  66:              fl.oif = dev_out->ifindex;
  67:              goto make_route;
  68:          }
  69:   
  70:          if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
  71:              /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
  72:              dev_out = ip_dev_find(net, oldflp->fl4_src);
  73:              if (dev_out == NULL)
  74:                  goto out;
                   /* Only existence mattered: drop the device again. */
  75:              dev_put(dev_out);
  76:              dev_out = NULL;
  77:          }
  78:      }
  79:   
  80:   
           /* The caller pinned an output interface. */
  81:      if (oldflp->oif) {
  82:          dev_out = dev_get_by_index(net, oldflp->oif);
  83:          err = -ENODEV;
  84:          if (dev_out == NULL)
  85:              goto out;
  86:   
  87:          /* RACE: Check return value of inet_select_addr instead. */
  88:          if (__in_dev_get_rtnl(dev_out) == NULL) {
  89:              dev_put(dev_out);
  90:              goto out;    /* Wrong error code */
  91:          }
  92:   
  93:          if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
  94:              oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
  95:              if (!fl.fl4_src)
  96:                  fl.fl4_src = inet_select_addr(dev_out, 0,
  97:                                    RT_SCOPE_LINK);
  98:              goto make_route;
  99:          }
               /* No source yet: pick one from dev_out for multicast or zero destinations. */
 100:          if (!fl.fl4_src) {
 101:              if (ipv4_is_multicast(oldflp->fl4_dst))
 102:                  fl.fl4_src = inet_select_addr(dev_out, 0,
 103:                                    fl.fl4_scope);
 104:              else if (!oldflp->fl4_dst)
 105:                  fl.fl4_src = inet_select_addr(dev_out, 0,
 106:                                    RT_SCOPE_HOST);
 107:          }
 108:      }
 109:   
           /* No destination given: use the source address (or INADDR_LOOPBACK when
            * that is unset too) and route through the loopback device. */
 110:      if (!fl.fl4_dst) {
 111:          fl.fl4_dst = fl.fl4_src;
 112:          if (!fl.fl4_dst)
 113:              fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
 114:          if (dev_out)
 115:              dev_put(dev_out);
 116:          dev_out = net->loopback_dev;
 117:          dev_hold(dev_out);
 118:          fl.oif = net->loopback_dev->ifindex;
 119:          res.type = RTN_LOCAL;
 120:          flags |= RTCF_LOCAL;
 121:          goto make_route;
 122:      }
 123:   
           /* A non-zero return from fib_lookup() is treated as lookup failure. */
 124:      if (fib_lookup(net, &fl, &res)) {
 125:          res.fi = NULL;
 126:          if (oldflp->oif) {
 127:              /* Apparently, routing tables are wrong. Assume,
 128:                 that the destination is on link.
 129:  
 130:                 WHY? DW.
 131:                 Because we are allowed to send to iface
 132:                 even if it has NO routes and NO assigned
 133:                 addresses. When oif is specified, routing
 134:                 tables are looked up with only one purpose:
 135:                 to catch if destination is gatewayed, rather than
 136:                 direct. Moreover, if MSG_DONTROUTE is set,
 137:                 we send packet, ignoring both routing tables
 138:                 and ifaddr state. --ANK
 139:  
 140:  
 141:                 We could make it even if oif is unknown,
 142:                 likely IPv6, but we do not.
 143:               */
 144:   
 145:              if (fl.fl4_src == 0)
 146:                  fl.fl4_src = inet_select_addr(dev_out, 0,
 147:                                    RT_SCOPE_LINK);
 148:              res.type = RTN_UNICAST;
 149:              goto make_route;
 150:          }
 151:          if (dev_out)
 152:              dev_put(dev_out);
 153:          err = -ENETUNREACH;
 154:          goto out;
 155:      }
           /* Lookup succeeded: res is released with fib_res_put() after make_route. */
 156:      free_res = 1;
 157:   
           /* Local destination: switch to the loopback device and drop res.fi. */
 158:      if (res.type == RTN_LOCAL) {
 159:          if (!fl.fl4_src)
 160:              fl.fl4_src = fl.fl4_dst;
 161:          if (dev_out)
 162:              dev_put(dev_out);
 163:          dev_out = net->loopback_dev;
 164:          dev_hold(dev_out);
 165:          fl.oif = dev_out->ifindex;
 166:          if (res.fi)
 167:              fib_info_put(res.fi);
 168:          res.fi = NULL;
 169:          flags |= RTCF_LOCAL;
 170:          goto make_route;
 171:      }
 172:   
           /* With no oif pinned: choose among several next hops, or pick a default
            * gateway when the match is a /0 unicast route. */
 173:  #ifdef CONFIG_IP_ROUTE_MULTIPATH
 174:      if (res.fi->fib_nhs > 1 && fl.oif == 0)
 175:          fib_select_multipath(&fl, &res);
 176:      else
 177:  #endif
 178:      if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
 179:          fib_select_default(net, &fl, &res);
 180:   
 181:      if (!fl.fl4_src)
 182:          fl.fl4_src = FIB_RES_PREFSRC(res);
 183:   
           /* The device of the selected route replaces any device held so far. */
 184:      if (dev_out)
 185:          dev_put(dev_out);
 186:      dev_out = FIB_RES_DEV(res);
 187:      dev_hold(dev_out);
 188:      fl.oif = dev_out->ifindex;
 189:   
 190:   
       /* Common tail: build the route, then release the lookup result and device. */
 191:  make_route:
 192:      err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
 193:   
 194:   
 195:      if (free_res)
 196:          fib_res_put(&res);
 197:      if (dev_out)
 198:          dev_put(dev_out);
 199:  out:    return err;
 200:  }

默认网关

选择正确的默认网关由fib_select_default完成。当以下两个条件都满足,并且搜索关键字没有指定出口设备(fl.oif为0)时,ip_route_output_slow函数会调用它:

  • fib_lookup返回的路由项子网掩码为/0
  • fib_lookup返回的路由项的类型为RTN_UNICAST

你可能感兴趣的:(路由表和查找)