linux内核 路由缓存表之创建

3.2 创建输入路由缓存
        ip_mkroute_input

功能:

        (1)创建输入路由缓存项

        (2)生成hash值,插入缓存链表中
/*
 * ip_mkroute_input - create an input route cache entry and insert it into
 * the route cache hash table.
 *
 * @skb:    packet being routed; the resulting route is attached to it by
 *          rt_intern_hash() because no result pointer is passed
 * @res:    FIB lookup result the entry is built from
 * @fl:     flow key of the lookup (oif and iif are read here)
 * @in_dev: inet device the packet arrived on
 * @daddr:  destination address
 * @saddr:  source address
 * @tos:    type of service
 *
 * Returns the error from __mkroute_input() when the entry cannot be
 * created, otherwise whatever rt_intern_hash() returns.
 */
static int ip_mkroute_input(struct sk_buff *skb,
                            struct fib_result *res,
                            const struct flowi *fl,
                            struct in_device *in_dev,
                            __be32 daddr, __be32 saddr, u32 tos)
{
        struct rtable *entry = NULL;
        unsigned bucket;
        int rc;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
        /* Several next hops and no fixed output interface: pick one. */
        if (fl->oif == 0 && res->fi && res->fi->fib_nhs > 1) {
                DEBUG_V4Route("%s-->Multipath fib_nhs:%d\n",
                              __FUNCTION__, res->fi->fib_nhs);
                fib_select_multipath(fl, res);
        }
#endif

        /* Step 1: build the route cache entry. */
        rc = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &entry);
        if (rc != 0)
                return rc;

#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        /*
         * Policy-route support: for packets that carry conntrack state and
         * travel in the original direction, record the service on the entry.
         */
        if ((FW_PR_CONFIG) && skb->nfct) {
                DEBUG_V4Route("%s-->CTINFO2DIR(skb->nfctinfo):%d\n",
                              __FUNCTION__, CTINFO2DIR(skb->nfctinfo));
                if (CTINFO2DIR(skb->nfctinfo) == IP_CT_DIR_ORIGINAL)
                        rt_service_set(entry, skb);
        }
#endif

        if (IS_IPV4_DEBUG_ROUTE)
                ipv4_route_dump(skb, entry);

        /* Step 2: compute the bucket and insert the entry into the cache. */
        bucket = rt_hash(daddr, saddr, fl->iif,
                         rt_genid(dev_net(entry->u.dst.dev)));
        return rt_intern_hash(bucket, entry, NULL, skb);
}

3.2.1 创建输入路由缓存项
        __mkroute_input
功能:
        (1)路由合法性检查
        (2)创建输入路由缓存项并完成初始化,通过 result 参数返回
程序流程图
linux内核 路由缓存表之创建_第1张图片
代码
/*
 * __mkroute_input - validate the packet source and build an input route
 * cache entry for it.
 *
 * @skb:    packet being routed (protocol and mark are read)
 * @res:    FIB lookup result supplying the output device, oif and gateway
 * @in_dev: inet device the packet arrived on
 * @daddr:  destination address
 * @saddr:  source address
 * @tos:    type of service
 * @result: on success receives the newly allocated entry
 *
 * A working reference on the output inet device is taken at the start and
 * released on every path that goes through the cleanup label.
 *
 * Returns 0 on success, -EINVAL when the output device cannot be obtained,
 * the source fails validation, or a non-IP packet would leave through the
 * interface it came in on, and -ENOBUFS when the entry cannot be allocated.
 */
static int __mkroute_input(struct sk_buff *skb,
                           struct fib_result *res,
                           struct in_device *in_dev,
                           __be32 daddr, __be32 saddr, u32 tos,
                           struct rtable **result)
{
        struct in_device *out_dev;
        struct rtable *entry;
        unsigned cache_flags = 0;
        __be32 spec_dst;
        u32 itag;
        int rc;

        /* Take a working reference to the output device. */
        out_dev = in_dev_get(FIB_RES_DEV(*res));
        if (!out_dev) {
                if (net_ratelimit()) {
                        printk(KERN_CRIT "Bug in ip_route_input"
                               "_slow(). Please, report\n");
                }
                /* Nothing was acquired, so there is nothing to release. */
                return -EINVAL;
        }

        /*
         * Source validation. A saddr->daddr route has already been found by
         * the caller; per the original author's note this checks that the
         * reverse daddr->saddr route exists, otherwise the source is
         * treated as invalid.
         */
        rc = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
                                 in_dev->dev, &spec_dst, &itag, skb->mark);
        if (rc < 0) {
                ip_handle_martian_source(in_dev->dev, in_dev, skb,
                                         daddr, saddr);
                rc = -EINVAL;
                goto cleanup;
        }

        if (rc) {
                /* Positive result: flag the source as directly connected. */
                cache_flags |= RTCF_DIRECTSRC;

                /* Same interface in and out: a redirect may be appropriate. */
                if (out_dev == in_dev &&
                    (IN_DEV_SHARED_MEDIA(out_dev) ||
                     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
                        cache_flags |= RTCF_DOREDIRECT;
        }

        /*
         * Not IP (i.e. ARP). Do not create route, if it is invalid for
         * proxy arp. DNAT routes are always valid.
         */
        if (skb->protocol != htons(ETH_P_IP) && out_dev == in_dev) {
                rc = -EINVAL;
                goto cleanup;
        }

        /* Allocate the new route cache entry. */
        entry = dst_alloc(&ipv4_dst_ops);
        if (entry == NULL) {
                rc = -ENOBUFS;
                goto cleanup;
        }

        /* Generic dst part: one reference, host route, per-device policy. */
        atomic_set(&entry->u.dst.__refcnt, 1);
        entry->u.dst.flags = DST_HOST;
        if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
                entry->u.dst.flags |= DST_NOPOLICY;
        if (IN_DEV_CONF_GET(out_dev, NOXFRM))
                entry->u.dst.flags |= DST_NOXFRM;

        /* Lookup key of the entry. */
        entry->fl.fl4_dst = daddr;
        entry->fl.fl4_src = saddr;
        entry->fl.fl4_tos = tos;
        entry->fl.mark = skb->mark;
        entry->fl.iif = in_dev->dev->ifindex;
        entry->fl.oif = 0;

        /* Route attributes. */
        entry->rt_dst = daddr;
        entry->rt_src = saddr;
        entry->rt_gateway = daddr;
        entry->rt_iif = entry->fl.iif;
        entry->rt_spec_dst = spec_dst;

        /* Output device: hold the net_device and its inet device. */
        entry->u.dst.dev = out_dev->dev;
        dev_hold(entry->u.dst.dev);
        entry->idev = in_dev_get(entry->u.dst.dev);

        /* Packets using this entry are forwarded and sent with ip_output. */
        entry->u.dst.input = ip_forward;
        entry->u.dst.output = ip_output;
        entry->rt_genid = rt_genid(dev_net(entry->u.dst.dev));

        rt_set_nexthop(entry, res, itag);

        entry->rt_flags = cache_flags;

#if defined(CONFIG_MV_ETH_NFP_FIB_LEARN)
        /* Mark the entry when the NFP hook accepts a rule for it. */
        entry->nfp = false;
        if (!(entry->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST |
                                 RTCF_LOCAL | RTCF_REJECT)) &&
            !nfp_hook_fib_rule_add(AF_INET, (u8 *)(&entry->rt_src),
                                   (u8 *)(&entry->rt_dst),
                                   (u8 *)(&entry->rt_gateway),
                                   entry->rt_iif,
                                   entry->u.dst.dev->ifindex))
                entry->nfp = true;
#endif /* CONFIG_MV_ETH_NFP_FIB_LEARN */

        *result = entry;

#ifdef CONFIG_MV_ETH_NFP
        if (!fp_disable_flag &&
            !(entry->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST |
                                 RTCF_LOCAL | RTCF_REJECT))) {
                fp_routing_info_set(entry->rt_src, entry->rt_dst,
                                    entry->rt_gateway, entry->rt_iif,
                                    entry->u.dst.dev->ifindex);
        }
#endif /* CONFIG_MV_ETH_NFP */

        rc = 0;

cleanup:
        /* Release the working reference to the output device. */
        in_dev_put(out_dev);
        return rc;
}

3.2.2 插入缓存链表
rt_intern_hash
功能
        (1)查找路由缓存是否已经存在,若存在,则将缓存项放在hash链表的首部,并更新使用时间;
        (2)调用arp_bind_neighbour进行路由缓存项与邻居项的绑定操作(赋值dst->neighbour),将新表项插入到缓存表散列桶的链表首部。

流程图

linux内核 路由缓存表之创建_第2张图片

/*
 * rt_intern_hash - insert a route cache entry into the hash chain selected
 * by @hash, or reuse an equivalent entry that is already cached.
 *
 * If the chain already holds a non-expired entry with the same key and
 * namespace, that entry is moved to the chain head and used, and @rt is
 * dropped. Otherwise @rt is bound to a neighbour entry where applicable and
 * linked at the chain head. When route caching is disabled for the
 * namespace, @rt is handed back without being hashed.
 *
 * The entry finally in use is stored in *@rp when @rp is non-NULL,
 * otherwise it is attached to @skb with skb_dst_set().
 *
 * Returns 0 on success, the arp_bind_neighbour() error when binding fails,
 * or -ENOBUFS when the neighbour table is full and the garbage-collection
 * retry is not available or did not help.
 */
static int rt_intern_hash(unsigned hash, struct rtable *rt,
              struct rtable **rp, struct sk_buff *skb)
              //hash: hash value; rt: route cache entry to insert; skb: packet being routed
              //rp: receives the route cache entry that ends up in use
{
    struct rtable   *rth, **rthp;
    unsigned long   now;
    struct rtable *cand, **candp;
    u32         min_score;
    int     chain_length;
    /* One garbage-collect-and-retry attempt is allowed only outside softirq. */
    int attempts = !in_softirq();

restart:
    chain_length = 0;
    min_score = ~(u32)0;
    cand = NULL;
    candp = NULL;
    now = jiffies;

    if (!rt_caching(dev_net(rt->u.dst.dev))) {
        /*
         * If we're not caching, just tell the caller we
         * were successful and don't touch the route.  The
         * caller hold the sole reference to the cache entry, and
         * it will be released when the caller is done with it.
         * If we drop it here, the callers have no way to resolve routes
         * when we're not caching.  Instead, just point *rp at rt, so
         * the caller gets a single use out of the route
         * Note that we do rt_free on this new route entry, so that
         * once its refcount hits zero, we are still able to reap it
         * (Thanks Alexey)
         * Note also the rt_free uses call_rcu.  We don't actually
         * need rcu protection here, this is just our path to get
         * on the route gc list.
         */

        if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
            int err = arp_bind_neighbour(&rt->u.dst);
            if (err) {
                if (net_ratelimit())
                    printk(KERN_WARNING
                        "Neighbour table failure & not caching routes.\n");
                rt_drop(rt);
                return err;
            }
        }

        rt_free(rt);
        goto skip_hashing;
    }

    rthp = &rt_hash_table[hash].chain;//locate the bucket chain selected by hash

    spin_lock_bh(rt_hash_lock_addr(hash));
    while ((rth = *rthp) != NULL) {//walk the chain of this bucket
        if (rt_is_expired(rth)) {//entry expired?
            *rthp = rth->u.dst.rt_next;//expired: unlink it and free it
            rt_free(rth);
            continue;
        }
        //not expired: check whether it matches the key and namespace of the new route
        if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
            /* Put it first */
            *rthp = rth->u.dst.rt_next;//unlink the match so it can be moved to the chain head
            /*
             * Since lookup is lockfree, the deletion
             * must be visible to another weakly ordered CPU before
             * the insertion at the start of the hash chain.
             */
            rcu_assign_pointer(rth->u.dst.rt_next,
                       rt_hash_table[hash].chain);
            /*
             * Since lookup is lockfree, the update writes
             * must be ordered for consistency on SMP.
             */
            rcu_assign_pointer(rt_hash_table[hash].chain, rth);

            dst_use(&rth->u.dst, now);//mark the existing entry as used at `now` (original note: takes a reference)
            spin_unlock_bh(rt_hash_lock_addr(hash));

            rt_drop(rt);//drop the newly built entry; the existing one is used instead
            if (rp)
                *rp = rth;
            else
                skb_dst_set(skb, &rth->u.dst);
            return 0;
        }

        /*
         * Among unreferenced entries, remember the one with the lowest
         * score (the last one on ties) as the eviction candidate.
         */
        if (!atomic_read(&rth->u.dst.__refcnt)) {
            u32 score = rt_score(rth);

            if (score <= min_score) {
                cand = rth;
                candp = rthp;
                min_score = score;
            }
        }

        chain_length++;

        rthp = &rth->u.dst.rt_next;
    }

    if (cand) {
        /* ip_rt_gc_elasticity used to be average length of chain
         * length, when exceeded gc becomes really aggressive.
         *
         * The second limit is less certain. At the moment it allows
         * only 2 entries per bucket. We will see.
         */
        if (chain_length > ip_rt_gc_elasticity) {
            *candp = cand->u.dst.rt_next;
            rt_free(cand);
        }
    } else {
        if (chain_length > rt_chain_length_max) {//chain too long (too many collisions): rebuild the hash table
            struct net *net = dev_net(rt->u.dst.dev);
            int num = ++net->ipv4.current_rt_cache_rebuild_count;
            if (!rt_caching(dev_net(rt->u.dst.dev))) {
                printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
                    rt->u.dst.dev->name, num);
            }
            /*
             * NOTE(review): this runs with the bucket lock held, and the
             * insertion below continues with the `hash` computed before the
             * rebuild. Presumably the rebuild changes the generation id
             * that went into that hash - verify whether the hash should be
             * recomputed and the insertion restarted.
             */
            rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
        }
    }

    /* Try to bind route to arp only if it is output
       route or unicast forwarding path.
     */
    if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {//unicast forwarding path, or output route (iif == 0)
        int err = arp_bind_neighbour(&rt->u.dst);//bind the neighbour (ARP) entry
        if (err) {//bind failed: for -ENOBUFS, garbage-collect to free memory and start over
            spin_unlock_bh(rt_hash_lock_addr(hash));

            if (err != -ENOBUFS) {
                rt_drop(rt);
                return err;
            }

            /* Neighbour tables are full and nothing
               can be released. Try to shrink route cache,
               it is most likely it holds some neighbour records.
             */
            if (attempts-- > 0) {
                /* Temporarily force the most aggressive gc settings. */
                int saved_elasticity = ip_rt_gc_elasticity;
                int saved_int = ip_rt_gc_min_interval;
                ip_rt_gc_elasticity = 1;
                ip_rt_gc_min_interval   = 0;
                rt_garbage_collect(&ipv4_dst_ops);
                ip_rt_gc_min_interval   = saved_int;
                ip_rt_gc_elasticity = saved_elasticity;
                goto restart;
            }

            if (net_ratelimit())
                printk(KERN_WARNING "Neighbour table overflow.\n");
            rt_drop(rt);
            return -ENOBUFS;
        }
    }

    rt->u.dst.rt_next = rt_hash_table[hash].chain;//link the new entry in front of the current chain head

#if RT_CACHE_DEBUG >= 2
    if (rt->u.dst.rt_next) {
        struct rtable *trt;
        printk(KERN_DEBUG "rt_cache @%02x: %pI4",
               hash, &rt->rt_dst);
        for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
            printk(" . %pI4", &trt->rt_dst);
        printk("\n");
    }
#endif
    /*
     * Since lookup is lockfree, we must make sure
     * previous writes to rt are committed to memory
     * before making rt visible to other CPUS.
     */
    rcu_assign_pointer(rt_hash_table[hash].chain, rt);

    spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
    /* Hand the entry to the caller: through *rp, or by attaching it to skb. */
    if (rp)
        *rp = rt;
    else
        skb_dst_set(skb, &rt->u.dst);
    return 0;
}


3.3 创建输出路由缓存

        ip_mkroute_output
        与输入路由的创建流程类似:先调用 __mkroute_output 创建输出路由缓存项,成功后用 rt_hash 计算 hash 值,再调用 rt_intern_hash 将其插入缓存表,结果通过 rp 返回

/*
 * ip_mkroute_output - create an output route cache entry and insert it into
 * the route cache hash table.
 *
 * @rp:      receives the cached entry (passed through to rt_intern_hash())
 * @res:     FIB lookup result the entry is built from
 * @fl:      flow key used for the lookup
 * @oldflp:  original flow key; its destination, source and oif form the hash
 * @dev_out: output device; its namespace supplies the generation id
 * @flags:   route flags handed to __mkroute_output()
 *
 * Returns the error from __mkroute_output() when creation fails, otherwise
 * whatever rt_intern_hash() returns.
 */
static int ip_mkroute_output(struct rtable **rp,
                 struct fib_result *res,
                 const struct flowi *fl,
                 const struct flowi *oldflp,
                 struct net_device *dev_out,
                 unsigned flags)
{
    struct rtable *entry = NULL;
    unsigned bucket;
    int rc;

    /* Step 1: build the entry; nothing to insert if that fails. */
    rc = __mkroute_output(&entry, res, fl, oldflp, dev_out, flags);
    if (rc != 0)
        return rc;

    /* Step 2: hash on the original flow key and insert. No skb is involved. */
    bucket = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
                     rt_genid(dev_net(dev_out)));
    return rt_intern_hash(bucket, entry, rp, NULL);
}


你可能感兴趣的:(linux,网络协议)