Linux网络协议栈4--bridge收发包

bridge 是linux上的虚拟交换机,具有交换机的功能。

网卡收到包后,走到__netif_receive_skb_core后,剥完vlan找到vlan子接口(如果有的话),如果skb->dev是bridge成员口,就会走到bridge成员口的接收处理函数。

/*
 * Excerpt of the core receive routine; parts not relevant to bridging are
 * elided and marked with "......".  The visible part dispatches the skb to
 * the rx_handler registered on skb->dev, if there is one, and acts on the
 * handler's verdict.
 */
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
......
    /*
     * Per the original note, both bridge ports and OVS ports reach this
     * point.  When a device is added to a bridge as one of its ports,
     * br_add_if() (net/bridge/br_if.c) sets the device's rx_handler to
     * br_handle_frame; calling it here enters the bridge logic.
     */
    rx_handler = rcu_dereference(skb->dev->rx_handler);
    if (rx_handler) {
        /* Deliver to the pending packet-type handler before the rx_handler runs. */
        if (pt_prev) {
            ret = deliver_skb(skb, pt_prev, orig_dev);
            pt_prev = NULL;
        }
        switch (rx_handler(&skb)) {
        case RX_HANDLER_CONSUMED:  /* the frame has been consumed; stop processing */
            ret = NET_RX_SUCCESS;
            goto out;
        case RX_HANDLER_ANOTHER:  /* skb->dev was changed; run the receive path again */
            goto another_round;
        case RX_HANDLER_EXACT: /* deliver only where ptype->dev == skb->dev */
            deliver_exact = true;
            /* no break: continues into RX_HANDLER_PASS */
        case RX_HANDLER_PASS:
            break;
        default:
            BUG();
        }
    }

    ......
}

bridge 的接收处理函数为 br_handle_frame。在为 bridge 添加接口时(例如用 brctl addif 命令将一个接口加入到 bridge 中),就会为该接口的 net_device 挂载此处理函数。

/*
 * Excerpt of br_add_if(); elided parts are marked with "......".
 * The visible part registers br_handle_frame as the rx_handler of dev.
 */
int br_add_if(struct net_bridge *br, struct net_device *dev)
{
    ......
    /*
     * p is declared in the elided part (presumably the new bridge port
     * structure -- confirm against the full source); it is passed as the
     * handler's private data.
     */
    err = netdev_rx_handler_register(dev, br_handle_frame, p);
    if (err)
        goto err4;

    ......
}

/*
 * netdev_rx_handler_register - install a receive handler on a device
 * @dev: device that will own the handler
 * @rx_handler: handler function to install
 * @rx_handler_data: opaque pointer stored next to the handler
 *
 * ASSERT_RTNL() is invoked on entry.
 *
 * Returns 0 on success, or -EBUSY when @dev already has an rx_handler, in
 * which case nothing is modified.
 */
int netdev_rx_handler_register(struct net_device *dev,
                   rx_handler_func_t *rx_handler,
                   void *rx_handler_data)
{
    int err = -EBUSY;

    ASSERT_RTNL();

    if (!dev->rx_handler) {
        /* Order matters: rx_handler_data must be set before rx_handler. */
        rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
        rcu_assign_pointer(dev->rx_handler, rx_handler);
        err = 0;
    }

    return err;
}

bridge接收处理函数br_handle_frame,非linklocal地址的情况下,主要做:
1、ebtables 的BROUTING 表处理,这是在bridge协议栈中将二层转发切换到主机协议栈的三层转发的hook点;
2、bridge NF_BR_PRE_ROUTING hook点处理,配置net.bridge.bridge-nf-call-iptables 系统配置的情况下还会 调用iptables规则处理;
3、进br_handle_frame_finish函数,根据src mac学习fdb表项,继续做转发处理或本机报文上送三层协议栈。

/*
 * br_handle_frame - rx_handler entry point for frames received on a bridge port
 * @pskb: in/out pointer to the received skb; updated when the skb was
 *        replaced by skb_share_check() and the frame is passed on
 *
 * Returns RX_HANDLER_PASS when the frame is left to the normal receive path
 * (loopback packets, some link-local cases, or a positive verdict from the
 * should-route hook), and RX_HANDLER_CONSUMED in every other case, including
 * drops.
 */
rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
{
    struct net_bridge_port *p;
    struct sk_buff *skb = *pskb;
    const unsigned char *dest = eth_hdr(skb)->h_dest;
    br_should_route_hook_t *rhook;

    if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
        return RX_HANDLER_PASS;

    /* Frames with an invalid source MAC are dropped. */
    if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
        goto drop;
    /* If the skb is shared with another user, continue on a clone. */
    skb = skb_share_check(skb, GFP_ATOMIC);
    if (!skb)
        return RX_HANDLER_CONSUMED;

    p = br_port_get_rcu(skb->dev);

    if (unlikely(is_link_local_ether_addr(dest))) {
        u16 fwd_mask = p->br->group_fwd_mask_required;

        /*
         * See IEEE 802.1D Table 7-10 Reserved addresses
         *
         * Assignment               Value
         * Bridge Group Address     01-80-C2-00-00-00
         * (MAC Control) 802.3      01-80-C2-00-00-01
         * (Link Aggregation) 802.3 01-80-C2-00-00-02
         * 802.1X PAE address       01-80-C2-00-00-03
         *
         * 802.1AB LLDP         01-80-C2-00-00-0E
         *
         * Others reserved for future standardization
         */
        /*
         * Special handling for link-local destination addresses.  Per the
         * original note, __br_handle_local_finish learns an FDB entry for
         * the source MAC.
         */
        switch (dest[5]) {
        case 0x00:  /* Bridge Group Address */
            /* If STP is turned off,
               then must forward to keep loop detection */
            if (p->br->stp_enabled == BR_NO_STP ||
                fwd_mask & (1u << dest[5]))
                goto forward;
            *pskb = skb;
            __br_handle_local_finish(skb);
            return RX_HANDLER_PASS;

        case 0x01:  /* IEEE MAC (Pause) */
            goto drop;

        case 0x0E:  /* 802.1AB LLDP */
            fwd_mask |= p->br->group_fwd_mask;
            if (fwd_mask & (1u << dest[5]))
                goto forward;
            *pskb = skb;
            __br_handle_local_finish(skb);
            return RX_HANDLER_PASS;

        default:
            /* Allow selective forwarding for most other protocols */
            fwd_mask |= p->br->group_fwd_mask;
            if (fwd_mask & (1u << dest[5]))
                goto forward;
        }

        /* Deliver packet to local host only */
        NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(skb->dev),
            NULL, skb, skb->dev, NULL, br_handle_local_finish);
        return RX_HANDLER_CONSUMED;
    }

forward:
    switch (p->state) {
    case BR_STATE_FORWARDING:
        rhook = rcu_dereference(br_should_route_hook);
        /*
         * ebtables BROUTING table processing: a non-zero return from the
         * hook means the frame is handed to the host (L3) stack instead of
         * being bridged.  Author's note: used in one project to send a
         * user's less important traffic over the Internet (through a tunnel
         * across the public network) while the remaining traffic used a
         * leased-line port that was a member of the bridge.
         */
        if (rhook) {
            if ((*rhook)(skb)) {
                *pskb = skb;
                return RX_HANDLER_PASS;
            }
            /* Re-read the destination after the hook has run. */
            dest = eth_hdr(skb)->h_dest;
        }
        /* fall through */
    case BR_STATE_LEARNING:
        /* Destination MAC equals the bridge device's MAC: the frame is for this host. */
        if (ether_addr_equal(p->br->dev->dev_addr, dest))
            skb->pkt_type = PACKET_HOST;
        /*
         * Per the original note, netfilter hooks of protocol family
         * NFPROTO_BRIDGE include:
         *   1. hooks registered by the ebtables modules, defined in
         *      ebt_ops_filter and ebt_ops_nat;
         *   2. the bridge module's br_nf_ops, which decide from the
         *      net.bridge.bridge-nf-call-iptables sysctl whether iptables
         *      rules are also applied, i.e. upper-layer filtering inside
         *      L2 forwarding.
         * Author's note: an enabled bridge-nf-call-iptables once caused a
         * hard-to-find loss of connectivity.
         */
        NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING,
            dev_net(skb->dev), NULL, skb, skb->dev, NULL,
            br_handle_frame_finish);
        break;
    default:
        /* Any other port state, and every "goto drop" above, frees the skb. */
drop:
        kfree_skb(skb);
    }
    return RX_HANDLER_CONSUMED;
}

br_handle_frame_finish函数主要做:
1、根据报文的源MAC地址,入接口,刷新fdb表;
2、实现arp代答功能(称“代答”比“代理”更合适);
3、识别报文是单播、广播、还是组播,单播和组播查对应的fdb表;
4、单播:
1)送往本机的,走 br_pass_frame_up,local_in流程;
2)非本机,走 br_forward,forward流程;
广播报文: 走br_flood ,会在除入接口外的其它接口广播;
组播报文:走br_multicast_flood,根据组播表转发。

int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
    struct net_bridge_port *p = br_port_get_rcu(skb->dev);
    const unsigned char *dest = eth_hdr(skb)->h_dest;
    enum br_pkt_type pkt_type = BR_PKT_UNICAST;
    struct net_bridge_fdb_entry *dst = NULL;
    struct net_bridge_mdb_entry *mdst;
    bool local_rcv, mcast_hit = false;
    struct net_bridge *br;
    u16 vid = 0;

    if (!p || p->state == BR_STATE_DISABLED)
        goto drop;

    if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid))
        goto out;

    nbp_switchdev_frame_mark(p, skb);

    /* insert into forwarding database after filtering to avoid spoofing */
    br = p->br;
    if (p->flags & BR_LEARNING)  
        // 根据报文的源MAC地址,入接口,刷新fdb表(mac表)
        br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false);
    // 混杂模式下,会上送协议栈一份(为什么是!!)
    local_rcv = !!(br->dev->flags & IFF_PROMISC);
    if (is_multicast_ether_addr(dest)) {
        /* by definition the broadcast is also a multicast address */
        if (is_broadcast_ether_addr(dest)) {
            pkt_type = BR_PKT_BROADCAST;
            local_rcv = true;
        } else {
            pkt_type = BR_PKT_MULTICAST;
            if (br_multicast_rcv(br, p, skb, vid))
                goto drop;
        }
    }

    if (p->state == BR_STATE_LEARNING)
        goto drop;

    BR_INPUT_SKB_CB(skb)->brdev = br->dev;

    if (IS_ENABLED(CONFIG_INET) && skb->protocol == htons(ETH_P_ARP))
        // 一个比较重要的特性,arp proxy,接口如果是能了此功能,对于收到的arp request,可以通过查询本地的arp表构造arp reply报文回应。SDN网络经常用到。
        br_do_proxy_arp(skb, br, vid, p);
    // 根据报文类型是组播还是单播,查转发表
    switch (pkt_type) {
    case BR_PKT_MULTICAST:
        mdst = br_mdb_get(br, skb, vid);
        if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
            br_multicast_querier_exists(br, eth_hdr(skb))) {
            if ((mdst && mdst->mglist) ||
                br_multicast_is_router(br)) {
                local_rcv = true;
                br->dev->stats.multicast++;
            }
            mcast_hit = true;
        } else {
            local_rcv = true;
            br->dev->stats.multicast++;
        }
        break;
    case BR_PKT_UNICAST:
        dst = __br_fdb_get(br, dest, vid);
    default:
        break;
    }
    // 单播
    if (dst) {
        // 本地转发表,则上送协议栈
        if (dst->is_local)
            return br_pass_frame_up(skb);
        // 否则,走二层转发
        dst->used = jiffies;
        br_forward(dst->dst, skb, local_rcv, false);
    } else {
        if (!mcast_hit)
            // 未知的单播,泛流
            br_flood(br, skb, pkt_type, local_rcv, false);
        else
            // 组播报文发送
            br_multicast_flood(mdst, skb, local_rcv, false);
    }
    // br混杂模式、广播、某些情况下组播报文(懒得看了)会上送协议栈一份
    if (local_rcv)
        return ·(skb);

out:
    return 0;
drop:
    kfree_skb(skb);
    goto out;
}
EXPORT_SYMBOL_GPL(br_handle_frame_finish);

这里有一个比较重要的特性:arp proxy。接口如果使能了此功能,对于收到的 arp request,可以通过查询本地的 arp 表构造 arp reply 报文回应。SDN 网络经常用到。
功能测试注意项记录:1、必须事先配置好代答mac地址的fdb表,否则无法代答ARP。这是因为配置了proxyarp后,无法做广播和未知单播的泛洪,所以mac必须可达,否则彻底不通,在br_flood中有判断,见下面br_do_proxy_arp代码分析;2、使用 ip link set dev veth20 type bridge_slave proxy_arp on 使能proxy arp功能,它和 /proc/sys/net/ipv4/conf/veth2/proxy_arp 配置的功能不是一个东西;3、低版本的内核不支持,3.10 的centos可以配置但是没效果。

/*
 * br_do_proxy_arp - answer an IPv4 ARP request on behalf of a known neighbour
 * @skb: received frame, already identified as ARP by the caller
 * @br:  bridge the frame arrived on
 * @vid: VLAN id used for the FDB lookup
 * @p:   ingress bridge port
 *
 * Clears BR_INPUT_SKB_CB(skb)->proxyarp_replied on entry and sets it to true
 * only when a reply was sent.  A reply is sent when all of these hold:
 *   - the bridge device does not have IFF_NOARP and the ARP header can be pulled;
 *   - the frame is an ARP request for a 4-byte protocol address of type
 *     ETH_P_IP, with a hardware address length equal to the device's;
 *   - the target IP is neither loopback nor multicast;
 *   - a neighbour entry in a NUD_VALID state exists for the target IP on the
 *     bridge device;
 *   - an FDB entry exists for that neighbour's hardware address, and either
 *     the ingress port has BR_PROXYARP or the FDB entry's port has
 *     BR_PROXYARP_WIFI.
 */
static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br,
                u16 vid, struct net_bridge_port *p)
{
    struct net_device *brdev = br->dev;
    struct net_bridge_fdb_entry *fdb;
    struct neighbour *neigh;
    struct arphdr *arp;
    u8 *payload, *sender_mac;
    __be32 sender_ip, target_ip;

    BR_INPUT_SKB_CB(skb)->proxyarp_replied = false;

    if (brdev->flags & IFF_NOARP)
        return;
    if (!pskb_may_pull(skb, arp_hdr_len(brdev)))
        return;

    arp = arp_hdr(skb);
    /* Only ARP requests for IPv4 with matching address lengths are handled. */
    if (arp->ar_pro != htons(ETH_P_IP) ||
        arp->ar_op != htons(ARPOP_REQUEST) ||
        arp->ar_hln != brdev->addr_len ||
        arp->ar_pln != 4)
        return;

    /*
     * The payload after the fixed header is laid out as
     * sender hw addr | sender IP | target hw addr | target IP.
     */
    payload = (u8 *)arp + sizeof(struct arphdr);
    sender_mac = payload;
    memcpy(&sender_ip, payload + brdev->addr_len, sizeof(sender_ip));
    memcpy(&target_ip,
           payload + 2 * brdev->addr_len + sizeof(sender_ip),
           sizeof(target_ip));

    if (ipv4_is_loopback(target_ip) || ipv4_is_multicast(target_ip))
        return;

    /* A neighbour entry for the target IP on the bridge device supplies the reply data. */
    neigh = neigh_lookup(&arp_tbl, &target_ip, brdev);
    if (!neigh)
        return;

    if (neigh->nud_state & NUD_VALID) {
        fdb = __br_fdb_get(br, neigh->ha, vid);
        /*
         * Two conditions must hold here:
         *  1. the FDB entry exists.  Author's note: SDN setups therefore
         *     add static FDB entries in advance, because with proxy ARP
         *     configured broadcast/unknown-unicast flooding is suppressed
         *     (checked in br_flood), so the MAC must already be reachable;
         *  2. BR_PROXYARP on the ingress port, or BR_PROXYARP_WIFI on the
         *     port the FDB entry points to.
         */
        if (fdb && ((p->flags & BR_PROXYARP) ||
                (fdb->dst && (fdb->dst->flags & BR_PROXYARP_WIFI)))) {
            arp_send(ARPOP_REPLY, ETH_P_ARP, sender_ip, skb->dev,
                 target_ip, sender_mac, neigh->ha, sender_mac);
            BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
        }
    }

    neigh_release(neigh);
}

这里只看上送本机 和 转发流程。
上送本机走 br_pass_frame_up,重点是将skb->dev替换成了bridge口,然后经过LOCAL_IN hook点过滤处理后,调用 netif_receive_skb函数再次走一次收包流程。
netif_receive_skb 这个函数上节已经讲过了,br_handle_frame 也是走这个函数过来的。但是这次 skb->dev 已经替换成bridge口了,它的 skb->dev->rx_handler 为空,所以不会再次进入 br_handle_frame,而是会进上层协议栈。


/*
 * br_pass_frame_up - deliver a frame to the local stack via the bridge device
 * @skb: frame to deliver; BR_INPUT_SKB_CB(skb)->brdev must already be set
 *
 * Updates the bridge's per-CPU rx counters, applies the egress VLAN check
 * (skipped when the bridge device is promiscuous), rewrites skb->dev to the
 * bridge device and runs the NF_BR_LOCAL_IN hook with br_netif_receive_skb
 * as the continuation.
 *
 * Returns NET_RX_DROP when the frame is rejected by the VLAN check or when
 * br_handle_vlan() returns NULL, otherwise the result of NF_HOOK().
 */
static int br_pass_frame_up(struct sk_buff *skb)
{
    struct net_device *bridge_dev = BR_INPUT_SKB_CB(skb)->brdev;
    struct net_bridge *br = netdev_priv(bridge_dev);
    struct pcpu_sw_netstats *rx_stats = this_cpu_ptr(br->stats);
    struct net_bridge_vlan_group *vlan_grp;
    struct net_device *ingress_dev;

    u64_stats_update_begin(&rx_stats->syncp);
    rx_stats->rx_bytes += skb->len;
    rx_stats->rx_packets++;
    u64_stats_update_end(&rx_stats->syncp);

    /* Bridge is just like any other port.  Make sure the
     * packet is allowed except in promisc mode when someone
     * may be running packet capture.
     */
    vlan_grp = br_vlan_group_rcu(br);
    if (!((bridge_dev->flags & IFF_PROMISC) ||
          br_allowed_egress(vlan_grp, skb))) {
        kfree_skb(skb);
        return NET_RX_DROP;
    }

    /*
     * The host stack is attached to the bridge through the bridge device,
     * so the frame is re-homed onto it; the original ingress device is kept
     * for the netfilter hook below.
     */
    ingress_dev = skb->dev;
    skb->dev = bridge_dev;

    skb = br_handle_vlan(br, vlan_grp, skb);
    if (!skb)
        return NET_RX_DROP;

    /* update the multicast stats if the packet is IGMP/MLD */
    br_multicast_count(br, NULL, skb, br_multicast_igmp_type(skb),
               BR_MCAST_DIR_TX);

    /*
     * Local-in path: NF_BR_LOCAL_IN hook first.  Per the original note this
     * covers ebtables filtering and, depending on configuration, iptables
     * filtering, as at PRE_ROUTING.
     */
    return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN,
               dev_net(ingress_dev), NULL, skb, ingress_dev, NULL,
               br_netif_receive_skb);
}

/*
 * br_netif_receive_skb - NF_BR_LOCAL_IN continuation used by br_pass_frame_up
 * @net: network namespace (unused)
 * @sk:  socket (unused)
 * @skb: frame whose skb->dev has already been set to the bridge device
 *
 * Calls br_drop_fake_rtable() on the skb and then re-injects it into the
 * receive path.  Returns the result of netif_receive_skb().
 */
static int br_netif_receive_skb(struct net *net, struct sock *sk,
                struct sk_buff *skb)
{
    br_drop_fake_rtable(skb);

    return netif_receive_skb(skb);
}

转发流程:__br_forward 函数重新设置 skb->dev 为 bridge 出接口,然后分别经过 NF_BR_FORWARD(本机发出的报文则是 NF_BR_LOCAL_OUT)和 NF_BR_POST_ROUTING 两个hook点处理,走到接口发送函数 dev_queue_xmit,这个是接口发送的入口。



/*
 * __br_forward - send a frame out through one bridge port
 * @to:         egress bridge port
 * @skb:        frame to send
 * @local_orig: true when the frame originates from this host, false when it
 *              is being forwarded from another port
 *
 * Applies egress VLAN handling, points skb->dev at the egress device and
 * runs the bridge netfilter hook (NF_BR_LOCAL_OUT for locally originated
 * frames, NF_BR_FORWARD otherwise) with br_forward_finish as continuation.
 * When netpoll transmission is running on the bridge device, locally
 * originated frames bypass netfilter and go out through
 * br_netpoll_send_skb().
 */
static void __br_forward(const struct net_bridge_port *to,
             struct sk_buff *skb, bool local_orig)
{
    struct net_device *in_dev;
    struct net *netns;
    int hooknum;

    skb = br_handle_vlan(to->br, nbp_vlan_group_rcu(to), skb);
    if (!skb)
        return;

    in_dev = skb->dev;
    skb->dev = to->dev;

    if (local_orig) {
        if (unlikely(netpoll_tx_running(to->br->dev))) {
            if (is_skb_forwardable(skb->dev, skb)) {
                skb_push(skb, ETH_HLEN);
                br_netpoll_send_skb(to, skb);
            } else {
                kfree_skb(skb);
            }
            return;
        }
        hooknum = NF_BR_LOCAL_OUT;
        netns = dev_net(skb->dev);
        /* Locally originated frames have no ingress device. */
        in_dev = NULL;
    } else {
        if (skb_warn_if_lro(skb)) {
            kfree_skb(skb);
            return;
        }
        hooknum = NF_BR_FORWARD;
        skb_forward_csum(skb);
        netns = dev_net(in_dev);
    }

    /* NF_BR_FORWARD or NF_BR_LOCAL_OUT hook, as selected above. */
    NF_HOOK(NFPROTO_BRIDGE, hooknum,
        netns, NULL, skb, in_dev, skb->dev,
        br_forward_finish);
}

/*
 * br_forward_finish - continuation after the forward/local-out hook
 * @net: network namespace passed on to the next hook
 * @sk:  socket passed on to the next hook
 * @skb: frame being sent; skb->dev is the egress device
 *
 * Runs the NF_BR_POST_ROUTING hook with br_dev_queue_push_xmit as the
 * continuation and returns the hook's result.
 */
int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
    return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, net, sk, skb,
               NULL, skb->dev, br_dev_queue_push_xmit);
}
EXPORT_SYMBOL_GPL(br_forward_finish);

/*
 * br_dev_queue_push_xmit - final step of the bridge transmit path
 * @net: network namespace (unused)
 * @sk:  socket (unused)
 * @skb: frame to transmit on skb->dev
 *
 * Frees the frame if it is not forwardable on skb->dev, or if it is a
 * VLAN-tagged CHECKSUM_PARTIAL frame whose protocol cannot be resolved by
 * __vlan_get_protocol().  Otherwise restores the Ethernet header, calls
 * br_drop_fake_rtable() and hands the frame to dev_queue_xmit().
 *
 * Always returns 0, whether the frame was queued or dropped.
 */
int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
    if (!is_skb_forwardable(skb->dev, skb)) {
        kfree_skb(skb);
        return 0;
    }

    skb_push(skb, ETH_HLEN);
    br_drop_fake_rtable(skb);

    if (skb->ip_summed == CHECKSUM_PARTIAL &&
        (skb->protocol == htons(ETH_P_8021Q) ||
         skb->protocol == htons(ETH_P_8021AD))) {
        int depth;

        if (!__vlan_get_protocol(skb, skb->protocol, &depth)) {
            kfree_skb(skb);
            return 0;
        }

        /* depth (filled in by __vlan_get_protocol) becomes the network header offset. */
        skb_set_network_header(skb, depth);
    }

    /*
     * Device transmit entry point; per the original note it covers
     * queueing, QoS and finally the driver's transmit routine.
     */
    dev_queue_xmit(skb);

    return 0;
}
EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit);

你可能感兴趣的:(Linux网络协议栈4--bridge收发包)