bridge 是 Linux 上的虚拟交换机,具有交换机的功能。
网卡收到包后,进入 __netif_receive_skb_core,剥完 vlan 并找到 vlan 子接口(如果有的话)之后,如果 skb->dev 是 bridge 成员口,就会走到 bridge 成员口的接收处理函数。
/*
 * Excerpt of the core receive routine ("......" marks elided code); only the
 * rx_handler dispatch is shown.
 */
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
......
/*
 * Ports of a bridge or of OVS both take this path.
 * When a device is added to a bridge as one of its ports, the device's
 * rx_handler is set to br_handle_frame; this is done by br_add_if
 * (net/bridge/br_if.c) while the port is being attached to the bridge.
 * Entering br_handle_frame is entering the bridge logic.
 */
rx_handler = rcu_dereference(skb->dev->rx_handler);
if (rx_handler) {
/* Flush the pending packet-type handler before handing the skb to rx_handler. */
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
switch (rx_handler(&skb)) {
case RX_HANDLER_CONSUMED: // the frame has been consumed, stop processing
ret = NET_RX_SUCCESS;
goto out;
case RX_HANDLER_ANOTHER: // skb->dev was changed, run the receive path again
goto another_round;
case RX_HANDLER_EXACT: /* deliver only where ptype->dev == skb->dev */
deliver_exact = true;
/* fall through */
case RX_HANDLER_PASS:
break;
default:
BUG();
}
}
......
}
bridge 的接收处理函数为br_handle_frame,在为bridge添加接口时(如用 brctl addif 命令将一个接口加入到bridge中),就会为该接口的 net_device 挂载此处理函数。
/*
 * Excerpt of the routine that attaches dev to bridge br ("......" marks
 * elided code); only the rx_handler registration is shown.
 */
int br_add_if(struct net_bridge *br, struct net_device *dev)
{
......
/* Install br_handle_frame as dev's rx_handler, with p as the handler data. */
err = netdev_rx_handler_register(dev, br_handle_frame, p);
if (err)
goto err4;
......
}
/*
 * netdev_rx_handler_register - attach a receive handler to a device
 * @dev: device to attach the handler to
 * @rx_handler: handler function to install
 * @rx_handler_data: opaque data stored alongside the handler
 *
 * Asserts that the RTNL lock is held. Returns 0 on success, or -EBUSY when
 * the device already has an rx_handler installed.
 */
int netdev_rx_handler_register(struct net_device *dev,
			       rx_handler_func_t *rx_handler,
			       void *rx_handler_data)
{
	ASSERT_RTNL();

	if (!dev->rx_handler) {
		/* Publish the data first so the handler never sees it unset. */
		rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
		rcu_assign_pointer(dev->rx_handler, rx_handler);
		return 0;
	}

	return -EBUSY;
}
bridge接收处理函数br_handle_frame,非linklocal地址的情况下,主要做:
1、ebtables 的BROUTING 表处理,这是在bridge协议栈中将二层转发切换到主机协议栈的三层转发的hook点;
2、bridge NF_BR_PRE_ROUTING hook点处理,配置net.bridge.bridge-nf-call-iptables 系统配置的情况下还会 调用iptables规则处理;
3、进br_handle_frame_finish函数,根据src mac学习fdb表项,继续做转发处理或本机报文上送三层协议栈。
/*
 * br_handle_frame - rx_handler installed on bridge member ports.
 * @pskb: in/out pointer to the received skb; updated when the skb is
 *        replaced by skb_share_check() and the frame is passed on.
 *
 * Returns RX_HANDLER_PASS when the frame is left to the regular receive
 * path, and RX_HANDLER_CONSUMED when the bridge took it (hooked, or freed).
 */
rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
{
struct net_bridge_port *p;
struct sk_buff *skb = *pskb;
const unsigned char *dest = eth_hdr(skb)->h_dest;
br_should_route_hook_t *rhook;
if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
return RX_HANDLER_PASS;
/* Frames with an invalid source MAC are dropped. */
if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
goto drop;
// If the skb is shared with another user, continue with a clone
skb = skb_share_check(skb, GFP_ATOMIC);
if (!skb)
return RX_HANDLER_CONSUMED;
p = br_port_get_rcu(skb->dev);
if (unlikely(is_link_local_ether_addr(dest))) {
u16 fwd_mask = p->br->group_fwd_mask_required;
/*
* See IEEE 802.1D Table 7-10 Reserved addresses
*
* Assignment Value
* Bridge Group Address 01-80-C2-00-00-00
* (MAC Control) 802.3 01-80-C2-00-00-01
* (Link Aggregation) 802.3 01-80-C2-00-00-02
* 802.1X PAE address 01-80-C2-00-00-03
*
* 802.1AB LLDP 01-80-C2-00-00-0E
*
* Others reserved for future standardization
*/
// Handling of the link-local destination cases; per the author's note,
// __br_handle_local_finish (not shown here) learns the source MAC into the fdb
switch (dest[5]) {
case 0x00: /* Bridge Group Address */
/* If STP is turned off,
then must forward to keep loop detection */
if (p->br->stp_enabled == BR_NO_STP ||
fwd_mask & (1u << dest[5]))
goto forward;
*pskb = skb;
__br_handle_local_finish(skb);
return RX_HANDLER_PASS;
case 0x01: /* IEEE MAC (Pause) */
goto drop;
case 0x0E: /* 802.1AB LLDP */
fwd_mask |= p->br->group_fwd_mask;
if (fwd_mask & (1u << dest[5]))
goto forward;
*pskb = skb;
__br_handle_local_finish(skb);
return RX_HANDLER_PASS;
default:
/* Allow selective forwarding for most other protocols */
fwd_mask |= p->br->group_fwd_mask;
if (fwd_mask & (1u << dest[5]))
goto forward;
}
/* Deliver packet to local host only */
NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(skb->dev),
NULL, skb, skb->dev, NULL, br_handle_local_finish);
return RX_HANDLER_CONSUMED;
}
forward:
switch (p->state) {
case BR_STATE_FORWARDING:
rhook = rcu_dereference(br_should_route_hook);
/* ebtables BROUTING table processing: a non-zero return from rhook means
the frame is handed up to the L3 stack. Author's note: I used this feature
in a project to pick out a user's less important traffic and send it over
the internet (a tunnel across the public network was configured), while the
remaining traffic went out through the leased-line port that was a member
of the bridge. */
if (rhook) {
if ((*rhook)(skb)) {
*pskb = skb;
return RX_HANDLER_PASS;
}
/* Re-read the destination after the hook has run. */
dest = eth_hdr(skb)->h_dest;
}
/* fall through */
case BR_STATE_LEARNING:
// Destination MAC equals the bridge device's MAC: the frame is for this host.
if (ether_addr_equal(p->br->dev->dev_addr, dest))
skb->pkt_type = PACKET_HOST;
/* Netfilter hooks of protocol family NFPROTO_BRIDGE include (author's notes):
1. the hooks registered by the ebtables modules, defined in ebt_ops_filter
and ebt_ops_nat;
2. the handlers defined in br_nf_ops registered by the bridge module, which
decide from the net.bridge.bridge-nf-call-iptables sysctl whether iptables
rules are applied, i.e. upper-layer processing inside L2 forwarding.
I once hit a problem at work where bridge-nf-call-iptables was switched on
and traffic stopped flowing; it took a long, painful time to find.
*/
NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING,
dev_net(skb->dev), NULL, skb, skb->dev, NULL,
br_handle_frame_finish);
break;
default:
/* Any other port state, and every "goto drop" above, ends here. */
drop:
kfree_skb(skb);
}
return RX_HANDLER_CONSUMED;
}
br_handle_frame_finish函数主要做:
1、根据报文的源MAC地址,入接口,刷新fdb表;
2、实现arp代答功能(代答比代理更合适);
3、识别报文是单播、广播、还是组播,单播查fdb表,组播查mdb表;
4、单播:
1)送往本机的,走 br_pass_frame_up,local_in流程;
2)非本机,走 br_forward,forward流程;
广播报文: 走br_flood ,会在除入接口外的其它接口广播;
组播报文:走br_multicast_flood,根据组播表转发。
/*
 * br_handle_frame_finish - bridge input processing after NF_BR_PRE_ROUTING
 * @net: network namespace (unused in this function body)
 * @sk: socket (unused in this function body)
 * @skb: the received frame; skb->dev is the ingress bridge port device
 *
 * Learns the source MAC when the port has BR_LEARNING set, classifies the
 * frame as unicast/broadcast/multicast, then forwards, floods and/or passes
 * it up to the local stack.
 *
 * Returns the result of br_pass_frame_up() when the frame is delivered
 * locally, otherwise 0 (including when the frame is dropped).
 */
int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_bridge_port *p = br_port_get_rcu(skb->dev);
	const unsigned char *dest = eth_hdr(skb)->h_dest;
	enum br_pkt_type pkt_type = BR_PKT_UNICAST;
	struct net_bridge_fdb_entry *dst = NULL;
	struct net_bridge_mdb_entry *mdst;
	bool local_rcv, mcast_hit = false;
	struct net_bridge *br;
	u16 vid = 0;

	if (!p || p->state == BR_STATE_DISABLED)
		goto drop;

	/* Note: a failed ingress check leaves via "out", without kfree_skb here. */
	if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid))
		goto out;

	nbp_switchdev_frame_mark(p, skb);

	/* insert into forwarding database after filtering to avoid spoofing */
	br = p->br;
	if (p->flags & BR_LEARNING) {
		/* Refresh the fdb (MAC table) from the source MAC and ingress port. */
		br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false);
	}

	/*
	 * A promiscuous bridge device also gets a local copy. The "!!" collapses
	 * the masked flag bit to 0/1 before it is stored in the bool.
	 */
	local_rcv = !!(br->dev->flags & IFF_PROMISC);

	if (is_multicast_ether_addr(dest)) {
		/* by definition the broadcast is also a multicast address */
		if (is_broadcast_ether_addr(dest)) {
			pkt_type = BR_PKT_BROADCAST;
			local_rcv = true;
		} else {
			pkt_type = BR_PKT_MULTICAST;
			if (br_multicast_rcv(br, p, skb, vid))
				goto drop;
		}
	}

	/* A port in LEARNING state has learned above but must not forward. */
	if (p->state == BR_STATE_LEARNING)
		goto drop;

	BR_INPUT_SKB_CB(skb)->brdev = br->dev;

	if (IS_ENABLED(CONFIG_INET) && skb->protocol == htons(ETH_P_ARP)) {
		/*
		 * ARP proxy: br_do_proxy_arp may answer an ARP request from the
		 * local neighbour table. Author's note: commonly used in SDN
		 * networks.
		 */
		br_do_proxy_arp(skb, br, vid, p);
	}

	/* Look up the forwarding table that matches the frame type. */
	switch (pkt_type) {
	case BR_PKT_MULTICAST:
		mdst = br_mdb_get(br, skb, vid);
		if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
		    br_multicast_querier_exists(br, eth_hdr(skb))) {
			if ((mdst && mdst->mglist) ||
			    br_multicast_is_router(br)) {
				local_rcv = true;
				br->dev->stats.multicast++;
			}
			mcast_hit = true;
		} else {
			local_rcv = true;
			br->dev->stats.multicast++;
		}
		break;
	case BR_PKT_UNICAST:
		dst = __br_fdb_get(br, dest, vid);
		/* fall through */
	default:
		break;
	}

	if (dst) {
		/* Known unicast: a local fdb entry goes up the stack ... */
		if (dst->is_local)
			return br_pass_frame_up(skb);

		/* ... anything else is forwarded at L2 to the learned port. */
		dst->used = jiffies;
		br_forward(dst->dst, skb, local_rcv, false);
	} else {
		if (!mcast_hit) {
			/* Unknown unicast, broadcast, or multicast without a hit: flood. */
			br_flood(br, skb, pkt_type, local_rcv, false);
		} else {
			/* Multicast with a hit: deliver according to the mdb entry. */
			br_multicast_flood(mdst, skb, local_rcv, false);
		}
	}

	/*
	 * local_rcv was set above for a promiscuous bridge device, for
	 * broadcast, and for the multicast cases: hand a copy to the local stack.
	 */
	if (local_rcv)
		return br_pass_frame_up(skb);

out:
	return 0;
drop:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL_GPL(br_handle_frame_finish);
这里有一个比较重要的特性,arp proxy,接口如果使能了此功能,对于收到的arp request,可以通过查询本地的arp表构造arp reply报文回应。SDN网络经常用到。
功能测试注意项记录:1、必须事先配置好代答mac地址的fdb表,否则无法代答ARP。这是因为配置了proxyarp后,无法做广播和未知单播的泛洪,所以mac必须可达,否则彻底不通,在br_flood中有判断,详见下面br_do_proxy_arp代码分析;2、使用ip link set dev veth20 type bridge_slave proxy_arp on 使能proxy arp功能,和/proc/sys/net/ipv4/conf/veth2/proxy_arp 配置的功能不是一个东西;3、低版本的内核不支持,3.10 的centos可以配置但是没效果。
/*
 * br_do_proxy_arp - answer an IPv4 ARP request on behalf of a known neighbour
 * @skb: received frame
 * @br: bridge the frame arrived on
 * @vid: VLAN id used for the fdb lookup
 * @p: ingress bridge port
 *
 * Clears proxyarp_replied in the skb control block, and sets it to true only
 * when an ARP reply has been sent.
 */
static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br,
			    u16 vid, struct net_bridge_port *p)
{
	struct net_device *brdev = br->dev;
	struct net_bridge_fdb_entry *fdb;
	struct neighbour *neigh;
	struct arphdr *arp;
	u8 *cursor, *sha;
	__be32 sip, tip;

	BR_INPUT_SKB_CB(skb)->proxyarp_replied = false;

	if (brdev->flags & IFF_NOARP)
		return;
	if (!pskb_may_pull(skb, arp_hdr_len(brdev)))
		return;

	/* Only IPv4 ARP requests with the expected address lengths qualify. */
	arp = arp_hdr(skb);
	if (arp->ar_pro != htons(ETH_P_IP))
		return;
	if (arp->ar_op != htons(ARPOP_REQUEST))
		return;
	if (arp->ar_hln != brdev->addr_len || arp->ar_pln != 4)
		return;

	/* Payload layout after the fixed header: sha, sip, tha, tip. */
	sha = (u8 *)arp + sizeof(struct arphdr);
	cursor = sha + brdev->addr_len;
	memcpy(&sip, cursor, sizeof(sip));
	cursor += sizeof(sip) + brdev->addr_len;
	memcpy(&tip, cursor, sizeof(tip));

	if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip))
		return;

	/* A neighbour entry for the target IP on the bridge device is required. */
	neigh = neigh_lookup(&arp_tbl, &tip, brdev);
	if (!neigh)
		return;

	if (neigh->nud_state & NUD_VALID) {
		fdb = __br_fdb_get(br, neigh->ha, vid);
		/*
		 * Reply only when both hold:
		 * 1. an fdb entry exists for the neighbour's MAC (author's note:
		 *    SDN setups must add it statically beforehand, since with
		 *    proxy ARP on, br_flood -- not shown here -- reportedly
		 *    suppresses flooding, so the MAC must already be reachable);
		 * 2. the ingress port has BR_PROXYARP, or the fdb's port has
		 *    BR_PROXYARP_WIFI.
		 */
		if (fdb && ((p->flags & BR_PROXYARP) ||
			    (fdb->dst && (fdb->dst->flags & BR_PROXYARP_WIFI)))) {
			arp_send(ARPOP_REPLY, ETH_P_ARP, sip, skb->dev, tip,
				 sha, neigh->ha, sha);
			BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
		}
	}

	neigh_release(neigh);
}
这里只看上送本机 和 转发流程。
上送本机走 br_pass_frame_up,重点是将skb->dev替换成了bridge口,然后经过LOCAL_IN hook点过滤处理后,调用 netif_receive_skb函数再次走一次收包流程。
netif_receive_skb 这个函数上节已经讲过了,br_handle_frame 也是走这个函数过来的。但是这次 skb->dev 已经替换成bridge口了,它的skb->dev->rx_handler 为空,所以不会再次进入br_handle_frame,而是会进上层协议栈。
/*
 * br_pass_frame_up - hand a frame to the local stack via the bridge device
 * @skb: frame to deliver; BR_INPUT_SKB_CB(skb)->brdev must already be set
 *
 * Accounts the frame in the bridge's per-CPU rx statistics, switches
 * skb->dev to the bridge device and runs the NF_BR_LOCAL_IN hook with
 * br_netif_receive_skb as the continuation.
 *
 * Returns NET_RX_DROP when the frame is rejected, otherwise the NF_HOOK result.
 */
static int br_pass_frame_up(struct sk_buff *skb)
{
	struct net_device *brdev = BR_INPUT_SKB_CB(skb)->brdev;
	struct net_bridge *br = netdev_priv(brdev);
	struct pcpu_sw_netstats *stats = this_cpu_ptr(br->stats);
	struct net_bridge_vlan_group *vg;
	struct net_device *ingress;

	u64_stats_update_begin(&stats->syncp);
	stats->rx_packets++;
	stats->rx_bytes += skb->len;
	u64_stats_update_end(&stats->syncp);

	vg = br_vlan_group_rcu(br);

	/*
	 * The bridge device is treated like any other port: the frame must
	 * pass the egress check, except in promiscuous mode where someone
	 * may be running a packet capture.
	 */
	if (!((brdev->flags & IFF_PROMISC) || br_allowed_egress(vg, skb))) {
		kfree_skb(skb);
		return NET_RX_DROP;
	}

	/*
	 * The local stack is reached through the bridge device, so the frame
	 * is re-homed onto it; the original port is kept for the hook call.
	 */
	ingress = skb->dev;
	skb->dev = brdev;

	skb = br_handle_vlan(br, vg, skb);
	if (!skb)
		return NET_RX_DROP;

	/* update the multicast stats if the packet is IGMP/MLD */
	br_multicast_count(br, NULL, skb, br_multicast_igmp_type(skb),
			   BR_MCAST_DIR_TX);

	/*
	 * Local-in path: NF_BR_LOCAL_IN runs first; per the author's note it
	 * covers ebtables filtering and, depending on configuration, iptables.
	 */
	return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN,
		       dev_net(ingress), NULL, skb, ingress, NULL,
		       br_netif_receive_skb);
}
/*
 * br_netif_receive_skb - NF_BR_LOCAL_IN continuation
 *
 * Calls br_drop_fake_rtable() on the skb, then re-injects it through
 * netif_receive_skb() and returns that call's result.
 */
static int
br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int rc;

	br_drop_fake_rtable(skb);
	rc = netif_receive_skb(skb);

	return rc;
}
转发流程,__br_forward 函数 重新设置skb->dev为bridge出接口,然后分别经过 NF_BR_FORWARD 和 NF_BR_POST_ROUTING两个hook点处理,走到接口发送函数 dev_queue_xmit,这个是接口发送的入口。
/*
 * __br_forward - send a frame out of one bridge port
 * @to: egress bridge port
 * @skb: frame to send
 * @local_orig: true when the frame originates locally (NF_BR_LOCAL_OUT),
 *              false when it is being forwarded (NF_BR_FORWARD)
 *
 * Applies egress VLAN handling, points skb->dev at the egress device and
 * runs the matching bridge netfilter hook with br_forward_finish as the
 * continuation.
 */
static void __br_forward(const struct net_bridge_port *to,
			 struct sk_buff *skb, bool local_orig)
{
	struct net_bridge_vlan_group *vg = nbp_vlan_group_rcu(to);
	struct net_device *in_dev;
	struct net *netns;
	int hooknum;

	skb = br_handle_vlan(to->br, vg, skb);
	if (!skb)
		return;

	in_dev = skb->dev;
	skb->dev = to->dev;

	if (local_orig) {
		/* While netpoll is transmitting, bypass the hooks entirely. */
		if (unlikely(netpoll_tx_running(to->br->dev))) {
			if (is_skb_forwardable(skb->dev, skb)) {
				skb_push(skb, ETH_HLEN);
				br_netpoll_send_skb(to, skb);
			} else {
				kfree_skb(skb);
			}
			return;
		}
		hooknum = NF_BR_LOCAL_OUT;
		netns = dev_net(skb->dev);
		in_dev = NULL;
	} else {
		if (skb_warn_if_lro(skb)) {
			kfree_skb(skb);
			return;
		}
		hooknum = NF_BR_FORWARD;
		skb_forward_csum(skb);
		netns = dev_net(in_dev);
	}

	/* NF_BR_FORWARD or NF_BR_LOCAL_OUT, as selected above. */
	NF_HOOK(NFPROTO_BRIDGE, hooknum,
		netns, NULL, skb, in_dev, skb->dev,
		br_forward_finish);
}
/*
 * br_forward_finish - continuation after NF_BR_FORWARD / NF_BR_LOCAL_OUT
 *
 * Runs the NF_BR_POST_ROUTING hook with skb->dev as the output device and
 * br_dev_queue_push_xmit as the continuation; returns the NF_HOOK result.
 */
int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *out_dev = skb->dev;

	return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING,
		       net, sk, skb, NULL, out_dev,
		       br_dev_queue_push_xmit);
}
EXPORT_SYMBOL_GPL(br_forward_finish);
/*
 * br_dev_queue_push_xmit - final step of bridge forwarding
 *
 * Restores the Ethernet header, fixes up the network header offset for
 * VLAN-tagged frames with a partial checksum, and queues the frame on
 * skb->dev. Frames that cannot be forwarded are freed.
 *
 * Always returns 0.
 */
int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	if (!is_skb_forwardable(skb->dev, skb)) {
		kfree_skb(skb);
		return 0;
	}

	skb_push(skb, ETH_HLEN);
	br_drop_fake_rtable(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (skb->protocol == htons(ETH_P_8021Q) ||
	     skb->protocol == htons(ETH_P_8021AD))) {
		int hdr_depth;

		if (!__vlan_get_protocol(skb, skb->protocol, &hdr_depth)) {
			kfree_skb(skb);
			return 0;
		}
		skb_set_network_header(skb, hdr_depth);
	}

	/* Device transmit entry point; author's note: queueing, QoS, then the driver. */
	dev_queue_xmit(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit);