[docker 网络][flannel] 背后操作

1. 前言

上文 [docker 网络][flannel] 配置安装测试 利用flannel vxlan实现了容器跨主机相互访问. 本文将模拟flannel vxlan看看flannel到底做了些什么操作可以让容器跨主机相互访问. 关于 vxlan的原理可以参考 VXLAN详解, 本文将会注重实际操作过程.

环境如下:

Machine 1 : 172.21.0.16 主机名:master
Machine 2 : 172.21.0.12 主机名:worker

flannel
1. [docker 网络][flannel] 配置安装测试
2. [docker 网络][flannel] 背后操作
3. [docker 网络][flannel] 源码简单分析

2. 增加vxlan节点

2.1 master(172.21.0.16)

add-vxlan.sh脚本增加一个vxlan类型的vxlan.1, 地址为10.0.1.250/32.

[root@master vxlan]# 
[root@master vxlan]# cat add-vxlan.sh 
ip link delete vxlan.1

ip link add vxlan.1 type vxlan id 1 dev eth0 local 172.21.0.16 dstport 4789 nolearning
ip addr add 10.0.1.250/32 dev vxlan.1
ip link set vxlan.1 up

[root@master vxlan]# ./add-vxlan.sh 
[root@master vxlan]# ifconfig vxlan.1
vxlan.1: flags=4163<UP,BROADCAST,RUNNING,MULTICAST>  mtu 1450
        inet 10.0.1.250  netmask 255.255.255.255  broadcast 0.0.0.0
        inet6 fe80::c0d4:cfff:feb5:8612  prefixlen 64  scopeid 0x20<link>
        ether c2:d4:cf:b5:86:12  txqueuelen 1000  (Ethernet)
        RX packets 0  bytes 0 (0.0 B)
        RX errors 0  dropped 0  overruns 0  frame 0
        TX packets 0  bytes 0 (0.0 B)
        TX errors 0  dropped 6 overruns 0  carrier 0  collisions 0

[root@master vxlan]# 

2.2 worker(172.21.0.12)

[root@worker vxlan]# cat add-vxlan.sh 
ip link delete vxlan.1

ip link add vxlan.1 type vxlan id 1 dev eth0 local 172.21.0.12 dstport 4789 nolearning
ip addr add 10.0.2.250/32 dev vxlan.1
ip link set vxlan.1 up

[root@worker vxlan]# ./add-vxlan.sh 
[root@worker vxlan]# ifconfig vxlan.1
vxlan.1: flags=4163<UP,BROADCAST,RUNNING,MULTICAST>  mtu 1450
        inet 10.0.2.250  netmask 255.255.255.255  broadcast 0.0.0.0
        inet6 fe80::bcba:39ff:fe2e:a8ed  prefixlen 64  scopeid 0x20<link>
        ether be:ba:39:2e:a8:ed  txqueuelen 1000  (Ethernet)
        RX packets 0  bytes 0 (0.0 B)
        RX errors 0  dropped 0  overruns 0  frame 0
        TX packets 0  bytes 0 (0.0 B)
        TX errors 0  dropped 6 overruns 0  carrier 0  collisions 0

[root@worker vxlan]# 

3. 增加fdb, neighbors 和 route

3.1 master(172.21.0.16)

需要知道worker(172.21.0.12)上vxlan.1的mac地址(be:ba:39:2e:a8:ed)以及vxlan.1的ip地址(10.0.2.250/32).

[root@master vxlan]# cat add-fdb-arp-route.sh 
#ip route add 10.0.2.0/24 dev vxlan.1 onlink
ip route add 10.0.2.0/24 via 10.0.2.250 dev vxlan.1 onlink
bridge fdb add $1 dev vxlan.1 dst 172.21.0.12
ip neighbor add 10.0.2.250 lladdr $1 dev vxlan.1

[root@master vxlan]# ./add-fdb-arp-route.sh be:ba:39:2e:a8:ed
[root@master vxlan]# route -n
Kernel IP routing table
Destination     Gateway         Genmask         Flags Metric Ref    Use Iface
0.0.0.0         172.21.0.1      0.0.0.0         UG    0      0        0 eth0
10.0.2.0        10.0.2.250      255.255.255.0   UG    0      0        0 vxlan.1
169.254.0.0     0.0.0.0         255.255.0.0     U     1002   0        0 eth0
172.21.0.0      0.0.0.0         255.255.240.0   U     0      0        0 eth0
[root@master vxlan]# bridge fdb show
...
be:ba:39:2e:a8:ed dev vxlan.1 dst 172.21.0.12 self permanent
...
[root@master vxlan]# ip neighbor show
...
10.0.2.250 dev vxlan.1 lladdr be:ba:39:2e:a8:ed PERMANENT
...
[root@master vxlan]# 

3.2 worker(172.21.0.12)

需要知道master(172.21.0.16)上vxlan.1的mac地址(c2:d4:cf:b5:86:12)以及vxlan.1的ip地址(10.0.1.250/32).

[root@worker vxlan]# cat add-fdb-arp-route.sh 
ip route add 10.0.1.0/24 via 10.0.1.250 dev vxlan.1 onlink
bridge fdb add $1 dev vxlan.1 dst 172.21.0.16
ip neighbor add 10.0.1.250 lladdr $1 dev vxlan.1

[root@worker vxlan]# ./add-fdb-arp-route.sh c2:d4:cf:b5:86:12
[root@worker vxlan]# route -n
Kernel IP routing table
Destination     Gateway         Genmask         Flags Metric Ref    Use Iface
0.0.0.0         172.21.0.1      0.0.0.0         UG    0      0        0 eth0
10.0.1.0        10.0.1.250      255.255.255.0   UG    0      0        0 vxlan.1
169.254.0.0     0.0.0.0         255.255.0.0     U     1002   0        0 eth0
172.21.0.0      0.0.0.0         255.255.240.0   U     0      0        0 eth0
[root@worker vxlan]# bridge fdb show
...
c2:d4:cf:b5:86:12 dev vxlan.1 dst 172.21.0.16 self permanent
...
[root@worker vxlan]# ip neighbor show
...
10.0.1.250 dev vxlan.1 lladdr c2:d4:cf:b5:86:12 PERMANENT
...
[root@worker vxlan]# 

3.3 测试vxlan之间相互访问

===> master vxlan.1 -> worker vxlan.1
[root@master vxlan]# ping -c 1 10.0.2.250
PING 10.0.2.250 (10.0.2.250) 56(84) bytes of data.
64 bytes from 10.0.2.250: icmp_seq=1 ttl=64 time=0.454 ms

--- 10.0.2.250 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.454/0.454/0.454/0.000 ms
[root@master vxlan]# 

===> worker vxlan.1 -> master vxlan.1
[root@worker vxlan]# ping -c 1 10.0.1.250
PING 10.0.1.250 (10.0.1.250) 56(84) bytes of data.
64 bytes from 10.0.1.250: icmp_seq=1 ttl=64 time=0.437 ms

--- 10.0.1.250 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.437/0.437/0.437/0.000 ms
[root@worker vxlan]# 

可以看到两个vxlan设备已经可以相互访问.

4. 增加network namespace (模拟docker)

4.1 master(172.21.0.16)

[root@master vxlan]# ip netns ls
[root@master vxlan]# cat add-ns.sh 
ip link delete veth1 type veth
ip netns delete ns1
ip link delete docker0 type bridge 
iptables -t nat -F
iptables -F

ip link add veth1 type veth peer name veth2
ip link set veth1 up
ip link add docker0 type bridge
ifconfig docker0 10.0.1.1/24
#brctl addif docker0 veth1
ip link set veth1 master docker0
ip netns add ns1
ip link set veth2 netns ns1

ip netns exec ns1 ip addr add 10.0.1.2/24 dev veth2
ip netns exec ns1 ip link set lo up
ip netns exec ns1 ip link set veth2 up
ip netns exec ns1 route add default gw 10.0.1.1

iptables -P FORWARD ACCEPT
iptables -t nat -A POSTROUTING -s 10.0.1.0/24 -o eth0 -j MASQUERADE
iptables -t filter -A FORWARD -s 10.0.0.0/16 -j ACCEPT
iptables -t filter -A FORWARD -d 10.0.0.0/16 -j ACCEPT

[root@master vxlan]# ./add-ns.sh 
Cannot find device "docker0"
[root@master vxlan]# ./add-ns.sh 
[root@master vxlan]# ip netns ls
ns1 (id: 0)
[root@master vxlan]# ip netns exec ns1 sh
sh-4.2# ifconfig
lo: flags=73<UP,LOOPBACK,RUNNING>  mtu 65536
        inet 127.0.0.1  netmask 255.0.0.0
        inet6 ::1  prefixlen 128  scopeid 0x10<host>
        loop  txqueuelen 1000  (Local Loopback)
        RX packets 0  bytes 0 (0.0 B)
        RX errors 0  dropped 0  overruns 0  frame 0
        TX packets 0  bytes 0 (0.0 B)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0

veth2: flags=4163<UP,BROADCAST,RUNNING,MULTICAST>  mtu 1500
        inet 10.0.1.2  netmask 255.255.255.0  broadcast 0.0.0.0
        inet6 fe80::48e8:88ff:fe95:945c  prefixlen 64  scopeid 0x20<link>
        ether 4a:e8:88:95:94:5c  txqueuelen 1000  (Ethernet)
        RX packets 14  bytes 1116 (1.0 KiB)
        RX errors 0  dropped 0  overruns 0  frame 0
        TX packets 8  bytes 648 (648.0 B)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0
sh-4.2# route -n
Kernel IP routing table
Destination     Gateway         Genmask         Flags Metric Ref    Use Iface
0.0.0.0         10.0.1.1        0.0.0.0         UG    0      0        0 veth2
10.0.1.0        0.0.0.0         255.255.255.0   U     0      0        0 veth2
sh-4.2# 

可以看到network namespace已经创建成功了, 并且相关配置已经设置完成.

4.2 worker(172.21.0.12)

[root@worker vxlan]# ip netns ls
[root@worker vxlan]# cat add-ns.sh
ip link delete veth1 type veth
ip netns delete ns1
ip link delete docker0 type bridge 
iptables -t nat -F
iptables -F

ip link add veth1 type veth peer name veth2
ip link set veth1 up
ip link add docker0 type bridge
ifconfig docker0 10.0.2.1/24
#brctl addif docker0 veth1
ip link set veth1 master docker0
ip netns add ns1
ip link set veth2 netns ns1

ip netns exec ns1 ip addr add 10.0.2.2/24 dev veth2
ip netns exec ns1 ip link set lo up
ip netns exec ns1 ip link set veth2 up
ip netns exec ns1 route add default gw 10.0.2.1

iptables -P FORWARD ACCEPT
iptables -t nat -A POSTROUTING -s 10.0.2.0/24 -o eth0 -j MASQUERADE
iptables -t filter -A FORWARD -s 10.0.0.0/16 -j ACCEPT
iptables -t filter -A FORWARD -d 10.0.0.0/16 -j ACCEPT

[root@worker vxlan]# ./add-ns.sh 
Cannot find device "veth1"
Cannot remove namespace file "/var/run/netns/ns1": No such file or directory
Cannot find device "docker0"
[root@worker vxlan]# ./add-ns.sh 
[root@worker vxlan]# ip netns ls
ns1 (id: 1)
[root@worker vxlan]# ip netns exec ns1 sh
sh-4.2# ifconfig 
lo: flags=73<UP,LOOPBACK,RUNNING>  mtu 65536
        inet 127.0.0.1  netmask 255.0.0.0
        inet6 ::1  prefixlen 128  scopeid 0x10<host>
        loop  txqueuelen 1000  (Local Loopback)
        RX packets 0  bytes 0 (0.0 B)
        RX errors 0  dropped 0  overruns 0  frame 0
        TX packets 0  bytes 0 (0.0 B)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0

veth2: flags=4163<UP,BROADCAST,RUNNING,MULTICAST>  mtu 1500
        inet 10.0.2.2  netmask 255.255.255.0  broadcast 0.0.0.0
        inet6 fe80::6c11:71ff:feb8:3a6c  prefixlen 64  scopeid 0x20<link>
        ether 6e:11:71:b8:3a:6c  txqueuelen 1000  (Ethernet)
        RX packets 15  bytes 1206 (1.1 KiB)
        RX errors 0  dropped 0  overruns 0  frame 0
        TX packets 8  bytes 648 (648.0 B)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0

sh-4.2# route -n
Kernel IP routing table
Destination     Gateway         Genmask         Flags Metric Ref    Use Iface
0.0.0.0         10.0.2.1        0.0.0.0         UG    0      0        0 veth2
10.0.2.0        0.0.0.0         255.255.255.0   U     0      0        0 veth2
sh-4.2# 

worker中模拟的容器也创建成功了.

4.3 测试跨主机访问

ns1.png

master中的容器访问worker中的容器, docker0, vxlan.1 以及主机.

[root@master vxlan]# ip netns exec ns1 sh
===> 访问worker 的ns1
sh-4.2# ping -c 1 10.0.2.2
PING 10.0.2.2 (10.0.2.2) 56(84) bytes of data.
64 bytes from 10.0.2.2: icmp_seq=1 ttl=62 time=0.411 ms

--- 10.0.2.2 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.411/0.411/0.411/0.000 ms

===> 访问worker 的docker0
sh-4.2# ping -c 1 10.0.2.1
PING 10.0.2.1 (10.0.2.1) 56(84) bytes of data.
64 bytes from 10.0.2.1: icmp_seq=1 ttl=63 time=0.389 ms

--- 10.0.2.1 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.389/0.389/0.389/0.000 ms

===> 访问worker 的vxlan.1
sh-4.2# ping -c 1 10.0.2.250
PING 10.0.2.250 (10.0.2.250) 56(84) bytes of data.
64 bytes from 10.0.2.250: icmp_seq=1 ttl=63 time=0.394 ms

--- 10.0.2.250 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.394/0.394/0.394/0.000 ms

===> 访问worker
sh-4.2# ping -c 1 172.21.0.12
PING 172.21.0.12 (172.21.0.12) 56(84) bytes of data.
64 bytes from 172.21.0.12: icmp_seq=1 ttl=63 time=0.351 ms

--- 172.21.0.12 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.351/0.351/0.351/0.000 ms

worker容器访问master中的容器, docker0, vxlan.1 以及主机.

[root@worker vxlan]# ip netns exec ns1 sh
===> 访问master的ns1
sh-4.2# ping -c 1 10.0.1.2
PING 10.0.1.2 (10.0.1.2) 56(84) bytes of data.
64 bytes from 10.0.1.2: icmp_seq=1 ttl=62 time=0.449 ms

--- 10.0.1.2 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.449/0.449/0.449/0.000 ms

===> 访问master的docker0
sh-4.2# ping -c 1 10.0.1.1
PING 10.0.1.1 (10.0.1.1) 56(84) bytes of data.
64 bytes from 10.0.1.1: icmp_seq=1 ttl=63 time=0.408 ms

--- 10.0.1.1 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.408/0.408/0.408/0.000 ms

===> 访问master的vxlan.1
sh-4.2# ping -c 1 10.0.1.250
PING 10.0.1.250 (10.0.1.250) 56(84) bytes of data.
64 bytes from 10.0.1.250: icmp_seq=1 ttl=63 time=0.409 ms

--- 10.0.1.250 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.409/0.409/0.409/0.000 ms

===> 访问master
sh-4.2# ping -c 1 172.21.0.16
PING 172.21.0.16 (172.21.0.16) 56(84) bytes of data.
64 bytes from 172.21.0.16: icmp_seq=1 ttl=63 time=0.348 ms

--- 172.21.0.16 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.348/0.348/0.348/0.000 ms

可以看到两个容器(network namespace)已经实现了跨主机相互访问.

5. 在master中增加一个network namespace

ns-2.png
[root@master vxlan]# cat add-another-ns.sh 
ip link delete veth5 type veth
ip netns delete ns2

ip link add veth5 type veth peer name veth6
ip link set veth5 up
ip link set veth5 master docker0
ip netns add ns2
ip link set veth6 netns ns2

ip netns exec ns2 ip addr add 10.0.1.3/24 dev veth6
ip netns exec ns2 ip link set lo up
ip netns exec ns2 ip link set veth6 up
ip netns exec ns2 route add default gw 10.0.1.1


[root@master vxlan]# ./add-another-ns.sh 
Cannot find device "veth5"
Cannot remove namespace file "/var/run/netns/ns2": No such file or directory
[root@master vxlan]# ./add-another-ns.sh 
[root@master vxlan]# ip netns ls
ns2 (id: 1)
ns1 (id: 0)
[root@master vxlan]# 
[root@master vxlan]# ip netns exec ns2 sh
sh-4.2# ifconfig
lo: flags=73<UP,LOOPBACK,RUNNING>  mtu 65536
        inet 127.0.0.1  netmask 255.0.0.0
        inet6 ::1  prefixlen 128  scopeid 0x10<host>
        loop  txqueuelen 1000  (Local Loopback)
        RX packets 0  bytes 0 (0.0 B)
        RX errors 0  dropped 0  overruns 0  frame 0
        TX packets 0  bytes 0 (0.0 B)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0

veth6: flags=4163<UP,BROADCAST,RUNNING,MULTICAST>  mtu 1500
        inet 10.0.1.3  netmask 255.255.255.0  broadcast 0.0.0.0
        inet6 fe80::5ca9:72ff:fe81:24d3  prefixlen 64  scopeid 0x20<link>
        ether 5e:a9:72:81:24:d3  txqueuelen 1000  (Ethernet)
        RX packets 8  bytes 648 (648.0 B)
        RX errors 0  dropped 0  overruns 0  frame 0
        TX packets 8  bytes 648 (648.0 B)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0

sh-4.2# route -n
Kernel IP routing table
Destination     Gateway         Genmask         Flags Metric Ref    Use Iface
0.0.0.0         10.0.1.1        0.0.0.0         UG    0      0        0 veth6
10.0.1.0        0.0.0.0         255.255.255.0   U     0      0        0 veth6
===> 访问本机ns1
sh-4.2# ping -c 1 10.0.1.2
PING 10.0.1.2 (10.0.1.2) 56(84) bytes of data.
64 bytes from 10.0.1.2: icmp_seq=1 ttl=64 time=0.071 ms

--- 10.0.1.2 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.071/0.071/0.071/0.000 ms

===> 访问本机docker0
sh-4.2# ping -c 1 10.0.1.1
PING 10.0.1.1 (10.0.1.1) 56(84) bytes of data.
64 bytes from 10.0.1.1: icmp_seq=1 ttl=64 time=0.067 ms

--- 10.0.1.1 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.067/0.067/0.067/0.000 ms

===> 访问本机vxlan.1
sh-4.2# ping -c 1 10.0.1.250
PING 10.0.1.250 (10.0.1.250) 56(84) bytes of data.
64 bytes from 10.0.1.250: icmp_seq=1 ttl=64 time=0.066 ms

--- 10.0.1.250 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.066/0.066/0.066/0.000 ms

===> 访问本机
sh-4.2# ping -c 1 172.21.0.16
PING 172.21.0.16 (172.21.0.16) 56(84) bytes of data.
64 bytes from 172.21.0.16: icmp_seq=1 ttl=64 time=0.044 ms

--- 172.21.0.16 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.044/0.044/0.044/0.000 ms

===> 访问互联网
sh-4.2# ping -c 1 www.baidu.com
PING www.a.shifen.com (220.181.38.149) 56(84) bytes of data.
64 bytes from 220.181.38.149 (220.181.38.149): icmp_seq=1 ttl=249 time=6.13 ms

--- www.a.shifen.com ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 6.132/6.132/6.132/0.000 ms
sh-4.2# exit
exit
[root@master vxlan]# 

可以看到主机内部的容器(network namespace)之间已经可以相互访问, 并且可以访问本机以及互联网.

你可能感兴趣的:([docker 网络][flannel] 背后操作)