上一篇文章分析了kubelet创建pod时首先需要创建一个sandbox容器,该容器保证了k8s的pod中多个容器使用同一个网络命名空间,每个容器能够像访问本地端口一样访问对端容器端口。虽然sandbox的创建流程和运行时参数配置的代码我们都一一分析过了,实际的容器网络也是调用cni插件配置,但是cni插件是怎么工作的呢,这一节我们着重从cni(以caliclo为例)插件一端分析网络配置过程。
cni插件其实是一个二进制执行文件,kubelet在经过各种逻辑生成配置参数,然后根据配置配置参数去执行cni插件,我们看下这些参数是哪些
prevResult, err = invoke.ExecPluginWithResult(pluginPath, newConf.Bytes, c.args("ADD", rt))
// 其中,rt的内容为
rt := &libcni.RuntimeConf{
ContainerID: podSandboxID.ID,
NetNS: podNetnsPath,
IfName: network.DefaultInterfaceName,
Args: [][2]string{
{"IgnoreUnknown", "1"},
{"K8S_POD_NAMESPACE", podNs},
{"K8S_POD_NAME", podName},
{"K8S_POD_INFRA_CONTAINER_ID", podSandboxID.ID},
},
}
// newConf.Bytes是cni插件的配置文件,以我的集群为例
{
"name": "k8s-pod-network",
"cniVersion": "0.1.0",
"type": "calico",
"etcd_endpoints": "http://127.0.0.1:2379",
"etcd_key_file": "",
"etcd_cert_file": "",
"etcd_ca_cert_file": "",
"log_level": "debug",
"mtu": 1500,
"ipam": {
"type": "calico-ipam"
},
"policy": {
"type": "k8s",
"k8s_api_root": "https://10.96.0.1:443",
"k8s_auth_token": "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJrdWJlcm5ldGVzL3NlcnZpY2VhY2NvdW50Iiwia3ViZXJuZXRlcy5pby9zZXJ2aWNlYWNjb3VudC9uYW1lc3BhY2UiOiJrdWJlLXN5c3RlbSIsImt1YmVybmV0ZXMuaW8vc2VydmljZWFjY291bnQvc2VjcmV0Lm5hbWUiOiJjYWxpY28tbm9kZS10b2tlbi14cTQ1dyIsImt1YmVybmV0ZXMuaW8vc2VydmljZWFjY291bnQvc2VydmljZS1hY2NvdW50Lm5hbWUiOiJjYWxpY28tbm9kZSIsImt1YmVybmV0ZXMuaW8vc2VydmljZWFjY291bnQvc2VydmljZS1hY2NvdW50LnVpZCI6ImMwZDI5NDQyLWExMGUtMTFlOC04NGM4LTAwMGMyOWUwYjU0OSIsInN1YiI6InN5c3RlbTpzZXJ2aWNlYWNjb3VudDprdWJlLXN5c3RlbTpjYWxpY28tbm9kZSJ9.iT1WQTKMrJaDjM_cBTi4IO6m2Zx566wlyQm2N0CcCLNWmBwRfO4FKhuRTCp5yd9gNTBZlosdByovIeQJLBPXfUqcFOIZ6he4or1fVnjgBOIqy1G2zii7X0KrVMHEizwwS9sz44ielsjxD-BIbdwxaXv0U9yjaB1TH9Fp8LdmNtdMUs1UrrimODUqG4QAFcaGA9UBAfgchsx6pPmRNJ2Jft79W6kXv-BCK6vR434UeobfxM5k8rj4rXhCzLjQt8iIKfgOFYlzehH-ZWFjNETUxrRoyRDqvBvxQ9Sh9Bh8dC5ODO34acZPAKy1GiP2S-9fP1P8RwKP-hmw4em1RC4UdA"
},
"kubernetes": {
"kubeconfig": "/etc/cni/net.d/calico-kubeconfig"
}
}
该部分在第一节已经详细介绍,如有疑问可以查看第一节的内容。
带着这些信息,我们开始看calico插件的网络配置流程(只分析基于k8s的容器网络配置),先看calico的main函数(cni版本1.11.6)
func main() {
// Set up logging formatting.
log.SetFormatter(&logutils.Formatter{})
// Install a hook that adds file/line no information.
log.AddHook(&logutils.ContextHook{})
// Display the version on "-v", otherwise just delegate to the skel code.
// Use a new flag set so as not to conflict with existing libraries which use "flag"
flagSet := flag.NewFlagSet("Calico", flag.ExitOnError)
version := flagSet.Bool("v", false, "Display version")
err := flagSet.Parse(os.Args[1:])
if err != nil {
fmt.Println(err)
os.Exit(1)
}
if *version {
fmt.Println(VERSION)
os.Exit(0)
}
if err := AddIgnoreUnknownArgs(); err != nil {
os.Exit(1)
}
skel.PluginMain(cmdAdd, nil, cmdDel, cniSpecVersion.All, "")
}
PluginMain是一个委托函数,参数的接收为cmdAdd,cmdGet,cmdDel以及cni支持版本等。在PluginMain函数里我们看到其实它调用的是PluginMainWithError,而逻辑主要是dispatcher下的pluginMain函数
func PluginMain(cmdAdd, cmdGet, cmdDel func(_ *CmdArgs) error, versionInfo version.PluginInfo, about string) {
if e := PluginMainWithError(cmdAdd, cmdGet, cmdDel, versionInfo, about); e != nil {
if err := e.Print(); err != nil {
log.Print("Error writing error JSON to stdout: ", err)
}
os.Exit(1)
}
}
func PluginMainWithError(cmdAdd, cmdGet, cmdDel func(_ *CmdArgs) error, versionInfo version.PluginInfo, about string) *types.Error {
return (&dispatcher{
Getenv: os.Getenv,
Stdin: os.Stdin,
Stdout: os.Stdout,
Stderr: os.Stderr,
}).pluginMain(cmdAdd, cmdGet, cmdDel, versionInfo, about)
}
dispatcher从系统的环境变量和输入接收参数,同时设置系统的输出和错误,dispatcher的pluginMain函数如下
func (t *dispatcher) pluginMain(cmdAdd, cmdGet, cmdDel func(_ *CmdArgs) error, versionInfo version.PluginInfo, about string) *types.Error {
cmd, cmdArgs, err := t.getCmdArgsFromEnv()
if err != nil {
// Print the about string to stderr when no command is set
if _, ok := err.(missingEnvError); ok && t.Getenv("CNI_COMMAND") == "" && about != "" {
fmt.Fprintln(t.Stderr, about)
return nil
}
return createTypedError(err.Error())
}
if cmd != "VERSION" {
err = validateConfig(cmdArgs.StdinData)
if err != nil {
return createTypedError(err.Error())
}
}
switch cmd {
case "ADD":
err = t.checkVersionAndCall(cmdArgs, versionInfo, cmdAdd)
case "GET":
configVersion, err := t.ConfVersionDecoder.Decode(cmdArgs.StdinData)
if err != nil {
return createTypedError(err.Error())
}
if gtet, err := version.GreaterThanOrEqualTo(configVersion, "0.4.0"); err != nil {
return createTypedError(err.Error())
} else if !gtet {
return &types.Error{
Code: types.ErrIncompatibleCNIVersion,
Msg: "config version does not allow GET",
}
}
for _, pluginVersion := range versionInfo.SupportedVersions() {
gtet, err := version.GreaterThanOrEqualTo(pluginVersion, configVersion)
if err != nil {
return createTypedError(err.Error())
} else if gtet {
if err := t.checkVersionAndCall(cmdArgs, versionInfo, cmdGet); err != nil {
return createTypedError(err.Error())
}
return nil
}
}
return &types.Error{
Code: types.ErrIncompatibleCNIVersion,
Msg: "plugin version does not allow GET",
}
case "DEL":
err = t.checkVersionAndCall(cmdArgs, versionInfo, cmdDel)
case "VERSION":
err = versionInfo.Encode(t.Stdout)
default:
return createTypedError("unknown CNI_COMMAND: %v", cmd)
}
if err != nil {
if e, ok := err.(*types.Error); ok {
// don't wrap Error in Error
return e
}
return createTypedError(err.Error())
}
return nil
}
从这个函数,我们可以看出,calico接收ADD、GET、DEL、VERSION等命令操作,包含添加容器网络、清除容器网络和查看cni版本等。
既然我们主要谈的是容器网络创建流程,那我们这里主要分析ADD命令,查看checkVersionAndCall函数
func (t *dispatcher) checkVersionAndCall(cmdArgs *CmdArgs, pluginVersionInfo version.PluginInfo, toCall func(*CmdArgs) error) error {
configVersion, err := t.ConfVersionDecoder.Decode(cmdArgs.StdinData)
if err != nil {
return err
}
verErr := t.VersionReconciler.Check(configVersion, pluginVersionInfo)
if verErr != nil {
return &types.Error{
Code: types.ErrIncompatibleCNIVersion,
Msg: "incompatible CNI versions",
Details: verErr.Details(),
}
}
return toCall(cmdArgs)
}
首先通过cmdArgs的StdinData参数获取cni版本,然后通过Check函数检查所获取的cni版本是否是支持(以下版本”0.1.0”, “0.2.0”, “0.3.0”, “0.3.1”, “0.4.0”)的。如果检查通过,则执行toCall函数,这个toCall函数就是cmdAdd,如果我们回到main函数查看,cmdAdd就在main.go里,这里通过回调运行cmdAdd函数。
func cmdAdd(args *skel.CmdArgs) error {
// Unmarshal the network config, and perform validation
conf := NetConf{}
if err := json.Unmarshal(args.StdinData, &conf); err != nil {
return fmt.Errorf("failed to load netconf: %v", err)
}
cniVersion := conf.CNIVersion
ConfigureLogging(conf.LogLevel)
workload, orchestrator, err := GetIdentifiers(args)
if err != nil {
return err
}
logger := CreateContextLogger(workload)
// Allow the nodename to be overridden by the network config
updateNodename(conf, logger)
logger.WithFields(log.Fields{
"Orchestrator": orchestrator,
"Node": nodename,
"Workload": workload,
"ContainerID": args.ContainerID,
}).Info("Extracted identifiers")
logger.WithFields(log.Fields{"NetConfg": conf}).Info("Loaded CNI NetConf")
calicoClient, err := CreateClient(conf)
if err != nil {
return err
}
ready, err := IsReady(calicoClient)
if err != nil {
return err
}
if !ready {
logger.Warn("Upgrade may be in progress, ready flag is not set")
return fmt.Errorf("Calico is currently not ready to process requests")
}
// Always check if there's an existing endpoint.
endpoints, err := calicoClient.WorkloadEndpoints().List(api.WorkloadEndpointMetadata{
Node: nodename,
Orchestrator: orchestrator,
Workload: workload})
if err != nil {
return err
}
logger.Debugf("Retrieved endpoints: %v", endpoints)
var endpoint *api.WorkloadEndpoint
if len(endpoints.Items) == 1 {
endpoint = &endpoints.Items[0]
}
fmt.Fprintf(os.Stderr, "Calico CNI checking for existing endpoint: %v\n", endpoint)
// Collect the result in this variable - this is ultimately what gets "returned" by this function by printing
// it to stdout.
var result *current.Result
// If running under Kubernetes then branch off into the kubernetes code, otherwise handle everything in this
// function.
if orchestrator == "k8s" {
if result, err = k8s.CmdAddK8s(args, conf, nodename, calicoClient, endpoint); err != nil {
return err
}
} else {
...
}
// Handle profile creation - this is only done if there isn't a specific policy handler.
if conf.Policy.PolicyType == "" {
logger.Debug("Handling profiles")
// Start by checking if the profile already exists. If it already exists then there is no work to do.
// The CNI plugin never updates a profile.
exists := true
_, err = calicoClient.Profiles().Get(api.ProfileMetadata{Name: conf.Name})
if err != nil {
_, ok := err.(errors.ErrorResourceDoesNotExist)
if ok {
exists = false
} else {
// Cleanup IP allocation and return the error.
ReleaseIPAllocation(logger, conf.IPAM.Type, args.StdinData)
return err
}
}
if !exists {
// The profile doesn't exist so needs to be created. The rules vary depending on whether k8s is being used.
// Under k8s (without full policy support) the rule is permissive and allows all traffic.
// Otherwise, incoming traffic is only allowed from profiles with the same tag.
fmt.Fprintf(os.Stderr, "Calico CNI creating profile: %s\n", conf.Name)
var inboundRules []api.Rule
if orchestrator == "k8s" {
inboundRules = []api.Rule{{Action: "allow"}}
} else {
inboundRules = []api.Rule{{Action: "allow", Source: api.EntityRule{Tag: conf.Name}}}
}
profile := &api.Profile{
Metadata: api.ProfileMetadata{
Name: conf.Name,
Tags: []string{conf.Name},
},
Spec: api.ProfileSpec{
EgressRules: []api.Rule{{Action: "allow"}},
IngressRules: inboundRules,
},
}
logger.WithField("profile", profile).Info("Creating profile")
if _, err := calicoClient.Profiles().Create(profile); err != nil {
// Cleanup IP allocation and return the error.
ReleaseIPAllocation(logger, conf.IPAM.Type, args.StdinData)
return err
}
}
}
// Set Gateway to nil. Calico-IPAM doesn't set it, but host-local does.
// We modify IPs subnet received from the IPAM plugin (host-local),
// so Gateway isn't valid anymore. It is also not used anywhere by Calico.
for _, ip := range result.IPs {
ip.Gateway = nil
}
// Print result to stdout, in the format defined by the requested cniVersion.
return types.PrintResult(result, cniVersion)
为了分析清楚,还是先给出skel.CmdArgs和NetConf的数据结构,NetConf可以看作是calico config的配置文件。
// CmdArgs captures all the arguments passed in to the plugin
// via both env vars and stdin
type CmdArgs struct {
ContainerID string
Netns string
IfName string
Args string
Path string
StdinData []byte
}
// NetConf stores the common network config for Calico CNI plugin
type NetConf struct {
CNIVersion string `json:"cniVersion,omitempty"`
Name string `json:"name"`
Type string `json:"type"`
IPAM struct {
Name string
Type string `json:"type"`
Subnet string `json:"subnet"`
AssignIpv4 *string `json:"assign_ipv4"`
AssignIpv6 *string `json:"assign_ipv6"`
IPv4Pools []string `json:"ipv4_pools,omitempty"`
IPv6Pools []string `json:"ipv6_pools,omitempty"`
} `json:"ipam,omitempty"`
MTU int `json:"mtu"`
Hostname string `json:"hostname"`
Nodename string `json:"nodename"`
DatastoreType string `json:"datastore_type"`
EtcdAuthority string `json:"etcd_authority"`
EtcdEndpoints string `json:"etcd_endpoints"`
LogLevel string `json:"log_level"`
Policy Policy `json:"policy"`
Kubernetes Kubernetes `json:"kubernetes"`
Args Args `json:"args"`
EtcdScheme string `json:"etcd_scheme"`
EtcdKeyFile string `json:"etcd_key_file"`
EtcdCertFile string `json:"etcd_cert_file"`
EtcdCaCertFile string `json:"etcd_ca_cert_file"`
}
在cmdAdd函数中,通过json序列化args的StdinData(可通过上述的calico配置文件加强理解)参数到conf中,GetIdentifiers返回workloadID和orchestratorID。workloadID格式为pod的namespace.name,orchestratorID为”k8s”。因为我的环境是k8s,所以我主要关注CmdAddK8s函数,这里屏蔽掉我不关心的IPAM为host-local部分代码
func CmdAddK8s(args *skel.CmdArgs, conf utils.NetConf, nodename string, calicoClient *calicoclient.Client, endpoint *api.WorkloadEndpoint) (*current.Result, error) {
var err error
var result *current.Result
k8sArgs := utils.K8sArgs{}
err = types.LoadArgs(args.Args, &k8sArgs)
if err != nil {
return nil, err
}
utils.ConfigureLogging(conf.LogLevel)
workload, orchestrator, err := utils.GetIdentifiers(args)
if err != nil {
return nil, err
}
logger := utils.CreateContextLogger(workload)
logger.WithFields(log.Fields{
"Orchestrator": orchestrator,
"Node": nodename,
}).Info("Extracted identifiers for CmdAddK8s")
endpointAlreadyExisted := endpoint != nil
if endpointAlreadyExisted {
// This happens when Docker or the node restarts. K8s calls CNI with the same parameters as before.
// Do the networking (since the network namespace was destroyed and recreated).
// There's an existing endpoint - no need to create another. Find the IP address from the endpoint
// and use that in the response.
result, err = utils.CreateResultFromEndpoint(endpoint)
if err != nil {
return nil, err
}
logger.WithField("result", result).Debug("Created result from existing endpoint")
// If any labels changed whilst the container was being restarted, they will be picked up by the policy
// controller so there's no need to update the labels here.
} else {
client, err := newK8sClient(conf, logger)
if err != nil {
return nil, err
}
logger.WithField("client", client).Debug("Created Kubernetes client")
if conf.IPAM.Type == "host-local" && strings.EqualFold(conf.IPAM.Subnet, "usePodCidr") {
...
}
labels := make(map[string]string)
annot := make(map[string]string)
// Only attempt to fetch the labels and annotations from Kubernetes
// if the policy type has been set to "k8s". This allows users to
// run the plugin under Kubernetes without needing it to access the
// Kubernetes API
if conf.Policy.PolicyType == "k8s" {
var err error
labels, annot, err = getK8sLabelsAnnotations(client, k8sArgs)
if err != nil {
return nil, err
}
logger.WithField("labels", labels).Debug("Fetched K8s labels")
logger.WithField("annotations", annot).Debug("Fetched K8s annotations")
// Check for calico IPAM specific annotations and set them if needed.
if conf.IPAM.Type == "calico-ipam" {
v4pools := annot["cni.projectcalico.org/ipv4pools"]
v6pools := annot["cni.projectcalico.org/ipv6pools"]
if len(v4pools) != 0 || len(v6pools) != 0 {
var stdinData map[string]interface{}
if err := json.Unmarshal(args.StdinData, &stdinData); err != nil {
return nil, err
}
var v4PoolSlice, v6PoolSlice []string
if len(v4pools) > 0 {
if err := json.Unmarshal([]byte(v4pools), &v4PoolSlice); err != nil {
logger.WithField("IPv4Pool", v4pools).Error("Error parsing IPv4 IPPools")
return nil, err
}
if _, ok := stdinData["ipam"].(map[string]interface{}); !ok {
logger.Fatal("Error asserting stdinData type")
os.Exit(0)
}
stdinData["ipam"].(map[string]interface{})["ipv4_pools"] = v4PoolSlice
logger.WithField("ipv4_pools", v4pools).Debug("Setting IPv4 Pools")
}
if len(v6pools) > 0 {
if err := json.Unmarshal([]byte(v6pools), &v6PoolSlice); err != nil {
logger.WithField("IPv6Pool", v6pools).Error("Error parsing IPv6 IPPools")
return nil, err
}
if _, ok := stdinData["ipam"].(map[string]interface{}); !ok {
logger.Fatal("Error asserting stdinData type")
os.Exit(0)
}
stdinData["ipam"].(map[string]interface{})["ipv6_pools"] = v6PoolSlice
logger.WithField("ipv6_pools", v6pools).Debug("Setting IPv6 Pools")
}
newData, err := json.Marshal(stdinData)
if err != nil {
logger.WithField("stdinData", stdinData).Error("Error Marshaling data")
return nil, err
}
args.StdinData = newData
logger.WithField("stdin", string(args.StdinData)).Debug("Updated stdin data")
}
}
}
ipAddrsNoIpam := annot["cni.projectcalico.org/ipAddrsNoIpam"]
ipAddrs := annot["cni.projectcalico.org/ipAddrs"]
// switch based on which annotations are passed or not passed.
switch {
case ipAddrs == "" && ipAddrsNoIpam == "":
// Call IPAM plugin if ipAddrsNoIpam or ipAddrs annotation is not present.
logger.Debugf("Calling IPAM plugin %s", conf.IPAM.Type)
ipamResult, err := ipam.ExecAdd(conf.IPAM.Type, args.StdinData)
if err != nil {
return nil, err
}
logger.Debugf("IPAM plugin returned: %+v", ipamResult)
// Convert IPAM result into current Result.
// IPAM result has a bunch of fields that are optional for an IPAM plugin
// but required for a CNI plugin, so this is to populate those fields.
// See CNI Spec doc for more details.
result, err = current.NewResultFromResult(ipamResult)
if err != nil {
utils.ReleaseIPAllocation(logger, conf.IPAM.Type, args.StdinData)
return nil, err
}
if len(result.IPs) == 0 {
utils.ReleaseIPAllocation(logger, conf.IPAM.Type, args.StdinData)
return nil, errors.New("IPAM plugin returned missing IP config")
}
case ipAddrs != "" && ipAddrsNoIpam != "":
// Can't have both ipAddrs and ipAddrsNoIpam annotations at the same time.
e := fmt.Errorf("Can't have both annotations: 'ipAddrs' and 'ipAddrsNoIpam' in use at the same time")
logger.Error(e)
return nil, e
case ipAddrsNoIpam != "":
// ipAddrsNoIpam annotation is set so bypass IPAM, and set the IPs manually.
overriddenResult, err := overrideIPAMResult(ipAddrsNoIpam, logger)
if err != nil {
return nil, err
}
logger.Debugf("Bypassing IPAM to set the result to: %+v", overriddenResult)
// Convert overridden IPAM result into current Result.
// This method fill in all the empty fields necessory for CNI output according to spec.
result, err = current.NewResultFromResult(overriddenResult)
if err != nil {
return nil, err
}
if len(result.IPs) == 0 {
return nil, errors.New("Failed to build result")
}
case ipAddrs != "":
// When ipAddrs annotation is set, we call out to the configured IPAM plugin
// requesting the specific IP addresses included in the annotation.
result, err = ipAddrsResult(ipAddrs, conf, args, logger)
if err != nil {
return nil, err
}
logger.Debugf("IPAM result set to: %+v", result)
}
// Create the endpoint object and configure it.
endpoint = api.NewWorkloadEndpoint()
endpoint.Metadata.Name = args.IfName
endpoint.Metadata.Node = nodename
endpoint.Metadata.Orchestrator = orchestrator
endpoint.Metadata.Workload = workload
endpoint.Metadata.Labels = labels
// Set the profileID according to whether Kubernetes policy is required.
// If it's not, then just use the network name (which is the normal behavior)
// otherwise use one based on the Kubernetes pod's Namespace.
if conf.Policy.PolicyType == "k8s" {
endpoint.Spec.Profiles = []string{fmt.Sprintf("k8s_ns.%s", k8sArgs.K8S_POD_NAMESPACE)}
} else {
endpoint.Spec.Profiles = []string{conf.Name}
}
// Populate the endpoint with the output from the IPAM plugin.
if err = utils.PopulateEndpointNets(endpoint, result); err != nil {
// Cleanup IP allocation and return the error.
utils.ReleaseIPAllocation(logger, conf.IPAM.Type, args.StdinData)
return nil, err
}
logger.WithField("endpoint", endpoint).Info("Populated endpoint")
}
fmt.Fprintf(os.Stderr, "Calico CNI using IPs: %s\n", endpoint.Spec.IPNetworks)
// maybeReleaseIPAM cleans up any IPAM allocations if we were creating a new endpoint;
// it is a no-op if this was a re-network of an existing endpoint.
maybeReleaseIPAM := func() {
logger.Debug("Checking if we need to clean up IPAM.")
logger := logger.WithField("IPs", endpoint.Spec.IPNetworks)
if endpointAlreadyExisted {
logger.Info("Not cleaning up IPAM allocation; this was a pre-existing endpoint.")
return
}
logger.Info("Releasing IPAM allocation after failure")
utils.ReleaseIPAllocation(logger, conf.IPAM.Type, args.StdinData)
}
// Whether the endpoint existed or not, the veth needs (re)creating.
hostVethName := k8sbackend.VethNameForWorkload(workload)
_, contVethMac, err := utils.DoNetworking(args, conf, result, logger, hostVethName)
if err != nil {
logger.WithError(err).Error("Error setting up networking")
maybeReleaseIPAM()
return nil, err
}
mac, err := net.ParseMAC(contVethMac)
if err != nil {
logger.WithError(err).WithField("mac", mac).Error("Error parsing container MAC")
maybeReleaseIPAM()
return nil, err
}
endpoint.Spec.MAC = &cnet.MAC{HardwareAddr: mac}
endpoint.Spec.InterfaceName = hostVethName
endpoint.Metadata.ActiveInstanceID = args.ContainerID
logger.WithField("endpoint", endpoint).Info("Added Mac, interface name, and active container ID to endpoint")
// Write the endpoint object (either the newly created one, or the updated one)
if _, err := calicoClient.WorkloadEndpoints().Apply(endpoint); err != nil {
logger.WithError(err).Error("Error creating/updating endpoint in datastore.")
maybeReleaseIPAM()
return nil, err
}
logger.Info("Wrote updated endpoint to datastore")
return result, nil
}
CmdAddK8s函数流程非常清晰,这里分析每个步骤的逻辑,调用GetIdentifiers获得workload和orchestrator的值(形式和上述提到的一样)。然后根据endpoint变量是否为空判断处理流程,这里直接贴出代码里的注释
// This happens when Docker or the node restarts. K8s calls CNI with the same parameters as before.
// Do the networking (since the network namespace was destroyed and recreated).
// There's an existing endpoint - no need to create another. Find the IP address from the endpoint
// and use that in the response.
如果不为空则执行CreateResultFromEndpoint函数,其实是用的之前分配的结果,这里我们先不关注,既然做为新建流程,我们就从最原始的创建开始,假设我们是在新建一个pod,那么我们现在要给这个pod配置网络,我们顺着calico的代码看需要给它配置什么。如果我们在calico的配置文件里配置conf.Policy.PolicyType为k8s,那么它会从pod中获取pod的labels和annotations,再根据配置文件的conf.IPAM.Type是否calico-ipam,然后从annotations中分别取出key为cni.projectcalico.org/ipv4pools和cni.projectcalico.org/ipv6pools(如果有)的值配置ipv4地址池和ipv6地址池。然后从annotations取出key为cni.projectcalico.org/ipAddrsNoIpam和cni.projectcalico.org/ipAddrs(如果有)的值配置ipAddrsNoIpam和ipAddrs参数。根据ipAddrsNoIpam和ipAddrs的值有下列4种情况:
我们看到,除了第2、3种情况,第1、4都使用到了calico-ipam,我们都知道在cni的定义里,ipam是在各插件中抽出来的,为了避免每个插件都分配地址,简单来讲ipam就是用来分配ip地址的,最后返回分配的结果。关于calico-ipam留到第三节再分析。
获得ipam分配的ip结果后,接下来重要的事就是将这个ip结果配置在容器的网络命名空间。
为了便于分析接下来的内容,先copy个图片上来
calico配置容器网络其实就是配置veth pair,一处在主机端,另一处在容器端,这个veth pair连接容器和主机的网络空间,主机端的veth是虚拟网卡,calico的网络走的是路由模式。关于veth pair的教程可以查阅这篇文章 :Linux-虚拟网络设备-veth pair。
继续回到源码的分析,VethNameForWorkload函数获取calico在主机端的veth名称,使用sha1算法计算workload的内容,最后截取前11个字符
// VethNameForWorkload returns a deterministic veth name
// for the given Kubernetes workload.
func VethNameForWorkload(workload string) string {
// A SHA1 is always 20 bytes long, and so is sufficient for generating the
// veth name and mac addr.
h := sha1.New()
h.Write([]byte(workload))
return fmt.Sprintf("cali%s", hex.EncodeToString(h.Sum(nil))[:11])
}
DoNetworking这里就是calico cni的核心操作了,先看DoNetworking的逻辑
// DoNetworking performs the networking for the given config and IPAM result
func DoNetworking(args *skel.CmdArgs, conf NetConf, result *current.Result, logger *log.Entry, desiredVethName string) (hostVethName, contVethMAC string, err error) {
// Select the first 11 characters of the containerID for the host veth.
hostVethName = "cali" + args.ContainerID[:Min(11, len(args.ContainerID))]
contVethName := args.IfName
var hasIPv4, hasIPv6 bool
// If a desired veth name was passed in, use that instead.
if desiredVethName != "" {
hostVethName = desiredVethName
}
// Clean up if hostVeth exists.
if oldHostVeth, err := netlink.LinkByName(hostVethName); err == nil {
if err = netlink.LinkDel(oldHostVeth); err != nil {
return "", "", fmt.Errorf("failed to delete old hostVeth %v: %v", hostVethName, err)
}
logger.Infof("cleaning old hostVeth: %v", hostVethName)
}
err = ns.WithNetNSPath(args.Netns, func(hostNS ns.NetNS) error {
veth := &netlink.Veth{
LinkAttrs: netlink.LinkAttrs{
Name: contVethName,
Flags: net.FlagUp,
MTU: conf.MTU,
},
PeerName: hostVethName,
}
if err := netlink.LinkAdd(veth); err != nil {
logger.Errorf("Error adding veth %+v: %s", veth, err)
return err
}
hostVeth, err := netlink.LinkByName(hostVethName)
if err != nil {
err = fmt.Errorf("failed to lookup %q: %v", hostVethName, err)
return err
}
if mac, err := net.ParseMAC("EE:EE:EE:EE:EE:EE"); err != nil {
logger.Infof("failed to parse MAC Address: %v. Using kernel generated MAC.", err)
} else {
// Set the MAC address on the host side interface so the kernel does not
// have to generate a persistent address which fails some times.
if err = netlink.LinkSetHardwareAddr(hostVeth, mac); err != nil {
logger.Warnf("failed to Set MAC of %q: %v. Using kernel generated MAC.", hostVethName, err)
}
}
// Explicitly set the veth to UP state, because netlink doesn't always do that on all the platforms with net.FlagUp.
// veth won't get a link local address unless it's set to UP state.
if err = netlink.LinkSetUp(hostVeth); err != nil {
return fmt.Errorf("failed to set %q up: %v", hostVethName, err)
}
contVeth, err := netlink.LinkByName(contVethName)
if err != nil {
err = fmt.Errorf("failed to lookup %q: %v", contVethName, err)
return err
}
// Fetch the MAC from the container Veth. This is needed by Calico.
contVethMAC = contVeth.Attrs().HardwareAddr.String()
logger.WithField("MAC", contVethMAC).Debug("Found MAC for container veth")
// At this point, the virtual ethernet pair has been created, and both ends have the right names.
// Both ends of the veth are still in the container's network namespace.
for _, addr := range result.IPs {
// Before returning, create the routes inside the namespace, first for IPv4 then IPv6.
if addr.Version == "4" {
// Add a connected route to a dummy next hop so that a default route can be set
gw := net.IPv4(169, 254, 1, 1)
gwNet := &net.IPNet{IP: gw, Mask: net.CIDRMask(32, 32)}
err := netlink.RouteAdd(
&netlink.Route{
LinkIndex: contVeth.Attrs().Index,
Scope: netlink.SCOPE_LINK,
Dst: gwNet,
},
)
if err != nil {
return fmt.Errorf("failed to add route inside the container: %v", err)
}
if err = ip.AddDefaultRoute(gw, contVeth); err != nil {
return fmt.Errorf("failed to add the default route inside the container: %v", err)
}
if err = netlink.AddrAdd(contVeth, &netlink.Addr{IPNet: &addr.Address}); err != nil {
return fmt.Errorf("failed to add IP addr to %q: %v", contVethName, err)
}
// Set hasIPv4 to true so sysctls for IPv4 can be programmed when the host side of
// the veth finishes moving to the host namespace.
hasIPv4 = true
}
// Handle IPv6 routes
if addr.Version == "6" {
// Make sure ipv6 is enabled in the container/pod network namespace.
// Without these sysctls enabled, interfaces will come up but they won't get a link local IPv6 address
// which is required to add the default IPv6 route.
if err = writeProcSys("/proc/sys/net/ipv6/conf/all/disable_ipv6", "0"); err != nil {
return fmt.Errorf("failed to set net.ipv6.conf.all.disable_ipv6=0: %s", err)
}
if err = writeProcSys("/proc/sys/net/ipv6/conf/default/disable_ipv6", "0"); err != nil {
return fmt.Errorf("failed to set net.ipv6.conf.default.disable_ipv6=0: %s", err)
}
if err = writeProcSys("/proc/sys/net/ipv6/conf/lo/disable_ipv6", "0"); err != nil {
return fmt.Errorf("failed to set net.ipv6.conf.lo.disable_ipv6=0: %s", err)
}
// No need to add a dummy next hop route as the host veth device will already have an IPv6
// link local address that can be used as a next hop.
// Just fetch the address of the host end of the veth and use it as the next hop.
addresses, err := netlink.AddrList(hostVeth, netlink.FAMILY_V6)
if err != nil {
logger.Errorf("Error listing IPv6 addresses for the host side of the veth pair: %s", err)
return err
}
if len(addresses) < 1 {
// If the hostVeth doesn't have an IPv6 address then this host probably doesn't
// support IPv6. Since a IPv6 address has been allocated that can't be used,
// return an error.
return fmt.Errorf("failed to get IPv6 addresses for host side of the veth pair")
}
hostIPv6Addr := addresses[0].IP
_, defNet, _ := net.ParseCIDR("::/0")
if err = ip.AddRoute(defNet, hostIPv6Addr, contVeth); err != nil {
return fmt.Errorf("failed to add IPv6 default gateway to %v %v", hostIPv6Addr, err)
}
if err = netlink.AddrAdd(contVeth, &netlink.Addr{IPNet: &addr.Address}); err != nil {
return fmt.Errorf("failed to add IPv6 addr to %q: %v", contVeth, err)
}
// Set hasIPv6 to true so sysctls for IPv6 can be programmed when the host side of
// the veth finishes moving to the host namespace.
hasIPv6 = true
}
}
// Now that the everything has been successfully set up in the container, move the "host" end of the
// veth into the host namespace.
if err = netlink.LinkSetNsFd(hostVeth, int(hostNS.Fd())); err != nil {
return fmt.Errorf("failed to move veth to host netns: %v", err)
}
return nil
})
if err != nil {
logger.Errorf("Error creating veth: %s", err)
return "", "", err
}
err = configureSysctls(hostVethName, hasIPv4, hasIPv6)
if err != nil {
return "", "", fmt.Errorf("error configuring sysctls for interface: %s, error: %s", hostVethName, err)
}
// Moving a veth between namespaces always leaves it in the "DOWN" state. Set it back to "UP" now that we're
// back in the host namespace.
hostVeth, err := netlink.LinkByName(hostVethName)
if err != nil {
return "", "", fmt.Errorf("failed to lookup %q: %v", hostVethName, err)
}
if err = netlink.LinkSetUp(hostVeth); err != nil {
return "", "", fmt.Errorf("failed to set %q up: %v", hostVethName, err)
}
// Now that the host side of the veth is moved, state set to UP, and configured with sysctls, we can add the routes to it in the host namespace.
err = SetupRoutes(hostVeth, result)
if err != nil {
return "", "", fmt.Errorf("error adding host side routes for interface: %s, error: %s", hostVeth.Attrs().Name, err)
}
return hostVethName, contVethMAC, err
}
想一想,如果是我们自己去创建一个veth pair我们的操作步骤是什么?在某个网络空间下操作(这里使用的是容器的网络空间)
对着上面几个步骤去查看DoNetworking函数,代码里的逻辑和我们的操作其实是如出一辙的,DoNetworking函数里还有个configureSysctls函数,他将给主机端的veth配置必要的sysctls。DoNetworking最后返回主机veth name和容器mac,这些信息都更新到enpoint,最后是将这个信息通过calico client更新到etcd里。
最后返回CmdAddK8s的结果,该结果就是执行calico cni二进制文件的结果,而程序运行到这里也就退出了。