Kubernetes
源码版本:remotes/origin/release-1.25
Kubernetes
编译出来的Kubelet
版本:Kubernetes v1.24.0-beta.0.2463+ee7799bab469d7
Kubernetes
集群实验环境:使用Kubernetes v1.25.4
二进制的方式搭建了一个单节点集群
K8S 单节点集群的搭建可以参考:Kubernetes v1.25 搭建单节点集群用于Debug K8S源码
Golang
版本:go1.19.3 linux/amd64
IDEA
版本:2022.2.3
Delve
版本:1.9.1
[root@k8s-master1 kubernetes]#
[root@k8s-master1 kubernetes]# dlv version
Delve Debugger
Version: 1.9.1
Build: $Id: d81b9fd12bfa603f3cf7a4bc842398bd61c42940 $
[root@k8s-master1 kubernetes]#
[root@k8s-master1 kubernetes]# go version
go version go1.19.3 linux/amd64
[root@k8s-master1 kubernetes]#
[root@k8s-master1 kubernetes]# kubectl version
WARNING: This version information is deprecated and will be replaced with the output from kubectl version --short. Use --output=yaml|json to get the full version.
Client Version: version.Info{Major:"1", Minor:"25", GitVersion:"v1.25.4", GitCommit:"872a965c6c6526caa949f0c6ac028ef7aff3fb78", GitTreeState:"clean", BuildDate:"2022-11-09T13:36:36Z", GoVersion:"go1.19.3", Compiler:"gc", Platform:"linux/amd64"}
Kustomize Version: v4.5.7
Server Version: version.Info{Major:"1", Minor:"25", GitVersion:"v1.25.4", GitCommit:"872a965c6c6526caa949f0c6ac028ef7aff3fb78", GitTreeState:"clean", BuildDate:"2022-11-09T13:29:58Z", GoVersion:"go1.19.3", Compiler:"gc", Platform:"linux/amd64"}
[root@k8s-master1 kubernetes]#
[root@k8s-master1 kubernetes]#
[root@k8s-master1 kubernetes]# kubectl get nodes -owide
NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME
k8s-master1 Ready <none> 31h v1.25.4 192.168.11.71 <none> CentOS Linux 7 (Core) 3.10.0-1160.80.1.el7.x86_64 containerd://1.6.10
[root@k8s-master1 kubernetes]#
[root@k8s-master1 kubernetes]#
[root@k8s-master1 kubernetes]# kubectl get componentstatus
Warning: v1 ComponentStatus is deprecated in v1.19+
NAME STATUS MESSAGE ERROR
etcd-0 Healthy {"health":"true","reason":""}
controller-manager Healthy ok
scheduler Healthy ok
[root@k8s-master1 kubernetes]#
Kubelet
启动参数配置如下:
[root@k8s-master1 kubernetes]# ps -ef|grep "/usr/local/bin/kubelet"
root 7972 1 6 07:06 ? 00:00:06 /usr/local/bin/kubelet --bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.kubeconfig --kubeconfig=/etc/kubernetes/kubelet.kubeconfig --config=/etc/kubernetes/kubelet-conf.yml --container-runtime-endpoint=unix:///run/containerd/containerd.sock --node-labels=node.kubernetes.io/node= --v=8
root 9549 6424 0 07:07 pts/0 00:00:00 grep --color=auto /usr/local/bin/kubelet
[root@k8s-master1 kubernetes]#
Kubelet
参数配置如下:
apiVersion: kubelet.config.k8s.io/v1beta1
kind: KubeletConfiguration
address: 0.0.0.0
port: 10250
readOnlyPort: 10255
authentication:
  anonymous:
    enabled: false
  webhook:
    cacheTTL: 2m0s
    enabled: true
  x509:
    clientCAFile: /etc/kubernetes/pki/ca.pem
authorization:
  mode: Webhook
  webhook:
    cacheAuthorizedTTL: 5m0s
    cacheUnauthorizedTTL: 30s
cgroupDriver: systemd
cgroupsPerQOS: true
clusterDNS:
- 10.96.0.10
clusterDomain: cluster.local
containerLogMaxFiles: 5
containerLogMaxSize: 10Mi
contentType: application/vnd.kubernetes.protobuf
cpuCFSQuota: true
cpuManagerPolicy: none
cpuManagerReconcilePeriod: 10s
enableControllerAttachDetach: true
enableDebuggingHandlers: true
enforceNodeAllocatable:
- pods
eventBurst: 10
eventRecordQPS: 5
evictionHard:
  imagefs.available: 15%
  memory.available: 100Mi
  nodefs.available: 10%
  nodefs.inodesFree: 5%
evictionPressureTransitionPeriod: 5m0s
failSwapOn: true
fileCheckFrequency: 20s
hairpinMode: promiscuous-bridge
healthzBindAddress: 127.0.0.1
healthzPort: 10248
httpCheckFrequency: 20s
imageGCHighThresholdPercent: 85
imageGCLowThresholdPercent: 80
imageMinimumGCAge: 2m0s
iptablesDropBit: 15
iptablesMasqueradeBit: 14
kubeAPIBurst: 10
kubeAPIQPS: 5
makeIPTablesUtilChains: true
maxOpenFiles: 1000000
maxPods: 110
nodeStatusUpdateFrequency: 10s
oomScoreAdj: -999
podPidsLimit: -1
registryBurst: 10
registryPullQPS: 5
resolvConf: /etc/resolv.conf
rotateCertificates: true
runtimeRequestTimeout: 2m0s
serializeImagePulls: true
staticPodPath: /etc/kubernetes/manifests
streamingConnectionIdleTimeout: 4h0m0s
syncFrequency: 1m0s
volumeStatsAggPeriod: 1m0s
ContainerGCManager
用于根据指定的GCPolicy
删除已经死亡的容器。GCPolicy
只针对容器起作用,对于Sandbox
并不起作用。只有当Sandbox
处于非Ready
的状态并且没有包含任何容器的时候才会被移除。
containerGC |
先来看看ContainerGC
是如何定义的,如下。
// containerGC bundles the dependencies used by the container, sandbox and
// pod-log garbage collection routines.
type containerGC struct {
	// client is a handle to the CRI runtime service.
	// NOTE(review): the GC methods shown alongside this type reach the
	// runtime through manager rather than through client — confirm usage.
	client internalapi.RuntimeService
	// manager supplies the runtime operations the collector relies on:
	// listing containers and sandboxes, killing and removing containers,
	// and OS-level file access.
	manager *kubeGenericRuntimeManager
	// podStateProvider reports whether a pod's content or runtime
	// resources should be removed.
	podStateProvider podStateProvider
}
可以看到,ContainerGCManager
和KubeGenericRuntimeManager
是相互依赖的。其中依赖的PodStateProvider
是由PodWorker
来实现的。
GCPolicy |
GCPolicy
用于指定GCManager
的垃圾回收策略。
// GCPolicy specifies the garbage collection policy applied to dead containers.
type GCPolicy struct {
	// MinAge is the minimum age a container must have reached before it
	// may be garbage collected.
	MinAge time.Duration
	// MaxPerPodContainer is the maximum number of dead containers that may
	// be kept per pod; anything beyond that is collected.
	MaxPerPodContainer int
	// MaxContainers is the node-wide threshold for collection: containers
	// are removed only when the number of evictable containers on the node
	// exceeds MaxContainers.
	MaxContainers int
}
GarbageCollect |
一起来看看GCManager
唯一对外开放的接口GarbageCollect
都回收了哪些垃圾,具体逻辑如下:
1. 移除可以被擦除的容器
2. 移除不包含任何容器的PodSandbox
3. 移除PodSandbox
的日志目录func (cgc *containerGC) GarbageCollect(gcPolicy kubecontainer.GCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error {
errors := []error{}
// Remove evictable containers
if err := cgc.evictContainers(gcPolicy, allSourcesReady, evictNonDeletedPods); err != nil {
errors = append(errors, err)
}
// Remove sandboxes with zero containers
if err := cgc.evictSandboxes(evictNonDeletedPods); err != nil {
errors = append(errors, err)
}
// Remove pod sandbox log directory
if err := cgc.evictPodLogsDirectories(allSourcesReady); err != nil {
errors = append(errors, err)
}
return utilerrors.NewAggregate(errors)
}
evictContainers |
来一起看看GCManager
是如何移除容器的,具体逻辑如下:
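1. 找出所有状态不为Running,并且年龄大于GCPolicy.MinAge的容器,这些过滤出来的容器就是可以擦除的容器
2. 判断allSourcesReady是否为真,如果为真,就把PodStateProvider判定为应当清理的Pod所对应的可擦除容器全部干掉
   (Source指的是Pod的来源,目前的Kubernetes中一共有三个来源,分别是apiserver, http, file)
3. 如果GCPolicy.MaxPerPodContainer大于等于0,那么对于每个(Pod UID, 容器名)组合,删除已经死亡的Container中数量超出GCPolicy.MaxPerPodContainer的那部分容器(最老的优先)
4. 如果GCPolicy.MaxContainers大于等于0,并且剩余可擦除容器的总数大于GCPolicy.MaxContainers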
,就删除满足条件的容器
// evictContainers removes dead containers according to gcPolicy.
//
// It first asks evictableContainers for the candidates grouped by evict unit.
// When allSourcesReady is true, every unit whose pod the podStateProvider
// reports as removable is wiped entirely. After that the per-unit limit
// (MaxPerPodContainer) and the overall limit (MaxContainers) are enforced;
// a negative limit disables the corresponding step.
//
// Only an error from evictableContainers is returned.
func (cgc *containerGC) evictContainers(gcPolicy kubecontainer.GCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error {
	units, err := cgc.evictableContainers(gcPolicy.MinAge)
	if err != nil {
		return err
	}

	// Pods may only be treated as gone once every pod source has reported.
	if allSourcesReady {
		for unitKey, dead := range units {
			removable := cgc.podStateProvider.ShouldPodContentBeRemoved(unitKey.uid) ||
				(evictNonDeletedPods && cgc.podStateProvider.ShouldPodRuntimeBeRemoved(unitKey.uid))
			if !removable {
				continue
			}
			// Drop every container of this unit and forget the unit.
			cgc.removeOldestN(dead, len(dead))
			delete(units, unitKey)
		}
	}

	// Per-unit cap.
	if gcPolicy.MaxPerPodContainer >= 0 {
		cgc.enforceMaxContainersPerEvictUnit(units, gcPolicy.MaxPerPodContainer)
	}

	// Overall cap: nothing more to do when it is disabled or already met.
	maxTotal := gcPolicy.MaxContainers
	if maxTotal < 0 || units.NumContainers() <= maxTotal {
		return nil
	}

	// Spread the budget evenly across units, leaving at least one each.
	perUnit := maxTotal / units.NumEvictUnits()
	if perUnit < 1 {
		perUnit = 1
	}
	cgc.enforceMaxContainersPerEvictUnit(units, perUnit)

	remaining := units.NumContainers()
	if remaining <= maxTotal {
		return nil
	}

	// Still over budget: pool all survivors and evict the oldest ones.
	pooled := make([]containerGCInfo, 0, remaining)
	for _, dead := range units {
		pooled = append(pooled, dead...)
	}
	sort.Sort(byCreated(pooled))
	cgc.removeOldestN(pooled, remaining-maxTotal)
	return nil
}
evictableContainers |
顾名思义,evictableContainers
方法就是用来找出那些可以被移除的容器(它本身并不执行移除),那么可以被移除的容器具体参照什么标准呢?那就是容器没有处于Running
状态,并且容器的创建时间到现在已经超过了GCPolicy.MinAge
,我们来看看具体逻辑:
1. 通过CRI规范的容器运行时接口ListContainers获取所有容器
2. 过滤出状态不为Running,并且年龄大于GCPolicy.MinAge
的容器func (cgc *containerGC) evictableContainers(minAge time.Duration) (containersByEvictUnit, error) {
containers, err := cgc.manager.getKubeletContainers(true)
if err != nil {
return containersByEvictUnit{}, err
}
evictUnits := make(containersByEvictUnit)
newestGCTime := time.Now().Add(-minAge)
for _, container := range containers {
// Prune out running containers.
if container.State == runtimeapi.ContainerState_CONTAINER_RUNNING {
continue
}
createdAt := time.Unix(0, container.CreatedAt)
if newestGCTime.Before(createdAt) {
continue
}
labeledInfo := getContainerInfoFromLabels(container.Labels)
containerInfo := containerGCInfo{
id: container.Id,
name: container.Metadata.Name,
createTime: createdAt,
unknown: container.State == runtimeapi.ContainerState_CONTAINER_UNKNOWN,
}
key := evictUnit{
uid: labeledInfo.PodUID,
name: containerInfo.name,
}
evictUnits[key] = append(evictUnits[key], containerInfo)
}
return evictUnits, nil
}
getKubeletContainers |
直接调用底层的运行时获取所有的Container
// getKubeletContainers asks the runtime service for its containers. With
// allContainers set, no state filter is applied; otherwise only containers in
// the running state are requested. A listing failure is logged and returned.
func (m *kubeGenericRuntimeManager) getKubeletContainers(allContainers bool) ([]*runtimeapi.Container, error) {
	// A nil state filter means "any state".
	var stateFilter *runtimeapi.ContainerStateValue
	if !allContainers {
		stateFilter = &runtimeapi.ContainerStateValue{
			State: runtimeapi.ContainerState_CONTAINER_RUNNING,
		}
	}

	listed, err := m.runtimeService.ListContainers(&runtimeapi.ContainerFilter{State: stateFilter})
	if err != nil {
		klog.ErrorS(err, "ListContainers failed")
		return nil, err
	}
	return listed, nil
}
// enforceMaxContainersPerEvictUnit trims every evict unit down to at most
// MaxContainers entries by removing its oldest containers, and stores the
// trimmed slice back into evictUnits.
func (cgc *containerGC) enforceMaxContainersPerEvictUnit(evictUnits containersByEvictUnit, MaxContainers int) {
	for unitKey, dead := range evictUnits {
		excess := len(dead) - MaxContainers
		if excess <= 0 {
			// Already within the limit.
			continue
		}
		evictUnits[unitKey] = cgc.removeOldestN(dead, excess)
	}
}
removeOldestN |
removeOldestN
用于移除给定容器列表中最老的toRemove个容器,如果容器的状态为Unknown
,那么尝试先杀掉这个容器,然后再移除这个容器;如果杀掉容器失败,则本轮不再移除这个容器。
// removeOldestN removes the toRemove oldest entries of containers from the
// runtime and returns the slice of entries that were kept.
//
// Failures are logged, not returned: the returned slice always excludes the
// entries selected for removal, even if removing some of them failed.
func (cgc *containerGC) removeOldestN(containers []containerGCInfo, toRemove int) []containerGCInfo {
	keep := len(containers) - toRemove
	// Sorting is only needed when some entries survive; otherwise every
	// entry is removed regardless of order.
	if keep > 0 {
		sort.Sort(byCreated(containers))
	}

	// Walk from the tail (oldest) towards the head (newest).
	for idx := len(containers) - 1; idx >= keep; idx-- {
		victim := containers[idx]

		if victim.unknown {
			// A container in unknown state could still be running, so
			// try to stop it before removal.
			runtimeID := kubecontainer.ContainerID{
				Type: cgc.manager.runtimeName,
				ID:   victim.id,
			}
			message := "Container is in unknown state, try killing it before removal"
			if err := cgc.manager.killContainer(nil, runtimeID, victim.name, message, reasonUnknown, nil); err != nil {
				klog.ErrorS(err, "Failed to stop container", "containerID", victim.id)
				// Do not attempt removal of a container we could not stop.
				continue
			}
		}

		if err := cgc.manager.removeContainer(victim.id); err != nil {
			klog.ErrorS(err, "Failed to remove container", "containerID", victim.id)
		}
	}

	// Report the selected entries as removed even on failure, so callers do
	// not become more aggressive than the policy asks for.
	return containers[:keep]
}
evictSandboxes |
来看看GCManager
是如何移除Sandbox
的,可以被移除的Sandbox
需要满足以下几个条件:一、PodSandbox
处于非Ready
的状态。二、不包含任何容器。三、所属的Pod
应当被清理,或者它不是该Pod最新的那个Sandbox。具体逻辑如下:
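1. 获取所有的容器以及所有的PodSandbox
2. 收集所有容器所属Sandbox的ID
3. 把处于Ready状态或者仍然包含容器的Sandbox标记为active,并按照Pod UID分组
4. 对于应当被清理的Pod,移除其所有非active的Sandbox;否则保留最新的一个,移除其余非active的Sandbox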
// evictSandboxes removes pod sandboxes that are no longer needed.
//
// A sandbox counts as active while it is in the ready state or while any
// container still references it; active sandboxes are never removed. For a
// pod that the podStateProvider reports as removable, all of its sandboxes
// are offered for removal; for any other pod the latest one is kept.
//
// Only errors from listing containers or sandboxes are returned.
func (cgc *containerGC) evictSandboxes(evictNonDeletedPods bool) error {
	containers, err := cgc.manager.getKubeletContainers(true)
	if err != nil {
		return err
	}
	sandboxes, err := cgc.manager.getKubeletSandboxes(true)
	if err != nil {
		return err
	}

	// IDs of every sandbox that still owns at least one container.
	inUse := sets.NewString()
	for _, c := range containers {
		inUse.Insert(c.PodSandboxId)
	}

	// Group sandboxes by the UID of the pod they belong to.
	byPod := make(sandboxesByPodUID)
	for _, sb := range sandboxes {
		uid := types.UID(sb.Metadata.Uid)
		byPod[uid] = append(byPod[uid], sandboxGCInfo{
			id:         sb.Id,
			createTime: time.Unix(0, sb.CreatedAt),
			active:     sb.State == runtimeapi.PodSandboxState_SANDBOX_READY || inUse.Has(sb.Id),
		})
	}

	for uid, group := range byPod {
		// Keep the latest sandbox while the pod still exists.
		keep := 1
		if cgc.podStateProvider.ShouldPodContentBeRemoved(uid) || (evictNonDeletedPods && cgc.podStateProvider.ShouldPodRuntimeBeRemoved(uid)) {
			// The pod is gone: offer every sandbox for removal, including
			// the latest dead one.
			keep = 0
		}
		cgc.removeOldestNSandboxes(group, len(group)-keep)
	}
	return nil
}
// removeOldestNSandboxes walks the toRemove oldest sandboxes and removes each
// one that is not marked active. Active sandboxes in that range are skipped.
func (cgc *containerGC) removeOldestNSandboxes(sandboxes []sandboxGCInfo, toRemove int) {
	keep := len(sandboxes) - toRemove
	// Ordering only matters when part of the slice is kept.
	if keep > 0 {
		sort.Sort(sandboxByCreated(sandboxes))
	}

	// Walk from the tail (oldest) towards the head (newest).
	for idx := len(sandboxes) - 1; idx >= keep; idx-- {
		if sandboxes[idx].active {
			continue
		}
		cgc.removeSandbox(sandboxes[idx].id)
	}
}
evictPodLogsDirectories |
evictPodLogsDirectories
用于删除所有被移除的容器的日志目录
// evictPodLogsDirectories cleans up log artifacts that no longer belong to a
// live pod or container.
//
// When allSourcesReady is true it scans podLogsRootDirectory and deletes the
// directory of every pod whose content the podStateProvider says should be
// removed. Independently of allSourcesReady, it then scans the legacy
// container log directory and deletes dangling symlinks, except when the
// runtime reports a status for the container that is not "exited".
//
// The only error returned is a failure to read podLogsRootDirectory (wrapped
// so callers can inspect the cause); every other failure is logged and the
// sweep continues.
func (cgc *containerGC) evictPodLogsDirectories(allSourcesReady bool) error {
	osInterface := cgc.manager.osInterface

	if allSourcesReady {
		// Only remove pod logs directories when all sources are ready.
		dirs, err := osInterface.ReadDir(podLogsRootDirectory)
		if err != nil {
			return fmt.Errorf("failed to read podLogsRootDirectory %q: %w", podLogsRootDirectory, err)
		}
		for _, dir := range dirs {
			name := dir.Name()
			podUID := parsePodUIDFromLogsDirectory(name)
			if !cgc.podStateProvider.ShouldPodContentBeRemoved(podUID) {
				continue
			}
			klog.V(4).InfoS("Removing pod logs", "podUID", podUID)
			if err := osInterface.RemoveAll(filepath.Join(podLogsRootDirectory, name)); err != nil {
				klog.ErrorS(err, "Failed to remove pod logs directory", "path", name)
			}
		}
	}

	// Sweep dead legacy container log symlinks. A glob failure is logged
	// rather than returned so this step stays best-effort.
	logSymlinks, err := osInterface.Glob(filepath.Join(legacyContainerLogsDir, fmt.Sprintf("*.%s", legacyLogSuffix)))
	if err != nil {
		klog.ErrorS(err, "Failed to list legacy container log symlinks", "path", legacyContainerLogsDir)
	}
	for _, logSymlink := range logSymlinks {
		// Only dangling symlinks (target no longer exists) are candidates.
		if _, statErr := osInterface.Stat(logSymlink); !os.IsNotExist(statErr) {
			continue
		}

		containerID, idErr := getContainerIDFromLegacyLogSymlink(logSymlink)
		if idErr != nil {
			// Without a container ID the runtime cannot be consulted; the
			// dangling symlink is removed anyway.
			klog.V(4).InfoS("Unable to obtain container ID", "err", idErr)
		} else if resp, statusErr := cgc.manager.runtimeService.ContainerStatus(containerID, false); statusErr != nil {
			// The status lookup failed; fall through and remove the symlink.
			klog.InfoS("Error getting ContainerStatus for containerID", "containerID", containerID, "err", statusErr)
		} else {
			status := resp.GetStatus()
			if status == nil {
				klog.V(4).InfoS("Container status is nil")
				continue
			}
			if status.State != runtimeapi.ContainerState_CONTAINER_EXITED {
				// Keep the symlink for any container that has not exited.
				klog.V(5).InfoS("Container is still running, not removing symlink", "containerID", containerID, "path", logSymlink)
				continue
			}
		}

		if rmErr := osInterface.Remove(logSymlink); rmErr != nil {
			klog.ErrorS(rmErr, "Failed to remove container log dead symlink", "path", logSymlink)
		} else {
			klog.V(4).InfoS("Removed symlink", "path", logSymlink)
		}
	}
	return nil
}
StartGarbageCollection |
在kubelet
启动过程中,会通过调用StartGarbageCollection
方法来开启一个协程以每分钟一次的频率回收可以被擦除的Container
以及Sandbox
// StartGarbageCollection launches the background garbage collection loops.
//
// One goroutine runs container GC every ContainerGCPeriod. A second goroutine
// runs image GC every ImageGCPeriod, unless ImageGCHighThresholdPercent is
// 100, in which case image GC is not started. Both loops use wait.NeverStop.
//
// Failures are logged and recorded as warning events on the node. The error
// text is passed to Eventf as an argument of a "%s" format, never as the
// format string itself, so a '%' inside the error message cannot be
// misinterpreted as a formatting verb.
func (kl *Kubelet) StartGarbageCollection() {
	// Tracks whether the previous container GC run failed, so that the first
	// success after a failure is logged at a more visible level.
	loggedContainerGCFailure := false
	go wait.Until(func() {
		if err := kl.containerGC.GarbageCollect(); err != nil {
			klog.ErrorS(err, "Container garbage collection failed")
			kl.recorder.Eventf(kl.nodeRef, v1.EventTypeWarning, events.ContainerGCFailed, "%s", err.Error())
			loggedContainerGCFailure = true
		} else {
			var vLevel klog.Level = 4
			if loggedContainerGCFailure {
				vLevel = 1
				loggedContainerGCFailure = false
			}

			klog.V(vLevel).InfoS("Container garbage collection succeeded")
		}
	}, ContainerGCPeriod, wait.NeverStop)

	// when the high threshold is set to 100, stub the image GC manager
	if kl.kubeletConfiguration.ImageGCHighThresholdPercent == 100 {
		klog.V(2).InfoS("ImageGCHighThresholdPercent is set 100, Disable image GC")
		return
	}

	// Tracks whether the previous image GC run failed; an event is only
	// emitted for repeated failures.
	prevImageGCFailed := false
	go wait.Until(func() {
		if err := kl.imageManager.GarbageCollect(); err != nil {
			if prevImageGCFailed {
				klog.ErrorS(err, "Image garbage collection failed multiple times in a row")
				// Only create an event for repeated failures
				kl.recorder.Eventf(kl.nodeRef, v1.EventTypeWarning, events.ImageGCFailed, "%s", err.Error())
			} else {
				klog.ErrorS(err, "Image garbage collection failed once. Stats initialization may not have completed yet")
			}
			prevImageGCFailed = true
		} else {
			var vLevel klog.Level = 4
			if prevImageGCFailed {
				vLevel = 1
				prevImageGCFailed = false
			}

			klog.V(vLevel).InfoS("Image garbage collection succeeded")
		}
	}, ImageGCPeriod, wait.NeverStop)
}