Kubernetes source version: remotes/origin/release-1.25
Kubelet version built from source: Kubernetes v1.24.0-beta.0.2463+ee7799bab469d7
Cluster test environment: a single-node cluster set up from Kubernetes v1.25.4 binaries
How to set up the single-node cluster is covered in: Kubernetes v1.25 搭建单节点集群用于Debug K8S源码 (building a single-node cluster for debugging the K8s source)
Golang version: go1.19.3 linux/amd64
IDEA version: 2022.2.3
Delve version: 1.9.1
[root@k8s-master1 kubernetes]#
[root@k8s-master1 kubernetes]# dlv version
Delve Debugger
Version: 1.9.1
Build: $Id: d81b9fd12bfa603f3cf7a4bc842398bd61c42940 $
[root@k8s-master1 kubernetes]#
[root@k8s-master1 kubernetes]# go version
go version go1.19.3 linux/amd64
[root@k8s-master1 kubernetes]#
[root@k8s-master1 kubernetes]# kubectl version
WARNING: This version information is deprecated and will be replaced with the output from kubectl version --short. Use --output=yaml|json to get the full version.
Client Version: version.Info{Major:"1", Minor:"25", GitVersion:"v1.25.4", GitCommit:"872a965c6c6526caa949f0c6ac028ef7aff3fb78", GitTreeState:"clean", BuildDate:"2022-11-09T13:36:36Z", GoVersion:"go1.19.3", Compiler:"gc", Platform:"linux/amd64"}
Kustomize Version: v4.5.7
Server Version: version.Info{Major:"1", Minor:"25", GitVersion:"v1.25.4", GitCommit:"872a965c6c6526caa949f0c6ac028ef7aff3fb78", GitTreeState:"clean", BuildDate:"2022-11-09T13:29:58Z", GoVersion:"go1.19.3", Compiler:"gc", Platform:"linux/amd64"}
[root@k8s-master1 kubernetes]#
[root@k8s-master1 kubernetes]#
[root@k8s-master1 kubernetes]# kubectl get nodes -owide
NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME
k8s-master1 Ready <none> 31h v1.25.4 192.168.11.71 <none> CentOS Linux 7 (Core) 3.10.0-1160.80.1.el7.x86_64 containerd://1.6.10
[root@k8s-master1 kubernetes]#
[root@k8s-master1 kubernetes]#
[root@k8s-master1 kubernetes]# kubectl get componentstatus
Warning: v1 ComponentStatus is deprecated in v1.19+
NAME STATUS MESSAGE ERROR
etcd-0 Healthy {"health":"true","reason":""}
controller-manager Healthy ok
scheduler Healthy ok
[root@k8s-master1 kubernetes]#
The Kubelet startup parameters are as follows:
[root@k8s-master1 kubernetes]# ps -ef|grep "/usr/local/bin/kubelet"
root 7972 1 6 07:06 ? 00:00:06 /usr/local/bin/kubelet --bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.kubeconfig --kubeconfig=/etc/kubernetes/kubelet.kubeconfig --config=/etc/kubernetes/kubelet-conf.yml --container-runtime-endpoint=unix:///run/containerd/containerd.sock --node-labels=node.kubernetes.io/node= --v=8
root 9549 6424 0 07:07 pts/0 00:00:00 grep --color=auto /usr/local/bin/kubelet
[root@k8s-master1 kubernetes]#
The Kubelet configuration file is as follows:
apiVersion: kubelet.config.k8s.io/v1beta1
kind: KubeletConfiguration
address: 0.0.0.0
port: 10250
readOnlyPort: 10255
authentication:
  anonymous:
    enabled: false
  webhook:
    cacheTTL: 2m0s
    enabled: true
  x509:
    clientCAFile: /etc/kubernetes/pki/ca.pem
authorization:
  mode: Webhook
  webhook:
    cacheAuthorizedTTL: 5m0s
    cacheUnauthorizedTTL: 30s
cgroupDriver: systemd
cgroupsPerQOS: true
clusterDNS:
- 10.96.0.10
clusterDomain: cluster.local
containerLogMaxFiles: 5
containerLogMaxSize: 10Mi
contentType: application/vnd.kubernetes.protobuf
cpuCFSQuota: true
cpuManagerPolicy: none
cpuManagerReconcilePeriod: 10s
enableControllerAttachDetach: true
enableDebuggingHandlers: true
enforceNodeAllocatable:
- pods
eventBurst: 10
eventRecordQPS: 5
evictionHard:
  imagefs.available: 15%
  memory.available: 100Mi
  nodefs.available: 10%
  nodefs.inodesFree: 5%
evictionPressureTransitionPeriod: 5m0s
failSwapOn: true
fileCheckFrequency: 20s
hairpinMode: promiscuous-bridge
healthzBindAddress: 127.0.0.1
healthzPort: 10248
httpCheckFrequency: 20s
imageGCHighThresholdPercent: 85
imageGCLowThresholdPercent: 80
imageMinimumGCAge: 2m0s
iptablesDropBit: 15
iptablesMasqueradeBit: 14
kubeAPIBurst: 10
kubeAPIQPS: 5
makeIPTablesUtilChains: true
maxOpenFiles: 1000000
maxPods: 110
nodeStatusUpdateFrequency: 10s
oomScoreAdj: -999
podPidsLimit: -1
registryBurst: 10
registryPullQPS: 5
resolvConf: /etc/resolv.conf
rotateCertificates: true
runtimeRequestTimeout: 2m0s
serializeImagePulls: true
staticPodPath: /etc/kubernetes/manifests
streamingConnectionIdleTimeout: 4h0m0s
syncFrequency: 1m0s
volumeStatsAggPeriod: 1m0s
Judging by the name, the ImageGCManager is responsible for reclaiming images. Before reading the code, two questions came to mind: how does the ImageGCManager reclaim images, and by what criteria is an image judged reclaimable? Having read the source, I will now try to answer both.

1. How does the ImageGCManager reclaim images?

The ImageGCManager calls the runtime's ListSandbox and ListContainer interfaces to assemble all Pods and collect the images used by every container of every Pod, then compares that set with its own cache. An image that appears only in the cache is not in use, so it can be removed to free disk space. That leads straight into the second question.

2. By what criteria is an image judged to need reclaiming?

The ImageGCManager holds a cache that records the images it has detected on the node. When images have to be cleaned up to free disk space, the Pods that can currently be queried from the runtime are taken as the source of truth: the images used by those Pods are still needed, and the remaining images in the cache are treated as garbage that may be deleted.

Some thoughts after reading the code: the design of the ImageGCManager is quite interesting. When reclaiming images it does not simply delete every unused image; it sorts the unused images by when they were last used, deletes the oldest first, and stops as soon as the total size of the deleted images reaches the target amount of space to free. The images deleted last are the ones most likely to be needed again, so the idea is somewhat reminiscent of LRU.
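To make the LRU-like ordering concrete, here is a toy sketch (this is not kubelet code; the image names, sizes, and target are invented for illustration): unused images are sorted oldest first by last use and deleted until the freed bytes reach the target.

package main

import (
    "fmt"
    "sort"
    "time"
)

// toyImage stands in for an unused image record: an ID, a size, and the last time it was used.
type toyImage struct {
    id       string
    size     int64
    lastUsed time.Time
}

func main() {
    now := time.Now()
    unused := []toyImage{
        {"nginx:1.19", 50, now.Add(-72 * time.Hour)},
        {"redis:6", 30, now.Add(-1 * time.Hour)},
        {"busybox:1.35", 5, now.Add(-240 * time.Hour)},
    }
    target := int64(55)

    // Oldest lastUsed first, mirroring the ordering the manager uses.
    sort.Slice(unused, func(i, j int) bool { return unused[i].lastUsed.Before(unused[j].lastUsed) })

    var freed int64
    for _, img := range unused {
        fmt.Println("removing", img.id)
        freed += img.size
        if freed >= target {
            break // stop once enough space has been reclaimed
        }
    }
    fmt.Println("freed", freed, "of", target)
}

Running it removes busybox and then nginx and stops, leaving the recently used redis image untouched.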
ImageGCManager |
Let's first look at how the ImageGCManager interface is defined:
type ImageGCManager interface {
GarbageCollect() error
Start()
GetImageList() ([]container.Image, error)
DeleteUnusedImages() error
}
ImageGCPolicy |
Similar to ContainerGCPolicy, ImageGCPolicy is the policy that tells the ImageGCManager which images may be reclaimed:
type ImageGCPolicy struct {
    // Start garbage collecting images once disk usage reaches this percentage.
    HighThresholdPercent int
    // Stop garbage collecting once disk usage drops below this percentage.
    LowThresholdPercent int
    // Minimum age an image must reach before it can be garbage collected.
    MinAge time.Duration
}
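The thresholds come from the KubeletConfiguration fields imageGCHighThresholdPercent, imageGCLowThresholdPercent, and imageMinimumGCAge (not set in the config above, so the defaults of 85, 80, and 2m0s apply). In NewMainKubelet the policy is assembled roughly like this:

imageGCPolicy := images.ImageGCPolicy{
    MinAge:               kubeCfg.ImageMinimumGCAge.Duration,
    HighThresholdPercent: int(kubeCfg.ImageGCHighThresholdPercent),
    LowThresholdPercent:  int(kubeCfg.ImageGCLowThresholdPercent),
}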
realImageGCManager |
type realImageGCManager struct {
    // Container runtime; in practice the GenericRuntimeManager is injected here as a dependency.
    runtime container.Runtime
    // Cache of images detected on the node, keyed by image ID.
    imageRecords     map[string]*imageRecord
    imageRecordsLock sync.Mutex
    // Policy governing image garbage collection.
    policy ImageGCPolicy
    // Provides stats for the filesystem that holds images.
    statsProvider StatsProvider
    // Event recorder.
    recorder record.EventRecorder
    nodeRef *v1.ObjectReference
    initialized bool
    // Snapshot of the image list, refreshed periodically by Start.
    imageCache imageCache
    // The sandbox (pause) image, which is always considered in use.
    sandboxImage string
}
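Each entry in imageRecords describes one image. In the same source file (pkg/kubelet/images/image_gc_manager.go) the imageRecord struct looks roughly like this:

type imageRecord struct {
    // Time when this image was first detected.
    firstDetected time.Time
    // Time when we last saw this image being used.
    lastUsed time.Time
    // Size of the image in bytes.
    size int64
    // Whether the image is pinned by the runtime and must not be garbage collected.
    pinned bool
}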
GarbageCollect |
Let's look at how GarbageCollect does its garbage collection. The logic is as follows:

1. Query the stats of the filesystem that holds images to get its capacity and available bytes.
2. If the capacity of that filesystem on the node the Kubelet runs on is zero, emit an InvalidDiskCapacity event and return immediately.
3. If disk usage exceeds ImageGCPolicy.HighThresholdPercent, disk space has to be freed: compute how many bytes must be reclaimed to bring usage back down to LowThresholdPercent and call freeSpace.
4. The logic of the freeSpace method is not hard: it queries the underlying runtime to find the images that are not in use and frees space by deleting them, stopping as soon as the target amount of space has been freed.

func (im *realImageGCManager) GarbageCollect() error {
// Get disk usage on disk holding images.
fsStats, err := im.statsProvider.ImageFsStats()
if err != nil {
return err
}
var capacity, available int64
if fsStats.CapacityBytes != nil {
capacity = int64(*fsStats.CapacityBytes)
}
if fsStats.AvailableBytes != nil {
available = int64(*fsStats.AvailableBytes)
}
if available > capacity {
klog.InfoS("Availability is larger than capacity", "available", available, "capacity", capacity)
available = capacity
}
// Check valid capacity.
if capacity == 0 {
err := goerrors.New("invalid capacity 0 on image filesystem")
im.recorder.Eventf(im.nodeRef, v1.EventTypeWarning, events.InvalidDiskCapacity, err.Error())
return err
}
// If over the max threshold, free enough to place us at the lower threshold.
usagePercent := 100 - int(available*100/capacity)
if usagePercent >= im.policy.HighThresholdPercent {
amountToFree := capacity*int64(100-im.policy.LowThresholdPercent)/100 - available
klog.InfoS("Disk usage on image filesystem is over the high threshold, trying to free bytes down to the low threshold", "usage", usagePercent, "highThreshold", im.policy.HighThresholdPercent, "amountToFree", amountToFree, "lowThreshold", im.policy.LowThresholdPercent)
freed, err := im.freeSpace(amountToFree, time.Now())
if err != nil {
return err
}
if freed < amountToFree {
err := fmt.Errorf("failed to garbage collect required amount of images. Wanted to free %d bytes, but freed %d bytes", amountToFree, freed)
im.recorder.Eventf(im.nodeRef, v1.EventTypeWarning, events.FreeDiskSpaceFailed, err.Error())
return err
}
}
return nil
}
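To make the threshold arithmetic concrete, here is a tiny standalone example with made-up numbers: a 100 GiB image filesystem with 10 GiB available and the default 85/80 thresholds.

package main

import "fmt"

func main() {
    const gib = int64(1 << 30)
    capacity, available := 100*gib, 10*gib
    high, low := 85, 80

    // Same formula as GarbageCollect: usage = 100 - available*100/capacity = 90%.
    usagePercent := 100 - int(available*100/capacity)
    if usagePercent >= high {
        // Free enough to drop back to the low threshold:
        // 100 GiB * (100-80)/100 - 10 GiB = 10 GiB.
        amountToFree := capacity*int64(100-low)/100 - available
        fmt.Printf("usage=%d%%, need to free %d GiB\n", usagePercent, amountToFree/gib)
    }
}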
freeSpace |
Let's look at how freeSpace frees disk space:

1. It first calls detectImages, which uses the runtime's ListSandbox and ListContainer interfaces to work out which images are currently in use.
2. It then walks the image cache: images that are in use are skipped, and if an image is Pinned it is also skipped and never garbage collected; every other image is recorded as an eviction candidate.
3. The candidates are sorted oldest first and deleted one by one, skipping images that were used too recently or are younger than the policy's MinAge, until enough space has been freed.

func (im *realImageGCManager) freeSpace(bytesToFree int64, freeTime time.Time) (int64, error) {
imagesInUse, err := im.detectImages(freeTime)
if err != nil {
return 0, err
}
im.imageRecordsLock.Lock()
defer im.imageRecordsLock.Unlock()
// Get all images in eviction order.
images := make([]evictionInfo, 0, len(im.imageRecords))
for image, record := range im.imageRecords {
if isImageUsed(image, imagesInUse) {
klog.V(5).InfoS("Image ID is being used", "imageID", image)
continue
}
// Check if image is pinned, prevent garbage collection
if record.pinned {
klog.V(5).InfoS("Image is pinned, skipping garbage collection", "imageID", image)
continue
}
images = append(images, evictionInfo{
id: image,
imageRecord: *record,
})
}
sort.Sort(byLastUsedAndDetected(images))
// Delete unused images until we've freed up enough space.
var deletionErrors []error
spaceFreed := int64(0)
for _, image := range images {
klog.V(5).InfoS("Evaluating image ID for possible garbage collection", "imageID", image.id)
// Images that are currently in use were given a newer lastUsed.
if image.lastUsed.Equal(freeTime) || image.lastUsed.After(freeTime) {
klog.V(5).InfoS("Image ID was used too recently, not eligible for garbage collection", "imageID", image.id, "lastUsed", image.lastUsed, "freeTime", freeTime)
continue
}
// Avoid garbage collect the image if the image is not old enough.
// In such a case, the image may have just been pulled down, and will be used by a container right away.
if freeTime.Sub(image.firstDetected) < im.policy.MinAge {
klog.V(5).InfoS("Image ID's age is less than the policy's minAge, not eligible for garbage collection", "imageID", image.id, "age", freeTime.Sub(image.firstDetected), "minAge", im.policy.MinAge)
continue
}
// Remove image. Continue despite errors.
klog.InfoS("Removing image to free bytes", "imageID", image.id, "size", image.size)
err := im.runtime.RemoveImage(container.ImageSpec{Image: image.id})
if err != nil {
deletionErrors = append(deletionErrors, err)
continue
}
delete(im.imageRecords, image.id)
spaceFreed += image.size
if spaceFreed >= bytesToFree {
break
}
}
if len(deletionErrors) > 0 {
return spaceFreed, fmt.Errorf("wanted to free %d bytes, but freed %d bytes space with errors in image deletion: %v", bytesToFree, spaceFreed, errors.NewAggregate(deletionErrors))
}
return spaceFreed, nil
}
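The eviction order used by sort.Sort above comes from the evictionInfo slice and the byLastUsedAndDetected comparator, which sorts primarily by lastUsed and breaks ties with firstDetected. In the same file it looks roughly like this:

type evictionInfo struct {
    id string
    imageRecord
}

type byLastUsedAndDetected []evictionInfo

func (ev byLastUsedAndDetected) Len() int      { return len(ev) }
func (ev byLastUsedAndDetected) Swap(i, j int) { ev[i], ev[j] = ev[j], ev[i] }
func (ev byLastUsedAndDetected) Less(i, j int) bool {
    // Sort by last used, break ties by detected time.
    if ev[i].lastUsed.Equal(ev[j].lastUsed) {
        return ev[i].firstDetected.Before(ev[j].firstDetected)
    }
    return ev[i].lastUsed.Before(ev[j].lastUsed)
}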
detectImages |
The logic of detectImages is as follows:

1. Query the Sandbox image through the ImageStatus interface of the CRI-compliant container runtime; the sandbox image is always considered in use.
2. Query all images on the node through the ListImages interface of the CRI-compliant container runtime.
3. Query all Pods through the ListSandboxes and ListContainers interfaces of the CRI-compliant container runtime, and collect the images used by every Container of every Pod; those images are in use as well.
4. Record every newly seen image in the cache, refresh the lastUsed, size, and pinned fields, and drop records for images that no longer exist on the node.

func (im *realImageGCManager) detectImages(detectTime time.Time) (sets.String, error) {
imagesInUse := sets.NewString()
// Always consider the container runtime pod sandbox image in use
imageRef, err := im.runtime.GetImageRef(container.ImageSpec{Image: im.sandboxImage})
if err == nil && imageRef != "" {
imagesInUse.Insert(imageRef)
}
images, err := im.runtime.ListImages()
if err != nil {
return imagesInUse, err
}
pods, err := im.runtime.GetPods(true)
if err != nil {
return imagesInUse, err
}
// Make a set of images in use by containers.
for _, pod := range pods {
for _, container := range pod.Containers {
klog.V(5).InfoS("Container uses image", "pod", klog.KRef(pod.Namespace, pod.Name), "containerName", container.Name, "containerImage", container.Image, "imageID", container.ImageID)
imagesInUse.Insert(container.ImageID)
}
}
// Add new images and record those being used.
now := time.Now()
currentImages := sets.NewString()
im.imageRecordsLock.Lock()
defer im.imageRecordsLock.Unlock()
for _, image := range images {
klog.V(5).InfoS("Adding image ID to currentImages", "imageID", image.ID)
currentImages.Insert(image.ID)
// New image, set it as detected now.
if _, ok := im.imageRecords[image.ID]; !ok {
klog.V(5).InfoS("Image ID is new", "imageID", image.ID)
im.imageRecords[image.ID] = &imageRecord{
firstDetected: detectTime,
}
}
// Set last used time to now if the image is being used.
if isImageUsed(image.ID, imagesInUse) {
klog.V(5).InfoS("Setting Image ID lastUsed", "imageID", image.ID, "lastUsed", now)
im.imageRecords[image.ID].lastUsed = now
}
klog.V(5).InfoS("Image ID has size", "imageID", image.ID, "size", image.Size)
im.imageRecords[image.ID].size = image.Size
klog.V(5).InfoS("Image ID is pinned", "imageID", image.ID, "pinned", image.Pinned)
im.imageRecords[image.ID].pinned = image.Pinned
}
// Remove old images from our records.
for image := range im.imageRecords {
if !currentImages.Has(image) {
klog.V(5).InfoS("Image ID is no longer present; removing from imageRecords", "imageID", image)
delete(im.imageRecords, image)
}
}
return imagesInUse, nil
}
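For completeness, the isImageUsed helper referenced above is just a set-membership check; it looks roughly like this:

func isImageUsed(imageID string, imagesInUse sets.String) bool {
    // Check the image ID against the set of in-use image IDs.
    if _, ok := imagesInUse[imageID]; ok {
        return true
    }
    return false
}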
GetImageList |
There is not much to GetImageList: it simply returns all images from the cache, which is refreshed periodically by one of the goroutines started in Start (shown below).
func (im *realImageGCManager) GetImageList() ([]container.Image, error) {
return im.imageCache.get(), nil
}
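The imageCache used here is, in simplified form, just a mutex-protected snapshot of the image list (the upstream version also sorts the list when it is set):

type imageCache struct {
    sync.Mutex
    images []container.Image
}

// set replaces the cached snapshot of images.
func (i *imageCache) set(images []container.Image) {
    i.Lock()
    defer i.Unlock()
    i.images = images
}

// get returns the cached snapshot of images.
func (i *imageCache) get() []container.Image {
    i.Lock()
    defer i.Unlock()
    return i.images
}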
DeleteUnusedImages |
DeleteUnusedImages directly deletes every unused image by calling freeSpace with math.MaxInt64 bytes to free; freeSpace was analyzed above.
func (im *realImageGCManager) DeleteUnusedImages() error {
klog.InfoS("Attempting to delete unused images")
_, err := im.freeSpace(math.MaxInt64, time.Now())
return err
}
Start |
The logic is not complicated: Start launches two goroutines, one that re-runs detectImages every 5 minutes and one that refreshes the image cache from the runtime every 30 seconds.
func (im *realImageGCManager) Start() {
go wait.Until(func() {
// Initial detection make detected time "unknown" in the past.
var ts time.Time
if im.initialized {
ts = time.Now()
}
_, err := im.detectImages(ts)
if err != nil {
klog.InfoS("Failed to monitor images", "err", err)
} else {
im.initialized = true
}
}, 5*time.Minute, wait.NeverStop)
// Start a goroutine periodically updates image cache.
go wait.Until(func() {
images, err := im.runtime.ListImages()
if err != nil {
klog.InfoS("Failed to update image list", "err", err)
} else {
im.imageCache.set(images)
}
}, 30*time.Second, wait.NeverStop)
}
StartGarbageCollection |
During startup the Kubelet launches goroutines that continuously garbage collect unused containers and images: one runs container GC every ContainerGCPeriod, and another runs image GC every ImageGCPeriod. If ImageGCHighThresholdPercent is set to 100, image GC is disabled.
func (kl *Kubelet) StartGarbageCollection() {
loggedContainerGCFailure := false
go wait.Until(func() {
if err := kl.containerGC.GarbageCollect(); err != nil {
klog.ErrorS(err, "Container garbage collection failed")
kl.recorder.Eventf(kl.nodeRef, v1.EventTypeWarning, events.ContainerGCFailed, err.Error())
loggedContainerGCFailure = true
} else {
var vLevel klog.Level = 4
if loggedContainerGCFailure {
vLevel = 1
loggedContainerGCFailure = false
}
klog.V(vLevel).InfoS("Container garbage collection succeeded")
}
}, ContainerGCPeriod, wait.NeverStop)
// when the high threshold is set to 100, stub the image GC manager
if kl.kubeletConfiguration.ImageGCHighThresholdPercent == 100 {
klog.V(2).InfoS("ImageGCHighThresholdPercent is set 100, Disable image GC")
return
}
prevImageGCFailed := false
go wait.Until(func() {
if err := kl.imageManager.GarbageCollect(); err != nil {
if prevImageGCFailed {
klog.ErrorS(err, "Image garbage collection failed multiple times in a row")
// Only create an event for repeated failures
kl.recorder.Eventf(kl.nodeRef, v1.EventTypeWarning, events.ImageGCFailed, err.Error())
} else {
klog.ErrorS(err, "Image garbage collection failed once. Stats initialization may not have completed yet")
}
prevImageGCFailed = true
} else {
var vLevel klog.Level = 4
if prevImageGCFailed {
vLevel = 1
prevImageGCFailed = false
}
klog.V(vLevel).InfoS("Image garbage collection succeeded")
}
}, ImageGCPeriod, wait.NeverStop)
}
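The two periods used above are constants defined in pkg/kubelet/kubelet.go; in this source tree they are roughly:

const (
    // ContainerGCPeriod is the period for performing container garbage collection.
    ContainerGCPeriod = time.Minute
    // ImageGCPeriod is the period for performing image garbage collection.
    ImageGCPeriod = 5 * time.Minute
)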