kubelet、kube-controller-manager、kube-scheduler监控资源(pod、service等)的变化,当这些对象发生变化时,kube-apiserver会主动通知这些组件。大概是一个发布-订阅系统
组件向apiserver发起的watch可以带条件,scheduler watch的是所有未被调度的Pod,也就是Pod.destNode=""的Pod来进行调度操作;kubelet只关心自己节点上的Pod列表。apiserver向etcd发起的watch是没有条件的,只能知道某个数据发生了变化或创建、删除,但不能过滤具体的值。也就是说对象数据的条件过滤必须在apiserver端而不是etcd端完成。
pkg/registry/core/pod/storage/storage.go
kube-apiserver针对每一类资源(pod、service、endpoint、replication controller、depolyments)都会构建Storage,每一类资源(pod、service、depolyments),都会创建Storage对象,如:PodStorage。PodStorage.Pod.Store封装了对etcd的操作
type PodStorage struct {
Pod *REST
Binding *BindingREST
Eviction *EvictionREST
Status *StatusREST
Log *podrest.LogREST
Proxy *podrest.ProxyREST
Exec *podrest.ExecREST
Attach *podrest.AttachREST
PortForward *podrest.PortForwardREST
}
路径 staging/src/k8s.io/apiserver/pkg/endpoints/installer.go 一个rest.Storage
对象会被转换为watcher
和lister
对象
// what verbs are supported by the storage, used to know what verbs we support per path
creater, isCreater := storage.(rest.Creater)
namedCreater, isNamedCreater := storage.(rest.NamedCreater)
lister, isLister := storage.(rest.Lister)
getter, isGetter := storage.(rest.Getter)
提供list和watch服务的入口是同一个,在API接口中是通过 GET /pods?watch=true
这种方式来区分是list还是watch
case "LIST": // List all resources of a kind.
handler := metrics.InstrumentRouteFunc(action.Verb, resource, subresource, requestScope, restfulListResource(lister, watcher, reqScope, false, a.minRequestTimeout))
route := ws.GET(action.Path).To(handler).
Doc(doc).
Param(ws.QueryParameter("pretty", "If 'true', then the output is pretty printed.")).
Operation("list"+namespaced+kind+strings.Title(subresource)+operationSuffix).
Produces(append(storageMeta.ProducesMIMETypes(action.Verb), allMediaTypes...)...).
Returns(http.StatusOK, "OK", versionedList).
Writes(versionedList)
switch {
case isLister && isWatcher:
case isWatcher:
addParams(route, action.Params)
routes = append(routes, route)
restfulListResource函数最终去向为ListResource,路径vendor/k8s.io/apiserver/pkg/endpoints/handlers/get.go
func restfulListResource(r rest.Lister, rw rest.Watcher, scope handlers.RequestScope, forceWatch bool, minRequestTimeout time.Duration) restful.RouteFunction {
return func(req *restful.Request, res *restful.Response) {
handlers.ListResource(r, rw, scope, forceWatch, minRequestTimeout)(res.ResponseWriter, req.Request)
}
}
rw.Watch()
创建一个watcher
serveWatch()
来处理这个请求if opts.Watch || forceWatch {
......
watcher, err := rw.Watch(ctx, &opts)
requestInfo, _ := request.RequestInfoFrom(ctx)
metrics.RecordLongRunning(req, requestInfo, func() {
serveWatch(watcher, scope, req, w, timeout)
})
return
}
channel中读取一个event对象,然后不断的写入到http response的流中
for {
select {
case event, ok := <-ch:
obj := event.Object
s.Fixup(obj)
if err := s.EmbeddedEncoder.Encode(obj, buf);
unknown.Raw = buf.Bytes()
event.Object = &unknown
outEvent := &metav1.WatchEvent{}
*internalEvent = metav1.InternalEvent(event)
err := metav1.Convert_versioned_InternalEvent_to_versioned_Event(internalEvent, outEvent, nil)
if err := e.Encode(outEvent);
if len(ch) == 0 {
flusher.Flush()
}
buf.Reset()
}
watcher(watch.Interface)对象是被Rest.Storage对象创建出来的。watch是所有通用的函数,根据label和field过滤
func (e *Store) Watch(ctx genericapirequest.Context, options *metainternalversion.ListOptions) (watch.Interface, error) {
label := labels.Everything()
if options != nil && options.LabelSelector != nil {
label = options.LabelSelector
}
field := fields.Everything()
if options != nil && options.FieldSelector != nil {
field = options.FieldSelector
}
predicate := e.PredicateFunc(label, field)
resourceVersion := ""
if options != nil {
resourceVersion = options.ResourceVersion
predicate.IncludeUninitialized = options.IncludeUninitialized
}
return e.WatchPredicate(ctx, predicate, resourceVersion)
}
可以看出調用e.storage.watch這個接口,誰實現了這個接口看下面第二章講解
// WatchPredicate starts a watch for the items that matches.
func (e *Store) WatchPredicate(ctx genericapirequest.Context, p storage.SelectionPredicate, resourceVersion string) (watch.Interface, error) {
if name, ok := p.MatchesSingle(); ok {
if key, err := e.KeyFunc(ctx, name); err == nil {
w, err := e.Storage.Watch(ctx, key, resourceVersion, p)
if err != nil {
return nil, err
}
if e.Decorator != nil {
return newDecoratedWatcher(w, e.Decorator), nil
}
return w, nil
}
// if we cannot extract a key based on the current context, the
// optimization is skipped
}
w, err := e.Storage.WatchList(ctx, e.KeyRootFunc(ctx), resourceVersion, p)
if err != nil {
return nil, err
}
if e.Decorator != nil {
return newDecoratedWatcher(w, e.Decorator), nil
}
return w, nil
}
type Cacher struct {
// Underlying storage.Interface.
storage Interface
// Expected type of objects in the underlying cache.
objectType reflect.Type
// "sliding window" of recent changes of objects and the current state.
watchCache *watchCache
reflector *cache.Reflector
watcherIdx int
watchers indexedWatchers
}
// Implements storage.Interface.
func (c *Cacher) Watch(ctx context.Context, key string, resourceVersion string, pred SelectionPredicate) (watch.Interface, error) {
watchRV, err := ParseWatchResourceVersion(resourceVersion)
if err != nil {
return nil, err
}
c.ready.wait()
initEvents, err := c.watchCache.GetAllEventsSinceThreadUnsafe(watchRV)
chanSize := 10
forget := forgetWatcher(c, c.watcherIdx, triggerValue, triggerSupported)
watcher := newCacheWatcher(watchRV, chanSize, initEvents, watchFilterFunction(key, pred), forget, c.versioner)
c.watchers.addWatcher(watcher, c.watcherIdx, triggerValue, triggerSupported)
c.watcherIdx++
return watcher, nil
}
func newCacheWatcher(resourceVersion uint64, chanSize int, initEvents []*watchCacheEvent, filter watchFilterFunc, forget func(bool), versioner Versioner) *cacheWatcher {
watcher := &cacheWatcher{
input: make(chan *watchCacheEvent, chanSize),
result: make(chan watch.Event, chanSize),
done: make(chan struct{}),
filter: filter,
stopped: false,
forget: forget,
versioner: versioner,
}
go watcher.process(initEvents, resourceVersion)
return watcher
}
func (c *cacheWatcher) process(initEvents []*watchCacheEvent, resourceVersion uint64) {
for _, event := range initEvents {
c.sendWatchCacheEvent(event)
}
processingTime := time.Since(startTime)
if processingTime > initProcessThreshold {
objType := ""
if len(initEvents) > 0 {
objType = reflect.TypeOf(initEvents[0].Object).String()
}
glog.V(2).Infof("processing %d initEvents of %s took %v", len(initEvents), objType, processingTime)
}
defer close(c.result)
defer c.Stop()
for {
event, ok := <-c.input
if !ok {
return
}
// only send events newer than resourceVersion
if event.ResourceVersion > resourceVersion {
c.sendWatchCacheEvent(event)
}
}
}
// NOTE: sendWatchCacheEvent is assumed to not modify !!!
func (c *cacheWatcher) sendWatchCacheEvent(event *watchCacheEvent) {
curObjPasses := event.Type != watch.Deleted && c.filter(event.Key, event.ObjLabels, event.ObjFields, event.ObjUninitialized)
oldObjPasses := false
if event.PrevObject != nil {
oldObjPasses = c.filter(event.Key, event.PrevObjLabels, event.PrevObjFields, event.PrevObjUninitialized)
}
if !curObjPasses && !oldObjPasses {
// Watcher is not interested in that object.
return
}
var watchEvent watch.Event
switch {
case curObjPasses && !oldObjPasses:
watchEvent = watch.Event{Type: watch.Added, Object: event.Object.DeepCopyObject()}
case curObjPasses && oldObjPasses:
watchEvent = watch.Event{Type: watch.Modified, Object: event.Object.DeepCopyObject()}
case !curObjPasses && oldObjPasses:
// return a delete event with the previous object content, but with the event's resource version
oldObj := event.PrevObject.DeepCopyObject()
if err := c.versioner.UpdateObject(oldObj, event.ResourceVersion); err != nil {
utilruntime.HandleError(fmt.Errorf("failure to version api object (%d) %#v: %v", event.ResourceVersion, oldObj, err))
}
watchEvent = watch.Event{Type: watch.Deleted, Object: oldObj}
}
select {
case <-c.done:
return
default:
}
select {
case c.result <- watchEvent:
case <-c.done:
}
}
PodStorage.Pod.Store封装了对etcd的操作;
func NewStorage(optsGetter generic.RESTOptionsGetter, k client.ConnectionInfoGetter, proxyTransport http.RoundTripper, podDisruptionBudgetClient policyclient.PodDisruptionBudgetsGetter) PodStorage {
store := &genericregistry.Store{
NewFunc: func() runtime.Object { return &api.Pod{} },
NewListFunc: func() runtime.Object { return &api.PodList{} },
PredicateFunc: pod.MatchPod,
DefaultQualifiedResource: api.Resource("pods"),
CreateStrategy: pod.Strategy,
UpdateStrategy: pod.Strategy,
DeleteStrategy: pod.Strategy,
ReturnDeletedObject: true,
TableConvertor: printerstorage.TableConvertor{TablePrinter: printers.NewTablePrinter().With(printersinternal.AddHandlers)},
}
options := &generic.StoreOptions{RESTOptions: optsGetter, AttrFunc: pod.GetAttrs, TriggerFunc: pod.NodeNameTriggerFunc}
if err := store.CompleteWithOptions(options); err != nil {
panic(err) // TODO: Propagate error up
}
statusStore := *store
statusStore.UpdateStrategy = pod.StatusStrategy
return PodStorage{
Pod: &REST{store, proxyTransport},
Binding: &BindingREST{store: store},
Eviction: newEvictionStorage(store, podDisruptionBudgetClient),
Status: &StatusREST{store: &statusStore},
Log: &podrest.LogREST{Store: store, KubeletConn: k},
Proxy: &podrest.ProxyREST{Store: store, ProxyTransport: proxyTransport},
Exec: &podrest.ExecREST{Store: store, KubeletConn: k},
Attach: &podrest.AttachREST{Store: store, KubeletConn: k},
PortForward: &podrest.PortForwardREST{Store: store, KubeletConn: k},
}
}
func (e *Store) CompleteWithOptions(options *generic.StoreOptions) error {
opts, err := options.RESTOptions.GetRESTOptions(e.DefaultQualifiedResource)
if err != nil {
return err
}
if e.Storage == nil {
e.Storage, e.DestroyFunc = opts.Decorator(
opts.StorageConfig,
e.NewFunc(),
prefix,
keyFunc,
e.NewListFunc,
attrFunc,
triggerFunc,
)
}
return nil
}
func (f *storageFactoryRestOptionsFactory) GetRESTOptions(resource schema.GroupResource) (generic.RESTOptions, error) {
storageConfig, err := f.StorageFactory.NewConfig(resource)
if err != nil {
return generic.RESTOptions{}, fmt.Errorf("unable to find storage destination for %v, due to %v", resource, err.Error())
}
ret := generic.RESTOptions{
StorageConfig: storageConfig,
Decorator: generic.UndecoratedStorage,
DeleteCollectionWorkers: f.Options.DeleteCollectionWorkers,
EnableGarbageCollection: f.Options.EnableGarbageCollection,
ResourcePrefix: f.StorageFactory.ResourcePrefix(resource),
}
if f.Options.EnableWatchCache {
sizes, err := ParseWatchCacheSizes(f.Options.WatchCacheSizes)
if err != nil {
return generic.RESTOptions{}, err
}
cacheSize, ok := sizes[resource]
if !ok {
cacheSize = f.Options.DefaultWatchCacheSize
}
ret.Decorator = genericregistry.StorageWithCacher(cacheSize)
}
return ret, nil
}
// Creates a cacher based given storageConfig.
func StorageWithCacher(capacity int) generic.StorageDecorator {
return func(
storageConfig *storagebackend.Config,
objectType runtime.Object,
resourcePrefix string,
keyFunc func(obj runtime.Object) (string, error),
newListFunc func() runtime.Object,
getAttrsFunc storage.AttrFunc,
triggerFunc storage.TriggerPublisherFunc) (storage.Interface, factory.DestroyFunc) {
s, d := generic.NewRawStorage(storageConfig)
if capacity == 0 {
glog.V(5).Infof("Storage caching is disabled for %T", objectType)
return s, d
}
cacherConfig := storage.CacherConfig{
CacheCapacity: capacity,
Storage: s,
Versioner: etcdstorage.APIObjectVersioner{},
Type: objectType,
ResourcePrefix: resourcePrefix,
KeyFunc: keyFunc,
NewListFunc: newListFunc,
GetAttrsFunc: getAttrsFunc,
TriggerPublisherFunc: triggerFunc,
Codec: storageConfig.Codec,
}
cacher := storage.NewCacherFromConfig(cacherConfig)
destroyFunc := func() {
cacher.Stop()
d()
}
RegisterStorageCleanup(destroyFunc)
return cacher, destroyFunc
}
}
(1) watchCache是一个结构体,用来存储apiserver从etcd watch到的对象
(2) watchers是一个indexedWatchers结构体,当kubelet,scheduler需要watch某类资源时,他们会向kube-apiserver发起watch请求,apiserver就会生成一个cacheWatcher,cacheWatcher负责将watch的资源从apiserver发送到kubelet, scheduler
(3) Reflector结构体数据成员:ListerWatcher,ListerWatcher是接口对象,包括方法List()和Watch();listerWatcher包装了Storage,主要是将watch到的对象存到watchCache中;
// ListerWatcher is any object that knows how to perform an initial list and start a watch on a resource.
type ListerWatcher interface {
// List should return a list type object; the Items field will be extracted, and the
// ResourceVersion field will be used to start the watch in the right place.
List(options metav1.ListOptions) (runtime.Object, error)
// Watch should begin a watch at the specified version.
Watch(options metav1.ListOptions) (watch.Interface, error)
}
(4) incoming channel接收watchCacheEvent的100个空间的channel
func (c *Cacher) dispatchEvents() {
for {
select {
case event, ok := <-c.incoming:
if !ok {
return
}
c.dispatchEvent(&event)
case <-c.stopCh:
return
}
}
}
func (c *Cacher) dispatchEvent(event *watchCacheEvent) {
triggerValues, supported := c.triggerValues(event)
for _, watcher := range c.watchers.allWatchers {
watcher.add(event, c.dispatchTimeoutBudget)
}
if supported {
for _, triggerValue := range triggerValues {
for _, watcher := range c.watchers.valueWatchers[triggerValue] {
watcher.add(event, c.dispatchTimeoutBudget)
}
}
} else {
for _, watchers := range c.watchers.valueWatchers {
for _, watcher := range watchers {
watcher.add(event, c.dispatchTimeoutBudget)
}
}
}
}
func NewCacherFromConfig(config CacherConfig) *Cacher {
watchCache := newWatchCache(config.CacheCapacity, config.KeyFunc, config.GetAttrsFunc)
listerWatcher := newCacherListerWatcher(config.Storage, config.ResourcePrefix, config.NewListFunc)
reflectorName := "storage/cacher.go:" + config.ResourcePrefix
stopCh := make(chan struct{})
cacher := &Cacher{
}
watchCache.SetOnEvent(cacher.processEvent)
go cacher.dispatchEvents()
cacher.stopWg.Add(1)
go func() {
defer cacher.stopWg.Done()
wait.Until(
func() {
if !cacher.isStopped() {
cacher.startCaching(stopCh)
}
}, time.Second, stopCh,
)
}()
return cacher
}
func (c *Cacher) startCaching(stopChannel <-chan struct{}) {
successfulList := false
c.watchCache.SetOnReplace(func() {
successfulList = true
c.ready.set(true)
})
defer func() {
if successfulList {
c.ready.set(false)
}
}()
c.terminateAllWatchers()
if err := c.reflector.ListAndWatch(stopChannel); err != nil {
glog.Errorf("unexpected ListAndWatch error: %v", err)
}
}
func (r *Reflector) ListAndWatch(stopCh <-chan struct{}) error {
list, err := r.listerWatcher.List(options)
r.metrics.listDuration.Observe(time.Since(start).Seconds())
listMetaInterface, err := meta.ListAccessor(list)
if err != nil {
return fmt.Errorf("%s: Unable to understand list result %#v: %v", r.name, list, err)
}
resourceVersion = listMetaInterface.GetResourceVersion()
for {
w, err := r.listerWatcher.Watch(options)
if err := r.watchHandler(w, &resourceVersion, resyncerrc, stopCh); err != nil {
return nil
}
}
}
func newCacherListerWatcher(storage Interface, resourcePrefix string, newListFunc func() runtime.Object) cache.ListerWatcher {
return &cacherListerWatcher{
storage: storage,
resourcePrefix: resourcePrefix,
newListFunc: newListFunc,
}
}
// Implements cache.ListerWatcher interface.
func (lw *cacherListerWatcher) List(options metav1.ListOptions) (runtime.Object, error) {
list := lw.newListFunc()
if err := lw.storage.List(context.TODO(), lw.resourcePrefix, "", Everything, list); err != nil {
return nil, err
}
return list, nil
}
// Implements cache.ListerWatcher interface.
func (lw *cacherListerWatcher) Watch(options metav1.ListOptions) (watch.Interface, error) {
return lw.storage.WatchList(context.TODO(), lw.resourcePrefix, options.ResourceVersion, Everything)
}
func NewRawStorage(config *storagebackend.Config) (storage.Interface, factory.DestroyFunc) {
s, d, err := factory.Create(*config)
return s, d
}
func Create(c storagebackend.Config) (storage.Interface, DestroyFunc, error) {
switch c.Type {
case storagebackend.StorageTypeETCD2:
return newETCD2Storage(c)
case storagebackend.StorageTypeUnset, storagebackend.StorageTypeETCD3:
return newETCD3Storage(c)
default:
return nil, nil, fmt.Errorf("unknown storage type: %s", c.Type)
}
}
func newETCD3Storage(c storagebackend.Config) (storage.Interface, DestroyFunc, error) {
client, err := clientv3.New(cfg)
if c.Quorum {
return etcd3.New(client, c.Codec, c.Prefix, transformer, c.Paging), destroyFunc, nil
}
return etcd3.NewWithNoQuorumRead(client, c.Codec, c.Prefix, transformer, c.Paging), destroyFunc, nil
}
// List implements storage.Interface.List.
func (s *store) List(ctx context.Context, key, resourceVersion string, pred storage.SelectionPredicate, listObj runtime.Object) error
func (s *store) watch(ctx context.Context, key string, rv string, pred storage.SelectionPredicate, recursive bool) (watch.Interface, error) {
Container逻辑上是W
示例 流程分析
网上很好的一个示例图
流程/Pod状态 | PodPhase | PodCondition | 组件 |
---|---|---|---|
流程1 | - | - | 用户声明资源数据: 创建deployment,无Pod |
流程2~3 | Pending | - | kube-controller-manager: deploy控制器创建Pod |
流程4~5 | Pending | PodScheduled=true | Kube-scheduler: 调度器调度Pod成功 |
流程6~7 | Running | PodScheduled=true; PodInitialized=true; PodReady=true | kubelet: Work node上成功运行容器 |
staging/src/k8s.io/apimachinery/pkg/watch
watch包中主要包含5个文件:watch.go,mux.go,filter.go,streamwatch.go,util.go。
interface对事件的监听结果和停止监听的方法
// Interface can be implemented by anything that knows how to watch and report changes.
type Interface interface {
// Stops watching. Will close the channel returned by ResultChan(). Releases
// any resources used by the watch.
Stop()
// Returns a chan which will receive all the events. If an error occurs
// or Stop() is called, this channel will be closed, in which case the
// watch should be completely cleaned up.
ResultChan() <-chan Event
}
Event结构体,包括事件类型和事件发生的对象,k8s中对象都是runtime.Object。event对象就代表了k8s中某种资源对象(包括各个组件)发生的某种操作。
type Event struct {
Type EventType
// Object is:
// * If Type is Added or Modified: the new state of the object.
// * If Type is Deleted: the state of the object immediately before deletion.
// * If Type is Error: *api.Status is recommended; other types may make sense
// depending on context.
Object runtime.Object
}
master.go文件中三个结构体实现了Interface接口
emptyWatch: 最简单的实现
type emptyWatch chan Event
// NewEmptyWatch returns a watch interface that returns no results and is closed.
// May be used in certain error conditions where no information is available but
// an error is not warranted.
func NewEmptyWatch() Interface {
ch := make(chan Event)
close(ch)
return emptyWatch(ch)
}
FakerWatcher: 线程安全的,包括对k8s的各种事件的操作(ADDED,MODIFIED,DELETED,ERROR)
type FakeWatcher struct {
result chan Event
Stopped bool
sync.Mutex
}
func NewFake() *FakeWatcher {
return &FakeWatcher{
result: make(chan Event),
}
RaceFreeFakeWatcher: 线程安全
type RaceFreeFakeWatcher struct {
result chan Event
Stopped bool
sync.Mutex
}
func NewRaceFreeFake() *RaceFreeFakeWatcher {
return &RaceFreeFakeWatcher{
result: make(chan Event, DefaultChanSize),
}
}
A.b mutx.go文件
定义:事件广播器
包括所有的k8s的事件(pod,service,deployment,namespace),创建一个事件广播器,会启动一个goroutine注所有的watcher
定义:broadcastWatcher
事件广播器观察者就是interface的实现,很简单,就是一个resultChan方法和stop
A.b.a Broadcaster结构体
Broadcaster很好理解,直接过
type Broadcaster struct {
// TODO: see if this lock is needed now that new watchers go through
// the incoming channel.
lock sync.Mutex
watchers map[int64]*broadcasterWatcher
nextWatcher int64
distributing sync.WaitGroup
incoming chan Event
// How large to make watcher's channel.
watchQueueLength int
// If one of the watch channels is full, don't wait for it to become empty.
// Instead just deliver it to the watchers that do have space in their
// channels and move on to the next event.
// It's more fair to do this on a per-watcher basis than to do it on the
// "incoming" channel, which would allow one slow watcher to prevent all
// other watchers from getting new events.
fullChannelBehavior FullChannelBehavior
}
A.b.b BroadcasterWatcher结构体
BroadcasterWatcher处理单个的Broadcaster,设计的也相当简单
type broadcasterWatcher struct {
result chan Event
stopped chan struct{}
stop sync.Once
id int64
m *Broadcaster
}
// ResultChan returns a channel to use for waiting on events.
func (mw *broadcasterWatcher) ResultChan() <-chan Event {
return mw.result
}
// Stop stops watching and removes mw from its list.
func (mw *broadcasterWatcher) Stop() {
mw.stop.Do(func() {
close(mw.stopped)
mw.m.stopWatching(mw.id)
})
}
A.c filter.go文件
A.c.a r结构体
filteredWatch带有过滤的watch,比broaderwatcher升级了过滤功能,代码也是相当简单明了
type filteredWatch struct {
incoming Interface
result chan Event
f FilterFunc
}