[kubernetes/k8s source code analysis] Rook operator cluster controller source code analysis, part 1

        The mon's role is to monitor, manage, and coordinate the other roles in the distributed system (OSD/PG, Client, MDS), ensuring data consistency across the whole environment.

        Mons use a leader/follower (active/standby) model: even when the system contains multiple mon roles, only one mon actually does the work while the others remain in standby.

       When Ceph loses the leader mon, the remaining mons elect a new leader by voting, based on the Paxos algorithm.

    The mon startup command:

     ceph-mon --fsid=dcef92d7-1f6a-4b9d-8ed0-0037d537d00b --keyring=/etc/ceph/keyring-store/keyring --log-to-stderr=true --err-to-stderr=true --mon-cluster-log-to-stderr=true --log-stderr-prefix=debug  --mon-host=10.200.63.69:6789 --mon-initial-members=a --id=a --foreground --public-addr=10.200.63.69 --public-bind-addr=192.170.56.83

 

      For example, run kubectl apply -f cluster.yaml with a CephCluster resource like the following:

apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
  name: rook-ceph
  namespace: rook-ceph
spec:
  cephVersion:
    # The container image used to launch the Ceph daemon pods (mon, mgr, osd, mds, rgw).
    # v12 is luminous, v13 is mimic, and v14 is nautilus.
    # RECOMMENDATION: In production, use a specific version tag instead of the general v14 flag, which pulls the latest release and could result in different
    # versions running within the cluster. See tags available at https://hub.docker.com/r/ceph/ceph/tags/.
    image: ceph/ceph:v14.2.0-20190410
    # Whether to allow unsupported versions of Ceph. Currently luminous, mimic and nautilus are supported, with the recommendation to upgrade to nautilus.
    # Do not set to true in production.
    allowUnsupported: false
  # The path on the host where configuration files will be persisted. Must be specified.
  # Important: if you reinstall the cluster, make sure you delete this directory from each host or else the mons will fail to start on the new cluster.
  # In Minikube, the '/data' directory is configured to persist across reboots. Use "/data/rook" in Minikube environment.
  dataDirHostPath: /var/lib/rook
  # set the amount of mons to be started
  mon:
    count: 1
    allowMultiplePerNode: false
  # enable the ceph dashboard for viewing cluster status
  dashboard:
    enabled: true
    # serve the dashboard under a subpath (useful when you are accessing the dashboard via a reverse proxy)
    # urlPrefix: /ceph-dashboard
    # serve the dashboard at the given port.
    # port: 8443
    # serve the dashboard using SSL
    # ssl: true
  network:
    # toggle to use hostNetwork
    hostNetwork: false
  rbdMirroring:
    # The number of daemons that will perform the rbd mirroring.
    # rbd mirroring must be configured with "rbd mirror" from the rook toolbox.
    workers: 0
  # To control where various services will be scheduled by kubernetes, use the placement configuration sections below.
  # The example under 'all' would have all services scheduled on kubernetes nodes labeled with 'role=storage-node' and
  # tolerate taints with a key of 'storage-node'.
#  placement:
#    all:
#      nodeAffinity:
#        requiredDuringSchedulingIgnoredDuringExecution:
#          nodeSelectorTerms:
#          - matchExpressions:
#            - key: role
#              operator: In
#              values:
#              - storage-node
#      podAffinity:
#      podAntiAffinity:
#      tolerations:
#      - key: storage-node
#        operator: Exists
# The above placement information can also be specified for mon, osd, and mgr components
#    mon:
#    osd:
#    mgr:
  annotations:
#    all:
#    mon:
#    osd:
# If no mgr annotations are set, prometheus scrape annotations will be set by default.
#   mgr:
  resources:
# The requests and limits set here, allow the mgr pod to use half of one CPU core and 1 gigabyte of memory
#    mgr:
#      limits:
#        cpu: "500m"
#        memory: "1024Mi"
#      requests:
#        cpu: "500m"
#        memory: "1024Mi"
# The above example requests/limits can also be added to the mon and osd components
#    mon:
#    osd:
  storage: # cluster level storage configuration and selection
    useAllNodes: true
    useAllDevices: false
    deviceFilter:
    location:
    config:
      # The default and recommended storeType is dynamically set to bluestore for devices and filestore for directories.
      # Set the storeType explicitly only if it is required not to use the default.
      # storeType: bluestore
      # metadataDevice: "md0" # specify a non-rotational storage so ceph-volume will use it as block db device of bluestore.
      # databaseSizeMB: "1024" # uncomment if the disks are smaller than 100 GB
      # journalSizeMB: "1024"  # uncomment if the disks are 20 GB or smaller
      # osdsPerDevice: "1" # this value can be overridden at the node or device level
      # encryptedDevice: "true" # the default value for this option is "false"
# Cluster level list of directories to use for filestore-based OSD storage. If uncommented, this example would create an OSD under the dataDirHostPath.
    directories:
    - path: /var/lib/rook
# Individual nodes and their config can be specified as well, but 'useAllNodes' above must be set to false. Then, only the named
# nodes below will be used as storage resources.  Each node's 'name' field should match their 'kubernetes.io/hostname' label.
#    nodes:
#    - name: "172.17.4.101"
#      directories: # specific directories to use for storage can be specified for each node
#      - path: "/rook/storage-dir"
#      resources:
#        limits:
#          cpu: "500m"
#          memory: "1024Mi"
#        requests:
#          cpu: "500m"
#          memory: "1024Mi"
    - name: "master-node"
      devices: # specific devices to use for storage can be specified for each node
      - name: "sdb"
#      - name: "nvme01" # multiple osds can be created on high performance devices
        config:
          osdsPerDevice: "5"
      config: # configuration can be specified at the node level which overrides the cluster level config
        storeType: filestore
#    - name: "172.17.4.301"
#      deviceFilter: "^sd."

 

1. The NewClusterController function

     Instantiates a ClusterController, which watches the CephCluster and node resources (a usage sketch follows the constructor below).

// NewClusterController create controller for watching cluster custom resources created
func NewClusterController(context *clusterd.Context, rookImage string, volumeAttachment attachment.Attachment) *ClusterController {
	return &ClusterController{
		context:          context,
		volumeAttachment: volumeAttachment,
		rookImage:        rookImage,
		clusterMap:       make(map[string]*cluster),
	}
}
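     For context, here is a minimal sketch of how the operator might wire this controller up; the function name runClusterController and the exact StartWatch signature are assumptions for illustration, not the verbatim operator code.

// hypothetical wiring inside the operator; real rook code may differ in names and error handling
func runClusterController(context *clusterd.Context, rookImage string,
	volumeAttachment attachment.Attachment, namespaceToWatch string, stopCh chan struct{}) {

	// create the controller that watches CephCluster and node resources
	clusterController := NewClusterController(context, rookImage, volumeAttachment)

	// StartWatch registers the CephCluster watcher, the node informer and,
	// unless disabled, the device-discovery configmap informer (see section 2)
	clusterController.StartWatch(namespaceToWatch, stopCh)

	// block until the operator is asked to stop
	<-stopCh
}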

 

2. The StartWatch function

  2.1 Define the resource callback functions: add, update, delete

      The CephCluster resource is watched asynchronously:

resourceHandlerFuncs := cache.ResourceEventHandlerFuncs{
	AddFunc:    c.onAdd,
	UpdateFunc: c.onUpdate,
	DeleteFunc: c.onDelete,
}

logger.Infof("start watching clusters in all namespaces")
watcher := opkit.NewWatcher(ClusterResource, namespace, resourceHandlerFuncs, c.context.RookClientset.CephV1().RESTClient())
go watcher.Watch(&cephv1.CephCluster{}, stopCh)

  2.2 Watch node resources by setting up an informer

      onK8sNodeAdd is covered in section 3.

// watch for events on new/updated K8s nodes, too

lwNodes := &cache.ListWatch{
	ListFunc: func(options metav1.ListOptions) (runtime.Object, error) {
		return c.context.Clientset.CoreV1().Nodes().List(options)
	},
	WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
		return c.context.Clientset.CoreV1().Nodes().Watch(options)
	},
}

_, nodeController := cache.NewInformer(
	lwNodes,
	&v1.Node{},
	0,
	cache.ResourceEventHandlerFuncs{
		AddFunc:    c.onK8sNodeAdd,
		UpdateFunc: c.onK8sNodeUpdate,
		DeleteFunc: nil,
	},
)
go nodeController.Run(stopCh)

  2.3 When hotplug orchestration is enabled, set up a configmap informer that reacts to updates of the device discovery configmap

if disableVal := os.Getenv(disableHotplugEnv); disableVal != "true" {
	// watch for updates to the device discovery configmap
	logger.Infof("Enabling hotplug orchestration: %s=%s", disableHotplugEnv, disableVal)
	operatorNamespace := os.Getenv(k8sutil.PodNamespaceEnvVar)
	_, deviceCMController := cache.NewInformer(
		cache.NewFilteredListWatchFromClient(c.context.Clientset.CoreV1().RESTClient(),
			"configmaps", operatorNamespace, func(options *metav1.ListOptions) {
				options.LabelSelector = fmt.Sprintf("%s=%s", k8sutil.AppAttr, discoverDaemon.AppName)
			},
		),
		&v1.ConfigMap{},
		0,
		cache.ResourceEventHandlerFuncs{
			AddFunc:    nil,
			UpdateFunc: c.onDeviceCMUpdate,
			DeleteFunc: nil,
		},
	)

	go deviceCMController.Run(stopCh)
} else {
	logger.Infof("Disabling hotplug orchestration via %s", disableHotplugEnv)
}

  2.4 watchLegacyClusters watches the legacy Cluster resource

func (c *ClusterController) watchLegacyClusters(namespace string, stopCh chan struct{}, resourceHandlerFuncs cache.ResourceEventHandlerFuncs) {
	// watch for cluster.rook.io/v1beta1 events if the CRD exists
	if _, err := c.context.RookClientset.CephV1beta1().Clusters(namespace).List(metav1.ListOptions{}); err != nil {
		logger.Infof("skipping watching for legacy rook cluster events (legacy cluster CRD probably doesn't exist): %+v", err)
	} else {
		logger.Infof("start watching legacy rook clusters in all namespaces")
		watcherLegacy := opkit.NewWatcher(ClusterResourceRookLegacy, namespace, resourceHandlerFuncs, c.context.RookClientset.CephV1beta1().RESTClient())
		go watcherLegacy.Watch(&cephv1beta1.Cluster{}, stopCh)
	}
}

 

3. The onK8sNodeAdd function

    Runs when a watched node is added; if the node passes validation, createInstance is called to bring it into the cluster.

func (c *ClusterController) onK8sNodeAdd(obj interface{}) {
	newNode, ok := obj.(*v1.Node)
	if !ok {
		logger.Warningf("Expected NodeList but handler received %#v", obj)
	}

	if k8sutil.GetNodeSchedulable(*newNode) == false {
		logger.Debugf("Skipping cluster update. Added node %s is unschedulable", newNode.Labels[v1.LabelHostname])
		return
	}

	for _, cluster := range c.clusterMap {
		if cluster.Spec.Storage.UseAllNodes == false {
			logger.Debugf("Skipping -> Do not use all Nodes")
			continue
		}
		if cluster.Info == nil {
			logger.Info("Cluster %s is not ready. Skipping orchestration.", cluster.Namespace)
			continue
		}

		if valid, _ := k8sutil.ValidNode(*newNode, cluster.Spec.Placement.All()); valid == true {
			logger.Debugf("Adding %s to cluster %s", newNode.Labels[v1.LabelHostname], cluster.Namespace)
			err := cluster.createInstance(c.rookImage, cluster.Info.CephVersion)
			if err != nil {
				logger.Errorf("Failed to update cluster in namespace %s. Was not able to add %s. %+v", cluster.Namespace, newNode.Labels[v1.LabelHostname], err)
			}
		} else {
			logger.Infof("Could not add host %s . It is not valid", newNode.Labels[v1.LabelHostname])
			continue
		}
		logger.Infof("Added %s to cluster %s", newNode.Labels[v1.LabelHostname], cluster.Namespace)
	}
}

 

4. The onAdd function

   Handles creation of a watched CephCluster resource.

  4.1 Validate the mon configuration: the default count is 3, and the count should be odd with 1 <= count <= 9

if cluster.Spec.Mon.Count <= 0 {
	logger.Warningf("mon count is 0 or less, should be at least 1, will use default value of %d", mon.DefaultMonCount)
	cluster.Spec.Mon.Count = mon.DefaultMonCount
	cluster.Spec.Mon.AllowMultiplePerNode = true
}
if cluster.Spec.Mon.Count > mon.MaxMonCount {
	logger.Warningf("mon count is bigger than %d (given: %d), not supported, changing to %d", mon.MaxMonCount, cluster.Spec.Mon.Count, mon.MaxMonCount)
	cluster.Spec.Mon.Count = mon.MaxMonCount
}
if cluster.Spec.Mon.Count%2 == 0 {
	logger.Warningf("mon count is even (given: %d), should be uneven, continuing", cluster.Spec.Mon.Count)
}

    4.2 The detectCephVersion function

      Runs a job named rook-ceph-detect-version whose main task is to execute ceph version:

# ceph version
ceph version 13.2.3 (9bf3c8b1a04b0aa4a3cc78456a508f1c48e70279) mimic (stable)

  •       The WaitForJobCompletion function waits until the job status is succeeded; the version is then parsed from the job output (see the sketch after the job spec below)
  •       DeleteBatchJob deletes the job
job := &batch.Job{
	ObjectMeta: metav1.ObjectMeta{
		Name:      detectVersionName,
		Namespace: c.Namespace,
	},
	Spec: batch.JobSpec{
		Template: v1.PodTemplateSpec{
			ObjectMeta: metav1.ObjectMeta{
				Labels: map[string]string{
					"job": detectVersionName,
				},
			},
			Spec: podSpec,
		},
	},
}
k8sutil.AddRookVersionLabelToJob(job)
k8sutil.SetOwnerRef(c.context.Clientset, c.Namespace, &job.ObjectMeta, &c.ownerRef)

// run the job to detect the version
if err := k8sutil.RunReplaceableJob(c.context.Clientset, job, true); err != nil {
	return nil, fmt.Errorf("failed to start version job. %+v", err)
}
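      After the job completes, its output is parsed into a CephVersion. The rook helper for this is not shown here; the following is a minimal sketch of such parsing, assuming the output contains a line of the form "ceph version X.Y.Z (...) codename (stable)" as above (extractCephVersion is an illustrative name, not the exact rook function).

import (
	"fmt"
	"regexp"
	"strconv"
)

var versionPattern = regexp.MustCompile(`ceph version (\d+)\.(\d+)\.(\d+)`)

// extractCephVersion parses "ceph version 13.2.3 (...) mimic (stable)" style output
// into the major/minor/extra triple used by the cephver package. Illustrative sketch only.
func extractCephVersion(output string) (*CephVersion, error) {
	m := versionPattern.FindStringSubmatch(output)
	if m == nil {
		return nil, fmt.Errorf("failed to parse ceph version from %q", output)
	}
	major, _ := strconv.Atoi(m[1])
	minor, _ := strconv.Atoi(m[2])
	extra, _ := strconv.Atoi(m[3])
	return &CephVersion{Major: major, Minor: minor, Extra: extra}, nil
}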

     4.3 Supported and unsupported Ceph versions

// Luminous Ceph version
Luminous = CephVersion{12, 0, 0}
// Mimic Ceph version
Mimic = CephVersion{13, 0, 0}
// Nautilus Ceph version
Nautilus = CephVersion{14, 0, 0}
// Octopus Ceph version
Octopus = CephVersion{15, 0, 0}

// supportedVersions are production-ready versions that rook supports
supportedVersions   = []CephVersion{Luminous, Mimic, Nautilus}
unsupportedVersions = []CephVersion{Octopus}
if !cluster.Spec.CephVersion.AllowUnsupported {
	if !cephVersion.Supported() {
		logger.Errorf("unsupported ceph version detected: %s. allowUnsupported must be set to true to run with this version.", cephVersion)
		return
	}
}
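      The Supported check compares the detected version against the supportedVersions list by release family (major version). A minimal sketch of such a check, under the assumption that only the major number identifies the release; the real cephver method may differ in detail.

// isSupported reports whether v belongs to one of the production-ready release families
// (Luminous = 12, Mimic = 13, Nautilus = 14). Sketch only.
func isSupported(v CephVersion, supported []CephVersion) bool {
	for _, sv := range supported {
		if v.Major == sv.Major {
			return true
		}
	}
	return false
}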

    4.4 The updateClusterStatus function

      Fetches the CephCluster resource and updates its status, e.g. to Creating. The resulting object looks like this:

apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
  annotations:
    kubectl.kubernetes.io/last-applied-configuration: |
      {"apiVersion":"ceph.rook.io/v1","kind":"CephCluster","metadata":{"annotations":{},"name":"rook-ceph","namespace":"rook-ceph"},"spec":{"annotations":null,"cephVersion":{"allowUnsupported":false,"image":"ceph/ceph:v13.2.3-20190410"},"dashboard":{"enabled":false},"dataDirHostPath":"/var/lib/rook","mon":{"allowMultiplePerNode":false,"count":1},"network":{"hostNetwork":false},"rbdMirroring":{"workers":0},"resources":null,"storage":{"config":{"osdsPerDevice":"1","storeType":"bluestore"},"deviceFilter":"/dev/sda","location":null,"nodes":[{"directories":[{"path":"/var/lib/rook"}],"name":"master-node"},{"directories":[{"path":"/var/lib/rook"}],"name":"node1"}],"useAllDevices":false,"useAllNodes":false}}}
  creationTimestamp: "2019-05-23T06:46:46Z"
  finalizers:
  - cephcluster.ceph.rook.io
  generation: 1144
  name: rook-ceph
  namespace: rook-ceph
  resourceVersion: "299112"
  selfLink: /apis/ceph.rook.io/v1/namespaces/rook-ceph/cephclusters/rook-ceph
  uid: 885ae2f9-7d26-11e9-82d7-0800271c9f15
spec:
  cephVersion:
    image: ceph/ceph:v13.2.3-20190410
  dashboard: {}
  dataDirHostPath: /var/lib/rook
  mon:
    allowMultiplePerNode: false
    count: 1
    preferredCount: 0
  network:
    hostNetwork: false
  rbdMirroring:
    workers: 0
  storage:
    config:
      osdsPerDevice: "1"
      storeType: bluestore
    deviceFilter: /dev/sda
    nodes:
    - config: null
      directories:
      - config: null
        path: /var/lib/rook
      name: master-node
      resources: {}
    - config: null
      directories:
      - config: null
        path: /var/lib/rook
      name: node1
      resources: {}
    useAllDevices: false
status:
  ceph:
    health: HEALTH_OK
    lastChanged: "2019-05-23T06:56:14Z"
    lastChecked: "2019-05-24T02:24:24Z"
    previousHealth: HEALTH_WARN
  state: Created    

 

  4.5 Create the Rook cluster components

    updateClusterStatus sets the cluster state to Creating

    createInstance creates the components

    updateClusterStatus then sets the cluster state to Created (a sketch of updateClusterStatus follows the retry loop below)

// Start the Rook cluster components. Retry several times in case of failure.
err = wait.Poll(clusterCreateInterval, clusterCreateTimeout, func() (bool, error) {
	if err := c.updateClusterStatus(clusterObj.Namespace, clusterObj.Name, cephv1.ClusterStateCreating, ""); err != nil {
		logger.Errorf("failed to update cluster status in namespace %s: %+v", cluster.Namespace, err)
		return false, nil
	}

	err := cluster.createInstance(c.rookImage, *cephVersion)
	if err != nil {
		logger.Errorf("failed to create cluster in namespace %s. %+v", cluster.Namespace, err)
		return false, nil
	}

	// cluster is created, update the cluster CRD status now
	if err := c.updateClusterStatus(clusterObj.Namespace, clusterObj.Name, cephv1.ClusterStateCreated, ""); err != nil {
		logger.Errorf("failed to update cluster status in namespace %s: %+v", cluster.Namespace, err)
		return false, nil
	}

	return true, nil
})
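     For reference, a minimal sketch of what updateClusterStatus might look like: it fetches the CephCluster through the Rook clientset and writes the new state back. Field names follow the cephv1 API used above, but this is not the verbatim rook code.

// updateClusterStatus fetches the CephCluster and persists the new state and message.
// Sketch only: status fields and error wrapping may differ from the real implementation.
func (c *ClusterController) updateClusterStatus(namespace, name string, state cephv1.ClusterState, message string) error {
	// get the most recent version of the cluster CRD object
	cluster, err := c.context.RookClientset.CephV1().CephClusters(namespace).Get(name, metav1.GetOptions{})
	if err != nil {
		return fmt.Errorf("failed to get cluster %s in namespace %s: %+v", name, namespace, err)
	}

	// update the status fields and write the object back
	cluster.Status.State = state
	cluster.Status.Message = message
	if _, err := c.context.RookClientset.CephV1().CephClusters(namespace).Update(cluster); err != nil {
		return fmt.Errorf("failed to update cluster %s status: %+v", name, err)
	}

	return nil
}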

 

5. The createInstance function

  The main work happens in doOrchestration, analyzed in section 6.

func (c *cluster) createInstance(rookImage string, cephVersion cephver.CephVersion) error {
	var err error
	c.setOrchestrationNeeded()

	// execute an orchestration until
	// there are no more unapplied changes to the cluster definition and
	// while no other goroutine is already running a cluster update
	for c.checkSetOrchestrationStatus() == true {
		if err != nil {
			logger.Errorf("There was an orchestration error, but there is another orchestration pending; proceeding with next orchestration run (which may succeed). %+v", err)
		}
		// Use a DeepCopy of the spec to avoid using an inconsistent data-set
		spec := c.Spec.DeepCopy()

		err = c.doOrchestration(rookImage, cephVersion, spec)

		c.unsetOrchestrationStatus()
	}

	return err
}

 

6. The doOrchestration function

  6.1 Create a configmap named rook-config-override in the rook-ceph namespace

apiVersion: v1
data:
  config: ""
kind: ConfigMap
metadata:
  creationTimestamp: "2019-05-23T06:46:57Z"
  name: rook-config-override
  namespace: rook-ceph
  ownerReferences:
  - apiVersion: v1
    blockOwnerDeletion: true
    kind: CephCluster
    name: rook-ceph
    uid: 885ae2f9-7d26-11e9-82d7-0800271c9f15
  resourceVersion: "134416"
  selfLink: /api/v1/namespaces/rook-ceph/configmaps/rook-config-override
  uid: 8ef4890a-7d26-11e9-82d7-0800271c9f15

// Create a configmap for overriding ceph config settings
// These settings should only be modified by a user after they are initialized
placeholderConfig := map[string]string{
	k8sutil.ConfigOverrideVal: "",
}
cm := &v1.ConfigMap{
	ObjectMeta: metav1.ObjectMeta{
		Name: k8sutil.ConfigOverrideName,
	},
	Data: placeholderConfig,
}
k8sutil.SetOwnerRef(c.context.Clientset, c.Namespace, &cm.ObjectMeta, &c.ownerRef)
_, err := c.context.Clientset.CoreV1().ConfigMaps(c.Namespace).Create(cm)
if err != nil && !errors.IsAlreadyExists(err) {
	return fmt.Errorf("failed to create override configmap %s. %+v", c.Namespace, err)
}

  6.2 Start creates the mon pods and their configuration

     6.2.1 The CreateOrLoadClusterInfo function

      Uses the client-go API to fetch the mon secret; if it does not exist, createNamedClusterInfo is called, which runs ceph-authtool --create-keyring /var/lib/rook/rook-ceph/mon.keyring --gen-key -n mon. --cap mon 'allow *' to generate the mon secret,

      and ceph-authtool --create-keyring /var/lib/rook/rook-ceph/client.admin.keyring --gen-key -n client.admin --cap mon 'allow *' --cap osd 'allow *' --cap mgr 'allow *' --cap mds 'allow' to generate the admin secret (a sketch of this key generation follows the code below).

     The GenerateConfigFile function writes the configuration to /var/lib/rook/rook-ceph/rook-ceph.config and copies it to /etc/ceph/ceph.conf, as shown below:

[global]
fsid                      = 4b8dfb1b-2a8b-46df-b5d3-a9021b501337
run dir                   = /var/lib/rook/rook-ceph
mon initial members       = a
mon host                  = v1:10.200.33.128:6789
log file                  = /dev/stderr
mon cluster log file      = /dev/stderr
mon keyvaluedb            = rocksdb
mon_allow_pool_delete     = true
mon_max_pg_per_osd        = 1000
debug default             = 0
debug rados               = 0
debug mon                 = 0
debug osd                 = 0
debug bluestore           = 0
debug filestore           = 0
debug journal             = 0
debug leveldb             = 0
filestore_omap_backend    = rocksdb
osd pg bits               = 11
osd pgp bits              = 11
osd pool default size     = 1
osd pool default min size = 1
osd pool default pg num   = 100
osd pool default pgp num  = 100
rbd_default_features      = 3
fatal signal handlers     = false

[client.admin]
keyring = /var/lib/rook/rook-ceph/client.admin.keyring

secrets, err := context.Clientset.CoreV1().Secrets(namespace).Get(appName, metav1.GetOptions{})
if err != nil {
	if !errors.IsNotFound(err) {
		return nil, maxMonID, monMapping, fmt.Errorf("failed to get mon secrets. %+v", err)
	}
	if ownerRef == nil {
		return nil, maxMonID, monMapping, fmt.Errorf("not expected to create new cluster info and did not find existing secret")
	}

	clusterInfo, err = createNamedClusterInfo(context, namespace)
	if err != nil {
		return nil, maxMonID, monMapping, fmt.Errorf("failed to create mon secrets. %+v", err)
	}

	err = createClusterAccessSecret(context.Clientset, namespace, clusterInfo, ownerRef)
	if err != nil {
		return nil, maxMonID, monMapping, err
	}
} else {
	clusterInfo = &cephconfig.ClusterInfo{
		Name:          string(secrets.Data[clusterSecretName]),
		FSID:          string(secrets.Data[fsidSecretName]),
		MonitorSecret: string(secrets.Data[monSecretName]),
		AdminSecret:   string(secrets.Data[adminSecretName]),
	}
	logger.Debugf("found existing monitor secrets for cluster %s", clusterInfo.Name)
}
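      createNamedClusterInfo is where the two ceph-authtool commands above are executed. Below is a minimal sketch of the key-generation part, assuming the Executor on clusterd.Context and a hypothetical extractKey helper that reads the generated key back out of the keyring file; the real function also generates the fsid and cluster name.

// sketch of generating the mon and admin keys with ceph-authtool
func generateKeys(context *clusterd.Context, dataDir string) (monSecret, adminSecret string, err error) {
	monKeyring := path.Join(dataDir, "mon.keyring")
	adminKeyring := path.Join(dataDir, "client.admin.keyring")

	// generate the mon. key
	if err := context.Executor.ExecuteCommand(false, "", "ceph-authtool",
		"--create-keyring", monKeyring, "--gen-key", "-n", "mon.",
		"--cap", "mon", "allow *"); err != nil {
		return "", "", fmt.Errorf("failed to gen mon keyring. %+v", err)
	}

	// generate the client.admin key with full mon/osd/mgr caps
	if err := context.Executor.ExecuteCommand(false, "", "ceph-authtool",
		"--create-keyring", adminKeyring, "--gen-key", "-n", "client.admin",
		"--cap", "mon", "allow *", "--cap", "osd", "allow *",
		"--cap", "mgr", "allow *", "--cap", "mds", "allow"); err != nil {
		return "", "", fmt.Errorf("failed to gen admin keyring. %+v", err)
	}

	// extractKey is assumed to parse the `key = ...` line out of a keyring file
	if monSecret, err = extractKey(monKeyring); err != nil {
		return "", "", err
	}
	adminSecret, err = extractKey(adminKeyring)
	return monSecret, adminSecret, err
}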

     6.2.2 The initClusterInfo function

     The saveMonConfig function creates the rook-ceph-mon-endpoints configmap with the following content:

data:
  data: a=10.200.33.128:6789
  mapping: '{"node":{"a":{"Name":"master-node","Hostname":"master-node","Address":"192.168.72.106"}},"port":{}}'
  maxMonId: "0"

     CreateOrUpdate creates the rook-ceph-mons-keyring secret, which is shared by all mons

     CreateOrUpdate also creates the rook-ceph-admin-keyring secret

// initClusterInfo retrieves the ceph cluster info if it already exists.
// If a new cluster, create new keys.
func (c *Cluster) initClusterInfo(cephVersion cephver.CephVersion) error {
	var err error
	// get the cluster info from secret
	c.clusterInfo, c.maxMonID, c.mapping, err = CreateOrLoadClusterInfo(c.context, c.Namespace, &c.ownerRef)
	c.clusterInfo.CephVersion = cephVersion

	if err != nil {
		return fmt.Errorf("failed to get cluster info. %+v", err)
	}

	// save cluster monitor config
	if err = c.saveMonConfig(); err != nil {
		return fmt.Errorf("failed to save mons. %+v", err)
	}

	k := keyring.GetSecretStore(c.context, c.Namespace, &c.ownerRef)
	// store the keyring which all mons share
	if err := k.CreateOrUpdate(keyringStoreName, c.genMonSharedKeyring()); err != nil {
		return fmt.Errorf("failed to save mon keyring secret. %+v", err)
	}
	// also store the admin keyring for other daemons that might need it during init
	if err := k.Admin().CreateOrUpdate(c.clusterInfo); err != nil {
		return fmt.Errorf("failed to save admin keyring secret. %+v", err)
	}

	return nil
}

 

7. The startMons function

  7.1 The initMonConfig function

      Initializes the mon configuration (a sketch of newMonConfig follows the code below):

func (c *Cluster) initMonConfig(size int) (int, []*monConfig) {
	mons := []*monConfig{}

	// initialize the mon pod info for mons that have been previously created
	for _, monitor := range c.clusterInfo.Monitors {
		mons = append(mons, &monConfig{
			ResourceName: resourceName(monitor.Name),
			DaemonName:   monitor.Name,
			Port:         cephutil.GetPortFromEndpoint(monitor.Endpoint),
			DataPathMap: config.NewStatefulDaemonDataPathMap(
				c.dataDirHostPath, dataDirRelativeHostPath(monitor.Name), config.MonType, monitor.Name, c.Namespace),
		})
	}

	// initialize mon info if we don't have enough mons (at first startup)
	existingCount := len(c.clusterInfo.Monitors)
	for i := len(c.clusterInfo.Monitors); i < size; i++ {
		c.maxMonID++
		mons = append(mons, c.newMonConfig(c.maxMonID))
	}

	return existingCount, mons
}
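      New mons get their daemon name from maxMonID, translated into the familiar a/b/c naming. A minimal sketch of what newMonConfig might do, assuming an index-to-name helper such as k8sutil.IndexToName (0 -> "a", 1 -> "b", ...) and a default msgr1 port constant; the real constructor may differ in detail.

// sketch of building the config for a brand-new mon
func (c *Cluster) newMonConfig(monID int) *monConfig {
	daemonName := k8sutil.IndexToName(monID) // 0 -> "a", 1 -> "b", ...
	return &monConfig{
		ResourceName: resourceName(daemonName), // e.g. "rook-ceph-mon-a"
		DaemonName:   daemonName,
		Port:         DefaultMsgr1Port, // 6789; the constant name is an assumption
		DataPathMap: config.NewStatefulDaemonDataPathMap(
			c.dataDirHostPath, dataDirRelativeHostPath(daemonName), config.MonType, daemonName, c.Namespace),
	}
}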

  7.2 The ensureMonsRunning function

     7.2.1 initMonIPs: if host networking is not used, a service is created per mon and its cluster IP is used (a sketch of createService follows the code below)

func (c *Cluster) initMonIPs(mons []*monConfig) error {
	for _, m := range mons {
		if c.HostNetwork {
			logger.Infof("setting mon endpoints for hostnetwork mode")
			node, ok := c.mapping.Node[m.DaemonName]
			if !ok {
				return fmt.Errorf("mon doesn't exist in assignment map")
			}
			m.PublicIP = node.Address
		} else {
			serviceIP, err := c.createService(m)
			if err != nil {
				return fmt.Errorf("failed to create mon service. %+v", err)
			}
			m.PublicIP = serviceIP
		}
		c.clusterInfo.Monitors[m.DaemonName] = cephconfig.NewMonInfo(m.DaemonName, m.PublicIP, m.Port)
	}

	return nil
}
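     createService builds a ClusterIP service per mon so the mon keeps a stable IP across pod restarts. A minimal sketch under that assumption; the labels helper (getLabels) and port name are illustrative, not the exact rook code.

// sketch of creating the per-mon service and returning its cluster IP
func (c *Cluster) createService(mon *monConfig) (string, error) {
	svc := &v1.Service{
		ObjectMeta: metav1.ObjectMeta{
			Name:   mon.ResourceName, // e.g. rook-ceph-mon-a
			Labels: c.getLabels(mon.DaemonName),
		},
		Spec: v1.ServiceSpec{
			Ports: []v1.ServicePort{{
				Name:       "msgr1",
				Port:       mon.Port, // 6789
				TargetPort: intstr.FromInt(int(mon.Port)),
				Protocol:   v1.ProtocolTCP,
			}},
			Selector: c.getLabels(mon.DaemonName),
		},
	}
	k8sutil.SetOwnerRef(c.context.Clientset, c.Namespace, &svc.ObjectMeta, &c.ownerRef)

	s, err := c.context.Clientset.CoreV1().Services(c.Namespace).Create(svc)
	if err != nil {
		if !errors.IsAlreadyExists(err) {
			return "", fmt.Errorf("failed to create mon service. %+v", err)
		}
		// service already exists; fetch it so we can return its cluster IP
		if s, err = c.context.Clientset.CoreV1().Services(c.Namespace).Get(svc.Name, metav1.GetOptions{}); err != nil {
			return "", fmt.Errorf("failed to get existing mon service. %+v", err)
		}
	}
	return s.Spec.ClusterIP, nil
}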

    7.2.2 The saveMonConfig function persists the rook-ceph-mon-endpoints configmap

    7.2.3 The writeConnectionConfig function writes the config file into the /var/lib/rook/rook-ceph directory and copies it to /etc/ceph/

    7.2.4 The client-go API is called to create one deployment per mon

        The full mon deployment/pod is shown near the end of this article.

        waitForQuorumWithMons runs ceph mon_status --connect-timeout=15 --cluster=rook-ceph --conf=/var/lib/rook/rook-ceph/rook-ceph.config --keyring=/var/lib/rook/rook-ceph/client.admin.keyring --format json --out-file /tmp/565758730 to check the mon status (a sketch of this wait loop follows the startMon code below):

{
    "name":"a",
    "rank":0,
    "state":"leader",
    "election_epoch":9,
    "quorum":[
        0
    ],
    "features":{
        "required_con":"144115738102218752",
        "required_mon":[
            "kraken",
            "luminous",
            "mimic",
            "osdmap-prune"
        ],
        "quorum_con":"4611087854031142907",
        "quorum_mon":[
            "kraken",
            "luminous",
            "mimic",
            "osdmap-prune"
        ]
    },
    "outside_quorum":[

    ],
    "extra_probe_peers":[

    ],
    "sync_provider":[

    ],
    "monmap":{
        "epoch":1,
        "fsid":"dcef92d7-1f6a-4b9d-8ed0-0037d537d00b",
        "modified":"2019-05-23 06:47:12.870838",
        "created":"2019-05-23 06:47:12.870838",
        "features":{
            "persistent":[
                "kraken",
                "luminous",
                "mimic",
                "osdmap-prune"
            ],
            "optional":[

            ]
        },
        "mons":[
            {
                "rank":0,
                "name":"a",
                "addr":"10.200.63.69:6789/0",
                "public_addr":"10.200.63.69:6789/0"
            }
        ]
    },
    "feature_map":{
        "mon":[
            {
                "features":"0x3ffddff8ffa4fffb",
                "release":"luminous",
                "num":1
            }
        ],
        "osd":[
            {
                "features":"0x3ffddff8ffa4fffb",
                "release":"luminous",
                "num":2
            }
        ],
        "client":[
            {
                "features":"0x27018fb86aa42ada",
                "release":"jewel",
                "num":1
            },
            {
                "features":"0x3ffddff8ffa4fffb",
                "release":"luminous",
                "num":1
            }
        ],
        "mgr":[
            {
                "features":"0x3ffddff8ffa4fffb",
                "release":"luminous",
                "num":1
            }
        ]
    }
}

func (c *Cluster) startMon(m *monConfig, hostname string) error {
	d := c.makeDeployment(m, hostname)
	logger.Debugf("Starting mon: %+v", d.Name)
	_, err := c.context.Clientset.AppsV1().Deployments(c.Namespace).Create(d)
	if err != nil {
		if !errors.IsAlreadyExists(err) {
			return fmt.Errorf("failed to create mon deployment %s. %+v", m.ResourceName, err)
		}
		logger.Infof("deployment for mon %s already exists. updating if needed", m.ResourceName)
		if _, err := updateDeploymentAndWait(c.context, d, c.Namespace); err != nil {
			return fmt.Errorf("failed to update mon deployment %s. %+v", m.ResourceName, err)
		}
	}

	return nil
}
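        waitForQuorumWithMons polls the mon_status command shown above until every expected mon is in quorum. A minimal sketch of such a wait loop, assuming a client.GetMonStatus wrapper around ceph mon_status; the real rook helper and its response fields may be named differently.

// sketch of waiting until all expected mons have joined the quorum
func waitForQuorum(context *clusterd.Context, clusterName string, expectedMons []string) error {
	return wait.Poll(5*time.Second, 10*time.Minute, func() (bool, error) {
		// run `ceph mon_status` through the ceph client wrapper
		status, err := client.GetMonStatus(context, clusterName, false)
		if err != nil {
			logger.Infof("failed to get mon_status, will retry. %+v", err)
			return false, nil
		}

		// build a set of the ranks that are currently in quorum
		inQuorum := map[int]bool{}
		for _, rank := range status.Quorum {
			inQuorum[rank] = true
		}

		// every expected mon must appear in the monmap with its rank in quorum
		for _, name := range expectedMons {
			found := false
			for _, m := range status.MonMap.Mons {
				if m.Name == name && inQuorum[m.Rank] {
					found = true
					break
				}
			}
			if !found {
				logger.Infof("mon %s not yet in quorum, waiting", name)
				return false, nil
			}
		}
		return true, nil
	})
}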

 

8. Enabling the messenger v2 protocol

// Enable Ceph messenger 2 protocol on Nautilus
if c.clusterInfo.CephVersion.IsAtLeastNautilus() {
	v, err := client.GetCephMonVersion(c.context)
	if err != nil {
		return fmt.Errorf("failed to get ceph mon version. %+v", err)
	}
	if v.IsAtLeastNautilus() {
		versions, err := client.GetCephVersions(c.context)
		if err != nil {
			return fmt.Errorf("failed to get ceph daemons versions. %+v", err)
		}
		if len(versions.Mon) == 1 {
			// If length is one, this clearly indicates that all the mons are running the same version
			// We are doing this because 'ceph version' might return the Ceph version that a majority of mons has but not all of them
			// so instead of trying to active msgr2 when mons are not ready, we activate it when we believe that's the right time
			client.EnableMessenger2(c.context)
		}
	}
}

    8.1 EnableMessenger2

      Runs the command ceph mon enable-msgr2. msgr2 supports encryption and Kerberos-style authorization/authentication, which is very beneficial for improving the security of a Ceph cluster.

// EnableMessenger2 enable the messenger 2 protocol on Nautilus clusters
func EnableMessenger2(context *clusterd.Context) error {
	_, err := context.Executor.ExecuteCommandWithOutput(false, "", "ceph", "mon", "enable-msgr2")
	if err != nil {
		return fmt.Errorf("failed to enable msgr2 protocol: %+v", err)
	}
	logger.Infof("successfully enabled msgr2 protocol")

	return nil
}

 

9. The createInitialCrushMap function

    9.1 The CreateDefaultCrushMap function

      Adjust the CRUSH tunables of the existing cluster: ceph osd crush tunables firefly --connect-timeout=15 --cluster=rook-ceph --conf=/var/lib/rook/rook-ceph/rook-ceph.config --keyring=/var/lib/rook/rook-ceph/client.admin.keyring --format plain --out-file /tmp/113703137

      Compile the plain-text map into a binary CRUSH map file: crushtool -c /tmp/533654220 -o /tmp/337292219

      SetCrushMap applies the compiled binary map to the cluster: ceph osd setcrushmap -i /tmp/337292219 --connect-timeout=15 --cluster=rook-ceph --conf=/var/lib/rook/rook-ceph/rook-ceph.config --keyring=/var/lib/rook/rook-ceph/client.admin.keyring --format json --out-file /tmp/809736670 (a sketch of these calls follows below)
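      A minimal sketch of those three calls, assuming the same Executor and ceph client wrappers used elsewhere in this article; the helper names (ExecuteCephCommandPlain, ExecuteCephCommand) and the temp-file handling are illustrative.

// sketch of setting the initial crush map: dump the tunables as text, compile, then apply
func createDefaultCrushMap(context *clusterd.Context, clusterName string) error {
	// 1. ask ceph for a plain-text crush map with the firefly tunables applied
	crushText, err := client.ExecuteCephCommandPlain(context, clusterName,
		[]string{"osd", "crush", "tunables", "firefly"})
	if err != nil {
		return fmt.Errorf("failed to get crush tunables. %+v", err)
	}

	// 2. write the text to a temp file and compile it to a binary map with crushtool
	textFile, err := ioutil.TempFile("", "crush-text")
	if err != nil {
		return err
	}
	binFile, err := ioutil.TempFile("", "crush-bin")
	if err != nil {
		return err
	}
	if err := ioutil.WriteFile(textFile.Name(), crushText, 0600); err != nil {
		return fmt.Errorf("failed to write crush text. %+v", err)
	}
	if err := context.Executor.ExecuteCommand(false, "", "crushtool",
		"-c", textFile.Name(), "-o", binFile.Name()); err != nil {
		return fmt.Errorf("failed to compile crush map. %+v", err)
	}

	// 3. apply the compiled map to the cluster
	if _, err := client.ExecuteCephCommand(context, clusterName,
		[]string{"osd", "setcrushmap", "-i", binFile.Name()}); err != nil {
		return fmt.Errorf("failed to set crush map. %+v", err)
	}
	return nil
}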

 

 

Mon notes

  1. ceph quorum_status --format json  shows the mon role status

  • quorum_names: the voting participants
  • quorum: the ranks of the voting participants
  • quorum_leader_name: the currently elected leader mon
  • election_epoch: the total number of election rounds so far
  • epoch: the number of times an election has been initiated
  • rank: the weight of each mon; the lower the rank, the more likely that mon is to win majority support in an election

{"election_epoch":3,"quorum":[0],"quorum_names":["a"],"quorum_leader_name":"a","quorum_age":189,"monmap":{"epoch":1,"fsid":"9d6b7ae7-362c-45df-bb28-cfc8393a2edf","modified":"2019-05-06 19:40:19.266161","created":"2019-05-06 19:40:19.266161","min_mon_release":14,"min_mon_release_name":"nautilus","features":{"persistent":["kraken","luminous","mimic","osdmap-prune","nautilus"],"optional":[]},"mons":[{"rank":0,"name":"a","public_addrs":{"addrvec":[{"type":"v2","addr":"10.200.88.83:3300","nonce":0},{"type":"v1","addr":"10.200.88.83:6789","nonce":0}]},"addr":"10.200.88.83:6789/0","public_addr":"10.200.88.83:6789/0"}]}}

   

The mon pod (created by the mon deployment):

apiVersion: v1
kind: Pod
metadata:
  creationTimestamp: "2019-05-23T06:47:11Z"
  generateName: rook-ceph-mon-a-54d989f65-
  labels:
    app: rook-ceph-mon
    ceph_daemon_id: a
    mon: a
    mon_cluster: rook-ceph
    pod-template-hash: 54d989f65
    rook_cluster: rook-ceph
  name: rook-ceph-mon-a-54d989f65-hs958
  namespace: rook-ceph
  ownerReferences:
  - apiVersion: apps/v1
    blockOwnerDeletion: true
    controller: true
    kind: ReplicaSet
    name: rook-ceph-mon-a-54d989f65
    uid: 975c250f-7d26-11e9-82d7-0800271c9f15
  resourceVersion: "140783"
  selfLink: /api/v1/namespaces/rook-ceph/pods/rook-ceph-mon-a-54d989f65-hs958
  uid: 975e035c-7d26-11e9-82d7-0800271c9f15
spec:
  affinity: {}
  containers:
  - args:
    - --fsid=dcef92d7-1f6a-4b9d-8ed0-0037d537d00b
    - --keyring=/etc/ceph/keyring-store/keyring
    - --log-to-stderr=true
    - --err-to-stderr=true
    - --mon-cluster-log-to-stderr=true
    - '--log-stderr-prefix=debug '
    - --mon-host=$(ROOK_CEPH_MON_HOST)
    - --mon-initial-members=$(ROOK_CEPH_MON_INITIAL_MEMBERS)
    - --id=a
    - --foreground
    - --public-addr=10.200.63.69
    - --public-bind-addr=$(ROOK_POD_IP)
    command:
    - ceph-mon
    env:
    - name: CONTAINER_IMAGE
      value: ceph/ceph:v13.2.3-20190410
    - name: POD_NAME
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: metadata.name
    - name: POD_NAMESPACE
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: metadata.namespace
    - name: NODE_NAME
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: spec.nodeName
    - name: POD_MEMORY_LIMIT
      valueFrom:
        resourceFieldRef:
          divisor: "0"
          resource: limits.memory
    - name: POD_MEMORY_REQUEST
      valueFrom:
        resourceFieldRef:
          divisor: "0"
          resource: requests.memory
    - name: POD_CPU_LIMIT
      valueFrom:
        resourceFieldRef:
          divisor: "1"
          resource: limits.cpu
    - name: POD_CPU_REQUEST
      valueFrom:
        resourceFieldRef:
          divisor: "0"
          resource: requests.cpu
    - name: ROOK_CEPH_MON_HOST
      valueFrom:
        secretKeyRef:
          key: mon_host
          name: rook-ceph-config
    - name: ROOK_CEPH_MON_INITIAL_MEMBERS
      valueFrom:
        secretKeyRef:
          key: mon_initial_members
          name: rook-ceph-config
    - name: ROOK_POD_IP
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: status.podIP
    image: ceph/ceph:v13.2.3-20190410
    imagePullPolicy: IfNotPresent
    name: mon
    ports:
    - containerPort: 6789
      name: client
      protocol: TCP
    resources: {}
    securityContext:
      privileged: false
      procMount: Default
    terminationMessagePath: /dev/termination-log
    terminationMessagePolicy: File
    volumeMounts:
    - mountPath: /etc/ceph
      name: rook-ceph-config
      readOnly: true
    - mountPath: /etc/ceph/keyring-store/
      name: rook-ceph-mons-keyring
      readOnly: true
    - mountPath: /var/log/ceph
      name: rook-ceph-log
    - mountPath: /var/lib/ceph/mon/ceph-a
      name: ceph-daemon-data
    - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
      name: default-token-5jp2h
      readOnly: true
  dnsPolicy: ClusterFirst
  enableServiceLinks: true
  initContainers:
  - args:
    - --fsid=dcef92d7-1f6a-4b9d-8ed0-0037d537d00b
    - --keyring=/etc/ceph/keyring-store/keyring
    - --log-to-stderr=true
    - --err-to-stderr=true
    - --mon-cluster-log-to-stderr=true
    - '--log-stderr-prefix=debug '
    - --mon-host=$(ROOK_CEPH_MON_HOST)
    - --mon-initial-members=$(ROOK_CEPH_MON_INITIAL_MEMBERS)
    - --id=a
    - --public-addr=10.200.63.69
    - --mkfs
    command:
    - ceph-mon
    env:
    - name: CONTAINER_IMAGE
      value: ceph/ceph:v13.2.3-20190410
    - name: POD_NAME
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: metadata.name
    - name: POD_NAMESPACE
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: metadata.namespace
    - name: NODE_NAME
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: spec.nodeName
    - name: POD_MEMORY_LIMIT
      valueFrom:
        resourceFieldRef:
          divisor: "0"
          resource: limits.memory
    - name: POD_MEMORY_REQUEST
      valueFrom:
        resourceFieldRef:
          divisor: "0"
          resource: requests.memory
    - name: POD_CPU_LIMIT
      valueFrom:
        resourceFieldRef:
          divisor: "1"
          resource: limits.cpu
    - name: POD_CPU_REQUEST
      valueFrom:
        resourceFieldRef:
          divisor: "0"
          resource: requests.cpu
    - name: ROOK_CEPH_MON_HOST
      valueFrom:
        secretKeyRef:
          key: mon_host
          name: rook-ceph-config
    - name: ROOK_CEPH_MON_INITIAL_MEMBERS
      valueFrom:
        secretKeyRef:
          key: mon_initial_members
          name: rook-ceph-config
    image: ceph/ceph:v13.2.3-20190410
    imagePullPolicy: IfNotPresent
    name: init-mon-fs
    resources: {}
    securityContext:
      privileged: false
      procMount: Default
    terminationMessagePath: /dev/termination-log
    terminationMessagePolicy: File
    volumeMounts:
    - mountPath: /etc/ceph
      name: rook-ceph-config
      readOnly: true
    - mountPath: /etc/ceph/keyring-store/
      name: rook-ceph-mons-keyring
      readOnly: true
    - mountPath: /var/log/ceph
      name: rook-ceph-log
    - mountPath: /var/lib/ceph/mon/ceph-a
      name: ceph-daemon-data
    - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
      name: default-token-5jp2h
      readOnly: true
  nodeName: master-node
  nodeSelector:
    kubernetes.io/hostname: master-node
  restartPolicy: Always
  schedulerName: default-scheduler
  securityContext: {}
  serviceAccount: default
  serviceAccountName: default
  terminationGracePeriodSeconds: 30
  volumes:
  - configMap:
      defaultMode: 420
      items:
      - key: ceph.conf
        mode: 256
        path: ceph.conf
      name: rook-ceph-config
    name: rook-ceph-config
  - name: rook-ceph-mons-keyring
    secret:
      defaultMode: 420
      secretName: rook-ceph-mons-keyring
  - hostPath:
      path: /var/lib/rook/rook-ceph/log
      type: ""
    name: rook-ceph-log
  - hostPath:
      path: /var/lib/rook/mon-a/data
      type: ""
    name: ceph-daemon-data
  - name: default-token-5jp2h
    secret:
      defaultMode: 420
      secretName: default-token-5jp2h
status:
  conditions:
  - lastProbeTime: null
    lastTransitionTime: "2019-05-23T07:41:47Z"
    status: "True"
    type: Initialized
  - lastProbeTime: null
    lastTransitionTime: "2019-05-23T07:41:48Z"
    status: "True"
    type: Ready
  - lastProbeTime: null
    lastTransitionTime: "2019-05-23T07:41:48Z"
    status: "True"
    type: ContainersReady
  - lastProbeTime: null
    lastTransitionTime: "2019-05-23T06:47:11Z"
    status: "True"
    type: PodScheduled
  containerStatuses:
  - containerID: docker://a08fe6840c7fc9dc6755a88cd7dd06038e57c6f412a09c63414dff627fc4905b
    image: ceph/ceph:v13.2.3-20190410
    imageID: docker://sha256:fdb3585c96619a300dc2f153a3269c7b6e222adce9eed6ec199dc54302b9195a
    lastState:
      terminated:
        containerID: docker://8477dd340d5d1dd8f8c5c9d7815da31b4cc9aa9ecb95ea8bf844976dee917988
        exitCode: 0
        finishedAt: "2019-05-23T07:39:14Z"
        reason: Completed
        startedAt: "2019-05-23T07:28:48Z"
    name: mon
    ready: true
    restartCount: 3
    state:
      running:
        startedAt: "2019-05-23T07:41:48Z"
  hostIP: 192.168.74.57
  initContainerStatuses:
  - containerID: docker://7316d3105025179241e3fe56eb7b7024325c700cd5bfb9044d72305593c2b753
    image: ceph/ceph:v13.2.3-20190410
    imageID: docker://sha256:fdb3585c96619a300dc2f153a3269c7b6e222adce9eed6ec199dc54302b9195a
    lastState: {}
    name: init-mon-fs
    ready: true
    restartCount: 3
    state:
      terminated:
        containerID: docker://7316d3105025179241e3fe56eb7b7024325c700cd5bfb9044d72305593c2b753
        exitCode: 0
        finishedAt: "2019-05-23T07:41:46Z"
        reason: Completed
        startedAt: "2019-05-23T07:41:40Z"
  phase: Running
  podIP: 192.170.56.83
  qosClass: BestEffort
  startTime: "2019-05-23T06:47:11Z"

 

Summary:

     This article covered how the CephCluster resource is watched; when one is added, the onAdd function is invoked.

     That includes creating the configmaps, initializing the cluster info, generating the rook-ceph.config file, and storing the secrets and configmaps in Kubernetes.

     It then creates the mon deployments, verifies startup with ceph mon_status, sets the initial CRUSH map, and so on.

     The mgr, osd, and other components are created afterwards.
