【openstack】【nova】Nova instance deletion: flow and code analysis

Under normal conditions, instance deletion takes one of two paths:
1. is_local_delete = True: _local_delete() is used
2. is_local_delete = False: compute_rpcapi.terminate_instance() is used
Instances in the vm_states.SHELVED or vm_states.SHELVED_OFFLOADED states are handled by a different path.

The two deletion paths above correspond to two common scenarios:
1) nova-compute has been down for some time; when nova-api checks the service status (nova service-list) it sees nova-compute as down (although the instance processes on that node are still running).
2) When nova-api checks the service status (nova service-list) it sees nova-compute as up (the service may indeed be up, or it may actually be down).
Detection: nova-compute writes a heartbeat timestamp to the database every 10 s, and nova-api judges the service state from that timestamp. If the timestamp is older than service_down_time = 60 s, nova-compute is considered down.
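This liveness check is just a timestamp comparison. The following is a minimal, self-contained sketch of the idea behind the DB servicegroup driver's service_is_up() check; the function and constant names are illustrative, not nova's exact source.

import datetime

SERVICE_DOWN_TIME = 60   # seconds, mirrors CONF.service_down_time
REPORT_INTERVAL = 10     # seconds, mirrors CONF.report_interval (heartbeat period)

def service_is_up(last_heartbeat, now=None):
    """Return True if the last heartbeat written to the DB is recent enough."""
    now = now or datetime.datetime.utcnow()
    elapsed = (now - last_heartbeat).total_seconds()
    return abs(elapsed) <= SERVICE_DOWN_TIME

# Example: a heartbeat written 90 s ago -> the service is treated as down.
stale = datetime.datetime.utcnow() - datetime.timedelta(seconds=90)
print(service_is_up(stale))  # False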

With a Ceph backend, these two deletion scenarios cause the following problems:
Scenario 1: if the instance was booted from a cinder volume, deleting it leaves the root disk behind.
Cause: the rbd image backing the root disk is locked.

Scenario 2: nova-api sees nova-compute as up even though it is actually down.
Deleting the instance then leaves it stuck in the deleting state; only when nova-compute recovers does the deletion complete.

Note: Ceph does not lock an rbd image at creation time; the lock is only taken once the image is in use. In scenario 1 the root disk is attached to the instance right after it is created, so it gets locked. A data volume attached to an instance is normally not locked until it is mounted inside the guest.
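The lock state of an image can be inspected directly with the rbd Python binding. The sketch below is illustrative only; the pool name 'volumes' and the image name 'volume-<uuid>' are assumptions and must be replaced with the actual cinder pool and volume name.

import rados
import rbd

cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
cluster.connect()
try:
    ioctx = cluster.open_ioctx('volumes')          # assumed cinder rbd pool
    try:
        image = rbd.Image(ioctx, 'volume-<uuid>')  # assumed image name
        try:
            lockers = image.list_lockers()
            # Empty result: no lock. Otherwise a dict whose 'lockers' entry is a
            # list of (client, cookie, address) tuples, the values break_lock()
            # needs (see the cinder fix further below).
            print(lockers)
        finally:
            image.close()
    finally:
        ioctx.close()
finally:
    cluster.shutdown()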

Key code analysis (Liberty):
nova/compute/api.py
def _delete_instance(self, context, instance):
    self._delete(context, instance, 'delete', self._do_delete,
                 task_state=task_states.DELETING)
# Here cb = self._do_delete and instance_attrs = {'task_state': task_states.DELETING}
def _delete(self, context, instance, delete_type, cb, **instance_attrs):
    if instance.disable_terminate:
        LOG.info(_LI('instance termination disabled'),
                 instance=instance)
        return
    bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
            context, instance.uuid)
    project_id, user_id = quotas_obj.ids_from_instance(context, instance)

    # At these states an instance has a snapshot associate.
    if instance.vm_state in (vm_states.SHELVED,
                             vm_states.SHELVED_OFFLOADED):
        snapshot_id = instance.system_metadata.get('shelved_image_id')
        LOG.info(_LI("Working on deleting snapshot %s "
                     "from shelved instance..."),
                 snapshot_id, instance=instance)
        try:
            self.image_api.delete(context, snapshot_id)
        except (exception.ImageNotFound,
                exception.ImageNotAuthorized) as exc:
            LOG.warning(_LW("Failed to delete snapshot "
                            "from shelved instance (%s)."),
                        exc.format_message(), instance=instance)
        except Exception:
            LOG.exception(_LE("Something wrong happened when trying to "
                              "delete snapshot from shelved instance."),
                          instance=instance)


    original_task_state = instance.task_state
    quotas = None
    try:
        # NOTE(maoy): no expected_task_state needs to be set
        instance.update(instance_attrs)
        instance.progress = 0

        instance.save()  # persist the update to the database

        # NOTE(comstud): If we delete the instance locally, we'll
        # commit the reservations here.  Otherwise, the manager side
        # will commit or rollback the reservations based on success.
        quotas = self._create_reservations(context,
                                           instance,
                                           original_task_state,
                                           project_id, user_id)
        # quota / reservation handling
        if self.cell_type == 'api':
            # NOTE(comstud): If we're in the API cell, we need to
            # skip all remaining logic and just call the callback,
            # which will cause a cast to the child cell.  Also,
            # commit reservations here early until we have a better
            # way to deal with quotas with cells.
            cb(context, instance, bdms, reservations=None)
            quotas.commit()
            return
        shelved_offloaded = (instance.vm_state
                             == vm_states.SHELVED_OFFLOADED)
        if not instance.host and not shelved_offloaded:
            try:
                compute_utils.notify_about_instance_usage(
                        self.notifier, context, instance,
                        "%s.start" % delete_type)
                instance.destroy()
                compute_utils.notify_about_instance_usage(
                        self.notifier, context, instance,
                        "%s.end" % delete_type,
                        system_metadata=instance.system_metadata)
                quotas.commit()
                return
            except exception.ObjectActionError:
                instance.refresh()

        if instance.vm_state == vm_states.RESIZED:
            self._confirm_resize_on_deleting(context, instance)

        is_local_delete = True
        try:
            if not shelved_offloaded:
                service = objects.Service.get_by_compute_host(
                    context.elevated(), instance.host)
                is_local_delete = not self.servicegroup_api.service_is_up(
                    service)  # check whether the compute service is up
            if not is_local_delete:
                if original_task_state in (task_states.DELETING,
                                              task_states.SOFT_DELETING):
                    LOG.info(_LI('Instance is already in deleting state, '
                                 'ignoring this request'),
                             instance=instance)
                    quotas.rollback()
                    return
                self._record_action_start(context, instance,
                                          instance_actions.DELETE)

                # NOTE(snikitin): If instance's vm_state is 'soft-delete',
                # we should not count reservations here, because instance
                # in soft-delete vm_state have already had quotas
                # decremented. More details:
                # https://bugs.launchpad.net/nova/+bug/1333145
                if instance.vm_state == vm_states.SOFT_DELETED:
                    quotas.rollback()

                cb(context, instance, bdms,
                   reservations=quotas.reservations)  # scenario 2: cb is self._do_delete()
        except exception.ComputeHostNotFound:
            pass

        if is_local_delete:
            # If instance is in shelved_offloaded state or compute node
            # isn't up, delete instance from db and clean bdms info and
            # network info
            self._local_delete(context, instance, bdms, delete_type, cb)  # scenario 1 path
            quotas.commit()


    except exception.InstanceNotFound:
        # NOTE(comstud): Race condition. Instance already gone.
        if quotas:
            quotas.rollback()
    except Exception:
        with excutils.save_and_reraise_exception():
            if quotas:
                quotas.rollback()
# Scenario 2:
    def _do_delete(self, context, instance, bdms, reservations=None,
                   local=False):
        if local:
            instance.vm_state = vm_states.DELETED
            instance.task_state = None
            instance.terminated_at = timeutils.utcnow()
            instance.save()
        else:
            self.compute_rpcapi.terminate_instance(context, instance, bdms,
                                                   reservations=reservations,
                                                   delete_type='delete')
# The request is cast over the message queue to the target compute node, which
# then calls the (libvirt) driver to destroy the instance.
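terminate_instance() is an asynchronous cast: the message waits in the compute node's queue until nova-compute consumes it, which is exactly why in scenario 2 the instance sits in the deleting state until the service comes back. Below is a minimal, self-contained sketch of such a cast with oslo.messaging; the topic, version and setup are illustrative assumptions, not nova's actual compute rpcapi code.

from oslo_config import cfg
import oslo_messaging as messaging

transport = messaging.get_transport(cfg.CONF)       # transport URL taken from config
target = messaging.Target(topic='compute', version='4.0')
client = messaging.RPCClient(transport, target)

def terminate_instance(context, instance, bdms, reservations=None):
    # prepare(server=host) pins the message to that compute node's queue;
    # cast() returns immediately, so nothing happens until the node consumes it.
    cctxt = client.prepare(server=instance.host)
    cctxt.cast(context, 'terminate_instance', instance=instance, bdms=bdms,
               reservations=reservations, delete_type='delete')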


# Scenario 1:
    def _local_delete(self, context, instance, bdms, delete_type, cb):
        if instance.vm_state == vm_states.SHELVED_OFFLOADED:
            LOG.info(_LI("instance is in SHELVED_OFFLOADED state, cleanup"
                         " the instance's info from database."),
                     instance=instance)
        else:
            LOG.warning(_LW("instance's host %s is down, deleting from "
                            "database"), instance.host, instance=instance)
        if instance.info_cache is not None:
            instance.info_cache.delete()
        else:
            # NOTE(yoshimatsu): Avoid AttributeError if instance.info_cache
            # is None. When the root cause that instance.info_cache becomes
            # None is fixed, the log level should be reconsidered.
            LOG.warning(_LW("Info cache for instance could not be found. "
                            "Ignore."), instance=instance)
        compute_utils.notify_about_instance_usage(
            self.notifier, context, instance, "%s.start" % delete_type)

        elevated = context.elevated()
        if self.cell_type != 'api':
            # NOTE(liusheng): In nova-network multi_host scenario,deleting
            # network info of the instance may need instance['host'] as
            # destination host of RPC call. If instance in SHELVED_OFFLOADED
            # state, instance['host'] is None, here, use shelved_host as host
            # to deallocate network info and reset instance['host'] after that.
            # Here we shouldn't use instance.save(), because this will mislead
            # user who may think the instance's host has been changed, and
            # actually, the instance.host is always None.
            orig_host = instance.host
            try:
                if instance.vm_state == vm_states.SHELVED_OFFLOADED:
                    sysmeta = getattr(instance,
                                      obj_base.get_attrname('system_metadata'))
                    instance.host = sysmeta.get('shelved_host')
                self.network_api.deallocate_for_instance(elevated,
                                                         instance)
            finally:
                instance.host = orig_host

        # cleanup volumes
        for bdm in bdms:
            if bdm.is_volume:
                # NOTE(vish): We don't have access to correct volume
                #             connector info, so just pass a fake
                #             connector. This can be improved when we
                #             expose get_volume_connector to rpc.
                connector = {'ip': '127.0.0.1', 'initiator': 'iqn.fake'}
                try:
                    self.volume_api.terminate_connection(context,
                                                         bdm.volume_id,
                                                         connector)
                    self.volume_api.detach(elevated, bdm.volume_id)
                    if bdm.delete_on_termination:
                        # Scenario 1: this root-disk delete fails on the
                        # cinder side because the rbd image is locked.
                        self.volume_api.delete(context, bdm.volume_id)
                except Exception as exc:
                    err_str = _LW("Ignoring volume cleanup failure due to %s")
                    LOG.warn(err_str % exc, instance=instance)
            bdm.destroy()
        cb(context, instance, bdms, local=True)
        sys_meta = instance.system_metadata
        instance.destroy()
        compute_utils.notify_about_instance_usage(
            self.notifier, context, instance, "%s.end" % delete_type,
            system_metadata=sys_meta)
The root disk can be removed by breaking the lock (break_lock) in the volume deletion code:
cinder/volume/drivers/rbd.py
    def delete_volume(self, volume):
        """Deletes a logical volume."""
        # NOTE(dosaboy): this was broken by commit cbe1d5f. Ensure names are
        #                utf-8 otherwise librbd will barf.
        volume_name = utils.convert_str(volume['name'])
        # ... (code omitted) ...
            def _try_remove_volume(client, volume_name):
                with RBDVolumeProxy(self, volume_name) as volume:
                    locker_info = volume.list_lockers()
                    if locker_info:
                        LOG.debug(_("Unlock the rbd volume %s firstly."), volume_name)
                        locker_client, locker_cookie, locker_address = locker_info["lockers"][0]
                        volume.break_lock(client=locker_client, cookie=locker_cookie)
                self.RBDProxy().remove(client.ioctx, volume_name)

Note: with this fix the root disk gets deleted, but when the compute service comes back up the instance's qemu-kvm process will still be running; nova-compute periodically runs a cleanup task that reaps such already-deleted instances.
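That cleanup is driven by nova-compute's periodic task machinery and the running_deleted_instance_action / running_deleted_instance_poll_interval options. The snippet below is only a minimal sketch of the periodic-task pattern built on oslo_service; the task body is a placeholder, not nova's actual cleanup code.

from oslo_config import cfg
from oslo_service import periodic_task

CONF = cfg.CONF

class CleanupTasks(periodic_task.PeriodicTasks):
    def __init__(self):
        super(CleanupTasks, self).__init__(CONF)

    @periodic_task.periodic_task(spacing=1800)
    def _cleanup_running_deleted_instances(self, context):
        # Placeholder: the real task compares the hypervisor's running domains
        # with instances marked deleted in the DB and reaps the leftovers
        # according to CONF.running_deleted_instance_action.
        pass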

Nova's periodic tasks will be analyzed in a follow-up post.
