Under normal circumstances, deleting an instance goes down one of two paths:
1. is_local_delete = True: _local_delete() is used
2. is_local_delete = False: compute_rpcapi.terminate_instance() is used
When the instance is in vm_states.SHELVED or vm_states.SHELVED_OFFLOADED, the handling is different.
The deletion flow above is exercised in the following two common scenarios:
1) nova-compute has been down for a while, and when nova-api checks the service status (nova service-list) it sees nova-compute as down (while the instances' qemu-kvm processes on that node are still running).
2) When nova-api checks the service status (nova service-list), it sees nova-compute as up (although the nova-compute service may actually be up or may already be down).
Detection method: nova-compute writes a heartbeat timestamp to the database every 10s, and nova-api judges the service state from that timestamp; if it is older than service_down_time=60s, nova-compute is considered down.
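As a reference, the check is roughly equivalent to the sketch below (simplified from the DB servicegroup driver; the constants stand in for the nova.conf options report_interval and service_down_time):

import datetime

REPORT_INTERVAL = 10      # seconds, nova.conf report_interval (heartbeat period)
SERVICE_DOWN_TIME = 60    # seconds, nova.conf service_down_time

def service_is_up(last_updated_at):
    # last_updated_at: the service row's updated_at timestamp in the nova DB
    elapsed = (datetime.datetime.utcnow() - last_updated_at).total_seconds()
    return abs(elapsed) <= SERVICE_DOWN_TIME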
With Ceph as the storage backend, the two scenarios lead to the following problems:
Scenario 1: if the instance was booted from a cinder volume, deleting the instance leaves its root disk behind.
Reason: the rbd image backing the root disk is locked.
Scenario 2: nova-api sees the compute service as up although it is actually already down.
In this case the deletion hangs in the deleting state and only completes once nova-compute is back up.
Note: Ceph does not lock an rbd image when it is created; the image is only locked once it is used. In scenario 1 the root disk is attached to the instance right after creation, so its image gets locked. Normally, attaching a data volume to an instance does not lock its image; the lock is only taken once the volume is mounted inside the guest.
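To confirm this, you can list the lockers of the root disk's rbd image. The snippet below is a sketch using the python-rbd bindings; the pool name 'volumes' and the image name are placeholders for your deployment (the rbd CLI's "rbd lock ls" shows the same information):

import rados
import rbd

cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
cluster.connect()
ioctx = cluster.open_ioctx('volumes')            # cinder rbd pool (assumed name)
try:
    image = rbd.Image(ioctx, 'volume-<uuid>')    # placeholder image name
    try:
        # An empty result means the image is unlocked; otherwise the 'lockers'
        # list holds (client, cookie, address) tuples -- the same values
        # break_lock() needs later on.
        print(image.list_lockers())
    finally:
        image.close()
finally:
    ioctx.close()
    cluster.shutdown()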
Key code analysis (Liberty):
nova/compute/api.py
def _delete_instance(self, context, instance):
    self._delete(context, instance, 'delete', self._do_delete,
                 task_state=task_states.DELETING)
    # cb = self._do_delete, instance_attrs = {'task_state': task_states.DELETING}
def _delete(self, context, instance, delete_type, cb, **instance_attrs):
    if instance.disable_terminate:
        LOG.info(_LI('instance termination disabled'),
                 instance=instance)
        return
    bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
        context, instance.uuid)
    project_id, user_id = quotas_obj.ids_from_instance(context, instance)
    # At these states an instance has a snapshot associate.
    if instance.vm_state in (vm_states.SHELVED,
                             vm_states.SHELVED_OFFLOADED):
        snapshot_id = instance.system_metadata.get('shelved_image_id')
        LOG.info(_LI("Working on deleting snapshot %s "
                     "from shelved instance..."),
                 snapshot_id, instance=instance)
        try:
            self.image_api.delete(context, snapshot_id)
        except (exception.ImageNotFound,
                exception.ImageNotAuthorized) as exc:
            LOG.warning(_LW("Failed to delete snapshot "
                            "from shelved instance (%s)."),
                        exc.format_message(), instance=instance)
        except Exception:
            LOG.exception(_LE("Something wrong happened when trying to "
                              "delete snapshot from shelved instance."),
                          instance=instance)
    original_task_state = instance.task_state
    quotas = None
    try:
        # NOTE(maoy): no expected_task_state needs to be set
        instance.update(instance_attrs)
        instance.progress = 0
        instance.save()  # persist the new task_state to the database
        # NOTE(comstud): If we delete the instance locally, we'll
        # commit the reservations here. Otherwise, the manager side
        # will commit or rollback the reservations based on success.
        quotas = self._create_reservations(context,
                                           instance,
                                           original_task_state,
                                           project_id, user_id)
        # quota handling: create the reservations to commit or roll back later
        if self.cell_type == 'api':
            # NOTE(comstud): If we're in the API cell, we need to
            # skip all remaining logic and just call the callback,
            # which will cause a cast to the child cell. Also,
            # commit reservations here early until we have a better
            # way to deal with quotas with cells.
            cb(context, instance, bdms, reservations=None)
            quotas.commit()
            return
        shelved_offloaded = (instance.vm_state
                             == vm_states.SHELVED_OFFLOADED)
        if not instance.host and not shelved_offloaded:
            try:
                compute_utils.notify_about_instance_usage(
                    self.notifier, context, instance,
                    "%s.start" % delete_type)
                instance.destroy()
                compute_utils.notify_about_instance_usage(
                    self.notifier, context, instance,
                    "%s.end" % delete_type,
                    system_metadata=instance.system_metadata)
                quotas.commit()
                return
            except exception.ObjectActionError:
                instance.refresh()
        if instance.vm_state == vm_states.RESIZED:
            self._confirm_resize_on_deleting(context, instance)
        is_local_delete = True
        try:
            if not shelved_offloaded:
                service = objects.Service.get_by_compute_host(
                    context.elevated(), instance.host)
                is_local_delete = not self.servicegroup_api.service_is_up(
                    service)  # check whether the compute service is up
            if not is_local_delete:
                if original_task_state in (task_states.DELETING,
                                           task_states.SOFT_DELETING):
                    LOG.info(_LI('Instance is already in deleting state, '
                                 'ignoring this request'),
                             instance=instance)
                    quotas.rollback()
                    return
                self._record_action_start(context, instance,
                                          instance_actions.DELETE)
                # NOTE(snikitin): If instance's vm_state is 'soft-delete',
                # we should not count reservations here, because instance
                # in soft-delete vm_state have already had quotas
                # decremented. More details:
                # https://bugs.launchpad.net/nova/+bug/1333145
                if instance.vm_state == vm_states.SOFT_DELETED:
                    quotas.rollback()
                cb(context, instance, bdms,
                   reservations=quotas.reservations)
                # scenario 2 ends up here; the callback is _do_delete(), see below
        except exception.ComputeHostNotFound:
            pass
        if is_local_delete:
            # If instance is in shelved_offloaded state or compute node
            # isn't up, delete instance from db and clean bdms info and
            # network info
            self._local_delete(context, instance, bdms, delete_type, cb)
            # scenario 1 ends up here (_local_delete(), see below)
        quotas.commit()
    except exception.InstanceNotFound:
        # NOTE(comstud): Race condition. Instance already gone.
        if quotas:
            quotas.rollback()
    except Exception:
        with excutils.save_and_reraise_exception():
            if quotas:
                quotas.rollback()
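The branching above boils down to roughly the following (a simplified, self-contained restatement for orientation, not the actual Nova code):

def choose_delete_path(shelved_offloaded, compute_service_up):
    # scenario 1 (compute down) and shelved-offloaded instances are handled
    # locally by nova-api; everything else is cast to the compute node
    if shelved_offloaded or not compute_service_up:
        return '_local_delete'        # is_local_delete = True
    return 'terminate_instance'       # is_local_delete = False, RPC cast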
# Scenario 2:
def _do_delete(self, context, instance, bdms, reservations=None,
               local=False):
    if local:
        instance.vm_state = vm_states.DELETED
        instance.task_state = None
        instance.terminated_at = timeutils.utcnow()
        instance.save()
    else:
        self.compute_rpcapi.terminate_instance(context, instance, bdms,
                                               reservations=reservations,
                                               delete_type='delete')
        # The request is cast over the message queue to the instance's compute
        # node, which then calls the virt driver (libvirt) to destroy the
        # instance.
# Scenario 1:
def _local_delete(self, context, instance, bdms, delete_type, cb):
    if instance.vm_state == vm_states.SHELVED_OFFLOADED:
        LOG.info(_LI("instance is in SHELVED_OFFLOADED state, cleanup"
                     " the instance's info from database."),
                 instance=instance)
    else:
        LOG.warning(_LW("instance's host %s is down, deleting from "
                        "database"), instance.host, instance=instance)
    if instance.info_cache is not None:
        instance.info_cache.delete()
    else:
        # NOTE(yoshimatsu): Avoid AttributeError if instance.info_cache
        # is None. When the root cause that instance.info_cache becomes
        # None is fixed, the log level should be reconsidered.
        LOG.warning(_LW("Info cache for instance could not be found. "
                        "Ignore."), instance=instance)
    compute_utils.notify_about_instance_usage(
        self.notifier, context, instance, "%s.start" % delete_type)
    elevated = context.elevated()
    if self.cell_type != 'api':
        # NOTE(liusheng): In nova-network multi_host scenario, deleting
        # network info of the instance may need instance['host'] as
        # destination host of RPC call. If instance in SHELVED_OFFLOADED
        # state, instance['host'] is None, here, use shelved_host as host
        # to deallocate network info and reset instance['host'] after that.
        # Here we shouldn't use instance.save(), because this will mislead
        # user who may think the instance's host has been changed, and
        # actually, the instance.host is always None.
        orig_host = instance.host
        try:
            if instance.vm_state == vm_states.SHELVED_OFFLOADED:
                sysmeta = getattr(instance,
                                  obj_base.get_attrname('system_metadata'))
                instance.host = sysmeta.get('shelved_host')
            self.network_api.deallocate_for_instance(elevated,
                                                     instance)
        finally:
            instance.host = orig_host
    # cleanup volumes
    for bdm in bdms:
        if bdm.is_volume:
            # NOTE(vish): We don't have access to correct volume
            #             connector info, so just pass a fake
            #             connector. This can be improved when we
            #             expose get_volume_connector to rpc.
            connector = {'ip': '127.0.0.1', 'initiator': 'iqn.fake'}
            try:
                self.volume_api.terminate_connection(context,
                                                     bdm.volume_id,
                                                     connector)
                self.volume_api.detach(elevated, bdm.volume_id)
                if bdm.delete_on_termination:
                    self.volume_api.delete(context, bdm.volume_id)
                    # deleting the root disk fails here (its rbd image is locked)
            except Exception as exc:
                err_str = _LW("Ignoring volume cleanup failure due to %s")
                LOG.warn(err_str % exc, instance=instance)
        bdm.destroy()
    cb(context, instance, bdms, local=True)
    sys_meta = instance.system_metadata
    instance.destroy()
    compute_utils.notify_about_instance_usage(
        self.notifier, context, instance, "%s.end" % delete_type,
        system_metadata=sys_meta)
A concrete fix is to break the lock (break_lock) in the volume-deletion code:
cinder/volume/drivers/rbd.py
def delete_volume(self, volume):
    """Deletes a logical volume."""
    # NOTE(dosaboy): this was broken by commit cbe1d5f. Ensure names are
    # utf-8 otherwise librbd will barf.
    volume_name = utils.convert_str(volume['name'])
    .......
    .......

    def _try_remove_volume(client, volume_name):
        with RBDVolumeProxy(self, volume_name) as volume:
            locker_info = volume.list_lockers()
            if locker_info:
                LOG.debug(_("Unlock the rbd volume %s first."), volume_name)
                locker_client, locker_cookie, locker_address = locker_info["lockers"][0]
                volume.break_lock(client=locker_client, cookie=locker_cookie)
        self.RBDProxy().remove(client.ioctx, volume_name)
Note: although this lets the root disk be deleted, the related qemu-kvm processes will still exist once the compute service is back up; nova-compute, however, periodically runs a task that cleans up such instances.
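For reference, that cleanup behaviour is controlled by a few nova.conf options on the compute node (the values below are only illustrative, not recommendations):

[DEFAULT]
# What to do with instances that are still running but already deleted
# in the database: noop | log | shutdown | reap
running_deleted_instance_action = reap
# How often the periodic task looks for such instances, in seconds
running_deleted_instance_poll_interval = 1800
# How long an instance may stay in that state before being acted on, in seconds
running_deleted_instance_timeout = 0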
Nova's related periodic tasks will be analyzed in a later post.