How libvirt numatune works

Contents

What is numatune

numatune memory

libvirt

Kernel

numatune memnode

qemu

Kernel

alloc page

Conclusion


What is numatune

numatune is a libvirt parameter for virtual machines on NUMA hosts; it controls the memory-access affinity of the guest.

It is used as follows:

xml:

<domain>
  ...
  <numatune>
    <memory mode="strict" nodeset="1-4,^3"/>
    <memnode cellid="0" mode="strict" nodeset="1"/>
    <memnode cellid="2" mode="preferred" nodeset="2"/>
  </numatune>
  ...
</domain>
numatune consists of two parts: memory and memnode.

numatune memory

The value of numatune memory is written into the cgroup's cpuset.mems; libvirt performs the cgroup configuration.

libvirt

Setting cgroup cpuset.mems for the emulator thread:

if (mem_mask) {
    /* open the emulator thread's cgroup and write mem_mask
     * into its cpuset.mems */
    if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_EMULATOR, 0,
                           false, &cgroup_temp) < 0 ||
        virCgroupSetCpusetMems(cgroup_temp, mem_mask) < 0)
        goto cleanup;
}
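
Under the hood, virCgroupSetCpusetMems boils down to writing the nodeset string into that cgroup's cpuset.mems file. Here is a minimal sketch of the equivalent operation, assuming cgroup v1 with the cpuset controller mounted; the helper name and path handling are illustrative, not libvirt's:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* hypothetical helper: write a nodeset string such as "0-1" into the
 * cpuset.mems file of a given cgroup directory */
static int set_cpuset_mems(const char *cgroup_dir, const char *nodeset)
{
    char path[512];
    int fd;
    ssize_t n;

    snprintf(path, sizeof(path), "%s/cpuset.mems", cgroup_dir);
    fd = open(path, O_WRONLY);
    if (fd < 0)
        return -1;
    n = write(fd, nodeset, strlen(nodeset));
    close(fd);
    return n < 0 ? -1 : 0;
}

Writing cpuset.mems is exactly what triggers the kernel-side update_tasks_nodemask path described below.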

Setting cgroup cpuset.mems for each vcpu thread:

    for (i = 0; i < maxvcpus; i++) {
        vcpu = virDomainDefGetVcpu(vm->def, i);

        if (!vcpu->online)
            continue;

        if (qemuProcessSetupVcpu(vm, i) < 0)
            return -1;
    }


int
qemuProcessSetupVcpu(virDomainObjPtr vm, unsigned int vcpuid)
{
    pid_t vcpupid = qemuDomainGetVcpuPid(vm, vcpuid);   /* get the vcpu thread id */
    virDomainVcpuDefPtr vcpu = virDomainDefGetVcpu(vm->def, vcpuid);

    /* attach the thread to its vcpu cgroup (including cpuset.mems),
     * then apply pinning, period/quota and scheduler settings */
    return qemuProcessSetupPid(vm, vcpupid, VIR_CGROUP_THREAD_VCPU,
                               vcpuid, vcpu->cpumask,
                               vm->def->cputune.period,
                               vm->def->cputune.quota,
                               &vcpu->sched);
}

Kernel

Once the cgroup value is set, the kernel has to rebind the process's VMAs accordingly and allocate the process's pages from the specified nodes. This is implemented mainly in the update_tasks_nodemask function.

update_tasks_nodemask walks every task_struct in the cgroup and, for each one:

1. updates task_struct->mems_allowed;

2. walks all of the task's VMAs and updates vma->vm_policy. Note that this cpuset feature of rebinding vma->vm_policy was only added in 4.x kernels; 3.x kernels do not have it;

3. checks whether migration is enabled, and if so migrates every page that is not on the configured nodes onto them.

	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it))) {
		struct mm_struct *mm;
		bool migrate;

		cpuset_change_task_nodemask(task, &newmems);     /* set task_struct->mems_allowed to the cpuset.mems value */

		mm = get_task_mm(task);
		if (!mm)
			continue;

		migrate = is_memory_migrate(cs);              /* is migration enabled? */

		mpol_rebind_mm(mm, &cs->mems_allowed);        /* walk all of the task's VMAs and rebind each vma->vm_policy to the cpuset.mems value */
		if (migrate)
			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);   /* migrate pages that are not on a cpuset.mems node onto the configured nodes */
		else
			mmput(mm);
	}

How does migrate = is_memory_migrate(cs); decide whether migration is allowed? It reads the cpuset.memory_migrate setting; from what I can see here this value defaults to true, meaning pages will be migrated:

[root@host ~]# cat /cgroup/cpuset/libvirt/qemu-29-instance-535b3c33-49e1-4f01-9192-18af59d49af8/emulator/cpuset.memory_migrate
1 

numatune memnode

qemu calls the mbind function to bind the guest's memory to the configured host NUMA nodes.

qemu

With numatune memnode set, the qemu command line gains an extra host-nodes=0,policy=bind on the memory backend object:

-object memory-backend-file,id=ram-node0,prealloc=yes,mem-path=/dev/hugepages/libvirt/qemu/17-centos,share=yes,size=2147483648,host-nodes=0,policy=bind

qemu iterates over all of the -object options:

if (qemu_opts_foreach(qemu_find_opts("object"),
                      user_creatable_add_opts_foreach,
                      object_create_delayed, NULL))
    exit(1);

host_memory_backend_memory_complete

This function does three main things:

1. alloc: allocate the memory (open the hugepage file, set its size, and mmap it to set up the VMA);

2. mbind: set the NUMA memory policy with flags = MPOL_MF_STRICT | MPOL_MF_MOVE, where MPOL_MF_MOVE means pages not on the specified nodes are moved there;

3. prealloc: preallocate the memory.

static void
host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(uc);
    HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc);
    Error *local_err = NULL;
    void *ptr;
    uint64_t sz;


    if (bc->alloc) {
        bc->alloc(backend, &local_err);
        if (local_err) {
            goto out;
        }

        ..........

        ptr = memory_region_get_ram_ptr(&backend->mr);
        sz = memory_region_size(&backend->mr);

        /* ensure policy won't be ignored in case memory is preallocated
         * before mbind(). note: MPOL_MF_STRICT is ignored on hugepages so
         * this doesn't catch hugepage case. */
        unsigned flags = MPOL_MF_STRICT | MPOL_MF_MOVE;

        if (mbind(ptr, sz, backend->policy,
                  maxnode ? backend->host_nodes : NULL, maxnode + 1, flags)) {
            if (backend->policy != MPOL_DEFAULT || errno != ENOSYS) {
                error_setg_errno(errp, errno,
                                 "cannot bind memory to host NUMA nodes");
                return;
            }
        }
        /* Preallocate memory after the NUMA policy has been instantiated.
         * This is necessary to guarantee memory is allocated with
         * specified NUMA policy in place.
         */
        if (backend->prealloc) {
            os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz,
                            smp_cpus, &local_err);
            if (local_err) {
                goto out;
            }
        }
    }
}
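
Step 2 above is an ordinary mbind(2) system call. To see the same pattern in isolation, here is a minimal standalone sketch, assuming host node 0 exists and using an anonymous mapping instead of qemu's hugepage file (link with -lnuma for the <numaif.h> wrapper):

#include <numaif.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
    size_t sz = 2UL << 20;                          /* 2 MiB region */
    void *ptr = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    unsigned long nodemask = 1UL << 0;              /* bit 0: host node 0 */

    if (ptr == MAP_FAILED)
        return 1;

    /* same mode and flags qemu uses for policy=bind */
    if (mbind(ptr, sz, MPOL_BIND, &nodemask, 2,
              MPOL_MF_STRICT | MPOL_MF_MOVE)) {
        perror("mbind");
        return 1;
    }

    memset(ptr, 0, sz);   /* touch the pages: they now come from node 0 */
    return 0;
}

Because MPOL_MF_MOVE is set, any pages that had already been faulted in on another node are migrated to node 0 at mbind time, which is what the comment in the qemu code above is about.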

Kernel

How mbind is implemented:

1. create a new mempolicy (new);

2. call mbind_range to set vma->vm_policy on every VMA covering [start, start + len);

3. migrate the pages.

static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err, ret, nr_failed;

	LIST_HEAD(pagelist);

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	new = mpol_new(mode, mode_flags, nmask);         /* 1. build the new mempolicy */

	ret = queue_pages_range(mm, start, end, nmask,   /* collect pages not on the allowed nodes */
			  flags | MPOL_MF_INVERT, &pagelist);

	err = mbind_range(mm, start, end, new);          /* 2. apply the policy to every vma in the range */

	if (!err) {
		if (!list_empty(&pagelist)) {
			nr_failed = migrate_pages(&pagelist, new_page, NULL,   /* 3. migrate the collected pages */
				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
		}
	}
}

mbind_range

mbind_range sets vma->vm_policy on every VMA covering [start, start + len), using the policy that corresponds to the numatune setting:

	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
		next = vma->vm_next;
		vmstart = max(start, vma->vm_start);
		vmend   = min(end, vma->vm_end);

		pgoff = vma->vm_pgoff +
			((vmstart - vma->vm_start) >> PAGE_SHIFT);
		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
				 vma->anon_vma, vma->vm_file, pgoff,
				 new_pol, vma->vm_userfaultfd_ctx);
 replace:
		err = vma_replace_policy(vma, new_pol);   /* set this vma's vm_policy to new_pol */
		if (err)
			goto out;
	}

alloc page

The kernel has many variants of the page-allocation functions, some returning a virtual address and some a struct page pointer, but they all end up calling one common interface: __alloc_pages_nodemask().

struct page * __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, nodemask_t *nodemask)
{
    struct page *page;
    unsigned int alloc_flags = ALLOC_WMARK_LOW;   /* first try allocating at the LOW watermark */
    gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
    struct alloc_context ac = { };
 
    gfp_mask &= gfp_allowed_mask;
    alloc_mask = gfp_mask;
    if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
        return NULL;
 
    /* First allocation attempt */
    page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
    if (likely(page))
        return page;
 
    /* fast path failed: fall back to the slow path (reclaim, compaction, ...) */
    page = __alloc_pages_slowpath(alloc_mask, order, &ac);
 
    return page;
}

prepare_alloc_pages initializes the alloc_context, settling the zone and zonelist, the nodemask, and the migrate type for the allocation.

When picking nodes, the kernel first checks the VMA: if vma->vm_policy supplies a nodemask, that takes priority; otherwise it falls back to the task_struct's mems_allowed.

static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
		int preferred_nid, nodemask_t *nodemask,
		struct alloc_context *ac, gfp_t *alloc_mask,
		unsigned int *alloc_flags)
{
	ac->high_zoneidx = gfp_zone(gfp_mask);
	ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
	ac->nodemask = nodemask;
	ac->migratetype = gfpflags_to_migratetype(gfp_mask);
	if (cpusets_enabled()) {          /* cpuset is enabled */
		*alloc_mask |= __GFP_HARDWALL;
		if (!ac->nodemask)        /* no nodemask from the vma: fall back to the task's mems_allowed */
			ac->nodemask = &cpuset_current_mems_allowed;
		else
			*alloc_flags |= ALLOC_CPUSET;
	}
	fs_reclaim_acquire(gfp_mask);
	fs_reclaim_release(gfp_mask);
	might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
	if (should_fail_alloc_page(gfp_mask, order))
		return false;
	if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
		*alloc_flags |= ALLOC_CMA;
	return true;
}
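
This precedence can be observed from user space with get_mempolicy(2): with MPOL_F_ADDR it reports the policy in effect for the VMA containing an address, falling back to the task policy when the VMA carries none. A minimal sketch, again assuming host node 0 exists and linking with -lnuma:

#include <numaif.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    size_t sz = 1UL << 20;
    void *ptr = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    unsigned long nodemask = 1UL;   /* node 0, assumed to exist */
    int mode;

    if (ptr == MAP_FAILED)
        return 1;

    /* before mbind: the vma carries no policy, so the task policy
     * (MPOL_DEFAULT unless changed) is reported */
    if (get_mempolicy(&mode, NULL, 0, ptr, MPOL_F_ADDR) == 0)
        printf("before mbind: mode=%d\n", mode);   /* MPOL_DEFAULT == 0 */

    mbind(ptr, sz, MPOL_BIND, &nodemask, 2, 0);

    /* after mbind: the vma policy takes precedence over mems_allowed */
    if (get_mempolicy(&mode, NULL, 0, ptr, MPOL_F_ADDR) == 0)
        printf("after mbind: mode=%d\n", mode);    /* MPOL_BIND == 2 */
    return 0;
}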

Conclusion

numatune memory ultimately acts through cpuset.mems in the cgroup; memnode acts through host-nodes= on the qemu command line, which ends up in an mbind call.

memory is coarse-grained, applying to the whole process; memnode is fine-grained, setting vma->vm_policy only on the specified VMAs.

When allocating a page, the kernel first uses the nodes from vma->vm_policy; if the VMA has no policy, it uses task_struct->mems_allowed; if neither is set, it allocates from the current CPU's node.

The cgroup path migrates by default: when cpuset.mems is written, pages not on the specified nodes are moved there. mbind also migrates, because MPOL_MF_MOVE is set.
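
Both effects can be verified by reading /proc/<pid>/numa_maps, where each line shows a VMA's policy (default, bind, prefer, interleave) and per-node page counts such as N0=512. A minimal sketch that dumps it for the current process:

#include <stdio.h>

/* dump /proc/self/numa_maps: one line per vma, showing its policy
 * and where its pages actually sit (N0=..., N1=...) */
int main(void)
{
    FILE *f = fopen("/proc/self/numa_maps", "r");
    char line[512];

    if (!f)
        return 1;
    while (fgets(line, sizeof(line), f))
        fputs(line, stdout);
    fclose(f);
    return 0;
}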

References

https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/virtualization_tuning_and_optimization_guide/sect-virtualization_tuning_optimization_guide-numa-numa_and_libvirt
