Contents

What numatune is
numatune memory
    libvirt
    kernel
numatune memnode
    qemu
    kernel
alloc page
Conclusion
What numatune is

numatune is a libvirt parameter for virtual machines on NUMA hosts; it controls the NUMA affinity of the guest's memory accesses.
It is configured in the domain XML, for example, binding guest memory to host node 0:

<numatune>
  <memory mode="strict" nodeset="0"/>
  <memnode cellid="0" mode="strict" nodeset="0"/>
</numatune>
numatune has two sub-elements, memory and memnode, which take effect through different mechanisms, covered in turn below.

numatune memory

libvirt

The value of numatune memory is written into cpuset.mems of the cgroup; libvirt performs the cgroup setup.

Setting cgroup cpuset.mems for the emulator thread:
if (mem_mask)
    if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_EMULATOR, 0,
                           false, &cgroup_temp) < 0 ||
        virCgroupSetCpusetMems(cgroup_temp, mem_mask) < 0) /* writes cpuset.mems */
        goto cleanup;
Setting cgroup cpuset.mems for the vcpu threads:
for (i = 0; i < maxvcpus; i++) {
    vcpu = virDomainDefGetVcpu(vm->def, i);

    if (!vcpu->online)
        continue;

    if (qemuProcessSetupVcpu(vm, i) < 0)
        return -1;
}

int
qemuProcessSetupVcpu(virDomainObjPtr vm, unsigned int vcpuid)
{
    pid_t vcpupid = qemuDomainGetVcpuPid(vm, vcpuid); /* get the vcpu thread's id */

    if (qemuProcessSetupPid(vm, vcpupid, VIR_CGROUP_THREAD_VCPU,
                            vcpuid, vcpu->cpumask,
                            vm->def->cputune.period,
                            vm->def->cputune.quota,
                            &vcpu->sched) < 0)
        return -1;
}
kernel

Once the cgroup is configured, the kernel has to migrate the process's VMAs according to the new values and allocate the process's pages on the specified nodes. This is implemented mainly in the update_tasks_nodemask function.

update_tasks_nodemask iterates over every task_struct in the cgroup and, for each one:
1. modifies task_struct->mems_allowed;
2. walks all of the task's VMAs and updates vma->vm_policy (this cgroup-driven update of vma->vm_policy was only added in 4.x kernels; 3.x kernels do not have it);
3. checks whether migration is enabled, and if so migrates every page that is not on an allowed node onto the specified nodes.
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it))) {
    struct mm_struct *mm;
    bool migrate;

    cpuset_change_task_nodemask(task, &newmems); /* set task_struct->mems_allowed to the cpuset.mems value */

    mm = get_task_mm(task);
    if (!mm)
        continue;

    migrate = is_memory_migrate(cs); /* is migration enabled? */

    mpol_rebind_mm(mm, &cs->mems_allowed); /* walk all the task's vmas and rebind each vma->vm_policy to the cpuset.mems value */
    if (migrate)
        cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); /* move pages off nodes not in cpuset.mems */
    else
        mmput(mm);
}
css_task_iter_end(&it);
How does migrate = is_memory_migrate(cs) decide whether migration happens? It reads the cpuset.memory_migrate flag. On the host examined here the flag is 1, so pages will be migrated:

# cat /cgroup/cpuset/libvirt/qemu-29-instance-535b3c33-49e1-4f01-9192-18af59d49af8/emulator/cpuset.memory_migrate
1
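For reference, the same mechanism can be driven by hand. Below is a minimal sketch in C of what libvirt does through its cgroup APIs, assuming a cgroup-v1 cpuset mounted at /sys/fs/cgroup/cpuset and an already-created group named demo; the paths and the pid are placeholders:

#include <stdio.h>

/* write a short string into a cgroup control file */
static int write_str(const char *path, const char *val)
{
    FILE *f = fopen(path, "w");
    if (!f)
        return -1;
    int ok = fputs(val, f) >= 0;
    fclose(f);
    return ok ? 0 : -1;
}

int main(void)
{
    /* restrict allocations to node 0 and enable page migration; note that
     * with cpuset v1 the group's cpuset.cpus must also be populated
     * before tasks can be attached */
    write_str("/sys/fs/cgroup/cpuset/demo/cpuset.mems", "0");
    write_str("/sys/fs/cgroup/cpuset/demo/cpuset.memory_migrate", "1");
    /* attach a task; pid 12345 is a placeholder */
    write_str("/sys/fs/cgroup/cpuset/demo/tasks", "12345");
    return 0;
}

Writing cpuset.mems with memory_migrate set to 1 triggers exactly the update_tasks_nodemask path shown above.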
numatune memnode

qemu

qemu binds the guest's memory to host NUMA nodes by calling the mbind() syscall. When numatune memnode is set, qemu's command line gains host-nodes=0,policy=bind:
-object memory-backend-file,id=ram-node0,prealloc=yes,mem-path=/dev/hugepages/libvirt/qemu/17-centos,share=yes,size=2147483648,host-nodes=0,policy=bind
qemu iterates over all the -object options:
if (qemu_opts_foreach(qemu_find_opts("object"),
                      user_creatable_add_opts_foreach,
                      object_create_delayed, NULL)) {
    exit(1);
}
which ends up in host_memory_backend_memory_complete for each memory backend. This function does three main things:
1. alloc: allocate the memory, i.e. open the hugepage file, set its size, and initialize the VMA;
2. mbind: set the NUMA memory policy with flags = MPOL_MF_STRICT | MPOL_MF_MOVE, where MPOL_MF_MOVE means pages not on the specified nodes are moved onto them;
3. prealloc: preallocate the memory.
static void
host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(uc);
    HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc);
    Error *local_err = NULL;
    void *ptr;
    uint64_t sz;

    if (bc->alloc) {
        bc->alloc(backend, &local_err);
        if (local_err) {
            goto out;
        }
        ..........
        ptr = memory_region_get_ram_ptr(&backend->mr);
        sz = memory_region_size(&backend->mr);

        /* ensure policy won't be ignored in case memory is preallocated
         * before mbind(). note: MPOL_MF_STRICT is ignored on hugepages so
         * this doesn't catch hugepage case. */
        unsigned flags = MPOL_MF_STRICT | MPOL_MF_MOVE;

        if (mbind(ptr, sz, backend->policy,
                  maxnode ? backend->host_nodes : NULL, maxnode + 1, flags)) {
            if (backend->policy != MPOL_DEFAULT || errno != ENOSYS) {
                error_setg_errno(errp, errno,
                                 "cannot bind memory to host NUMA nodes");
                return;
            }
        }

        /* Preallocate memory after the NUMA policy has been instantiated.
         * This is necessary to guarantee memory is allocated with
         * specified NUMA policy in place.
         */
        if (backend->prealloc) {
            os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz,
                            smp_cpus, &local_err);
            if (local_err) {
                goto out;
            }
        }
    }
}
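The same call can be tried outside qemu. A minimal standalone sketch of the pattern above, binding an anonymous mapping instead of a hugepage file; the node number is a placeholder, and the program links against libnuma (-lnuma):

#define _GNU_SOURCE
#include <numaif.h>     /* mbind(), MPOL_BIND, MPOL_MF_* */
#include <sys/mman.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    size_t sz = 64 << 20; /* 64 MiB */

    /* reserve address space; no pages live on any node yet */
    void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    /* bind the range to node 0 (placeholder), with the same flags qemu uses */
    unsigned long nodemask = 1UL << 0;
    unsigned flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
    if (mbind(p, sz, MPOL_BIND, &nodemask, sizeof(nodemask) * 8, flags)) {
        perror("mbind");
        return 1;
    }

    memset(p, 0, sz); /* touching the pages now allocates them on node 0 */
    return 0;
}

As in qemu, the policy is installed before the memory is touched, so the pages are allocated on the requested node up front instead of being migrated afterwards.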
kernel

How mbind() is implemented:
1. create a new mempolicy;
2. call mbind_range to set vma->vm_policy on every VMA covering [start, start + len);
3. migrate the pages.
static long do_mbind(unsigned long start, unsigned long len,
                     unsigned short mode, unsigned short mode_flags,
                     nodemask_t *nmask, unsigned long flags)
{
    struct mm_struct *mm = current->mm;
    struct mempolicy *new;
    unsigned long end;
    int err, ret, nr_failed;
    LIST_HEAD(pagelist);

    len = (len + PAGE_SIZE - 1) & PAGE_MASK;
    end = start + len;

    new = mpol_new(mode, mode_flags, nmask); /* 1. build the new mempolicy */

    /* collect the pages that are not on the requested nodes */
    ret = queue_pages_range(mm, start, end, nmask,
                            flags | MPOL_MF_INVERT, &pagelist);

    err = mbind_range(mm, start, end, new); /* 2. rewrite vma->vm_policy */

    if (!err) {
        if (!list_empty(&pagelist)) {
            /* 3. migrate the collected pages to the requested nodes */
            nr_failed = migrate_pages(&pagelist, new_page, NULL,
                                      start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
        }
    }
}
mbind_range sets vma->vm_policy on every VMA covering [start, end), using the policy corresponding to the numatune setting:
for (; vma && vma->vm_start < end; prev = vma, vma = next) {
    next = vma->vm_next;
    vmstart = max(start, vma->vm_start);
    vmend = min(end, vma->vm_end);

    pgoff = vma->vm_pgoff +
            ((vmstart - vma->vm_start) >> PAGE_SHIFT);
    prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
                     vma->anon_vma, vma->vm_file, pgoff,
                     new_pol, vma->vm_userfaultfd_ctx);
replace:
    err = vma_replace_policy(vma, new_pol); /* set this vma's vm_policy */
    if (err)
        goto out;
}
alloc page

The kernel has many page-allocator entry points, some returning a virtual address and some a struct page pointer, but they all end up calling one common interface: __alloc_pages_nodemask().
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
                       nodemask_t *nodemask)
{
    struct page *page;
    unsigned int alloc_flags = ALLOC_WMARK_LOW; /* first try to allocate at the LOW watermark */
    gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
    struct alloc_context ac = { };

    gfp_mask &= gfp_allowed_mask;
    alloc_mask = gfp_mask;
    if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask,
                             &ac, &alloc_mask, &alloc_flags))
        return NULL;

    /* First allocation attempt */
    page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
    if (likely(page))
        goto out;

    page = __alloc_pages_slowpath(alloc_mask, order, &ac);

out:
    return page;
}
prepare_alloc_pages initializes the alloc_context: it determines the zone, zonelist, nodemask, and migrate type for the allocation.
When choosing a node, the nodemask set in vma->vm_policy is used first if the VMA has one; otherwise the allocation falls back to task_struct->mems_allowed.
static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
                                       int preferred_nid, nodemask_t *nodemask,
                                       struct alloc_context *ac, gfp_t *alloc_mask,
                                       unsigned int *alloc_flags)
{
    ac->high_zoneidx = gfp_zone(gfp_mask);
    ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
    ac->nodemask = nodemask;
    ac->migratetype = gfpflags_to_migratetype(gfp_mask);

    if (cpusets_enabled()) { /* the cpuset feature is enabled */
        *alloc_mask |= __GFP_HARDWALL;
        if (!ac->nodemask) /* no nodemask from the vma: fall back to task_struct->mems_allowed */
            ac->nodemask = &cpuset_current_mems_allowed;
        else
            *alloc_flags |= ALLOC_CPUSET;
    }

    fs_reclaim_acquire(gfp_mask);
    fs_reclaim_release(gfp_mask);

    might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);

    if (should_fail_alloc_page(gfp_mask, order))
        return false;

    if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
        *alloc_flags |= ALLOC_CMA;

    return true;
}
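The outcome of this node-selection chain can be observed from userspace: get_mempolicy() with MPOL_F_NODE | MPOL_F_ADDR reports the node a given page actually landed on. A small sketch (build with -lnuma):

#define _GNU_SOURCE
#include <numaif.h>     /* get_mempolicy(), MPOL_F_NODE, MPOL_F_ADDR */
#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
    char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    p[0] = 1; /* fault the page in so it is allocated on some node */

    int node = -1;
    /* MPOL_F_NODE | MPOL_F_ADDR: return the node holding the page at p */
    if (get_mempolicy(&node, NULL, 0, p, MPOL_F_NODE | MPOL_F_ADDR)) {
        perror("get_mempolicy");
        return 1;
    }
    printf("page is on node %d\n", node);
    return 0;
}

Running it inside a cpuset, or after an mbind() like the one above, shows which policy in the fallback chain actually decided the placement.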
Conclusion

memory ultimately takes effect through cpuset.mems in the cgroup; memnode takes effect through qemu's host-nodes= option, which ends up in mbind().
memory is coarse-grained, covering the whole process; memnode is fine-grained, setting vma->vm_policy only on the targeted VMAs.
When a page is allocated, the node set in vma->vm_policy is preferred; if the VMA has no policy, task_struct->mems_allowed is used; if neither is set, the current CPU's node is used.
Both paths migrate existing pages: writing cpuset.mems moves pages off the disallowed nodes (memory_migrate was 1 above), and mbind() does the same because MPOL_MF_MOVE is set.
References
https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/virtualization_tuning_and_optimization_guide/sect-virtualization_tuning_optimization_guide-numa-numa_and_libvirt