共享内存mmap底层原理

共享内存基础知识

共享内存有两个,一个 mmap,一个 systemV 的 shm

由于所有用户进程总的虚拟地址空间比可用的物理内存大很多,因此只有最常用的部分才与物理页帧关联(这不是问题,因为大多数程序只占用实际可用内存的一小部分)

  • 在将磁盘上的数据映射到进程的虚拟地址空间的时,内核必须提供数据结构,以建立虚拟地址空间的区域和相关数据所在位置之间的关联,Linux 软件系统多级页表映射机制
  • 共享内存使得多个进程可以访问同一块内存空间(节约了内存空间),不同进程可以及时看到对方进程对共享内存中数据的更新(多个进程可以同时操作,所以需要进行同步,一般与信号量配合使用)

本文主要介绍 mmap

共享内存的 API

void *mmap(void *addr, size_t len, int prot, int flags, int fd, off_t offset);
  • addr:
    • 指定了映射被放置的虚拟地址,首选做法是将 addr 指定为 NULL,内核会为映射选择一个合适的地址(将 addr 指定为非 NULL,内核会将该参数值作为一个提示信息来处理)
  • length:
    • 指定了映射字节数,如果 length 不是分页的整数倍,内核会以分页大小为单位建立映射
  • prot:是一个位掩码,指定了新内存映射上的保护信息
  • flags:是一个控制映射操作各个方面的选项的位掩码(MAP_SHARED 与 MAP_PRIVATE 互斥,二者只能选其一;其余标志可按位或组合)
    • MAP_PRIVATE - 私有:对映射区域的写入操作会产生一个映射文件的复制,即私人的“写入时复制”(copy on write)对此区域作的任何修改都不会写回原来的文件内容
    • MAP_SHARED - 共有:对映射区域的写入数据会复制回文件内,而且允许其他映射该文件的进程共享
    • MAP_ANONYMOUS - 匿名:建立匿名映射,此时会忽略参数fd,不涉及文件;除非与 MAP_SHARED 组合(此时可与 fork 出的子进程共享),映射区域无法和其他进程共享
  • 匿名映射会忽略下面两个参数:
    • fd:表示映射的文件的文件描述符
    • offset:指定了映射在文件中的起点,必须是系统分页大小的倍数
  • return:
    • 成功:返回被映射区的指针
    • 出错:返回 MAP_FAILED(即 (void *)-1),错误原因存于 errno 中

共享内存使用案例

mmap:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

/*
 * Demo: map ./flag.txt into memory with mmap(MAP_SHARED), dump the first
 * 0x40 bytes through the mapping, then let the user overwrite them.
 * With MAP_SHARED the writes propagate back to the file on disk.
 */
int main(int argc, char* argv[]){
    /* BUG FIX: open(2)'s second argument is the *flags* word, not the
     * permission mode — the original `open("./flag.txt", 0666)` passed a
     * mode as flags.  Use O_RDWR so a PROT_WRITE|MAP_SHARED mapping of
     * the descriptor is permitted. */
    int fd = open("./flag.txt", O_RDWR);
    if(-1 == fd)
    {
        perror("open");
        return -1;
    }
    int length = 1; /* rounded up to one page by the kernel */
    // char *addr = (char*)mmap(NULL, length, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
    char *addr = (char*)mmap(NULL, length, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
    if(addr == MAP_FAILED)
    {
        perror("mmap");
        close(fd);
        return -1;
    }

    puts("get data from mmap:");
    /* accesses past `length` but inside the mapped page are legal;
     * touching pages past EOF would raise SIGBUS */
    write(1,addr,0x40);
    puts(" ");
    puts("input data to mmap:");
    read(0,addr,0x40);

    if(munmap(addr, length) == -1) /* tear down the mapping */
    {
        perror("munmap");
        close(fd);
        return -1;
    }
    close(fd); /* the fd can be closed; the file reference was held by the mapping */
    return 0;
}
  • 效果:
    0x7ffff7fcf000     0x7ffff7fd0000 r--p     1000 0      /usr/lib/x86_64-linux-gnu/ld-2.31.so
    0x7ffff7fd0000     0x7ffff7ff3000 r-xp    23000 1000   /usr/lib/x86_64-linux-gnu/ld-2.31.so
    0x7ffff7ff3000     0x7ffff7ffb000 r--p     8000 24000  /usr/lib/x86_64-linux-gnu/ld-2.31.so
    0x7ffff7ffb000     0x7ffff7ffc000 rw-p     1000 0      /home/yhellow/桌面/exp/flag.txt /* target */
    0x7ffff7ffc000     0x7ffff7ffd000 r--p     1000 2c000  /usr/lib/x86_64-linux-gnu/ld-2.31.so
    0x7ffff7ffd000     0x7ffff7ffe000 rw-p     1000 2d000  /usr/lib/x86_64-linux-gnu/ld-2.31.so
  • 其实 mmap 也可以用来进程间通信,但是用它分配内存的情况多一点

Linux 中 mmap 的实现
kernel 版本:linux-4.20.1

mmap 的作用就是把磁盘文件的一部分(指定 fd)直接映射到进程的内存中

0x7ffff7eda8e4 <mmap64+36>    syscall  <SYS_mmap>
        addr: 0x0
        len: 0x1
        prot: 0x3
        flags: 0x2
        fd: 0x3 (/home/yhellow/桌面/exp/flag.txt)
        offset: 0x0
/*
 * mmap(2) syscall entry: reject a byte offset that is not page-aligned,
 * convert it to a page offset, and delegate to ksys_mmap_pgoff().
 */
asmlinkage unsigned long
sys_mmap (unsigned long addr, unsigned long len, int prot, int flags, int fd, long off)
{
	if (offset_in_page(off) != 0)
		return -EINVAL;

	addr = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); /* core helper */
	if (!IS_ERR((void *) addr))
		force_successful_syscall_return();
	return addr;
}
/*
 * Common mmap path (page-offset variant): resolve the fd to a struct
 * file (unless MAP_ANONYMOUS), handle hugetlb length alignment, then
 * hand off to vm_mmap_pgoff().
 */
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
			      unsigned long prot, unsigned long flags,
			      unsigned long fd, unsigned long pgoff)
{
	struct file *file = NULL;
	unsigned long retval;

	if (!(flags & MAP_ANONYMOUS)) { /* file-backed mapping */
		audit_mmap_fd(fd, flags); /* record fd/flags for the audit subsystem */
		file = fget(fd); /* take a reference on the file behind the fd */
		if (!file)
			return -EBADF;
		if (is_file_hugepages(file))
			len = ALIGN(len, huge_page_size(hstate_file(file))); /* round len up to the huge-page size */
		retval = -EINVAL;
		if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
			goto out_fput;
	} else if (flags & MAP_HUGETLB) { /* anonymous huge-page mapping */
		struct user_struct *user = NULL;
		struct hstate *hs;

		hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
		if (!hs)
			return -EINVAL;

		len = ALIGN(len, huge_page_size(hs)); /* round len up to the huge-page size */
		/*
		 * VM_NORESERVE is used because the reservations will be
		 * taken when vm_ops->mmap() is called
		 * A dummy user value is used because we are not locking
		 * memory so no accounting is necessary
		 */
		file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
				VM_NORESERVE,
				&user, HUGETLB_ANONHUGE_INODE,
				(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
		if (IS_ERR(file))
			return PTR_ERR(file);
	}

	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); /* strip these flags before the real mapping work */

	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); /* core helper */
out_fput:
	if (file)
		fput(file);
	return retval;
}
  • 简单检查并处理了一下标志位,然后进行对齐
/*
 * Run the LSM check, take mm->mmap_sem for writing, perform the mapping
 * via do_mmap_pgoff(), then pre-fault pages if the mapping requested it.
 */
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long pgoff)
{
	unsigned long ret;
	struct mm_struct *mm = current->mm; /* memory descriptor of the calling process */
	unsigned long populate;
	LIST_HEAD(uf);

	ret = security_mmap_file(file, prot, flag); /* LSM hook: security modules may veto the mapping (file permission checks etc.) */
	if (!ret) {
		if (down_write_killable(&mm->mmap_sem)) 
			return -EINTR;
		ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
				    &populate, &uf); /* core helper */
		up_write(&mm->mmap_sem);
		userfaultfd_unmap_complete(mm, &uf);
		if (populate)
			mm_populate(ret, populate); /* pre-fault the range when requested */
	}
	return ret;
}
  • security_mmap_file 最终会调用 ima_file_mmap
/* Thin wrapper: forward to do_mmap() with no extra vm_flags. */
static inline unsigned long
do_mmap_pgoff(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot, unsigned long flags,
	unsigned long pgoff, unsigned long *populate,
	struct list_head *uf)
{
	return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate, uf);
}
/*
 * NOTE(review): this is the NOMMU variant of do_mmap() (mm/nommu.c) —
 * vm_region, nommu_region_tree and nommu_region_sem only exist without
 * an MMU; MMU systems use the mm/mmap.c implementation instead.
 *
 * Validate the request, allocate a vm_region + vm_area_struct, try to
 * share an existing overlapping region, otherwise create a new shared
 * or private mapping, then register the VMA with the mm.
 */
unsigned long do_mmap(struct file *file,
			unsigned long addr,
			unsigned long len,
			unsigned long prot,
			unsigned long flags,
			vm_flags_t vm_flags,
			unsigned long pgoff,
			unsigned long *populate,
			struct list_head *uf)
{
	struct vm_area_struct *vma; /* describes a user-space virtual address range (vm_struct is the kernel-side counterpart) */
	struct vm_region *region;
	struct rb_node *rb;
	unsigned long capabilities, result;
	int ret;

	*populate = 0;

	/* decide whether we should attempt the mapping, and if so what sort of
	 * mapping */
	ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
				    &capabilities); /* decides whether the mapping should be attempted at all */
	if (ret < 0)
		return ret;

	/* we ignore the address hint */
	addr = 0;
	len = PAGE_ALIGN(len);

	/* we've determined that we can make the mapping, now translate what we
	 * now know into VMA flags */
	vm_flags |= determine_vm_flags(file, prot, flags, capabilities); 

	/* we're going to need to record the mapping */
	region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); /* zeroed slab object that records the mapping */
	if (!region)
		goto error_getting_region;

	vma = vm_area_alloc(current->mm); /* kmem_cache_alloc a new vma, then vma_init it */
	if (!vma)
		goto error_getting_vma;

	region->vm_usage = 1; /* fill in the vm_region */
	region->vm_flags = vm_flags;
	region->vm_pgoff = pgoff;
	
	vma->vm_flags = vm_flags; /* mirror into the vm_area_struct */
	vma->vm_pgoff = pgoff;

	if (file) { /* 'file' was resolved from the caller's fd */
		region->vm_file = get_file(file);
		vma->vm_file = get_file(file);
	}

	down_write(&nommu_region_sem);

	/* if we want to share, we need to check for regions created by other
	 * mmap() calls that overlap with our proposed mapping
	 * - we can only share with a superset match on most regular files
	 * - shared mappings on character devices and memory backed files are
	 *   permitted to overlap inexactly as far as we are concerned for in
	 *   these cases, sharing is handled in the driver or filesystem rather
	 *   than here
	 */
	if (vm_flags & VM_MAYSHARE) { /* VM_MAYSHARE: the mapping may become VM_SHARED (shareable between processes) */
		struct vm_region *pregion;
		unsigned long pglen, rpglen, pgend, rpgend, start;

		pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		pgend = pgoff + pglen;

		for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) {
			pregion = rb_entry(rb, struct vm_region, vm_rb);

			if (!(pregion->vm_flags & VM_MAYSHARE))
				continue;

			/* search for overlapping mappings on the same file */
			if (file_inode(pregion->vm_file) !=
			    file_inode(file)) 
				continue;

			if (pregion->vm_pgoff >= pgend)
				continue;

			rpglen = pregion->vm_end - pregion->vm_start;
			rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT;
			rpgend = pregion->vm_pgoff + rpglen;
			if (pgoff >= rpgend)
				continue;

			/* handle inexactly overlapping matches between
			 * mappings */
			if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
			    !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
				/* new mapping is not a subset of the region */
				if (!(capabilities & NOMMU_MAP_DIRECT))
					goto sharing_violation;
				continue;
			}

			/* we've found a region we can share */
			pregion->vm_usage++;
			vma->vm_region = pregion; /* point the vma at the existing shared region */
			start = pregion->vm_start;
			start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
			vma->vm_start = start;
			vma->vm_end = start + len;

			if (pregion->vm_flags & VM_MAPPED_COPY)
				vma->vm_flags |= VM_MAPPED_COPY;
			else {
				ret = do_mmap_shared_file(vma); /* shared file mapping; backing store is provided by the driver or filesystem */
				if (ret < 0) {
					vma->vm_region = NULL;
					vma->vm_start = 0;
					vma->vm_end = 0;
					pregion->vm_usage--;
					pregion = NULL;
					goto error_just_free;
				}
			}
			fput(region->vm_file);
			kmem_cache_free(vm_region_jar, region);
			region = pregion;
			result = start;
			goto share;
		}

		/* obtain the address at which to make a shared mapping
		 * - this is the hook for quasi-memory character devices to
		 *   tell us the location of a shared mapping
		 */
		if (capabilities & NOMMU_MAP_DIRECT) {
			addr = file->f_op->get_unmapped_area(file, addr, len,
							     pgoff, flags); 
            /* get_unmapped_area resolves through current->mm->get_unmapped_area; the implementation is per-architecture but follows the same basic idea everywhere */
			if (IS_ERR_VALUE(addr)) {
				ret = addr;
				if (ret != -ENOSYS)
					goto error_just_free;

				/* the driver refused to tell us where to site
				 * the mapping so we'll have to attempt to copy
				 * it */
				ret = -ENODEV;
				if (!(capabilities & NOMMU_MAP_COPY))
					goto error_just_free;

				capabilities &= ~NOMMU_MAP_DIRECT;
			} else {
				vma->vm_start = region->vm_start = addr;
				vma->vm_end = region->vm_end = addr + len;
			}
		}
	}

	vma->vm_region = region;

	/* set up the mapping
	 * - the region is filled in if NOMMU_MAP_DIRECT is still set
	 */
	if (file && vma->vm_flags & VM_SHARED) /* VM_SHARED: shareable between processes */
		ret = do_mmap_shared_file(vma); /* shared file mapping; backing store is provided by the driver or filesystem */
	else
		ret = do_mmap_private(vma, region, len, capabilities); /* private mapping, or anonymous shared mapping */
	if (ret < 0)
		goto error_just_free;
	add_nommu_region(region);

	/* clear anonymous mappings that don't ask for uninitialized data */
	if (!vma->vm_file && !(flags & MAP_UNINITIALIZED)) /* zero anonymous mappings unless MAP_UNINITIALIZED was requested */
		memset((void *)region->vm_start, 0,
		       region->vm_end - region->vm_start);

	/* okay... we have a mapping; now we have to register it */
	result = vma->vm_start;

	current->mm->total_vm += len >> PAGE_SHIFT;

share:
	add_vma_to_mm(current->mm, vma); /* link the VMA into the mm's list/tree; non-anonymous vmas also join the address_space's page tree */

	/* we flush the region from the icache only when the first executable
	 * mapping of it is made  */
	if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
		flush_icache_range(region->vm_start, region->vm_end);
		region->vm_icache_flushed = true;
	}

	up_write(&nommu_region_sem);

	return result;

error_just_free:
	up_write(&nommu_region_sem);
error:
	if (region->vm_file)
		fput(region->vm_file);
	kmem_cache_free(vm_region_jar, region);
	if (vma->vm_file)
		fput(vma->vm_file);
	vm_area_free(vma);
	return ret;

sharing_violation:
	up_write(&nommu_region_sem);
	pr_warn("Attempt to share mismatched mappings\n");
	ret = -EINVAL;
	goto error;

error_getting_vma:
	kmem_cache_free(vm_region_jar, region);
	pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n",
			len, current->pid);
	show_free_areas(0, NULL);
	return -ENOMEM;

error_getting_region:
	pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n",
			len, current->pid);
	show_free_areas(0, NULL);
	return -ENOMEM;
}
  • 首先调用 vm_area_alloc(底层还是调用 kmem_cache_alloc,然后调用 vma_init 把该 vma 插入红黑树)
  • 新分配的 vm_area_struct 用于管理进程使用的虚拟地址(虚存管理的最基本的管理单元):
/*
 * Describes one contiguous region of a process's virtual address space;
 * the basic unit of user-side virtual memory management.
 */
struct vm_area_struct {
	/* The first cache line has the info for VMA tree walking. */

	unsigned long vm_start;		/* Our start address within vm_mm. */
	unsigned long vm_end;		/* The first byte after our end address
					   within vm_mm. */

	/* linked list of VM areas per task, sorted by address */
	struct vm_area_struct *vm_next, *vm_prev;

	struct rb_node vm_rb;

	/*
	 * Largest free memory gap in bytes to the left of this VMA.
	 * Either between this VMA and vma->vm_prev, or between one of the
	 * VMAs below us in the VMA rbtree and its ->vm_prev. This helps
	 * get_unmapped_area find a free area of the right size.
	 */
	unsigned long rb_subtree_gap;

	/* Second cache line starts here. */

	struct mm_struct *vm_mm;	/* The address space we belong to. */
	pgprot_t vm_page_prot;		/* Access permissions of this VMA. */
	unsigned long vm_flags;		/* Flags, see mm.h. */

	/*
	 * For areas with an address space and backing store,
	 * linkage into the address_space->i_mmap interval tree.
	 */
	struct {
		struct rb_node rb;
		unsigned long rb_subtree_last;
	} shared;

	/*
	 * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
	 * list, after a COW of one of the file pages.	A MAP_SHARED vma
	 * can only be in the i_mmap tree.  An anonymous MAP_PRIVATE, stack
	 * or brk vma (with NULL file) can only be in an anon_vma list.
	 */
	struct list_head anon_vma_chain; /* Serialized by mmap_sem &
					  * page_table_lock */
	struct anon_vma *anon_vma;	/* Serialized by page_table_lock */

	/* Function pointers to deal with this struct. */
	const struct vm_operations_struct *vm_ops;

	/* Information about our backing store: */
	unsigned long vm_pgoff;		/* Offset (within vm_file) in PAGE_SIZE
					   units */
	struct file * vm_file;		/* File we map to (can be NULL). */
	void * vm_private_data;		/* was vm_pte (shared mem) */

	atomic_long_t swap_readahead_info;
#ifndef CONFIG_MMU
	struct vm_region *vm_region;	/* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
#endif
	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;

/* NOTE(review): core_thread/core_state appear to be core-dump bookkeeping
 * pulled in from the same header — not part of the mmap path itself;
 * confirm before relying on them here. */
struct core_thread {
	struct task_struct *task;
	struct core_thread *next;
};

struct core_state {
	atomic_t nr_threads;
	struct core_thread dumper;
	struct completion startup;
};
  • 核心函数 get_unmapped_area 调用的是 current->mm->get_unmapped_area,在 Linux 中,实际上调用的是 arch_get_unmapped_area(进程中能够找到查找空闲虚拟内存的方法)
/* Search direction for an unmapped-area lookup. */
enum mmap_allocation_direction {UP, DOWN}; /* UP == '0', DOWN == '1' */

/*
 * Bottom-up (UP) search for a free virtual address range; thin wrapper
 * around the common helper.
 */
unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr0,
	unsigned long len, unsigned long pgoff, unsigned long flags)
{
	return arch_get_unmapped_area_common(filp,
			addr0, len, pgoff, flags, UP); /* addr0 is the caller's hint (0 in the call traced above) */
}
/*
 * Find a free virtual address range of 'len' bytes.  Honours MAP_FIXED
 * and the caller's address hint, applies cache-colouring alignment for
 * file-backed/shared mappings, then falls back to a vm_unmapped_area()
 * scan of the mmap region.
 */
static unsigned long arch_get_unmapped_area_common(struct file *filp,
	unsigned long addr0, unsigned long len, unsigned long pgoff,
	unsigned long flags, enum mmap_allocation_direction dir) /* dir == UP on this call chain */
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long addr = addr0;
	int do_color_align;
	struct vm_unmapped_area_info info; /* describes the allocation request */

	if (unlikely(len > TASK_SIZE))
		return -ENOMEM;

	if (flags & MAP_FIXED) {
		/* Even MAP_FIXED mappings must reside within TASK_SIZE */
		if (TASK_SIZE - len < addr)
			return -EINVAL;

		/*
		 * We do not accept a shared mapping if it would violate
		 * cache aliasing constraints.
		 */
		if ((flags & MAP_SHARED) &&
		    ((addr - (pgoff << PAGE_SHIFT)) & shm_align_mask))
			return -EINVAL;
		return addr;
	}

	do_color_align = 0;
	if (filp || (flags & MAP_SHARED))
		do_color_align = 1;

	/* requesting a specific address */
	if (addr) {
		if (do_color_align)
			addr = COLOUR_ALIGN(addr, pgoff);
		else
			addr = PAGE_ALIGN(addr);

		vma = find_vma(mm, addr); /* first vma ending above addr, if any */
		if (TASK_SIZE - len >= addr &&
		    (!vma || addr + len <= vm_start_gap(vma)))
			return addr;
	}

	info.length = len; 
	info.align_mask = do_color_align ? (PAGE_MASK & shm_align_mask) : 0;
	info.align_offset = pgoff << PAGE_SHIFT;

	if (dir == DOWN) { /* top-down search (never taken on this call chain) */
		info.flags = VM_UNMAPPED_AREA_TOPDOWN;
		info.low_limit = PAGE_SIZE;
		info.high_limit = mm->mmap_base;
		addr = vm_unmapped_area(&info); /* scan the mmap region for a gap satisfying 'info' */

		if (!(addr & ~PAGE_MASK)) /* a page-aligned result means success; error codes are never page-aligned */
			return addr;

		/*
		 * A failed mmap() very likely causes application failure,
		 * so fall back to the bottom-up function here. This scenario
		 * can happen with large stack limits and large mmap()
		 * allocations.
		 */
	}

	info.flags = 0;
	info.low_limit = mm->mmap_base;
	info.high_limit = TASK_SIZE;
	return vm_unmapped_area(&info); /* scan the mmap region for a gap satisfying 'info' */
}
  • vm_unmapped_area 用于在 mmap 映射区域中查找满足请求的内存(以 vm_area_struct 为单位),这是内存分配中最底层的内容
/*
 * Search for an unmapped address range such that:
 * - it does not intersect any VMA
 * - it lies within [low_limit, high_limit)
 * - it is at least 'length' bytes long
 * - (begin_addr & align_mask) == (align_offset & align_mask)
 */
static inline unsigned long
vm_unmapped_area(struct vm_unmapped_area_info *info)
{
	if (info->flags & VM_UNMAPPED_AREA_TOPDOWN) /* VM_UNMAPPED_AREA_TOPDOWN: search from high addresses downwards (never set on this call chain) */
		return unmapped_area_topdown(info); /* top-down */
	else
		return unmapped_area(info); /* bottom-up */
}
  • 看来 mmap 还支持反向映射,我们这里主要研究正向映射 unmapped_area
unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
	/*
	 * We implement the search by looking for an rbtree node that
	 * immediately follows a suitable gap, i.e. one satisfying:
	 * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
	 * - gap_end   = vma->vm_start        >= info->low_limit  + length;
	 * - gap_end - gap_start >= length
	 */

	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long length, low_limit, high_limit, gap_start, gap_end;

	/* Adjust search length to account for worst case alignment overhead */
	length = info->length + info->align_mask;
	if (length < info->length)
		return -ENOMEM;

	/* Adjust search limits by the desired length */
	if (info->high_limit < length)
		return -ENOMEM;
	high_limit = info->high_limit - length;

	if (info->low_limit > high_limit)
		return -ENOMEM;
	low_limit = info->low_limit + length;

	/* Check if rbtree root looks promising */
	if (RB_EMPTY_ROOT(&mm->mm_rb))
		goto check_highest;
	vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
	if (vma->rb_subtree_gap < length)
		goto check_highest;

	while (true) {
		/* Visit left subtree if it looks promising */
		gap_end = vm_start_gap(vma);
		if (gap_end >= low_limit && vma->vm_rb.rb_left) {
			struct vm_area_struct *left =
				rb_entry(vma->vm_rb.rb_left,
					 struct vm_area_struct, vm_rb);
			if (left->rb_subtree_gap >= length) {
				vma = left;
				continue;
			}
		}

		gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
check_current:
		/* Check if current node has a suitable gap */
		if (gap_start > high_limit)
			return -ENOMEM;
		if (gap_end >= low_limit &&
		    gap_end > gap_start && gap_end - gap_start >= length)
			goto found;

		/* Visit right subtree if it looks promising */
		if (vma->vm_rb.rb_right) {
			struct vm_area_struct *right =
				rb_entry(vma->vm_rb.rb_right,
					 struct vm_area_struct, vm_rb);
			if (right->rb_subtree_gap >= length) {
				vma = right;
				continue;
			}
		}

		/* Go back up the rbtree to find next candidate node */
		while (true) {
			struct rb_node *prev = &vma->vm_rb;
			if (!rb_parent(prev))
				goto check_highest;
			vma = rb_entry(rb_parent(prev),
				       struct vm_area_struct, vm_rb);
			if (prev == vma->vm_rb.rb_left) {
				gap_start = vm_end_gap(vma->vm_prev);
				gap_end = vm_start_gap(vma);
				goto check_current;
			}
		}
	}

check_highest:
	/* Check highest gap, which does not precede any rbtree node */
	gap_start = mm->highest_vm_end;
	gap_end = ULONG_MAX;  /* Only for VM_BUG_ON below */
	if (gap_start > high_limit)
		return -ENOMEM;

found:
	/* We found a suitable gap. Clip it with the original low_limit. */
	if (gap_start < info->low_limit)
		gap_start = info->low_limit;

	/* Adjust gap address to the desired alignment */
	gap_start += (info->align_offset - gap_start) & info->align_mask;

	VM_BUG_ON(gap_start + info->length > info->high_limit);
	VM_BUG_ON(gap_start + info->length > gap_end);
	return gap_start; /* return the start address of the gap we found */
}
  • 最底层的查找过程是用 红黑树 实现的(由于本人对红黑树还不是很了解,这里就先跳过了)
  • 至于 mmap 映射区域的由来,这就是分页机制和内容了
  • 最后返回到之前的函数中,mmap 也设置了两种机制:共享和私有
    • 如果是共享映射,那么在内存中对文件进行修改,磁盘中对应的文件也会被修改,相反,磁盘中的文件有了修改,内存中的文件也被修改
    • 如果是私有映射,那么内存中的文件是独立的,二者进行修改都不会对对方造成影响
  • 不管是调用 do_mmap_shared_file 或者 do_mmap_private,他们底层都会调用 call_mmap 完成最后的设置
/* Dispatch to the filesystem's/driver's ->mmap handler for this file. */
static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
{
	return file->f_op->mmap(file, vma);
}
  • 在 Ext4 文件系统中 file->f_op->mmap 指向 ext4_file_mmap(Linux 默认的文件系统为 Ext2 Ext3 Ext4)
/*
 * ext4's ->mmap handler: reject mappings on a forcibly shut-down
 * filesystem, disallow VM_SYNC on non-DAX files, then install the
 * vm_ops that the page-fault handler will use later.
 */
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct inode *inode = file->f_mapping->host;

	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return -EIO;

	/*
	 * We don't support synchronous mappings for non-DAX files. At least
	 * until someone comes with a sensible use case.
	 */
	if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC))
		return -EOPNOTSUPP;

	file_accessed(file);
	if (IS_DAX(file_inode(file))) {
		vma->vm_ops = &ext4_dax_vm_ops; /* DAX vm_ops, consulted by the page-fault handler */
		vma->vm_flags |= VM_HUGEPAGE;
	} else {
		vma->vm_ops = &ext4_file_vm_ops; /* regular page-cache vm_ops */
	}
	return 0;
}
  • 当所有的剩余工作都处理完成后,mmap 就会返回在 mmap 映射区找到的 addr

新人初稿,若有问题请大佬指出

你可能感兴趣的:(linux)