linux进程地址空间(3) 内存映射(1)mmap与do_mmap

如果已经看了博客中本系列文档的前面的几篇文章,应该就已经对malloc和mmap大致了解了,它们就是在堆中创建(或合并)所需虚拟地址的vma线性区,换句话说,就是达到进程地址空间中要有满足要求的vma,但不会给vma映射物理页(除非一定要求,即vma的flags标识了页锁定标志VM_LOCKED),这是linux的对用户进程物理页分配的推后原则,把握这个原则有助于分析malloc/mmap乃至理解linux的用户进程内存管理。

库函数malloc在linux内核的实现是典型的匿名映射,关于匿名映射可以参考前面的缺页异常处理的文章,IPC方式中的共享内存也是匿名映射,绝大多数的mmap应用场合是文件映射;而内核中的处理,对于malloc的处理函数是do_brk,这是因为malloc操作的是进程地址空间的堆段,而函数do_brk就是针对堆段的处理,mmap包括IPC的共享内存都是由函数do_mmap处理;不论哪种处理其实操作的都是进程地址空间的线性区。

下面首先看下函数do_mmap,源码如下

static inline unsigned long do_mmap(struct file *file, unsigned long addr,

         unsigned long len, unsigned long prot,

         unsigned long flag, unsigned long offset)

{

         unsigned long ret = -EINVAL;

         if ((offset + PAGE_ALIGN(len)) < offset)

                   goto out;

         if (!(offset & ~PAGE_MASK))

                   ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);

out:

         return ret;

}

它的核心函数是do_mmap_pgoff,这里主要关注下do_mmap的参数情况:

file: 如果新的线性区将要把一个文件映射到内存,则要用文件描述符file和文件偏移offset,如不需要,则file和offset不考虑都为空;

addr: 指定从哪里开始查找空闲区间,一般都是NULL即由内核指定;

len: 要求的线性地址空间长度;

prot: 指定线性区下的页的访问权限;

flag: 指定线性区的其他标志

初步有个印象即可,接下来关注函数do_mmap_pgoff,源码如下

/*
 * do_mmap_pgoff - validate an mmap request, derive the vm_flags for the
 * new region and hand the actual creation to mmap_region().
 *
 * @file:  file to map, or NULL for an anonymous mapping
 * @addr:  requested start address; replaced by the result of
 *         get_unmapped_area()
 * @len:   requested length in bytes; rounded up with PAGE_ALIGN()
 * @prot:  PROT_* protection bits
 * @flags: MAP_* flags
 * @pgoff: offset of the mapping in pages
 *
 * Returns the value of mmap_region() on success, otherwise a negative
 * errno converted to unsigned long (-EINVAL, -ENOMEM, -EOVERFLOW,
 * -EPERM, -EAGAIN, -EACCES, -ENODEV, or whatever the security/ima hooks
 * report).
 */
unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,

                            unsigned long len, unsigned long prot,

                            unsigned long flags, unsigned long pgoff)

{

    /* mm of the current task */

         struct mm_struct * mm = current->mm;

         struct inode *inode;

         unsigned int vm_flags;

         int error;

         unsigned long reqprot = prot;

         /*

          * Does the application expect PROT_READ to imply PROT_EXEC?

          *

          * (the exception is when the underlying filesystem is noexec

          *  mounted, in which case we dont add PROT_EXEC.)

          */

         /* Add PROT_EXEC implicitly; reqprot keeps the bits the caller actually asked for */

         if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))

                   if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))

                            prot |= PROT_EXEC;

         if (!len)

                   return -EINVAL;

    /* Without MAP_FIXED the address is only a hint and goes through round_hint_to_min() (presumably raises a too-low hint to the minimum mappable address; helper not shown here) */

         if (!(flags & MAP_FIXED))

                   addr = round_hint_to_min(addr);

         /* Careful about overflows.. */

    /* len was non-zero above, so a zero result after PAGE_ALIGN() means the rounding wrapped; rejected with -ENOMEM */

         len = PAGE_ALIGN(len);

         if (!len)

                   return -ENOMEM;

         /* offset overflow? */

    /* Reject the request if pgoff plus the number of pages wraps around */

         if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)

               return -EOVERFLOW;

         /* Too many mappings? */

    /* The number of mappings per mm is capped by sysctl_max_map_count; exceeding it is also reported as -ENOMEM */

         if (mm->map_count > sysctl_max_map_count)

                   return -ENOMEM;

         /* Obtain the address to map to. we verify (or select) it and ensure

          * that it represents a valid section of the address space.

          */

/* Find an unmapped range big enough for the request; get_unmapped_area() returns its start address. A result that is not page aligned is handed straight back to the caller (presumably an error code, since negative errnos are never page aligned) */

         addr = get_unmapped_area(file, addr, len, pgoff, flags);

         if (addr & ~PAGE_MASK)

                   return addr;

         /* Do simple checking here so the lower-level routines won't have

          * to. we assume access permissions have been handled by the open

          * of the memory object, so we don't do any here.

          */

         /* Build vm_flags from prot, flags and the default flags of the mm */

         vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |

                            mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;

    /* The next two checks deal with locked memory */

         if (flags & MAP_LOCKED)

                   if (!can_do_mlock())

                            return -EPERM;

         /* mlock MCL_FUTURE? */

         if (vm_flags & VM_LOCKED) {

                   unsigned long locked, lock_limit;

                   locked = len >> PAGE_SHIFT;

                   locked += mm->locked_vm;

                   lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;

                   lock_limit >>= PAGE_SHIFT;

                   if (locked > lock_limit && !capable(CAP_IPC_LOCK))

                            return -EAGAIN;

         }

    /* inode of the backing file; NULL for an anonymous mapping */

         inode = file ? file->f_path.dentry->d_inode : NULL;

/* Adjust vm_flags according to the mapping type (shared or private) requested in flags */

    /* file-backed mapping */

         if (file) {

                   switch (flags & MAP_TYPE) {

        /* shared */

                   case MAP_SHARED:

                            if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))

                                     return -EACCES;

                            /*

                             * Make sure we don't allow writing to an append-only

                             * file..

                             */

                            if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))

                                     return -EACCES;

                            /*

                             * Make sure there are no mandatory locks on the file.

                             */

                            if (locks_verify_locked(inode))

                                     return -EAGAIN;

                            vm_flags |= VM_SHARED | VM_MAYSHARE;

                            if (!(file->f_mode & FMODE_WRITE))

                                     vm_flags &= ~(VM_MAYWRITE | VM_SHARED);

                            /* fall through */

        /* private (also reached from MAP_SHARED above) */

                   case MAP_PRIVATE:

                            if (!(file->f_mode & FMODE_READ))

                                     return -EACCES;

                            if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {

                                     if (vm_flags & VM_EXEC)

                                               return -EPERM;

                                     vm_flags &= ~VM_MAYEXEC;

                            }

                            if (!file->f_op || !file->f_op->mmap)

                                     return -ENODEV;

                            break;

                   default:

                            return -EINVAL;

                   }

         }

    /* anonymous mapping */

    else {

                   switch (flags & MAP_TYPE) {

        /* shared anonymous mapping (the shared-memory case) */

                   case MAP_SHARED:

                            /*

                             * Ignore pgoff.

                             */

                            pgoff = 0;

                            vm_flags |= VM_SHARED | VM_MAYSHARE;

                            break;

        /* private */

                   case MAP_PRIVATE:

                            /*

                             * Set pgoff according to addr for anon_vma.

                             */

                            pgoff = addr >> PAGE_SHIFT;

                            break;

                   default:

                            return -EINVAL;

                   }

         }

         error = security_file_mmap(file, reqprot, prot, flags, addr, 0);

         if (error)

                   return error;

         error = ima_file_mmap(file, prot);

         if (error)

                   return error;

    /* All checks passed: create the mapping */

         return mmap_region(file, addr, len, flags, vm_flags, pgoff);

}

这个函数由三部分组成

1、 找到能否创建符合要求的vma,应该在哪里创建?

这部分主要通过函数get_unmapped_area实现,我们需要一段虚拟空间,范围是[addr, addr+len],用户进程一般不会指定addr(指定addr对应flags含有标志MAP_FIXED的情况),也就是由内核指定这个虚拟空间的首地址addr在哪里,在函数do_mmap_pgoff调用get_unmapped_area之前会预指定addr,通过调用函数round_hint_to_min实现,按我的理解这个预指定的值是宏CONFIG_DEFAULT_MMAP_MIN_ADDR的值,为4096(个人认为初始值是多少并不重要,因为后面会不断的找合适的值),然后用这个预指定的addr为参数调用函数get_unmapped_area,源码如下

unsigned long

get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,

           unsigned long pgoff, unsigned long flags)

{

unsigned long (*get_area)(struct file *, unsigned long,

                               unsigned long, unsigned long, unsigned long);

unsigned long error = arch_mmap_check(addr, len, flags);

if (error)

           return error;

/* Careful about overflows.. */

if (len > TASK_SIZE)

           return -ENOMEM;

 /*这是内存描述符的get_unmapped_area方法,可能是arch_get_unmapped_area,也可能是arch_get_unmapped_area_topdown,用于找到合适的空闲区间可容纳[addraddr+len],空闲区间就是指这个区间当前还没有vma*/

get_area = current->mm->get_unmapped_area;

 /*这是文件的get_unmapped_area方法,一般来说驱动不用自己实现get_unmapped_area方法,它只需实现mmap方法,映射特定的物理内存*/

if (file && file->f_op && file->f_op->get_unmapped_area)

           get_area = file->f_op->get_unmapped_area;

 /*会调用以上两种中的任意一种函数,最终得到合适的创建新vma的起始地址addr*/

addr = get_area(file, addr, len, pgoff, flags);

if (IS_ERR_VALUE(addr))

           return addr;

if (addr > TASK_SIZE - len)

           return -ENOMEM;

if (addr & ~PAGE_MASK)

           return -EINVAL;

 /*就是返回addr*/

return arch_rebalance_pgtables(addr, len);

}

可见,函数get_unmapped_area实际是通过函数指针get_area实现,get_area有两种可能,如果是文件映射,并且该文件的file_operation定义了get_unmapped_area方法,那么使用它的get_unmapped_area方法实现定位虚拟区间,但我估计这样用的做法很少,以mmap使用较多的设备驱动来讲,多数设备驱动文件的file_operation没有定义get_unmapped_area方法,因为没有必要;所以一般都是用另一种方法,使用mm的get_unmapped_area方法,对于arm它是函数arch_get_unmapped_area,源码如下:

/*
 * arch_get_unmapped_area - search the current mm for a hole of at least
 * len bytes and return its start address.
 *
 * @filp:  file being mapped, may be NULL; only used to decide on colour
 *         alignment
 * @addr:  address hint, or the fixed address when MAP_FIXED is set
 * @len:   length of the requested range in bytes
 * @pgoff: page offset of the mapping, used for colour alignment
 * @flags: MAP_* flags
 *
 * Returns the chosen address; addr unchanged for MAP_FIXED; -EINVAL for a
 * MAP_FIXED shared request that is misaligned on an aliasing cache;
 * -ENOMEM when len exceeds TASK_SIZE or no hole is found.
 *
 * Side effects: updates mm->free_area_cache and mm->cached_hole_size.
 */
unsigned long

arch_get_unmapped_area(struct file *filp, unsigned long addr,

           unsigned long len, unsigned long pgoff, unsigned long flags)

{

struct mm_struct *mm = current->mm;

struct vm_area_struct *vma;

unsigned long start_addr;

#ifdef CONFIG_CPU_V6

unsigned int cache_type;

int do_align = 0, aliasing = 0;

/*

 * We only need to do colour alignment if either the I or D

 * caches alias.  This is indicated by bits 9 and 21 of the

 * cache type register.

 */

/* NOTE(review): the code below tests bit 11 of cache_type and of cache_type >> 12, which does not match the "bits 9 and 21" wording above; confirm against the cache type register layout */

cache_type = read_cpuid_cachetype();

if (cache_type != read_cpuid_id()) {

           aliasing = (cache_type | cache_type >> 12) & (1 << 11);

           if (aliasing)

                    do_align = filp || flags & MAP_SHARED;

}

#else

#define do_align 0

#define aliasing 0

#endif

/*

 * We enforce the MAP_FIXED case.

 */

/* With MAP_FIXED the caller chose the address itself, so no search is done: addr is returned as-is unless a shared mapping is misaligned on an aliasing cache */

if (flags & MAP_FIXED) {

           if (aliasing && flags & MAP_SHARED &&

               (addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1))

                    return -EINVAL;

           return addr;

}

    /* The request may not be longer than TASK_SIZE (the original note gives 3G-16MB for ARM, the 16MB being used by modules; not verifiable from here) */

if (len > TASK_SIZE)

           return -ENOMEM;

if (addr) {

        /* Align the hint: colour aligned when do_align is set, page aligned otherwise (do_align is the constant 0 unless CONFIG_CPU_V6) */

           if (do_align)

                    addr = COLOUR_ALIGN(addr, pgoff);

           else

                    addr = PAGE_ALIGN(addr);

        /* Look up the first vma of this mm whose vm_end lies above addr */

           vma = find_vma(mm, addr);

     /* The hint is usable when the range fits below TASK_SIZE and either

      no such vma exists or the range ends at or before that vma starts, i.e. it does not touch an existing vma */

           if (TASK_SIZE - len >= addr &&

               (!vma || addr + len <= vma->vm_start))

                    return addr;

}

    /* The hint did not work out (or none was given), so pick where the search starts:

      1. len is larger than the cached hole size: start at mm->free_area_cache */

if (len > mm->cached_hole_size) {

        start_addr = addr = mm->free_area_cache;

}

    /* 2. otherwise restart from TASK_UNMAPPED_BASE and reset cached_hole_size to 0; it is rebuilt during the scan (the original note says TASK_UNMAPPED_BASE is one third of the user address space, 0x40000000 on ARM; TODO confirm) */

    else {

        start_addr = addr = TASK_UNMAPPED_BASE;

        mm->cached_hole_size = 0;

}

full_search:

    /* Align the starting address of the scan */

if (do_align)

           addr = COLOUR_ALIGN(addr, pgoff);

else

           addr = PAGE_ALIGN(addr);

    /* Walk the vmas that contain addr or lie above it */

for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {

           /* At this point:  (!vma || addr < vma->vm_end). */

        /* The range would no longer fit below TASK_SIZE */

           if (TASK_SIZE - len < addr) {

                    /*

                     * Start a new search - just in case we missed

                     * some holes.

                     */

                    /* start_addr remembers where this pass began: if it was not TASK_UNMAPPED_BASE, reset the cached hole size and rescan from there */

                    if (start_addr != TASK_UNMAPPED_BASE) {

                             start_addr = addr = TASK_UNMAPPED_BASE;

                             mm->cached_hole_size = 0;

                             goto full_search;

                    }

            /* A scan from TASK_UNMAPPED_BASE failed as well: give up */

                    return -ENOMEM;

           }

     /* Found a hole: either there is no vma left or the gap before the next vma holds len bytes. Record where the next search should resume and return addr */

           if (!vma || addr + len <= vma->vm_start) {

                    /*

                     * Remember the place where we stopped the search:

                     */

                    mm->free_area_cache = addr + len;

                    return addr;

           }

     /* No fit here: if this gap is larger than the cached hole size, record it before moving on */

           if (addr + mm->cached_hole_size < vma->vm_start)

                   mm->cached_hole_size = vma->vm_start - addr;

    /* Continue from the end of this vma; the next iteration checks the gap that follows it */

           addr = vma->vm_end;

           if (do_align)

                    addr = COLOUR_ALIGN(addr, pgoff);

}

}

首先直接碰碰运气,看addr后面的vma(如果存在的话)与addr的距离是否够长,即大于长度len,如果可以的话,就直接返回addr即可;

往往不会这么容易成功,这就将进入循环查找流程,即标号full_search的部分,这里看到了mm的free_area_cache和cached_hole_size的用处了,它们就是标识当前正在从哪里查找以及当前进程的vma之间最大空洞是多大,循环查找的过程就是让addr不断的蹦到一个vma的结尾处,把它与下一个vma的开始处的距离,和需要的长度len比较,当发现一个比len大的空洞时,即发现了可以用来创建新vma的地方了,返回addr;mm的free_area_cache就是标识每次是从哪里查找的,cached_hole_size的用处是不断更新为发现的最大空洞的值,这就利于今后再创建新vma;正常情况下返回的是找到的合适的addr

回到函数get_unmapped_area,它将返回这个addr给函数do_mmap_pgoff

2、  然后是确定vma线性区的flags,针对文件映射和匿名映射有所不同

3、最后是实际创建新vma线性区,通过函数mmap_region实现,源码如下:

/*
 * mmap_region - create the vma for [addr, addr + len) in the current mm,
 * or extend a neighbouring vma to cover it.
 *
 * @file:     file being mapped, or NULL for an anonymous mapping
 * @addr:     start address of the new region
 * @len:      length of the region in bytes
 * @flags:    MAP_* flags of the request
 * @vm_flags: VM_* flags for the region
 * @pgoff:    page offset of the mapping
 *
 * Returns addr on success (re-read from vma->vm_start after f_op->mmap,
 * which may change it), or a negative errno converted to unsigned long.
 * On the failure paths after the vma was allocated, the vma is freed and
 * any memory charge taken here is released.
 */
unsigned long mmap_region(struct file *file, unsigned long addr,

                      unsigned long len, unsigned long flags,

                      unsigned int vm_flags, unsigned long pgoff)

{

struct mm_struct *mm = current->mm;

struct vm_area_struct *vma, *prev;

int correct_wcount = 0;

int error;

struct rb_node **rb_link, *rb_parent;

unsigned long charged = 0;

struct inode *inode =  file ? file->f_path.dentry->d_inode : NULL;

/* Clear old maps */

error = -ENOMEM;

munmap_back:

/* find_vma_prepare() looks up the vma at or after addr and fills in prev, rb_link and rb_parent for the later insertion (helper not shown; per the original notes it searches the mm's red-black tree of vmas much like find_vma()). If the vma found overlaps the new range, the range is unmapped with do_munmap() and the lookup repeated until nothing overlaps; a failing do_munmap() ends the call with -ENOMEM */

vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);

 /* This vma overlaps the range we are about to create */

if (vma && vma->vm_start < addr + len) {

           if (do_munmap(mm, addr, len))

                    return -ENOMEM;

           goto munmap_back;

}

/* Check against address space limit. */

 /* Refuse if may_expand_vm() does not allow this many additional pages */

if (!may_expand_vm(mm, len >> PAGE_SHIFT))

           return -ENOMEM;

/*

 * Set 'VM_NORESERVE' if we should not account for the

 * memory use of this mapping.

 */

if ((flags & MAP_NORESERVE)) {

           /* We honor MAP_NORESERVE if allowed to overcommit */

           if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)

                    vm_flags |= VM_NORESERVE;

           /* hugetlb applies strict overcommit unless MAP_NORESERVE */

           if (file && is_file_hugepages(file))

                    vm_flags |= VM_NORESERVE;

}

/*

 * Private writable mapping: check memory availability

 */

if (accountable_mapping(file, vm_flags)) {

           charged = len >> PAGE_SHIFT;

           if (security_vm_enough_memory(charged))

                    return -ENOMEM;

           vm_flags |= VM_ACCOUNT;

}

/*

 * Can we just expand an old mapping?

 */

/* Try to merge the new range into the neighbouring vma(s) via vma_merge(),

  which is given prev, the range, vm_flags, file and pgoff to compare against.

  On success jump straight to out: no new vma is allocated.

  Per the original notes the point of merging is to:

  1. keep the number of vmas (and the slab memory they take) low

  2. avoid gaps between adjacent regions */

vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);

if (vma)

           goto out;

/*

 * Determine the object being mapped and call the appropriate

 * specific mapper. the address has already been validated, but

 * not unmapped, but the maps are removed from the list.

 */

/* Merging was not possible: allocate a zeroed vma from the vm_area_cachep cache */

vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);

if (!vma) {

           error = -ENOMEM;

           goto unacct_error;

}

 /* Fill in the fields of the new vma */

vma->vm_mm = mm;

vma->vm_start = addr;

vma->vm_end = addr + len;

vma->vm_flags = vm_flags;

vma->vm_page_prot = vm_get_page_prot(vm_flags);

vma->vm_pgoff = pgoff;

/* File-backed mapping */

if (file) {

           error = -EINVAL;

           if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))

                    goto free_vma;

           if (vm_flags & VM_DENYWRITE) {

                    error = deny_write_access(file);

                    if (error)

                             goto free_vma;

                    correct_wcount = 1;

           }

           vma->vm_file = file;

           get_file(file);

           error = file->f_op->mmap(file, vma);

           if (error)

                    goto unmap_and_free_vma;

           if (vm_flags & VM_EXECUTABLE)

                    added_exe_file_vma(mm);

           /* Can addr have changed??

            *

            * Answer: Yes, several device drivers can do it in their

            *         f_op->mmap method. -DaveM

            */

           addr = vma->vm_start;

           pgoff = vma->vm_pgoff;

           vm_flags = vma->vm_flags;

}

/* Shared anonymous region (the shared-memory case): set it up through shmem_zero_setup() */

    else if (vm_flags & VM_SHARED) {

           error = shmem_zero_setup(vma);

           if (error)

                    goto free_vma;

}

if (vma_wants_writenotify(vma))

           vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);

/* Link the vma into the mm using the position found by find_vma_prepare() */

vma_link(mm, vma, prev, rb_link, rb_parent);

file = vma->vm_file;

/* Once vma denies write, undo our temporary denial count */

if (correct_wcount)

           atomic_inc(&inode->i_writecount);

out:

perf_event_mmap(vma);

/* Account the new pages in the mm's total */

mm->total_vm += len >> PAGE_SHIFT;

vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);

 /* Locked memory: populate the range now; otherwise populate only when MAP_POPULATE is set without MAP_NONBLOCK */

if (vm_flags & VM_LOCKED) {

           /*

            * makes pages present; downgrades, drops, reacquires mmap_sem

            */

           long nr_pages = mlock_vma_pages_range(vma, addr, addr + len);

           if (nr_pages < 0)

                    return nr_pages;     /* vma gone! */

           mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;

} else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))

           make_pages_present(addr, addr + len);

return addr;

/* f_op->mmap failed: undo the write denial, drop the file reference and tear down whatever was mapped; charged is cleared so the unacct_error step below is skipped */

unmap_and_free_vma:

if (correct_wcount)

           atomic_inc(&inode->i_writecount);

vma->vm_file = NULL;

fput(file);

/* Undo any partial mapping done by a device driver. */

unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);

charged = 0;

free_vma:

kmem_cache_free(vm_area_cachep, vma);

/* Release the memory charge taken above, if any is still outstanding */

unacct_error:

if (charged)

           vm_unacct_memory(charged);

return error;

}

这个函数包括三部分内容:

1、 去除干扰:调用函数find_vma_prepare查找是否已有vma线性区包含addr了,如果有,调用函数do_munmap把这个vma干掉,函数find_vma_prepare的原理是:所有vma通过红黑树存储起来,通过遍历当前红黑树查找是否有包含addr的vma线性区,查找原理是利用红黑树的特性,后续会有关于红黑树的专题;

2、 创建映射:要注意,linux不希望vma和vma之间总有空洞,只要新创建的vma的flags属性和它前面的或后面的vma相同,那么就可以合并成一个新的vma,这样做一来减少vma的个数,也就减少从slab获取的物理内存,二来减少虚拟空间的空洞的浪费;如果无法合并,那么也只好新建vma并对vma结构体初始化相关成员;根据vma是否有页锁定标志(VM_LOCKED),决定是否立即分配物理页;

3、 最后把该vma起始地址即addr返回;

最终do_mmap的返回值就是addr

可见do_mmap完成的就是在本进程地址空间找到一段合适的虚拟地址空间,并把起始地址返回给用户进程,并未映射物理页(除非用户进程要求vma页锁定),这部分留给用户对其访问时产生的缺页异常处理

你可能感兴趣的:(linux进程地址空间(3) 内存映射(1)mmap与do_mmap)