内核常常要判断进程地址空间中的内存区域是否满足某些条件,为了方便执行,内核定义了许多辅助函数,它们都声明在linux/mm.h中。
在mm/mmap.c中
- /*
-  * find_vma() - look up the first VMA in @mm whose vm_end lies above @addr.
-  * @mm:   address space to search; a NULL @mm yields NULL.
-  * @addr: address to look up.
-  *
-  * Returns a VMA with vm_end > addr, or NULL when the search finds none.
-  * The returned VMA either contains @addr or begins above it, so callers
-  * must not assume @addr is inside it. (Which VMA is "first" relies on the
-  * red-black tree being ordered by address -- assumed, not shown here.)
-  */
- struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
- {
-     struct vm_area_struct *found;
-     struct rb_node *node;
-
-     if (!mm)
-         return NULL;
-
-     /* Fast path: the cached VMA contains addr, so skip the tree walk. */
-     found = mm->mmap_cache;
-     if (found && found->vm_end > addr && found->vm_start <= addr)
-         return found;
-
-     /* Cache miss: descend the red-black tree. */
-     found = NULL;
-     for (node = mm->mm_rb.rb_node; node; ) {
-         struct vm_area_struct *candidate =
-             rb_entry(node, struct vm_area_struct, vm_rb);
-
-         if (candidate->vm_end <= addr) {
-             /* Ends at or below addr: only the right subtree can match. */
-             node = node->rb_right;
-             continue;
-         }
-
-         /* Remember it, then look left for a candidate that starts lower. */
-         found = candidate;
-         if (candidate->vm_start <= addr)
-             break; /* candidate contains addr */
-         node = node->rb_left;
-     }
-
-     /* Only a successful tree lookup refreshes the cache. */
-     if (found)
-         mm->mmap_cache = found;
-     return found;
- }
该函数在指定的地址空间中搜索第一个vm_end大于addr的内存区域。换句话说,该函数寻找第一个包含addr或首地址大于addr的内存区域。
注意,由于返回的VMA首地址可能大于addr,所以指定的地址并不一定就包含在返回的VMA中。因为很可能在对某个VMA执行操作之后,还有其他操作会对该VMA继续进行,所以该函数返回的结果被缓存在内存描述符mm_struct中的mmap_cache域中。如果指定的地址不在缓存中,那么必须搜索和内存描述符相关的所有内存区域,这种搜索通过红黑树进行。
find_vma_prev()
find_vma_prev()函数和find_vma()工作方式相同,返回值也相同,但是它还会通过pprev参数返回第一个小于addr的VMA(即所返回VMA的前一个内存区域)。
- /*
-  * find_vma_prev() - like find_vma(), but also report the preceding VMA.
-  * @mm:    address space to search; may be NULL.
-  * @addr:  address to look up.
-  * @pprev: out parameter; always written. Receives the VMA chosen as the
-  *         predecessor (one with vm_end <= addr), or NULL if none was found.
-  *
-  * Returns the VMA following *pprev in the vm_next list, or mm->mmap when
-  * there is no predecessor. With a NULL @mm it returns NULL and sets
-  * *pprev to NULL.
-  */
- struct vm_area_struct *
- find_vma_prev(struct mm_struct *mm, unsigned long addr,
-         struct vm_area_struct **pprev)
- {
-     struct vm_area_struct *first = NULL, *before = NULL;
-     struct rb_node *node;
-
-     if (mm) {
-         /* Fallback result when no VMA ends at or below addr. */
-         first = mm->mmap;
-
-         node = mm->mm_rb.rb_node;
-         while (node) {
-             struct vm_area_struct *cur =
-                 rb_entry(node, struct vm_area_struct, vm_rb);
-
-             if (addr < cur->vm_end) {
-                 /* cur ends above addr: a predecessor can only be on the left. */
-                 node = node->rb_left;
-                 continue;
-             }
-
-             before = cur;
-             /* Stop once the list successor is missing or ends above addr. */
-             if (!before->vm_next || (addr < before->vm_next->vm_end))
-                 break;
-             node = node->rb_right;
-         }
-     }
-
-     *pprev = before;
-     return before ? before->vm_next : first;
- }
find_vma_intersection()函数返回第一个和指定区间相交的VMA。因为该函数是内联函数,故定义在linux/mm.h中:
- /*
-  * find_vma_intersection() - find a VMA overlapping [start_addr, end_addr).
-  *
-  * Takes the VMA that find_vma() returns for @start_addr and rejects it
-  * when it begins at or beyond @end_addr. Returns that VMA, or NULL.
-  */
- static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
- {
-     struct vm_area_struct *hit = find_vma(mm, start_addr);
-
-     /* No candidate, or the candidate starts past the end of the range. */
-     if (!hit || end_addr <= hit->vm_start)
-         return NULL;
-     return hit;
- }
内核使用do_mmap()函数创建一个新的线性地址区间。如果说该函数创建了一个新的VMA并不非常准确,因为如果创建的地址区间和一个已经存在的地址区间相邻,并且它们具有同样的访问权限的话,那么两个区间将合并为一个。如果不能合并,那么就确实要创建一个新的VMA了。无论哪种情况,do_mmap()都会将一个地址区间加入到进程的地址空间中(无论是扩展已存在的内存区域还是创建一个新的区域)。
- 在mm.h中
- /*
-  * do_mmap() - byte-offset front end for do_mmap_pgoff().
-  * @offset: byte offset; must be page aligned.
-  *
-  * Returns -EINVAL (as unsigned long) when offset + PAGE_ALIGN(len) wraps
-  * around or when @offset is not page aligned. Otherwise converts @offset
-  * to a page offset and returns whatever do_mmap_pgoff() returns.
-  */
- static inline unsigned long do_mmap(struct file *file, unsigned long addr,
-     unsigned long len, unsigned long prot,
-     unsigned long flag, unsigned long offset)
- {
-     /* Unsigned wrap-around check on the end of the mapped range. */
-     if ((offset + PAGE_ALIGN(len)) < offset)
-         return -EINVAL;
-
-     /* Any bits below the page boundary mean a misaligned offset. */
-     if (offset & ~PAGE_MASK)
-         return -EINVAL;
-
-     return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
- }
如果file参数是NULL并且offset参数也是0,那么就代表这次映射没有和文件相关,该情况被称作匿名映射;如果指定了文件和偏移量,那么该映射被称为文件映射。
addr是可选参数,它指定搜索空闲区域的起始位置。
prot参数指定内存区域中页面的访问权限,不同的体系结构标志的定义不同。 flag参数指定了VMA标志。
- /* Page protection bits, tested against the prot argument of the mapping calls. */
- #define PROT_READ 0x1 /* page can be read */
- #define PROT_WRITE 0x2 /* page can be written */
- #define PROT_EXEC 0x4 /* page can be executed */
- #define PROT_SEM 0x8 /* page may be used for atomic ops */
- #define PROT_NONE 0x0 /* page can not be accessed */
- /* Modifier bits described by their own comments as mprotect flags. */
- #define PROT_GROWSDOWN 0x01000000 /* mprotect flag: extend change to start of growsdown vma */
- #define PROT_GROWSUP 0x02000000 /* mprotect flag: extend change to end of growsup vma */
- /* Mapping type and option bits, tested against the flags argument. */
- #define MAP_SHARED 0x01 /* Share changes */
- #define MAP_PRIVATE 0x02 /* Changes are private */
- #define MAP_TYPE 0x0f /* Mask for type of mapping */
- #define MAP_FIXED 0x10 /* Interpret addr exactly */
- #define MAP_ANONYMOUS 0x20 /* don't use a file */
- 在mm/mmap.c中
- /*
-  * do_mmap_pgoff() - add a mapping of @len bytes to current->mm.
-  * @file:  file to map, or NULL for a mapping without a backing file.
-  * @addr:  address hint handed to get_unmapped_area().
-  * @len:   length in bytes; rounded up with PAGE_ALIGN().
-  * @prot:  PROT_* protection bits.
-  * @flags: MAP_* flags; (flags & MAP_TYPE) must be MAP_SHARED or MAP_PRIVATE.
-  * @pgoff: offset into @file, in pages.
-  *
-  * Returns the start address of the mapping on success, otherwise a
-  * negative errno value converted to unsigned long.
-  *
-  * The MAP_POPULATE step drops and re-takes mm->mmap_sem for writing, so
-  * the caller is presumably expected to hold it for writing -- confirm
-  * against callers.
-  */
- unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
-     unsigned long len, unsigned long prot,
-     unsigned long flags, unsigned long pgoff)
- {
-     struct mm_struct * mm = current->mm;
-     struct vm_area_struct * vma, * prev;
-     struct inode *inode;
-     unsigned int vm_flags;
-     int correct_wcount = 0;
-     int error;
-     struct rb_node ** rb_link, * rb_parent;
-     int accountable = 1;
-     unsigned long charged = 0, reqprot = prot;
-
-     /*
-      * With the READ_IMPLIES_EXEC personality, PROT_READ also grants
-      * PROT_EXEC, except when the file sits on a MNT_NOEXEC mount.
-      */
-     if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
-         if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
-             prot |= PROT_EXEC;
-
-     if (!len)
-         return -EINVAL;
-
-     error = arch_mmap_check(addr, len, flags);
-     if (error)
-         return error;
-
-     /* PAGE_ALIGN() can wrap to 0 for a huge len; reject that too. */
-     len = PAGE_ALIGN(len);
-     if (!len || len > TASK_SIZE)
-         return -ENOMEM;
-
-     /* Page offset of the end of the mapping must not wrap around. */
-     if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
-         return -EOVERFLOW;
-
-     /* Too many mappings in this address space already. */
-     if (mm->map_count > sysctl_max_map_count)
-         return -ENOMEM;
-
-     /*
-      * Pick (or validate) the address. A result that is not page aligned
-      * is passed straight back to the caller -- presumably an error code
-      * from get_unmapped_area(); confirm against its definition.
-      */
-     addr = get_unmapped_area(file, addr, len, pgoff, flags);
-     if (addr & ~PAGE_MASK)
-         return addr;
-
-     /* Translate prot/flags into VM_* bits and add the per-mm defaults. */
-     vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
-         mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
-     if (flags & MAP_LOCKED) {
-         if (!can_do_mlock())
-             return -EPERM;
-         vm_flags |= VM_LOCKED;
-     }
-
-     /* Locked mappings are checked against RLIMIT_MEMLOCK (in pages). */
-     if (vm_flags & VM_LOCKED) {
-         unsigned long locked, lock_limit;
-         locked = len >> PAGE_SHIFT;
-         locked += mm->locked_vm;
-         lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
-         lock_limit >>= PAGE_SHIFT;
-         if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-             return -EAGAIN;
-     }
-
-     inode = file ? file->f_path.dentry->d_inode : NULL;
-     if (file) {
-         switch (flags & MAP_TYPE) {
-         case MAP_SHARED:
-             /* A writable shared mapping needs a file opened for writing. */
-             if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
-                 return -EACCES;
-
-             /* Refuse a writable file whose inode is append-only. */
-             if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
-                 return -EACCES;
-
-             if (locks_verify_locked(inode))
-                 return -EAGAIN;
-
-             vm_flags |= VM_SHARED | VM_MAYSHARE;
-             if (!(file->f_mode & FMODE_WRITE))
-                 vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
-
-             /* fall through: the MAP_PRIVATE checks apply to shared maps too */
-         case MAP_PRIVATE:
-             if (!(file->f_mode & FMODE_READ))
-                 return -EACCES;
-             if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
-                 if (vm_flags & VM_EXEC)
-                     return -EPERM;
-                 vm_flags &= ~VM_MAYEXEC;
-             }
-             if (is_file_hugepages(file))
-                 accountable = 0;
-             if (!file->f_op || !file->f_op->mmap)
-                 return -ENODEV;
-             break;
-         default:
-             return -EINVAL;
-         }
-     } else {
-         switch (flags & MAP_TYPE) {
-         case MAP_SHARED:
-             vm_flags |= VM_SHARED | VM_MAYSHARE;
-             break;
-         case MAP_PRIVATE:
-             /* Private mapping without a file: derive pgoff from the address. */
-             pgoff = addr >> PAGE_SHIFT;
-             break;
-         default:
-             return -EINVAL;
-         }
-     }
-
-     error = security_file_mmap(file, reqprot, prot, flags);
-     if (error)
-         return error;
-
-     /* Unmap whatever already overlaps [addr, addr + len), then retry. */
-     error = -ENOMEM;
- munmap_back:
-     vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
-     if (vma && vma->vm_start < addr + len) {
-         if (do_munmap(mm, addr, len))
-             return -ENOMEM;
-         goto munmap_back;
-     }
-
-     if (!may_expand_vm(mm, len >> PAGE_SHIFT))
-         return -ENOMEM;
-
-     if (accountable && (!(flags & MAP_NORESERVE) ||
-             sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
-         if (vm_flags & VM_SHARED) {
-             /* Shared: only flag the mapping; nothing is charged here. */
-             vm_flags |= VM_ACCOUNT;
-         } else if (vm_flags & VM_WRITE) {
-             /* Private writable: charge the pages up front. */
-             charged = len >> PAGE_SHIFT;
-             if (security_vm_enough_memory(charged))
-                 return -ENOMEM;
-             vm_flags |= VM_ACCOUNT;
-         }
-     }
-
-     /*
-      * A private mapping without a file may simply extend a neighbouring
-      * VMA. All three conditions must hold, hence logical AND; vma_merge()
-      * returns a pointer, which cannot be combined with a bitwise '&'.
-      */
-     if (!file && !(vm_flags & VM_SHARED) &&
-         vma_merge(mm, prev, addr, addr + len, vm_flags,
-             NULL, NULL, pgoff, NULL))
-         goto out;
-
-     /* No merge possible: build a new, zero-initialised VMA. */
-     vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
-     if (!vma) {
-         error = -ENOMEM;
-         goto unacct_error;
-     }
-     vma->vm_mm = mm;
-     vma->vm_start = addr;
-     vma->vm_end = addr + len;
-     vma->vm_flags = vm_flags;
-     /* Index protection_map by the read/write/exec/shared bits only. */
-     vma->vm_page_prot = protection_map[vm_flags &
-         (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
-     vma->vm_pgoff = pgoff;
-
-     if (file) {
-         error = -EINVAL;
-         if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
-             goto free_vma;
-         if (vm_flags & VM_DENYWRITE) {
-             error = deny_write_access(file);
-             if (error)
-                 goto free_vma;
-             /* Remember to re-increment i_writecount on every exit path. */
-             correct_wcount = 1;
-         }
-         vma->vm_file = file;
-         get_file(file);
-         error = file->f_op->mmap(file, vma);
-         if (error)
-             goto unmap_and_free_vma;
-     } else if (vm_flags & VM_SHARED) {
-         error = shmem_zero_setup(vma);
-         if (error)
-             goto free_vma;
-     }
-
-     /*
-      * VM_ACCOUNT stayed set on a shared mapping through the setup calls
-      * above; clear it on the VMA itself now.
-      */
-     if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
-         vma->vm_flags &= ~VM_ACCOUNT;
-
-     /*
-      * Re-read these from the VMA -- presumably the ->mmap method may have
-      * changed them; confirm against the drivers involved.
-      */
-     addr = vma->vm_start;
-     pgoff = vma->vm_pgoff;
-     vm_flags = vma->vm_flags;
-
-     if (vma_wants_writenotify(vma))
-         vma->vm_page_prot =
-             protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
-
-     if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
-             vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
-         /* Keep the new VMA: link it into the list and the tree. */
-         file = vma->vm_file;
-         vma_link(mm, vma, prev, rb_link, rb_parent);
-         if (correct_wcount)
-             atomic_inc(&inode->i_writecount);
-     } else {
-         /* Merged into a neighbour: the temporary VMA is no longer needed. */
-         if (file) {
-             if (correct_wcount)
-                 atomic_inc(&inode->i_writecount);
-             fput(file);
-         }
-         mpol_free(vma_policy(vma));
-         kmem_cache_free(vm_area_cachep, vma);
-     }
- out:
-     mm->total_vm += len >> PAGE_SHIFT;
-     vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
-     if (vm_flags & VM_LOCKED) {
-         mm->locked_vm += len >> PAGE_SHIFT;
-         make_pages_present(addr, addr + len);
-     }
-     if (flags & MAP_POPULATE) {
-         /* sys_remap_file_pages() is called with mmap_sem released. */
-         up_write(&mm->mmap_sem);
-         sys_remap_file_pages(addr, len, 0,
-             pgoff, flags & MAP_NONBLOCK);
-         down_write(&mm->mmap_sem);
-     }
-     return addr;
-
- unmap_and_free_vma:
-     if (correct_wcount)
-         atomic_inc(&inode->i_writecount);
-     vma->vm_file = NULL;
-     fput(file);
-
-     /* Tear down whatever the failed ->mmap call may have mapped. */
-     unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
-     /* Skip vm_unacct_memory() below on this path. */
-     charged = 0;
- free_vma:
-     kmem_cache_free(vm_area_cachep, vma);
- unacct_error:
-     if (charged)
-         vm_unacct_memory(charged);
-     return error;
- }
mmap()系统调用
在用户空间可以通过mmap()系统调用获取内核函数do_mmap()的功能。mmap()系统调用定义如下:
- void *mmap2(void *start,size_t length,int prot,int flags,int fd,off_t pgoff)
该系统调用是mmap()的变种所以起名为mmap2()。最原始的mmap()调用中最后一个参数是字节偏移量,而目前这个变种使用页面偏移量作为最后一个参数。使用页面偏移量可以映射更大的文件和更大的偏移位置。原始的mmap()调用由POSIX定义,仍然在C库中作为mmap()方法被使用。虽然C库仍然可以使用原始版本的映射方法,但是它其实还是基于函数mmap2()进行的,对原始mmap()方法的调用是通过将字节偏移转化为页面偏移,从而转化为对mmap2()函数的调用。
do_munmap()函数从特定的进程地址空间中删除指定地址区间。
- /*
-  * do_munmap() - remove the range [start, start + len) from @mm.
-  * @mm:    address space to modify.
-  * @start: first address to unmap; must be page aligned.
-  * @len:   length in bytes; rounded up with PAGE_ALIGN().
-  *
-  * Returns 0 on success, including when nothing overlaps the range.
-  * Returns -EINVAL for a misaligned or out-of-range request, or the error
-  * from split_vma().
-  */
- int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
- {
-     struct vm_area_struct *first, *before, *tail;
-     unsigned long end;
-     int err;
-
-     if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
-         return -EINVAL;
-
-     len = PAGE_ALIGN(len);
-     if (len == 0)
-         return -EINVAL;
-
-     /* First VMA ending above start, plus the one before it. */
-     first = find_vma_prev(mm, start, &before);
-     if (!first)
-         return 0;
-
-     /* It begins at or past the end of the range: nothing to unmap. */
-     end = start + len;
-     if (first->vm_start >= end)
-         return 0;
-
-     /* The range begins inside the first VMA: split it at start. */
-     if (start > first->vm_start) {
-         err = split_vma(mm, first, start, 0);
-         if (err)
-             return err;
-         before = first;
-     }
-
-     /* The range ends inside a VMA: split that one at end. */
-     tail = find_vma(mm, end);
-     if (tail && end > tail->vm_start) {
-         err = split_vma(mm, tail, end, 1);
-         if (err)
-             return err;
-     }
-
-     /* Re-derive the first VMA to remove now that the splits are done. */
-     first = before ? before->vm_next : mm->mmap;
-
-     detach_vmas_to_be_unmapped(mm, first, before, end);
-     unmap_region(mm, first, before, start, end);
-
-     remove_vma_list(mm, first);
-     return 0;
- }
munmap()系统调用
系统调用munmap()给用户空间程序提供了一种从自身地址空间中删除指定地址区间的方法,它的定义如下:
- int munmap(void *start,size_t length)
该系统调用定义在文件mm/mmap.c中,它是对do_munmap()函数的一个简单的封装:
- /*
-  * sys_munmap() - system call entry point wrapping do_munmap().
-  *
-  * Calls do_munmap() on the current process's mm while holding
-  * mm->mmap_sem for writing, and returns its result.
-  */
- asmlinkage long sys_munmap(unsigned long addr, size_t len)
- {
-     struct mm_struct *cur_mm = current->mm;
-     int status;
-
-     profile_munmap(addr);
-
-     down_write(&cur_mm->mmap_sem);
-     status = do_munmap(cur_mm, addr, len);
-     up_write(&cur_mm->mmap_sem);
-
-     return status;
- }