Shared Memory Fundamentals
There are two common shared-memory interfaces: mmap and System V shm.
Because the total virtual address space of all user processes is far larger than the available physical memory, only the most frequently used parts are backed by physical page frames (this is not a problem, since most programs touch only a small fraction of the memory available to them).
This article focuses on mmap.
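For contrast, here is a minimal System V shm sketch (the key value, size, and message are arbitrary choices for illustration):
#include <stdio.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
    /* create (or open) a 4 KiB System V segment under an arbitrary key */
    int shmid = shmget((key_t)0x1234, 4096, IPC_CREAT | 0666);
    if (shmid == -1) { perror("shmget"); return -1; }

    char *p = shmat(shmid, NULL, 0);   /* attach the segment into our address space */
    if (p == (void *)-1) { perror("shmat"); return -1; }

    strcpy(p, "hello from sysv shm");  /* visible to any process that attaches */
    printf("%s\n", p);

    shmdt(p);                          /* detach */
    shmctl(shmid, IPC_RMID, NULL);     /* mark the segment for removal */
    return 0;
}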
The Shared-Memory API
void *mmap(void *addr, size_t len, int prot, int flags, int fd, off_t offset);
Shared-Memory Usage Example
mmap:
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

int main(int argc, char* argv[]){
    int fd = open("./flag.txt", O_RDWR); /* O_RDWR is required for PROT_WRITE + MAP_SHARED */
    if(-1 == fd)
    {
        perror("open");
        return -1;
    }
    int length = 1; /* rounded up to one page internally */
    // char *addr = (char*)mmap(NULL, length, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
    char *addr = (char*)mmap(NULL, length, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
    if(addr == MAP_FAILED)
    {
        perror("mmap");
        return -1;
    }
    puts("get data from mmap:");
    write(1, addr, 0x40);
    puts(" ");
    puts("input data to mmap:");
    read(0, addr, 0x40);
    if(munmap(addr, length) == -1) /* tear down the mapping */
    {
        perror("munmap");
        return -1;
    }
    return 0;
}
After the program maps flag.txt, the process memory map (vmmap output) contains a new region backed by the file:
0x7ffff7fcf000 0x7ffff7fd0000 r--p 1000 0 /usr/lib/x86_64-linux-gnu/ld-2.31.so
0x7ffff7fd0000 0x7ffff7ff3000 r-xp 23000 1000 /usr/lib/x86_64-linux-gnu/ld-2.31.so
0x7ffff7ff3000 0x7ffff7ffb000 r--p 8000 24000 /usr/lib/x86_64-linux-gnu/ld-2.31.so
0x7ffff7ffb000 0x7ffff7ffc000 rw-p 1000 0 /home/yhellow/桌面/exp/flag.txt /* target */
0x7ffff7ffc000 0x7ffff7ffd000 r--p 1000 2c000 /usr/lib/x86_64-linux-gnu/ld-2.31.so
0x7ffff7ffd000 0x7ffff7ffe000 rw-p 1000 2d000 /usr/lib/x86_64-linux-gnu/ld-2.31.so
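A file is not actually required for sharing: with MAP_ANONYMOUS|MAP_SHARED the mapping survives fork() and is shared between parent and child. A minimal sketch (the message string is illustrative):
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/wait.h>

int main(void)
{
    /* MAP_ANONYMOUS needs no fd; MAP_SHARED keeps the pages shared across fork() */
    char *buf = mmap(NULL, 4096, PROT_READ|PROT_WRITE,
                     MAP_SHARED|MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED) { perror("mmap"); return -1; }

    if (fork() == 0) {                 /* child writes... */
        strcpy(buf, "written by child");
        _exit(0);
    }
    wait(NULL);                        /* ...parent observes the write */
    printf("parent reads: %s\n", buf);
    munmap(buf, 4096);
    return 0;
}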
The Implementation of mmap in Linux
Kernel version: linux-4.20.1
mmap maps part of a disk file (identified by fd) directly into the process's address space.
Stepping into the libc wrapper in a debugger shows the raw syscall and its arguments:
► 0x7ffff7eda8e4 <mmap64+36> syscall <SYS_mmap>
addr: 0x0
len: 0x1
prot: 0x3
flags: 0x2
fd: 0x3 (/home/yhellow/桌面/exp/flag.txt)
offset: 0x0
asmlinkage unsigned long
sys_mmap (unsigned long addr, unsigned long len, int prot, int flags, int fd, long off)
{
if (offset_in_page(off) != 0)
return -EINVAL;
addr = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); /* core function */
if (!IS_ERR((void *) addr))
force_successful_syscall_return();
return addr;
}
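sys_mmap rejects any offset that is not page-aligned before doing anything else. This is easy to confirm from user space (assuming the flag.txt from the earlier example exists):
#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/mman.h>

int main(void)
{
    int fd = open("./flag.txt", O_RDONLY); /* reuse the earlier test file */
    if (fd == -1) { perror("open"); return -1; }

    /* offset 1 fails the offset_in_page() check */
    void *p = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, fd, 1);
    if (p == MAP_FAILED && errno == EINVAL)
        puts("unaligned offset rejected with EINVAL, as expected");
    return 0;
}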
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long pgoff)
{
struct file *file = NULL;
unsigned long retval;
if (!(flags & MAP_ANONYMOUS)) { /* MAP_ANONYMOUS: no backing file */
audit_mmap_fd(fd, flags); /* record 'fd' and 'flags' in the audit context */
file = fget(fd); /* look up the struct file for this fd */
if (!file)
return -EBADF;
if (is_file_hugepages(file))
len = ALIGN(len, huge_page_size(hstate_file(file))); /* align len to the huge page size */
retval = -EINVAL;
if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
goto out_fput;
} else if (flags & MAP_HUGETLB) { /* MAP_HUGETLB: huge-page mapping */
struct user_struct *user = NULL;
struct hstate *hs;
hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (!hs)
return -EINVAL;
len = ALIGN(len, huge_page_size(hs)); /* align len to the huge page size */
/*
* VM_NORESERVE is used because the reservations will be
* taken when vm_ops->mmap() is called
* A dummy user value is used because we are not locking
* memory so no accounting is necessary
*/
file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
VM_NORESERVE,
&user, HUGETLB_ANONHUGE_INODE,
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (IS_ERR(file))
return PTR_ERR(file);
}
flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); /* MAP_EXECUTABLE and MAP_DENYWRITE are ignored: mask them out */
retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); /* core function */
out_fput:
if (file)
fput(file);
return retval;
}
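The MAP_HUGETLB branch can be exercised from user space. A minimal sketch, assuming the system has free huge pages reserved (e.g. via /proc/sys/vm/nr_hugepages); otherwise mmap fails with ENOMEM:
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

#define HUGE_LEN (2UL * 1024 * 1024)   /* one 2 MiB huge page on x86-64 */

int main(void)
{
    /* anonymous huge-page mapping: len gets aligned to huge_page_size() in ksys_mmap_pgoff */
    void *p = mmap(NULL, HUGE_LEN, PROT_READ|PROT_WRITE,
                   MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB, -1, 0);
    if (p == MAP_FAILED) { perror("mmap(MAP_HUGETLB)"); return -1; }
    printf("huge page mapped at %p\n", p);
    munmap(p, HUGE_LEN);
    return 0;
}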
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long pgoff)
{
unsigned long ret;
struct mm_struct *mm = current->mm; /* memory descriptor of the current process */
unsigned long populate;
LIST_HEAD(uf);
ret = security_mmap_file(file, prot, flag); /* LSM hook: security modules (and IMA) get to vet the mapping; for file mappings this includes permission checks */
if (!ret) {
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
&populate, &uf); /* core function */
up_write(&mm->mmap_sem);
userfaultfd_unmap_complete(mm, &uf);
if (populate)
mm_populate(ret, populate);
}
return ret;
}
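Note the populate out-parameter: it is nonzero when the caller asked for MAP_POPULATE (or MAP_LOCKED), in which case mm_populate() pre-faults the range so that first access takes no page fault. A small demonstration using minor-fault counts from getrusage (assumption: the only faults in the measured window come from touching the mapping):
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <sys/resource.h>

static long minor_faults(void)
{
    struct rusage ru;
    getrusage(RUSAGE_SELF, &ru);
    return ru.ru_minflt;
}

int main(void)
{
    size_t len = 64 * 4096;
    /* MAP_POPULATE asks the kernel to fault all pages in up front */
    char *p = mmap(NULL, len, PROT_READ|PROT_WRITE,
                   MAP_PRIVATE|MAP_ANONYMOUS|MAP_POPULATE, -1, 0);
    if (p == MAP_FAILED) { perror("mmap"); return -1; }

    long before = minor_faults();
    for (size_t i = 0; i < len; i += 4096)
        p[i] = 1;                      /* touch every page */
    printf("minor faults while touching: %ld\n", minor_faults() - before);
    munmap(p, len);
    return 0;
}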
security_mmap_file ultimately calls ima_file_mmap (the IMA measurement hook).
static inline unsigned long
do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot, unsigned long flags,
unsigned long pgoff, unsigned long *populate,
struct list_head *uf)
{
return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate, uf);
}
unsigned long do_mmap(struct file *file,
unsigned long addr,
unsigned long len,
unsigned long prot,
unsigned long flags,
vm_flags_t vm_flags,
unsigned long pgoff,
unsigned long *populate,
struct list_head *uf)
{
struct vm_area_struct *vma; /* a vm_area_struct describes virtual addresses used by a process (vm_struct describes virtual addresses used by the kernel) */
struct vm_region *region; /* note: vm_region exists only without CONFIG_MMU, so this listing is the mm/nommu.c variant of do_mmap */
struct rb_node *rb;
unsigned long capabilities, result;
int ret;
*populate = 0;
/* decide whether we should attempt the mapping, and if so what sort of
* mapping */
ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
&capabilities); /* decide whether the mapping should be attempted at all */
if (ret < 0)
return ret;
/* we ignore the address hint */
addr = 0;
len = PAGE_ALIGN(len);
/* we've determined that we can make the mapping, now translate what we
* now know into VMA flags */
vm_flags |= determine_vm_flags(file, prot, flags, capabilities);
/* we're going to need to record the mapping */
region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); /* record the mapping (kmem_cache_zalloc allocates the object and also zero-initializes it) */
if (!region)
goto error_getting_region;
vma = vm_area_alloc(current->mm); /* allocate a new vma via kmem_cache_alloc, then initialize it with vma_init */
if (!vma)
goto error_getting_vma;
region->vm_usage = 1; /* fill in the vm_region */
region->vm_flags = vm_flags;
region->vm_pgoff = pgoff;
vma->vm_flags = vm_flags; /* fill in the vm_area_struct */
vma->vm_pgoff = pgoff;
if (file) { /* this file came from mmap's 'fd' argument */
region->vm_file = get_file(file);
vma->vm_file = get_file(file);
}
down_write(&nommu_region_sem);
/* if we want to share, we need to check for regions created by other
* mmap() calls that overlap with our proposed mapping
* - we can only share with a superset match on most regular files
* - shared mappings on character devices and memory backed files are
* permitted to overlap inexactly as far as we are concerned for in
* these cases, sharing is handled in the driver or filesystem rather
* than here
*/
if (vm_flags & VM_MAYSHARE) { /* VM_MAYSHARE: the mapping may be shared with other processes (determines whether VM_SHARED can be set) */
struct vm_region *pregion;
unsigned long pglen, rpglen, pgend, rpgend, start;
pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
pgend = pgoff + pglen;
for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) {
pregion = rb_entry(rb, struct vm_region, vm_rb);
if (!(pregion->vm_flags & VM_MAYSHARE))
continue;
/* search for overlapping mappings on the same file */
if (file_inode(pregion->vm_file) !=
file_inode(file))
continue;
if (pregion->vm_pgoff >= pgend)
continue;
rpglen = pregion->vm_end - pregion->vm_start;
rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT;
rpgend = pregion->vm_pgoff + rpglen;
if (pgoff >= rpgend)
continue;
/* handle inexactly overlapping matches between
* mappings */
if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
!(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
/* new mapping is not a subset of the region */
if (!(capabilities & NOMMU_MAP_DIRECT))
goto sharing_violation;
continue;
}
/* we've found a region we can share */
pregion->vm_usage++;
vma->vm_region = pregion; /* fill in the vm_area_struct */
start = pregion->vm_start;
start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
vma->vm_start = start;
vma->vm_end = start + len;
if (pregion->vm_flags & VM_MAPPED_COPY)
vma->vm_flags |= VM_MAPPED_COPY;
else {
ret = do_mmap_shared_file(vma); /* set up a shared mapping on the file (the driver or filesystem provides and pins the storage) */
if (ret < 0) {
vma->vm_region = NULL;
vma->vm_start = 0;
vma->vm_end = 0;
pregion->vm_usage--;
pregion = NULL;
goto error_just_free;
}
}
fput(region->vm_file);
kmem_cache_free(vm_region_jar, region);
region = pregion;
result = start;
goto share;
}
/* obtain the address at which to make a shared mapping
* - this is the hook for quasi-memory character devices to
* tell us the location of a shared mapping
*/
if (capabilities & NOMMU_MAP_DIRECT) {
addr = file->f_op->get_unmapped_area(file, addr, len,
pgoff, flags);
/* here the device's or filesystem's f_op->get_unmapped_area hook picks the address; on MMU kernels the analogous dispatch is current->mm->get_unmapped_area, which differs per architecture but follows the same basic scheme */
if (IS_ERR_VALUE(addr)) {
ret = addr;
if (ret != -ENOSYS)
goto error_just_free;
/* the driver refused to tell us where to site
* the mapping so we'll have to attempt to copy
* it */
ret = -ENODEV;
if (!(capabilities & NOMMU_MAP_COPY))
goto error_just_free;
capabilities &= ~NOMMU_MAP_DIRECT;
} else {
vma->vm_start = region->vm_start = addr;
vma->vm_end = region->vm_end = addr + len;
}
}
}
vma->vm_region = region;
/* set up the mapping
* - the region is filled in if NOMMU_MAP_DIRECT is still set
*/
if (file && vma->vm_flags & VM_SHARED) /* VM_SHARED: shared between multiple processes */
ret = do_mmap_shared_file(vma); /* set up a shared mapping on the file (the driver or filesystem provides and pins the storage) */
else
ret = do_mmap_private(vma, region, len, capabilities); /* set up a private mapping or an anonymous shared mapping */
if (ret < 0)
goto error_just_free;
add_nommu_region(region);
/* clear anonymous mappings that don't ask for uninitialized data */
if (!vma->vm_file && !(flags & MAP_UNINITIALIZED)) /* zero anonymous mappings unless MAP_UNINITIALIZED was requested */
memset((void *)region->vm_start, 0,
region->vm_end - region->vm_start);
/* okay... we have a mapping; now we have to register it */
result = vma->vm_start;
current->mm->total_vm += len >> PAGE_SHIFT;
share:
add_vma_to_mm(current->mm, vma); /* add the VMA to the process's mm_struct at the proper place in the list and tree, and to the address space's page tree if it is not anonymous */
/* we flush the region from the icache only when the first executable
* mapping of it is made */
if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
flush_icache_range(region->vm_start, region->vm_end);
region->vm_icache_flushed = true;
}
up_write(&nommu_region_sem);
return result;
error_just_free:
up_write(&nommu_region_sem);
error:
if (region->vm_file)
fput(region->vm_file);
kmem_cache_free(vm_region_jar, region);
if (vma->vm_file)
fput(vma->vm_file);
vm_area_free(vma);
return ret;
sharing_violation:
up_write(&nommu_region_sem);
pr_warn("Attempt to share mismatched mappings\n");
ret = -EINVAL;
goto error;
error_getting_vma:
kmem_cache_free(vm_region_jar, region);
pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n",
len, current->pid);
show_free_areas(0, NULL);
return -ENOMEM;
error_getting_region:
pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n",
len, current->pid);
show_free_areas(0, NULL);
return -ENOMEM;
}
vm_area_alloc (under the hood it calls kmem_cache_alloc to allocate the vma and then vma_init to initialize it; insertion into the red-black tree happens later, in add_vma_to_mm). vm_area_struct manages the virtual addresses a process uses and is the most basic unit of virtual memory management:
struct vm_area_struct {
/* The first cache line has the info for VMA tree walking. */
unsigned long vm_start; /* Our start address within vm_mm. */
unsigned long vm_end; /* The first byte after our end address
within vm_mm. */
/* linked list of VM areas per task, sorted by address */
struct vm_area_struct *vm_next, *vm_prev;
struct rb_node vm_rb;
/*
* Largest free memory gap in bytes to the left of this VMA.
* Either between this VMA and vma->vm_prev, or between one of the
* VMAs below us in the VMA rbtree and its ->vm_prev. This helps
* get_unmapped_area find a free area of the right size.
*/
unsigned long rb_subtree_gap;
/* Second cache line starts here. */
struct mm_struct *vm_mm; /* The address space we belong to. */
pgprot_t vm_page_prot; /* Access permissions of this VMA. */
unsigned long vm_flags; /* Flags, see mm.h. */
/*
* For areas with an address space and backing store,
* linkage into the address_space->i_mmap interval tree.
*/
struct {
struct rb_node rb;
unsigned long rb_subtree_last;
} shared;
/*
* A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
* list, after a COW of one of the file pages. A MAP_SHARED vma
* can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
* or brk vma (with NULL file) can only be in an anon_vma list.
*/
struct list_head anon_vma_chain; /* Serialized by mmap_sem &
* page_table_lock */
struct anon_vma *anon_vma; /* Serialized by page_table_lock */
/* Function pointers to deal with this struct. */
const struct vm_operations_struct *vm_ops;
/* Information about our backing store: */
unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE
units */
struct file * vm_file; /* File we map to (can be NULL). */
void * vm_private_data; /* was vm_pte (shared mem) */
atomic_long_t swap_readahead_info;
#ifndef CONFIG_MMU
struct vm_region *vm_region; /* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
#endif
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;
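Each successful mmap shows up as one of these VMAs, which can be observed from user space through /proc/self/maps. A small sketch (the tag strings are just for readability):
#include <stdio.h>
#include <sys/mman.h>

static void dump_maps(const char *tag)
{
    char line[256];
    printf("---- %s ----\n", tag);
    FILE *f = fopen("/proc/self/maps", "r");
    if (!f) return;
    while (fgets(line, sizeof(line), f))
        fputs(line, stdout);            /* one line per vm_area_struct */
    fclose(f);
}

int main(void)
{
    dump_maps("before mmap");
    void *p = mmap(NULL, 4096, PROT_READ|PROT_WRITE,
                   MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) { perror("mmap"); return -1; }
    printf("new mapping at %p\n", p);   /* a new anonymous VMA appears in maps */
    dump_maps("after mmap");
    munmap(p, 4096);
    return 0;
}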
get_unmapped_area dispatches through current->mm->get_unmapped_area, which in practice resolves to arch_get_unmapped_area (this is how a process finds free virtual memory; the listing below matches the arch/mips implementation, but every architecture follows the same scheme):
enum mmap_allocation_direction {UP, DOWN}; /* UP == '0', DOWN == '1' */
unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr0,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
return arch_get_unmapped_area_common(filp,
addr0, len, pgoff, flags, UP); /* addr0 == '0' */
}
static unsigned long arch_get_unmapped_area_common(struct file *filp,
unsigned long addr0, unsigned long len, unsigned long pgoff,
unsigned long flags, enum mmap_allocation_direction dir) /* dir == UP */
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long addr = addr0;
int do_color_align;
struct vm_unmapped_area_info info; /* describes the allocation request */
if (unlikely(len > TASK_SIZE))
return -ENOMEM;
if (flags & MAP_FIXED) {
/* Even MAP_FIXED mappings must reside within TASK_SIZE */
if (TASK_SIZE - len < addr)
return -EINVAL;
/*
* We do not accept a shared mapping if it would violate
* cache aliasing constraints.
*/
if ((flags & MAP_SHARED) &&
((addr - (pgoff << PAGE_SHIFT)) & shm_align_mask))
return -EINVAL;
return addr;
}
do_color_align = 0;
if (filp || (flags & MAP_SHARED))
do_color_align = 1;
/* requesting a specific address */
if (addr) {
if (do_color_align)
addr = COLOUR_ALIGN(addr, pgoff);
else
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr); /* find the first vma whose vm_end lies above addr */
if (TASK_SIZE - len >= addr &&
(!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
info.length = len;
info.align_mask = do_color_align ? (PAGE_MASK & shm_align_mask) : 0;
info.align_offset = pgoff << PAGE_SHIFT;
if (dir == DOWN) { /* top-down mapping (never taken on this call path) */
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.low_limit = PAGE_SIZE;
info.high_limit = mm->mmap_base;
addr = vm_unmapped_area(&info); /* scan the mmap area, driven by vm_unmapped_area_info, for memory satisfying the request */
if (!(addr & ~PAGE_MASK)) /* 'addr & ~PAGE_MASK' is zero iff addr is a multiple of the 4096-byte page size */
return addr; /* a page-aligned result is a valid address rather than an error code, so return it */
/*
* A failed mmap() very likely causes application failure,
* so fall back to the bottom-up function here. This scenario
* can happen with large stack limits and large mmap()
* allocations.
*/
}
info.flags = 0;
info.low_limit = mm->mmap_base;
info.high_limit = TASK_SIZE;
return vm_unmapped_area(&info); /* scan the mmap area, driven by vm_unmapped_area_info, for memory satisfying the request */
}
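The hint path above can be observed from user space: a page-aligned hint pointing at a free range is returned as-is, while a busy or unaligned hint falls through to the normal search (without MAP_FIXED the hint is advisory, so the printed addresses may differ per system; the hint value here is made up):
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    /* an arbitrary, page-aligned hint; illustrative only */
    void *hint = (void *)0x700000000000UL;
    void *p = mmap(hint, 4096, PROT_READ|PROT_WRITE,
                   MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) { perror("mmap"); return -1; }
    /* if the hinted range was free, the kernel honored it */
    printf("hint %p -> mapped at %p\n", hint, p);
    munmap(p, 4096);
    return 0;
}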
vm_unmapped_area searches the mmap area (in units of vm_area_struct) for memory that satisfies the request; this is the lowest layer of the allocation path.
/*
 * Search for an unmapped address range that:
 * - does not intersect any VMA;
 * - lies within [low_limit, high_limit);
 * - is at least 'length' bytes large;
 * - satisfies (begin_addr & align_mask) == (align_offset & align_mask).
 */
static inline unsigned long
vm_unmapped_area(struct vm_unmapped_area_info *info)
{
if (info->flags & VM_UNMAPPED_AREA_TOPDOWN) /* VM_UNMAPPED_AREA_TOPDOWN: search the unmapped area top-down (never set on this call path) */
return unmapped_area_topdown(info); /* backward search */
else
return unmapped_area(info); /* forward search */
}
unmapped_area:
unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
/*
 * We implement the search by looking for an rbtree node that
 * immediately follows a suitable gap, i.e.:
 * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
 * - gap_end = vma->vm_start >= info->low_limit + length;
 * - gap_end - gap_start >= length
 */
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long length, low_limit, high_limit, gap_start, gap_end;
/* Adjust search length to account for worst case alignment overhead */
length = info->length + info->align_mask;
if (length < info->length)
return -ENOMEM;
/* Adjust search limits by the desired length */
if (info->high_limit < length)
return -ENOMEM;
high_limit = info->high_limit - length;
if (info->low_limit > high_limit)
return -ENOMEM;
low_limit = info->low_limit + length;
/* Check if rbtree root looks promising */
if (RB_EMPTY_ROOT(&mm->mm_rb))
goto check_highest;
vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
if (vma->rb_subtree_gap < length)
goto check_highest;
while (true) {
/* Visit left subtree if it looks promising */
gap_end = vm_start_gap(vma);
if (gap_end >= low_limit && vma->vm_rb.rb_left) {
struct vm_area_struct *left =
rb_entry(vma->vm_rb.rb_left,
struct vm_area_struct, vm_rb);
if (left->rb_subtree_gap >= length) {
vma = left;
continue;
}
}
gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
check_current:
/* Check if current node has a suitable gap */
if (gap_start > high_limit)
return -ENOMEM;
if (gap_end >= low_limit &&
gap_end > gap_start && gap_end - gap_start >= length)
goto found;
/* Visit right subtree if it looks promising */
if (vma->vm_rb.rb_right) {
struct vm_area_struct *right =
rb_entry(vma->vm_rb.rb_right,
struct vm_area_struct, vm_rb);
if (right->rb_subtree_gap >= length) {
vma = right;
continue;
}
}
/* Go back up the rbtree to find next candidate node */
while (true) {
struct rb_node *prev = &vma->vm_rb;
if (!rb_parent(prev))
goto check_highest;
vma = rb_entry(rb_parent(prev),
struct vm_area_struct, vm_rb);
if (prev == vma->vm_rb.rb_left) {
gap_start = vm_end_gap(vma->vm_prev);
gap_end = vm_start_gap(vma);
goto check_current;
}
}
}
check_highest:
/* Check highest gap, which does not precede any rbtree node */
gap_start = mm->highest_vm_end;
gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
if (gap_start > high_limit)
return -ENOMEM;
found:
/* We found a suitable gap. Clip it with the original low_limit. */
if (gap_start < info->low_limit)
gap_start = info->low_limit;
/* Adjust gap address to the desired alignment */
gap_start += (info->align_offset - gap_start) & info->align_mask;
VM_BUG_ON(gap_start + info->length > info->high_limit);
VM_BUG_ON(gap_start + info->length > gap_end);
return gap_start; /* finally, return the addr that was found */
}
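Stripped of the rbtree bookkeeping, the search reduces to scanning the gaps between address-sorted intervals. A user-space sketch of that core idea (a simplified model, not kernel code; the interval list and limits are made up):
#include <stdio.h>

struct interval { unsigned long start, end; }; /* stand-in for sorted VMAs */

/* first gap of at least 'length' within [low, high), scanning bottom-up */
static unsigned long find_gap(const struct interval *v, int n,
                              unsigned long low, unsigned long high,
                              unsigned long length)
{
    unsigned long gap_start = low;
    for (int i = 0; i <= n; i++) {
        unsigned long gap_end = (i < n && v[i].start < high) ? v[i].start : high;
        if (gap_end > gap_start && gap_end - gap_start >= length)
            return gap_start;          /* found a suitable gap */
        if (i < n && v[i].end > gap_start)
            gap_start = v[i].end;      /* skip past this interval */
    }
    return (unsigned long)-1;          /* the kernel would return -ENOMEM */
}

int main(void)
{
    struct interval vmas[] = { {0x1000, 0x3000}, {0x5000, 0x6000} };
    unsigned long a = find_gap(vmas, 2, 0x1000, 0x10000, 0x2000);
    printf("gap found at 0x%lx\n", a); /* expect 0x3000: the 0x3000-0x5000 hole */
    return 0;
}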
Both do_mmap_shared_file and do_mmap_private ultimately call call_mmap to finish the setup:
static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
{
return file->f_op->mmap(file, vma);
}
file->f_op->mmap points to ext4_file_mmap (ext2/ext3/ext4 being Linux's default filesystems):
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file->f_mapping->host;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
/*
* We don't support synchronous mappings for non-DAX files. At least
* until someone comes with a sensible use case.
*/
if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC))
return -EOPNOTSUPP;
file_accessed(file);
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops; /* initialize vma->vm_ops (used later by the page fault handler) */
vma->vm_flags |= VM_HUGEPAGE;
} else {
vma->vm_ops = &ext4_file_vm_ops;
}
return 0;
}
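As the comments in do_mmap noted, for character devices the sharing is handled by the driver itself: its f_op->mmap decides what backs the VMA. A minimal sketch of such a handler as a hypothetical misc driver (the module and device names are made up; one shared kernel page is mapped into every caller):
#include <linux/module.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/gfp.h>

static unsigned long demo_page; /* one page of kernel memory backing all mappings */

static int demo_mmap(struct file *file, struct vm_area_struct *vma)
{
    unsigned long size = vma->vm_end - vma->vm_start;

    if (size > PAGE_SIZE)
        return -EINVAL;
    /* map the physical page straight into the caller's VMA:
     * every process that mmap()s us shares the same page */
    return remap_pfn_range(vma, vma->vm_start,
                           page_to_pfn(virt_to_page((void *)demo_page)),
                           size, vma->vm_page_prot);
}

static const struct file_operations demo_fops = {
    .owner = THIS_MODULE,
    .mmap  = demo_mmap,     /* reached through call_mmap() */
};

static struct miscdevice demo_dev = {
    .minor = MISC_DYNAMIC_MINOR,
    .name  = "mmap_demo",   /* shows up as /dev/mmap_demo */
    .fops  = &demo_fops,
};

static int __init demo_init(void)
{
    demo_page = get_zeroed_page(GFP_KERNEL);
    if (!demo_page)
        return -ENOMEM;
    return misc_register(&demo_dev);
}

static void __exit demo_exit(void)
{
    misc_deregister(&demo_dev);
    free_page(demo_page);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");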
This is a newcomer's first draft; if anything is wrong, corrections from experienced readers are welcome.