我看的是linux-4.2.3的源码。参考了《边干边学——Linux内核指导》(鬼畜的书名)第16章内容,他们用的是2.6.15的内核源码。
现在linux中可以使用共享内存的方式有两种
POSIX的shm_open()
在/dev/shm/
下打开一个文件,用mmap()
映射到进程自己的内存地址
System V的shmget()
得到一个共享内存对象的id,用shmat()
映射到进程自己的内存地址
POSIX的实现是基于tmpfs的,函数都写在libc里,没什么好说的,主要还是看System V的实现方式。在System V中共享内存属于IPC子系统。所谓ipc,就是InterProcess Communication即进程间通信的意思,System V比前面的Unix增加了3种进程间通信的方式,共享内存、消息队列、信号量,统称IPC。主要代码在以下文件中
ipc/shm.c
include/linux/shm.h
ipc/util.c
ipc/util.h
include/linux/ipc.h
同一块共享内存在内核中至少有3个标识符
IPC对象id(IPC对象是保存IPC信息的数据结构)
进程虚拟内存中文件的inode,即每个进程中的共享内存也是以文件的方式存在的,但并不是显式的。可以通过某个vm_area_struct->vm_file->f_path.dentry->d_inode->i_ino
表示
IPC对象的key。如果在shmget()
中传入同一个key可以获取到同一块共享内存。但由于key是用户指定的,可能重复,而且也很少程序写之前会约定一个key,所以这种方法不是很常用。通常System V这种共享内存的方式是用于有父子关系的进程的。或者用ftok()
函数用路径名来生成一个key。
首先看一下在内核中表示一块共享内存的数据结构,在include/linux/shm.h
中
/* */
是内核源码的注释,//
是我的注释
// In-kernel descriptor of one System V shared memory segment.
struct shmid_kernel /* private to the kernel */
{
	struct kern_ipc_perm shm_perm; // permissions; this struct also holds other important fields, covered later
	struct file *shm_file; // kernel file that represents this segment; the file's contents are the shared memory's contents
	unsigned long shm_nattch; // number of processes attached to this segment
	unsigned long shm_segsz; // size, in bytes
	time_t shm_atim; // time of the last attach
	time_t shm_dtim; // time of the last detach
	time_t shm_ctim; // time of the last change to the segment's information
	pid_t shm_cprid; // pid of the creating process
	pid_t shm_lprid; // pid of the last process that operated on the segment
	struct user_struct *mlock_user;
	/* The task created the shm object. NULL if the task is dead. */
	struct task_struct *shm_creator;
	struct list_head shm_clist; /* list by creator */
};
再看一下struct shmid_kernel
中存储权限信息的shm_perm
,在include/linux/ipc.h
中
/* used by in-kernel data structures */
// Header shared by every kind of IPC object. Generic IPC code is handed a
// pointer to this struct and recovers the enclosing object with container_of.
struct kern_ipc_perm
{
	spinlock_t lock;
	bool deleted;
	int id; // IPC object id
	key_t key; // IPC object key, i.e. the value the user specified when creating the shared memory
	kuid_t uid; // user id of the IPC object's owner
	kgid_t gid; // group id
	kuid_t cuid; // user id of the creator
	kgid_t cgid;
	umode_t mode; // newseg() stores (shmflg & S_IRWXUGO) here
	unsigned long seq;
	void *security;
};
为啥有这样一个struct呢?因为这些权限、id、key是IPC对象都有的属性,所以比如表示semaphore的结构struct semid_kernel
中也有一个这样的struct kern_ipc_perm
。然后在传递IPC对象的时候,传的也是struct kern_ipc_perm
的指针,再用container_of
这样的宏获得外面的struct,这样就能用同一个函数操作3种IPC对象,达到较好的代码重用。
接下来我们看一下共享内存相关函数。首先它们都是系统调用,对应的用户API在libc里面,参数是相同的,只是libc中的API做了一些调用系统调用需要的日常工作(保护现场、恢复现场之类的),所以就直接看这个系统调用了。
声明在include/linux/syscalls.h
中
// Prototypes of the four System V shared memory system calls.
asmlinkage long sys_shmat(int shmid, char __user *shmaddr, int shmflg);
asmlinkage long sys_shmget(key_t key, size_t size, int flag);
asmlinkage long sys_shmdt(char __user *shmaddr);
asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf);
定义在ipc/shm.c
中
// shmget() system call: packs its three arguments into a struct ipc_params
// and hands them to the generic ipcget(), together with the shared-memory
// specific callbacks. Returns whatever ipcget() returns.
SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg)
{
	struct ipc_namespace *ns;
	// Callbacks used by the generic IPC code; .getnew (newseg) is the one
	// that actually creates a segment.
	static const struct ipc_ops shm_ops = {
		.getnew = newseg,
		.associate = shm_security,
		.more_checks = shm_more_checks,
	};
	struct ipc_params shm_params;

	// IPC namespace of the calling task.
	ns = current->nsproxy->ipc_ns;

	shm_params.key = key;
	shm_params.flg = shmflg;
	shm_params.u.size = size;

	return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params);
}
首先看到这个函数定义可能会很奇怪,不过这个SYSCALL_DEFINE3
的宏展开来最后形式肯定和.h文件中声明的一样,即还是long sys_shmget(key_t key, size_t size, int flag)
这个宏是为了修一个bug,纯粹黑科技,这里不提它。
然后这里实际调用的函数是ipcget()
。为了统一一个ipc的接口也是煞费苦心,共享内存、信号量、消息队列三种对象创建的时候都会调用这个函数,但其实创建的逻辑并不在这里。而在shm_ops
中的三个函数里。
顺便提一下其中的current->nsproxy->ipc_ns
。这个的类型是struct ipc_namespace
。它是啥呢?我们知道,共享内存这些进程间通信的数据结构是全局的,但有时候需要把他们隔离开,即某一组进程并不知道另外的进程的共享内存,它们只希望在组内共用这些东西,这样就不会与其他进程冲突。于是就煞费苦心在内核中加了一个namespace。只要在clone()
函数中加入CLONE_NEWIPC
标志就能创建一个新的IPC namespace。
那么这个IPC namespace和我们的共享内存的数据结构有什么关系呢,可以看一下结构体
// Bookkeeping for one set of IPC objects inside a namespace.
struct ipc_ids {
	int in_use; // number of live objects; ipc_findkey() uses it to bound its scan
	unsigned short seq; // sequence counter used when building ids (SEQ_MULTIPLIER * seq + index)
	struct rw_semaphore rwsem; // taken for writing by ipcget_public() and by do_shmat()'s cleanup path
	struct idr ipcs_idr; // id -> struct kern_ipc_perm lookup (see idr_find() in ipc_findkey())
	int next_id;
};
// Excerpt only: the remaining members are elided by the "..." below.
struct ipc_namespace {
	atomic_t count;
	struct ipc_ids ids[3]; // one ipc_ids per IPC type; shared memory lives in ids[2]
	...
};
比较重要的是其中的ids
,它存的是所有IPC对象的id,其中共享内存都存在ids[2]
中。而在ids[2]
中真正负责管理数据的是ipcs_idr
,它也是内核中一个煞费苦心弄出来的id管理机制,一个id可以对应任意唯一确定的对象。把它理解成一个数组就好。它们之间的关系大概如下图所示。
                                                        [0] struct kern_ipc_perm <==> struct shmid_kernel
struct ipc_namespace => struct ipc_ids => struct idr => [1] struct kern_ipc_perm <==> struct shmid_kernel
                                                        [2] struct kern_ipc_perm <==> struct shmid_kernel
好的,我们回头来看看shmget()
究竟干了啥,首先看一下ipcget()
// Common "get" entry point for IPC objects.
// A key of IPC_PRIVATE takes the ipcget_new() path; any other key goes
// through ipcget_public(), which looks the key up first.
// Returns the result of whichever helper was called.
int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
			const struct ipc_ops *ops, struct ipc_params *params)
{
	if (params->key == IPC_PRIVATE)
		return ipcget_new(ns, ids, ops, params);
	else
		return ipcget_public(ns, ids, ops, params);
}
如果传进来的参数是IPC_PRIVATE
(这个宏的值是0)的话,无论是什么mode,都会创建一块新的共享内存。如果非0,则会去已有的共享内存中找有没有这个key的,有就返回,没有就新建。
首先看一下新建的函数newseg()
// Create a new shared memory segment in namespace ns.
//
// params supplies the key, the shmget() flags and the requested size.
// Returns the new segment's IPC id on success, or a negative errno:
//   -EINVAL  size outside [SHMMIN, ns->shm_ctlmax], or unknown huge page size
//   -ENOSPC  page count overflows or would exceed ns->shm_ctlall
//   -ENOMEM  the descriptor could not be allocated
// plus whatever security_shm_alloc(), the file setup helpers or ipc_addid()
// report.
static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
{
	key_t key = params->key;
	int shmflg = params->flg;
	size_t size = params->u.size;
	int error;
	struct shmid_kernel *shp;
	// size rounded up to whole pages
	size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	struct file *file;
	// "SYSV" + 8 hex digits + terminating NUL = 13 bytes
	char name[13];
	int id;
	vm_flags_t acctflag = 0;

	if (size < SHMMIN || size > ns->shm_ctlmax)
		return -EINVAL;

	// the round-up above wrapped around
	if (numpages << PAGE_SHIFT < size)
		return -ENOSPC;

	// namespace-wide page total would wrap or exceed the limit
	if (ns->shm_tot + numpages < ns->shm_tot ||
			ns->shm_tot + numpages > ns->shm_ctlall)
		return -ENOSPC;

	shp = ipc_rcu_alloc(sizeof(*shp));
	if (!shp)
		return -ENOMEM;

	shp->shm_perm.key = key;
	shp->shm_perm.mode = (shmflg & S_IRWXUGO);
	shp->mlock_user = NULL;

	shp->shm_perm.security = NULL;
	error = security_shm_alloc(shp);
	if (error) {
		// this early exit releases shp with ipc_rcu_free, unlike the
		// no_file path below, which uses shm_rcu_free
		ipc_rcu_putref(shp, ipc_rcu_free);
		return error;
	}

	// set up the file whose contents are the shared memory itself
	sprintf(name, "SYSV%08x", key);
	if (shmflg & SHM_HUGETLB) {
		struct hstate *hs;
		size_t hugesize;

		hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
		if (!hs) {
			error = -EINVAL;
			goto no_file;
		}
		hugesize = ALIGN(size, huge_page_size(hs));

		/* hugetlb_file_setup applies strict accounting */
		if (shmflg & SHM_NORESERVE)
			acctflag = VM_NORESERVE;
		file = hugetlb_file_setup(name, hugesize, acctflag,
				  &shp->mlock_user, HUGETLB_SHMFS_INODE,
				(shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
	} else {
		/*
		 * Do not allow no accounting for OVERCOMMIT_NEVER, even
		 * if it's asked for.
		 */
		if  ((shmflg & SHM_NORESERVE) &&
				sysctl_overcommit_memory != OVERCOMMIT_NEVER)
			acctflag = VM_NORESERVE;
		file = shmem_kernel_file_setup(name, size, acctflag);
	}
	error = PTR_ERR(file);
	if (IS_ERR(file))
		goto no_file;

	// register the descriptor in the namespace's id table and obtain its id
	id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
	if (id < 0) {
		error = id;
		goto no_id;
	}

	shp->shm_cprid = task_tgid_vnr(current);
	shp->shm_lprid = 0;
	shp->shm_atim = shp->shm_dtim = 0;
	shp->shm_ctim = get_seconds();
	shp->shm_segsz = size;
	shp->shm_nattch = 0;
	shp->shm_file = file;
	shp->shm_creator = current;
	// link the segment into the creating task's list of segments
	list_add(&shp->shm_clist, &current->sysvshm.shm_clist);

	/*
	 * shmid gets reported as "inode#" in /proc/pid/maps.
	 * proc-ps tools use this. Changing this will break them.
	 */
	file_inode(file)->i_ino = shp->shm_perm.id;

	ns->shm_tot += numpages;
	error = shp->shm_perm.id;

	// NOTE(review): presumably ipc_addid() returns with the object locked
	// inside an RCU read-side section; these two calls release them - confirm.
	ipc_unlock_object(&shp->shm_perm);
	rcu_read_unlock();
	return error;

no_id:
	if (is_file_hugepages(file) && shp->mlock_user)
		user_shm_unlock(size, shp->mlock_user);
	fput(file);
no_file:
	ipc_rcu_putref(shp, shm_rcu_free);
	return error;
}
这个函数首先几个if检查size是不是合法的参数,并且检查有没有足够的pages。然后调用ipc_rcu_alloc()
函数给共享内存数据结构shp分配空间。然后把一些参数写到shp的shm_perm成员中。然后sprintf下面那个大的if-else是为表示共享内存内容的file分配空间。再然后ipc_addid()
是一个比较重要的函数,它把刚才新建的这个共享内存的数据结构的指针加入到namespace的ids里,即可以想象成加入到数组里,并获得一个可以找到它的id。这里的id并不完全是数组的下标,因为要避免重复,所以这里有一个简单的机制来保证生成的id几乎是unique的,即ids里面有个seq变量,每次新加入共享内存对象时都会加1,而真正的id是这样生成的SEQ_MULTIPLIER * seq + id
。然后初始化一些成员,再把这个数据结构的指针加到当前进程的一个list里。这个函数的工作就基本完成了。
接下来我们再看一下如果创建时传入一个已有的key,即ipcget_public()
的逻辑
// Look up params->key in ids; create the object if it is missing and
// IPC_CREAT is set.
//
// Returns, depending on the path taken:
//   -ENOENT  key not found and IPC_CREAT not set
//   -EEXIST  key found but both IPC_CREAT and IPC_EXCL were set
//   result of ops->getnew()       when a new object is created
//   error from ops->more_checks() when that callback rejects the request
//   result of ipc_check_perms()   otherwise (the IPC id on success)
static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids,
		const struct ipc_ops *ops, struct ipc_params *params)
{
	struct kern_ipc_perm *ipcp;
	int flg = params->flg;
	int err;

	/*
	 * Take the lock as a writer since we are potentially going to add
	 * a new entry + read locks are not "upgradable"
	 */
	down_write(&ids->rwsem);
	ipcp = ipc_findkey(ids, params->key);
	if (ipcp == NULL) {
		/* key not used */
		if (!(flg & IPC_CREAT))
			err = -ENOENT;
		else
			err = ops->getnew(ns, params);
	} else {
		/* ipc object has been locked by ipc_findkey() */

		if (flg & IPC_CREAT && flg & IPC_EXCL)
			err = -EEXIST;
		else {
			err = 0;
			if (ops->more_checks)
				err = ops->more_checks(ipcp, params);
			if (!err)
				/*
				 * ipc_check_perms returns the IPC id on
				 * success
				 */
				err = ipc_check_perms(ns, ipcp, ops, params);
		}
		// drops the lock taken by ipc_findkey(); only reached on the "found" branch
		ipc_unlock(ipcp);
	}
	up_write(&ids->rwsem);

	return err;
}
逻辑非常简单,先去找有没有这个key。没有的话还是创建一个新的,注意ops->getnew()
对应的就是刚才的newseg()
函数。如果找到了就判断一下权限有没有问题,没有问题就直接返回IPC id。
可以再看下ipc_findkey()
这个函数
// Linear search of ids for an object whose key equals key.
//
// next_id walks the id space; total counts only the occupied slots that were
// examined, so the scan ends once ids->in_use live objects have been seen.
// On a match the object is returned locked (ipc_lock_object) with
// rcu_read_lock() still held, and the caller has to release it -
// ipcget_public() does so through ipc_unlock().
// Returns NULL when no object carries the key.
static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key)
{
	struct kern_ipc_perm *ipc;
	int next_id;
	int total;

	for (total = 0, next_id = 0; total < ids->in_use; next_id++) {
		ipc = idr_find(&ids->ipcs_idr, next_id);

		// empty slot: does not count towards total
		if (ipc == NULL)
			continue;

		if (ipc->key != key) {
			total++;
			continue;
		}

		rcu_read_lock();
		ipc_lock_object(ipc);
		return ipc;
	}

	return NULL;
}
逻辑也很简单,注意到ids->ipcs_idr
就是之前提到的Integer ID Management机制,里面存的就是shmid和对象一一对应的关系。然后这里可以看到ids->in_use
表示的是共享内存的个数,由于中间的有些可能删掉了,所以total在找到一个不为空的共享内存的时候才++。然后我们也可以看到,这里对重复的key并没有做任何处理。所以我们在编程的时候也应该避免直接约定用某一个数字当key。
接下来我们看一下shmat()
,它的逻辑全在do_shmat()
中,所以我们直接看这个函数。
// Attach the shared memory segment shmid to the calling process.
//
// shmid    id of the segment (negative ids are rejected with -EINVAL)
// shmaddr  requested address, or NULL to let the mapping code choose one
// shmflg   SHM_RND / SHM_REMAP / SHM_RDONLY / SHM_EXEC modifiers
// raddr    out: receives the value returned by do_mmap_pgoff()
// shmlba   alignment that a caller-supplied address is checked against
//
// Returns 0 on success or a negative errno.
long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
	      unsigned long shmlba)
{
	struct shmid_kernel *shp;
	unsigned long addr;
	unsigned long size;
	struct file *file;
	int err;
	unsigned long flags;
	unsigned long prot;
	int acc_mode;
	struct ipc_namespace *ns;
	struct shm_file_data *sfd;
	struct path path;
	fmode_t f_mode;
	unsigned long populate = 0;

	// Step 1: validate / align the requested address and pick mmap flags.
	err = -EINVAL;
	if (shmid < 0)
		goto out;
	else if ((addr = (ulong)shmaddr)) {
		if (addr & (shmlba - 1)) {
			if (shmflg & SHM_RND)
				addr &= ~(shmlba - 1);	   /* round down */
			else
#ifndef __ARCH_FORCE_SHMLBA
				if (addr & ~PAGE_MASK)
#endif
					goto out;
		}
		flags = MAP_SHARED | MAP_FIXED;
	} else {
		// SHM_REMAP makes no sense without an explicit address
		if ((shmflg & SHM_REMAP))
			goto out;

		flags = MAP_SHARED;
	}

	// Step 2: translate the attach flags into protection / access modes.
	if (shmflg & SHM_RDONLY) {
		prot = PROT_READ;
		acc_mode = S_IRUGO;
		f_mode = FMODE_READ;
	} else {
		prot = PROT_READ | PROT_WRITE;
		acc_mode = S_IRUGO | S_IWUGO;
		f_mode = FMODE_READ | FMODE_WRITE;
	}
	if (shmflg & SHM_EXEC) {
		prot |= PROT_EXEC;
		acc_mode |= S_IXUGO;
	}

	// Step 3: look the segment up by id and check permissions.
	/*
	 * We cannot rely on the fs check since SYSV IPC does have an
	 * additional creator id...
	 */
	ns = current->nsproxy->ipc_ns;
	rcu_read_lock();
	shp = shm_obtain_object_check(ns, shmid);
	if (IS_ERR(shp)) {
		err = PTR_ERR(shp);
		goto out_unlock;
	}

	err = -EACCES;
	if (ipcperms(ns, &shp->shm_perm, acc_mode))
		goto out_unlock;

	err = security_shm_shmat(shp, shmaddr, shmflg);
	if (err)
		goto out_unlock;

	ipc_lock_object(&shp->shm_perm);

	/* check if shm_destroy() is tearing down shp */
	if (!ipc_valid_object(&shp->shm_perm)) {
		ipc_unlock_object(&shp->shm_perm);
		err = -EIDRM;
		goto out_unlock;
	}

	// Step 4: pin the backing path and count this attach while locked.
	path = shp->shm_file->f_path;
	path_get(&path);
	shp->shm_nattch++;
	size = i_size_read(d_inode(path.dentry));
	ipc_unlock_object(&shp->shm_perm);
	rcu_read_unlock();

	// Step 5: build the per-attach file that will actually be mmap'ed.
	err = -ENOMEM;
	sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
	if (!sfd) {
		path_put(&path);
		goto out_nattch;
	}

	file = alloc_file(&path, f_mode,
			  is_file_hugepages(shp->shm_file) ?
				&shm_file_operations_huge :
				&shm_file_operations);
	err = PTR_ERR(file);
	if (IS_ERR(file)) {
		kfree(sfd);
		path_put(&path);
		goto out_nattch;
	}

	file->private_data = sfd;
	file->f_mapping = shp->shm_file->f_mapping;
	sfd->id = shp->shm_perm.id;
	sfd->ns = get_ipc_ns(ns);
	sfd->file = shp->shm_file;
	sfd->vm_ops = NULL;

	err = security_mmap_file(file, prot, flags);
	if (err)
		goto out_fput;

	// Step 6: map it into the caller's address space.
	down_write(&current->mm->mmap_sem);
	if (addr && !(shmflg & SHM_REMAP)) {
		err = -EINVAL;
		// address range wraps around
		if (addr + size < addr)
			goto invalid;

		// range overlaps an existing mapping
		if (find_vma_intersection(current->mm, addr, addr + size))
			goto invalid;
	}

	addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate);
	*raddr = addr;
	err = 0;
	if (IS_ERR_VALUE(addr))
		err = (long)addr;
invalid:
	up_write(&current->mm->mmap_sem);
	if (populate)
		mm_populate(addr, populate);

out_fput:
	fput(file);

out_nattch:
	// NOTE(review): the success path falls through to here as well, so the
	// shm_nattch increment from step 4 is always undone; presumably the
	// mapping itself keeps its own count - confirm against shm_open().
	down_write(&shm_ids(ns).rwsem);
	shp = shm_lock(ns, shmid);
	shp->shm_nattch--;
	if (shm_may_destroy(ns, shp))
		shm_destroy(ns, shp);
	else
		shm_unlock(shp);
	up_write(&shm_ids(ns).rwsem);
	return err;

out_unlock:
	rcu_read_unlock();
out:
	return err;
}
首先检查shmaddr的合法性并进行对齐,即调整为shmlba的整数倍。如果传入addr是0,前面检查部分只会加上一个MAP_SHARED标志,因为后面的mmap会自动为其分配地址。然后从那一段两行的注释开始,函数通过shmid尝试获取共享内存对象,并进行权限检查。然后修改shp中的一些数据,比如连接进程数加一。然后是通过alloc_file()
创建真正的要做mmap的file。在mmap之前还要对地址空间进行检查,检查是否和别的地址重叠,是否够用。实际的映射工作就在do_mmap_pgoff()
函数中做了。
// shmdt() system call: unmap the shared memory segment attached at shmaddr
// from the calling process.
//
// Returns 0 if a matching shm mapping was found and unmapped, -EINVAL if
// shmaddr is not page aligned or no matching mapping exists.
SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long addr = (unsigned long)shmaddr;
	int retval = -EINVAL;
#ifdef CONFIG_MMU
	loff_t size = 0;
	struct file *file;
	struct vm_area_struct *next;
#endif

	if (addr & ~PAGE_MASK)
		return retval;

	down_write(&mm->mmap_sem);

	/*
	 * This function tries to be smart and unmap shm segments that
	 * were modified by partial mlock or munmap calls:
	 * - It first determines the size of the shm segment that should be
	 *   unmapped: It searches for a vma that is backed by shm and that
	 *   started at address shmaddr. It records it's size and then unmaps
	 *   it.
	 * - Then it unmaps all shm vmas that started at shmaddr and that
	 *   are within the initially determined size and that are from the
	 *   same shm segment from which we determined the size.
	 * Errors from do_munmap are ignored: the function only fails if
	 * it's called with invalid parameters or if it's called to unmap
	 * a part of a vma. Both calls in this function are for full vmas,
	 * the parameters are directly copied from the vma itself and always
	 * valid - therefore do_munmap cannot fail. (famous last words?)
	 */
	/*
	 * If it had been mremap()'d, the starting address would not
	 * match the usual checks anyway. So assume all vma's are
	 * above the starting address given.
	 */
	vma = find_vma(mm, addr);

#ifdef CONFIG_MMU
	// Pass 1: find the first shm vma belonging to the attach at addr,
	// remember its file and size, and unmap it.
	while (vma) {
		// saved before do_munmap() is called on vma
		next = vma->vm_next;

		/*
		 * Check if the starting address would match, i.e. it's
		 * a fragment created by mprotect() and/or munmap(), or it
		 * otherwise it starts at this address with no hassles.
		 */
		if ((vma->vm_ops == &shm_vm_ops) &&
			(vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) {

			/*
			 * Record the file of the shm segment being
			 * unmapped.  With mremap(), someone could place
			 * page from another segment but with equal offsets
			 * in the range we are unmapping.
			 */
			file = vma->vm_file;
			size = i_size_read(file_inode(vma->vm_file));
			do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
			/*
			 * We discovered the size of the shm segment, so
			 * break out of here and fall through to the next
			 * loop that uses the size information to stop
			 * searching for matching vma's.
			 */
			retval = 0;
			vma = next;
			break;
		}
		vma = next;
	}

	/*
	 * We need look no further than the maximum address a fragment
	 * could possibly have landed at. Also cast things to loff_t to
	 * prevent overflows and make comparisons vs. equal-width types.
	 */
	// Pass 2: unmap any remaining fragments of the same segment. If pass 1
	// found nothing, vma is NULL here and this loop does not run.
	size = PAGE_ALIGN(size);
	while (vma && (loff_t)(vma->vm_end - addr) <= size) {
		next = vma->vm_next;

		/* finding a matching vma now does not alter retval */
		if ((vma->vm_ops == &shm_vm_ops) &&
		    ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) &&
		    (vma->vm_file == file))
			do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
		vma = next;
	}

#else /* CONFIG_MMU */
	/* under NOMMU conditions, the exact address to be destroyed must be
	 * given */
	if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) {
		do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
		retval = 0;
	}

#endif

	up_write(&mm->mmap_sem);
	return retval;
}
接下来是shmdt()
,这个函数非常简单,找到传入的shmaddr对应的虚拟内存数据结构vma,检查它的地址是不是正确的,然后调用do_munmap()
函数断开对共享内存的连接。注意此操作并不会销毁共享内存,即使没有进程连接到它也不会,只有手动调用shmctl(id, IPC_RMID, NULL)
才能销毁。
shmctl()
总体就是一个switch语句,大多数做的是读取信息的或者设置标志位的工作,这里不赘述。