一、库函数shmget()--共享内存区的创建与寻找
asmlinkage long sys_shmget (key_t key, size_t size, int shmflg) { struct shmid_kernel *shp; int err, id = 0; down(&shm_ids.sem); if (key == IPC_PRIVATE) { err = newseg(key, shmflg, size);//分配一个共享内存区供本进程专用,最后返回的是一体化的标示号 } else if ((id = ipc_findkey(&shm_ids, key)) == -1) {//在shm_ids寻找shmid_kernel结构(共享内存区),如果没有找到,id为-1。如果找到了id为标示号。 if (!(shmflg & IPC_CREAT))//没有找到也不允许创建,那么就出错返回 err = -ENOENT; else err = newseg(key, shmflg, size);//否则创建一个共享内存区 } else if ((shmflg & IPC_CREAT) && (shmflg & IPC_EXCL)) {//如果找到了,但是要求的是创建,那么也返回出错 err = -EEXIST; } else {//如果找到了,也不要求创建,就是正常情况下了 shp = shm_lock(id);//通过标示号id,获取共享内存区 if(shp==NULL) BUG(); if (shp->shm_segsz < size) err = -EINVAL; else if (ipcperms(&shp->shm_perm, shmflg)) err = -EACCES; else err = shm_buildid(id, shp->shm_perm.seq);//最后返回的还是一体化参数 shm_unlock(id); } up(&shm_ids.sem); return err;//无论是创建还是查找,最后都返回的是一体化的标示号 }键值IPC_PRIVATE,即0,是特殊的,它表示要分配一个共享内存区供本进程专用。其他键值则表示要创建或寻找的是"共享"内存区。而标志位IPC_CREAT则表示目的在于创建。
1、当键值是IPC_PRIVATE时,会调用newseg,分配一个共享内存区供本进程专用,代码如下:
static int newseg (key_t key, int shmflg, size_t size) { int error; struct shmid_kernel *shp; int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT; struct file * file; char name[13]; int id; if (size < SHMMIN || size > shm_ctlmax) return -EINVAL; if (shm_tot + numpages >= shm_ctlall) return -ENOSPC; shp = (struct shmid_kernel *) kmalloc (sizeof (*shp), GFP_USER);//分配shmid_kernel结构 if (!shp) return -ENOMEM; sprintf (name, "SYSV%08x", key); file = shmem_file_setup(name, size);//在特殊文件系统"shm"中建立映射文件 error = PTR_ERR(file); if (IS_ERR(file)) goto no_file; error = -ENOSPC; id = shm_addid(shp);//将shmid_kernel结构链入shm_ids if(id == -1) goto no_id; shp->shm_perm.key = key; shp->shm_flags = (shmflg & S_IRWXUGO); shp->shm_cprid = current->pid; shp->shm_lprid = 0; shp->shm_atim = shp->shm_dtim = 0; shp->shm_ctim = CURRENT_TIME; shp->shm_segsz = size; shp->shm_nattch = 0; shp->id = shm_buildid(id,shp->shm_perm.seq);//将这个标识号转换成一个一体化的标示号 shp->shm_file = file;//指向新建立的file file->f_dentry->d_inode->i_ino = shp->id; file->f_op = &shm_file_operations;//最后又重新设置了一遍f_op,这里是shm_file_operations,而不是shmem_file_operations shm_tot += numpages; shm_unlock (id); return shp->id;//返回的是一体化的标示号 no_id: fput(file); no_file: kfree(shp); return error; }shmid_kernel结构如下:
/* Kernel-private descriptor of one shared memory segment. */
struct shmid_kernel /* private to the kernel */
{
    struct kern_ipc_perm shm_perm;  /* key, owner ids and sequence number */
    struct file * shm_file;         /* backing file set up by newseg() */
    int id;                         /* combined identifier from shm_buildid() */
    unsigned long shm_nattch;       /* attach count; adjusted in sys_shmat() */
    unsigned long shm_segsz;        /* segment size in bytes */
    time_t shm_atim;                /* zeroed by newseg(); presumably last attach time */
    time_t shm_dtim;                /* zeroed by newseg(); presumably last detach time */
    time_t shm_ctim;                /* set to CURRENT_TIME by newseg() */
    pid_t shm_cprid;                /* pid of the creating process */
    pid_t shm_lprid;                /* zeroed by newseg(); presumably pid of last operator */
};
shmem_file_setup,在特殊文件系统"shm"中建立映射文件,代码如下:
struct file *shmem_file_setup(char * name, loff_t size) { int error; struct file *file; struct inode * inode; struct dentry *dentry, *root; struct qstr this; int vm_enough_memory(long pages); error = -ENOMEM; if (!vm_enough_memory((size) >> PAGE_SHIFT)) goto out; this.name = name; this.len = strlen(name); this.hash = 0; /* will go */ root = shmem_fs_type.kern_mnt->mnt_root;//shm特殊文件系统的根节点的dentry结构 dentry = d_alloc(root, &this);//分配shm节点的dentry结构 if (!dentry) goto out; error = -ENFILE; file = get_empty_filp(); if (!file) goto put_dentry; error = -ENOSPC; inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);//分配shm节点的inode结构 if (!inode) goto close_file; d_instantiate(dentry, inode);//shm节点的dentry结构和shm节点的inode结构相关联 dentry->d_inode->i_size = size; file->f_vfsmnt = mntget(shmem_fs_type.kern_mnt); file->f_dentry = dentry;//指向刚刚的dentry file->f_op = &shmem_file_operations;//设置如下 file->f_mode = FMODE_WRITE | FMODE_READ; inode->i_nlink = 0; /* It is unlinked */ return(file); close_file: put_filp(file); put_dentry: dput (dentry); out: return ERR_PTR(error); }其中shmem_fs_type.kern_mnt->mnt_root是在init_shmem_fs中建立的。
static DECLARE_FSTYPE(shmem_fs_type, "shm", shmem_read_super, FS_LITTER);

/*
 * Register the "shm" filesystem type and mount it inside the kernel.
 * Returns 0 on success, otherwise the error from register_filesystem()
 * or kern_mount(); a failed mount unregisters the type again.
 */
static int __init init_shmem_fs(void)
{
    struct vfsmount *mnt;
    int err;

    err = register_filesystem(&shmem_fs_type);
    if (err) {
        printk (KERN_ERR "Could not register shmem fs\n");
        return err;
    }

    mnt = kern_mount(&shmem_fs_type);
    if (IS_ERR (mnt)) {
        printk (KERN_ERR "could not kern_mount shmem fs\n");
        unregister_filesystem(&shmem_fs_type);
        return PTR_ERR(mnt);
    }

    devfs_mk_dir (NULL, "shm", NULL);
    return 0;
}
shmem_get_inode,分配shm节点的inode结构,代码如下:
struct inode *shmem_get_inode(struct super_block *sb, int mode, int dev) { struct inode * inode; spin_lock (&sb->u.shmem_sb.stat_lock); if (!sb->u.shmem_sb.free_inodes) { spin_unlock (&sb->u.shmem_sb.stat_lock); return NULL; } sb->u.shmem_sb.free_inodes--; spin_unlock (&sb->u.shmem_sb.stat_lock); inode = new_inode(sb); if (inode) { inode->i_mode = mode; inode->i_uid = current->fsuid; inode->i_gid = current->fsgid; inode->i_blksize = PAGE_CACHE_SIZE; inode->i_blocks = 0; inode->i_rdev = to_kdev_t(dev); inode->i_mapping->a_ops = &shmem_aops;//shmem_aops设置如下 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; spin_lock_init (&inode->u.shmem_i.lock); switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); break; case S_IFREG://i_op和i_fop设置如下 inode->i_op = &shmem_inode_operations; inode->i_fop = &shmem_file_operations; break; case S_IFDIR: inode->i_op = &shmem_dir_inode_operations; inode->i_fop = &shmem_dir_operations; break; case S_IFLNK: inode->i_op = &page_symlink_inode_operations; break; } spin_lock (&shmem_ilock); list_add (&inode->u.shmem_i.list, &shmem_inodes); spin_unlock (&shmem_ilock); } return inode; }inode->i_op = &shmem_inode_operations,代码如下:
/* Inode operations for regular shmem files: only truncate is provided. */
static struct inode_operations shmem_inode_operations = {
    .truncate = shmem_truncate,
};
inode->i_fop = &shmem_file_operations,代码如下:
/* File operations for regular shmem files: only mmap is provided. */
static struct file_operations shmem_file_operations = {
    .mmap = shmem_mmap
};
inode->i_mapping->a_ops = &shmem_aops,代码如下:
/* Address-space operations for shmem inodes: only writepage is provided. */
static struct address_space_operations shmem_aops = {
    .writepage = shmem_writepage
};
返回到 shmem_file_setup,file->f_op = &shmem_file_operations,如下:
/* File operations installed by shmem_file_setup(): only mmap is provided. */
static struct file_operations shmem_file_operations = {
    .mmap = shmem_mmap
};
返回到newseg,shm_addid,将shmid_kernel结构链入shm_ids,代码如下:
/*
 * shm_addid - register a segment in the global shm_ids table.
 * @shp: segment descriptor; its embedded shm_perm (a kern_ipc_perm) is
 *       what gets stored in the table.
 *
 * Returns the slot index chosen by ipc_addid(), or -1 when no slot is free.
 */
static inline int shm_addid(struct shmid_kernel *shp)
{
    return ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni+1);
}
int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) { int id; size = grow_ary(ids,size); for (id = 0; id < size; id++) { if(ids->entries[id].p == NULL) goto found; } return -1; found: ids->in_use++; if (id > ids->max_id) ids->max_id = id; new->cuid = new->uid = current->euid; new->gid = new->cgid = current->egid; new->seq = ids->seq++; if(ids->seq > ids->seq_max) ids->seq = 0; spin_lock(&ids->ary); ids->entries[id].p = new;//把shp->shem_perm链入到全局的shm_ids中 return id;//返回标识号 }其中shm_ids如下:
/* Id table shared by the objects of one IPC mechanism. */
struct ipc_ids {
    int size;                   /* compared against the slot index in ipc_lock() */
    int in_use;                 /* incremented for every object added by ipc_addid() */
    int max_id;                 /* highest slot index handed out; bounds the ipc_findkey() scan */
    unsigned short seq;         /* next sequence number to assign */
    unsigned short seq_max;     /* seq wraps to 0 once it exceeds this */
    struct semaphore sem;       /* held by sys_shmget()/sys_shmat() around table updates */
    spinlock_t ary;             /* taken around accesses to entries[] */
    struct ipc_id* entries;     /* the slot array */
};

/* The id table for shared memory segments. */
static struct ipc_ids shm_ids;

/* One slot of the table: NULL when free. */
struct ipc_id {
    struct kern_ipc_perm* p;
};
继续执行,shm_buildid,将这个标识号转换成一个一体化的标示号,代码如下:
#define shm_buildid(id, seq) \ ipc_buildid(&shm_ids, id, seq) extern inline int ipc_buildid(struct ipc_ids* ids, int id, int seq) { return SEQ_MULTIPLIER*seq + id; }
int ipc_findkey(struct ipc_ids* ids, key_t key) { int id; struct kern_ipc_perm* p; for (id = 0; id <= ids->max_id; id++) { p = ids->entries[id].p; if(p==NULL) continue; if (key == p->key) return id;//返回标示号,而不是一体化标示号 } return -1; }如果找到了,也不要求创建,就是正常情况下了,执行shm_lock,通过标识号id获取共享内存区,如下:
#define shm_lock(id) ((struct shmid_kernel*)ipc_lock(&shm_ids,id)) extern inline struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id) { struct kern_ipc_perm* out; int lid = id % SEQ_MULTIPLIER;//无论是标示号还是一体化标示号都通吃 if(lid > ids->size) return NULL; spin_lock(&ids->ary); out = ids->entries[lid].p; if(out==NULL) spin_unlock(&ids->ary); return out; }
二、库函数shmat()--建立共享内存区的映射
通过shmget()以给定键值创建了一个共享内存区,或者取得了已创建共享内存区的一体化的标示号以后,还要通过shmat()将这个内存区映射到本进程的虚拟空间,sys_shmat代码如下:
asmlinkage long sys_shmat (int shmid, char *shmaddr, int shmflg, ulong *raddr)//shmaddr为当前进程所要求映射的目标地址,也就是映射后该共享内存区在这个进程的用户空间中的起始地址 { struct shmid_kernel *shp; unsigned long addr; struct file * file; int err; unsigned long flags; unsigned long prot; unsigned long o_flags; int acc_mode; void *user_addr; if (shmid < 0) return -EINVAL; if ((addr = (ulong)shmaddr)) { if (addr & (SHMLBA-1)) { if (shmflg & SHM_RND) addr &= ~(SHMLBA-1); /* round down */ else return -EINVAL; } flags = MAP_SHARED | MAP_FIXED; } else flags = MAP_SHARED; if (shmflg & SHM_RDONLY) { prot = PROT_READ; o_flags = O_RDONLY; acc_mode = S_IRUGO; } else { prot = PROT_READ | PROT_WRITE; o_flags = O_RDWR; acc_mode = S_IRUGO | S_IWUGO; } /* * We cannot rely on the fs check since SYSV IPC does have an * aditional creator id... */ shp = shm_lock(shmid);//通过一体化标示号找到共享内存区 if(shp == NULL) return -EINVAL; if (ipcperms(&shp->shm_perm, acc_mode)) { shm_unlock(shmid); return -EACCES; } file = shp->shm_file;//找到file结构 shp->shm_nattch++; shm_unlock(shmid); down(¤t->mm->mmap_sem); user_addr = (void *) do_mmap (file, addr, file->f_dentry->d_inode->i_size, prot, flags, 0);//建立起文件与虚拟空间的映射 up(¤t->mm->mmap_sem); down (&shm_ids.sem); if(!(shp = shm_lock(shmid))) BUG(); shp->shm_nattch--; if(shp->shm_nattch == 0 && shp->shm_flags & SHM_DEST) shm_destroy (shp); shm_unlock(shmid); up (&shm_ids.sem); *raddr = (unsigned long) user_addr; err = 0; if (IS_ERR(user_addr)) err = PTR_ERR(user_addr); return err; }do_mmap,建立起文件与虚拟空间的映射。代码如下:
/*
 * do_mmap - byte-offset front end to do_mmap_pgoff().
 *
 * Returns -EINVAL (as an unsigned long) when offset + PAGE_ALIGN(len)
 * wraps around or when offset is not page aligned; otherwise forwards to
 * do_mmap_pgoff() with the offset converted to a page index.
 */
static inline unsigned long do_mmap(struct file *file, unsigned long addr,
    unsigned long len, unsigned long prot,
    unsigned long flag, unsigned long offset)
{
    /* the end of the mapped range must not wrap around */
    if ((offset + PAGE_ALIGN(len)) < offset)
        return -EINVAL;

    /* only page-aligned offsets can be expressed as a page index */
    if (offset & ~PAGE_MASK)
        return -EINVAL;

    return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
}
unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff) { struct mm_struct * mm = current->mm; struct vm_area_struct * vma; int correct_wcount = 0; int error; ...... if (flags & MAP_FIXED) { if (addr & ~PAGE_MASK) return -EINVAL; } else { addr = get_unmapped_area(addr, len);//如果addr为0,那么就自行分配一个虚拟空间 if (!addr) return -ENOMEM; } /* Determine the object being mapped and call the appropriate * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);//分配了vm_area_struct结构 if (!vma) return -ENOMEM; vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; vma->vm_flags = vm_flags(prot,flags) | mm->def_flags; if (file) { VM_ClearReadHint(vma); vma->vm_raend = 0; if (file->f_mode & FMODE_READ) vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (flags & MAP_SHARED) { vma->vm_flags |= VM_SHARED | VM_MAYSHARE; /* This looks strange, but when we don't have the file open * for writing, we can demote the shared mapping to a simpler * private mapping. That also takes care of a security hole * with ptrace() writing to a shared mapping without write * permissions. * * We leave the VM_MAYSHARE bit on, just to get correct output * from /proc/xxx/maps.. */ if (!(file->f_mode & FMODE_WRITE)) vma->vm_flags &= ~(VM_MAYWRITE | VM_SHARED); } } else { vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (flags & MAP_SHARED) vma->vm_flags |= VM_SHARED | VM_MAYSHARE; } vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f]; vma->vm_ops = NULL; vma->vm_pgoff = pgoff; vma->vm_file = NULL; vma->vm_private_data = NULL; /* Clear old maps */ error = -ENOMEM; if (do_munmap(mm, addr, len)) goto free_vma; /* Check against address space limit. */ if ((mm->total_vm << PAGE_SHIFT) + len > current->rlim[RLIMIT_AS].rlim_cur) goto free_vma; /* Private writable mapping? 
Check memory availability.. */ if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE && !(flags & MAP_NORESERVE) && !vm_enough_memory(len >> PAGE_SHIFT)) goto free_vma; if (file) { if (vma->vm_flags & VM_DENYWRITE) { error = deny_write_access(file); if (error) goto free_vma; correct_wcount = 1; } vma->vm_file = file;//这里是重点 get_file(file); error = file->f_op->mmap(file, vma);//最后设置成shmem_mmap if (error) goto unmap_and_free_vma; } else if (flags & MAP_SHARED) { error = shmem_zero_setup(vma); if (error) goto free_vma; } /* Can addr have changed?? * * Answer: Yes, several device drivers can do it in their * f_op->mmap method. -DaveM */ flags = vma->vm_flags; addr = vma->vm_start; insert_vm_struct(mm, vma); if (correct_wcount) atomic_inc(&file->f_dentry->d_inode->i_writecount); mm->total_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) { mm->locked_vm += len >> PAGE_SHIFT; make_pages_present(addr, addr + len); } return addr;//起始虚拟地址 unmap_and_free_vma: if (correct_wcount) atomic_inc(&file->f_dentry->d_inode->i_writecount); vma->vm_file = NULL; fput(file); /* Undo any partial mapping done by a device driver. */ flush_cache_range(mm, vma->vm_start, vma->vm_end); zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start); flush_tlb_range(mm, vma->vm_start, vma->vm_end); free_vma: kmem_cache_free(vm_area_cachep, vma); return error; }file->f_op->mmap(file, vma),最后设置成shmem_mmap,代码如下:
static int shm_mmap(struct file * file, struct vm_area_struct * vma) { UPDATE_ATIME(file->f_dentry->d_inode); vma->vm_ops = &shm_vm_ops; shm_inc(file->f_dentry->d_inode->i_ino); return 0; }
/* VM operations installed on every shared memory mapping by shm_mmap(). */
static struct vm_operations_struct shm_vm_ops = {
    .open   = shm_open,     /* callback for a new vm-area open */
    .close  = shm_close,    /* callback for when the vm-area is released */
    .nopage = shmem_nopage, /* called by do_no_page() on a missing page */
};
在sys_shmat()中实际上并没有建立页面的映射,而是把它推迟到了实际需要的时候。
三、所以,在将一块共享内存区纳入一个进程的存储空间以后,当其中的任何一个页面首次受到访问时就会因为“缺页”而产生一次页面异常。从do_page_fault()开始,顺着handle_mm_fault()、handle_pte_fault(),一直到do_no_page。在do_no_page()中,如果产生异常的地址所属区间的指针vm_ops指向一个vm_operations_struct数据结构,并且该结构中的函数指针nopage非零,就会调用这个函数来建立所在页面的映射表项。
static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int write_access, pte_t *page_table) { struct page * new_page; pte_t entry; if (!vma->vm_ops || !vma->vm_ops->nopage) return do_anonymous_page(mm, vma, page_table, write_access, address); /* * The third argument is "no_share", which tells the low-level code * to copy, not share the page even if sharing is possible. It's * essentially an early COW detection. */ new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);//对于共享内存来说,指向了shmem_page if (new_page == NULL) /* no page was available -- SIGBUS */ return 0; if (new_page == NOPAGE_OOM) return -1; ++mm->rss; /* * This silly early PAGE_DIRTY setting removes a race * due to the bad i386 page protection. But it's valid * for other architectures too. * * Note that if write_access is true, we either now have * an exclusive copy of the page, or this is a shared mapping, * so we can make it writable and dirty to avoid having to * handle that later. */ flush_page_to_ram(new_page); flush_icache_page(vma, new_page); entry = mk_pte(new_page, vma->vm_page_prot); if (write_access) { entry = pte_mkwrite(pte_mkdirty(entry)); } else if (page_count(new_page) > 1 && !(vma->vm_flags & VM_SHARED)) entry = pte_wrprotect(entry); set_pte(page_table, entry);//把页表项指向新申请的page,这样就建立了映射 /* no need to invalidate: a not-present page shouldn't be cached */ update_mmu_cache(vma, address, entry); return 2; /* Major fault */ }vma->vm_ops->nopage,对于共享内存来说,指向了shmem_page,代码如下:
struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, int no_share) { unsigned long size; struct page * page; unsigned int idx; swp_entry_t *entry; struct inode * inode = vma->vm_file->f_dentry->d_inode; struct address_space * mapping = inode->i_mapping; struct shmem_inode_info *info; idx = (address - vma->vm_start) >> PAGE_SHIFT; idx += vma->vm_pgoff; down (&inode->i_sem); size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;//页面数 page = NOPAGE_SIGBUS; if ((idx >= size) && (vma->vm_mm == current->mm)) goto out; /* retry, we may have slept */ page = __find_lock_page(mapping, idx, page_hash (mapping, idx)); if (page) goto cached_page; info = &inode->u.shmem_i; entry = shmem_swp_entry (info, idx); if (!entry) goto oom; if (entry->val) {//目前为0 unsigned long flags; /* Look it up and read it in.. */ page = lookup_swap_cache(*entry); if (!page) { lock_kernel(); swapin_readahead(*entry); page = read_swap_cache(*entry); unlock_kernel(); if (!page) goto oom; } /* We have to this with page locked to prevent races */ spin_lock (&info->lock); swap_free(*entry); lock_page(page); delete_from_swap_cache_nolock(page); *entry = (swp_entry_t) {0}; flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced) | (1 << PG_arch_1)); page->flags = flags | (1 << PG_dirty); add_to_page_cache_locked(page, mapping, idx); info->swapped--; spin_unlock (&info->lock); } else {//执行这里 spin_lock (&inode->i_sb->u.shmem_sb.stat_lock); if (inode->i_sb->u.shmem_sb.free_blocks == 0) goto no_space; inode->i_sb->u.shmem_sb.free_blocks--; spin_unlock (&inode->i_sb->u.shmem_sb.stat_lock); /* Ok, get a new page */ page = page_cache_alloc();//分配一个页面 if (!page) goto oom; clear_user_highpage(page, address); inode->i_blocks++; add_to_page_cache (page, mapping, idx);//这个函数是重点,此时的mapping是inode->mapping,而不是交换分区&swapper_space } /* We have the page */ SetPageUptodate (page); cached_page: UnlockPage (page); up(&inode->i_sem); if (no_share) { struct page 
*new_page = page_cache_alloc(); if (new_page) { copy_user_highpage(new_page, page, address); flush_page_to_ram(new_page); } else new_page = NOPAGE_OOM; page_cache_release(page); return new_page; } flush_page_to_ram (page); return(page); no_space: spin_unlock (&inode->i_sb->u.shmem_sb.stat_lock); oom: page = NOPAGE_OOM; out: up(&inode->i_sem); return page; }add_to_page_cache,将page加入到相关队里中去。相关代码请参考 Linux内核源代码情景分析-内存管理之用户页面的换入, 只不是此时的mapping是inode->mapping,而不是交换分区&swapper_space。
page->list链入mapping->clean_pages;
page->next_hash和page->pprev_hash链入全局的Hash表;
page->lru链入了全局的active_list;
page->mapping指向inode->i_mapping,也是在这里赋值的。
返回到do_no_page,把页表项指向新申请的page。这样就建立了映射。
假设有两个进程:一个通过sys_shmget创建共享内存区,另一个通过sys_shmget查找刚刚创建的共享内存区;两者都通过sys_shmat,将这个内存区映射到本进程的虚拟空间。
第一个进程如上面的步骤,建立了映射后,往共享内存区写数据。第二个进程会调用page = __find_lock_page(mapping, idx, page_hash (mapping, idx));找到刚刚分配的内存,并建立映射。这样第二个进程就能读取刚刚写入的数据。
四、当内存紧张时,共享内存区的页面也会被换出到交换分区,参考Linux内核源代码情景分析-内存管理之用户页面的定期换出。
kswapd内核线程:
1、refill_inactive_scan和swap_out,把活跃的页面变成不活跃脏的页面。挑选的原则是最近没有被访问,且age已减为0。
2、page_launder,把不活跃脏的页面变成不活跃干净的页面。
我们这里主要分析page_launder,算法如下:
if (PageDirty(page)) { int (*writepage)(struct page *) = page->mapping->a_ops->writepage;//还记得我们设置过shmem_writepage int result; if (!writepage) goto page_active; /* First time through? Move it to the back of the list */ if (!launder_loop) { list_del(page_lru); list_add(page_lru, &inactive_dirty_list); UnlockPage(page); continue; } /* OK, do a physical asynchronous write to swap. */ ClearPageDirty(page); page_cache_get(page); spin_unlock(&pagemap_lru_lock); result = writepage(page);//shmem_writepage page_cache_release(page); /* And re-start the thing.. */ spin_lock(&pagemap_lru_lock); if (result != 1) continue; /* writepage refused to do anything */ set_page_dirty(page); goto page_active; }
inode->i_mapping->a_ops = &shmem_aops,代码如下:
/* Address-space operations for shmem inodes: only writepage is provided. */
static struct address_space_operations shmem_aops = {
    .writepage = shmem_writepage
};
writepage(page),也就是shmem_writepage(page),代码如下:
static int shmem_writepage(struct page * page) { int error; struct shmem_inode_info *info; swp_entry_t *entry, swap; info = &page->mapping->host->u.shmem_i; if (info->locked) return 1; swap = __get_swap_page(2);//从交换设备上分配一个页面 if (!swap.val) return 1; spin_lock(&info->lock); entry = shmem_swp_entry (info, page->index);//根据物理页面号,通过这个函数在文件的swp_entry_t表中找到相应的表项,此表项表示一个页面在交换设备上的页面号,目前什么内容没有 if (!entry) /* this had been allocted on page allocation */ BUG(); error = -EAGAIN; if (entry->val) { __swap_free(swap, 2); goto out; } *entry = swap;//页面在交换设备上的页面号 error = 0; /* Remove the from the page cache */ lru_cache_del(page); remove_inode_page(page); /* Add it to the swap cache */ add_to_swap_cache(page, swap); page_cache_release(page); set_page_dirty(page); info->swapped++; out: spin_unlock(&info->lock); UnlockPage(page); return error; }
shmem_swp_entry,根据物理页面号,通过这个函数在文件的swp_entry_t表中找到相应的表项,此表项表示一个页面在交换设备上的页面号。
static swp_entry_t * shmem_swp_entry (struct shmem_inode_info *info, unsigned long index) { if (index < SHMEM_NR_DIRECT) return info->i_direct+index; index -= SHMEM_NR_DIRECT; if (index >= ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) return NULL; if (!info->i_indirect) { info->i_indirect = (swp_entry_t **) get_zeroed_page(GFP_USER); if (!info->i_indirect) return NULL; } if(!(info->i_indirect[index/ENTRIES_PER_PAGE])) { info->i_indirect[index/ENTRIES_PER_PAGE] = (swp_entry_t *) get_zeroed_page(GFP_USER); if (!info->i_indirect[index/ENTRIES_PER_PAGE]) return NULL; } return info->i_indirect[index/ENTRIES_PER_PAGE]+index%ENTRIES_PER_PAGE; }
/* Per-inode state of a shmem file. */
struct shmem_inode_info {
    spinlock_t lock;                            /* taken around swap-entry updates */
    swp_entry_t i_direct[SHMEM_NR_DIRECT];      /* for the first blocks */
    swp_entry_t **i_indirect;                   /* doubly indirect blocks */
    unsigned long swapped;                      /* pages currently swapped out */
    int locked;                                 /* into memory */
    struct list_head list;                      /* link on the global shmem_inodes list */
};
返回到shmem_writepage,执行如下:
/* Remove the from the page cache */ lru_cache_del(page); remove_inode_page(page);
page->list为空;
page->next_hash和page->pprev_hash为空;
page->lru为空;
继续执行,代码如下:
/* Add it to the swap cache */ add_to_swap_cache(page, swap); page_cache_release(page);
/*
 * add_to_swap_cache - file a locked page under swapper_space.
 * @page:  page to add; must be locked, not yet in the swap cache and
 *         without a mapping (each violation hits BUG()).
 * @entry: swap entry whose value is used as the page-cache index.
 *
 * Clears PG_error and PG_arch_1, sets PG_uptodate, then inserts the page
 * with add_to_page_cache_locked().
 */
void add_to_swap_cache(struct page *page, swp_entry_t entry)
{
    unsigned long flags;

#ifdef SWAP_CACHE_INFO
    swap_cache_add_total++;
#endif
    if (!PageLocked(page))
        BUG();
    if (PageTestandSetSwapCache(page))
        BUG();
    if (page->mapping)
        BUG();
    flags = page->flags & ~((1 << PG_error) | (1 << PG_arch_1));
    page->flags = flags | (1 << PG_uptodate);
    add_to_page_cache_locked(page, &swapper_space, entry.val);
}
参考 Linux内核源代码情景分析-内存管理之用户页面的换入,执行后的结果如下:
page->list链入mapping->clean_pages;
page->next_hash和page->pprev_hash链入全局的Hash表;
page->lru链入了全局的active_list;
只是此时mapping是交换分区的&swapper_space,而不是inode->i_mapping,所以page->mapping->a_ops->writepage就指向了swap_writepage。
当page_launder再次扫描到这个页面时,它的page->mapping->a_ops->writepage已经指向了swap_writepage。流程就和Linux内核源代码情景分析-内存管理之用户页面的定期换出完全一样了。
/*
 * swap_writepage - writepage method used once a page sits in the swap cache.
 * Hands the page to rw_swap_page() for writing and always returns 0.
 */
static int swap_writepage(struct page *page)
{
    rw_swap_page(WRITE, page, 0);

    return 0;
}
把页面写入了交换分区。最后:
page->list链入mapping->dirty_pages或者clean_pages(保持原样);
page->next_hash和page->pprev_hash链入全局的Hash表;
page->lru链入了page->zone->inactive_clean_list;
五、恢复映射
1、如果refill_inactive_scan和swap_out,把活跃的页面变成不活跃脏的页面。挑选的原则是最近没有被访问,且age已减为0。
或者,page_launder,把不活跃脏的页面变成不活跃干净的页面。
不活跃脏的页面,有如下特点:
使用计数为1;
page->list链入mapping->dirty_pages/clean_pages;
page->next_hash和page->pprev_hash链入全局的Hash表;
page->lru链入了全局的inactive_dirty_list;
page->flags中对应的PG_dirty位被置位。
不活跃干净的页面,有如下特点:
使用计数为1;
page->list链入mapping->dirty_pages/clean_pages(保持原样);
page->next_hash和page->pprev_hash链入全局的Hash表;
page->lru链入了page->zone->inactive_clean_list;
如果发生缺页中断,do_no_page中调用shmem_nopage,再次访问到这个页面,那么会调用lookup_swap_cache,会在全局的Hash表找到对应的页面,并且引用计数加1,变成2,但还没有移到活跃队列中。什么时候转移到活跃队列中呢?
答案在,page_launder和reclaim_page中。
page_launder:
if (PageTestandClearReferenced(page) || page->age > 0 || //此时引用计数大于1 (!page->buffers && page_count(page) > 1) || page_ramdisk(page)) { del_page_from_inactive_dirty_list(page); add_page_to_active_list(page); continue; }reclaim_page:
if (PageTestandClearReferenced(page) || page->age > 0 || (!page->buffers && page_count(page) > 1)) {//此时引用计数大于1 del_page_from_inactive_clean_list(page); add_page_to_active_list(page); continue; }
2、如果页面已经被回收(不在交换缓存中),再次访问到这个页面时发生缺页中断,do_no_page调用shmem_nopage,此时lookup_swap_cache返回NULL,所以继续执行,从交换分区读入页面,代码位于shmem_nopage:
if (entry->val) {//目前不为0了,应为刚刚换出时设置了 unsigned long flags; /* Look it up and read it in.. */ page = lookup_swap_cache(*entry); if (!page) { lock_kernel(); swapin_readahead(*entry);//从交换区预读 page = read_swap_cache(*entry);//从交换区真读 unlock_kernel(); if (!page) goto oom; } /* We have to this with page locked to prevent races */ spin_lock (&info->lock); swap_free(*entry); lock_page(page); delete_from_swap_cache_nolock(page);//从交换区队列中移除 *entry = (swp_entry_t) {0};//swap_entry_t项清零 flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced) | (1 << PG_arch_1)); page->flags = flags | (1 << PG_dirty); add_to_page_cache_locked(page, mapping, idx); info->swapped--; spin_unlock (&info->lock); } else {add_to_page_cache_locked,最后的结构就是:
page->list链入mapping->clean_pages;
page->next_hash和page->pprev_hash链入全局的Hash表;
page->lru链入了全局的active_list。