备注:
1. Kernel版本:5.4
2. 使用工具:Source Insight 4.0
3. 参考博客:
(1)arm-linux 系统调用流程
(2)ARM Linux上的系统调用代码分析
sys_open是经过宏替换定义的,源码在fs/open.c中。
/*
 * open(2) entry point.
 *
 * Adds O_LARGEFILE to the caller's flags when force_o_largefile() says so,
 * then hands the request to do_sys_open() with AT_FDCWD as the directory fd.
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
	const int forced = force_o_largefile() ? O_LARGEFILE : 0;

	return do_sys_open(AT_FDCWD, filename, flags | forced, mode);
}
//include/linux/syscalls.h
/*
 * SYSCALL_DEFINE<n>(name, ...) - define a system call taking <n> arguments.
 *
 * Each variant pastes an underscore in front of the name (_##name) and
 * forwards the argument count together with the remaining arguments to
 * SYSCALL_DEFINEx().  E.g. SYSCALL_DEFINE3(open, ...) passes "_open".
 */
#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)
/* Matches the highest-numbered SYSCALL_DEFINE<n> variant defined above. */
#define SYSCALL_DEFINE_MAXARGS 6
/*
 * Common expansion for all variants: first SYSCALL_METADATA(), then
 * __SYSCALL_DEFINEx(), both receiving the underscore-prefixed name (sname),
 * the argument count (x) and the argument list.
 */
#define SYSCALL_DEFINEx(x, sname, ...) \
SYSCALL_METADATA(sname, x, __VA_ARGS__) \
__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
注意:SYSCALL_DEFINEn在调用SYSCALL_DEFINEx时传入的是 _##name,即在name前面拼接了一个下划线,所以"open"在这里已经变成了"_open"。
如果没有定义CONFIG_FTRACE_SYSCALLS,则SYSCALL_METADATA展开为空;否则其定义如下:
#ifdef CONFIG_FTRACE_SYSCALLS
/* Stringify the second element (argument name) of a (type, name) pair. */
#define __SC_STR_ADECL(t, a) #a
/* Stringify the first element (argument type) of a (type, name) pair. */
#define __SC_STR_TDECL(t, a) #t
/*
 * Event classes and print-callback tables referenced by the
 * SYSCALL_TRACE_*_EVENT macros below; they are only declared here.
 */
extern struct trace_event_class event_class_syscall_enter;
extern struct trace_event_class event_class_syscall_exit;
extern struct trace_event_functions enter_syscall_print_funcs;
extern struct trace_event_functions exit_syscall_print_funcs;
/*
 * SYSCALL_TRACE_ENTER_EVENT(sname) - emit the "enter" trace event objects
 * for one syscall.
 *
 * Expands to:
 *  - a tentative declaration of __syscall_meta_<sname>, so its address can
 *    be taken before SYSCALL_METADATA() provides the initializer;
 *  - a static struct trace_event_call event_enter_<sname> whose class is
 *    event_class_syscall_enter, whose name is "sys_enter" followed by the
 *    stringified sname, whose print callbacks are enter_syscall_print_funcs
 *    and whose .data points at the syscall metadata;
 *  - a pointer to that event, placed in the "_ftrace_events" section.
 */
#define SYSCALL_TRACE_ENTER_EVENT(sname) \
static struct syscall_metadata __syscall_meta_##sname; \
static struct trace_event_call __used \
event_enter_##sname = { \
.class = &event_class_syscall_enter, \
{ \
.name = "sys_enter"#sname, \
}, \
.event.funcs = &enter_syscall_print_funcs, \
.data = (void *)&__syscall_meta_##sname,\
.flags = TRACE_EVENT_FL_CAP_ANY, \
}; \
static struct trace_event_call __used \
__attribute__((section("_ftrace_events"))) \
*__event_enter_##sname = &event_enter_##sname;
/*
 * SYSCALL_TRACE_EXIT_EVENT(sname) - emit the "exit" trace event objects
 * for one syscall.
 *
 * Same layout as SYSCALL_TRACE_ENTER_EVENT(), but the event is named
 * "sys_exit" followed by the stringified sname, uses
 * event_class_syscall_exit / exit_syscall_print_funcs, and the
 * "_ftrace_events" section entry is __event_exit_<sname>.
 */
#define SYSCALL_TRACE_EXIT_EVENT(sname) \
static struct syscall_metadata __syscall_meta_##sname; \
static struct trace_event_call __used \
event_exit_##sname = { \
.class = &event_class_syscall_exit, \
{ \
.name = "sys_exit"#sname, \
}, \
.event.funcs = &exit_syscall_print_funcs, \
.data = (void *)&__syscall_meta_##sname,\
.flags = TRACE_EVENT_FL_CAP_ANY, \
}; \
static struct trace_event_call __used \
__attribute__((section("_ftrace_events"))) \
*__event_exit_##sname = &event_exit_##sname;
/*
 * SYSCALL_METADATA(sname, nb, ...) - emit the tracing metadata for one
 * syscall (CONFIG_FTRACE_SYSCALLS variant).
 *
 * Expands to:
 *  - types_<sname>[] and args_<sname>[]: the argument types and argument
 *    names as strings, produced by mapping __SC_STR_TDECL / __SC_STR_ADECL
 *    over the argument list;
 *  - the enter and exit trace events (see the two macros above);
 *  - __syscall_meta_<sname>: name "sys" followed by the stringified sname,
 *    syscall_nr initialised to -1, nb_args = nb, and the type/arg tables
 *    (NULL when nb is 0), plus pointers to both events;
 *  - a pointer to the metadata, placed in the "__syscalls_metadata" section.
 */
#define SYSCALL_METADATA(sname, nb, ...) \
static const char *types_##sname[] = { \
__MAP(nb,__SC_STR_TDECL,__VA_ARGS__) \
}; \
static const char *args_##sname[] = { \
__MAP(nb,__SC_STR_ADECL,__VA_ARGS__) \
}; \
SYSCALL_TRACE_ENTER_EVENT(sname); \
SYSCALL_TRACE_EXIT_EVENT(sname); \
static struct syscall_metadata __used \
__syscall_meta_##sname = { \
.name = "sys"#sname, \
.syscall_nr = -1, /* Filled in at boot */ \
.nb_args = nb, \
.types = nb ? types_##sname : NULL, \
.args = nb ? args_##sname : NULL, \
.enter_event = &event_enter_##sname, \
.exit_event = &event_exit_##sname, \
.enter_fields = LIST_HEAD_INIT(__syscall_meta_##sname.enter_fields), \
}; \
static struct syscall_metadata __used \
__attribute__((section("__syscalls_metadata"))) \
*__p_syscall_meta_##sname = &__syscall_meta_##sname;
/*
 * Tell whether a trace event belongs to the syscall tracing machinery.
 *
 * Returns 1 when tp_event's class is either the syscall-enter or the
 * syscall-exit event class, 0 otherwise.
 */
static inline int is_syscall_trace_event(struct trace_event_call *tp_event)
{
	if (tp_event->class == &event_class_syscall_enter)
		return 1;

	return tp_event->class == &event_class_syscall_exit;
}
#else
/* Without CONFIG_FTRACE_SYSCALLS no metadata is emitted: expands to nothing. */
#define SYSCALL_METADATA(sname, nb, ...)
/*
 * Stub used when CONFIG_FTRACE_SYSCALLS is not defined: no event is ever a
 * syscall trace event, so the argument is ignored and 0 is returned.
 */
static inline int is_syscall_trace_event(struct trace_event_call *tp_event)
{
	(void)tp_event;

	return 0;
}
#endif
/*
 * __SYSCALL_DEFINEx(x, name, ...) - emit the function scaffolding for a
 * syscall with x arguments; name arrives with its leading underscore
 * (e.g. "_open"), so sys##name becomes sys_open.
 *
 * Expands, in order, to:
 *  - a declaration of sys<name>() with the declared argument types
 *    (__SC_DECL), made an alias of __se_sys<name>();
 *  - ALLOW_ERROR_INJECTION() for sys<name>;
 *  - a forward declaration of the static inline __do_sys<name>();
 *  - __se_sys<name>(), which receives every argument through __SC_LONG,
 *    converts them with __SC_CAST when calling __do_sys<name>(), then runs
 *    the __SC_TEST and __PROTECT steps and returns the result;
 *  - finally the header of __do_sys<name>() with NO body: the brace block
 *    written after the SYSCALL_DEFINE<n>() invocation becomes its body.
 *
 * The __diag_push/__diag_ignore/__diag_pop trio silences
 * -Wattribute-alias around the alias declaration.
 */
#define __SYSCALL_DEFINEx(x, name, ...) \
__diag_push(); \
__diag_ignore(GCC, 8, "-Wattribute-alias", \
"Type aliasing is used to sanitize syscall arguments");\
asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \
__attribute__((alias(__stringify(__se_sys##name)))); \
ALLOW_ERROR_INJECTION(sys##name, ERRNO); \
static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
asmlinkage long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
asmlinkage long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
{ \
long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__));\
__MAP(x,__SC_TEST,__VA_ARGS__); \
__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
return ret; \
} \
__diag_pop(); \
static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
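其中 asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) 这一行里,由于name此时是"_open",sys##name拼接后就变成了"sys_open"。
严格来说,宏展开后sys_open是__se_sys_open的别名,而__se_sys_open再调用__do_sys_open(函数体就是SYSCALL_DEFINE3后面花括号里的代码)。忽略这层包装后,可以把最终的形式简化理解为: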
/*
 * Simplified view of what SYSCALL_DEFINE3(open, ...) amounts to once the
 * macros are expanded: force O_LARGEFILE if required, then let
 * do_sys_open() do the work relative to AT_FDCWD.
 */
asmlinkage long sys_open(const char __user* filename,int flags,umode_t mode)
{
	const int forced = force_o_largefile() ? O_LARGEFILE : 0;

	return do_sys_open(AT_FDCWD, filename, flags | forced, mode);
}
sys_open的实现
/*
 * sys_open implementation.
 *
 * force_o_largefile() decides whether O_LARGEFILE is added implicitly; the
 * actual open is performed by do_sys_open(), called with AT_FDCWD as the
 * directory file descriptor.
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
	int open_flags = flags;

	/* Implicitly request large-file support when the platform forces it. */
	if (force_o_largefile())
		open_flags |= O_LARGEFILE;

	/* All of the real work happens in do_sys_open(). */
	return do_sys_open(AT_FDCWD, filename, open_flags, mode);
}
其中先调用force_o_largefile()来判断是否需要设置大文件标识,然后调用do_sys_open来完成具体的工作。其中force_o_largefile()函数如下:
/*
 * Default force_o_largefile(), used only when no earlier definition exists
 * (hence the #ifndef guard): true unless CONFIG_ARCH_32BIT_OFF_T is enabled.
 */
#ifndef force_o_largefile
#define force_o_largefile() (!IS_ENABLED(CONFIG_ARCH_32BIT_OFF_T))
#endif
只有在使能了CONFIG_ARCH_32BIT_OFF_T的(32位)架构上,该宏的值才为false;其余情况下(包括所有64位系统)该宏为true,此时sys_open会自动给flags加上O_LARGEFILE标志。
如果不带O_LARGEFILE,文件最大大小受索引节点中表示文件大小的32位的i_size的限制,只能访问2的32次方字节,即4GB(实际最高位一般不用,所以通常只有2GB)。加上O_LARGEFILE之后,索引节点的i_dir_acl字段也可以一起用来表示文件的大小,这样位数就变成了64位,理论上限是2的64次方字节,即4GB×4GB=16EB,对单个文件来说已经足够大了。
do_sys_open函数主要分为如下几个步骤来完成打开文件的操作:
源码:fs/open.c
/*
 * do_sys_open - common worker behind the open family of syscalls.
 * @dfd:      directory file descriptor passed on to do_filp_open()
 * @filename: user-space pointer to the path
 * @flags:    open flags supplied by the caller
 * @mode:     creation mode supplied by the caller
 *
 * Steps: translate flags/mode into a struct open_flags, copy the path in
 * with getname(), reserve a descriptor, open the file, and finally bind the
 * resulting struct file to the descriptor.
 *
 * Returns the new descriptor, or the non-zero result of
 * build_open_flags(), or the error carried by getname()/do_filp_open(), or
 * the negative result of get_unused_fd_flags().
 */
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
	struct open_flags op;
	struct filename *name;
	struct file *filp;
	int ret;

	ret = build_open_flags(flags, mode, &op);
	if (ret)
		return ret;

	/* Bring the path name in from user space. */
	name = getname(filename);
	if (IS_ERR(name))
		return PTR_ERR(name);

	/* Reserve a descriptor; on failure only the name has to be released. */
	ret = get_unused_fd_flags(flags);
	if (ret < 0)
		goto out_putname;

	filp = do_filp_open(dfd, name, &op);
	if (IS_ERR(filp)) {
		/* Give the reserved descriptor back and report the open error. */
		put_unused_fd(ret);
		ret = PTR_ERR(filp);
		goto out_putname;
	}

	fsnotify_open(filp);
	/* Associate the struct file with the reserved descriptor. */
	fd_install(ret, filp);

out_putname:
	putname(name);
	return ret;
}
getname函数:
源码:fs/namei.c
/*
 * getname - convenience wrapper around getname_flags() that passes no
 * lookup flags and no "empty" out-parameter.
 */
struct filename *
getname(const char __user * filename)
{
	const int no_flags = 0;

	return getname_flags(filename, no_flags, NULL);
}
/*
 * getname_flags - copy a path name from user space into a struct filename.
 * @filename: user-space pointer to the path
 * @flags:    only LOOKUP_EMPTY is examined here; it permits an empty path
 * @empty:    optional; set to 1 when the copied path is empty
 *
 * The name is first copied into the storage embedded in the __getname()
 * allocation (->iname).  If it fills all EMBEDDED_NAME_MAX bytes, that whole
 * allocation is repurposed as the string buffer, a small separate
 * struct filename is kzalloc()ed, and the copy is redone with a PATH_MAX
 * limit.
 *
 * Returns the filename on success or an ERR_PTR(): -ENOMEM on allocation
 * failure, the negative strncpy_from_user() result, -ENAMETOOLONG when the
 * second copy fills PATH_MAX bytes, -ENOENT for an empty path without
 * LOOKUP_EMPTY.
 */
struct filename *
getname_flags(const char __user *filename, int flags, int *empty)
{
	struct filename *name;
	char *buf;
	int copied;

	/* A name already recorded by audit for this pointer is returned as is. */
	name = audit_reusename(filename);
	if (name)
		return name;

	name = __getname();
	if (unlikely(!name))
		return ERR_PTR(-ENOMEM);

	/* First attempt: keep the string inside the same allocation. */
	buf = (char *)name->iname;
	name->name = buf;

	copied = strncpy_from_user(buf, filename, EMBEDDED_NAME_MAX);
	if (unlikely(copied < 0)) {
		/* Copy failed: release what __getname() handed out. */
		__putname(name);
		return ERR_PTR(copied);
	}

	if (unlikely(copied == EMBEDDED_NAME_MAX)) {
		/*
		 * The name may be close to PATH_MAX.  Dedicate the entire
		 * first allocation to the string and move the struct itself
		 * into a separate, minimal allocation.  The size includes
		 * iname[0] so that ->iname stays inside the new object and
		 * can never compare equal to the string buffer.
		 */
		const size_t hdr_size = offsetof(struct filename, iname[1]);
		int err;

		buf = (char *)name;
		name = kzalloc(hdr_size, GFP_KERNEL);
		if (unlikely(!name)) {
			__putname(buf);
			return ERR_PTR(-ENOMEM);
		}
		name->name = buf;

		/* Redo the copy, now with the full PATH_MAX budget. */
		copied = strncpy_from_user(buf, filename, PATH_MAX);
		if (unlikely(copied < 0))
			err = copied;
		else if (unlikely(copied == PATH_MAX))
			err = -ENAMETOOLONG;
		else
			err = 0;

		if (err) {
			__putname(buf);
			kfree(name);
			return ERR_PTR(err);
		}
	}

	name->refcnt = 1;

	/* An empty path is reported via *empty and rejected unless allowed. */
	if (unlikely(copied == 0)) {
		if (empty)
			*empty = 1;
		if ((flags & LOOKUP_EMPTY) == 0) {
			putname(name);
			return ERR_PTR(-ENOENT);
		}
	}

	name->uptr = filename;
	name->aname = NULL;
	audit_getname(name);
	return name;
}
get_unused_fd_flags函数:
源码:fs/file.c
/*
 * get_unused_fd_flags - allocate a descriptor in the current task's table.
 *
 * Calls __alloc_fd() on current->files with a start of 0 and
 * rlimit(RLIMIT_NOFILE) as the end bound, and returns its result.
 */
int get_unused_fd_flags(unsigned flags)
{
	struct files_struct *files = current->files;
	const unsigned long nofile_limit = rlimit(RLIMIT_NOFILE);

	return __alloc_fd(files, 0, nofile_limit, flags);
}
EXPORT_SYMBOL(get_unused_fd_flags);
/*
 * allocate a file descriptor, mark it busy.
 *
 * @files: descriptor table owner to allocate from
 * @start: lowest descriptor number the caller accepts
 * @end:   exclusive upper bound; a candidate >= end fails with -EMFILE
 * @flags: only O_CLOEXEC is examined, to set or clear close-on-exec
 *
 * Runs with files->file_lock held from entry until the "out" label.
 * Returns the allocated descriptor, -EMFILE, or the negative value
 * returned by expand_files().
 */
int __alloc_fd(struct files_struct *files,
unsigned start, unsigned end, unsigned flags)
{
unsigned int fd;
int error;
struct fdtable *fdt;
spin_lock(&files->file_lock);
repeat:
fdt = files_fdtable(files);
fd = start; /* begin at the caller's lower bound */
/* ...but never below the cached search hint files->next_fd */
if (fd < files->next_fd)
fd = files->next_fd;
/* Look for a free slot inside the current table, if fd falls within it. */
if (fd < fdt->max_fds)
fd = find_next_fd(fdt, fd);
/*
 * N.B. For clone tasks sharing a files structure, this test
 * will limit the total number of files that can be opened.
 */
error = -EMFILE;
if (fd >= end)
goto out;
/*
 * Let expand_files() grow the table if fd requires it.  A negative
 * return is a failure; any other non-zero return sends us back to
 * "repeat" (see the comment below).
 */
error = expand_files(files, fd);
if (error < 0)
goto out;
/*
 * If we needed to expand the fs array we
 * might have blocked - try again.
 */
/* Non-zero here means the table was expanded: search for a free fd again. */
if (error)
goto repeat;
/*
 * Advance the search hint past the descriptor just chosen, but only
 * when this search started at or below the current hint.
 */
if (start <= files->next_fd)
files->next_fd = fd + 1;
__set_open_fd(fd, fdt);
if (flags & O_CLOEXEC)
__set_close_on_exec(fd, fdt);
else
__clear_close_on_exec(fd, fdt);
error = fd;
#if 1
/* Sanity check */
if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
rcu_assign_pointer(fdt->fd[fd], NULL);
}
#endif
out:
spin_unlock(&files->file_lock);
return error;
}
do_filp_open函数:
do_filp_open函数的主要功能,就是通过路径名来分配并填充这个文件对应的文件对象。注意:下面列出的步骤描述的是较早内核版本中do_filp_open(dfd, pathname, flags, mode, acc_mode)的实现,其中提到的函数与后面贴出的5.4版本源码并不一一对应,仅供理解整体思路。旧版本的do_filp_open主要执行如下操作:
1,设置一堆访问模式标志
2,调用get_empty_filp()函数从名为filp_cachep的slab缓存中分配一个struct file类型的文件对象(返回struct file *指针)。
3,如果flags中有O_CREAT标志,跳到第8步,否则继续第4步
4,调用do_path_lookup(dfd, pathname, flags, &nd)做目录查找,将查找结果填充到struct nameidata nd中。还记得目录查找么?见这里
5,调用finish_open(nd, flags, mode)做一些合法性验证,并从nd->intent.open.file中获取到struct file *filep
6,调用release_open_intent(nd)做一些清理工作。主要是减少nd->intent.open.file中的一些引用计数。
7,返回filep
8,到这一步说明flags中有O_CREAT标志,需要在目录查找过程中创建对应的文件,这一步依次调用path_init_rcu(), path_walk_rcu()和path_finish_rcu()完成创建文件的目录查找工作,最终依然是将查找结果填充到struct nameidata *nd中。(在标准的目录查找do_path_lookup()的实现中,主干流程也是依次调用这三个函数做查找工作)
9,调用do_last(&nd, &path, flags, acc_mode, mode, pathname)函数获取最终的struct file *filep结构。在这个函数中,内核会根据nd->last_type做不同的处理,对于普通文件,会调用finish_open(nd, flags, mode)做一些合法性验证,并从nd->intent.open.file中获取到struct file *filep
10,调用release_open_intent(nd)做一些清理工作。主要是减少nd->intent.open.file中的一些引用计数。
11,返回filep
源码:fs/namei.c
/*
 * do_filp_open - open the file named by @pathname relative to @dfd.
 *
 * Calls path_openat() up to three times with increasingly conservative
 * lookup flags: first with LOOKUP_RCU added; again with the plain flags if
 * that attempt yields -ECHILD; and once more with LOOKUP_REVAL added if the
 * current result is -ESTALE.  The nameidata set up on entry is restored
 * before returning.
 *
 * Returns whatever the last path_openat() call returned (a struct file
 * pointer or an ERR_PTR).
 */
struct file *do_filp_open(int dfd, struct filename *pathname,
		const struct open_flags *op)
{
	struct nameidata nd;
	const int base_flags = op->lookup_flags;
	struct file *result;

	set_nameidata(&nd, dfd, pathname);

	result = path_openat(&nd, op, base_flags | LOOKUP_RCU);
	if (unlikely(result == ERR_PTR(-ECHILD)))
		result = path_openat(&nd, op, base_flags);
	if (unlikely(result == ERR_PTR(-ESTALE)))
		result = path_openat(&nd, op, base_flags | LOOKUP_REVAL);

	restore_nameidata();
	return result;
}
//源码:fs/namei.c
/*
 * path_openat - perform one open attempt with the given lookup flags.
 *
 * Allocates a struct file, then dispatches on its f_flags: __O_TMPFILE goes
 * to do_tmpfile(), O_PATH to do_o_path(), everything else walks the path
 * with link_path_walk() and finishes with do_last().
 *
 * Returns the opened file on success (error == 0 and FMODE_OPENED set).
 * Otherwise the file is dropped with fput() and an ERR_PTR is returned;
 * -EOPENSTALE is translated to -ECHILD when LOOKUP_RCU was set in @flags
 * and to -ESTALE when it was not.
 */
static struct file *path_openat(struct nameidata *nd,
const struct open_flags *op, unsigned flags)
{
struct file *file;
int error;
/* Obtain a freshly allocated struct file for this open. */
file = alloc_empty_file(op->open_flag, current_cred());
if (IS_ERR(file))
return file;
if (unlikely(file->f_flags & __O_TMPFILE)) {
error = do_tmpfile(nd, flags, op, file);
} else if (unlikely(file->f_flags & O_PATH)) {
error = do_o_path(nd, flags, file);
} else {
/*
 * Author's note (translated): link_path_walk() walks to the parent
 * directory of the file to be opened; that walk was analysed in the
 * sys_mount notes.  Presumably the parent is looked up first because
 * the file being opened may not exist yet - TODO confirm.
 *
 * The loop repeats while link_path_walk() succeeds (returns 0) and
 * do_last() returns a positive value; each extra round clears the
 * open/create/excl lookup flags and continues from the string
 * returned by trailing_symlink().
 */
const char *s = path_init(nd, flags);
while (!(error = link_path_walk(s, nd)) &&
(error = do_last(nd, file, op)) > 0) {
nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
s = trailing_symlink(nd);
}
terminate_walk(nd);
}
if (likely(!error)) {
if (likely(file->f_mode & FMODE_OPENED))
return file;
/* No error reported, yet the file was never opened: treat as a bug. */
WARN_ON(1);
error = -EINVAL;
}
fput(file);
/* Map -EOPENSTALE to the error do_filp_open() uses to pick a retry mode. */
if (error == -EOPENSTALE) {
if (flags & LOOKUP_RCU)
error = -ECHILD;
else
error = -ESTALE;
}
return ERR_PTR(error);
}
//源码:fs/namei.c
/*
 * Handle the last step of open()
 *
 * @nd:   lookup state; on entry nd->path / nd->inode are used as the
 *        directory in which the last component is looked up
 * @file: the struct file being opened
 * @op:   open flags, access mode and intent for this open
 *
 * Returns 0 on success or a negative error.  The positive value that
 * step_into() may return at "finish_lookup" is passed straight back;
 * path_openat() loops on a positive result and calls trailing_symlink().
 * A positive value that reaches the "out" label instead is unexpected: it
 * triggers a warning and is replaced by -EINVAL.
 */
static int do_last(struct nameidata *nd,
struct file *file, const struct open_flags *op)
{
struct dentry *dir = nd->path.dentry;
kuid_t dir_uid = nd->inode->i_uid;
umode_t dir_mode = nd->inode->i_mode;
int open_flag = op->open_flag;
bool will_truncate = (open_flag & O_TRUNC) != 0;
bool got_write = false;
int acc_mode = op->acc_mode;
unsigned seq;
struct inode *inode;
struct path path;
int error;
/* Drop the parent-lookup flag and apply the intent requested by the open. */
nd->flags &= ~LOOKUP_PARENT;
nd->flags |= op->intent;
/* nd->last_type records the type of the final component of the path name. */
if (nd->last_type != LAST_NORM) {
error = handle_dots(nd, nd->last_type);
if (unlikely(error))
return error;
goto finish_open;
}
if (!(open_flag & O_CREAT)) {
/*
 * A non-NUL byte right after the last component (the create side
 * below calls this case "trailing slashes") requires the result
 * to be a directory and symlinks to be followed.
 */
if (nd->last.name[nd->last.len])
nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
/* we _can_ be in RCU mode here */
error = lookup_fast(nd, &path, &inode, &seq);
if (likely(error > 0))
goto finish_lookup;
if (error < 0)
return error;
BUG_ON(nd->inode != dir->d_inode);
BUG_ON(nd->flags & LOOKUP_RCU);
} else {
/* create side of things */
/*
 * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED
 * has been cleared when we got to the last component we are
 * about to look up
 */
error = complete_walk(nd);
if (error)
return error;
audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
/* trailing slashes? */
if (unlikely(nd->last.name[nd->last.len]))
return -EISDIR;
}
/*
 * Reached when the fast lookup above did not produce a result, or on
 * the O_CREAT side.  If the open may create, truncate or write, ask
 * for write access to the mount now; failure is only remembered via
 * got_write staying false.
 */
if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
error = mnt_want_write(nd->path.mnt);
if (!error)
got_write = true;
/*
 * do _not_ fail yet - we might not need that or fail with
 * a different error; let lookup_open() decide; we'll be
 * dropping this one anyway.
 */
}
/*
 * Call lookup_open() with the directory inode locked: exclusively when
 * O_CREAT is set, shared otherwise.
 */
if (open_flag & O_CREAT)
inode_lock(dir->d_inode);
else
inode_lock_shared(dir->d_inode);
error = lookup_open(nd, &path, file, op, got_write);
if (open_flag & O_CREAT)
inode_unlock(dir->d_inode);
else
inode_unlock_shared(dir->d_inode);
if (error)
goto out;
/* lookup_open() already opened the file: skip straight to "opened". */
if (file->f_mode & FMODE_OPENED) {
if ((file->f_mode & FMODE_CREATED) ||
!S_ISREG(file_inode(file)->i_mode))
will_truncate = false;
audit_inode(nd->name, file->f_path.dentry, 0);
goto opened;
}
if (file->f_mode & FMODE_CREATED) {
/* Don't check for write permission, don't truncate */
open_flag &= ~O_TRUNC;
will_truncate = false;
acc_mode = 0;
path_to_nameidata(&path, nd);
goto finish_open_created;
}
/*
 * If atomic_open() acquired write access it is dropped now due to
 * possible mount and symlink following (this might be optimized away if
 * necessary...)
 */
if (got_write) {
mnt_drop_write(nd->path.mnt);
got_write = false;
}
error = follow_managed(&path, nd);
if (unlikely(error < 0))
return error;
/* Negative dentry: the file does not exist. */
if (unlikely(d_is_negative(path.dentry))) {
path_to_nameidata(&path, nd);
return -ENOENT;
}
/*
 * create/update audit record if it already exists.
 */
audit_inode(nd->name, path.dentry, 0);
/* O_CREAT|O_EXCL on an existing entry fails with -EEXIST. */
if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
path_to_nameidata(&path, nd);
return -EEXIST;
}
seq = 0; /* out of RCU mode, so the value doesn't matter */
inode = d_backing_inode(path.dentry);
finish_lookup:
error = step_into(nd, &path, 0, inode, seq);
if (unlikely(error))
return error;
finish_open:
/* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
error = complete_walk(nd);
if (error)
return error;
audit_inode(nd->name, nd->path.dentry, 0);
if (open_flag & O_CREAT) {
/* O_CREAT may not land on a directory; also apply the sticky-dir check. */
error = -EISDIR;
if (d_is_dir(nd->path.dentry))
goto out;
error = may_create_in_sticky(dir_mode, dir_uid,
d_backing_inode(nd->path.dentry));
if (unlikely(error))
goto out;
}
error = -ENOTDIR;
if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
goto out;
/* Only regular files are truncated. */
if (!d_is_reg(nd->path.dentry))
will_truncate = false;
if (will_truncate) {
error = mnt_want_write(nd->path.mnt);
if (error)
goto out;
got_write = true;
}
finish_open_created:
error = may_open(&nd->path, acc_mode, open_flag);
if (error)
goto out;
BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */
error = vfs_open(&nd->path, file);
if (error)
goto out;
opened:
error = ima_file_check(file, op->acc_mode);
if (!error && will_truncate)
error = handle_truncate(file);
out:
/* A positive value is not a valid result on this path. */
if (unlikely(error > 0)) {
WARN_ON(1);
error = -EINVAL;
}
/* Release the mount write access taken earlier, if any. */
if (got_write)
mnt_drop_write(nd->path.mnt);
return error;
}
fd_install函数:
fd_install()将分配的fd和代表该文件的struct file结构建立关联。
源码:fs/file.c
/*
 * fd_install - bind @file to descriptor @fd in the current task's
 * descriptor table by delegating to __fd_install() on current->files.
 */
void fd_install(unsigned int fd, struct file *file)
{
	struct files_struct *files = current->files;

	__fd_install(files, fd, file);
}
EXPORT_SYMBOL(fd_install);
/*
 * __fd_install - store @file in slot @fd of @files' descriptor table.
 *
 * Two paths:
 *  - if files->resize_in_progress is set, leave the RCU-sched read section
 *    and do the store under files->file_lock instead;
 *  - otherwise do the store inside the RCU-sched read section, without
 *    taking the lock.
 * In both cases the slot must currently be NULL (BUG_ON otherwise).
 */
void __fd_install(struct files_struct *files, unsigned int fd,
struct file *file)
{
struct fdtable *fdt;
rcu_read_lock_sched();
if (unlikely(files->resize_in_progress)) {
/* Table is being resized: fall back to the locked path. */
rcu_read_unlock_sched();
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
BUG_ON(fdt->fd[fd] != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
spin_unlock(&files->file_lock);
return;
}
/* coupled with smp_wmb() in expand_fdtable() */
smp_rmb();
fdt = rcu_dereference_sched(files->fdt);
BUG_ON(fdt->fd[fd] != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
rcu_read_unlock_sched();
}