http://www.cppblog.com/momoxiao/archive/2010/04/04/111594.html
先通过strace来看下ls命令的执行都做了哪些系统调用:
strace -o ls.txt ls
运行结果,这儿只摘取了ls.txt中我们感兴趣的部分:
open(".", O_RDONLY|O_NONBLOCK|O_LARGEFILE|O_DIRECTORY|O_CLOEXEC) = 3    /// 打开当前目录这个文件(目录是一种特殊的文件),并返回文件句柄3
fstat64(3, {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0    /// 取得当前目录文件的属性,比如这里大小为4096
fcntl64(3, F_GETFD) = 0x1 (flags FD_CLOEXEC)
getdents64(3, /* 33 entries */, 4096) = 1104    /// 读取当前目录下的文件
getdents64(3, /* 0 entries */, 4096) = 0
close(3) = 0    /// 关闭当前目录文件的句柄
这里核心是getdents64系统调用,它读取目录文件中的一个个目录项(directory entry)并返回,所以我们运行ls后才看到文件。
下面我们就看下getdents64是怎么用的,想办法干扰它的执行,从而隐藏掉我们不想让用户发现的文件。
fs/readdir.c
/*
 * getdents64 system call: pass the open file behind @fd to vfs_readdir(),
 * which reports directory entries through the filldir64 callback into the
 * user buffer @dirent.
 *
 * @fd:     file descriptor to read directory entries from
 * @dirent: user-space buffer that receives the entries
 * @count:  size of @dirent, as checked by access_ok()
 *
 * Returns:
 *   -EFAULT  if @dirent fails access_ok(), or if storing d_off into the
 *            last recorded entry faults;
 *   -EBADF   if fget() finds no file for @fd;
 *   a negative value passed up unchanged from vfs_readdir();
 *   cb.error if no entry was recorded in cb.previous;
 *   count - cb.count otherwise (presumably the number of bytes filled in;
 *            filldir64 is not shown here -- confirm against it).
 */
asmlinkage long sys_getdents64(unsigned int fd,
			       struct linux_dirent64 __user *dirent,
			       unsigned int count)
{
	struct getdents_callback64 cb;
	struct linux_dirent64 __user *last;
	struct file *filp;
	int ret;

	if (!access_ok(VERIFY_WRITE, dirent, count))
		return -EFAULT;

	filp = fget(fd);
	if (!filp)
		return -EBADF;

	cb.current_dir = dirent;
	cb.previous = NULL;
	cb.count = count;
	cb.error = 0;

	/* Read the directory; entries are delivered via filldir64. */
	ret = vfs_readdir(filp, filldir64, &cb);
	if (ret >= 0) {
		ret = cb.error;
		last = cb.previous;
		if (last) {
			/* Stamp the last entry with the current file position. */
			typeof(last->d_off) d_off = filp->f_pos;

			if (__put_user(d_off, &last->d_off))
				ret = -EFAULT;
			else
				ret = count - cb.count;
		}
	}

	/* Drop the reference taken by fget() on every path past it. */
	fput(filp);
	return ret;
}
首先,在sys_getdents64中通过调用vfs_readdir()读取目录函数。
那么什么是vfs呢?vfs全名Virtual File System(虚拟文件系统),也被称为Virtual Filesystem Switch(虚拟文件系统开关)。我们可以把Linux的文件系统看成三层,最上层是用户使用的系统调用,中间一层就是vfs,最下面一层是挂载到VFS中的各种实际文件系统,比如ext2,jffs等。Switch这个词在这儿用的很形象,上层同一个系统调用,在vfs这层会根据文件系统的类型,调用对应的内核函数。vfs这层,本身就是起一个switch的作用。
看下vfs_readdir()吧。
fs/readdir.c
/*
 * Dispatch a directory read to the concrete filesystem's readdir hook.
 *
 * @file:   open file to read entries from
 * @filler: callback forwarded to the filesystem's readdir
 * @buf:    opaque cookie forwarded to the filesystem's readdir
 *
 * Returns -ENOTDIR when the file has no f_op or no readdir hook, a non-zero
 * result from security_file_permission() or mutex_lock_killable() as-is,
 * -ENOENT when IS_DEADDIR() is true for the inode, and otherwise whatever
 * the filesystem's readdir returns.
 */
int vfs_readdir(struct file *file, filldir_t filler, void *buf)
{
	struct inode *dir = file->f_path.dentry->d_inode;
	int ret;

	if (!file->f_op || !file->f_op->readdir)
		return -ENOTDIR;

	ret = security_file_permission(file, MAY_READ);
	if (ret)
		return ret;

	ret = mutex_lock_killable(&dir->i_mutex);
	if (ret)
		return ret;

	if (IS_DEADDIR(dir)) {
		ret = -ENOENT;
	} else {
		/*
		 * Call the concrete filesystem's directory-entry reader (the
		 * bottom layer of the three-layer filesystem structure).
		 */
		ret = file->f_op->readdir(file, buf, filler);
		file_accessed(file);
	}

	mutex_unlock(&dir->i_mutex);
	return ret;
}
里面file->f_op->readdir()读取底层实际文件系统的目录项。
大致的关系是这样的:
file结构里有个文件操作的函数集const struct file_operations *f_op。
struct file_operations 中实际上是一些函数的指针,readdir就是其中的一个指针。
在调用vfs_readdir之前,内核会根据实际文件系统类型给struct file_operations赋对应值。
下面我们通过看代码,获得一个比较直观的认识。
struct file 和 struct file_operations都在/include/linux/fs.h中定义。
file结构:
struct
file {
/*
* fu_list becomes invalid after file_free is called and queued via
* fu_rcuhead for RCU freeing
*/
union {
struct
list_head fu_list;
struct
rcu_head fu_rcuhead;
} f_u;
struct
path f_path;
#define
f_dentry f_path.dentry
#define
f_vfsmnt f_path.mnt
const
struct
file_operations
*
f_op;
///
对应每一种实际的文件系统,会有自己的file_operations函数集。可以理解成file这个类的纯虚函数集
atomic_long_t f_count;
unsigned
int
f_flags;
mode_t f_mode;
loff_t f_pos;
struct
fown_struct f_owner;
unsigned
int
f_uid, f_gid;
struct
file_ra_state f_ra;
u64 f_version;
#ifdef CONFIG_SECURITY
void
*
f_security;
#endif
/*
needed for tty driver, and maybe others
*/
void
*
private_data;
#ifdef CONFIG_EPOLL
/*
Used by fs/eventpoll.c to link all the hooks to this file
*/
struct
list_head f_ep_links;
spinlock_t f_ep_lock;
#endif
/* #ifdef CONFIG_EPOLL */
struct
address_space
*
f_mapping;
#ifdef CONFIG_DEBUG_WRITECOUNT
unsigned
long
f_mnt_write_state;
#endif
};
file_operations结构,里面是一些函数指针。我们在这儿关心的是int (*readdir) (struct file *, void *, filldir_t);
readdir()用来读取实际文件系统目录项。
/*
 * Table of function pointers through which operations on an open file
 * are dispatched.  The member this walkthrough follows is readdir.
 */
struct file_operations {
	struct module *owner;
	loff_t (*llseek)(struct file *, loff_t, int);
	ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
	ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
	ssize_t (*aio_read)(struct kiocb *, const struct iovec *,
			    unsigned long, loff_t);
	ssize_t (*aio_write)(struct kiocb *, const struct iovec *,
			     unsigned long, loff_t);
	/*
	 * The pointer we care about here: the concrete filesystem's routine
	 * for reading directory entries.  Each time a file is opened, the
	 * kernel fills in the file's file_operations according to the type
	 * of filesystem the file lives on.
	 */
	int (*readdir)(struct file *, void *, filldir_t);
	unsigned int (*poll)(struct file *, struct poll_table_struct *);
	int (*ioctl)(struct inode *, struct file *, unsigned int,
		     unsigned long);
	long (*unlocked_ioctl)(struct file *, unsigned int, unsigned long);
	long (*compat_ioctl)(struct file *, unsigned int, unsigned long);
	int (*mmap)(struct file *, struct vm_area_struct *);
	int (*open)(struct inode *, struct file *);
	int (*flush)(struct file *, fl_owner_t id);
	int (*release)(struct inode *, struct file *);
	int (*fsync)(struct file *, struct dentry *, int datasync);
	int (*aio_fsync)(struct kiocb *, int datasync);
	int (*fasync)(int, struct file *, int);
	int (*lock)(struct file *, int, struct file_lock *);
	ssize_t (*sendpage)(struct file *, struct page *, int, size_t,
			    loff_t *, int);
	unsigned long (*get_unmapped_area)(struct file *, unsigned long,
					   unsigned long, unsigned long,
					   unsigned long);
	int (*check_flags)(int);
	int (*dir_notify)(struct file *filp, unsigned long arg);
	int (*flock)(struct file *, int, struct file_lock *);
	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
				loff_t *, size_t, unsigned int);
	ssize_t (*splice_read)(struct file *, loff_t *,
			       struct pipe_inode_info *, size_t,
			       unsigned int);
	int (*setlease)(struct file *, long, struct file_lock **);
};
下面来看下在ls用到file结构中的file_operations之前,内核是怎样给它赋值的:
/*
 * Obtain the in-memory inode for inode number @ino on superblock @sb via
 * iget_locked() and, when it is flagged I_NEW, populate it from the raw
 * on-disk ext2 inode.
 *
 * Returns the inode on success, or an ERR_PTR():
 *   -ENOMEM  when iget_locked() returns NULL;
 *   the error carried by ext2_get_inode();
 *   -ESTALE  when the link count is zero and either the mode is zero or a
 *            deletion time is set.
 *
 * For this walkthrough the relevant part is the mode-dependent choice of
 * inode->i_fop towards the end of the function.
 */
struct inode *ext2_iget(struct super_block *sb, unsigned long ino)
{
	struct ext2_inode_info *info;
	struct ext2_inode *raw;
	struct buffer_head *bh;
	struct inode *inode;
	long err = -EIO;
	int i;

	inode = iget_locked(sb, ino);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	/* Not flagged I_NEW: hand the inode back without reading the disk copy. */
	if (!(inode->i_state & I_NEW))
		return inode;

	info = EXT2_I(inode);
#ifdef CONFIG_EXT2_FS_POSIX_ACL
	info->i_acl = EXT2_ACL_NOT_CACHED;
	info->i_default_acl = EXT2_ACL_NOT_CACHED;
#endif
	info->i_block_alloc_info = NULL;

	raw = ext2_get_inode(inode->i_sb, ino, &bh);
	if (IS_ERR(raw)) {
		err = PTR_ERR(raw);
		goto bad_inode;
	}

	inode->i_mode = le16_to_cpu(raw->i_mode);
	inode->i_uid = (uid_t)le16_to_cpu(raw->i_uid_low);
	inode->i_gid = (gid_t)le16_to_cpu(raw->i_gid_low);
	/* Unless NO_UID32 is set, OR in the high 16 bits of uid and gid. */
	if (!(test_opt(inode->i_sb, NO_UID32))) {
		inode->i_uid |= le16_to_cpu(raw->i_uid_high) << 16;
		inode->i_gid |= le16_to_cpu(raw->i_gid_high) << 16;
	}
	inode->i_nlink = le16_to_cpu(raw->i_links_count);
	inode->i_size = le32_to_cpu(raw->i_size);
	inode->i_atime.tv_sec = (signed)le32_to_cpu(raw->i_atime);
	inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw->i_ctime);
	inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw->i_mtime);
	inode->i_ctime.tv_nsec = 0;
	inode->i_mtime.tv_nsec = 0;
	inode->i_atime.tv_nsec = 0;
	info->i_dtime = le32_to_cpu(raw->i_dtime);

	/*
	 * We now have enough fields to check if the inode was active or not.
	 * This is needed because nfsd might try to access dead inodes
	 * the test is that same one that e2fsck uses
	 * NeilBrown 1999oct15
	 */
	if (inode->i_nlink == 0 && (inode->i_mode == 0 || info->i_dtime)) {
		/* this inode is deleted */
		brelse(bh);
		err = -ESTALE;
		goto bad_inode;
	}

	inode->i_blocks = le32_to_cpu(raw->i_blocks);
	info->i_flags = le32_to_cpu(raw->i_flags);
	info->i_faddr = le32_to_cpu(raw->i_faddr);
	info->i_frag_no = raw->i_frag;
	info->i_frag_size = raw->i_fsize;
	info->i_file_acl = le32_to_cpu(raw->i_file_acl);
	info->i_dir_acl = 0;
	if (S_ISREG(inode->i_mode)) {
		/* Regular files: i_size_high supplies the upper 32 bits of the size. */
		inode->i_size |= ((__u64)le32_to_cpu(raw->i_size_high)) << 32;
	} else {
		info->i_dir_acl = le32_to_cpu(raw->i_dir_acl);
	}
	info->i_dtime = 0;
	inode->i_generation = le32_to_cpu(raw->i_generation);
	info->i_state = 0;
	info->i_block_group = (ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb);
	info->i_dir_start_lookup = 0;

	/*
	 * NOTE! The in-memory inode i_data array is in little-endian order
	 * even on big-endian machines: we do NOT byteswap the block numbers!
	 */
	for (i = 0; i < EXT2_N_BLOCKS; i++)
		info->i_data[i] = raw->i_block[i];

	/*
	 * ---- The part this walkthrough cares about starts here ----
	 * inode->i_fop, i.e. the inode's file_operations table, is chosen
	 * below according to the file type.
	 */
	if (S_ISREG(inode->i_mode)) {
		/*
		 * Regular file: i_fop is ext2_file_operations, except in the
		 * XIP case where it is ext2_xip_file_operations.
		 */
		inode->i_op = &ext2_file_inode_operations;
		if (ext2_use_xip(inode->i_sb)) {
			/* XIP case; not examined in this walkthrough. */
			inode->i_mapping->a_ops = &ext2_aops_xip;
			inode->i_fop = &ext2_xip_file_operations;
		} else if (test_opt(inode->i_sb, NOBH)) {
			inode->i_mapping->a_ops = &ext2_nobh_aops;
			inode->i_fop = &ext2_file_operations;
		} else {
			inode->i_mapping->a_ops = &ext2_aops;
			inode->i_fop = &ext2_file_operations;
		}
	} else if (S_ISDIR(inode->i_mode)) {
		/* Directory: i_fop is ext2_dir_operations. */
		inode->i_op = &ext2_dir_inode_operations;
		inode->i_fop = &ext2_dir_operations;
		if (test_opt(inode->i_sb, NOBH)) {
			inode->i_mapping->a_ops = &ext2_nobh_aops;
		} else {
			inode->i_mapping->a_ops = &ext2_aops;
		}
	} else if (S_ISLNK(inode->i_mode)) {
		/* Symbolic link: no i_fop is assigned in this branch. */
		if (ext2_inode_is_fast_symlink(inode)) {
			inode->i_op = &ext2_fast_symlink_inode_operations;
		} else {
			inode->i_op = &ext2_symlink_inode_operations;
			if (test_opt(inode->i_sb, NOBH)) {
				inode->i_mapping->a_ops = &ext2_nobh_aops;
			} else {
				inode->i_mapping->a_ops = &ext2_aops;
			}
		}
	} else {
		/*
		 * Anything else: init_special_inode() gets a device number
		 * decoded from i_block[0] when that is non-zero, otherwise
		 * from i_block[1].
		 */
		inode->i_op = &ext2_special_inode_operations;
		if (raw->i_block[0]) {
			init_special_inode(inode, inode->i_mode,
				old_decode_dev(le32_to_cpu(raw->i_block[0])));
		} else {
			init_special_inode(inode, inode->i_mode,
				new_decode_dev(le32_to_cpu(raw->i_block[1])));
		}
	}
	/* ---- End of the part this walkthrough cares about ---- */

	brelse(bh);
	ext2_set_inode_flags(inode);
	unlock_new_inode(inode);
	return inode;

bad_inode:
	iget_failed(inode);
	return ERR_PTR(err);
}
上面一段代码根据文件类型给inode中的file_operations(inode->i_fop)赋值:普通文件是ext2_file_operations(使用XIP时是ext2_xip_file_operations),目录是ext2_dir_operations。
打开文件用sys_open(),在fs/open.c文件中,函数调用流程如下:
sys_open() --> do_sys_open() --> do_filp_open() --> nameidata_to_filp() --> __dentry_open()
/*
 * Finish setting up the struct file @f for @dentry on mount @mnt.
 *
 * @dentry: dentry being opened; its d_inode supplies i_mapping and i_fop
 * @mnt:    mount stored into f->f_path.mnt
 * @flags:  open flags; stored in f->f_flags and used to derive f->f_mode
 * @f:      the file structure to fill in
 * @open:   open routine to call; when NULL, f->f_op->open is used if f_op
 *          is set
 *
 * Returns @f on success.  Returns ERR_PTR(-EINVAL), after fput(f), when
 * O_DIRECT is set but the mapping has no a_ops or its a_ops provides
 * neither direct_IO nor get_xip_mem.  On any earlier failure the cleanup
 * labels undo the work done so far and ERR_PTR(err) is returned.
 */
static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
				  int flags, struct file *f,
				  int (*open)(struct inode *, struct file *))
{
	struct inode *inode;
	int err;

	f->f_flags = flags;
	f->f_mode = ((flags + 1) & O_ACCMODE) | FMODE_LSEEK |
		    FMODE_PREAD | FMODE_PWRITE;
	inode = dentry->d_inode;
	if (f->f_mode & FMODE_WRITE) {
		err = __get_file_write_access(inode, mnt);
		if (err)
			goto cleanup_file;
		if (!special_file(inode->i_mode))
			file_take_write(f);
	}

	f->f_mapping = inode->i_mapping;
	f->f_path.dentry = dentry;
	f->f_path.mnt = mnt;
	f->f_pos = 0;
	/* Hand the inode's file_operations table over to the file. */
	f->f_op = fops_get(inode->i_fop);
	file_move(f, &inode->i_sb->s_files);

	err = security_dentry_open(f);
	if (err)
		goto cleanup_all;

	/* No explicit open routine given: fall back to the one in f_op. */
	if (!open && f->f_op)
		open = f->f_op->open;
	if (open) {
		err = open(inode, f);
		if (err)
			goto cleanup_all;
	}

	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);

	file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);

	/* NB: we're sure to have correct a_ops only after f_op->open */
	if ((f->f_flags & O_DIRECT) &&
	    (!f->f_mapping->a_ops ||
	     (!f->f_mapping->a_ops->direct_IO &&
	      !f->f_mapping->a_ops->get_xip_mem))) {
		fput(f);
		f = ERR_PTR(-EINVAL);
	}

	return f;

cleanup_all:
	fops_put(f->f_op);
	if (f->f_mode & FMODE_WRITE) {
		put_write_access(inode);
		if (!special_file(inode->i_mode)) {
			/*
			 * We don't consider this a real
			 * mnt_want/drop_write() pair
			 * because it all happened right
			 * here, so just reset the state.
			 */
			file_reset_write(f);
			mnt_drop_write(mnt);
		}
	}
	file_kill(f);
	f->f_path.dentry = NULL;
	f->f_path.mnt = NULL;
cleanup_file:
	put_filp(f);
	dput(dentry);
	mntput(mnt);
	return ERR_PTR(err);
}
在这儿,f->f_op = fops_get(inode->i_fop); 把file结构中的file_operations函数集赋值成inode中的函数集;对目录来说,也就是ext2_dir_operations(普通文件才是ext2_file_operations)。
下面归纳下ls执行的整个流程:
假设当前目录在ext2文件系统上,ls要查看当前目录下的文件,
1.open打开当前目录的句柄,这个句柄对应内核中一个file结构。
file结构中的file_operations函数集从inode结构中获得,因为打开的是目录,所以就是ext2_dir_operations
2.getdents64调用file->f_op->readdir()实际上是调用了ext2_dir_operations中的readdir(),
由ext2文件系统驱动读取当前目录下面的文件项。
我们要隐藏一个文件,要做的就是替换file->f_op->readdir(),也就是替换ext2_dir_operations中的readdir()。