Docker 实现容器功能依赖 Linux 内核中的三个特性:Namespace、Cgroup、UnionFS,今天我们来说一下 UnionFS。
Linux 内核中 UnionFS(联合文件系统)的实现是 overlay 文件系统(OverlayFS)。
OverlayFS 文件系统分为三层:
lower 是只读层
upper 是可读写层
merged 是 lower 和 upper 合并后呈现的目录
挂载方式可以使用mount 命令挂载:
mount -t overlay overlay -o lowerdir=lower1:lower2,upperdir=upper,workdir=work merged
当我们执行下面的挂载命令时:
mount -t overlay overlay -o lowerdir=lower1:lower2,upperdir=upper,workdir=work merged
内核会使用名为 overlay 的文件系统类型。在 Linux 内核层,overlay 文件系统类型结构体的声明如下:
/*
 * Filesystem type descriptor for overlay.  The name "overlay" is the type
 * string given to `mount -t overlay`, and .mount points at ovl_mount().
 */
static struct file_system_type ovl_fs_type = {
	.name		= "overlay",
	.owner		= THIS_MODULE,
	/* NOTE(review): flag semantics are defined outside this file */
	.fs_flags	= FS_USERNS_MOUNT,
	.mount		= ovl_mount,
	.kill_sb	= kill_anon_super,
};
当我们挂载 overlay 文件系统的时候,内核会调用该结构体上的 .mount 函数指针,也就是内核中的 ovl_mount:
/*
 * Mount entry point referenced by ovl_fs_type.mount.
 *
 * Delegates to mount_nodev(), handing it ovl_fill_super() as the callback
 * that fills in the superblock.  @dev_name is not used here.
 *
 * Returns whatever mount_nodev() returns.
 */
static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags,
				const char *dev_name, void *raw_data)
{
	struct dentry *root;

	root = mount_nodev(fs_type, flags, raw_data, ovl_fill_super);

	return root;
}
核心是 ovl_fill_super:ovl_mount 把它作为回调传给 mount_nodev,由它来填充 overlay 文件系统的超级块。它会申请一个 ovl_fs,填充之后保存到超级块的 s_fs_info 中:
sb->s_fs_info = ofs;
详细代码:
/*
 * Fill in the superblock of an overlay mount.
 *
 * @sb:     superblock being set up
 * @data:   mount options string (cast to char * and handed to ovl_parse_opt())
 * @silent: when non-zero, the "missing 'lowerdir'" message is suppressed
 *
 * Allocates a struct ovl_fs, parses the mount options into ofs->config,
 * sets up the upper layer and workdir (only if upperdir was given), builds
 * the lower stack, and finally creates the root dentry and stores it in
 * sb->s_root.
 *
 * Returns 0 on success or a negative errno.  The error code is assigned to
 * err *before* each check, so every "goto" leaves with the right value; the
 * labels at the bottom release whatever was acquired so far.
 */
static int ovl_fill_super(struct super_block *sb, void *data, int silent)
{
struct path upperpath = { };
struct dentry *root_dentry;
struct ovl_entry *oe;
struct ovl_fs *ofs;
struct ovl_layer *layers;
struct cred *cred;
char *splitlower = NULL;
unsigned int numlower;
int err;
// Fail with -EIO (and WARN) if the superblock's user namespace is not the
// current user namespace
err = -EIO;
if (WARN_ON(sb->s_user_ns != current_user_ns()))
goto out;
// Install the overlay dentry operations on the superblock
sb->s_d_op = &ovl_dentry_operations;
err = -ENOMEM;
// Allocate a zeroed struct ovl_fs; it is filled in below
ofs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL);
if (!ofs)
goto out;
err = -ENOMEM;
// Keep the prepared credentials both in ofs->creator_cred and in the local
// cred (the local is used further down by cap_lower())
ofs->creator_cred = cred = prepare_creds();
if (!cred)
goto out_err;
/* Is there a reason anyone would want not to share whiteouts? */
ofs->share_whiteout = true;
// Seed the config with defaults before the mount options are parsed
ofs->config.index = ovl_index_def;
ofs->config.uuid = true;
ofs->config.nfs_export = ovl_nfs_export_def;
ofs->config.xino = ovl_xino_def();
ofs->config.metacopy = ovl_metacopy_def;
// Parse the mount options string into ofs->config
err = ovl_parse_opt((char *) data, &ofs->config);
if (err)
goto out_err;
// lowerdir is mandatory
err = -EINVAL;
if (!ofs->config.lowerdir) {
if (!silent)
pr_err("missing 'lowerdir'\n");
goto out_err;
}
// Work on a private copy of the lowerdir string; ovl_split_lowerdirs()
// receives the copy and returns the number of lower directories
err = -ENOMEM;
splitlower = kstrdup(ofs->config.lowerdir, GFP_KERNEL);
if (!splitlower)
goto out_err;
err = -EINVAL;
numlower = ovl_split_lowerdirs(splitlower);
if (numlower > OVL_MAX_STACK) {
pr_err("too many lower directories, limit is %d\n",
OVL_MAX_STACK);
goto out_err;
}
// One layer slot per lower directory plus one for the upper layer
err = -ENOMEM;
layers = kcalloc(numlower + 1, sizeof(struct ovl_layer), GFP_KERNEL);
if (!layers)
goto out_err;
ofs->layers = layers;
/* Layer 0 is reserved for upper even if there's no upper */
ofs->numlayer = 1;
sb->s_stack_depth = 0;
sb->s_maxbytes = MAX_LFS_FILESIZE;
atomic_long_set(&ofs->last_ino, 1);
/* Assume underlying fs uses 32bit inodes unless proven otherwise */
if (ofs->config.xino != OVL_XINO_OFF) {
ofs->xino_mode = BITS_PER_LONG - 32;
if (!ofs->xino_mode) {
pr_warn("xino not supported on 32bit kernel, falling back to xino=off.\n");
ofs->config.xino = OVL_XINO_OFF;
}
}
/* alloc/destroy_inode needed for setting up traps in inode cache */
sb->s_op = &ovl_super_operations;
// The upper layer is optional, but when upperdir is given workdir is required
if (ofs->config.upperdir) {
struct super_block *upper_sb;
err = -EINVAL;
if (!ofs->config.workdir) {
pr_err("missing 'workdir'\n");
goto out_err;
}
// Set up layer 0 (upper) and fill upperpath
err = ovl_get_upper(sb, ofs, &layers[0], &upperpath);
if (err)
goto out_err;
upper_sb = ovl_upper_mnt(ofs)->mnt_sb;
// When ovl_should_sync() is false (the "volatile" case named in the message
// below), sample the upper sb's writeback error state and refuse to mount
// if errseq_check() reports an error against that sample
if (!ovl_should_sync(ofs)) {
ofs->errseq = errseq_sample(&upper_sb->s_wb_err);
if (errseq_check(&upper_sb->s_wb_err, ofs->errseq)) {
err = -EIO;
pr_err("Cannot mount volatile when upperdir has an unseen error. Sync upperdir fs to clear state.\n");
goto out_err;
}
}
err = ovl_get_workdir(sb, ofs, &upperpath);
if (err)
goto out_err;
// No usable workdir after ovl_get_workdir() succeeded: mount read-only
if (!ofs->workdir)
sb->s_flags |= SB_RDONLY;
// Take stack depth and timestamp granularity from the upper superblock
sb->s_stack_depth = upper_sb->s_stack_depth;
sb->s_time_gran = upper_sb->s_time_gran;
}
// Build the lower stack; the result is an error pointer on failure
oe = ovl_get_lowerstack(sb, splitlower, numlower, ofs, layers);
err = PTR_ERR(oe);
if (IS_ERR(oe))
goto out_err;
/* If the upper fs is nonexistent, we mark overlayfs r/o too */
if (!ovl_upper_mnt(ofs))
sb->s_flags |= SB_RDONLY;
if (!ofs->config.uuid && ofs->numfs > 1) {
pr_warn("The uuid=off requires a single fs for lower and upper, falling back to uuid=on.\n");
ofs->config.uuid = true;
}
// The index directory is only set up when the mount is not forced
// read-only and the index option is enabled
if (!ovl_force_readonly(ofs) && ofs->config.index) {
err = ovl_get_indexdir(sb, ofs, oe, &upperpath);
if (err)
goto out_free_oe;
/* Force r/o mount with no index dir */
if (!ofs->indexdir)
sb->s_flags |= SB_RDONLY;
}
err = ovl_check_overlapping_layers(sb, ofs);
if (err)
goto out_free_oe;
/* Show index=off in /proc/mounts for forced r/o mount */
if (!ofs->indexdir) {
ofs->config.index = false;
if (ovl_upper_mnt(ofs) && ofs->config.nfs_export) {
pr_warn("NFS export requires an index dir, falling back to nfs_export=off.\n");
ofs->config.nfs_export = false;
}
}
if (ofs->config.metacopy && ofs->config.nfs_export) {
pr_warn("NFS export is not supported with metadata only copy up, falling back to nfs_export=off.\n");
ofs->config.nfs_export = false;
}
// Export operations are installed only if nfs_export survived the
// fallbacks above
if (ofs->config.nfs_export)
sb->s_export_op = &ovl_export_operations;
/* Never override disk quota limits or use reserved space */
cap_lower(cred->cap_effective, CAP_SYS_RESOURCE);
sb->s_magic = OVERLAYFS_SUPER_MAGIC;
// Pick the xattr handler table according to the userxattr option
sb->s_xattr = ofs->config.userxattr ? ovl_user_xattr_handlers :
ovl_trusted_xattr_handlers;
// Attach the overlay-private data to the superblock
sb->s_fs_info = ofs;
sb->s_flags |= SB_POSIXACL;
sb->s_iflags |= SB_I_SKIP_SYNC;
// Build the overlay root dentry from the upper dentry and the lower stack
// entry (oe); a NULL result is reported as -ENOMEM.
// NOTE(review): presumably ovl_get_root() also creates the root inode — the
// helper is not visible here, confirm.
err = -ENOMEM;
root_dentry = ovl_get_root(sb, upperpath.dentry, oe);
if (!root_dentry)
goto out_free_oe;
// Success: only the mount reference of upperpath is dropped here (the
// error path uses path_put() on the whole path instead), and the private
// lowerdir copy is freed
mntput(upperpath.mnt);
kfree(splitlower);
sb->s_root = root_dentry;
return 0;
// Error unwinding: out_free_oe falls through into out_err, which falls
// through into out
out_free_oe:
ovl_entry_stack_free(oe);
kfree(oe);
out_err:
kfree(splitlower);
path_put(&upperpath);
ovl_free_fs(ofs);
out:
return err;
}
overlay 文件系统的目录项(dentry)操作结构体实现(即上面 sb->s_d_op 指向的结构体):
/*
 * Dentry operations for overlay; ovl_fill_super() installs this table
 * through sb->s_d_op.
 */
static const struct dentry_operations ovl_dentry_operations = {
	.d_revalidate		= ovl_dentry_revalidate,
	.d_weak_revalidate	= ovl_dentry_weak_revalidate,
	.d_real			= ovl_d_real,
	.d_release		= ovl_dentry_release,
};
数据结构图:
参考网址:
Linux源码剖析——OverlayFS 源码分析_linux overlay-CSDN博客
如果你写过内核模块(kernel module),读过《Linux 内核设计与实现》,就很容易理解了。
目录的文件操作结构体(file_operations)定义:
/*
 * File operations for overlay directories.  .open and .iterate point at
 * ovl_dir_open() and ovl_iterate().
 */
const struct file_operations ovl_dir_operations = {
	.open		= ovl_dir_open,
	.release	= ovl_dir_release,
	.read		= generic_read_dir,
	.iterate	= ovl_iterate,
	.llseek		= ovl_dir_llseek,
	.fsync		= ovl_dir_fsync,
};
当我们通过 Linux 系统调用打开 overlay 文件系统中的目录时,会调用这个操作结构体中对应的函数。
open 函数:
static int ovl_dir_open(struct inode *inode, struct file *file)
{
struct path realpath;
struct file *realfile;
struct ovl_dir_file *od;
enum ovl_path_type type;
od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
if (!od)
return -ENOMEM;
type = ovl_path_real(file->f_path.dentry, &realpath);
realfile = ovl_dir_open_realfile(file, &realpath);
if (IS_ERR(realfile)) {
kfree(od);
return PTR_ERR(realfile);
}
od->realfile = realfile;
od->is_real = ovl_dir_is_real(file->f_path.dentry);
od->is_upper = OVL_TYPE_UPPER(type);
file->private_data = od;
return 0;
}
/*
 * Per-open state of an overlay directory.  Allocated in ovl_dir_open() and
 * kept in file->private_data.
 */
struct ovl_dir_file {
bool is_real; // set from ovl_dir_is_real(); when true, ovl_iterate() reads the real directory directly and skips the cache
bool is_upper; // set from OVL_TYPE_UPPER() applied to the path type returned by ovl_path_real()
struct ovl_dir_cache *cache; // directory cache, fetched on first use via ovl_cache_get() in ovl_iterate()
struct list_head *cursor; // current position while walking cache->entries
struct file *realfile; // file returned by ovl_dir_open_realfile()
struct file *upperfile; // not referenced in the code shown here; presumably the opened upper directory — TODO confirm
};
这里主要做的操作是初始化 ovl_dir_file,并且把它保存到 file 的私有数据指针 private_data 中。
读取目录的操作是通过 getdents 系统调用完成的,对应操作结构体中的 .iterate 回调,我们来看这个迭代器 ovl_iterate:
/*
 * Directory iteration handler for overlay directories
 * (ovl_dir_operations.iterate).
 *
 * @file: open overlay directory; file->private_data holds the
 *        struct ovl_dir_file set up by ovl_dir_open()
 * @ctx:  iteration context; ctx->pos is the current position
 *
 * For a "real" directory the entries come straight from the real file.
 * Otherwise they are served from the directory cache, skipping whiteout
 * entries.
 *
 * Returns 0 on success or a negative error from one of the helpers.  The
 * credentials overridden at the start are restored on every exit path.
 */
static int ovl_iterate(struct file *file, struct dir_context *ctx)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct ovl_cache_entry *p;
const struct cred *old_cred;
int err;
// Switch credentials for the duration of the call; undone at "out"
old_cred = ovl_override_creds(dentry->d_sb);
// Starting from position 0: reset the directory state first
if (!ctx->pos)
ovl_dir_reset(file);
// Does this directory read straight from the real directory?
if (od->is_real) {
// No merging needed: read the real directory directly
/*
 * If parent is merge, then need to adjust d_ino for '..', if
 * dir is impure then need to adjust d_ino for copied up
 * entries.
 */
if (ovl_xino_bits(dentry->d_sb) ||
(ovl_same_fs(dentry->d_sb) &&
(ovl_is_impure_dir(file) ||
OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) {
err = ovl_iterate_real(file, ctx);
} else {
err = iterate_dir(od->realfile, ctx);
}
goto out;
}
// Fetch the directory cache on first use and move the cursor to ctx->pos
if (!od->cache) {
struct ovl_dir_cache *cache;
cache = ovl_cache_get(dentry);
err = PTR_ERR(cache);
if (IS_ERR(cache))
goto out;
od->cache = cache;
ovl_seek_cursor(od, ctx->pos);
}
// Walk the cached entries from the cursor and emit them to the caller
while (od->cursor != &od->cache->entries) {
p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
// Entries without an inode number get one resolved before being emitted
if (!p->is_whiteout) {
if (!p->ino) {
err = ovl_cache_update_ino(&file->f_path, p);
if (err)
goto out;
}
}
/* ovl_cache_update_ino() sets is_whiteout on stale entry */
if (!p->is_whiteout) {
// Stop without advancing when dir_emit() returns false, so this entry is
// retried on the next call (presumably the caller's buffer is full)
if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
break;
}
// Advance the cursor and the position together (whiteouts count too)
od->cursor = p->l_node.next;
ctx->pos++;
}
err = 0;
out:
revert_creds(old_cred);
return err;
}