PMFS支持多粒度的文件组织方式,包括1G, 2M, 4K 三种粒度。
文件默认为4K大小的页,通过调用 pmfs_fallocate()可以设置文件页大小。
pmfs_fallocate:
/*
 * pmfs_fallocate - preallocate blocks for the byte range [offset, offset + len).
 * @file:   open file whose inode receives the blocks
 * @mode:   0 or FALLOC_FL_KEEP_SIZE; any other bit yields -EOPNOTSUPP
 * @offset: start of the range, in bytes
 * @len:    length of the range, in bytes
 *
 * Besides allocating blocks, this is where the per-file block size is chosen:
 * pmfs_set_blocksize_hint() is called with the end of the requested range.
 *
 * Return: 0 on success, -EOPNOTSUPP for unsupported mode bits, -ENODEV for a
 * directory, -EACCES when the PMFS inode cannot be looked up, the error from
 * inode_newsize_ok() or pmfs_new_transaction(), or the result of
 * pmfs_alloc_blocks().
 */
static long pmfs_fallocate(struct file *file, int mode, loff_t offset,
			    loff_t len)
{
	struct inode *inode = file->f_path.dentry->d_inode;
	struct super_block *sb = inode->i_sb;
	const int keep_size = (mode & FALLOC_FL_KEEP_SIZE) != 0;
	struct pmfs_inode *pi;
	pmfs_transaction_t *trans;
	unsigned long first_blk, in_blk_off;
	int nr_blks, blk_mask;
	loff_t end_pos;
	long ret = 0;

	/* We only support the FALLOC_FL_KEEP_SIZE mode */
	if (mode & ~FALLOC_FL_KEEP_SIZE)
		return -EOPNOTSUPP;

	if (S_ISDIR(inode->i_mode))
		return -ENODEV;

	mutex_lock(&inode->i_mutex);

	end_pos = len + offset;
	if (!keep_size && end_pos > inode->i_size) {
		/* The file is about to grow: let the VFS validate the new size. */
		ret = inode_newsize_ok(inode, end_pos);
		if (ret)
			goto out;
	}

	pi = pmfs_get_inode(sb, inode->i_ino);
	if (!pi) {
		ret = -EACCES;
		goto out;
	}

	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES +
				     MAX_METABLOCK_LENTRIES);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out;
	}
	pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);

	/* Set the block size hint: the block type is derived from the end offset. */
	pmfs_set_blocksize_hint(sb, pi, end_pos);

	/* Convert the byte range into a starting block and a block count. */
	blk_mask = sb->s_blocksize - 1;
	first_blk = offset >> sb->s_blocksize_bits;
	in_blk_off = offset & blk_mask;
	nr_blks = (in_blk_off + len + blk_mask) >> sb->s_blocksize_bits;

	ret = pmfs_alloc_blocks(trans, inode, first_blk, nr_blks, true);

	inode->i_ctime = CURRENT_TIME_SEC;
	inode->i_mtime = inode->i_ctime;

	/* The inode fields below are written between memunlock and memlock. */
	pmfs_memunlock_inode(sb, pi);
	if (ret || keep_size)
		pi->i_flags |= cpu_to_le32(PMFS_EOFBLOCKS_FL);

	/*
	 * NOTE(review): the size is extended even when pmfs_alloc_blocks()
	 * returned an error, and the transaction is committed regardless.
	 * Confirm this is intended.
	 */
	if (!keep_size && end_pos > inode->i_size) {
		inode->i_size = end_pos;
		pi->i_size = cpu_to_le64(inode->i_size);
	}
	pi->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
	pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
	pmfs_memlock_inode(sb, pi);

	pmfs_commit_transaction(sb, trans);

out:
	mutex_unlock(&inode->i_mutex);
	return ret;
}
pmfs_set_blocksize_hint:
/*
 * pmfs_set_blocksize_hint - choose a file's block type from its expected size.
 * @sb:       super block the inode belongs to
 * @pi:       PMFS inode whose i_blk_type is updated
 * @new_size: expected file size in bytes
 *
 * A size of at least 1G selects PMFS_BLOCK_TYPE_1G, at least 2M selects
 * PMFS_BLOCK_TYPE_2M, anything smaller selects PMFS_BLOCK_TYPE_4K.  Nothing is
 * changed when pmfs_can_set_blocksize_hint() refuses the update.
 *
 * Return: always 0.
 */
int pmfs_set_blocksize_hint(struct super_block *sb, struct pmfs_inode *pi,
		loff_t new_size)
{
	unsigned short block_type;

	if (!pmfs_can_set_blocksize_hint(pi, new_size))
		return 0;

	if (new_size >= 0x40000000)		/* 1G */
		block_type = PMFS_BLOCK_TYPE_1G;
	else if (new_size >= 0x200000)		/* 2M */
		block_type = PMFS_BLOCK_TYPE_2M;
	else					/* defaulting to 4K */
		block_type = PMFS_BLOCK_TYPE_4K;

	/*
	 * pi->i_size is stored little-endian (it is written with cpu_to_le64),
	 * so convert it before printing, exactly as is done for pi->root.
	 */
	pmfs_dbg_verbose(
		"Hint: new_size 0x%llx, i_size 0x%llx, root 0x%llx\n",
		new_size, le64_to_cpu(pi->i_size), le64_to_cpu(pi->root));
	pmfs_dbg_verbose("Setting the hint to 0x%x\n", block_type);

	/* The inode field is written between memunlock and memlock. */
	pmfs_memunlock_inode(sb, pi);
	pi->i_blk_type = block_type;
	pmfs_memlock_inode(sb, pi);

	return 0;
}
内存的空间管理:默认块大小为4KB。例如128G大小的空间,共有 128 * 1024 * 1024 / 4 个4KB的块,文件用多少块就分配多少块。那么文件系统是怎么管理这些块的分配与回收的呢?如何知道哪些块正在使用、哪些没有被使用呢?
/*
 * Per-superblock PMFS state (excerpt: the dotted lines stand for fields that
 * are not shown here).
 */
struct pmfs_sb_info {
...................
/* Head of the list of pmfs_blocknode ranges describing the blocks in use. */
struct list_head block_inuse_head;
. ...................
};
/* Doubly linked list linkage: embedded in every object that sits on a list. */
struct list_head {
	struct list_head *next;	/* following entry */
	struct list_head *prev;	/* preceding entry */
};
简单的说,分配的过程就是从这个链表上按顺序去找链表上没有的块,找到就给插入上去,完成分配物理块的操作。
回收的过程就是,去该链表上去找到对应的块,从链表上去掉,完成回收物理块的操作。
例如128G的内存,就有 128 * 1024 * 1024 / 4 个块。如果每个块都作为一个结点挂在链表上,那么每次分配、回收时遍历链表所花的时间将非常大。这个问题又是怎么解决的呢?
/*
 * One node of the in-use block list: a run of consecutive block numbers
 * [block_low, block_high], both ends inclusive, that are currently allocated.
 */
struct pmfs_blocknode {
/* Linkage into the list headed by pmfs_sb_info.block_inuse_head. */
struct list_head link;
/* First block number of the run. */
unsigned long block_low;
/* Last block number of the run (inclusive). */
unsigned long block_high;
};
比如现在分配了1024个块,块号分别是0 ——511,513——1024,中间块号为512的块没有分配。
如果没有pmfs_blocknode来组织,那么链表block_inuse_head上就会有1024个结点,下次再分配或者释放的时候就要遍历很多结点。
现在有了pmfs_blocknode,链表block_inuse_head上只需要两个pmfs_blocknode结点就可以了。
pmfs_blocknode 1(0,511),pmfs_blocknode 2(513,1024)。
这样就大大减少了链表结点的数量,减小了遍历链表的时间。
下面结合代码具体讲块的分配与回收。
因为这里维护的是已使用块(in-use block)的链表,所以分配块就是按顺序在链表中找空隙,找到空隙后返回对应的blocknr,并把该块号并入相邻的pmfs_blocknode(必要时新建一个pmfs_blocknode)。
/*
 * pmfs_new_block - allocate one block of type @btype.
 * @sb:      super block to allocate from
 * @blocknr: out parameter; receives the first block number of the new block
 * @btype:   block type; pmfs_get_numblocks() turns it into the number of
 *           consecutive block numbers to reserve (the alignment arithmetic
 *           below assumes that number is a power of two)
 * @zero:    non-zero to zero-fill the newly allocated block
 *
 * The in-use list is walked in order under sbi->s_lock.  For every node the
 * gap between its end and the start of the following node (or sbi->block_end
 * after the last node) is examined; the first gap that can hold a suitably
 * aligned run is used and the run is recorded in the list.
 *
 * Return: 0 on success, -ENOSPC when no gap fits or a list node could not be
 * allocated.
 */
int pmfs_new_block(struct super_block *sb, unsigned long *blocknr,
	unsigned short btype, int zero)
{
	struct pmfs_sb_info *sbi = PMFS_SB(sb);
	struct list_head *head = &(sbi->block_inuse_head);
	struct pmfs_blocknode *cur, *succ;
	struct pmfs_blocknode *spare = NULL;
	struct pmfs_blocknode *fresh;
	unsigned long count = pmfs_get_numblocks(btype);
	unsigned long gap_end;
	unsigned long lo;
	unsigned long hi;
	bool found = false;
	int errval = 0;
	void *bp;

	mutex_lock(&sbi->s_lock);

	list_for_each_entry(cur, head, link) {
		unsigned long gap_start;

		if (cur->link.next == head) {
			/* cur is the last node: the gap runs up to block_end. */
			succ = NULL;
			gap_end = sbi->block_end;
		} else {
			succ = list_entry(cur->link.next, typeof(*cur), link);
			gap_end = succ->block_low;
		}

		/*
		 * First aligned run after cur.  With count == 1 lo equals hi;
		 * with count == 512 hi is lo + 511.
		 */
		lo = (cur->block_high + count) & ~(count - 1);
		hi = lo + count - 1;

		if (hi >= gap_end) {
			/* Does not fit - skip to next blocknode */
			continue;
		}

		/*
		 * From here on hi < gap_end, so hi is either exactly
		 * gap_end - 1 (touches the right neighbour) or smaller.
		 */
		gap_start = cur->block_high + 1;

		if (lo == gap_start) {
			if (hi == gap_end - 1 && succ) {
				/* Gap filled completely: merge cur and succ. */
				cur->block_high = succ->block_high;
				list_del(&succ->link);
				spare = succ;
				sbi->num_blocknode_allocated--;
			} else {
				/* Aligns to left: simply grow cur. */
				cur->block_high = hi;
			}
			found = true;
			break;
		}

		if (lo > gap_start) {
			if (hi == gap_end - 1 && succ) {
				/* Aligns to right: grow succ downwards. */
				succ->block_low = lo;
			} else {
				/*
				 * Free space stays on the left (and possibly on
				 * the right): the run needs a node of its own,
				 * inserted right after cur.
				 */
				fresh = pmfs_alloc_blocknode(sb);
				PMFS_ASSERT(fresh);
				if (fresh == NULL) {
					errval = -ENOSPC;
					break;
				}
				fresh->block_low = lo;
				fresh->block_high = hi;
				list_add(&fresh->link, &cur->link);
			}
			found = true;
			break;
		}
	}

	if (found)
		sbi->num_free_blocks -= count;

	mutex_unlock(&sbi->s_lock);

	/* A node removed by a merge is released outside the lock. */
	if (spare)
		__pmfs_free_blocknode(spare);

	if (!found)
		return -ENOSPC;

	if (zero) {
		/* Size in bytes of one block of this type. */
		size_t size = (btype == PMFS_BLOCK_TYPE_4K) ? (0x1 << 12) :
			      (btype == PMFS_BLOCK_TYPE_2M) ? (0x1 << 21) :
							      (0x1 << 30);

		bp = pmfs_get_block(sb, pmfs_get_block_off(sb, lo, btype));
		pmfs_memunlock_block(sb, bp); /* TBDTBD: Need to fix this */
		memset_nt(bp, 0, size);
		pmfs_memlock_block(sb, bp);
	}

	*blocknr = lo;
	return errval;
}
块回收就是从block_inuse_head链表上将对应的块号给去掉。
/*
 * __pmfs_free_block - remove one block of type @btype from the in-use list.
 * @sb:         super block the block belongs to
 * @blocknr:    first block number of the block being released
 * @btype:      block type; pmfs_get_numblocks() turns it into the number of
 *              consecutive block numbers that are released
 * @start_hint: optional in/out search cursor.  When *start_hint is set and
 *              does not start beyond @blocknr, the scan begins there instead
 *              of at the list head; on success it is updated to the node at
 *              or after the freed range, so a caller freeing blocks in
 *              ascending order avoids rescanning from the head each time.
 *
 * The released range [blocknr, blocknr + n - 1] must lie inside a single
 * in-use node.  That node is deleted, trimmed on one side, or split in two.
 * If no node contains the range an error is reported through pmfs_error_mng().
 * If the extra node needed for a split cannot be allocated, the function
 * returns without freeing the block.
 *
 * No lock is taken here.  pmfs_new_block() modifies the same list under
 * sbi->s_lock, so callers presumably hold that lock - confirm at call sites.
 */
void __pmfs_free_block(struct super_block *sb, unsigned long blocknr,
	unsigned short btype, struct pmfs_blocknode **start_hint)
{
	struct pmfs_sb_info *sbi = PMFS_SB(sb);
	struct list_head *head = &(sbi->block_inuse_head);
	unsigned long new_block_low;
	unsigned long new_block_high;
	unsigned long num_blocks;
	struct pmfs_blocknode *i;
	struct pmfs_blocknode *free_blocknode = NULL;
	struct pmfs_blocknode *curr_node;

	/* The block type determines how many block numbers are released. */
	num_blocks = pmfs_get_numblocks(btype);
	new_block_low = blocknr;
	new_block_high = blocknr + num_blocks - 1;

	BUG_ON(list_empty(head));

	/* Resume from the caller's cursor when it is usable. */
	if (start_hint && *start_hint &&
	    new_block_low >= (*start_hint)->block_low)
		i = *start_hint;
	else
		i = list_first_entry(head, typeof(*i), link);

	list_for_each_entry_from(i, head, link) {
		if (new_block_low > i->block_high) {
			/* skip to next blocknode */
			continue;
		}

		if ((new_block_low == i->block_low) &&
		    (new_block_high == i->block_high)) {
			/* fits entire datablock: the node disappears */
			if (start_hint)
				*start_hint = pmfs_next_blocknode(i, head);
			list_del(&i->link);
			free_blocknode = i;
			sbi->num_blocknode_allocated--;
			sbi->num_free_blocks += num_blocks;
			goto block_found;
		}

		if ((new_block_low == i->block_low) &&
		    (new_block_high < i->block_high)) {
			/* Aligns left: trim the front of the node */
			i->block_low = new_block_high + 1;
			sbi->num_free_blocks += num_blocks;
			if (start_hint)
				*start_hint = i;
			goto block_found;
		}

		if ((new_block_low > i->block_low) &&
		    (new_block_high == i->block_high)) {
			/* Aligns right: trim the tail of the node */
			i->block_high = new_block_low - 1;
			sbi->num_free_blocks += num_blocks;
			if (start_hint)
				*start_hint = pmfs_next_blocknode(i, head);
			goto block_found;
		}

		if ((new_block_low > i->block_low) &&
		    (new_block_high < i->block_high)) {
			/*
			 * Aligns somewhere in the middle: the node is split,
			 * which needs a second node for the upper remainder.
			 */
			curr_node = pmfs_alloc_blocknode(sb);
			PMFS_ASSERT(curr_node);
			if (curr_node == NULL) {
				/* returning without freeing the block */
				goto block_found;
			}
			curr_node->block_low = new_block_high + 1;
			curr_node->block_high = i->block_high;
			i->block_high = new_block_low - 1;
			list_add(&curr_node->link, &i->link);
			sbi->num_free_blocks += num_blocks;
			if (start_hint)
				*start_hint = curr_node;
			goto block_found;
		}
	}

	/* blocknr is unsigned long, so it is printed with %lu. */
	pmfs_error_mng(sb, "Unable to free block %lu\n", blocknr);

block_found:
	if (free_blocknode)
		__pmfs_free_blocknode(free_blocknode);
}
总结:上面主要说了一下PMFS文件系统支持多粒度分配,以及PMFS是怎么管理这些块的,以及又是如何分配和释放的。