PMFS内存文件系统的空间管理

文章目录

  • PMFS多粒度块
  • 物理块管理方式
  • 块分配方式
  • 块回收方式

PMFS多粒度块

PMFS支持多粒度的文件组织方式,包括1G, 2M, 4K 三种粒度。
文件默认为4K大小的页,通过调用 pmfs_fallocate()可以设置文件页大小。

pmfs_fallocate:

/*
 * pmfs_fallocate - preallocate blocks backing a byte range of a file.
 * @file:   open file whose inode receives the blocks
 * @mode:   fallocate mode bits; only FALLOC_FL_KEEP_SIZE is accepted
 * @offset: start of the range, in bytes
 * @len:    length of the range, in bytes
 *
 * Runs under inode->i_mutex. Inside one PMFS transaction it sets the inode's
 * block-size hint from the end offset of the range, allocates the blocks that
 * cover the range, and refreshes mtime/ctime. Unless FALLOC_FL_KEEP_SIZE is
 * set, i_size grows to offset + len when that is larger than the current
 * size. The transaction is committed even when block allocation fails; in
 * that case (or with FALLOC_FL_KEEP_SIZE) PMFS_EOFBLOCKS_FL is set on the
 * PMFS inode.
 *
 * Returns 0 on success, -EOPNOTSUPP for any other mode bit, -ENODEV for a
 * directory, -EACCES when the PMFS inode cannot be looked up, or the error
 * reported by inode_newsize_ok(), pmfs_new_transaction() or
 * pmfs_alloc_blocks().
 */
static long pmfs_fallocate(struct file *file, int mode, loff_t offset,
			    loff_t len)
{
	struct inode *inode = file->f_path.dentry->d_inode;
	struct super_block *sb = inode->i_sb;
	const bool keep_size = (mode & FALLOC_FL_KEEP_SIZE) != 0;
	unsigned long first_block, head_off;
	int nr_blocks, bs_mask;
	struct pmfs_inode *pi;
	pmfs_transaction_t *trans;
	loff_t end;
	long err = 0;

	/* FALLOC_FL_KEEP_SIZE is the only mode bit handled here. */
	if (mode & ~FALLOC_FL_KEEP_SIZE)
		return -EOPNOTSUPP;

	if (S_ISDIR(inode->i_mode))
		return -ENODEV;

	mutex_lock(&inode->i_mutex);

	end = len + offset;
	/* Validate the new size only when the file is actually going to grow. */
	if (!keep_size && end > inode->i_size) {
		err = inode_newsize_ok(inode, end);
		if (err)
			goto out;
	}

	pi = pmfs_get_inode(sb, inode->i_ino);
	if (!pi) {
		err = -EACCES;
		goto out;
	}

	trans = pmfs_new_transaction(sb,
			MAX_INODE_LENTRIES + MAX_METABLOCK_LENTRIES);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out;
	}
	pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);

	/* Pick the file's block size (block type) from the end offset. */
	pmfs_set_blocksize_hint(sb, pi, end);

	/*
	 * Convert the byte range into a starting block index and a block
	 * count, rounding the tail up to a whole block.
	 */
	bs_mask = sb->s_blocksize - 1;
	first_block = offset >> sb->s_blocksize_bits;
	head_off = offset & bs_mask;
	nr_blocks = (head_off + len + bs_mask) >> sb->s_blocksize_bits;
	err = pmfs_alloc_blocks(trans, inode, first_block, nr_blocks, true);

	inode->i_ctime = CURRENT_TIME_SEC;
	inode->i_mtime = inode->i_ctime;

	/* The PMFS inode is only written between memunlock and memlock. */
	pmfs_memunlock_inode(sb, pi);
	if (err || keep_size)
		pi->i_flags |= cpu_to_le32(PMFS_EOFBLOCKS_FL);

	if (!keep_size && end > inode->i_size) {
		inode->i_size = end;
		pi->i_size = cpu_to_le64(inode->i_size);
	}
	pi->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
	pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
	pmfs_memlock_inode(sb, pi);

	pmfs_commit_transaction(sb, trans);

out:
	mutex_unlock(&inode->i_mutex);
	return err;
}

pmfs_set_blocksize_hint:

/*
 * pmfs_set_blocksize_hint - record a block type on an inode based on a size.
 * @sb:       super block the inode belongs to
 * @pi:       PMFS inode whose i_blk_type is updated
 * @new_size: size in bytes used to choose the block type
 *
 * Does nothing when pmfs_can_set_blocksize_hint() refuses. Otherwise the
 * type is PMFS_BLOCK_TYPE_1G for sizes of at least 0x40000000 bytes,
 * PMFS_BLOCK_TYPE_2M for sizes of at least 0x200000 bytes, and
 * PMFS_BLOCK_TYPE_4K for anything smaller.
 *
 * Always returns 0.
 */
int pmfs_set_blocksize_hint(struct super_block *sb, struct pmfs_inode *pi,
		loff_t new_size)
{
	unsigned short hint;

	if (!pmfs_can_set_blocksize_hint(pi, new_size))
		return 0;

	if (new_size >= 0x40000000)		/* 1G and above */
		hint = PMFS_BLOCK_TYPE_1G;
	else if (new_size >= 0x200000)		/* 2M and above */
		hint = PMFS_BLOCK_TYPE_2M;
	else					/* everything else: 4K */
		hint = PMFS_BLOCK_TYPE_4K;

	pmfs_dbg_verbose(
		"Hint: new_size 0x%llx, i_size 0x%llx, root 0x%llx\n",
		new_size, pi->i_size, le64_to_cpu(pi->root));
	pmfs_dbg_verbose("Setting the hint to 0x%x\n", hint);

	/* The inode is only written between memunlock and memlock. */
	pmfs_memunlock_inode(sb, pi);
	pi->i_blk_type = hint;
	pmfs_memlock_inode(sb, pi);

	return 0;
}

物理块管理方式

内存的空间管理,默认块大小为4KB。例如128G大小的空间,共有 128 * 1024 * 1024 / 4 个4KB的块,文件用多少块就分配多少个块。那么文件系统是怎么管理这些块的分配与回收的呢?又如何知道哪些块正在使用、哪些块没有被使用呢?

  1. 通过一个双向循环链表把已经使用的块给组织起来,链表头block_inuse_head。
struct pmfs_sb_info {
       ...................
struct list_head block_inuse_head;
       ...................
};

/*
 * Link node embedded in every structure that lives on a linked list.
 * Each node points at both of its neighbours, so the list can be walked
 * in either direction.
 */
struct list_head {
	struct list_head *next;	/* following node */
	struct list_head *prev;	/* preceding node */
};

简单的说,分配的过程就是从这个链表上按顺序去找链表上没有的块,找到就给插入上去,完成分配物理块的操作。
回收的过程就是,去该链表上去找到对应的块,从链表上去掉,完成回收物理块的操作。
例如128G的内存,就有 128 * 1024 * 1024 / 4 个块。如果这些块都是通过链表一个一个链接起来的,那么每次分配、回收时遍历链表所花的时间将会非常大。那么这个问题又是怎么解决的呢?

  2. 通过一个结构体pmfs_blocknode,存储一段连续块的最小块号和最大块号,以及它在链表中的位置。
/*
 * One node of the in-use block list: describes a contiguous run of block
 * numbers, block_low..block_high inclusive, that is currently allocated.
 */
struct pmfs_blocknode {
	struct list_head link;		/* linkage on the in-use block list */
	unsigned long block_low;	/* first block number of the run */
	unsigned long block_high;	/* last block number of the run (inclusive) */
};

比如现在分配了1024个块,块号分别是0 ——511,513——1024,中间块号为512的块没有分配。
如果没有pmfs_blocknode来组织,那么链表block_inuse_head上就会有1024个结点,下次再分配或者释放的时候就要遍历很多个结点。
现在有了pmfs_blocknode,链表block_inuse_head上就只需要两个pmfs_blocknode结构体结点就可以了。
pmfs_blocknode 1(0,511),pmfs_blocknode 2(513,1024)。

这样就大大减少了链表结点的数量,减小了遍历链表的时间。

  3. 大粒度块与小粒度块的关系。
    在PMFS中,以下大粒度块都以2M为例,一个2M的块由512个连续的4KB块组成。文件在申请或者释放块时,都是以申请、回收4KB块为基础的。申请2M block时会传入文件的block_type,根据block_type可以算出要申请的4KB块数量,然后在block_inuse_head链表中找到合适的位置插入,并返回这512个4KB块中第一个块的块号。回收2M block时,根据传入的block_type以及该huge page第一个page的blocknr,可以确定要回收的块号范围,所以回收一个2M block就相当于回收512个连续的4KB块。

下面结合代码具体讲块的分配与回收。

块分配方式

因为这里维护的是已使用块(in-use block)的链表,所以分配块就是按顺序在链表中找空隙,找到合适的空隙后返回对应的blocknr,并将新分配的块号范围记录到blocknode中。

/*
 * pmfs_new_block - reserve one block of type btype in the in-use block list.
 * @sb:      super block; its pmfs_sb_info holds the list, counters and lock
 * @blocknr: out parameter, set to the first block number of the reserved
 *           range (written only on success)
 * @btype:   block type; pmfs_get_numblocks() maps it to the number of block
 *           numbers to reserve
 * @zero:    when non-zero, the newly reserved block is zero-filled
 *
 * Walks sbi->block_inuse_head under sbi->s_lock. For every node it computes
 * the first range of num_blocks block numbers, starting at a multiple of
 * num_blocks above the node's block_high, and takes that range if it ends
 * before the next node's block_low (or before sbi->block_end for the last
 * node). The range is then recorded by growing a neighbouring node, merging
 * two nodes, or inserting a new node. The rounding assumes num_blocks is a
 * power of two.
 *
 * Returns 0 on success, or -ENOSPC when no gap fits (including an empty
 * list) or a needed blocknode could not be allocated.
 */
int pmfs_new_block(struct super_block *sb, unsigned long *blocknr,
	unsigned short btype, int zero)
{
	struct pmfs_sb_info *sbi = PMFS_SB(sb);
	struct list_head *head = &(sbi->block_inuse_head);
	struct pmfs_blocknode *i, *next_i;
	struct pmfs_blocknode *free_blocknode= NULL;
	void *bp;
	unsigned long num_blocks = 0;
	struct pmfs_blocknode *curr_node;
	int errval = 0;
	bool found = 0;
	unsigned long next_block_low;
	unsigned long new_block_low;
	unsigned long new_block_high;

	num_blocks = pmfs_get_numblocks(btype);// number of block numbers to reserve for this block type
	// (helper not shown here; the surrounding text says 512 for a 2M block of 4KB units)

	mutex_lock(&sbi->s_lock);
	// list_for_each_entry walks every node after head, with i pointing at the current node.
	list_for_each_entry(i, head, link) {
		if (i->link.next == head) {// i is the last node: the gap after it runs up to block_end
			next_i = NULL;
			next_block_low = sbi->block_end;
		} else {
			next_i = list_entry(i->link.next, typeof(*i), link);
			next_block_low = next_i->block_low;
		}
		// Candidate range: first multiple of num_blocks above i->block_high,
		// spanning num_blocks numbers (low == high when num_blocks is 1).
		new_block_low = (i->block_high + num_blocks) & ~(num_blocks - 1);
		new_block_high = new_block_low + num_blocks - 1;
		// The cases below compare the candidate with the edges of the gap after i.
		if (new_block_high >= next_block_low) {
			/* Does not fit - skip to next blocknode */
			continue; // the candidate would run into the next node (or past block_end)
		}

		if ((new_block_low == (i->block_high + 1)) &&
			(new_block_high == (next_block_low - 1)))// candidate fills the gap exactly
		{
			/* Fill the gap completely */
			if (next_i) {
				i->block_high = next_i->block_high;
				list_del(&next_i->link);
				free_blocknode = next_i; // i now covers both runs; next_i is released after unlocking
				sbi->num_blocknode_allocated--;
			} else {
				i->block_high = new_block_high;
			}
			found = 1;
			break;
		}

		if ((new_block_low == (i->block_high + 1)) && 
			(new_block_high < (next_block_low - 1))) {// free space remains on the right
			/* Aligns to left */
			i->block_high = new_block_high;
			found = 1;
			break;
		}

		if ((new_block_low > (i->block_high + 1)) &&
			(new_block_high == (next_block_low - 1))) {// free space remains on the left
			/* Aligns to right */
			if (next_i) {
				/* right node exist */
				next_i->block_low = new_block_low;
			} else {
				/* right node does NOT exist */
				curr_node = pmfs_alloc_blocknode(sb);
				PMFS_ASSERT(curr_node);
				if (curr_node == NULL) {
					errval = -ENOSPC;
					break;
				}
				curr_node->block_low = new_block_low;
				curr_node->block_high = new_block_high;
				/* insert the new node right after i */
				list_add(&curr_node->link, &i->link);
			}
			found = 1;
			break;
		}

		if ((new_block_low > (i->block_high + 1)) &&
			(new_block_high < (next_block_low - 1))) {// free space remains on both sides
			/* Aligns somewhere in the middle */
			curr_node = pmfs_alloc_blocknode(sb);
			PMFS_ASSERT(curr_node);
			if (curr_node == NULL) {
				errval = -ENOSPC;
				break;
			}
			curr_node->block_low = new_block_low;
			curr_node->block_high = new_block_high;
			/* insert the new node right after i */
			list_add(&curr_node->link, &i->link);
			found = 1;
			break;
		}
	}
	
	if (found == 1) {
		sbi->num_free_blocks -= num_blocks;
	}	

	mutex_unlock(&sbi->s_lock);

	/* The node removed by a merge is released only after s_lock is dropped. */
	if (free_blocknode)
		__pmfs_free_blocknode(free_blocknode);

	/* Also covers the blocknode-allocation failures above, which leave found at 0. */
	if (found == 0) {
		return -ENOSPC;
	}

	if (zero) { // caller asked for a zero-filled block
		size_t size;
		bp = pmfs_get_block(sb, pmfs_get_block_off(sb, new_block_low, btype));
		pmfs_memunlock_block(sb, bp); //TBDTBD: Need to fix this
		/* Byte size to clear: 4K, 2M, or 1G for any other block type. */
		if (btype == PMFS_BLOCK_TYPE_4K)
			size = 0x1 << 12;
		else if (btype == PMFS_BLOCK_TYPE_2M)
			size = 0x1 << 21;
		else
			size = 0x1 << 30;
		memset_nt(bp, 0, size);
		pmfs_memlock_block(sb, bp);
	}
	*blocknr = new_block_low;

	return errval;
}

块回收方式

块回收就是从block_inuse_head链表上将对应的块号给去掉。

/*
 * __pmfs_free_block - remove one block's range from the in-use block list.
 * @sb:         super block; its pmfs_sb_info holds the list and counters
 * @blocknr:    first block number of the block being freed
 * @btype:      block type; pmfs_get_numblocks() maps it to the number of
 *              block numbers covered
 * @start_hint: optional in/out pointer to a node at which to start the scan;
 *              on a successful free it is updated to a node near the freed
 *              range
 *
 * Finds the node whose run contains blocknr .. blocknr + num_blocks - 1 and
 * removes the node, trims it, or splits it in two. sbi->num_free_blocks grows
 * by num_blocks on success. If no node contains the whole range, an error is
 * logged through pmfs_error_mng(). If a split needs a new blocknode and that
 * allocation fails, the function returns without freeing and without logging.
 *
 * NOTE(review): sbi->s_lock is not taken here; presumably the caller holds
 * it - confirm against the callers.
 */
void __pmfs_free_block(struct super_block *sb, unsigned long blocknr,
		      unsigned short btype, struct pmfs_blocknode **start_hint)
{
	struct pmfs_sb_info *sbi = PMFS_SB(sb);
	struct list_head *head = &(sbi->block_inuse_head);
	unsigned long new_block_low;
	unsigned long new_block_high;
	unsigned long num_blocks = 0;
	struct pmfs_blocknode *i;
	struct pmfs_blocknode *free_blocknode= NULL;
	struct pmfs_blocknode *curr_node;

	num_blocks = pmfs_get_numblocks(btype);// number of block numbers covered by this block type
	new_block_low = blocknr;
	new_block_high = blocknr + num_blocks - 1;

	BUG_ON(list_empty(head));

	if (start_hint && *start_hint &&// start at the hint node when its run begins at or before the freed block
	    new_block_low >= (*start_hint)->block_low)
		i = *start_hint;
	else
		i = list_first_entry(head, typeof(*i), link);

	list_for_each_entry_from(i, head, link) {// continue from i (the hint node or the first node)
	// skip nodes until one ends at or after new_block_low
		if (new_block_low > i->block_high) {
			/* skip to next blocknode */
			continue;
		}

		if ((new_block_low == i->block_low) &&
			(new_block_high == i->block_high)) { // the node's run is exactly the range being freed
			/* fits entire datablock */
			if (start_hint)
				*start_hint = pmfs_next_blocknode(i, head);
			list_del(&i->link);
			free_blocknode = i;
			sbi->num_blocknode_allocated--;
			sbi->num_free_blocks += num_blocks;
			goto block_found;
		}
		if ((new_block_low == i->block_low) &&
			(new_block_high < i->block_high)) {
			/* Aligns left */
			i->block_low = new_block_high + 1;
			sbi->num_free_blocks += num_blocks;
			if (start_hint)
				*start_hint = i;
			goto block_found;
		}
		if ((new_block_low > i->block_low) && 
			(new_block_high == i->block_high)) {
			/* Aligns right */
			i->block_high = new_block_low - 1;
			sbi->num_free_blocks += num_blocks;
			if (start_hint)
				*start_hint = pmfs_next_blocknode(i, head);
			goto block_found;
		}
		if ((new_block_low > i->block_low) &&
			(new_block_high < i->block_high)) {// the freed range lies strictly inside the node's run
			// the node is split in two, so a new blocknode is needed for the upper part
			/* Aligns somewhere in the middle */
			curr_node = pmfs_alloc_blocknode(sb);
			PMFS_ASSERT(curr_node);
			if (curr_node == NULL) {
				/* returning without freeing the block*/
				goto block_found;
			}
			curr_node->block_low = new_block_high + 1;
			curr_node->block_high = i->block_high;
			i->block_high = new_block_low - 1;
			/* insert the upper part right after i */
			list_add(&curr_node->link, &i->link);
			sbi->num_free_blocks += num_blocks;
			if (start_hint)
				*start_hint = curr_node;
			goto block_found;
		}
	}

	/* Reached only when no node contained the whole range. */
	pmfs_error_mng(sb, "Unable to free block %ld\n", blocknr);

block_found:

	/* Release the node that was unlinked in the exact-match case, if any. */
	if (free_blocknode)
		__pmfs_free_blocknode(free_blocknode);
}

总结:上面主要说了一下PMFS文件系统支持多粒度分配,以及PMFS是怎么管理这些块的,以及又是如何分配和释放的。

你可能感兴趣的:(PMFS内存文件系统的空间管理)