bootmem allocator

在系统启动阶段,buddy系统和slab分配器建立之前,系统的每个节点都拥有自己的bootmem allocator来实现内存的分配,当启动阶段结束后,bootmem allocator将被销毁,而相应的空闲内存会提交给buddy系统来管理,因此bootmem allocator所存在的时间是短暂的,它的宗旨是简单,而非高效!bootmem allocator的基本思想是在一个节点中建立一片位图区域,每一位对应该节点的低端内存的一个页框,通过一个bit来标记一个页的状态,实现页面的分配与回收。

首先了解一下bootmem的核心数据结构

/*
 * Per-node state of the boot-time allocator. Each bit of the bitmap tracks
 * one page frame of the node: a set bit means reserved, a clear bit means free.
 */
typedef struct bootmem_data {
	unsigned long node_min_pfn;	/* first page frame number covered by the bitmap */
	unsigned long node_low_pfn;	/* end page frame number of the covered range (used as an exclusive bound) */
	void *node_bootmem_map;		/* virtual address of the bitmap */
	unsigned long last_end_off;	/* byte offset, relative to the node start, where the last allocation ended */
	unsigned long hint_idx;		/* bitmap index from which the next allocation search may start */
	struct list_head list;		/* NOTE(review): presumably links this node into the global bdata list - confirm */
} bootmem_data_t;
  • node_min_pfn:节点的最小页框编号
  • node_low_pfn:节点的低端内存最大页框编号
  • node_bootmem_map:节点的位图起始地址
  • last_end_off:上次分配结束位置的字节偏移(相对于节点起始处);如果它不是页对齐的,说明上次分配所用的最后一个页面还有剩余空间。这个变量在内存分配的时候用到,用于减少碎片
  • hint_idx:用于内存分配时确定分配的起始地址
  • list:用于将该节点的bootmem链入所有节点的bootmem链表

下面结合具体的代码就以下几个主要的方面介绍bootmem allocator的工作过程

1.bootmem allocator的初始化

2.bootmem allocator保留内存和释放内存

3.bootmem allocator分配内存

4.bootmem allocator的销毁

1.bootmem allocator的初始化

在setup_arch()中,通过initmem_init()-->setup_bootmem_allocator()-->setup_node_bootmem()-->init_bootmem_node()来建立节点中的bootmem allocator。还有一个初始化函数是init_bootmem(),它和init_bootmem_node()一样,都是对init_bootmem_core()的封装,区别是init_bootmem()只针对单节点系统(固定使用节点0),而init_bootmem_node()由调用者指定节点。在后面的其他操作中也都用到了类似的封装方法。

/*
 * init_bootmem_node - set up the bootmem allocator of one specific node
 * @pgdat:    node whose bdata gets initialised
 * @freepfn:  page frame number where the bitmap is placed
 * @startpfn: first page frame number the bitmap covers
 * @endpfn:   end page frame number of the covered range
 *
 * Thin wrapper around init_bootmem_core(); its return value is passed
 * through unchanged.
 */
unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
				unsigned long startpfn, unsigned long endpfn)
{
	unsigned long map_bytes;

	map_bytes = init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn);

	return map_bytes;
}

/*
 * init_bootmem - set up the bootmem allocator for node 0
 * @start: page frame number where the bitmap is placed
 * @pages: end page frame number of the range the bitmap covers
 *
 * Records @pages and @start in the global max_low_pfn / min_low_pfn and
 * then initialises node 0 with a covered range of [0, @pages).
 * Returns the value produced by init_bootmem_core().
 */
unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
{
	unsigned long map_bytes;

	max_low_pfn = pages;
	min_low_pfn = start;

	map_bytes = init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);

	return map_bytes;
}

下面来看看bootmem初始化的核心函数init_bootmem_core()

/*
 * init_bootmem_core - initialise one node's bootmem state and bitmap
 * @bdata:    node allocator state to fill in
 * @mapstart: page frame number where the bitmap is stored
 * @start:    first page frame number covered by the bitmap
 * @end:      end page frame number of the covered range
 *
 * Returns the size of the bitmap in bytes.
 */
static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
	unsigned long mapstart, unsigned long start, unsigned long end)
{
	unsigned long bitmap_bytes;

	/* May adjust start/end before they are recorded. */
	mminit_validate_memmodel_limits(&start, &end);

	/* Record the bitmap's virtual address and the covered pfn range. */
	bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
	bdata->node_min_pfn = start;
	bdata->node_low_pfn = end;

	/* Register this node's bdata; done after the fields above are set. */
	link_bootmem(bdata);

	/*
	 * Every page starts out reserved (all bits set); free RAM has to be
	 * registered explicitly afterwards.
	 */
	bitmap_bytes = bootmap_bytes(end - start);
	memset(bdata->node_bootmem_map, 0xff, bitmap_bytes);

	bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n",
		bdata - bootmem_node_data, start, mapstart, end, bitmap_bytes);

	return bitmap_bytes;
}

我们可以看到在init_bootmem_core()中,主要的工作就是初始化bdata中的变量,以及将位图全部置1,这些参数的确定是在前面列举的函数中完成的。

2.bootmem allocator保留内存和释放内存

保留内存和释放内存是两个相对的概念,bootmem allocator分配出去的内存会被标记为保留状态,也就是对应的位图区域都为1,这些内存在bootmem allocator销毁后是不会被buddy系统接管的;而释放内存很好理解,就是将相应的页面置于空闲状态,这些页面可以被bootmem allocator分配,空闲的页面在bootmem allocator销毁后会被buddy系统接管。

先来看看保留内存的处理,调用reserve_bootmem_node()函数可以将指定节点中的指定范围页面置为保留状态

/*
 * reserve_bootmem_node - mark a physical range of one node as reserved
 * @pgdat:    node the range belongs to
 * @physaddr: physical start address of the range
 * @size:     length of the range in bytes
 * @flags:    passed through to the reserve step (e.g. BOOTMEM_EXCLUSIVE)
 *
 * The range is widened to whole pages: the start is rounded down and the
 * end is rounded up. Returns the result of mark_bootmem_node().
 */
int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
				 unsigned long size, int flags)
{
	unsigned long first_pfn = PFN_DOWN(physaddr);
	unsigned long end_pfn = PFN_UP(physaddr + size);

	return mark_bootmem_node(pgdat->bdata, first_pfn, end_pfn, 1, flags);
}

下面来看核心函数mark_bootmem_node()

/*
 * mark_bootmem_node - reserve or free a pfn range inside one node
 * @bdata:   node allocator state
 * @start:   first page frame number of the range
 * @end:     end page frame number of the range
 * @reserve: non-zero to reserve the range, zero to free it
 * @flags:   forwarded to __reserve(); unused when freeing
 *
 * The range must lie inside [node_min_pfn, node_low_pfn]; anything else
 * trips a BUG_ON. Returns 0 when freeing, otherwise __reserve()'s result.
 */
static int __init mark_bootmem_node(bootmem_data_t *bdata,
				unsigned long start, unsigned long end,
				int reserve, int flags)
{
	unsigned long first_idx, end_idx;

	bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n",
		bdata - bootmem_node_data, start, end, reserve, flags);

	/* The range must not leave the node. */
	BUG_ON(start < bdata->node_min_pfn);
	BUG_ON(end > bdata->node_low_pfn);

	/* Translate pfns into bitmap indices relative to the node start. */
	first_idx = start - bdata->node_min_pfn;
	end_idx = end - bdata->node_min_pfn;

	if (!reserve) {
		__free(bdata, first_idx, end_idx);
		return 0;
	}

	return __reserve(bdata, first_idx, end_idx, flags);
}

再看__reserve()

/*
 * __reserve - set the bitmap bits for indices [sidx, eidx)
 * @bdata: node allocator state
 * @sidx:  first bitmap index to reserve
 * @eidx:  end bitmap index (not reserved itself)
 * @flags: with BOOTMEM_EXCLUSIVE, an already-reserved page is an error
 *
 * Returns 0 on success. In exclusive mode, hitting an already-set bit
 * frees the bits set so far by this call and returns -EBUSY; otherwise
 * the double reservation is only logged.
 */
static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
			unsigned long eidx, int flags)
{
	unsigned long bit;

	bdebug("nid=%td start=%lx end=%lx flags=%x\n",
		bdata - bootmem_node_data,
		sidx + bdata->node_min_pfn,
		eidx + bdata->node_min_pfn,
		flags);

	for (bit = sidx; bit < eidx; bit++) {
		/* Bit was clear and is now set: nothing more to do. */
		if (!test_and_set_bit(bit, bdata->node_bootmem_map))
			continue;

		if (flags & BOOTMEM_EXCLUSIVE) {
			/* Roll back the bits this call has set so far. */
			__free(bdata, sidx, bit);
			return -EBUSY;
		}

		bdebug("silent double reserve of PFN %lx\n",
			bit + bdata->node_min_pfn);
	}

	return 0;
}

可以看到,保留页面的关键操作就是调用test_and_set_bit()将位图的相关区域置1.

释放内存和保留内存的过程基本相同,只不过传递给mark_bootmem_node()的reserve参数为0,表示释放相应页面,因此在mark_bootmem_node()中会调用__free()

/*
 * __free - clear the bitmap bits for indices [sidx, eidx)
 * @bdata: node allocator state
 * @sidx:  first bitmap index to free
 * @eidx:  end bitmap index (not freed itself)
 *
 * Every bit in the range must currently be set; freeing a page that is
 * already free triggers BUG().
 */
static void __init __free(bootmem_data_t *bdata,
			unsigned long sidx, unsigned long eidx)
{
	unsigned long bit = sidx;

	bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data,
		sidx + bdata->node_min_pfn,
		eidx + bdata->node_min_pfn);

	/* Pull the search hint back so the freed pages can be found again. */
	if (sidx < bdata->hint_idx)
		bdata->hint_idx = sidx;

	while (bit < eidx) {
		if (!test_and_clear_bit(bit, bdata->node_bootmem_map))
			BUG();
		bit++;
	}
}

__free()相较__reserve()多了一处对bdata->hint_idx的操作,这个地方是为了保证hint_idx不高于最低的空闲页,因为在进行分配的时候,bootmem allocator会优先从hint_idx处(即尽量低的空闲页)开始查找。

3.bootmem allocator分配内存

bootmem allocator分配内存相对于前面的操作来说要复杂一些,这里面主要考虑的一个问题就是内存碎片。假设我们的页面大小为4KB,假如我们上一次分配内存的范围是从第4个页面开始到第8个页面的2KB处,而这次要求分配的起始地址处于第9个页面,如果从第9个页面开始分配的话,那么至少会产生2KB的内存碎片,这样无疑会产生大量的浪费。这也是为什么我们之前介绍的bootmem关键数据结构中引入last_end_off这个变量,它记录了上次分配的结束位置(相对于节点起始处的字节偏移),由此可以知道上次分配的最后一个页面还剩多少空间,在我们这个例子中剩余2KB,那么如果这次我们从第9个页面开始分配,我们就要考虑将这2KB整合到这次分配中去。

分配内存的核心函数是alloc_bootmem_core(),具体代码如下:

/*
 * alloc_bootmem_core - try to allocate @size bytes from one node's bitmap
 * @bdata: node allocator state
 * @size:  number of bytes requested; must be non-zero
 * @align: required alignment in bytes; must pass the power-of-two check
 * @goal:  preferred lowest physical address, 0 for no preference
 * @limit: upper physical address bound, 0 for no bound
 *
 * Searches the bitmap for a run of free pages, preferring to continue from
 * hint_idx and to reuse the unused tail of the page where the previous
 * allocation ended. On success the pages are reserved, the region is
 * zeroed and its virtual address is returned. Returns NULL if the node has
 * no bitmap or no suitable run was found.
 */
static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
					unsigned long size, unsigned long align,
					unsigned long goal, unsigned long limit)
{
	unsigned long fallback = 0;
	unsigned long min, max, start, sidx, midx, step;

	bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
		bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
		align, goal, limit);

	BUG_ON(!size);                        /* a zero-sized request is a caller bug */
	BUG_ON(align & (align - 1));          /* align must have at most one bit set */
	BUG_ON(limit && goal + size > limit); /* with a limit, goal + size must fit below it */

	if (!bdata->node_bootmem_map)
		return NULL;

	/* pfn range covered by this node's bitmap */
	min = bdata->node_min_pfn;
	max = bdata->node_low_pfn;

	/* convert goal and limit from addresses to page frame numbers */
	goal >>= PAGE_SHIFT;
	limit >>= PAGE_SHIFT;

	if (limit && max > limit)
		max = limit;
	if (max <= min)
		return NULL;

	/* search stride in pages: the alignment in pages, but at least one */
	step = max(align >> PAGE_SHIFT, 1UL);

	/* starting pfn: the goal if it lies strictly inside the node, else the node start */
	if (goal && min < goal && goal < max)
		start = ALIGN(goal, step);
	else
		start = ALIGN(min, step);

	/* bitmap indices of the starting pfn and of the upper bound */
	sidx = start - bdata->node_min_pfn;
	midx = max - bdata->node_min_pfn;

	if (bdata->hint_idx > sidx) { /* the hint lies above sidx: begin at the aligned hint instead */
		/*
		 * Handle the valid case of sidx being zero and still
		 * catch the fallback below.
		 */
		fallback = sidx + 1;
		sidx = align_idx(bdata, bdata->hint_idx, step);
	}

	while (1) {
		int merge;
		void *region;
		unsigned long eidx, i, start_off, end_off;
find_block:
		sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx); /* next free page at or after sidx */
		sidx = align_idx(bdata, sidx, step);                            /* align it to the stride */
		eidx = sidx + PFN_UP(size);

		if (sidx >= midx || eidx > midx)
			break;

		for (i = sidx; i < eidx; i++)
			if (test_bit(i, bdata->node_bootmem_map)) { /* a reserved page interrupts the run */
				sidx = align_idx(bdata, i, step);  /* restart at or after the blocking page */
				if (sidx == i)
					sidx += step;
				goto find_block;                   /* search the bitmap again */
			}

		/*
		 * If the previous allocation ended in the middle of a page
		 * (last_end_off is not page aligned) and that page is the one
		 * directly before sidx, start in its unused tail.
		 */
		if (bdata->last_end_off & (PAGE_SIZE - 1) &&
				PFN_DOWN(bdata->last_end_off) + 1 == sidx)
			start_off = align_off(bdata, bdata->last_end_off, align);/* aligned offset after the previous allocation's end */
		else
			start_off = PFN_PHYS(sidx);/* otherwise start at the beginning of page sidx */

		merge = PFN_DOWN(start_off) < sidx; /* 1 if the region starts in the page before sidx, else 0 */
		end_off = start_off + size;

		/* remember where this allocation ends and where to search next */
		bdata->last_end_off = end_off;
		bdata->hint_idx = PFN_UP(end_off);

		/*
		 * Reserve the area now:
		 */
		if (__reserve(bdata, PFN_DOWN(start_off) + merge, /* when merging, skip the first page: it is already reserved */
				PFN_UP(end_off), BOOTMEM_EXCLUSIVE))
			BUG();

		region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + /* start_off is relative to the node start */
				start_off);
		memset(region, 0, size);/* hand out zeroed memory */
		/*
		 * The min_count is set to 0 so that bootmem allocated blocks
		 * are never reported as leaks.
		 */
		kmemleak_alloc(region, size, 0, 0);
		return region;
	}

	/* The search from the hint failed: retry once from the original sidx. */
	if (fallback) {
		sidx = align_idx(bdata, fallback - 1, step);
		fallback = 0;
		goto find_block;
	}

	return NULL;
}



4.bootmem allocator的销毁

bootmem allocator销毁后,其空闲的内存将交由buddy system接管,核心函数为free_all_bootmem_core()

/*
 * free_all_bootmem_core - release a node's free pages and its bitmap
 * @bdata: node allocator state
 *
 * Walks the bitmap one long word at a time and passes every page whose bit
 * is clear to __free_pages_bootmem(), then does the same for the pages that
 * hold the bitmap itself. Returns the total number of pages released, or 0
 * if the node has no bitmap.
 */
static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
	int aligned;
	struct page *page;
	unsigned long start, end, pages, count = 0;

	if (!bdata->node_bootmem_map)/* no bitmap: nothing to release for this node */
		return 0;

	/* pfn range covered by the bitmap */
	start = bdata->node_min_pfn;
	end = bdata->node_low_pfn;

	/*
	 * If the start is aligned to the machines wordsize, we might
	 * be able to free pages in bulks of that order.
	 */
	aligned = !(start & (BITS_PER_LONG - 1));/* true when start is a multiple of BITS_PER_LONG */

	bdebug("nid=%td start=%lx end=%lx aligned=%d\n",
		bdata - bootmem_node_data, start, end, aligned);

	/*
	 * Step 1: release the free pages.
	 */
	while (start < end) {
		unsigned long *map, idx, vec;

		map = bdata->node_bootmem_map;
		idx = start - bdata->node_min_pfn;
		vec = ~map[idx / BITS_PER_LONG];/* inverted bitmap word: a set bit now means a free page */

		/*
		 * Bulk path, taken only if
		 *  1. start is a multiple of BITS_PER_LONG,
		 *  2. every page in this word is free, and
		 *  3. start + BITS_PER_LONG is still below end.
		 */
		if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
			int order = ilog2(BITS_PER_LONG);/* log2 of the number of pages per word */

			__free_pages_bootmem(pfn_to_page(start), order);/* release the whole word's pages in one call */
			count += BITS_PER_LONG;
		} else {/* otherwise release the free pages one by one */
			unsigned long off = 0;

			while (vec && off < BITS_PER_LONG) {/* stop early once no free bits remain in the word */
				if (vec & 1) { /* lowest bit set: page start + off is free */
					page = pfn_to_page(start + off);
					__free_pages_bootmem(page, 0);
					count++;
				}
				vec >>= 1;
				off++;
			}
		}
		start += BITS_PER_LONG;
	}

	/*
	 * Step 2: release the pages that hold the bitmap.
	 */
	page = virt_to_page(bdata->node_bootmem_map);/* page containing the start of the bitmap */
	pages = bdata->node_low_pfn - bdata->node_min_pfn;
	pages = bootmem_bootmap_pages(pages);/* presumably the bitmap size in pages - confirm against bootmem_bootmap_pages() */
	count += pages;
	while (pages--)/* release them page by page */
		__free_pages_bootmem(page++, 0);

	bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);

	return count;/* number of pages released */
}




你可能感兴趣的:(Boot)