linux启动内存分配器

linux启动内存分配器是在伙伴系统、slab机制实现之前,为满足内核中内存的分配而建立的。本身的机制比较简单,使用位图来进行标志分配和释放。

一、数据结构介绍

1,保留区间

因为在建立启动内存分配器的时候,会涉及保留内存。也就是说,之前保留给页表、分配器本身(用于映射的位图)、io等的内存,在分配器建立之后就不能再被它分配出去了。linux中对保留内存空间的部分用下列数据结构表示:

/*
 * Early reserved memory areas.
 */
#define MAX_EARLY_RES 20	/* capacity of the early reservation table */

/*
 * One early reservation: a named address range. Code that converts the
 * table (early_res_to_bootmem) computes a size as end - start, so end is
 * treated as exclusive.
 */
struct early_res {
	u64 start;
	u64 end;
	char name[16];
	/* NOTE(review): presumably flags ranges that may overlap others; not used in the code shown here */
	char overlap_ok;
};

/*
 * Global table of early reservations. The first entry covers
 * [0, PAGE_SIZE); an entry whose end is 0 marks the end of the used part
 * of the table (early_res_to_bootmem stops counting there).
 */
static struct early_res early_res[MAX_EARLY_RES] __initdata = {
	{ .start = 0, .end = PAGE_SIZE, .name = "BIOS data page" },
	{}
};
2,bootmem分配器

/*
 * node_bootmem_map is a map pointer - the bits represent all physical 
 * memory pages (including holes) on the node.
 */
 /* Per-node bookkeeping for the bootmem allocator. */
typedef struct bootmem_data {
	unsigned long node_min_pfn;/* lowest PFN covered by this descriptor; bitmap index 0 corresponds to it (set from 'start' in init_bootmem_core) */
	unsigned long node_low_pfn;/* end PFN covered by this descriptor (set from 'end' in init_bootmem_core); ranges above it trip BUG_ON in mark_bootmem_node */
	void *node_bootmem_map;/* virtual address of the bitmap: a set bit means reserved/allocated, a clear bit means free */
	unsigned long last_end_off;/* byte offset, relative to the start of the node, at which the previous allocation ended */
	unsigned long hint_idx;/* bitmap index where the next search starts: first page after the previous allocation, lowered by __free when earlier pages are released */
	struct list_head list;/* links this descriptor into the global bdata_list */
} bootmem_data_t;
全局链表

/* Global list of all bootmem descriptors; link_bootmem() inserts entries in ascending node_min_pfn order. */
static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);

二、启动分配器的建立

启动分配器的建立主要的流程为:初始化映射位图、将活动内存区对应的位图位清0(表示可用)、处理保留内存区域。其中保留区存放在上面介绍的全局数组中,这里只是将分配器映射位图中对应的位置为1,表示已经分配。

下面我们看内核中具体的初始化流程。

start_kernel()->setup_arch()->initmem_init()

void __init setup_arch(char **cmdline_p)
{
          .......
	/*
	 * Excerpt only: initmem_init() does the preparatory work for the
	 * bootmem allocator and then calls the function that sets it up.
	 */
	initmem_init(0, max_pfn);
          .......
}           
/*
 * Record the low/high memory layout, register the active memory regions
 * and finally install the bootmem allocator.
 *
 * Neither start_pfn nor end_pfn is referenced in the body shown here; the
 * function works from the globals max_pfn and max_low_pfn.
 */
void __init initmem_init(unsigned long start_pfn,
				  unsigned long end_pfn)
{
#ifdef CONFIG_HIGHMEM
	highstart_pfn = highend_pfn = max_pfn;
	if (max_pfn > max_low_pfn)
		highstart_pfn = max_low_pfn;
	/* Register the active memory regions up to highend_pfn
	(presumably recorded in early_node_map, which
	free_bootmem_with_active_regions() reads later) */
	e820_register_active_regions(0, 0, highend_pfn);
	/* Mark the memory of the active regions as present; per the
	article author, the related config option is not set in this build */
	sparse_memory_present_with_active_regions(0);
	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
		pages_to_mb(highend_pfn - highstart_pfn));
	num_physpages = highend_pfn;
	/* Start of high memory as a kernel virtual address: __va() of the
	last low-memory byte, plus one */
	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
	e820_register_active_regions(0, 0, max_low_pfn);
	sparse_memory_present_with_active_regions(0);
	num_physpages = max_low_pfn;
	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
#ifdef CONFIG_FLATMEM
	max_mapnr = num_physpages;
#endif
	__vmalloc_start_set = true;

	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
			pages_to_mb(max_low_pfn));
	/* Install the bootmem allocator, which handles memory allocation
	until the buddy system is up */
	setup_bootmem_allocator();
}

/*
 * Find and reserve room for the bootmem bitmap, then set up the bootmem
 * allocator on every online node (low memory only). Panics if no suitable
 * area for the bitmap can be found.
 */
void __init setup_bootmem_allocator(void)
{
	int nodeid;
	unsigned long bootmap_size, bootmap;
	/*
	 * Initialize the boot-time allocator (with low memory only):
	 */
	 /* Size in bytes of the bitmap area: the page count returned by
	 bootmem_bootmap_pages() converted to bytes. The bitmap needs one
	 bit (not one byte) per page frame, hence pages / 8 bytes before
	 rounding - presumably rounded up to whole pages by the helper. */
	bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
	/* Search the e820 map for a page-aligned free area of that size
	below the mapped limit; yields its base address */
	bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
				 PAGE_SIZE);
	if (bootmap == -1L)
		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
	/* Record the bitmap pages as an early reservation */
	reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");

	printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
		 max_pfn_mapped<<PAGE_SHIFT);
	printk(KERN_INFO "  low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
	/* For every online node */
	for_each_online_node(nodeid) {
		 unsigned long start_pfn, end_pfn;

#ifdef CONFIG_NEED_MULTIPLE_NODES/*not set*/
		start_pfn = node_start_pfn[nodeid];
		end_pfn = node_end_pfn[nodeid];
		if (start_pfn > max_low_pfn)
			continue;
		if (end_pfn > max_low_pfn)
			end_pfn = max_low_pfn;
#else
		start_pfn = 0;
		end_pfn = max_low_pfn;
#endif
		/* Set up the allocator for this node; the return value is the
		address just past this node's bitmap, used as the bitmap base
		for the next node */
		bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
						 bootmap);
	}
	/* The bootmem allocator is now fully set up; flag that by setting
	after_bootmem to 1 */
	after_bootmem = 1;
}
/*
 * Set up the bootmem allocator for one node.
 *
 * @nodeid:    node to set up
 * @start_pfn: first PFN handled for this node
 * @end_pfn:   end PFN handled for this node
 * @bootmap:   physical address at which this node's bitmap is placed
 *
 * Returns the address just past this node's bitmap, i.e. where the next
 * node's bitmap may start.
 */
static unsigned long __init setup_node_bootmem(int nodeid,
				 unsigned long start_pfn,
				 unsigned long end_pfn,
				 unsigned long bootmap)
{
	unsigned long map_bytes;
	unsigned long map_end;

	/* don't touch min_low_pfn */
	/* Initialise the node's bitmap; the call yields its size in bytes */
	map_bytes = init_bootmem_node(NODE_DATA(nodeid),
				      bootmap >> PAGE_SHIFT,
				      start_pfn, end_pfn);
	map_end = bootmap + map_bytes;

	printk(KERN_INFO "  node %d low ram: %08lx - %08lx\n",
		nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
	printk(KERN_INFO "  node %d bootmap %08lx - %08lx\n",
		 nodeid, bootmap, map_end);

	/* Clear the bitmap bits of the active memory regions: allocatable */
	free_bootmem_with_active_regions(nodeid, end_pfn);

	/* Set the bits of every early reservation inside this node again:
	already taken, must not be handed out */
	early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);

	return map_end;
}
对于初始化映射位图,最终调用init_bootmem_core()

/*
 * Called once to set up the allocator itself.
 *
 * @bdata:    descriptor to initialise
 * @mapstart: PFN of the memory that will hold the bitmap
 * @start:    first PFN covered by the descriptor
 * @end:      end PFN covered by the descriptor
 *
 * Returns the size of the bitmap in bytes.
 */
static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
	unsigned long mapstart, unsigned long start, unsigned long end)
{
	unsigned long mapsize;

	/* start and end are passed by address, so they may be adjusted here */
	mminit_validate_memmodel_limits(&start, &end);
	bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
	bdata->node_min_pfn = start;
	bdata->node_low_pfn = end;
	/* Insert bdata into the global list (needs node_min_pfn to be set) */
	link_bootmem(bdata);

	/*
	 * Initially all pages are reserved - setup_arch() has to
	 * register free RAM areas explicitly.
	 */
	 /* Bitmap size in bytes for (end - start) pages - presumably one bit
	 per page, i.e. page count / 8, as computed by bootmap_bytes() */
	mapsize = bootmap_bytes(end - start);
	/* Set every bit in the map to 1 */
	memset(bdata->node_bootmem_map, 0xff, mapsize);

	bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n",
		bdata - bootmem_node_data, start, mapstart, end, mapsize);

	return mapsize;
}
/*
 * link bdata in order
 *
 * Insert @bdata into the global bdata_list in front of the first entry
 * whose node_min_pfn is larger, or at the tail if there is none, so the
 * list stays sorted by ascending node_min_pfn.
 */
static void __init link_bootmem(bootmem_data_t *bdata)
{
	struct list_head *pos;

	list_for_each(pos, &bdata_list) {
		bootmem_data_t *cur = list_entry(pos, bootmem_data_t, list);

		if (cur->node_min_pfn > bdata->node_min_pfn)
			break;
	}
	/* pos is either the first larger entry or the list head itself */
	list_add_tail(&bdata->list, pos);
}
/**
 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
 *
 * If an architecture guarantees that all ranges registered with
 * add_active_ranges() contain no holes and may be freed, this
 * function may be used instead of calling free_bootmem() manually.
 */
void __init free_bootmem_with_active_regions(int nid,
						unsigned long max_low_pfn)
{
	int i;

	/* Walk the active ranges recorded for the node */
	for_each_active_range_index_in_nid(i, nid) {
		unsigned long first_pfn = early_node_map[i].start_pfn;
		unsigned long last_pfn = early_node_map[i].end_pfn;

		/* Range lies entirely at or above the limit: nothing to free */
		if (first_pfn >= max_low_pfn)
			continue;

		/* Clip a range that crosses the limit */
		if (last_pfn > max_low_pfn)
			last_pfn = max_low_pfn;

		/* Free the range, i.e. clear its bits in the bitmap */
		free_bootmem_node(NODE_DATA(early_node_map[i].nid),
				  PFN_PHYS(first_pfn),
				  (last_pfn - first_pfn) << PAGE_SHIFT);
	}
}
/**
 * free_bootmem_node - mark a page range as usable
 * @pgdat: node the range resides on
 * @physaddr: starting address of the range
 * @size: size of the range in bytes
 *
 * Partial pages will be considered reserved and left as they are.
 *
 * The range must reside completely on the specified node.
 */
void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
			      unsigned long size)
{
	/* Round inwards so that only whole pages are freed */
	unsigned long first_pfn = PFN_UP(physaddr);
	unsigned long end_pfn = PFN_DOWN(physaddr + size);

	/* kmemleak bookkeeping hook */
	kmemleak_free_part(__va(physaddr), size);

	/* reserve == 0: clear the bits, marking the pages as not allocated */
	mark_bootmem_node(pgdat->bdata, first_pfn, end_pfn, 0, 0);
}
/*
 * Reserve or free the PFN range [start, end) on a single node.
 *
 * @bdata:   descriptor of the node; the range must lie inside
 *           [node_min_pfn, node_low_pfn], otherwise BUG_ON fires
 * @start:   first PFN of the range
 * @end:     end PFN of the range
 * @reserve: non-zero to reserve the range, zero to free it
 * @flags:   passed on to __reserve() when reserving
 *
 * Returns the result of __reserve() when reserving, 0 when freeing.
 */
static int __init mark_bootmem_node(bootmem_data_t *bdata,
				unsigned long start, unsigned long end,
				int reserve, int flags)
{
	unsigned long first_idx, last_idx;

	bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n",
		bdata - bootmem_node_data, start, end, reserve, flags);

	BUG_ON(start < bdata->node_min_pfn);
	BUG_ON(end > bdata->node_low_pfn);

	/* Turn PFNs into bitmap indices relative to the node's first page */
	first_idx = start - bdata->node_min_pfn;
	last_idx = end - bdata->node_min_pfn;

	if (!reserve) {
		/* Clear the corresponding bits */
		__free(bdata, first_idx, last_idx);
		return 0;
	}

	return __reserve(bdata, first_idx, last_idx, flags);
}
/*
 * Reserve the bitmap index range [sidx, eidx) of one node by setting its
 * bits.
 *
 * With BOOTMEM_EXCLUSIVE in @flags, hitting a bit that is already set
 * undoes the bits set so far by this call and returns -EBUSY. Without it,
 * a double reservation is only logged. Returns 0 otherwise.
 */
static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
			unsigned long eidx, int flags)
{
	int exclusive = flags & BOOTMEM_EXCLUSIVE;
	unsigned long idx;

	bdebug("nid=%td start=%lx end=%lx flags=%x\n",
		bdata - bootmem_node_data,
		sidx + bdata->node_min_pfn,
		eidx + bdata->node_min_pfn,
		flags);

	for (idx = sidx; idx < eidx; idx++) {
		/* Bit was clear and is now set: page reserved, move on */
		if (!test_and_set_bit(idx, bdata->node_bootmem_map))
			continue;

		if (exclusive) {
			/* Roll back the pages reserved before the conflict */
			__free(bdata, sidx, idx);
			return -EBUSY;
		}

		bdebug("silent double reserve of PFN %lx\n",
			idx + bdata->node_min_pfn);
	}

	return 0;
}
/*
 * Free the bitmap index range [sidx, eidx) of one node by clearing its
 * bits. Freeing a page whose bit is already clear is a BUG().
 */
static void __init __free(bootmem_data_t *bdata,
			unsigned long sidx, unsigned long eidx)
{
	unsigned long idx = sidx;

	bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data,
		sidx + bdata->node_min_pfn,
		eidx + bdata->node_min_pfn);

	/* Pull the search hint back so the freed pages are found again */
	if (bdata->hint_idx > sidx) {
		bdata->hint_idx = sidx;
	}

	while (idx < eidx) {
		if (!test_and_clear_bit(idx, bdata->node_bootmem_map)) {
			BUG();
		}
		idx++;
	}
}
/*
 * Transfer the early reservations that intersect [start, end) into the
 * bootmem allocator: the overlapping part of each entry in the early_res
 * table is reserved via reserve_bootmem_generic(). Each entry is logged,
 * together with the clipped range when there is an overlap.
 */
void __init early_res_to_bootmem(u64 start, u64 end)
{
	int i;
	int count = 0;

	/* Count the used entries; the first entry with end == 0 ends the table */
	while (count < MAX_EARLY_RES && early_res[count].end)
		count++;

	printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
			 count, start, end);

	for (i = 0; i < count; i++) {
		struct early_res *r = &early_res[i];
		u64 lo, hi;

		printk(KERN_INFO "  #%d [%010llx - %010llx] %16s", i,
			r->start, r->end, r->name);

		/* Clip the reservation to the requested window */
		lo = max(start, r->start);
		hi = min(end, r->end);

		if (lo < hi) {
			printk(KERN_CONT " ==> [%010llx - %010llx]\n",
				lo, hi);
			/* Mark the clipped range as reserved */
			reserve_bootmem_generic(lo, hi - lo,
					BOOTMEM_DEFAULT);
		} else {
			/* No overlap: just terminate the log line */
			printk(KERN_CONT "\n");
		}
	}
}
上面的保留指定区间reserve_bootmem_generic()函数最终调用如下函数

/**
 * reserve_bootmem - mark a page range as reserved
 * @addr: starting address of the range
 * @size: size of the range in bytes
 * @flags: reservation flags (see linux/bootmem.h)
 *
 * Partial pages will be reserved.
 *
 * The range must be contiguous but may span node boundaries.
 */
int __init reserve_bootmem(unsigned long addr, unsigned long size,
			    int flags)
{
	/* Round outwards so that partially covered pages are included */
	unsigned long first_pfn = PFN_DOWN(addr);
	unsigned long end_pfn = PFN_UP(addr + size);

	return mark_bootmem(first_pfn, end_pfn, 1, flags);
}
/*
 * Reserve or free the PFN range [start, end), which may span several
 * nodes.
 *
 * Walks bdata_list and marks the part of the range that falls on each
 * node. Returns 0 once the whole range has been handled. If a reservation
 * fails, the part reserved on earlier nodes is freed again and the error
 * is returned. Reaching the end of the list without covering the range is
 * a BUG().
 */
static int __init mark_bootmem(unsigned long start, unsigned long end,
				int reserve, int flags)
{
	unsigned long pos;
	bootmem_data_t *bdata;

	pos = start;
	/* Walk bdata_list to find the descriptor(s) covering the range */
	list_for_each_entry(bdata, &bdata_list, list) {
		int err;
		unsigned long max;

		if (pos < bdata->node_min_pfn ||
		    pos >= bdata->node_low_pfn) {
			/* Skipping a node is only legal before marking has
			started; afterwards the nodes must be contiguous */
			BUG_ON(pos != start);
			continue;
		}

		max = min(bdata->node_low_pfn, end);
		/* Mark the part of the range that lies on this node */
		err = mark_bootmem_node(bdata, pos, max, reserve, flags);
		if (reserve && err) {/* on failure, roll back: free [start, pos) reserved on earlier nodes */
			mark_bootmem(start, pos, 0, 0);
			return err;
		}

		if (max == end)
			return 0;
		/* Continue on the next node where this one ends */
		pos = bdata->node_low_pfn;
	}
	BUG();
}
三、内存的分配和释放

介绍了上面的初始化流程,对于分配和释放就简单了:分配就是将分配器映射位图中对应的位置为1,释放过程相反(将对应的位清0)。

/*
 * Allocate @size bytes from one node's bootmem bitmap.
 *
 * @bdata: descriptor of the node to allocate from
 * @size:  number of bytes wanted; must be non-zero
 * @align: alignment in bytes; must be a power of two (or zero)
 * @goal:  preferred starting address, 0 for none
 * @limit: upper address bound, 0 for none
 *
 * Returns the zeroed region as a virtual address, or NULL if the node has
 * no bitmap or no suitable free range.
 */
static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
					unsigned long size, unsigned long align,
					unsigned long goal, unsigned long limit)
{
	unsigned long fallback = 0;
	unsigned long min, max, start, sidx, midx, step;

	bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
		bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
		align, goal, limit);

	BUG_ON(!size);
	BUG_ON(align & (align - 1));
	BUG_ON(limit && goal + size > limit);
	/* No bitmap on this node: the allocation fails */
	if (!bdata->node_bootmem_map)
		return NULL;

	min = bdata->node_min_pfn;
	max = bdata->node_low_pfn;

	/* From here on goal and limit are page frame numbers */
	goal >>= PAGE_SHIFT;
	limit >>= PAGE_SHIFT;

	if (limit && max > limit)
		max = limit;
	if (max <= min)
		return NULL;
	/* Alignment expressed in pages, at least one page */
	step = max(align >> PAGE_SHIFT, 1UL);
	/* First candidate page: the goal if it lies inside the node,
	otherwise the start of the node */
	if (goal && min < goal && goal < max)
		start = ALIGN(goal, step);
	else
		start = ALIGN(min, step);
	/* Search window as bitmap indices */
	sidx = start - bdata->node_min_pfn;
	midx = max - bdata->node_min_pfn;
	/* If the previous allocation ended beyond the start computed above,
	search from there first and remember the computed start so that the
	search can be retried from it if the first pass finds nothing */
	if (bdata->hint_idx > sidx) {
		/*
		 * Handle the valid case of sidx being zero and still
		 * catch the fallback below.
		 */
		fallback = sidx + 1;
		/* Start at the hint, rounded to the alignment step */
		sidx = align_idx(bdata, bdata->hint_idx, step);
	}

	while (1) {
		int merge;
		void *region;
		unsigned long eidx, i, start_off, end_off;
find_block:
		/* Find the next clear (free) bit */
		sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx);
		sidx = align_idx(bdata, sidx, step);
		eidx = sidx + PFN_UP(size);/* end index of the candidate range */

		if (sidx >= midx || eidx > midx)/* ran past the search window */
			break;

		for (i = sidx; i < eidx; i++)/* check that the whole range is free */
			if (test_bit(i, bdata->node_bootmem_map)) {/* busy page: resume the search after it */
				sidx = align_idx(bdata, i, step);
				if (sidx == i)
					sidx += step;
				goto find_block;
			}

		/* If the previous allocation ended in the middle of a page and
		that page directly precedes sidx, start in the unused tail of
		that page instead of on a fresh page boundary */
		if (bdata->last_end_off & (PAGE_SIZE - 1) &&
				PFN_DOWN(bdata->last_end_off) + 1 == sidx)
			start_off = align_off(bdata, bdata->last_end_off, align);
		else
			start_off = PFN_PHYS(sidx);

		/* merge == 1 means the region starts inside the page before
		sidx; that page is skipped when reserving below */
		merge = PFN_DOWN(start_off) < sidx;
		end_off = start_off + size;
		/* Remember where this allocation ends for the next call */
		bdata->last_end_off = end_off;
		bdata->hint_idx = PFN_UP(end_off);

		/*
		 * Reserve the area now:
		 */
		 /* Set the bitmap bits of the pages newly taken by this
		 allocation; a conflict here is a BUG */
		if (__reserve(bdata, PFN_DOWN(start_off) + merge,
				PFN_UP(end_off), BOOTMEM_EXCLUSIVE))
			BUG();
		/* Virtual address of the start of the region */
		region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
				start_off);
		memset(region, 0, size);/* hand out zeroed memory */
		/*
		 * The min_count is set to 0 so that bootmem allocated blocks
		 * are never reported as leaks.
		 */
		kmemleak_alloc(region, size, 0, 0);
		return region;
	}

	if (fallback) {/* nothing found from the hint: retry from the original start */
		sidx = align_idx(bdata, fallback - 1, step);
		fallback = 0;
		goto find_block;
	}

	return NULL;
}

你可能感兴趣的:(数据结构,linux,list,struct,活动,each)