linux内核分析 —— zone结构体对齐

内核版本:linux-6.2

在很多linux内核资料上都提到,为了防止cache伪共享(false sharing),内核对zone结构体进行了特殊的设计。

下面是zone结构体的定义:

struct zone {
	unsigned long _watermark[NR_WMARK];
	unsigned long watermark_boost;
	unsigned long nr_reserved_highatomic;
	long lowmem_reserve[MAX_NR_ZONES];
#ifdef CONFIG_NUMA
	int node;
#endif
	struct pglist_data	*zone_pgdat;
	struct per_cpu_pages	__percpu *per_cpu_pageset;
	struct per_cpu_zonestat	__percpu *per_cpu_zonestats;
	int pageset_high;
	int pageset_batch;
#ifndef CONFIG_SPARSEMEM
	unsigned long		*pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
	unsigned long		zone_start_pfn;
	atomic_long_t		managed_pages;
	unsigned long		spanned_pages;
	unsigned long		present_pages;
#if defined(CONFIG_MEMORY_HOTPLUG)
	unsigned long		present_early_pages;
#endif
#ifdef CONFIG_CMA
	unsigned long		cma_pages;
#endif
	const char		*name;
#ifdef CONFIG_MEMORY_ISOLATION
	unsigned long		nr_isolate_pageblock;
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
	seqlock_t		span_seqlock;
#endif
	int initialized;
	
	/* Write-intensive fields used from the page allocator */
	CACHELINE_PADDING(_pad1_);
	struct free_area	free_area[MAX_ORDER];
	unsigned long		flags;
	spinlock_t		lock;

	/* Write-intensive fields used by compaction and vmstats. */
	CACHELINE_PADDING(_pad2_);
	unsigned long percpu_drift_mark;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
	unsigned long		compact_cached_free_pfn;
	unsigned long		compact_cached_migrate_pfn[ASYNC_AND_SYNC];
	unsigned long		compact_init_migrate_pfn;
	unsigned long		compact_init_free_pfn;
#endif
#ifdef CONFIG_COMPACTION
	unsigned int		compact_considered;
	unsigned int		compact_defer_shift;
	int			compact_order_failed;
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
	bool			compact_blockskip_flush;
#endif
	bool			contiguous;

	CACHELINE_PADDING(_pad3_);
	atomic_long_t		vm_stat[NR_VM_ZONE_STAT_ITEMS];
	atomic_long_t		vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
} ____cacheline_internodealigned_in_smp;

在这个结构体内部利用CACHELINE_PADDING对部分成员进行了对齐,同时最后对整个结构体也进行了对齐。

其中____cacheline_internodealigned_in_smp和CACHELINE_PADDING的定义都在include/linux/cache.h中:

#define ____cacheline_internodealigned_in_smp \
	__attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT))))

/*
 * The maximum alignment needed for some critical structures
 * These could be inter-node cacheline sizes/L3 cacheline
 * size etc.  Define this in asm/cache.h for your arch
 */
#ifndef INTERNODE_CACHE_SHIFT
#define INTERNODE_CACHE_SHIFT L1_CACHE_SHIFT
#endif


#if defined(CONFIG_SMP)
struct cacheline_padding {
	char x[0];
} ____cacheline_internodealigned_in_smp;
#define CACHELINE_PADDING(name)		struct cacheline_padding name

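可以看到,这两个宏都是按(1 << INTERNODE_CACHE_SHIFT)字节对齐,而INTERNODE_CACHE_SHIFT在体系结构没有单独定义时默认等于L1_CACHE_SHIFT,即默认按L1 cache line的大小对齐。用它修饰结构体整体,表示zone结构体类型的变量的起始地址按L1 cacheline对齐;在结构体内部使用CACHELINE_PADDING,则表示紧跟在其后的结构体成员的地址按L1 cacheline对齐。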

那L1 cacheline的大小具体是多少呢?

以我自己的测试环境为例,是x86_64架构的,L1 cacheline的长度定义在arch/x86/include/asm/cache.h中:

/* L1 cache line size */
#define L1_CACHE_SHIFT	(CONFIG_X86_L1_CACHE_SHIFT)
#define INTERNODE_CACHE_SHIFT CONFIG_X86_INTERNODE_CACHE_SHIFT

在编译生成的autoconf.h中可以看到这两个宏的值:

#define CONFIG_X86_INTERNODE_CACHE_SHIFT 6
#define CONFIG_X86_L1_CACHE_SHIFT 6

即:1 << 6 = 64,L1 cacheline的长度是64字节,所以需要按64字节对齐。

下面通过crash工具验证一下。

首先验证zone结构体的起始地址:

crash> kmem -z | grep "ZONE:"
NODE: 0  ZONE: 0  ADDR: ffff88807fffb000  NAME: "DMA"
NODE: 0  ZONE: 1  ADDR: ffff88807fffb600  NAME: "DMA32"
NODE: 0  ZONE: 2  ADDR: ffff88807fffbc00  NAME: "Normal"
NODE: 0  ZONE: 3  ADDR: ffff88807fffc200  NAME: "Movable"
NODE: 1  ZONE: 0  ADDR: ffff88813fff8000  NAME: "DMA"
NODE: 1  ZONE: 1  ADDR: ffff88813fff8600  NAME: "DMA32"
NODE: 1  ZONE: 2  ADDR: ffff88813fff8c00  NAME: "Normal"
NODE: 1  ZONE: 3  ADDR: ffff88813fff9200  NAME: "Movable"

可以看到上面的这些zone的起始地址的低9位都是0,即都是512字节对齐的;512是64的整数倍,因此自然也满足64字节对齐的要求。

再看看zone结构体内部成员的偏移量:

crash> struct zone -ox
struct zone {
    [0x0] unsigned long _watermark[4];
   [0x20] unsigned long watermark_boost;
   [0x28] unsigned long nr_reserved_highatomic;
   [0x30] long lowmem_reserve[4];
   [0x50] int node;
   [0x58] struct pglist_data *zone_pgdat;
   [0x60] struct per_cpu_pages *per_cpu_pageset;
   [0x68] struct per_cpu_zonestat *per_cpu_zonestats;
   [0x70] int pageset_high;
   [0x74] int pageset_batch;
   [0x78] unsigned long zone_start_pfn;
   [0x80] atomic_long_t managed_pages;
   [0x88] unsigned long spanned_pages;
   [0x90] unsigned long present_pages;
   [0x98] const char *name;
   [0xa0] unsigned long nr_isolate_pageblock;
   [0xa8] int initialized;
   [0xc0] struct cacheline_padding _pad1_;
   [0xc0] struct free_area free_area[11];        // 偏移量是0xc0,低6位是0,是64字节对齐
  [0x488] unsigned long flags;
  [0x490] spinlock_t lock;
  [0x500] struct cacheline_padding _pad2_;
  [0x500] unsigned long percpu_drift_mark;       // 偏移量是0x500,低8位是0,是256字节对齐
  [0x508] unsigned long compact_cached_free_pfn;
  [0x510] unsigned long compact_cached_migrate_pfn[2];
  [0x520] unsigned long compact_init_migrate_pfn;
  [0x528] unsigned long compact_init_free_pfn;
  [0x530] unsigned int compact_considered;
  [0x534] unsigned int compact_defer_shift;
  [0x538] int compact_order_failed;
  [0x53c] bool compact_blockskip_flush;
  [0x53d] bool contiguous;
  [0x540] struct cacheline_padding _pad3_;
  [0x540] atomic_long_t vm_stat[11];           // 偏移量是0x540,低6位是0,是64字节对齐
  [0x598] atomic_long_t vm_numa_event[6];
}
SIZE: 0x600

完。

你可能感兴趣的:(Linux内核,Linux内存管理,linux,运维,服务器)