Understanding the CPU Cache and Performance Optimization

Author

QQ group: 852283276
WeChat: arm80x86
WeChat official account: 青儿创客基地
Bilibili: https://space.bilibili.com/208826118

References

cache结构与工作原理
Cache的组成结构
浅析x86架构中cache的组织结构
x86架构里的cache
7个示例科普CPU CACHE
Gallery of Processor Cache Effects
x64内核内存空间结构
cache组织方式(VIVT、VIPT、PIPT)
CPU缓存实现方式:VIVT、VIPT和PIPT

Cache Structure

There are three physical cache organizations (a small set-count sketch follows the list):

  1. Direct mapped: a given memory address can live in exactly one cache location; equivalent to a set-associative cache whose sets each have a single way.
  2. Set-associative: the cache is divided into N sets; a memory address maps to one set but can occupy any way within that set.
  3. Fully associative: a memory address can occupy any location in the cache; equivalent to a set-associative cache with a single set.
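
As a quick numeric check of how the three organizations relate, here is a minimal sketch (assuming the 32 KiB size and 64-byte line that the lscpu output below reports) that derives the set count from cache size, line size and associativity:

#include <stdio.h>

/* Number of sets = cache size / (line size * ways).
 * Direct mapped is the ways = 1 corner case; fully associative is the
 * single-set case where ways = size / line size. */
static unsigned int num_sets(unsigned int size, unsigned int line, unsigned int ways)
{
	return size / (line * ways);
}

int main(void)
{
	printf("direct mapped   : %u sets\n", num_sets(32 * 1024, 64, 1));   /* 512 */
	printf("8-way set-assoc.: %u sets\n", num_sets(32 * 1024, 64, 8));   /* 64  */
	printf("fully assoc.    : %u sets\n", num_sets(32 * 1024, 64, 512)); /* 1   */
	return 0;
}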

Below is the cache layout of my virtual machine, taking the L1 D-Cache as an example:

zc@ubuntu:~/project/fdk/mwm197$ lscpu
Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                2
On-line CPU(s) list:   0,1
Thread(s) per core:    1
Core(s) per socket:    2
Socket(s):             1
NUMA node(s):          1
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 158
Model name:            Intel(R) Core(TM) i3-8100 CPU @ 3.60GHz
Stepping:              11
CPU MHz:               3600.007
BogoMIPS:              7200.01
Hypervisor vendor:     VMware
Virtualization type:   full
L1d cache:             32K
L1i cache:             32K
L2 cache:              256K
L3 cache:              6144K
NUMA node0 CPU(s):     0,1
Flags:                 fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts nopl xtopology tsc_reliable nonstop_tsc cpuid aperfmperf pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch epb invpcid_single pti retpoline rsb_ctxsw fsgsbase tsc_adjust bmi1 avx2 smep bmi2 invpcid rdseed adx smap xsaveopt dtherm arat pln pts hwp hwp_notify hwp_act_window hwp_epp              
zc@ubuntu:~/project/fdk/mwm197$ ls /sys/devices/system/cpu/
cpu0/            cpufreq/         hotplug/         isolated         modalias         online           power/           uevent           
cpu1/            cpuidle/         intel_pstate/    kernel_max       offline          possible         present          vulnerabilities/ 
zc@ubuntu:~/project/fdk/mwm197$ ls /sys/devices/system/cpu/cpu0/
cache/            crash_notes_size  firmware_node/    node0/            subsystem/        uevent            
crash_notes       driver/           hotplug/          power/            topology/         
zc@ubuntu:~/project/fdk/mwm197$ ls /sys/devices/system/cpu/cpu0/cache/
index0/ index1/ index2/ index3/ power/  uevent  
zc@ubuntu:~/project/fdk/mwm197$ ls /sys/devices/system/cpu/cpu0/cache/index0/
coherency_line_size      level                    physical_line_partition  shared_cpu_list          size                     uevent                   
id                       number_of_sets           power/                   shared_cpu_map           type                     ways_of_associativity    
zc@ubuntu:~/project/fdk/mwm197$ cat /sys/devices/system/cpu/cpu0/cache/index0/type 
Data
zc@ubuntu:~/project/fdk/mwm197$ cat /sys/devices/system/cpu/cpu0/cache/index0/level 
1
zc@ubuntu:~/project/fdk/mwm197$ cat /sys/devices/system/cpu/cpu0/cache/index0/size 
32K
zc@ubuntu:~/project/fdk/mwm197$ cat /sys/devices/system/cpu/cpu0/cache/index0/number_of_sets 
64
zc@ubuntu:~/project/fdk/mwm197$ cat /sys/devices/system/cpu/cpu0/cache/index0/ways_of_associativity 
8
zc@ubuntu:~/project/fdk/mwm197$ cat /sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size 
64

The L1 D-Cache is 32 KiB (0x8000) with 64 sets, 8 ways per set and 64-byte cache lines (64 sets × 8 ways × 64 B = 32 KiB). Assuming a 64-bit address, it is split as shown below. The 8-way associativity does not show up in the split, because placement within a set is effectively fully associative.

|----- tag -----|---- set ----|-- word offset --|-- byte offset --|
|---- 63:12 ----|--- 11:6 ----|------ 5:3 ------|------ 2:0 ------|
|--- 52 bits ---|-- 6 bits ---|----- 3 bits ----|----- 3 bits ----|
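
To sanity-check the split, here is a minimal user-space sketch assuming the geometry read from sysfs above (64 sets, 64-byte lines); the two offset fields are folded into a single 6-bit line offset and the two addresses are made-up examples:

#include <stdio.h>
#include <stdint.h>

#define LINE_SIZE   64u   /* coherency_line_size */
#define NUM_SETS    64u   /* number_of_sets */
#define OFFSET_BITS 6     /* log2(LINE_SIZE) */
#define SET_BITS    6     /* log2(NUM_SETS)  */

static void decompose(uint64_t addr)
{
	uint64_t offset = addr & (LINE_SIZE - 1);                  /* bits 5:0   */
	uint64_t set    = (addr >> OFFSET_BITS) & (NUM_SETS - 1);  /* bits 11:6  */
	uint64_t tag    = addr >> (OFFSET_BITS + SET_BITS);        /* bits 63:12 */

	printf("addr=0x%012llx tag=0x%llx set=%llu offset=%llu\n",
	       (unsigned long long)addr, (unsigned long long)tag,
	       (unsigned long long)set, (unsigned long long)offset);
}

int main(void)
{
	/* Two addresses 4 KiB apart land in the same set; only the 8 ways
	 * allow both lines to be resident at the same time. */
	decompose(0x7ffdc0001040ULL);
	decompose(0x7ffdc0002040ULL);
	return 0;
}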

Cache Indexing: VIVT, VIPT and PIPT

If the cache is indexed by virtual addresses (VIVT, or a VIPT cache whose index bits extend beyond the page offset), then when different contexts (user space and kernel space, or different processes) access the same physical memory, cache aliasing can occur: several distinct virtual addresses map to the same physical address, so the same physical data may sit in more than one cache line and the copies can fall out of sync. Copying the data across the user/kernel boundary with copy_to_user() is the safer approach, at the cost of some performance. Note that the L1 D-Cache above avoids the problem: its 6 index bits plus 6 offset bits all fall inside the 12-bit page offset (each way is exactly 4 KiB), so virtual and physical addresses select the same set.
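
For illustration, a minimal sketch of the copy_to_user() approach in a character-device read handler; struct demo_dev, its fields and demo_read are hypothetical names, not taken from any real driver:

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>

struct demo_dev {		/* hypothetical device state */
	void *kbuf;		/* kernel-side buffer, e.g. from kmalloc() */
	size_t data_len;	/* amount of valid data in kbuf */
};

/* Instead of exposing kbuf to user space through a second virtual mapping,
 * copy the data out, so the kernel mapping stays the only cached view. */
static ssize_t demo_read(struct file *filp, char __user *ubuf,
			 size_t count, loff_t *ppos)
{
	struct demo_dev *dev = filp->private_data;
	size_t n = min(count, dev->data_len);

	/* copy_to_user() returns the number of bytes it could NOT copy. */
	if (copy_to_user(ubuf, dev->kbuf, n))
		return -EFAULT;

	*ppos += n;
	return n;
}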

Linux Cache Maintenance Interfaces

Platform: ARMv7 Cortex-A9, Xilinx petalinux-v2015.2.1, kernel 3.19.0.

dma_sync_single_for_cpu/dma_sync_single_for_device

//include\asm-generic\dma-mapping-common.h
dma_sync_single_for_cpu/dma_sync_single_for_device
  struct dma_map_ops *ops = get_dma_ops(dev); //arch\arm\include\asm\dma-mapping.h
    __generic_dma_ops
      dev->archdata.dma_ops //custom dma_map_ops, if set, override the default
      arm_dma_ops //the default, arch\arm\mm\dma-mapping.c
  ops->sync_single_for_cpu/sync_single_for_device => arm_dma_sync_single_for_cpu/arm_dma_sync_single_for_device //arch\arm\mm\dma-mapping.c
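
Before following the call chain down, here is a sketch of the driver-side pattern that eventually lands in these functions; rx_one_buffer and its arguments are placeholder names, and the buffer is assumed to be a kmalloc'd streaming-DMA buffer rather than one from dma_alloc_coherent():

#include <linux/dma-mapping.h>

static int rx_one_buffer(struct device *dev, void *buf, size_t len)
{
	dma_addr_t handle;

	/* Hand the buffer to the device: the CPU caches are cleaned or
	 * invalidated as needed so the device sees consistent memory. */
	handle = dma_map_single(dev, buf, len, DMA_FROM_DEVICE);
	if (dma_mapping_error(dev, handle))
		return -ENOMEM;

	/* ... start the DMA transfer, wait for the completion interrupt ... */

	/* Give ownership back to the CPU: stale cache lines are invalidated
	 * so the CPU reads what the device actually wrote. */
	dma_sync_single_for_cpu(dev, handle, len, DMA_FROM_DEVICE);
	/* ... CPU inspects buf here ... */

	/* Hand the buffer to the device again for the next transfer. */
	dma_sync_single_for_device(dev, handle, len, DMA_FROM_DEVICE);

	/* ... when finished with the buffer for good: */
	dma_unmap_single(dev, handle, len, DMA_FROM_DEVICE);
	return 0;
}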

arm_dma_ops

//arch\arm\mm\dma-mapping.c line130
struct dma_map_ops arm_dma_ops = {
	.alloc			= arm_dma_alloc,
	.free			= arm_dma_free,
	.mmap			= arm_dma_mmap,
	.get_sgtable		= arm_dma_get_sgtable,
	.map_page		= arm_dma_map_page,
	.unmap_page		= arm_dma_unmap_page,
	.map_sg			= arm_dma_map_sg,
	.unmap_sg		= arm_dma_unmap_sg,
	.sync_single_for_cpu	= arm_dma_sync_single_for_cpu,
	.sync_single_for_device	= arm_dma_sync_single_for_device,
	.sync_sg_for_cpu	= arm_dma_sync_sg_for_cpu,
	.sync_sg_for_device	= arm_dma_sync_sg_for_device,
	.set_dma_mask		= arm_dma_set_mask,
};
EXPORT_SYMBOL(arm_dma_ops);

arm_dma_sync_single_for_cpu/arm_dma_sync_single_for_device

//arch\arm\mm\dma-mapping.c line144
static void arm_dma_sync_single_for_cpu(struct device *dev,
		dma_addr_t handle, size_t size, enum dma_data_direction dir)
{
	unsigned int offset = handle & (PAGE_SIZE - 1);
	struct page *page = pfn_to_page(dma_to_pfn(dev, handle-offset));
	__dma_page_dev_to_cpu(page, offset, size, dir);
}

static void arm_dma_sync_single_for_device(struct device *dev,
		dma_addr_t handle, size_t size, enum dma_data_direction dir)
{
	unsigned int offset = handle & (PAGE_SIZE - 1);
	struct page *page = pfn_to_page(dma_to_pfn(dev, handle-offset));
	__dma_page_cpu_to_dev(page, offset, size, dir);
}

__dma_page_cpu_to_dev/__dma_page_dev_to_cpu

//arch\arm\mm\dma-mapping.c line836
/*
 * Make an area consistent for devices.
 * Note: Drivers should NOT use this function directly, as it will break
 * platforms with CONFIG_DMABOUNCE.
 * Use the driver DMA support - see dma-mapping.h (dma_sync_*)
 */
static void __dma_page_cpu_to_dev(struct page *page, unsigned long off,
	size_t size, enum dma_data_direction dir)
{
	phys_addr_t paddr;

	dma_cache_maint_page(page, off, size, dir, dmac_map_area);

	paddr = page_to_phys(page) + off;
	if (dir == DMA_FROM_DEVICE) {
		outer_inv_range(paddr, paddr + size);
	} else {
		outer_clean_range(paddr, paddr + size);
	}
	/* FIXME: non-speculating: flush on bidirectional mappings? */
}

static void __dma_page_dev_to_cpu(struct page *page, unsigned long off,
	size_t size, enum dma_data_direction dir)
{
	phys_addr_t paddr = page_to_phys(page) + off;

	/* FIXME: non-speculating: not required */
	/* in any case, don't bother invalidating if DMA to device */
	if (dir != DMA_TO_DEVICE) {
		outer_inv_range(paddr, paddr + size);

		dma_cache_maint_page(page, off, size, dir, dmac_unmap_area);
	}

	/*
	 * Mark the D-cache clean for these pages to avoid extra flushing.
	 */
	if (dir != DMA_TO_DEVICE && size >= PAGE_SIZE) {
		unsigned long pfn;
		size_t left = size;

		pfn = page_to_pfn(page) + off / PAGE_SIZE;
		off %= PAGE_SIZE;
		if (off) {
			pfn++;
			left -= PAGE_SIZE - off;
		}
		while (left >= PAGE_SIZE) {
			page = pfn_to_page(pfn++);
			set_bit(PG_dcache_clean, &page->flags);
			left -= PAGE_SIZE;
		}
	}
}

arm_dma_sync_sg_for_cpu/arm_dma_sync_sg_for_device

arm_dma_sync_sg_for_cpu/arm_dma_sync_sg_for_device iterate over the scatterlist and call ops->sync_single_for_cpu/ops->sync_single_for_device (arm_dma_sync_single_for_cpu/arm_dma_sync_single_for_device for the default ops) on each entry:

//arch\arm\mm\dma-mapping.c line932
/**
 * arm_dma_sync_sg_for_cpu
 * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
 * @sg: list of buffers
 * @nents: number of buffers to map (returned from dma_map_sg)
 * @dir: DMA transfer direction (same as was passed to dma_map_sg)
 */
void arm_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
			int nents, enum dma_data_direction dir)
{
	struct dma_map_ops *ops = get_dma_ops(dev);
	struct scatterlist *s;
	int i;

	for_each_sg(sg, s, nents, i)
		ops->sync_single_for_cpu(dev, sg_dma_address(s), s->length,
					 dir);
}

/**
 * arm_dma_sync_sg_for_device
 * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
 * @sg: list of buffers
 * @nents: number of buffers to map (returned from dma_map_sg)
 * @dir: DMA transfer direction (same as was passed to dma_map_sg)
 */
void arm_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
			int nents, enum dma_data_direction dir)
{
	struct dma_map_ops *ops = get_dma_ops(dev);
	struct scatterlist *s;
	int i;

	for_each_sg(sg, s, nents, i)
		ops->sync_single_for_device(dev, sg_dma_address(s), s->length,
					    dir);
}

arm_dma_map_sg/arm_dma_unmap_sg

arm_dma_map_sg/arm_dma_unmap_sg are built on top of map_page/unmap_page; a driver-side usage sketch follows after arm_dma_unmap_sg below:

//arch\arm\mm\dma-mapping.c line870
/**
 * arm_dma_map_sg - map a set of SG buffers for streaming mode DMA
 * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
 * @sg: list of buffers
 * @nents: number of buffers to map
 * @dir: DMA transfer direction
 *
 * Map a set of buffers described by scatterlist in streaming mode for DMA.
 * This is the scatter-gather version of the dma_map_single interface.
 * Here the scatter gather list elements are each tagged with the
 * appropriate dma address and length.  They are obtained via
 * sg_dma_{address,length}.
 *
 * Device ownership issues as mentioned for dma_map_single are the same
 * here.
 */
int arm_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
		enum dma_data_direction dir, struct dma_attrs *attrs)
{
	struct dma_map_ops *ops = get_dma_ops(dev);
	struct scatterlist *s;
	int i, j;

	for_each_sg(sg, s, nents, i) {
#ifdef CONFIG_NEED_SG_DMA_LENGTH
		s->dma_length = s->length;
#endif
		s->dma_address = ops->map_page(dev, sg_page(s), s->offset,
						s->length, dir, attrs);
		if (dma_mapping_error(dev, s->dma_address))
			goto bad_mapping;
	}
	return nents;

 bad_mapping:
	for_each_sg(sg, s, i, j)
		ops->unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir, attrs);
	return 0;
}

/**
 * arm_dma_unmap_sg - unmap a set of SG buffers mapped by dma_map_sg
 * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
 * @sg: list of buffers
 * @nents: number of buffers to unmap (same as was passed to dma_map_sg)
 * @dir: DMA transfer direction (same as was passed to dma_map_sg)
 *
 * Unmap a set of streaming mode DMA translations.  Again, CPU access
 * rules concerning calls here are the same as for dma_unmap_single().
 */
void arm_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
		enum dma_data_direction dir, struct dma_attrs *attrs)
{
	struct dma_map_ops *ops = get_dma_ops(dev);
	struct scatterlist *s;

	int i;

	for_each_sg(sg, s, nents, i)
		ops->unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir, attrs);
}
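
A driver-side sketch that exercises the scatterlist paths above; rx_sg, buf_a and buf_b are placeholder names, and both buffers are assumed to be kmalloc'd and DMA-capable:

#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>

static int rx_sg(struct device *dev, void *buf_a, void *buf_b, size_t len)
{
	struct scatterlist sgl[2];
	int nents;

	sg_init_table(sgl, 2);
	sg_set_buf(&sgl[0], buf_a, len);
	sg_set_buf(&sgl[1], buf_b, len);

	/* Ends up in arm_dma_map_sg() -> arm_dma_map_page() for each entry. */
	nents = dma_map_sg(dev, sgl, 2, DMA_FROM_DEVICE);
	if (!nents)
		return -ENOMEM;

	/* ... program the device with sg_dma_address()/sg_dma_len(), wait ... */

	/* arm_dma_sync_sg_for_cpu(): invalidate the lines before the CPU reads. */
	dma_sync_sg_for_cpu(dev, sgl, 2, DMA_FROM_DEVICE);
	/* ... CPU consumes the data ... */

	dma_unmap_sg(dev, sgl, 2, DMA_FROM_DEVICE);
	return 0;
}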

map_page/unmap_page

//arch\arm\mm\dma-mapping.c line61
/**
 * arm_dma_map_page - map a portion of a page for streaming DMA
 * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
 * @page: page that buffer resides in
 * @offset: offset into page for start of buffer
 * @size: size of buffer to map
 * @dir: DMA transfer direction
 *
 * Ensure that any data held in the cache is appropriately discarded
 * or written back.
 *
 * The device owns this memory once this call has completed.  The CPU
 * can regain ownership by calling dma_unmap_page().
 */
static dma_addr_t arm_dma_map_page(struct device *dev, struct page *page,
	     unsigned long offset, size_t size, enum dma_data_direction dir,
	     struct dma_attrs *attrs)
{
	if (!dma_get_attr(DMA_ATTR_SKIP_CPU_SYNC, attrs))
		__dma_page_cpu_to_dev(page, offset, size, dir);
	return pfn_to_dma(dev, page_to_pfn(page)) + offset;
}

static dma_addr_t arm_coherent_dma_map_page(struct device *dev, struct page *page,
	     unsigned long offset, size_t size, enum dma_data_direction dir,
	     struct dma_attrs *attrs)
{
	return pfn_to_dma(dev, page_to_pfn(page)) + offset;
}

/**
 * arm_dma_unmap_page - unmap a buffer previously mapped through dma_map_page()
 * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
 * @handle: DMA address of buffer
 * @size: size of buffer (same as passed to dma_map_page)
 * @dir: DMA transfer direction (same as passed to dma_map_page)
 *
 * Unmap a page streaming mode DMA translation.  The handle and size
 * must match what was provided in the previous dma_map_page() call.
 * All other usages are undefined.
 *
 * After this call, reads by the CPU to the buffer are guaranteed to see
 * whatever the device wrote there.
 */
static void arm_dma_unmap_page(struct device *dev, dma_addr_t handle,
		size_t size, enum dma_data_direction dir,
		struct dma_attrs *attrs)
{
	if (!dma_get_attr(DMA_ATTR_SKIP_CPU_SYNC, attrs))
		__dma_page_dev_to_cpu(pfn_to_page(dma_to_pfn(dev, handle)),
				      handle & ~PAGE_MASK, size, dir);
}

struct cpu_cache_fns cpu_cache

The cpu_cache structure is initialized in setup_processor():

//arch\arm\kernel\setup.c
static void __init setup_processor(void)
{
	struct proc_info_list *list;

	/*
	 * locate processor in the list of supported processor
	 * types.  The linker builds this table for us from the
	 * entries in arch/arm/mm/proc-*.S
	 */
	list = lookup_processor_type(read_cpuid_id());
	if (!list) {
		pr_err("CPU configuration botched (ID %08x), unable to continue.\n",
		       read_cpuid_id());
		while (1);
	}

	cpu_name = list->cpu_name;
	__cpu_architecture = __get_cpu_architecture();

#ifdef MULTI_CPU
	processor = *list->proc;
#endif
#ifdef MULTI_TLB
	cpu_tlb = *list->tlb;
#endif
#ifdef MULTI_USER
	cpu_user = *list->user;
#endif
#ifdef MULTI_CACHE
	cpu_cache = *list->cache;
#endif

...

}

cpu_cache is a structure holding a set of cache-maintenance function pointers:

struct cpu_cache_fns {
	void (*flush_icache_all)(void);
	void (*flush_kern_all)(void);
	void (*flush_kern_louis)(void);
	void (*flush_user_all)(void);
	void (*flush_user_range)(unsigned long, unsigned long, unsigned int);

	void (*coherent_kern_range)(unsigned long, unsigned long);
	int  (*coherent_user_range)(unsigned long, unsigned long);
	void (*flush_kern_dcache_area)(void *, size_t);

	void (*dma_map_area)(const void *, size_t, int);
	void (*dma_unmap_area)(const void *, size_t, int);

	void (*dma_flush_range)(const void *, const void *);
};

On the Xilinx Zynq platform, the cpu_cache_fns table is built by the macros in arch\arm\mm\proc-macros.S, and the implementations come from the arch\arm\mm\cache-v7.S assembly, for example:

dma_unmap_area -> v7_dma_unmap_area
dma_map_area -> v7_dma_map_area
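
The plumbing from __dma_page_cpu_to_dev()/__dma_page_dev_to_cpu() above down to these assembly routines goes through the dmac_map_area/dmac_unmap_area macros. Roughly (a sketch of the pattern, not verbatim kernel source; the exact header varies between kernel versions):

/* With MULTI_CACHE (several cache types in one kernel image) the macros
 * dispatch through the cpu_cache table filled in by setup_processor();
 * otherwise they resolve at build time to the CPU-specific routines,
 * on Cortex-A9 the v7 ones. */
#ifdef MULTI_CACHE
#define dmac_map_area     cpu_cache.dma_map_area     /* -> v7_dma_map_area   */
#define dmac_unmap_area   cpu_cache.dma_unmap_area   /* -> v7_dma_unmap_area */
#else
#define dmac_map_area     __glue(_CACHE, _dma_map_area)
#define dmac_unmap_area   __glue(_CACHE, _dma_unmap_area)
#endif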

outercache

The outer (L2) cache operations in outer_cache are initialized in __l2c_init(), effectively outer_cache = fns = data->outer_cache:

//arch\arm\mm\cache-l2x0.c
static void __init __l2c_init(const struct l2c_init_data *data,
	u32 aux_val, u32 aux_mask, u32 cache_id)
{
	struct outer_cache_fns fns;
	unsigned way_size_bits, ways;
	u32 aux, old_aux;

	/*
	 * Sanity check the aux values.  aux_mask is the bits we preserve
	 * from reading the hardware register, and aux_val is the bits we
	 * set.
	 */
	if (aux_val & aux_mask)
		pr_alert("L2C: platform provided aux values permit register corruption.\n");

	old_aux = aux = readl_relaxed(l2x0_base + L2X0_AUX_CTRL);
	aux &= aux_mask;
	aux |= aux_val;

	if (old_aux != aux)
		pr_warn("L2C: DT/platform modifies aux control register: 0x%08x -> 0x%08x\n",
		        old_aux, aux);

	/* Determine the number of ways */
	switch (cache_id & L2X0_CACHE_ID_PART_MASK) {
	case L2X0_CACHE_ID_PART_L310:
		if ((aux_val | ~aux_mask) & (L2C_AUX_CTRL_WAY_SIZE_MASK | L310_AUX_CTRL_ASSOCIATIVITY_16))
			pr_warn("L2C: DT/platform tries to modify or specify cache size\n");
		if (aux & (1 << 16))
			ways = 16;
		else
			ways = 8;
		break;

	case L2X0_CACHE_ID_PART_L210:
	case L2X0_CACHE_ID_PART_L220:
		ways = (aux >> 13) & 0xf;
		break;

	case AURORA_CACHE_ID:
		ways = (aux >> 13) & 0xf;
		ways = 2 << ((ways + 1) >> 2);
		break;

	default:
		/* Assume unknown chips have 8 ways */
		ways = 8;
		break;
	}

	l2x0_way_mask = (1 << ways) - 1;

	/*
	 * way_size_0 is the size that a way_size value of zero would be
	 * given the calculation: way_size = way_size_0 << way_size_bits.
	 * So, if way_size_bits=0 is reserved, but way_size_bits=1 is 16k,
	 * then way_size_0 would be 8k.
	 *
	 * L2 cache size = number of ways * way size.
	 */
	way_size_bits = (aux & L2C_AUX_CTRL_WAY_SIZE_MASK) >>
			L2C_AUX_CTRL_WAY_SIZE_SHIFT;
	l2x0_size = ways * (data->way_size_0 << way_size_bits);

	fns = data->outer_cache;
	fns.write_sec = outer_cache.write_sec;
	if (data->fixup)
		data->fixup(l2x0_base, cache_id, &fns);

	/*
	 * Check if l2x0 controller is already enabled.  If we are booting
	 * in non-secure mode accessing the below registers will fault.
	 */
	if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN))
		data->enable(l2x0_base, aux, data->num_lock);

	outer_cache = fns;

	/*
	 * It is strange to save the register state before initialisation,
	 * but hey, this is what the DT implementations decided to do.
	 */
	if (data->save)
		data->save(l2x0_base);

	/* Re-read it in case some bits are reserved. */
	aux = readl_relaxed(l2x0_base + L2X0_AUX_CTRL);

	pr_info("%s cache controller enabled, %d ways, %d kB\n",
		data->type, ways, l2x0_size >> 10);
	pr_info("%s: CACHE_ID 0x%08x, AUX_CTRL 0x%08x\n",
		data->type, cache_id, aux);
}
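
The outer_inv_range()/outer_clean_range() calls seen earlier in __dma_page_cpu_to_dev()/__dma_page_dev_to_cpu() reach this structure through small inline wrappers; the pattern (from arch\arm\include\asm\outercache.h, shown here slightly abridged) is:

static inline void outer_inv_range(phys_addr_t start, phys_addr_t end)
{
	/* No-op unless an outer-cache driver (here the L2C-310) registered itself. */
	if (outer_cache.inv_range)
		outer_cache.inv_range(start, end);
}

static inline void outer_clean_range(phys_addr_t start, phys_addr_t end)
{
	if (outer_cache.clean_range)
		outer_cache.clean_range(start, end);
}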

On the Xilinx Zynq platform, the L2 cache controller is an L2C-310; the boot messages look like this:

L2C: platform modifies aux control register: 0x72360000 -> 0x72760000
L2C: DT/platform modifies aux control register: 0x72360000 -> 0x72760000
L2C-310 erratum 769419 enabled
L2C-310 enabling early BRESP for Cortex-A9
L2C-310 full line of zeros enabled for Cortex-A9
L2C-310 ID prefetch enabled, offset 1 lines
L2C-310 dynamic clock gating enabled, standby mode enabled
L2C-310 cache controller enabled, 8 ways, 512 kB
L2C-310: CACHE_ID 0x410000c8, AUX_CTRL 0x76760001

arm64

ARMv8, Xilinx petalinux-v2018.2, kernel 4.14.0. The cache-maintenance path on this platform goes through routines such as:

__dma_inv_area
__swiotlb_map_sg_attrs
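
A rough call trace for the same dma_sync_single_for_cpu() path on this arm64 kernel, sketched from memory and not verified against this exact source tree:

dma_sync_single_for_cpu
  __swiotlb_sync_single_for_cpu //arch\arm64\mm\dma-mapping.c, skipped if the device is cache-coherent
    __dma_unmap_area //arch\arm64\mm\cache.S
      __dma_inv_area //invalidate, unless the direction is DMA_TO_DEVICE
dma_map_sg
  __swiotlb_map_sg_attrs //calls __dma_map_area (clean/invalidate) per scatterlist entry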
