Linux ioremap 的实现
linux, memory, ioremap
在 linux kernel 的代码中,经常看到 ioremap 函数。
其功能是将给定的物理地址映射为虚拟地址。
注意,此处的物理地址并不是真正内存的物理地址,而是cpu上的io memory。
可以参考芯片《Reference Manual》中的 memory map 章节。
本文主要学习 ioremap 是如何实现的。
ioremap 的定义:
#define ioremap(cookie,size) __arch_ioremap((cookie), (size), MT_DEVICE) #define MT_DEVICE 0 #define __arch_ioremap __arm_ioremap void __iomem * __arm_ioremap(unsigned long phys_addr, size_t size, unsigned int mtype) { return __arm_ioremap_caller(phys_addr, size, mtype, __builtin_return_address(0)); } void __iomem *__arm_ioremap_caller(unsigned long phys_addr, size_t size, unsigned int mtype, void *caller) { unsigned long last_addr; unsigned long offset = phys_addr & ~PAGE_MASK; unsigned long pfn = __phys_to_pfn(phys_addr); /* * Don't allow wraparound or zero size */ last_addr = phys_addr + size - 1; if (!size || last_addr < phys_addr) return NULL; return __arm_ioremap_pfn_caller(pfn, offset, size, mtype, caller); } void __iomem * __arm_ioremap_pfn_caller(unsigned long pfn, unsigned long offset, size_t size, unsigned int mtype, void *caller) { const struct mem_type *type; int err; unsigned long addr; struct vm_struct * area; /* * High mappings must be supersection aligned */ // 高端内存需要对齐到 supersection if (pfn >= 0x100000 && (__pfn_to_phys(pfn) & ~SUPERSECTION_MASK)) return NULL; /* * Don't allow RAM to be mapped - this causes problems with ARMv6+ */ // map 的不能是 RAM ,只能是 soc 的 io memory /* int pfn_valid(unsigned long pfn) { return memblock_is_memory(pfn << PAGE_SHIFT); } */ if (WARN_ON(pfn_valid(pfn))) return NULL; // get_mem_type 的实现见后文 // 从前文的定义可知, mtype 为 MT_DEVICE type = get_mem_type(mtype); if (!type) return NULL; /* * Page align the mapping size, taking account of any offset. 
*/ size = PAGE_ALIGN(offset + size); // get_vm_area_caller 函数的实现见后面 area = get_vm_area_caller(size, VM_IOREMAP, caller); if (!area) return NULL; addr = (unsigned long)area->addr; #ifndef CONFIG_SMP if (DOMAIN_IO == 0 && (((cpu_architecture() >= CPU_ARCH_ARMv6) && (get_cr() & CR_XP)) || cpu_is_xsc3()) && pfn >= 0x100000 && !((__pfn_to_phys(pfn) | size | addr) & ~SUPERSECTION_MASK)) { area->flags |= VM_ARM_SECTION_MAPPING; err = remap_area_supersections(addr, pfn, size, type); } else if (!((__pfn_to_phys(pfn) | size | addr) & ~PMD_MASK)) { area->flags |= VM_ARM_SECTION_MAPPING; err = remap_area_sections(addr, pfn, size, type); } else #endif // ioremap_page_range 函数的实现见后文 err = ioremap_page_range(addr, addr + size, __pfn_to_phys(pfn), __pgprot(type->prot_pte)); if (err) { vunmap((void *)addr); return NULL; } flush_cache_vmap(addr, addr + size); return (void __iomem *) (offset + addr); }
get_mem_type 函数的实现:
/*
 * Look up the descriptor for memory type @type.
 *
 * Returns a pointer to the matching mem_types[] entry, or NULL when @type
 * is not a valid index into that table.
 */
const struct mem_type *get_mem_type(unsigned int type)
{
	if (type >= ARRAY_SIZE(mem_types))
		return NULL;

	return &mem_types[type];
}
mem_types 的定义:
/*
 * Table of memory type descriptors, indexed by the MT_* constants.
 * get_mem_type() hands out pointers into this array.
 *
 * Each entry fills in some of:
 *   .prot_pte  - value used as page protection for page (PTE) mappings
 *                (ioremap passes it to __pgprot())
 *   .prot_l1   - first-level descriptor bits
 *   .prot_sect - section descriptor bits
 *   .domain    - domain the mapping belongs to
 * Fields that are not listed for an entry are left zero-initialized.
 */
static struct mem_type mem_types[] = {
	[MT_DEVICE] = {		  /* Strongly ordered / ARMv6 shared device */
		.prot_pte	= PROT_PTE_DEVICE | L_PTE_MT_DEV_SHARED |
				  L_PTE_SHARED,
		.prot_l1	= PMD_TYPE_TABLE,
		.prot_sect	= PROT_SECT_DEVICE | PMD_SECT_S,
		.domain		= DOMAIN_IO,
	},
	[MT_DEVICE_NONSHARED] = { /* ARMv6 non-shared device */
		.prot_pte	= PROT_PTE_DEVICE | L_PTE_MT_DEV_NONSHARED,
		.prot_l1	= PMD_TYPE_TABLE,
		.prot_sect	= PROT_SECT_DEVICE,
		.domain		= DOMAIN_IO,
	},
	[MT_DEVICE_CACHED] = {	  /* ioremap_cached */
		.prot_pte	= PROT_PTE_DEVICE | L_PTE_MT_DEV_CACHED,
		.prot_l1	= PMD_TYPE_TABLE,
		.prot_sect	= PROT_SECT_DEVICE | PMD_SECT_WB,
		.domain		= DOMAIN_IO,
	},
	[MT_DEVICE_WC] = {	/* ioremap_wc */
		.prot_pte	= PROT_PTE_DEVICE | L_PTE_MT_DEV_WC,
		.prot_l1	= PMD_TYPE_TABLE,
		.prot_sect	= PROT_SECT_DEVICE,
		.domain		= DOMAIN_IO,
	},
	[MT_UNCACHED] = {
		.prot_pte	= PROT_PTE_DEVICE,
		.prot_l1	= PMD_TYPE_TABLE,
		.prot_sect	= PMD_TYPE_SECT | PMD_SECT_XN,
		.domain		= DOMAIN_IO,
	},
	[MT_CACHECLEAN] = {
		.prot_sect = PMD_TYPE_SECT | PMD_SECT_XN,
		.domain    = DOMAIN_KERNEL,
	},
	[MT_MINICLEAN] = {
		.prot_sect = PMD_TYPE_SECT | PMD_SECT_XN | PMD_SECT_MINICACHE,
		.domain    = DOMAIN_KERNEL,
	},
	[MT_LOW_VECTORS] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
				L_PTE_RDONLY,
		.prot_l1   = PMD_TYPE_TABLE,
		.domain    = DOMAIN_USER,
	},
	[MT_HIGH_VECTORS] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
				L_PTE_USER | L_PTE_RDONLY,
		.prot_l1   = PMD_TYPE_TABLE,
		.domain    = DOMAIN_USER,
	},
	[MT_MEMORY] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY,
		.prot_l1   = PMD_TYPE_TABLE,
		.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
		.domain    = DOMAIN_KERNEL,
	},
	[MT_ROM] = {
		.prot_sect = PMD_TYPE_SECT,
		.domain    = DOMAIN_KERNEL,
	},
	[MT_MEMORY_NONCACHED] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
				L_PTE_MT_BUFFERABLE,
		.prot_l1   = PMD_TYPE_TABLE,
		.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
		.domain    = DOMAIN_KERNEL,
	},
	[MT_MEMORY_DTCM] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
				L_PTE_XN,
		.prot_l1   = PMD_TYPE_TABLE,
		.prot_sect = PMD_TYPE_SECT | PMD_SECT_XN,
		.domain    = DOMAIN_KERNEL,
	},
	[MT_MEMORY_ITCM] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY,
		.prot_l1   = PMD_TYPE_TABLE,
		.domain    = DOMAIN_KERNEL,
	},
};
get_vm_area_caller 函数的实现:
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, void *caller) { /* * Just any arbitrary offset to the start of the vmalloc VM area: the * current 8MB value just means that there will be a 8MB "hole" after the * physical memory until the kernel virtual memory starts. That means that * any out-of-bounds memory accesses will hopefully be caught. * The vmalloc() routines leaves a hole of 4kB between each vmalloced * area for the same reason. ;) * * Note that platforms may override VMALLOC_START, but they must provide * VMALLOC_END. VMALLOC_END defines the (exclusive) limit of this space, * which may not overlap IO space. */ /* #ifndef VMALLOC_START #define VMALLOC_OFFSET (8*1024*1024) // high_memory 在 arch/arm/mm/init.c 文件中的 bootmem_init 函数中赋值,该函数的实现见后文 #define VMALLOC_START (((unsigned long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1)) #endif */ /* vmalloc ending address */ #define VMALLOC_END 0xf2000000UL return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, -1, GFP_KERNEL, caller); } static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, void *caller) { static struct vmap_area *va; struct vm_struct *area; BUG_ON(in_interrupt()); /* bits in flags of vmalloc's vm_struct below */ // #define VM_IOREMAP 0x00000001 /* ioremap() and friends */ if (flags & VM_IOREMAP) { int bit = fls(size); if (bit > IOREMAP_MAX_ORDER) bit = IOREMAP_MAX_ORDER; else if (bit < PAGE_SHIFT) bit = PAGE_SHIFT; align = 1ul << bit; } size = PAGE_ALIGN(size); if (unlikely(!size)) return NULL; /** * kzalloc_node - allocate zeroed memory from a particular memory node. * @size: how many bytes of memory are required. * @flags: the type of memory to allocate (see kmalloc). 
* @node: memory node from which to allocate */ // 分配一个 vm_struct 结构体 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!area)) return NULL; /* * We always allocate a guard page. */ size += PAGE_SIZE; // start 和 end 分别为 VMALLOC_START 和 VMALLOC_END // align 为 1 // alloc_vmap_area 函数的注释: /* * Allocate a region of KVA of the specified size and alignment, within the * vstart and vend. */ // 已经使用的 vm 的信息分别存在各个 vmap_area 结构体中 // 所有的 vmap_area 结构体都在红黑树 vmap_area_root 中 // alloc_vmap_area 函数的主要功能是,查找红黑树 vmap_area_root ,找到 start 和 end 之间满足 size 大小的未使用空间, // 创建一个 vmap_area 结构体,并用找到的未使用空间信息初始化该结构体,然后将该结构体插入到红黑树 vmap_area_root 中 va = alloc_vmap_area(size, align, start, end, node, gfp_mask); if (IS_ERR(va)) { kfree(area); return NULL; } /* * When this function is called from __vmalloc_node_range, * we do not add vm_struct to vmlist here to avoid * accessing uninitialized members of vm_struct such as * pages and nr_pages fields. They will be set later. * To distinguish it from others, we use a VM_UNLIST flag. */ if (flags & VM_UNLIST) setup_vmalloc_vm(area, va, flags, caller); else // 看前面注释可知,上面的 if 分支只是个特殊情况,我们只分析 else 分支 // insert_vmalloc_vm 函数的实现见后文 insert_vmalloc_vm(area, va, flags, caller); return area; }
high_memory 在 arch/arm/mm/init.c 文件中的 bootmem_init 函数中赋值,该函数的实现:
/*
 * Boot-time memory initialization.  Determines the PFN limits of the
 * system, runs the bootmem/sparsemem setup steps in order, and publishes
 * high_memory, max_low_pfn and max_pfn.
 */
void __init bootmem_init(void)
{
	unsigned long min, max_low = 0, max_high = 0;

	/*
	 * Find the start of memory, the top of low memory and the top of
	 * high memory (all as page frame numbers).
	 */
	find_limits(&min, &max_low, &max_high);

	arm_bootmem_init(min, max_low);

	/*
	 * Sparsemem tries to allocate bootmem in memory_present(),
	 * so must be done after the fixed reservations
	 */
	arm_memory_present();

	/*
	 * sparse_init() needs the bootmem allocator up and running.
	 */
	sparse_init();

	/*
	 * Now free the memory - free_area_init_node needs
	 * the sparse mem_map arrays initialized by sparse_init()
	 * for memmap_init_zone(), otherwise all PFNs are invalid.
	 */
	arm_bootmem_free(min, max_low, max_high);

	/*
	 * high_memory is the virtual address at which high memory starts:
	 * the virtual address of the last low-memory byte, plus one.
	 */
	high_memory = __va(((phys_addr_t)max_low << PAGE_SHIFT) - 1) + 1;

	/*
	 * This doesn't seem to be used by the Linux memory manager any
	 * more, but is used by ll_rw_block.  If we can get rid of it, we
	 * also get rid of some of the stuff above as well.
	 *
	 * Note: max_low_pfn and max_pfn reflect the number of _pages_ in
	 * the system, not the maximum PFN.
	 */
	max_low_pfn = max_low - PHYS_PFN_OFFSET;
	max_pfn = max_high - PHYS_PFN_OFFSET;
}
find_limits 函数实现:
/*
 * Scan every bank in meminfo and report the PFN limits of the system.
 *
 * @min:      receives the lowest start PFN of any bank
 * @max_low:  receives the highest end PFN among banks not marked highmem
 * @max_high: receives the highest end PFN among all banks
 */
static void __init find_limits(unsigned long *min, unsigned long *max_low,
			       unsigned long *max_high)
{
	struct meminfo *mi = &meminfo;
	int i;

	*min = -1UL;
	*max_high = 0;
	*max_low = 0;

	for_each_bank (i, mi) {
		struct membank *bank = &mi->bank[i];
		unsigned long first_pfn = bank_pfn_start(bank);
		unsigned long limit_pfn = bank_pfn_end(bank);

		if (first_pfn < *min)
			*min = first_pfn;

		if (limit_pfn > *max_high)
			*max_high = limit_pfn;

		/*
		 * Banks flagged as highmem (see sanity_check_meminfo) do not
		 * move the low-memory limit.
		 */
		if (!bank->highmem && limit_pfn > *max_low)
			*max_low = limit_pfn;
	}
}
sanity_check_meminfo 函数的实现:
/*
 * Abridged excerpt of sanity_check_meminfo(): only the part that sets
 * bank->highmem is shown; elided code is marked with comments.
 *
 * Each bank i is copied into slot j, then flagged as highmem or not.
 * In the visible code `highmem` is only ever set to 1, never cleared, so
 * once one bank is flagged every later bank is flagged as well.
 */
void __init sanity_check_meminfo(void)
{
	int i, j, highmem = 0;

	for (i = 0, j = 0; i < meminfo.nr_banks; i++) {
		struct membank *bank = &meminfo.bank[j];
		*bank = meminfo.bank[i];

#ifdef CONFIG_HIGHMEM
		/*
		 * For reference:
		 * static void * __initdata vmalloc_min = (void *)(VMALLOC_END - SZ_128M);
		 *
		 * A bank whose start maps at or above vmalloc_min, or below
		 * PAGE_OFFSET, is treated as high memory.
		 */
		if (__va(bank->start) >= vmalloc_min ||
		    __va(bank->start) < (void *)PAGE_OFFSET)
			highmem = 1;

		bank->highmem = highmem;

		/* ... rest of the CONFIG_HIGHMEM branch elided ... */
#else
		bank->highmem = highmem;

		/* ... rest of the !CONFIG_HIGHMEM branch elided ... */
#endif
	}

	/* ... remainder of the function elided ... */
}
回到函数 get_vm_area_caller 的实现。
insert_vmalloc_vm 函数的实现:
/*
 * Initialize @vm from @va and then link @vm into the global vmlist.
 *
 * @vm:     vm_struct allocated by __get_vm_area_node()
 * @va:     vmap_area obtained there through alloc_vmap_area()
 * @flags:  VM_* flags to record in @vm
 * @caller: tag to record in @vm
 */
static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
			      unsigned long flags, void *caller)
{
	/* Fill in the descriptor first ... */
	setup_vmalloc_vm(vm, va, flags, caller);

	/* ... then make it visible on the list. */
	insert_vmalloc_vmlist(vm);
}
setup_vmalloc_vm 函数的实现:
/*
 * Cross-link a vm_struct and its vmap_area.
 *
 * Copies the address range of @va into @vm (start address and length),
 * records @flags and @caller in @vm, points @va back at @vm and marks @va
 * with VM_VM_AREA.
 */
static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
			     unsigned long flags, void *caller)
{
	/* Describe the range in the vm_struct. */
	vm->addr = (void *)va->va_start;
	vm->size = va->va_end - va->va_start;
	vm->flags = flags;
	vm->caller = caller;

	/* Back-reference from the vmap_area. */
	va->vm = vm;
	va->flags |= VM_VM_AREA;
}
insert_vmalloc_vmlist 函数的实现:
static void insert_vmalloc_vmlist(struct vm_struct *vm) { struct vm_struct *tmp, **p; vm->flags &= ~VM_UNLIST; write_lock(&vmlist_lock); // 将 vm_struct 结构体插入的 vmlist 中 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { if (tmp->addr >= vm->addr) break; } vm->next = *p; *p = vm; write_unlock(&vmlist_lock); }
ioremap_page_range 函数的实现:
int ioremap_page_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { pgd_t *pgd; unsigned long start; unsigned long next; int err; BUG_ON(addr >= end); start = addr; phys_addr -= addr; pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, prot); if (err) break; } while (pgd++, addr = next, addr != end); flush_cache_vmap(start, end); return err; } static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { pud_t *pud; unsigned long next; phys_addr -= addr; pud = pud_alloc(&init_mm, pgd, addr); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot)) return -ENOMEM; } while (pud++, addr = next, addr != end); return 0; } static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { pmd_t *pmd; unsigned long next; phys_addr -= addr; pmd = pmd_alloc(&init_mm, pud, addr); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot)) return -ENOMEM; } while (pmd++, addr = next, addr != end); return 0; } static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { pte_t *pte; u64 pfn; pfn = phys_addr >> PAGE_SHIFT; pte = pte_alloc_kernel(pmd, addr); if (!pte) return -ENOMEM; do { BUG_ON(!pte_none(*pte)); set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); return 0; }
上面几个函数的功能,是建立 linux 4级页表。
linux 4级页表参考:
http://larmbr.me/2014/01/19/the-evolution-of-4-level-page-talbe-in-linux/
总结一下。
ioremap中首先做了一些检查,其中一项检查是要处理的物理地址是不是 RAM ,因为 ioremap 只处理 soc 的 io memory ,不处理 RAM 。
分配一个 vm_struct 结构体。
之后分配一个 vmap_area 结构体,并查找红黑树 vmap_area_root 找到合适的 hole 。
然后初始化 vm_struct 结构体和 vmap_area 结构体的一些成员。
最后建立 linux 的4级内存页表。
4级即: PGD -> PUD -> PMD -> PTE