While developing a heterogeneous-compute driver recently, I ran into many problems and open questions, so I decided to study Linux's memory allocation mechanisms.
Kernel: Linux 4.9
Memory allocations such as DMA generally try to go through the dev->archdata.dma_ops path, so this section first traces where those ops come from.
Taking ARM as the example:
of_platform_bus_create
  |--- of_platform_device_create_pdata    // create the device from its device node
    |--- of_dma_configure                 // defaults: dma_base = 0, size = 4G
      |--- of_iommu_configure             // look up iommu_ops from the "iommus" property
      |--- arch_setup_dma_ops
        |--- arm_setup_iommu_dma_ops      // returns true when an IOMMU is present
          |--- arm_iommu_create_mapping   // create a mapping
            |--- iommu_domain_alloc
          |--- __arm_iommu_attach_device
            |--- iommu_attach_device
            |--- dev->archdata.mapping    // initialized here
        true:  arm_get_iommu_dma_map_ops  // IOMMU present: get the IOMMU-backed ops (dma-mapping.c)
        false: arm_get_dma_map_ops        // no IOMMU: returns arm_dma_ops (dma-mapping.c)
        |--- set_dma_ops                  // sets dev->archdata.dma_ops = ops
From the initialization flow above:
- mapping: the mapping bookkeeping; handles IOVA allocation and use.
- domain: analogous to an MMU page table, managing the mappings for the whole address space. The Allwinner IOMMU currently supports only one domain, and correspondingly only one mapping.

So the main thing initialization does is set up each device's archdata.
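Condensed from arch/arm/mm/dma-mapping.c in v4.9 (a sketch, lightly abridged), arch_setup_dma_ops is what ties the branches above together:

/* Condensed from arch/arm/mm/dma-mapping.c (v4.9); error paths trimmed. */
void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
			const struct iommu_ops *iommu, bool coherent)
{
	struct dma_map_ops *dma_ops;

	dev->archdata.dma_coherent = coherent;

	/* IOMMU present and a mapping attached: use the IOMMU-backed ops */
	if (arm_setup_iommu_dma_ops(dev, dma_base, size, iommu))
		dma_ops = arm_get_iommu_dma_map_ops(coherent);
	else
		dma_ops = arm_get_dma_map_ops(coherent);

	set_dma_ops(dev, dma_ops);	/* dev->archdata.dma_ops = dma_ops */
}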
Code location: drivers/of/of_reserved_mem.c
Maximum number of reserved regions:
#define MAX_RESERVED_REGIONS 16
static struct reserved_mem reserved_mem[MAX_RESERVED_REGIONS];
static int reserved_mem_count;
In other words, the reserved-memory node can hold at most 16 reserved regions; to reserve more, this macro must be enlarged.
Function: fdt_init_reserved_mem
ARM32 (ARM64 is analogous):
start_kernel
  setup_arch
    arm_memblock_init / arm64_memblock_init
      early_init_fdt_scan_reserved_mem
        early_init_dt_reserve_memory_arch   (reserve the /memreserve/ entries found in initial_boot_params)
        of_scan_flat_dt                     (scan the flattened device tree)
          __fdt_scan_reserved_mem
            if node != "reserved-memory", return immediately
            __reserved_mem_reserve_reg
              early_init_dt_reserve_memory_arch   (reserve each child node's region)
        fdt_init_reserved_mem
Location: drivers/of/fdt.c
Pseudocode:
static int __init __reserved_mem_reserve_reg(unsigned long node,
		const char *uname)
{
	int t_len = (dt_root_addr_cells + dt_root_size_cells) * sizeof(__be32);
	phys_addr_t base, size;
	int len;
	const __be32 *prop;
	int nomap, first = 1;

	prop = of_get_flat_dt_prop(node, "reg", &len);
	if (!prop)
		return -ENOENT;
	if (len && len % t_len != 0)
		return -EINVAL;

	nomap = of_get_flat_dt_prop(node, "no-map", NULL) != NULL;

	while (len >= t_len) {
		/* pull base and size out of the reg property */
		base = dt_mem_next_cell(dt_root_addr_cells, &prop);
		size = dt_mem_next_cell(dt_root_size_cells, &prop);

		/* reserve the region via early_init_dt_reserve_memory_arch */
		if (size &&
		    early_init_dt_reserve_memory_arch(base, size, nomap) == 0)
			pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %ld MiB\n",
				uname, &base, (unsigned long)size / SZ_1M);
		else
			pr_info("Reserved memory: failed to reserve memory for node '%s': base %pa, size %ld MiB\n",
				uname, &base, (unsigned long)size / SZ_1M);

		len -= t_len;
		if (first) {
			/* record the first region so it can be initialized later */
			fdt_reserved_mem_save_node(node, uname, base, size);
			first = 0;
		}
	}
	return 0;
}
int __init __weak early_init_dt_reserve_memory_arch(phys_addr_t base,
	phys_addr_t size, bool nomap)
{
	/* the no-map property makes the memory invisible to the Linux system */
	if (nomap)
		return memblock_remove(base, size);
	return memblock_reserve(base, size);
}
void __init fdt_reserved_mem_save_node(unsigned long node, const char *uname,
		phys_addr_t base, phys_addr_t size)
{
	struct reserved_mem *rmem = &reserved_mem[reserved_mem_count];

	/* the array holds at most MAX_RESERVED_REGIONS (16) entries */
	if (reserved_mem_count == ARRAY_SIZE(reserved_mem))
		return;

	rmem->fdt_node = node;
	rmem->name = uname;
	rmem->base = base;
	rmem->size = size;

	reserved_mem_count++;
}
Pseudocode:
void __init fdt_init_reserved_mem(void)
{
int i;
/* check for overlapping reserved regions */
__rmem_check_for_overlap();
for (i = 0; i < reserved_mem_count; i++) {
struct reserved_mem *rmem = &reserved_mem[i];
unsigned long node = rmem->fdt_node;
int len;
const __be32 *prop;
int err = 0;
prop = of_get_flat_dt_prop(node, "phandle", &len);
if (!prop)
prop = of_get_flat_dt_prop(node, "linux,phandle", &len);
if (prop)
rmem->phandle = of_read_number(prop, len/4);
if (rmem->size == 0)
err = __reserved_mem_alloc_size(node, rmem->name,
&rmem->base, &rmem->size);
/* initialize this region */
if (err == 0)
__reserved_mem_init_node(rmem);
}
}
/*
 * Calls the initfn registered for the matching name (provided the node
 * sets compatible = "xxxxxx").
 * Handlers are registered with: RESERVEDMEM_OF_DECLARE
 * In 4.9 the following declarations exist:
 * ion_of.c:249:RESERVEDMEM_OF_DECLARE(ion, "ion-region", rmem_ion_setup);
 * bman_ccsr.c:133:RESERVEDMEM_OF_DECLARE(bman_fbpr, "fsl,bman-fbpr", bman_fbpr);
 * qman_ccsr.c:419:RESERVEDMEM_OF_DECLARE(qman_fqd, "fsl,qman-fqd", qman_fqd);
 * qman_ccsr.c:430:RESERVEDMEM_OF_DECLARE(qman_pfdr, "fsl,qman-pfdr", qman_pfdr);
 * dma-contiguous.c:279:RESERVEDMEM_OF_DECLARE(cma, "shared-dma-pool", rmem_cma_setup);
 * dma-coherent.c:337:RESERVEDMEM_OF_DECLARE(dma, "shared-dma-pool", rmem_dma_setup);
 *
 * The dma-coherent.c one is what we end up calling.
 */
static int __init __reserved_mem_init_node(struct reserved_mem *rmem)
{
extern const struct of_device_id __reservedmem_of_table[];
const struct of_device_id *i;
/* the linker script places __rmem_of_table_sentinel right after __reservedmem_of_table */
for (i = __reservedmem_of_table; i < &__rmem_of_table_sentinel; i++) {
reservedmem_of_init_fn initfn = i->data;
const char *compat = i->compatible;
if (!of_flat_dt_is_compatible(rmem->fdt_node, compat))
continue;
if (initfn(rmem) == 0) {
pr_info("initialized node %s, compatible id %s\n",
rmem->name, compat);
return 0;
}
}
return -ENOENT;
}
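For reference, registering one's own handler takes a setup callback plus one macro invocation. A minimal sketch; the compatible string "vendor,my-mem" and the function my_mem_setup are illustrative, not from the kernel:

static int __init my_mem_setup(struct reserved_mem *rmem)
{
	/* called from fdt_init_reserved_mem() for every matching node */
	pr_info("my-mem region at %pa, size %pa\n", &rmem->base, &rmem->size);
	return 0;	/* 0 = claimed; non-zero lets other handlers try */
}
RESERVEDMEM_OF_DECLARE(my_mem, "vendor,my-mem", my_mem_setup);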
// __reserved_mem_init_node -> dma-coherent.c:rmem_dma_setup(rmem)
static int __init rmem_dma_setup(struct reserved_mem *rmem)
{
unsigned long node = rmem->fdt_node;
if (of_get_flat_dt_prop(node, "reusable", NULL))
return -EINVAL;
#ifdef CONFIG_ARM
if (!of_get_flat_dt_prop(node, "no-map", NULL)) {
pr_err("Reserved memory: regions without no-map are not yet supported\n");
return -EINVAL;
}
#endif
rmem->ops = &rmem_dma_ops;
pr_info("Reserved memory: created DMA memory pool at %pa, size %ld MiB\n",
&rmem->base, (unsigned long)rmem->size / SZ_1M);
return 0;
}
Reserving the idx-th memory-region node of the device tree for a device:
of_reserved_mem_device_init_by_idx(struct device *dev, struct device_node *np, int idx)
  rmem = __find_rmem(target);
  rmem->ops->device_init(rmem, dev);
    rmem_dma_device_init
      dma_init_coherent_memory            (maps the region, sets up the virtual addresses)
        dma_mem->virt_base = memremap(phys_addr, size, MEMREMAP_WC);   // expanded below
        dma_mem->device_base = phys_addr;
        dma_mem->pfn_base = PFN_DOWN(phys_addr);
      dma_assign_coherent_memory
        dev->dma_mem = mem;

memremap(phys_addr, size, MEMREMAP_WC) expands to:
  |--- ioremap_wc(offset, size);
    |--- __arm_ioremap_caller(res_cookie, size, MT_DEVICE_WC, __builtin_return_address(0))
      |--- __arm_ioremap_pfn_caller(pfn, offset, size, MT_DEVICE_WC, NULL)
__arm_ioremap_pfn_caller performs the actual mapping; pseudocode below:
static void __iomem * __arm_ioremap_pfn_caller(unsigned long pfn,
unsigned long offset, size_t size, unsigned int mtype, void *caller)
{
const struct mem_type *type;
int err;
unsigned long addr;
struct vm_struct *area;
phys_addr_t paddr = __pfn_to_phys(pfn);
type = get_mem_type(mtype);
if (!type)
return NULL;
/*
* Page align the mapping size, taking account of any offset.
*/
size = PAGE_ALIGN(offset + size);
/*
 * Try to reuse one of the static mappings where possible (details omitted).
 */
if (size && !(sizeof(phys_addr_t) == 4 && pfn >= 0x100000)) {
struct static_vm *svm;
svm = find_static_vm_paddr(paddr, size, mtype);
if (svm) {
addr = (unsigned long)svm->vm.addr;
addr += paddr - svm->vm.phys_addr;
return (void __iomem *) (offset + addr);
}
}
/*
 * Don't allow RAM to be mapped with mismatched attributes.
 * pfn_valid() checks whether the memblock.memory region that owns this
 * address was flagged MEMBLOCK_NOMAP.
 */
if (WARN_ON(pfn_valid(pfn) && mtype != MT_MEMORY_RW))
return NULL;
/*
 * Grab a free chunk of kernel virtual address space (the vmalloc area).
 */
area = get_vm_area_caller(size, VM_IOREMAP, caller);
if (!area)
return NULL;
addr = (unsigned long)area->addr;
area->phys_addr = paddr;
#if !defined(CONFIG_SMP) && !defined(CONFIG_ARM_LPAE)
if (DOMAIN_IO == 0 &&
(((cpu_architecture() >= CPU_ARCH_ARMv6) && (get_cr() & CR_XP)) ||
cpu_is_xsc3()) && pfn >= 0x100000 &&
!((paddr | size | addr) & ~SUPERSECTION_MASK)) {
area->flags |= VM_ARM_SECTION_MAPPING;
err = remap_area_supersections(addr, pfn, size, type);
} else if (!((paddr | size | addr) & ~PMD_MASK)) {
area->flags |= VM_ARM_SECTION_MAPPING;
err = remap_area_sections(addr, pfn, size, type);
} else
#endif
/*
 * Map the virtual range onto the physical range.
 */
err = ioremap_page_range(addr, addr + size, paddr,
__pgprot(type->prot_pte));
if (err) {
vunmap((void *)addr);
return NULL;
}
flush_cache_vmap(addr, addr + size);
return (void __iomem *) (offset + addr);
}
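That completes the reservation path. A driver then typically claims its region in probe; a hedged sketch, where my_probe is an illustrative name:

/* Sketch: attach the device to the first phandle in its memory-region
 * property; afterwards dma_alloc_coherent() allocates from that region. */
static int my_probe(struct platform_device *pdev)
{
	int ret;

	ret = of_reserved_mem_device_init_by_idx(&pdev->dev,
						 pdev->dev.of_node, 0);
	if (ret)
		dev_warn(&pdev->dev, "no reserved region, using the default pool\n");
	return 0;
}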
I was curious about the call path of dma_alloc_coherent, so it gets its own analysis here.
Call chain:
dma_alloc_coherent(dev, size, dma_handle, flag)
  |--- dma_alloc_attrs(dev, size, dma_handle, flag, 0)
    |--- dma_alloc_from_coherent(dev, size, dma_handle, &cpu_addr)  -> path 1: the per-device route; returns immediately on success
    |--- ops = get_dma_ops(dev)   // fetch dev->archdata.dma_ops, assigned at device creation depending on IOMMU presence
      |--- if (dev && dev->archdata.dma_ops) return dev->archdata.dma_ops;  // ops assigned during setup (IOMMU ops when an IOMMU is present)
           else return &arm_dma_ops;                                        // fallback: arm_dma_ops
    |--- cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
      |--- arm_dma_alloc(dev, size, dma_handle, flag, attrs);
        |--- __dma_alloc(dev, size, handle, gfp, prot, false, attrs, NULL);
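For reference, dma_alloc_attrs itself looks like this (condensed from include/linux/dma-mapping.h in v4.9; the BUG_ON, arch checks, and debug hooks are trimmed):

static inline void *dma_alloc_attrs(struct device *dev, size_t size,
				    dma_addr_t *dma_handle, gfp_t flag,
				    unsigned long attrs)
{
	struct dma_map_ops *ops = get_dma_ops(dev);
	void *cpu_addr;

	/* path 1: the per-device coherent pool (dev->dma_mem) */
	if (dma_alloc_from_coherent(dev, size, dma_handle, &cpu_addr))
		return cpu_addr;

	if (!ops->alloc)
		return NULL;

	/* path 2: the arch dma_map_ops (arm_dma_alloc / IOMMU variant) */
	cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
	return cpu_addr;
}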
Function: dma_alloc_from_coherent
Pseudocode:
int dma_alloc_from_coherent(struct device *dev, ssize_t size,
dma_addr_t *dma_handle, void **ret)
{
struct dma_coherent_mem *mem;
int order = get_order(size);
unsigned long flags;
int pageno;
int dma_memory_map;
if (!dev)
return 0;
mem = dev->dma_mem; // the per-device coherent area set up from the device-tree reservation
if (!mem)
return 0;
*ret = NULL;
spin_lock_irqsave(&mem->spinlock, flags);
if (unlikely(size > (mem->size << PAGE_SHIFT)))
goto err;
// find a free region in the allocation bitmap
pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
if (unlikely(pageno < 0))
goto err;
/*
 * Compute the bus address and the virtual address; the virtual range was
 * already mapped out of vmalloc space earlier by memremap()
 * (__arm_ioremap_pfn_caller).
 */
*dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
*ret = mem->virt_base + (pageno << PAGE_SHIFT);
dma_memory_map = (mem->flags & DMA_MEMORY_MAP);
spin_unlock_irqrestore(&mem->spinlock, flags);
if (dma_memory_map)
memset(*ret, 0, size);
else
memset_io(*ret, 0, size);
return 1;
err:
spin_unlock_irqrestore(&mem->spinlock, flags);
/*
* In the case where the allocation can not be satisfied from the
* per-device area, try to fall back to generic memory if the
* constraints allow it.
*/
return mem->flags & DMA_MEMORY_EXCLUSIVE;
}
No manual cache maintenance is needed; the hardware guarantees cache coherency. Typically the buffer is marked non-cacheable in the page tables and the CPU reads/writes DRAM directly. This suits buffers that are written rarely and read often; frequently updating data in DRAM this way is inefficient.
dma_alloc_coherent(dev, size, dma, flag)
    Recommended for allocations of PAGE_SIZE or larger; for small buffers, the dma_pool interface is preferred.
dma_free_coherent(dev, size, addr, dma)
dma_pool_create(name, dev, size, align, boundary)
dma_pool_alloc(pool, flags, dma)
dma_pool_free(pool, vaddr, dma)
    Same parameters as above; vaddr is the address returned by dma_pool_alloc.
dma_pool_destroy(pool)
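A minimal usage sketch of the two coherent interfaces; my_dma_example and the pool name "my_descs" are illustrative:

static int my_dma_example(struct device *dev)
{
	struct dma_pool *pool;
	dma_addr_t dma;
	void *buf, *desc;

	/* one-off buffer: recommended for PAGE_SIZE or larger */
	buf = dma_alloc_coherent(dev, SZ_4K, &dma, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	/* ... program `dma` into the device; the CPU accesses `buf` uncached ... */
	dma_free_coherent(dev, SZ_4K, buf, dma);

	/* many small fixed-size buffers: use a dma_pool instead */
	pool = dma_pool_create("my_descs", dev, 64, 64, 0);
	if (!pool)
		return -ENOMEM;
	desc = dma_pool_alloc(pool, GFP_KERNEL, &dma);
	if (desc)
		dma_pool_free(pool, desc, dma);
	dma_pool_destroy(pool);
	return 0;
}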
Streaming DMA usually requires cache maintenance.
The streaming mapping functions may be used in interrupt context.
The usual pattern: map before the transfer, unmap after it completes. The map/unmap code performs the cache maintenance, so calling the map/unmap interfaces is normally enough; there is no need to flush the cache by hand.
Streaming DMA supports two kinds of mappings: mapping a single memory region, and mapping a scatterlist.

Direction semantics: for DMA_TO_DEVICE (the DMA engine reads from memory into the device) a cache clean is required, flushing dirty cache lines back to DRAM; for DMA_FROM_DEVICE (the DMA engine writes from the device into memory) a cache invalidate is required, discarding the stale cache lines. The same rules apply to the sync interfaces (dma_sync_*).

Interfaces for mapping a single memory region (check the returned handle with dma_mapping_error()):
dma_map_single(dev, addr, size, direction)
dma_unmap_single(dev, dma, size, direction)
dma_map_page(dev, page, offset, size, direction)   // same parameters as dma_map_single, but page-based
dma_unmap_page(dev, dma, size, direction)

Interfaces for mapping a scatterlist:
dma_map_sg(dev, sglist, nents, direction)
dma_unmap_sg(dev, sglist, nents, direction)
To get the device-visible addresses after mapping, use sg_dma_address() and sg_dma_len() on each sg entry:
int i, count = dma_map_sg(dev, sglist, nents, direction);
struct scatterlist *sg;
for_each_sg(sglist, sg, count, i) {
hw_address[i] = sg_dma_address(sg);
hw_len[i] = sg_dma_len(sg);
}
Scatterlists allow several separate regions to be merged and mapped as one unit.
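For comparison, a minimal single-buffer sketch; my_tx, buf, and len are illustrative names:

static int my_tx(struct device *dev, void *buf, size_t len)
{
	dma_addr_t dma;

	/* DMA_TO_DEVICE: cleans the cache, then hands the buffer to the device */
	dma = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
	if (dma_mapping_error(dev, dma))
		return -ENOMEM;

	/* ... start the transfer and wait for it to complete ... */

	dma_unmap_single(dev, dma, len, DMA_TO_DEVICE);
	return 0;
}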
dma-buf exists to solve buffer sharing across drivers; in essence, it is a buffer combined with a file.
This part mainly follows the article at https://blog.csdn.net/hexiaolong2009/article/details/102596772.
The usual roles:
- exporter: provides and allocates the buffer
- importer: a kernel-space user of the buffer
- user: a user-space user of the buffer
- dma_buf_ops: the callback table (map_dma_buf/unmap_dma_buf, map, mmap, release, etc.)

DEFINE_DMA_BUF_EXPORT_INFO defines the export info:
DEFINE_DMA_BUF_EXPORT_INFO(info);
info.ops = &ops;
info.size = xxx;
info.flags = O_CLOEXEC;
info.priv = xxx;

dma_buf_export creates the dma_buf object from that info:
struct dma_buf *dmabuf = dma_buf_export(&info);
CPU access APIs, mainly for mapping and accessing the memory from inside the kernel:
- dma_buf_kmap / dma_buf_kunmap: map one page at a time; may sleep
- dma_buf_kmap_atomic / dma_buf_kunmap_atomic: map one page at a time; will not sleep
- dma_buf_vmap / dma_buf_vunmap: map multiple pages at once
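A minimal CPU-access sketch; peek_dmabuf is an illustrative name, and in 4.9 dma_buf_kmap takes a page index:

static void peek_dmabuf(struct dma_buf *dmabuf)
{
	void *va;

	/* bracket CPU access so the exporter can sync caches */
	dma_buf_begin_cpu_access(dmabuf, DMA_FROM_DEVICE);
	va = dma_buf_kmap(dmabuf, 0);	/* page index 0 */
	if (va) {
		pr_info("first byte: %02x\n", *(u8 *)va);
		dma_buf_kunmap(dmabuf, 0, va);
	}
	dma_buf_end_cpu_access(dmabuf, DMA_FROM_DEVICE);
}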
DMA access APIs, for giving a DMA engine access to the physical memory.
Attach APIs:
- dma_buf_attach: records the connection between the dma-buf and a device; the connection is stored in a newly created dma_buf_attachment object. This calls the attach callback of dma_buf_ops.
- dma_buf_map_attachment: mainly builds the sg_table and synchronizes the cache.
dma_buf_attach must be called before dma_buf_map_attachment.
The corresponding reverse operations:
- dma_buf_unmap_attachment
- dma_buf_detach
Exporter driver demo:
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/scatterlist.h>
#include <linux/dma-mapping.h>
#include <linux/dma-buf.h>
struct dma_buf *dmabuf_exported;
EXPORT_SYMBOL(dmabuf_exported);
static int exporter_attach(struct dma_buf *dmabuf, struct device *dev,
struct dma_buf_attachment *attachment)
{
pr_info("dmabuf attach device: %s\n", dev_name(dev));
return 0;
}
static void exporter_detach(struct dma_buf *dmabuf, struct dma_buf_attachment *attachment)
{
pr_info("dmabuf detach device: %s\n", dev_name(attachment->dev));
}
static struct sg_table *exporter_map_dma_buf(struct dma_buf_attachment *attachment,
					enum dma_data_direction dir)
{
	void *vaddr = attachment->dmabuf->priv;
	struct sg_table *table;

	/* build a single-entry sg_table covering the one-page buffer */
	table = kmalloc(sizeof(*table), GFP_KERNEL);
	sg_alloc_table(table, 1, GFP_KERNEL);

	/* NULL dev keeps the demo short; real code maps with attachment->dev */
	sg_dma_len(table->sgl) = PAGE_SIZE;
	sg_dma_address(table->sgl) = dma_map_single(NULL, vaddr, PAGE_SIZE, dir);

	return table;
}
static void exporter_unmap_dma_buf(struct dma_buf_attachment *attachment,
struct sg_table *table,
enum dma_data_direction dir)
{
dma_unmap_single(NULL, sg_dma_address(table->sgl), PAGE_SIZE, dir);
sg_free_table(table);
kfree(table);
}
...
static const struct dma_buf_ops exp_dmabuf_ops = {
.attach = exporter_attach,
.detach = exporter_detach,
.map_dma_buf = exporter_map_dma_buf,
.unmap_dma_buf = exporter_unmap_dma_buf,
...
};
static struct dma_buf *exporter_alloc_page(void)
{
DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
struct dma_buf *dmabuf;
void *vaddr;
vaddr = kzalloc(PAGE_SIZE, GFP_KERNEL);
exp_info.ops = &exp_dmabuf_ops;
exp_info.size = PAGE_SIZE;
exp_info.flags = O_CLOEXEC;
exp_info.priv = vaddr;
dmabuf = dma_buf_export(&exp_info);
sprintf(vaddr, "hello world!");
return dmabuf;
}
static int __init exporter_init(void)
{
dmabuf_exported = exporter_alloc_page();
return 0;
}
module_init(exporter_init);
Importer demo (kernel side):
struct dma_buf_attachment *attachment;
struct sg_table *table;
struct device *dev;
unsigned int reg_addr, reg_size;
dev = kzalloc(sizeof(*dev), GFP_KERNEL);
dev_set_name(dev, "importer");
attachment = dma_buf_attach(dmabuf, dev);
table = dma_buf_map_attachment(attachment, DMA_BIDIRECTIONAL);
reg_addr = sg_dma_address(table->sgl);
reg_size = sg_dma_len(table->sgl);
pr_info("reg_addr = 0x%08x, reg_size = 0x%08x\n", reg_addr, reg_size);
dma_buf_unmap_attachment(attachment, table, DMA_BIDIRECTIONAL);
dma_buf_detach(dmabuf, attachment);
A user-space mmap() on the dma-buf fd ends up in the corresponding exp_dmabuf_ops->mmap.
In kernel space, dma_buf_mmap() can be used to reuse the dma-buf's mmap implementation directly, which indirectly implements the device driver's mmap file operation.
Driver demo:
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/miscdevice.h>
#include <linux/dma-buf.h>
#include <linux/uaccess.h>
#include <linux/mm.h>
struct dma_buf *dmabuf_exported;
EXPORT_SYMBOL(dmabuf_exported);
static int exporter_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
{
void *vaddr = dmabuf->priv;
return remap_pfn_range(vma, vma->vm_start, virt_to_pfn(vaddr),
PAGE_SIZE, vma->vm_page_prot);
}
...
static const struct dma_buf_ops exp_dmabuf_ops = {
...
.mmap = exporter_mmap,
};
...
static long exporter_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	/* hand the dma-buf to user space as a file descriptor */
	int fd = dma_buf_fd(dmabuf_exported, O_CLOEXEC);

	if (copy_to_user((int __user *)arg, &fd, sizeof(fd)))
		return -EFAULT;
	return 0;
}
static struct file_operations exporter_fops = {
.owner = THIS_MODULE,
.unlocked_ioctl = exporter_ioctl,
};
static struct miscdevice mdev = {
.minor = MISC_DYNAMIC_MINOR,
.name = "exporter",
.fops = &exporter_fops,
};
static int __init exporter_init(void)
{
...
misc_register(&mdev);
...
}
static void __exit exporter_exit(void)
{
...
misc_deregister(&mdev);
...
}
module_init(exporter_init);
module_exit(exporter_exit);
User-space program:
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

int main(int argc, char *argv[])
{
	int fd;
	int dmabuf_fd = 0;

	/* fetch the dma-buf fd from the exporter via ioctl */
	fd = open("/dev/exporter", O_RDONLY);
	ioctl(fd, 0, &dmabuf_fd);
	close(fd);

	/* mmap the dma-buf fd itself */
	char *str = mmap(NULL, 4096, PROT_READ, MAP_SHARED, dmabuf_fd, 0);
	printf("read from dmabuf mmap: %s\n", str);
	return 0;
}
Below, the driver layer instead uses the dma_buf_mmap() kernel API to simplify the implementation of the device driver's mmap file operation.
Relative to the driver demo above, this adds an mmap to the misc device's file_operations and removes the ioctl:
...
static int exporter_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
{
void *vaddr = dmabuf->priv;
return remap_pfn_range(vma, vma->vm_start, virt_to_pfn(vaddr),
PAGE_SIZE, vma->vm_page_prot);
}
...
static const struct dma_buf_ops exp_dmabuf_ops = {
...
.mmap = exporter_mmap, /* kept: dma_buf_mmap() forwards to this callback */
};
...
static int exporter_misc_mmap(struct file *file, struct vm_area_struct *vma)
{
return dma_buf_mmap(dmabuf_exported, vma, 0);
}
static struct file_operations exporter_fops = {
.owner = THIS_MODULE,
.mmap = exporter_misc_mmap, // 新增
};
...
User-space demo:
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

int main(int argc, char *argv[])
{
	int fd;

	fd = open("/dev/exporter", O_RDONLY);

	/* mmap the misc device itself; the driver forwards to dma_buf_mmap() */
	char *str = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
	printf("read from /dev/exporter mmap: %s\n", str);
	close(fd);
	return 0;
}
At bottom, a dma-buf is a buffer combined with a file, and that file is already open: from the moment dma_buf_export() is called, the file exists in the opened state.
It is also an anonymous file, so an application cannot obtain its fd via fd = open("name"); it has to rely on the exporter driver's ioctl interface, which produces the fd through dma_buf_fd().
Kernel APIs convert between a dma-buf and an fd:
fd = dma_buf_fd(dmabuf);
dmabuf = dma_buf_get(fd);
get/put:
Every file carries an internal reference count (f_count). When dma_buf_export() creates a dma-buf, the count is initialized to 1; when it drops to 0, the release callback of dma_buf_ops is triggered automatically and the dma-buf object is freed.
The usual kernel helpers for manipulating a file's refcount are fget() and fput(); dma-buf wraps them as follows:
Function | Effect
---|---
get_dma_buf | increments the refcount by 1
dma_buf_get | increments the refcount by 1 and converts an fd into a dma_buf pointer
dma_buf_put | decrements the refcount by 1
dma_buf_fd | leaves the refcount unchanged; only creates an fd
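A hedged sketch pulling these together, showing an importer recovering the dma_buf from an fd it received (importer_get_put is an illustrative name):

static void importer_get_put(int fd)
{
	struct dma_buf *dmabuf;

	dmabuf = dma_buf_get(fd);	/* fd -> dma_buf, takes a reference */
	if (IS_ERR(dmabuf))
		return;
	/* ... dma_buf_attach() / dma_buf_map_attachment() as shown above ... */
	dma_buf_put(dmabuf);		/* drop the reference taken by dma_buf_get() */
}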
Why the fd is needed:
- it lets user space access the buffer (via mmap);
- it lets the buffer flow between driver modules without any copying.