前面已经对DPDK中内存相关的各个模块分别进行了分析,这次开始对整体流程进行分析说明。重点是分析内存从开始准备到最终应用的过程,把前面分别讲解的各个模块贯穿起来,从而能够更好地了解和认识DPDK中内存的使用。
DPDK中,启动时对内存的处理如下:
1、大页内存的处理
这个在前面分析过,分两种情况:一种是直接在Linux内核启动配置中处理;另外一种是使用命令配置,临时使用就用mount挂载(例如 mount -t hugetlbfs nodev /mnt/huge),若要一直生效则在/etc/fstab中添加nodev挂载项来设置挂载点。当然这有一个前提,就是在内核中已经预留了大页内存。
2、使用mmap进行多进程内存共享。也即DPDK的主进程通过mmap进行初始化并mmap巨页内存,然后再通过配置文件共享给其它进程。
看一下相关的配置文件:
/**
* The global RTE configuration structure.
*
* Declared with __attribute__((__packed__)), so the members below are laid
* out back to back without padding; do not reorder or resize them casually.
*/
struct rte_config {
uint32_t master_lcore; /**< Id of the master lcore */
uint32_t lcore_count; /**< Number of available logical cores. */
uint32_t numa_node_count; /**< Number of detected NUMA nodes. */
uint32_t numa_nodes[RTE_MAX_NUMA_NODES]; /**< List of detected NUMA nodes. */
uint32_t service_lcore_count;/**< Number of available service cores. */
enum rte_lcore_role_t lcore_role[RTE_MAX_LCORE]; /**< State of cores. */
/** Primary or secondary configuration */
enum rte_proc_type_t process_type;
/** PA or VA mapping mode */
enum rte_iova_mode iova_mode;
/**
* Pointer to memory configuration, which may be shared across multiple
* DPDK instances
*/
struct rte_mem_config *mem_config;
} __attribute__((__packed__));
/*
* internal configuration structure for the number, size and
* mount points of hugepages
*
* One entry describes a single hugepage size.
*/
struct hugepage_info {
uint64_t hugepage_sz; /**< size of a huge page */
char hugedir[PATH_MAX]; /**< dir where hugetlbfs is mounted */
uint32_t num_pages[RTE_MAX_NUMA_NODES];
/**< number of hugepages of that size on each socket */
int lock_descriptor; /**< file descriptor for hugepage dir */
};
/**
* Structure used to store information about hugepages that we mapped
* through the files in hugetlbfs.
*
* One instance describes one mapped hugepage and its backing file.
*/
struct hugepage_file {
void *orig_va; /**< virtual addr of first mmap() */
void *final_va; /**< virtual addr of 2nd mmap() */
uint64_t physaddr; /**< physical addr */
size_t size; /**< the page size */
int socket_id; /**< NUMA socket ID */
int file_id; /**< the '%d' in HUGEFILE_FMT */
char filepath[MAX_HUGEPAGE_PATH]; /**< path to backing file on filesystem */
};
/**
* Memory configuration shared across multiple processes.
*
* The structure must be mapped at the same address in every process (see
* mem_cfg_addr below), so its layout is part of the inter-process contract.
*/
struct rte_mem_config {
volatile uint32_t magic; /**< Magic number - sanity check. */
uint32_t version;
/**< Prevent secondary processes using different DPDK versions. */
/* memory topology */
uint32_t nchannel; /**< Number of channels (0 if unknown). */
uint32_t nrank; /**< Number of ranks (0 if unknown). */
/**
* current lock nest order
* - qlock->mlock (ring/hash/lpm)
* - mplock->qlock->mlock (mempool)
* Notice:
* *ALWAYS* obtain qlock first if having to obtain both qlock and mlock
*/
rte_rwlock_t mlock; /**< used by memzones for thread safety. */
rte_rwlock_t qlock; /**< used by tailqs for thread safety. */
rte_rwlock_t mplock; /**< used by mempool library for thread safety. */
rte_spinlock_t tlock; /**< used by timer library for thread safety. */
rte_rwlock_t memory_hotplug_lock;
/**< Indicates whether memory hotplug request is in progress. */
/* memory segments and zones */
struct rte_fbarray memzones; /**< Memzone descriptors. */
struct rte_memseg_list memsegs[RTE_MAX_MEMSEG_LISTS];
/**< List of dynamic arrays holding memsegs */
struct rte_tailq_head tailq_head[RTE_MAX_TAILQ];
/**< Tailqs for objects */
struct malloc_heap malloc_heaps[RTE_MAX_HEAPS];
/**< DPDK malloc heaps */
int next_socket_id; /**< Next socket ID for external malloc heap */
/* rte_mem_config has to be mapped at the exact same address in all
* processes, so we need to store it.
*/
uint64_t mem_cfg_addr; /**< Address of this structure in memory. */
/* Primary and secondary processes cannot run with different legacy or
* single file segments options, so to avoid having to specify these
* options to all processes, store them in shared config and update the
* internal config at init time.
*/
uint32_t legacy_mem; /**< stored legacy mem parameter. */
uint32_t single_file_segments;
/**< stored single file segments parameter. */
uint64_t tsc_hz;
/**< TSC rate */
uint8_t dma_maskbits; /**< Keeps the more restricted dma mask. */
};
上面的四个数据结构,其实就是全局配置文件rte_config到rte_mem_config,然后再有就是巨页内存的配置和相关信息的数据结构。其中的英文注释已经详细的不得了了。
再看一下DPDK对页面的管理数据结构:
/**
* Descriptor of a single memory segment: its IO/physical start address, its
* virtual start address, its length, the size of the underlying pages and
* the NUMA socket it belongs to. The structure is packed (__rte_packed).
*
* NOTE(review): the comment that preceded this struct in the excerpt
* ("Prevent this segment from being freed back to the OS.") appears to
* describe a flag value defined outside this excerpt, presumably one of the
* bits stored in `flags` - confirm against the full header.
*/
struct rte_memseg {
RTE_STD_C11
union {
phys_addr_t phys_addr; /**< deprecated - Start physical address. */
rte_iova_t iova; /**< Start IO address. */
};
RTE_STD_C11
union {
void *addr; /**< Start virtual address. */
uint64_t addr_64; /**< Makes sure addr is always 64 bits */
};
size_t len; /**< Length of the segment. */
uint64_t hugepage_sz; /**< The pagesize of underlying memory */
int32_t socket_id; /**< NUMA socket ID. */
uint32_t nchannel; /**< Number of channels. */
uint32_t nrank; /**< Number of ranks. */
uint32_t flags; /**< Memseg-specific flags */
} __rte_packed;
DPDK将同一SOCKET的大小相同并且地址连续的巨页存储在此结构中,方便管理和优化。
从启动后就正式开始了初始化和相关分配的流程:
1、环境初始化函数:
/* Launch threads, called at application init(). */
/*
 * NOTE: this is an abridged excerpt. Runs of dots mark code that was
 * omitted, so braces do not balance and the fragment does not compile on its
 * own. Only the memory-related initialization steps are shown, in call
 * order: rte_config_init(), hugepage info init/read, rte_eal_memzone_init(),
 * rte_eal_memory_init(). Every shown failure path returns -1.
 */
int
rte_eal_init(int argc, char **argv)
{
......
if (rte_config_init() < 0) {
rte_eal_init_alert("Cannot init config");
return -1;
......
/* hugepage info is only gathered when hugetlbfs use is not disabled */
if (internal_config.no_hugetlbfs == 0) {
/* rte_config isn't initialized yet */
/* primary discovers hugepage info, other process types read it */
ret = internal_config.process_type == RTE_PROC_PRIMARY ?
eal_hugepage_info_init() :
eal_hugepage_info_read();
if (ret < 0) {
rte_eal_init_alert("Cannot get hugepage information.");
rte_errno = EACCES;
rte_atomic32_clear(&run_once);
return -1;
}
}
.........
/* in secondary processes, memory init may allocate additional fbarrays
* not present in primary processes, so to avoid any potential issues,
* initialize memzones first.
*/
if (rte_eal_memzone_init() < 0) {
rte_eal_init_alert("Cannot init memzone");
rte_errno = ENODEV;
return -1;
}
.........
if (rte_eal_memory_init() < 0) {
rte_eal_init_alert("Cannot init memory");
rte_errno = ENOMEM;
return -1;
}
}
}
此函数里主要包含了四个主要的初始化函数(即上面列出的)。当然这个函数里还包括了不少的参数分析、日志以及其它初始化动作,但分析内存重点还是看内存相关的。
2、全局配置初始化
看相关代码:
/*
 * Fill in the global rte_config and connect it to the shared memory config.
 *
 * The process type is copied from internal_config and then drives the setup:
 *  - RTE_PROC_PRIMARY:   rte_eal_config_create(), then
 *                        eal_mcfg_update_from_internal().
 *  - RTE_PROC_SECONDARY: rte_eal_config_attach(), eal_mcfg_wait_complete(),
 *                        version check, rte_eal_config_reattach(), then
 *                        eal_mcfg_update_internal().
 *  - RTE_PROC_AUTO / RTE_PROC_INVALID: rejected with an error log.
 *
 * Returns 0 on success, -1 on any failure.
 */
static int
rte_config_init(void)
{
	rte_config.process_type = internal_config.process_type;

	if (rte_config.process_type == RTE_PROC_PRIMARY) {
		if (rte_eal_config_create() < 0)
			return -1;
		eal_mcfg_update_from_internal();
		return 0;
	}

	if (rte_config.process_type == RTE_PROC_SECONDARY) {
		if (rte_eal_config_attach() < 0)
			return -1;
		eal_mcfg_wait_complete();
		if (eal_mcfg_check_version() < 0) {
			RTE_LOG(ERR, EAL, "Primary and secondary process DPDK version mismatch\n");
			return -1;
		}
		if (rte_eal_config_reattach() < 0)
			return -1;
		eal_mcfg_update_internal();
		return 0;
	}

	if (rte_config.process_type == RTE_PROC_AUTO ||
			rte_config.process_type == RTE_PROC_INVALID) {
		RTE_LOG(ERR, EAL, "Invalid process type %d\n",
			rte_config.process_type);
		return -1;
	}

	/* any other value is left untouched, as before */
	return 0;
}
rte_eal_config_reattach由从进程执行,而rte_eal_config_create由主进程执行,此处要进行前面提到的mmap映射过程,即把config文件中的配置映射到所在进程的rte_config.mem_config数据结构中。
/* create memory configuration in shared/mmap memory. Take out
* a write lock on the memsegs, so we can auto-detect primary/secondary.
* This means we never close the file while running (auto-close on exit).
* We also don't lock the whole file, so that in future we can use read-locks
* on other parts, e.g. memzones, to detect if there are running secondary
* processes.
*
* Steps, in order: open (or reuse) the runtime config file, resize it to
* sizeof(struct rte_mem_config), take the wr_lock on it with F_SETLK, reserve
* a page-aligned virtual area, map the file over that area with MAP_FIXED,
* copy early_mem_config into it and publish it as rte_config.mem_config.
*
* Returns 0 on success (or immediately when no_shconf is set), -1 on error.
* On the error paths taken after the file was opened, mem_cfg_fd is closed
* and reset to -1. */
static int
rte_eal_config_create(void)
{
size_t page_sz = sysconf(_SC_PAGE_SIZE);
size_t cfg_len = sizeof(*rte_config.mem_config);
size_t cfg_len_aligned = RTE_ALIGN(cfg_len, page_sz);
void *rte_mem_cfg_addr, *mapped_mem_cfg_addr;
int retval;
const char *pathname = eal_runtime_config_path();
/* nothing to create when shared config files are disabled */
if (internal_config.no_shconf)
return 0;
/* map the config before hugepage address so that we don't waste a page */
if (internal_config.base_virtaddr != 0)
rte_mem_cfg_addr = (void *)
RTE_ALIGN_FLOOR(internal_config.base_virtaddr -
sizeof(struct rte_mem_config), page_sz);
else
rte_mem_cfg_addr = NULL;
/* open the backing file only if no descriptor is held yet */
if (mem_cfg_fd < 0){
mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0600);
if (mem_cfg_fd < 0) {
RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n",
pathname);
return -1;
}
}
/* the file is sized to the unaligned structure length */
retval = ftruncate(mem_cfg_fd, cfg_len);
if (retval < 0){
close(mem_cfg_fd);
mem_cfg_fd = -1;
RTE_LOG(ERR, EAL, "Cannot resize '%s' for rte_mem_config\n",
pathname);
return -1;
}
/* non-blocking lock attempt: failure is reported as a possible second primary */
retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock);
if (retval < 0){
close(mem_cfg_fd);
mem_cfg_fd = -1;
RTE_LOG(ERR, EAL, "Cannot create lock on '%s'. Is another primary "
"process running?\n", pathname);
return -1;
}
/* reserve space for config */
rte_mem_cfg_addr = eal_get_virtual_area(rte_mem_cfg_addr,
&cfg_len_aligned, page_sz, 0, 0);
if (rte_mem_cfg_addr == NULL) {
RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config\n");
close(mem_cfg_fd);
mem_cfg_fd = -1;
return -1;
}
/* remap the actual file into the space we've just reserved */
mapped_mem_cfg_addr = mmap(rte_mem_cfg_addr,
cfg_len_aligned, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, mem_cfg_fd, 0);
if (mapped_mem_cfg_addr == MAP_FAILED) {
/* NOTE(review): cfg_len_aligned bytes were reserved above, but only
* cfg_len is passed here - confirm this releases the whole area.
*/
munmap(rte_mem_cfg_addr, cfg_len);
close(mem_cfg_fd);
mem_cfg_fd = -1;
RTE_LOG(ERR, EAL, "Cannot remap memory for rte_config\n");
return -1;
}
/* seed the shared config from the early, process-local copy */
memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
rte_config.mem_config = rte_mem_cfg_addr;
/* store address of the config in the config itself so that secondary
* processes could later map the config into this exact location */
rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr;
rte_config.mem_config->dma_maskbits = 0;
return 0;
}
/*
 * Attach to an existing shared memory config.
 *
 * Opens the runtime config file (unless mem_cfg_fd is already open), maps
 * sizeof(struct rte_mem_config) bytes of it read-only and stores the mapping
 * in rte_config.mem_config.
 *
 * Returns 0 on success, or immediately when no_shconf is set; -1 on error.
 * If the mapping fails, mem_cfg_fd is closed and reset to -1.
 */
static int
rte_eal_config_attach(void)
{
	struct rte_mem_config *mem_config;
	const char *pathname = eal_runtime_config_path();

	if (internal_config.no_shconf)
		return 0;

	if (mem_cfg_fd < 0) {
		mem_cfg_fd = open(pathname, O_RDWR);
		if (mem_cfg_fd < 0) {
			RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n",
				pathname);
			return -1;
		}
	}

	/* map it as read-only first */
	mem_config = (struct rte_mem_config *) mmap(NULL, sizeof(*mem_config),
			PROT_READ, MAP_SHARED, mem_cfg_fd, 0);
	if (mem_config == MAP_FAILED) {
		/* capture errno before close(), which may overwrite it */
		const int map_errno = errno;

		close(mem_cfg_fd);
		mem_cfg_fd = -1;
		RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config! error %i (%s)\n",
			map_errno, strerror(map_errno));
		return -1;
	}

	rte_config.mem_config = mem_config;
	return 0;
}
看明白这个,需要有一些mmap的相关知识。其实一般共享内存会有两个入口,一个是创建并使用,一个是(附加后)使用。如果对此不是很清楚,可以看看相关的知识,并不难。不过需要指出的是,Linux中有两种共享内存方式,一种是System V,另一种是POSIX,而此处使用的是POSIX。
3、巨页内存信息配置初始化
其初始化代码:
/*
 * Gather hugepage information and, unless shared files are disabled, publish
 * a copy of internal_config.hugepage_info through a shared memory file.
 *
 * The published copy has every lock_descriptor set to -1, because file
 * descriptors of this process are meaningless in another process.
 *
 * Returns 0 on success, -1 on failure.
 */
int
eal_hugepage_info_init(void)
{
	const size_t table_sz = sizeof(internal_config.hugepage_info);
	struct hugepage_info *shared_tbl;
	unsigned int idx;

	if (hugepage_info_init() < 0)
		return -1;

	/* for no shared files mode, we're done */
	if (internal_config.no_shconf)
		return 0;

	shared_tbl = create_shared_memory(eal_hugepage_info_path(), table_sz);
	if (shared_tbl == NULL) {
		RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
		return -1;
	}

	memcpy(shared_tbl, &internal_config.hugepage_info[0], table_sz);

	/* the copy carried our file descriptors along; invalidate them */
	for (idx = 0; idx < RTE_DIM(internal_config.hugepage_info); idx++)
		shared_tbl[idx].lock_descriptor = -1;

	if (munmap(shared_tbl, table_sz) < 0) {
		RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
		return -1;
	}

	return 0;
}
/*
 * Scan sys_dir_path for entries whose name starts with "hugepages-" and fill
 * internal_config.hugepage_info[] with one entry per usable page size.
 *
 * For each size:
 *  - if no hugetlbfs mount point is found, the size is skipped, except in
 *    in-memory mode when MAP_HUGE_SHIFT is available;
 *  - otherwise the mount directory is opened and exclusively locked, unused
 *    files in it are cleared, and the page counts are calculated.
 *
 * The resulting table is sorted with compare_hpi (largest size first, per
 * the comment at the qsort call) and internal_config.num_hugepage_sizes is
 * updated.
 *
 * Returns 0 when at least one size has a non-zero page count on any socket.
 * Returns -1 when the directory cannot be opened, when the scan stops early
 * (open/lock/clear failure, or MAX_HUGEPAGE_SIZES reached with entries left),
 * or when no size has any pages.
 */
static int
hugepage_info_init(void)
{
	const char dirent_start_text[] = "hugepages-";
	const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
	unsigned int i, num_sizes = 0;
	DIR *dir;
	struct dirent *dirent;

	dir = opendir(sys_dir_path);
	if (dir == NULL) {
		RTE_LOG(ERR, EAL,
			"Cannot open directory %s to read system hugepage info\n",
			sys_dir_path);
		return -1;
	}

	for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) {
		struct hugepage_info *hpi;

		if (strncmp(dirent->d_name, dirent_start_text,
				dirent_start_len) != 0)
			continue;

		if (num_sizes >= MAX_HUGEPAGE_SIZES)
			break;

		hpi = &internal_config.hugepage_info[num_sizes];
		hpi->hugepage_sz =
			rte_str_to_size(&dirent->d_name[dirent_start_len]);

		/* first, check if we have a mountpoint */
		if (get_hugepage_dir(hpi->hugepage_sz,
				hpi->hugedir, sizeof(hpi->hugedir)) < 0) {
			uint32_t num_pages;

			num_pages = get_num_hugepages(dirent->d_name);
			if (num_pages > 0)
				RTE_LOG(NOTICE, EAL,
					"%" PRIu32 " hugepages of size "
					"%" PRIu64 " reserved, but no mounted "
					"hugetlbfs found for that size\n",
					num_pages, hpi->hugepage_sz);
			/* if we have kernel support for reserving hugepages
			 * through mmap, and we're in in-memory mode, treat this
			 * page size as valid. we cannot be in legacy mode at
			 * this point because we've checked this earlier in the
			 * init process.
			 */
#ifdef MAP_HUGE_SHIFT
			if (internal_config.in_memory) {
				RTE_LOG(DEBUG, EAL, "In-memory mode enabled, "
					"hugepages of size %" PRIu64 " bytes "
					"will be allocated anonymously\n",
					hpi->hugepage_sz);
				calc_num_pages(hpi, dirent);
				num_sizes++;
			}
#endif
			continue;
		}

		/* try to obtain a writelock */
		hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY);

		/* report an open failure as such, not as a lock failure */
		if (hpi->lock_descriptor < 0) {
			RTE_LOG(CRIT, EAL,
				"Failed to open hugepage directory %s!\n",
				hpi->hugedir);
			break;
		}

		/* if blocking lock failed */
		if (flock(hpi->lock_descriptor, LOCK_EX) == -1) {
			RTE_LOG(CRIT, EAL,
				"Failed to lock hugepage directory!\n");
			break;
		}

		/* clear out the hugepages dir from unused pages */
		if (clear_hugedir(hpi->hugedir) == -1)
			break;

		calc_num_pages(hpi, dirent);
		num_sizes++;
	}
	closedir(dir);

	/* something went wrong, and we broke from the for loop above */
	if (dirent != NULL)
		return -1;

	internal_config.num_hugepage_sizes = num_sizes;

	/* sort the page directory entries by size, largest to smallest */
	qsort(&internal_config.hugepage_info[0], num_sizes,
		sizeof(internal_config.hugepage_info[0]), compare_hpi);

	/* now we have all info, check we have at least one valid size */
	for (i = 0; i < num_sizes; i++) {
		/* pages may no longer all be on socket 0, so check all */
		unsigned int j, num_pages = 0;
		struct hugepage_info *hpi = &internal_config.hugepage_info[i];

		for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
			num_pages += hpi->num_pages[j];
		if (num_pages > 0)
			return 0;
	}

	/* no valid hugepage mounts available, return error */
	return -1;
}
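其实上面的代码大概分为几个过程:首先准备巨页相关的数据结构,然后进行巨页的init(hugepage_info_init),在init中通过读取系统目录(sys_dir_path)下以“hugepages-”开头的条目,对整个巨页的信息进行配置并加载。其次,判断是否为无共享文件模式(no_shconf),是则直接返回。再次,根据前面得到的相关信息创建共享内存,并把hugepage_info数组(连同其中的文件描述符)拷贝到共享内存,这没啥可讲的。最后处理从进程中无用的部分,即把拷贝中的文件描述符lock_descriptor置为-1即可。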
4、内存memzone初始化
同样,在完成上述的内存处理动作后就开始了memzone的处理(同样在rte_eal_init函数内):
/* Excerpt from rte_eal_init(): memzone init runs first, memory init second. */
/* in secondary processes, memory init may allocate additional fbarrays
* not present in primary processes, so to avoid any potential issues,
* initialize memzones first.
*/
if (rte_eal_memzone_init() < 0) {
rte_eal_init_alert("Cannot init memzone");
rte_errno = ENODEV;
return -1;
}
/* memory subsystem init; failure is reported to the caller as ENOMEM */
if (rte_eal_memory_init() < 0) {
rte_eal_init_alert("Cannot init memory");
rte_errno = ENOMEM;
return -1;
}
这里把下面的那个初始化也搞进来,这个在后头就不再拷贝了:
/*
 * Initialize the memzone subsystem.
 *
 * With the memzone lock (mcfg->mlock) held for writing, a primary process
 * creates the "memzone" fbarray with RTE_MAX_MEMZONE entries, and a
 * secondary process attaches to the existing one. Other process types do
 * nothing.
 *
 * Returns 0 on success, -1 if the fbarray could not be created or attached.
 */
int
rte_eal_memzone_init(void)
{
	/* get pointer to global configuration */
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int status = 0;

	rte_rwlock_write_lock(&mcfg->mlock);

	switch (rte_eal_process_type()) {
	case RTE_PROC_PRIMARY:
		if (rte_fbarray_init(&mcfg->memzones, "memzone",
				RTE_MAX_MEMZONE, sizeof(struct rte_memzone))) {
			RTE_LOG(ERR, EAL, "Cannot allocate memzone list\n");
			status = -1;
		}
		break;
	case RTE_PROC_SECONDARY:
		if (rte_fbarray_attach(&mcfg->memzones)) {
			RTE_LOG(ERR, EAL, "Cannot attach to memzone list\n");
			status = -1;
		}
		break;
	default:
		break;
	}

	rte_rwlock_write_unlock(&mcfg->mlock);

	return status;
}
/*
 * Create a new fbarray.
 *
 * arr    - descriptor to fill in; must not be NULL (EINVAL otherwise).
 * name   - array name; also used to derive the backing file path.
 * len    - number of elements.
 * elt_sz - size of one element in bytes.
 *
 * A virtual area of calc_data_size(page_sz, elt_sz, len) bytes is reserved
 * first. With no_shconf set it is remapped as private anonymous memory;
 * otherwise a backing file is created, locked and mapped over it via
 * resize_and_map(). On success the area is recorded in mem_area_tailq, the
 * data is zeroed and `arr` is populated.
 *
 * Returns 0 on success, -1 on failure. rte_errno is set on some, but not
 * all, failure paths.
 */
int
rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
unsigned int elt_sz)
{
size_t page_sz, mmap_len;
char path[PATH_MAX];
struct used_mask *msk;
struct mem_area *ma = NULL;
void *data = NULL;
int fd = -1;
if (arr == NULL) {
rte_errno = EINVAL;
return -1;
}
if (fully_validate(name, elt_sz, len))
return -1;
/* allocate mem area before doing anything */
ma = malloc(sizeof(*ma));
if (ma == NULL) {
rte_errno = ENOMEM;
return -1;
}
page_sz = sysconf(_SC_PAGESIZE);
if (page_sz == (size_t)-1) {
free(ma);
return -1;
}
/* calculate our memory limits */
mmap_len = calc_data_size(page_sz, elt_sz, len);
data = eal_get_virtual_area(NULL, &mmap_len, page_sz, 0, 0);
if (data == NULL) {
free(ma);
return -1;
}
/* from here on every exit must release mem_area_lock (see fail label) */
rte_spinlock_lock(&mem_area_lock);
fd = -1;
if (internal_config.no_shconf) {
/* remap virtual area as writable */
void *new_data = mmap(data, mmap_len, PROT_READ | PROT_WRITE,
MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, fd, 0);
if (new_data == MAP_FAILED) {
RTE_LOG(DEBUG, EAL, "%s(): couldn't remap anonymous memory: %s\n",
__func__, strerror(errno));
goto fail;
}
} else {
eal_get_fbarray_path(path, sizeof(path), name);
/*
* Each fbarray is unique to process namespace, i.e. the
* filename depends on process prefix. Try to take out a lock
* and see if we succeed. If we don't, someone else is using it
* already.
*/
fd = open(path, O_CREAT | O_RDWR, 0600);
if (fd < 0) {
RTE_LOG(DEBUG, EAL, "%s(): couldn't open %s: %s\n",
__func__, path, strerror(errno));
rte_errno = errno;
goto fail;
} else if (flock(fd, LOCK_EX | LOCK_NB)) {
RTE_LOG(DEBUG, EAL, "%s(): couldn't lock %s: %s\n",
__func__, path, strerror(errno));
rte_errno = EBUSY;
goto fail;
}
/* take out a non-exclusive lock, so that other processes could
* still attach to it, but no other process could reinitialize
* it.
*/
if (flock(fd, LOCK_SH | LOCK_NB)) {
rte_errno = errno;
goto fail;
}
if (resize_and_map(fd, path, data, mmap_len))
goto fail;
}
/* record the mapping; fd stays -1 in the no_shconf case */
ma->addr = data;
ma->len = mmap_len;
ma->fd = fd;
/* do not close fd - keep it until detach/destroy */
TAILQ_INSERT_TAIL(&mem_area_tailq, ma, next);
/* initialize the data */
memset(data, 0, mmap_len);
/* populate data structure */
strlcpy(arr->name, name, sizeof(arr->name));
arr->data = data;
arr->len = len;
arr->elt_sz = elt_sz;
arr->count = 0;
msk = get_used_mask(data, elt_sz, len);
msk->n_masks = MASK_LEN_TO_IDX(RTE_ALIGN_CEIL(len, MASK_ALIGN));
rte_rwlock_init(&arr->rwlock);
rte_spinlock_unlock(&mem_area_lock);
return 0;
fail:
/* common cleanup: unmap the area, close the file, free the record, unlock */
if (data)
munmap(data, mmap_len);
if (fd >= 0)
close(fd);
free(ma);
rte_spinlock_unlock(&mem_area_lock);
return -1;
}
/*
 * Attach to an existing fbarray described by `arr`.
 *
 * The name, element size, length and data address already stored in `arr`
 * are used to map the backing file at arr->data. The attach is refused with
 * EEXIST when the target range overlaps a memory area this process has
 * already recorded in mem_area_tailq.
 *
 * Returns 0 on success, -1 on failure. rte_errno is set on some, but not
 * all, failure paths.
 */
int
rte_fbarray_attach(struct rte_fbarray *arr)
{
struct mem_area *ma = NULL, *tmp = NULL;
size_t page_sz, mmap_len;
char path[PATH_MAX];
void *data = NULL;
int fd = -1;
if (arr == NULL) {
rte_errno = EINVAL;
return -1;
}
/*
* we don't need to synchronize attach as two values we need (element
* size and array length) are constant for the duration of life of
* the array, so the parts we care about will not race.
*/
if (fully_validate(arr->name, arr->elt_sz, arr->len))
return -1;
ma = malloc(sizeof(*ma));
if (ma == NULL) {
rte_errno = ENOMEM;
return -1;
}
page_sz = sysconf(_SC_PAGESIZE);
if (page_sz == (size_t)-1) {
free(ma);
return -1;
}
mmap_len = calc_data_size(page_sz, arr->elt_sz, arr->len);
/* check the tailq - maybe user has already mapped this address space */
rte_spinlock_lock(&mem_area_lock);
TAILQ_FOREACH(tmp, &mem_area_tailq, next) {
if (overlap(tmp, arr->data, mmap_len)) {
rte_errno = EEXIST;
goto fail;
}
}
/* we know this memory area is unique, so proceed */
/* the area is requested at arr->data, the address stored in the descriptor */
data = eal_get_virtual_area(arr->data, &mmap_len, page_sz, 0, 0);
if (data == NULL)
goto fail;
eal_get_fbarray_path(path, sizeof(path), arr->name);
fd = open(path, O_RDWR);
if (fd < 0) {
rte_errno = errno;
goto fail;
}
/* lock the file, to let others know we're using it */
if (flock(fd, LOCK_SH | LOCK_NB)) {
rte_errno = errno;
goto fail;
}
if (resize_and_map(fd, path, data, mmap_len))
goto fail;
/* store our new memory area */
ma->addr = data;
ma->fd = fd; /* keep fd until detach/destroy */
ma->len = mmap_len;
TAILQ_INSERT_TAIL(&mem_area_tailq, ma, next);
/* we're done */
rte_spinlock_unlock(&mem_area_lock);
return 0;
fail:
/* common cleanup: unmap the area, close the file, free the record, unlock */
if (data)
munmap(data, mmap_len);
if (fd >= 0)
close(fd);
free(ma);
rte_spinlock_unlock(&mem_area_lock);
return -1;
}
这个函数调用rte_fbarray_init这个函数来分配struct mem_area,这个数据结构用来处理创建和附加的内存区域,确保API调用的安全。
5、内存初始化
在上面把内存初始化的代码调用已经说明了,现在看一下其代码实现及其调用的两个主要函数rte_eal_memseg_init和eal_memalloc_init:
/* init memory subsystem */
int
rte_eal_memory_init(void)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
int retval;
RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n");
if (!mcfg)
return -1;
/* lock mem hotplug here, to prevent races while we init */
rte_mcfg_mem_read_lock();
if (rte_eal_memseg_init() < 0)
goto fail;
if (eal_memalloc_init() < 0)
goto fail;
retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
rte_eal_hugepage_init() :
rte_eal_hugepage_attach();
if (retval < 0)
goto fail;
if (internal_config.no_shconf == 0 && rte_eal_memdevice_init() < 0)
goto fail;
return 0;
fail:
rte_mcfg_mem_read_unlock();
return -1;
}
/*
 * Initialize the memseg lists for this process.
 *
 * First tries to raise the soft RLIMIT_NOFILE limit to the hard limit;
 * failures there are only logged. When built without
 * RTE_EAL_NUMA_AWARE_HUGEPAGES, warns if more than one socket is present and
 * legacy memory mode is off. Finally dispatches to the primary (32-bit or
 * 64-bit variant) or secondary memseg initialization and returns its result.
 */
int
rte_eal_memseg_init(void)
{
	struct rlimit nofile;

	/* increase rlimit to maximum */
	if (getrlimit(RLIMIT_NOFILE, &nofile) != 0) {
		RTE_LOG(ERR, EAL, "Cannot get current resource limits\n");
	} else {
		/* set limit to maximum */
		nofile.rlim_cur = nofile.rlim_max;

		if (setrlimit(RLIMIT_NOFILE, &nofile) >= 0) {
			RTE_LOG(DEBUG, EAL, "Setting maximum number of open files to %"
					PRIu64 "\n",
					(uint64_t)nofile.rlim_cur);
		} else {
			RTE_LOG(DEBUG, EAL, "Setting maximum number of open files failed: %s\n",
					strerror(errno));
		}
	}

#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
	if (rte_socket_count() > 1 && !internal_config.legacy_mem) {
		RTE_LOG(WARNING, EAL, "DPDK is running on a NUMA system, but is compiled without NUMA support.\n");
		RTE_LOG(WARNING, EAL, "This will have adverse consequences for performance and usability.\n");
		RTE_LOG(WARNING, EAL, "Please use --"OPT_LEGACY_MEM" option, or recompile with NUMA support.\n");
	}
#endif

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return memseg_secondary_init();

#ifndef RTE_ARCH_64
	return memseg_primary_init_32();
#else
	return memseg_primary_init();
#endif
}
/*
 * Initialize the memory allocator bookkeeping.
 *
 *  - Secondary process: walk all memseg lists with
 *    secondary_msl_create_walk().
 *  - Primary process in in-memory mode: probe memfd support; single-file
 *    segments mode requires it, and anonymous hugepages must be supported.
 *  - All processes: walk all memseg lists with fd_list_create_walk().
 *
 * Returns 0 on success, -1 on any failure.
 */
int
eal_memalloc_init(void)
{
	if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
			rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0)
		return -1;

	if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
			internal_config.in_memory) {
		const int memfd_probe = test_memfd_create();
		int have_memfd;

		if (memfd_probe < 0) {
			RTE_LOG(ERR, EAL, "Unable to check if memfd is supported\n");
			return -1;
		}

		have_memfd = (memfd_probe == 1);
		if (have_memfd)
			RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n");
		else
			RTE_LOG(INFO, EAL, "Using memfd is not supported, falling back to anonymous hugepages\n");

		/* we only support single-file segments mode with in-memory mode
		 * if we support hugetlbfs with memfd_create. this code will
		 * test if we do.
		 */
		if (!have_memfd && internal_config.single_file_segments) {
			RTE_LOG(ERR, EAL, "Single-file segments mode cannot be used without memfd support\n");
			return -1;
		}

		/* this cannot ever happen but better safe than sorry */
		if (!anonymous_hugepages_supported) {
			RTE_LOG(ERR, EAL, "Using anonymous memory is not supported\n");
			return -1;
		}
	}

	/* initialize all of the fd lists */
	if (rte_memseg_list_walk(fd_list_create_walk, NULL) != 0)
		return -1;

	return 0;
}
static int
memseg_secondary_init(void)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
int msl_idx = 0;
struct rte_memseg_list *msl;
for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
msl = &mcfg->memsegs[msl_idx];
/* skip empty and external memseg lists */
if (msl->memseg_arr.len == 0 || msl->external)
continue;
if (rte_fbarray_attach(&msl->memseg_arr)) {
RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n");
return -1;
}
/* preallocate VA space */
if (alloc_va_space(msl)) {
RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n");
return -1;
}
}
return 0;
}
在内存初始化的函数中调用rte_eal_memseg_init实现主从进程中的内存段相关初始化。主进程使用memseg_primary_init函数,从进程使用memseg_secondary_init函数来实现。这其中通过rte_memseg_list结构体来描述其mem segment数量,从而进一步确定其分配的虚拟内存空间的大小。
从进程使用memseg_secondary_init来操作主进程分配的相关memseglist并attach过来。二者共享memseg空间。同时要做一些虚拟地址空间的大页保留动作。
而eal_memalloc_init函数就比较简单了,同样分为主从进程:从进程先通过secondary_msl_create_walk初始化自己本地内存的映射并保存到本地,随后主从进程都会对所有memseg list的文件描述符列表进行初始化:
/*
 * Memseg list walk callback: create this process's local copy of a memseg
 * list that lives in the shared config.
 *
 * External lists are skipped. Otherwise a new fbarray named
 * "<shared name>_<pid>" is created with the same length and element size as
 * the shared one, and base_va / len are copied into the local list.
 *
 * Returns 0 on success or skip, -1 if the local fbarray cannot be created.
 */
static int
secondary_msl_create_walk(const struct rte_memseg_list *msl,
		void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *shared_list, *own_list;
	char arr_name[PATH_MAX];
	int list_idx;

	if (msl->external)
		return 0;

	list_idx = msl - mcfg->memsegs;
	shared_list = &mcfg->memsegs[list_idx];
	own_list = &local_memsegs[list_idx];

	/* create distinct fbarrays for each secondary */
	snprintf(arr_name, RTE_FBARRAY_NAME_LEN, "%s_%i",
		shared_list->memseg_arr.name, getpid());

	if (rte_fbarray_init(&own_list->memseg_arr, arr_name,
			shared_list->memseg_arr.len,
			shared_list->memseg_arr.elt_sz) < 0) {
		RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n");
		return -1;
	}

	own_list->base_va = shared_list->base_va;
	own_list->len = shared_list->len;

	return 0;
}
6、大页内存初始化:
/* inside rte_eal_memory_init(): a primary process initializes hugepage
 * memory, any other process type attaches to it */
retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
rte_eal_hugepage_init() :
rte_eal_hugepage_attach();
在前面的eal_hugepage_info_init函数中主要是处理大页内存外部配置的初始化动作,而在此处的rte_eal_hugepage_init则根据情况来实际完成初始化的动作。DPDK中分为两类内存模式:
legacy mode:静态模式
即在程序初始化时分配内存并一直使用,它能保证内存空间的连续性(虚拟和物理)。但内存不足时,则不可再分配
dynamic mode:动态模式
即程序在初始化过程中逐步根据情况分配内存并一直持续到结束,但是在内存不足时,可以向OS申请并使用而且在使用完成后Free返回给OS。但是这样内存地址就无法保证连续。
/*
 * Initialize hugepage memory, choosing the legacy or the non-legacy
 * implementation according to internal_config.legacy_mem.
 * Returns whatever the selected implementation returns.
 */
int
rte_eal_hugepage_init(void)
{
	if (internal_config.legacy_mem)
		return eal_legacy_hugepage_init();

	return eal_hugepage_init();
}
/*
 * Attach to already initialized hugepage memory, choosing the legacy or the
 * non-legacy implementation according to internal_config.legacy_mem.
 * Returns whatever the selected implementation returns.
 */
int
rte_eal_hugepage_attach(void)
{
	if (internal_config.legacy_mem)
		return eal_legacy_hugepage_attach();

	return eal_hugepage_attach();
}
/*
* Prepare physical memory mapping: fill configuration structure with
* these infos, return 0 on success.
* 1. map N huge pages in separate files in hugetlbfs
* 2. find associated physical addr
* 3. find associated NUMA socket ID
* 4. sort all huge pages by physical address
* 5. remap these N huge pages in the correct order
* 6. unmap the first mapping
* 7. fill memsegs in configuration with contiguous zones
*/
static int
eal_legacy_hugepage_init(void)
{
struct rte_mem_config *mcfg;
struct hugepage_file *hugepage = NULL, *tmp_hp = NULL;
struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
struct rte_fbarray *arr;
struct rte_memseg *ms;
uint64_t memory[RTE_MAX_NUMA_NODES];
unsigned hp_offset;
int i, j;
int nr_hugefiles, nr_hugepages = 0;
void *addr;
memset(used_hp, 0, sizeof(used_hp));
/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;
/* hugetlbfs can be disabled */
if (internal_config.no_hugetlbfs) {
void *prealloc_addr;
size_t mem_sz;
struct rte_memseg_list *msl;
int n_segs, cur_seg, fd, flags;
#ifdef MEMFD_SUPPORTED
int memfd;
#endif
uint64_t page_sz;
/* nohuge mode is legacy mode */
internal_config.legacy_mem = 1;
/* nohuge mode is single-file segments mode */
internal_config.single_file_segments = 1;
/* create a memseg list */
msl = &mcfg->memsegs[0];
page_sz = RTE_PGSIZE_4K;
n_segs = internal_config.memory / page_sz;
if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs,
sizeof(struct rte_memseg))) {
RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n");
return -1;
}
/* set up parameters for anonymous mmap */
fd = -1;
flags = MAP_PRIVATE | MAP_ANONYMOUS;
#ifdef MEMFD_SUPPORTED
/* create a memfd and store it in the segment fd table */
memfd = memfd_create("nohuge", 0);
if (memfd < 0) {
RTE_LOG(DEBUG, EAL, "Cannot create memfd: %s\n",
strerror(errno));
RTE_LOG(DEBUG, EAL, "Falling back to anonymous map\n");
} else {
/* we got an fd - now resize it */
if (ftruncate(memfd, internal_config.memory) < 0) {
RTE_LOG(ERR, EAL, "Cannot resize memfd: %s\n",
strerror(errno));
RTE_LOG(ERR, EAL, "Falling back to anonymous map\n");
close(memfd);
} else {
/* creating memfd-backed file was successful.
* we want changes to memfd to be visible to
* other processes (such as vhost backend), so
* map it as shared memory.
*/
RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n");
fd = memfd;
flags = MAP_SHARED;
}
}
#endif
/* preallocate address space for the memory, so that it can be
* fit into the DMA mask.
*/
mem_sz = internal_config.memory;
prealloc_addr = eal_get_virtual_area(
NULL, &mem_sz, page_sz, 0, 0);
if (prealloc_addr == NULL) {
RTE_LOG(ERR, EAL,
"%s: reserving memory area failed: "
"%s\n",
__func__, strerror(errno));
return -1;
}
addr = mmap(prealloc_addr, mem_sz, PROT_READ | PROT_WRITE,
flags | MAP_FIXED, fd, 0);
if (addr == MAP_FAILED || addr != prealloc_addr) {
RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
strerror(errno));
munmap(prealloc_addr, mem_sz);
return -1;
}
msl->base_va = addr;
msl->page_sz = page_sz;
msl->socket_id = 0;
msl->len = mem_sz;
msl->heap = 1;
/* we're in single-file segments mode, so only the segment list
* fd needs to be set up.
*/
if (fd != -1) {
if (eal_memalloc_set_seg_list_fd(0, fd) < 0) {
RTE_LOG(ERR, EAL, "Cannot set up segment list fd\n");
/* not a serious error, proceed */
}
}
/* populate memsegs. each memseg is one page long */
for (cur_seg = 0; cur_seg < n_segs; cur_seg++) {
arr = &msl->memseg_arr;
ms = rte_fbarray_get(arr, cur_seg);
if (rte_eal_iova_mode() == RTE_IOVA_VA)
ms->iova = (uintptr_t)addr;
else
ms->iova = RTE_BAD_IOVA;
ms->addr = addr;
ms->hugepage_sz = page_sz;
ms->socket_id = 0;
ms->len = page_sz;
rte_fbarray_set_used(arr, cur_seg);
addr = RTE_PTR_ADD(addr, (size_t)page_sz);
}
if (mcfg->dma_maskbits &&
rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
RTE_LOG(ERR, EAL,
"%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.\n",
__func__);
if (rte_eal_iova_mode() == RTE_IOVA_VA &&
rte_eal_using_phys_addrs())
RTE_LOG(ERR, EAL,
"%s(): Please try initializing EAL with --iova-mode=pa parameter.\n",
__func__);
goto fail;
}
return 0;
}
/* calculate total number of hugepages available. at this point we haven't
* yet started sorting them so they all are on socket 0 */
for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
/* meanwhile, also initialize used_hp hugepage sizes in used_hp */
used_hp[i].hugepage_sz = internal_config.hugepage_info[i].hugepage_sz;
nr_hugepages += internal_config.hugepage_info[i].num_pages[0];
}
/*
* allocate a memory area for hugepage table.
* this isn't shared memory yet. due to the fact that we need some
* processing done on these pages, shared memory will be created
* at a later stage.
*/
tmp_hp = malloc(nr_hugepages * sizeof(struct hugepage_file));
if (tmp_hp == NULL)
goto fail;
memset(tmp_hp, 0, nr_hugepages * sizeof(struct hugepage_file));
hp_offset = 0; /* where we start the current page size entries */
huge_register_sigbus();
/* make a copy of socket_mem, needed for balanced allocation. */
for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
memory[i] = internal_config.socket_mem[i];
/* map all hugepages and sort them */
for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
unsigned pages_old, pages_new;
struct hugepage_info *hpi;
/*
* we don't yet mark hugepages as used at this stage, so
* we just map all hugepages available to the system
* all hugepages are still located on socket 0
*/
hpi = &internal_config.hugepage_info[i];
if (hpi->num_pages[0] == 0)
continue;
/* map all hugepages available */
pages_old = hpi->num_pages[0];
pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, memory);
if (pages_new < pages_old) {
RTE_LOG(DEBUG, EAL,
"%d not %d hugepages of size %u MB allocated\n",
pages_new, pages_old,
(unsigned)(hpi->hugepage_sz / 0x100000));
int pages = pages_old - pages_new;
nr_hugepages -= pages;
hpi->num_pages[0] = pages_new;
if (pages_new == 0)
continue;
}
if (rte_eal_using_phys_addrs() &&
rte_eal_iova_mode() != RTE_IOVA_VA) {
/* find physical addresses for each hugepage */
if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
RTE_LOG(DEBUG, EAL, "Failed to find phys addr "
"for %u MB pages\n",
(unsigned int)(hpi->hugepage_sz / 0x100000));
goto fail;
}
} else {
/* set physical addresses for each hugepage */
if (set_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
RTE_LOG(DEBUG, EAL, "Failed to set phys addr "
"for %u MB pages\n",
(unsigned int)(hpi->hugepage_sz / 0x100000));
goto fail;
}
}
if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){
RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n",
(unsigned)(hpi->hugepage_sz / 0x100000));
goto fail;
}
qsort(&tmp_hp[hp_offset], hpi->num_pages[0],
sizeof(struct hugepage_file), cmp_physaddr);
/* we have processed a num of hugepages of this size, so inc offset */
hp_offset += hpi->num_pages[0];
}
huge_recover_sigbus();
if (internal_config.memory == 0 && internal_config.force_sockets == 0)
internal_config.memory = eal_get_hugepage_mem_size();
nr_hugefiles = nr_hugepages;
/* clean out the numbers of pages */
for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++)
for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
internal_config.hugepage_info[i].num_pages[j] = 0;
/* get hugepages for each socket */
for (i = 0; i < nr_hugefiles; i++) {
int socket = tmp_hp[i].socket_id;
/* find a hugepage info with right size and increment num_pages */
const int nb_hpsizes = RTE_MIN(MAX_HUGEPAGE_SIZES,
(int)internal_config.num_hugepage_sizes);
for (j = 0; j < nb_hpsizes; j++) {
if (tmp_hp[i].size ==
internal_config.hugepage_info[j].hugepage_sz) {
internal_config.hugepage_info[j].num_pages[socket]++;
}
}
}
/* make a copy of socket_mem, needed for number of pages calculation */
for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
memory[i] = internal_config.socket_mem[i];
/* calculate final number of pages */
nr_hugepages = calc_num_pages_per_socket(memory,
internal_config.hugepage_info, used_hp,
internal_config.num_hugepage_sizes);
/* error if not enough memory available */
if (nr_hugepages < 0)
goto fail;
/* reporting in! */
for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
if (used_hp[i].num_pages[j] > 0) {
RTE_LOG(DEBUG, EAL,
"Requesting %u pages of size %uMB"
" from socket %i\n",
used_hp[i].num_pages[j],
(unsigned)
(used_hp[i].hugepage_sz / 0x100000),
j);
}
}
}
/* create shared memory */
hugepage = create_shared_memory(eal_hugepage_data_path(),
nr_hugefiles * sizeof(struct hugepage_file));
if (hugepage == NULL) {
RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
goto fail;
}
memset(hugepage, 0, nr_hugefiles * sizeof(struct hugepage_file));
/*
* unmap pages that we won't need (looks at used_hp).
* also, sets final_va to NULL on pages that were unmapped.
*/
if (unmap_unneeded_hugepages(tmp_hp, used_hp,
internal_config.num_hugepage_sizes) < 0) {
RTE_LOG(ERR, EAL, "Unmapping and locking hugepages failed!\n");
goto fail;
}
/*
* copy stuff from malloc'd hugepage* to the actual shared memory.
* this procedure only copies those hugepages that have orig_va
* not NULL. has overflow protection.
*/
if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles,
tmp_hp, nr_hugefiles) < 0) {
RTE_LOG(ERR, EAL, "Copying tables to shared memory failed!\n");
goto fail;
}
#ifndef RTE_ARCH_64
/* for legacy 32-bit mode, we did not preallocate VA space, so do it */
if (internal_config.legacy_mem &&
prealloc_segments(hugepage, nr_hugefiles)) {
RTE_LOG(ERR, EAL, "Could not preallocate VA space for hugepages\n");
goto fail;
}
#endif
/* remap all pages we do need into memseg list VA space, so that those
* pages become first-class citizens in DPDK memory subsystem
*/
if (remap_needed_hugepages(hugepage, nr_hugefiles)) {
RTE_LOG(ERR, EAL, "Couldn't remap hugepage files into memseg lists\n");
goto fail;
}
/* free the hugepage backing files */
if (internal_config.hugepage_unlink &&
unlink_hugepage_files(tmp_hp, internal_config.num_hugepage_sizes) < 0) {
RTE_LOG(ERR, EAL, "Unlinking hugepage files failed!\n");
goto fail;
}
/* free the temporary hugepage table */
free(tmp_hp);
tmp_hp = NULL;
munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
hugepage = NULL;
/* we're not going to allocate more pages, so release VA space for
* unused memseg lists
*/
for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
struct rte_memseg_list *msl = &mcfg->memsegs[i];
size_t mem_sz;
/* skip inactive lists */
if (msl->base_va == NULL)
continue;
/* skip lists where there is at least one page allocated */
if (msl->memseg_arr.count > 0)
continue;
/* this is an unused list, deallocate it */
mem_sz = msl->len;
munmap(msl->base_va, mem_sz);
msl->base_va = NULL;
msl->heap = 0;
/* destroy backing fbarray */
rte_fbarray_destroy(&msl->memseg_arr);
}
if (mcfg->dma_maskbits &&
rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
RTE_LOG(ERR, EAL,
"%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.\n",
__func__);
goto fail;
}
return 0;
fail:
huge_recover_sigbus();
free(tmp_hp);
if (hugepage != NULL)
munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
return -1;
}
在静态(legacy)模式下,会调用eal_legacy_hugepage_init()函数,否则调用eal_hugepage_init()。前者根据配置internal_config.hugepage_info,按SOCKET_ID、PageSZ对rte_memseg_list中的rte_memseg进行映射并排序;然后通过巨页内存文件的重映射来保证虚拟地址和物理地址二者同时连续;再将相关信息及描述符保存到巨页数据文件,并采用read-ahead作为技术保证。这样就可以保障相关数据操作的效率。
而后者则通过实际的需求来计算在不同的Socket上的分配,通过算法对内存Segment进行分配,它采用preallocate来保证整体的性能需求。
总之,各有各的好处,各有各的香。
DPDK框架中对内存的管理,基本上已经针对自己的应用场景尽量做到了上佳的表现。但是,DPDK在不断演进,它依赖的内核和相关库也都在演进,甚至硬件也在演进,故而其内存管理的演进是必然的。这一点也从新老版本的差异中体现出来了。
一根杆子捅到底,接下来,将会继续分析DPDK内存管理(分配、使用和回收等)相关的细节和流程。