For background on how hugepages work, see http://blog.chinaunix.net/uid-28541347-id-5783934.html
DPDK version: 17.11.2
What hugepages buy us:
1. Fewer, larger pages mean fewer page-table entries, fewer page-table switches, and fewer page faults.
2. Fewer TLB misses, since each TLB entry now covers a much larger address range (see the sketch right below).
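To make point 2 concrete, here is a standalone sketch (not DPDK code) that maps a single 2 MB hugepage anonymously with mmap(MAP_HUGETLB), so one TLB entry covers the whole 2 MB region. It assumes a Linux kernel with MAP_HUGETLB support and 2 MB hugepages already reserved (e.g. via /proc/sys/vm/nr_hugepages):

/* Sketch: allocate one 2 MB hugepage anonymously with mmap(MAP_HUGETLB). */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MAP_HUGE_2MB
#define MAP_HUGE_2MB (21 << 26)   /* log2(2 MB) shifted to MAP_HUGE_SHIFT */
#endif

int main(void)
{
    size_t len = 2 * 1024 * 1024;
    void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB,
                   -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap(MAP_HUGETLB)");
        return 1;
    }
    memset(p, 0, len);   /* touch the page: one TLB entry now maps all 2 MB */
    munmap(p, len);
    return 0;
}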
mount -t hugetlbfs hugetlbfs /dev/hugepages                      (mount with the default hugepage size)
mount -t hugetlbfs none /dev/hugepages_2mb -o pagesize=2MB       (mount 2 MB hugepages)
1 GB and 2 MB hugepages must be mounted before they can be used; DPDK also runs normally with only one of the two mounted.
In this test only 1 GB hugepages were configured; the details are given below.
Mount points can be checked with: cat /proc/mounts
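Checking the mount points programmatically amounts to scanning /proc/mounts for hugetlbfs entries, which is also what DPDK's get_hugepage_dir does later on. A minimal standalone sketch using getmntent(3):

/* Sketch: list hugetlbfs mount points and their options from /proc/mounts. */
#include <mntent.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    FILE *f = setmntent("/proc/mounts", "r");
    struct mntent *m;

    if (f == NULL)
        return 1;
    while ((m = getmntent(f)) != NULL) {
        if (strcmp(m->mnt_type, "hugetlbfs") == 0)
            printf("hugetlbfs mounted at %s (opts: %s)\n",
                   m->mnt_dir, m->mnt_opts);
    }
    endmntent(f);
    return 0;
}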
The DPDK initialization function rte_eal_init calls eal_hugepage_info_init to initialize the hugepage information; this function mainly collects the available hugepage information (how many pages there are and which directory they are mounted on).
If "mount -t hugetlbfs none /dev/hugepages_2mb -o pagesize=2MB" is not executed and only "mount -t hugetlbfs hugetlbfs /dev/hugepages" is mounted, DPDK will only use the 1 GB hugepages.
In that case the DPDK program prints "EAL: 2048 hugepages of size 2097152 reserved, but no mounted hugetlbfs found for that size", indicating that the 2 MB hugepages are not mounted.
struct hugepage_info {
    uint64_t hugepage_sz;    /**< size of a huge page */
    const char *hugedir;     /**< dir where hugetlbfs is mounted */
    uint32_t num_pages[RTE_MAX_NUMA_NODES];
                             /**< number of hugepages of that size on each socket */
    int lock_descriptor;     /**< file descriptor for hugepage dir */
};
In this experiment the resulting hugepage info is:
hugepage_sz = 1073741824 (1048576 * 1024 bytes, i.e. 1 GB)
hugedir = "/dev/hugepages"
num_pages[0] = 4
int
eal_hugepage_info_init(void)
{
    const char dirent_start_text[] = "hugepages-";
    const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
    unsigned i, num_sizes = 0;
    DIR *dir;
    struct dirent *dirent;

    dir = opendir(sys_dir_path);   /* sys_dir_path[] = "/sys/kernel/mm/hugepages" */
    if (dir == NULL) {
        RTE_LOG(ERR, EAL,
            "Cannot open directory %s to read system hugepage info\n",
            sys_dir_path);
        return -1;
    }

    /* iterate over the subdirectories of /sys/kernel/mm/hugepages whose names start with "hugepages-" */
    for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) {
        struct hugepage_info *hpi;

        if (strncmp(dirent->d_name, dirent_start_text,
                dirent_start_len) != 0)
            continue;

        if (num_sizes >= MAX_HUGEPAGE_SIZES)
            break;

        /* internal_config is a DPDK global variable */
        hpi = &internal_config.hugepage_info[num_sizes];
        /* store the hugepage size; at most three sizes are kept, and in practice only 1 GB and 2 MB are used */
        hpi->hugepage_sz =
            rte_str_to_size(&dirent->d_name[dirent_start_len]);
        /* get_hugepage_dir searches /proc/mounts for the directory where hugepages of this size are mounted */
        hpi->hugedir = get_hugepage_dir(hpi->hugepage_sz);

        /* first, check if we have a mountpoint */
        if (hpi->hugedir == NULL) {
            uint32_t num_pages;

            num_pages = get_num_hugepages(dirent->d_name);
            if (num_pages > 0)
                RTE_LOG(NOTICE, EAL,
                    "%" PRIu32 " hugepages of size "
                    "%" PRIu64 " reserved, but no mounted "
                    "hugetlbfs found for that size\n",
                    num_pages, hpi->hugepage_sz);
            continue;
        }
        ......
        ......
}
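get_num_hugepages (not shown in the excerpt) essentially reads the kernel's per-size counters from sysfs. The following is a simplified standalone sketch of that idea, assuming the usual /sys/kernel/mm/hugepages/hugepages-<size>kB layout; the real DPDK function includes more careful error handling:

/* Sketch: count usable hugepages of one size from sysfs
 * (roughly what get_num_hugepages does; error handling trimmed). */
#include <stdio.h>

static unsigned long read_sysfs_ul(const char *path)
{
    unsigned long v = 0;
    FILE *f = fopen(path, "r");

    if (f != NULL) {
        if (fscanf(f, "%lu", &v) != 1)
            v = 0;
        fclose(f);
    }
    return v;
}

int main(void)
{
    const char *base = "/sys/kernel/mm/hugepages/hugepages-1048576kB";
    char path[256];
    unsigned long free_pages, resv_pages;

    snprintf(path, sizeof(path), "%s/free_hugepages", base);
    free_pages = read_sysfs_ul(path);
    snprintf(path, sizeof(path), "%s/resv_hugepages", base);
    resv_pages = read_sysfs_ul(path);

    printf("usable 1 GB hugepages: %lu\n",
           free_pages > resv_pages ? free_pages - resv_pages : 0);
    return 0;
}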
So far DPDK has only gathered the hugepage information. Afterwards, rte_eal_memory_init -> rte_eal_hugepage_init -> map_all_hugepages fills in the concrete per-page details: virtual address, physical address, size, and so on.
/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;

/* calculate total number of hugepages available. at this point we haven't
 * yet started sorting them so they all are on socket 0 */
for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
    /* meanwhile, also initialize used_hp hugepage sizes in used_hp */
    used_hp[i].hugepage_sz = internal_config.hugepage_info[i].hugepage_sz;
    nr_hugepages += internal_config.hugepage_info[i].num_pages[0];
}

/*
 * allocate a memory area for hugepage table.
 * this isn't shared memory yet. due to the fact that we need some
 * processing done on these pages, shared memory will be created
 * at a later stage.
 */
tmp_hp = malloc(nr_hugepages * sizeof(struct hugepage_file));
if (tmp_hp == NULL)
    goto fail;
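The table just allocated is an array of struct hugepage_file entries, one per hugepage. For reference, the fields that the following excerpts rely on look roughly like this (abridged; the actual DPDK 17.11 definition has a few more members):

struct hugepage_file {
    void *orig_va;       /**< virtual address of the first mmap() */
    void *final_va;      /**< virtual address of the second mmap() */
    uint64_t physaddr;   /**< physical address */
    size_t size;         /**< the page size */
    int socket_id;       /**< NUMA socket ID */
    char filepath[MAX_HUGEPAGE_PATH]; /**< path to the backing file in hugetlbfs */
};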
/*
* Mmap all hugepages of hugepage table: it first open a file in
* hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
* virtual address is stored in hugepg_tbl[i].orig_va, else it is stored
* in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to
* map contiguous physical blocks in contiguous virtual blocks.
*/
static unsigned
map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
        uint64_t *essential_memory __rte_unused, int orig)

    /* try to create hugepage file */
    fd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0600);
    if (fd < 0) {
        RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
            strerror(errno));
        goto out;
    }

    /* map the segment, and populate page tables,
     * the kernel fills this segment with zeros */
    virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
            MAP_SHARED | MAP_POPULATE, fd, 0);
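Outside DPDK, this first mapping pass can be reproduced in a few lines: create a file under a hugetlbfs mount point and mmap() it with MAP_SHARED | MAP_POPULATE. In the sketch below the file name rtemap_demo and the 1 GB size are assumptions matching this test setup, not the real per-page paths DPDK builds:

/* Sketch: map one hugepage from a hugetlbfs mount, like the first mapping pass.
 * /dev/hugepages is assumed to be a 1 GB hugetlbfs mount (see the mount commands above). */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    const size_t hugepage_sz = 1024UL * 1024 * 1024;   /* 1 GB */
    int fd = open("/dev/hugepages/rtemap_demo", O_CREAT | O_RDWR, 0600);

    if (fd < 0) {
        perror("open");
        return 1;
    }
    /* MAP_POPULATE pre-faults the page, so the kernel allocates and zeroes it now */
    void *va = mmap(NULL, hugepage_sz, PROT_READ | PROT_WRITE,
                    MAP_SHARED | MAP_POPULATE, fd, 0);
    if (va == MAP_FAILED) {
        perror("mmap");
        close(fd);
        return 1;
    }
    printf("hugepage mapped at %p\n", va);
    munmap(va, hugepage_sz);
    close(fd);
    return 0;
}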
/*
* For each hugepage in hugepg_tbl, fill the physaddr value. We find
* it by browsing the /proc/self/pagemap special file.
*/
static int
find_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
{
    unsigned int i;
    phys_addr_t addr;

    for (i = 0; i < hpi->num_pages[0]; i++) {
        addr = rte_mem_virt2phy(hugepg_tbl[i].orig_va);
        if (addr == RTE_BAD_PHYS_ADDR)
            return -1;
        hugepg_tbl[i].physaddr = addr;
    }
    return 0;
}
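rte_mem_virt2phy resolves the physical address by reading /proc/self/pagemap, as the comment above says. A minimal sketch of that translation is shown below; it assumes the standard 64-bit pagemap entry layout and enough privilege to read PFNs (unprivileged reads return a PFN of 0 on recent kernels):

/* Sketch: translate a virtual address to a physical address via /proc/self/pagemap.
 * Each 64-bit pagemap entry: bit 63 = page present, bits 0-54 = page frame number. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static uint64_t virt2phys(const void *va)
{
    const long page_size = sysconf(_SC_PAGESIZE);
    uint64_t entry;
    int fd = open("/proc/self/pagemap", O_RDONLY);

    if (fd < 0)
        return UINT64_MAX;

    off_t off = ((uintptr_t)va / page_size) * sizeof(entry);
    if (pread(fd, &entry, sizeof(entry), off) != sizeof(entry)) {
        close(fd);
        return UINT64_MAX;
    }
    close(fd);

    if (!(entry & (1ULL << 63)))                 /* page not present */
        return UINT64_MAX;

    uint64_t pfn = entry & ((1ULL << 55) - 1);   /* bits 0-54 */
    return pfn * page_size + (uintptr_t)va % page_size;
}

int main(void)
{
    int x = 42;
    printf("&x = %p -> phys 0x%llx\n", (void *)&x,
           (unsigned long long)virt2phys(&x));
    return 0;
}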
    if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0) {
        RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n",
                (unsigned)(hpi->hugepage_sz / 0x100000));
        goto fail;
    }

    qsort(&tmp_hp[hp_offset], hpi->num_pages[0],
          sizeof(struct hugepage_file), cmp_physaddr);
static int
cmp_physaddr(const void *a, const void *b)
{
#ifndef RTE_ARCH_PPC_64
    const struct hugepage_file *p1 = a;
    const struct hugepage_file *p2 = b;
#else
    /* PowerPC needs memory sorted in reverse order from x86 */
    const struct hugepage_file *p1 = b;
    const struct hugepage_file *p2 = a;
#endif
    if (p1->physaddr < p2->physaddr)
        return -1;
    else if (p1->physaddr > p2->physaddr)
        return 1;
    else
        return 0;
}
    else if (vma_len == 0) {
        unsigned j, num_pages;

        /* reserve a virtual area for next contiguous
         * physical block: count the number of
         * contiguous physical pages. */
        for (j = i+1; j < hpi->num_pages[0]; j++) {
#ifdef RTE_ARCH_PPC_64
            /* The physical addresses are sorted in
             * descending order on PPC64 */
            if (hugepg_tbl[j].physaddr !=
                hugepg_tbl[j-1].physaddr - hugepage_sz)
                break;
#else
            if (hugepg_tbl[j].physaddr !=
                hugepg_tbl[j-1].physaddr + hugepage_sz)
                break;
#endif
        }
        num_pages = j - i;
        vma_len = num_pages * hugepage_sz;

        /* get the biggest virtual memory area up to
         * vma_len. If it fails, vma_addr is NULL, so
         * let the kernel provide the address. */
        vma_addr = get_virtual_area(&vma_len, hpi->hugepage_sz);
        if (vma_addr == NULL)
            vma_len = hugepage_sz;
    }
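The purpose of get_virtual_area is to reserve one contiguous stretch of virtual address space large enough for the whole physically contiguous block, so the hugepages can later be remapped into it back to back. Below is a rough sketch of the reservation idea only (a simplification; DPDK's own function additionally handles alignment and the --base-virtaddr option):

/* Sketch: reserve vma_len bytes of contiguous virtual address space.
 * The mapping carries no usable memory (PROT_NONE); a caller would later remap
 * hugepages into this range, e.g. with MAP_FIXED. */
#include <stdio.h>
#include <sys/mman.h>

static void *reserve_virtual_area(size_t *vma_len)
{
    while (*vma_len > 0) {
        void *addr = mmap(NULL, *vma_len, PROT_NONE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (addr != MAP_FAILED)
            return addr;   /* found a hole big enough */
        *vma_len /= 2;     /* shrink the request and retry */
    }
    return NULL;
}

int main(void)
{
    size_t len = 4UL * 1024 * 1024 * 1024;   /* try 4 GB of address space */
    void *va = reserve_virtual_area(&len);

    printf("reserved %zu bytes at %p\n", len, va);
    if (va != NULL)
        munmap(va, len);
    return 0;
}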
    /* unmap original mappings */
    if (unmap_all_hugepages_orig(&tmp_hp[hp_offset], hpi) < 0)
        goto fail;

    if (new_memseg) {
        j += 1;
        if (j == RTE_MAX_MEMSEG)
            break;

        mcfg->memseg[j].iova = hugepage[i].physaddr;
        mcfg->memseg[j].addr = hugepage[i].final_va;
        mcfg->memseg[j].len = hugepage[i].size;
        mcfg->memseg[j].socket_id = hugepage[i].socket_id;
        mcfg->memseg[j].hugepage_sz = hugepage[i].size;
    }
rte_eal_hugepage_init is only called by the RTE_PROC_PRIMARY process (in the multi-process case). Once rte_eal_hugepage_init finishes, it has merely saved the physical address, virtual address, socket id and size of the usable hugepage memory into global variables; how this memory is actually used still requires further management.
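To see the end result of this stage, the application can walk the memseg table after rte_eal_init() returns. A minimal sketch, assuming DPDK 17.11 where rte_eal_get_physmem_layout() returns the mcfg->memseg array filled in above:

/* Sketch: print the memory segments assembled by rte_eal_hugepage_init.
 * Assumes a DPDK 17.11 build environment (rte_config.h on the include path). */
#include <inttypes.h>
#include <stdio.h>
#include <rte_eal.h>
#include <rte_memory.h>

int main(int argc, char **argv)
{
    unsigned i;
    const struct rte_memseg *ms;

    if (rte_eal_init(argc, argv) < 0)
        return 1;

    ms = rte_eal_get_physmem_layout();
    for (i = 0; i < RTE_MAX_MEMSEG; i++) {
        if (ms[i].addr == NULL)   /* unused entries are zeroed */
            break;
        printf("memseg %u: va=%p iova=0x%" PRIx64 " len=%zu socket=%d\n",
               i, ms[i].addr, (uint64_t)ms[i].iova, ms[i].len,
               (int)ms[i].socket_id);
    }
    return 0;
}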