jemalloc Horizontal Analysis (Part 1): Core Structures

The first structure: arena_s
struct arena_s {
    /* This arena's index within the arenas array. */
    unsigned        ind;

    /*
     * Number of threads currently assigned to this arena.  This field is
     * protected by arenas_lock.
     */
    unsigned        nthreads;

    /*
     * There are three classes of arena operations from a locking
     * perspective:
     * 1) Thread assignment (modifies nthreads) is protected by arenas_lock.
     * 2) Bin-related operations are protected by bin locks.
     * 3) Chunk- and run-related operations are protected by this mutex.
     */
    malloc_mutex_t        lock;

    arena_stats_t        stats;        /* arena statistics */
    /*
     * List of tcaches for extant threads associated with this arena.
     * Stats from these are merged incrementally, and at exit if
     * opt_stats_print is enabled.
     */
    ql_head(tcache_t)    tcache_ql;

    uint64_t        prof_accumbytes;

    /*
     * PRNG state for cache index randomization of large allocation base
     * pointers.
     */
    uint64_t        offset_state;

    dss_prec_t        dss_prec;

    /*
     * In order to avoid rapid chunk allocation/deallocation when an arena
     * oscillates right on the cusp of needing a new chunk, cache the most
     * recently freed chunk.  The spare is left in the arena's chunk trees
     * until it is deleted.
     *
     * There is one spare chunk per arena, rather than one spare total, in
     * order to avoid interactions between multiple threads that could make
     * a single spare inadequate.
     */
    arena_chunk_t        *spare;

    /* Minimum ratio (log base 2) of nactive:ndirty. */
    ssize_t            lg_dirty_mult;    /* log2 of the nactive:ndirty ratio; nactive counts pages in active runs */

    /* True if a thread is currently executing arena_purge(). */
    bool            purging;

    /* Number of pages in active runs and huge regions. */
    size_t            nactive;

    /*
     * Current count of pages within unused runs that are potentially
     * dirty, and for which madvise(... MADV_DONTNEED) has not been called.
     * By tracking this, we can institute a limit on how much dirty unused
     * memory is mapped for each arena.
     */
    size_t            ndirty;

    /*
     * Size/address-ordered tree of this arena's available runs.  The tree
     * is used for first-best-fit run allocation.
     */
    arena_avail_tree_t    runs_avail;

    /*
     * Unused dirty memory this arena manages.  Dirty memory is conceptually
     * tracked as an arbitrarily interleaved LRU of dirty runs and cached
     * chunks, but the list linkage is actually semi-duplicated in order to
     * avoid extra arena_chunk_map_misc_t space overhead.
     *
     *   LRU-----------------------------------------------------------MRU
     *
     *        /-- arena ---\
     *        |            |
     *        |            |
     *        |------------|                             /- chunk -\
     *   ...->|chunks_cache|<--------------------------->|  /----\ |<--...
     *        |------------|                             |  |node| |
     *        |            |                             |  |    | |
     *        |            |    /- run -\    /- run -\   |  |    | |
     *        |            |    |       |    |       |   |  |    | |
     *        |            |    |       |    |       |   |  |    | |
     *        |------------|    |-------|    |-------|   |  |----| |
     *   ...->|runs_dirty  |<-->|rd     |<-->|rd     |<---->|rd  |<----...
     *        |------------|    |-------|    |-------|   |  |----| |
     *        |            |    |       |    |       |   |  |    | |
     *        |            |    |       |    |       |   |  \----/ |
     *        |            |    \-------/    \-------/   |         |
     *        |            |                             |         |
     *        |            |                             |         |
     *        \------------/                             \---------/
     */
    arena_runs_dirty_link_t    runs_dirty;    /* doubly-linked ring of dirty runs */
    extent_node_t        chunks_cache;    /* one extent_node describes one region of memory */

    /* Extant huge allocations. */
    ql_head(extent_node_t)    huge;
    /* Synchronizes all huge allocation/update/deallocation. */
    malloc_mutex_t        huge_mtx;

    /*
     * Trees of chunks that were previously allocated (trees differ only in
     * node ordering).  These are used when allocating chunks, in an attempt
     * to re-use address space.  Depending on function, different tree
     * orderings are needed, which is why there are two trees with the same
     * contents.
     */
    /* Note: the trees come in pairs over the same nodes; chunks_szad_* is keyed by (size, address), chunks_ad_* by address alone. */
    extent_tree_t        chunks_szad_cached;
        /* rb_gen(, extent_tree_szad_, extent_tree_t, extent_node_t, szad_link, extent_szad_comp) */
    extent_tree_t        chunks_ad_cached;
        /* rb_gen(, extent_tree_ad_, extent_tree_t, extent_node_t, ad_link, extent_ad_comp) */
    extent_tree_t        chunks_szad_retained;
    extent_tree_t        chunks_ad_retained;

    malloc_mutex_t        chunks_mtx;
    /* Cache of nodes that were allocated via base_alloc(). */
    ql_head(extent_node_t)    node_cache;
    malloc_mutex_t        node_cache_mtx;

    /* User-configurable chunk hook functions. */
    chunk_hooks_t        chunk_hooks;    /* callbacks for chunk allocation/deallocation */

    /* bins is used to store trees of free regions. */
    arena_bin_t        bins[NBINS];    /* NBINS == 39 */
};
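Aside: the runs_dirty and chunks_cache fields above head an intrusive circular doubly-linked ring (jemalloc builds it with its qr macros). A minimal standalone sketch of the idea follows; the types and helpers are illustrative stand-ins, not jemalloc's actual qr implementation:

#include <stddef.h>
#include <stdio.h>

/* Illustrative intrusive ring link, mimicking arena_runs_dirty_link_t. */
typedef struct ring_link_s {
    struct ring_link_s *prev, *next;
} ring_link_t;

typedef struct dirty_run_s {
    int         id;  /* stand-in payload */
    ring_link_t rd;  /* embedded linkage, like the rd fields above */
} dirty_run_t;

static void ring_init(ring_link_t *l) { l->prev = l->next = l; }

/* Insert n just before head; with head as the sentinel this is the MRU end. */
static void ring_insert_before(ring_link_t *head, ring_link_t *n) {
    n->prev = head->prev;
    n->next = head;
    head->prev->next = n;
    head->prev = n;
}

int main(void) {
    ring_link_t sentinel;  /* plays the role of arena->runs_dirty */
    dirty_run_t a = {1, {0, 0}}, b = {2, {0, 0}}, c = {3, {0, 0}};

    ring_init(&sentinel);
    ring_insert_before(&sentinel, &a.rd);
    ring_insert_before(&sentinel, &b.rd);
    ring_insert_before(&sentinel, &c.rd);

    /* Walk LRU -> MRU, recovering the container from the embedded link. */
    for (ring_link_t *l = sentinel.next; l != &sentinel; l = l->next) {
        dirty_run_t *r = (dirty_run_t *)((char *)l - offsetof(dirty_run_t, rd));
        printf("dirty run %d\n", r->id);
    }
    return (0);
}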

The second type:
typedef rb_tree(arena_chunk_map_misc_t) arena_avail_tree_t;
A red-black tree of arena_chunk_map_misc_t nodes; it backs the runs_avail field of arena_s.
/*
 * Each arena_chunk_map_misc_t corresponds to one page within the chunk, just
 * like arena_chunk_map_bits_t.  Two separate arrays are stored within each
 * chunk header in order to improve cache locality.
 */
struct arena_chunk_map_misc_s {
    /*
     * Linkage for run trees.  There are two disjoint uses:
     * 1) arena_t's runs_avail tree.
     * 2) arena_run_t conceptually uses this linkage for in-use non-full
     *    runs, rather than directly embedding linkage.
     */
    rb_node(arena_chunk_map_misc_t)        rb_link;

    union {
        /* Linkage for list of dirty runs. */
        arena_runs_dirty_link_t        rd;

        /* Profile counters, used for large object runs. */
        union {
            void                *prof_tctx_pun;
            prof_tctx_t            *prof_tctx;
        };

        /* Small region run metadata. */
        arena_run_t            run;
    };
};

/* Each element of the chunk map corresponds to one page within the chunk. */
struct arena_chunk_map_bits_s {
    /*
     * Run address (or size) and various flags are stored together.  The bit
     * layout looks like (assuming 32-bit system):
     *
     *   ???????? ???????? ???nnnnn nnndumla
     *
     * ? : Unallocated: Run address for first/last pages, unset for internal
     *                  pages.
     *     Small: Run page offset.
     *     Large: Run page count for first page, unset for trailing pages.
     * n : binind for small size class, BININD_INVALID for large size class.
     * d : dirty?
     * u : unzeroed?
     * m : decommitted?
     * l : large?
     * a : allocated?
     *
     * Following are example bit patterns for the three types of runs.
     * p : run page offset
     * s : run size
     * n : binind for size class; large objects set these to BININD_INVALID
     * x : don't care
     * - : 0
     * + : 1
     * [DUMLA] : bit set
     * [dumla] : bit unset
     *
     *   Unallocated (clean):
     *     ssssssss ssssssss sss+++++ +++dum-a   (bits 13+ hold the run size; bits 12-5 are all 1; bit 1 is 0: not large)
     *     xxxxxxxx xxxxxxxx xxxxxxxx xxx-Uxxx   (bit 4 is 0: not dirty; bit 3 is U: unzeroed; other bits don't care)
     *     ssssssss ssssssss sss+++++ +++dUm-a   (as above, with bit 3 set: unzeroed)
     *
     *   Unallocated (dirty):
     *     ssssssss ssssssss sss+++++ +++D-m-a   (bits 13+ hold the run size; bits 12-5 are all 1; bit 4 is 1: Dirty; bit 3 is 0: zeroed; bit 1 is 0: not large)
     *     xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx
     *     ssssssss ssssssss sss+++++ +++D-m-a
     *
     *   Small:
     *     pppppppp pppppppp pppnnnnn nnnd---A   (bits 13+ hold the run page offset; bits 12-5 the bin index; not dirty; allocated)
     *     pppppppp pppppppp pppnnnnn nnn----A
     *     pppppppp pppppppp pppnnnnn nnnd---A
     *
     *   Large:
     *     ssssssss ssssssss sss+++++ +++D--LA   (bits 13+ hold the run size; bits 12-5 are all 1; Dirty, Large, Allocated)
     *     xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx
     *     -------- -------- ---+++++ +++D--LA
     *
     *   Large (sampled, size <= LARGE_MINCLASS):
     *     ssssssss ssssssss sssnnnnn nnnD--LA
     *     xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx
     *     -------- -------- ---+++++ +++D--LA
     *
     *   Large (not sampled, size == LARGE_MINCLASS):
     *     ssssssss ssssssss sss+++++ +++D--LA
     *     xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx
     *     -------- -------- ---+++++ +++D--LA
     */
    size_t                bits;
#define    CHUNK_MAP_ALLOCATED    ((size_t)0x01U)
#define    CHUNK_MAP_LARGE        ((size_t)0x02U)
#define    CHUNK_MAP_STATE_MASK    ((size_t)0x3U)

#define    CHUNK_MAP_DECOMMITTED    ((size_t)0x04U)
#define    CHUNK_MAP_UNZEROED    ((size_t)0x08U)
#define    CHUNK_MAP_DIRTY        ((size_t)0x10U)
#define    CHUNK_MAP_FLAGS_MASK    ((size_t)0x1cU)

#define    CHUNK_MAP_BININD_SHIFT    5
#define    BININD_INVALID        ((size_t)0xffU)
#define    CHUNK_MAP_BININD_MASK    (BININD_INVALID << CHUNK_MAP_BININD_SHIFT)
#define    CHUNK_MAP_BININD_INVALID CHUNK_MAP_BININD_MASK

#define    CHUNK_MAP_RUNIND_SHIFT    (CHUNK_MAP_BININD_SHIFT + 8)
#define    CHUNK_MAP_SIZE_SHIFT    (CHUNK_MAP_RUNIND_SHIFT - LG_PAGE)
#define    CHUNK_MAP_SIZE_MASK                        \
    (~(CHUNK_MAP_BININD_MASK | CHUNK_MAP_FLAGS_MASK | CHUNK_MAP_STATE_MASK))
};
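To make the layout concrete, here is a small self-contained decoder for a mapbits word, using the masks defined above. The sample word and the helper code are mine, assuming the 13-bit RUNIND shift shown:

#include <stddef.h>
#include <stdio.h>

#define CHUNK_MAP_ALLOCATED    ((size_t)0x01U)
#define CHUNK_MAP_LARGE        ((size_t)0x02U)
#define CHUNK_MAP_DIRTY        ((size_t)0x10U)
#define CHUNK_MAP_BININD_SHIFT 5
#define BININD_INVALID         ((size_t)0xffU)
#define CHUNK_MAP_BININD_MASK  (BININD_INVALID << CHUNK_MAP_BININD_SHIFT)
#define CHUNK_MAP_RUNIND_SHIFT (CHUNK_MAP_BININD_SHIFT + 8)

int main(void) {
    /* A hypothetical "Small" mapbits word: run page offset 3, binind 7,
     * allocated, not large, not dirty. */
    size_t bits = ((size_t)3 << CHUNK_MAP_RUNIND_SHIFT) |
        ((size_t)7 << CHUNK_MAP_BININD_SHIFT) | CHUNK_MAP_ALLOCATED;

    printf("allocated=%d large=%d dirty=%d binind=%zu pageoff=%zu\n",
        !!(bits & CHUNK_MAP_ALLOCATED), !!(bits & CHUNK_MAP_LARGE),
        !!(bits & CHUNK_MAP_DIRTY),
        (bits & CHUNK_MAP_BININD_MASK) >> CHUNK_MAP_BININD_SHIFT,
        bits >> CHUNK_MAP_RUNIND_SHIFT);
    return (0);
}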

The third type: arena_run_t
struct arena_run_s {
    /* Index of bin this run is associated with. */
    szind_t        binind;

    /* Number of free regions in run. */
    unsigned    nfree;

    /* Per region allocated/deallocated bitmap. */
    bitmap_t    bitmap[BITMAP_GROUPS_MAX];    /* BITMAP_GROUPS_MAX == 9; detailed in the malloc section */
};
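The bitmap is what lets a run hand out regions cheaply: find the lowest free bit, clear it. jemalloc's real bitmap_t is a multi-level structure; the flat, single-word sketch below (my own simplification) shows the same allocation idea:

#include <stdint.h>
#include <stdio.h>

/* Flat stand-in for a run that tracks up to 64 regions; a 1 bit = free. */
typedef struct {
    uint64_t bitmap;
    unsigned nfree;
} mini_run_t;

static void run_init(mini_run_t *r, unsigned nregs) {
    r->bitmap = (nregs == 64) ? ~0ULL : ((1ULL << nregs) - 1);
    r->nfree = nregs;
}

/* Allocate the lowest free region, in the spirit of arena_run_reg_alloc.
 * __builtin_ctzll is a gcc/clang builtin (count trailing zeros). */
static int run_reg_alloc(mini_run_t *r) {
    if (r->nfree == 0)
        return (-1);
    int regind = __builtin_ctzll(r->bitmap);
    r->bitmap &= ~(1ULL << regind);
    r->nfree--;
    return (regind);
}

int main(void) {
    mini_run_t run;
    run_init(&run, 8);
    int r0 = run_reg_alloc(&run);
    int r1 = run_reg_alloc(&run);
    printf("regions %d and %d allocated, nfree=%u\n", r0, r1, run.nfree);
    return (0);
}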

The fourth type: arena_chunk_t
/* Arena chunk header. */
struct arena_chunk_s {
    /*
     * A pointer to the arena that owns the chunk is stored within the node.
     * This field as a whole is used by chunks_rtree to support both
     * ivsalloc() and core-based debugging.
     */
    extent_node_t        node;

    /*
     * Map of pages within chunk that keeps track of free/large/small.  The
     * first map_bias entries are omitted, since the chunk header does not
     * need to be tracked in the map.  This omission saves a header page
     * for common chunk sizes (e.g. 4 MiB).
     */ /* note: a 2 MiB chunk spans many pages, and each page gets one map_bits entry */
    arena_chunk_map_bits_t    map_bits[1]; /* Dynamically sized. */
};

The fifth type: extent_node_t, which describes one extent, i.e. one contiguous region of memory.
/* Tree of extents.  Use accessor functions for en_* fields. */
struct extent_node_s {
    /* Arena from which this extent came, if any. */
    arena_t            *en_arena;

    /* Pointer to the extent that this tree node is responsible for. */
    void            *en_addr;

    /* Total region size. */
    size_t            en_size;

    /*
     * The zeroed flag is used by chunk recycling code to track whether
     * memory is zero-filled.
     */
    bool            en_zeroed;

    /*
     * True if physical memory is committed to the extent, whether
     * explicitly or implicitly as on a system that overcommits and
     * satisfies physical memory needs on demand via soft page faults.
     */
    bool            en_committed;

    /*
     * The achunk flag is used to validate that huge allocation lookups
     * don't return arena chunks.
     */
    bool            en_achunk;

    /* Profile counters, used for huge objects. */
    prof_tctx_t        *en_prof_tctx;

    /* Linkage for arena's runs_dirty and chunks_cache rings. */
    arena_runs_dirty_link_t    rd;    /* links into the arena's runs_dirty ring */
    qr(extent_node_t)    cc_link;    /* links this node to others in chunks_cache */

    union {
        /* Linkage for the size/address-ordered tree. */
        rb_node(extent_node_t)    szad_link;

        /* Linkage for arena's huge and node_cache lists. */
        ql_elm(extent_node_t)    ql_link;
    };

    /* Linkage for the address-ordered tree. */
    rb_node(extent_node_t)    ad_link;    /* arena_s holds four extent_tree_t trees built from these links */
};
typedef rb_tree(extent_node_t) extent_tree_t;

tcache_t objects live on the arena->tcache_ql list.
struct tcache_s {
    ql_elm(tcache_t) link;        /* Used for aggregating stats. */
    uint64_t    prof_accumbytes;/* Cleared after arena_prof_accum(). */
    unsigned    ev_cnt;        /* Event count since incremental GC. */
    szind_t        next_gc_bin;    /* Next bin to GC. */
    tcache_bin_t    tbins[1];    /* Dynamically sized. */
    /*
     * The pointer stacks associated with tbins follow as a contiguous
     * array.  During tcache initialization, the avail pointer in each
     * element of tbins is initialized to point to the proper offset within
     * this array.
     */
};
struct tcache_bin_s {
    tcache_bin_stats_t tstats;
    int        low_water;    /* Min # cached since last GC. */
    unsigned    lg_fill_div;    /* Fill (ncached_max >> lg_fill_div). */
    unsigned    ncached;    /* # of cached objects. */
    void        **avail;    /* Stack of available objects. */
};
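What the avail stack buys: allocation from a tcache bin is a lock-free pointer pop. Below is a sketch of the fast path, condensed from what tcache_alloc_easy does; this is my simplification, not the verbatim source:

#include <stdio.h>

typedef struct {
    int      low_water;  /* min # cached since last GC */
    unsigned ncached;    /* # of cached objects */
    void   **avail;      /* stack of available objects */
} mini_tbin_t;

/* Pop one cached object; track low_water for the incremental-GC heuristic. */
static void *tbin_alloc_easy(mini_tbin_t *tbin) {
    if (tbin->ncached == 0)
        return (NULL);  /* slow path: refill the bin from the arena */
    tbin->ncached--;
    if ((int)tbin->ncached < tbin->low_water)
        tbin->low_water = (int)tbin->ncached;
    return (tbin->avail[tbin->ncached]);
}

int main(void) {
    int objs[3];
    void *stack[3] = { &objs[0], &objs[1], &objs[2] };
    mini_tbin_t tbin = { 3, 3, stack };

    while (tbin.ncached != 0) {
        void *p = tbin_alloc_easy(&tbin);
        printf("popped %p (low_water=%d)\n", p, tbin.low_water);
    }
    return (0);
}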

Allocation is served from the tcache; if the thread does not yet have one, a tcache is created first.
Sizing a new tcache:
size = offsetof(tcache_t, tbins) + (sizeof(tcache_bin_t) * nhbins);
This allocates one tcache_t whose tbins array has nhbins == 44 entries:
size = 32 + 32 * 44 = 1440
stack_offset = size;
size += stack_nelms * sizeof(void *);  /* 3696 * 8 = 29568; 1440 + 29568 = 31008, rounded up to the 32768 size class */
The avail field of each tcache_bin_t points into a shared void* array; the avail stacks differ in length from bin to bin.
Broadly (though not strictly), the larger the bin index, the smaller nregs; nregs is initialized in bin_info_init.
for (i = 0; i < NBINS; i++) {
    if ((arena_bin_info[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MIN /* 20 */)
        tcache_bin_info[i].ncached_max = TCACHE_NSLOTS_SMALL_MIN;       /* bins 19, 23, 25, 27-38 */
    else if ((arena_bin_info[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MAX /* 200 */)
        tcache_bin_info[i].ncached_max = arena_bin_info[i].nregs << 1;  /* bins 7, 11, 13, 15-18, 20-22, 24, 26 */
    else
        tcache_bin_info[i].ncached_max = TCACHE_NSLOTS_SMALL_MAX;       /* bins 0-6, 8-10, 12, 14 */

    stack_nelms += tcache_bin_info[i].ncached_max;
}
The bins the loop above did not cover (nhbins is 44, NBINS is 39) are the large bins:
for (; i < nhbins; i++) {
    tcache_bin_info[i].ncached_max = TCACHE_NSLOTS_LARGE;
    stack_nelms += tcache_bin_info[i].ncached_max;
}
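With ncached_max settled for all 44 bins, the sizing arithmetic can be replayed end to end. A worked check of the numbers above, assuming the values the author observed (32-byte header, 32-byte tcache_bin_t, nhbins == 44, stack_nelms == 3696):

#include <stdio.h>

int main(void) {
    const unsigned nhbins = 44;        /* 39 small bins + 5 large bins */
    const size_t tcache_hdr = 32;      /* offsetof(tcache_t, tbins), per the text */
    const size_t tbin_sz = 32;         /* sizeof(tcache_bin_t), per the text */
    const size_t stack_nelms = 3696;   /* sum of ncached_max over all 44 bins */

    size_t size = tcache_hdr + tbin_sz * nhbins;  /* 32 + 32*44 = 1440 */
    size_t stack_offset = size;
    size += stack_nelms * sizeof(void *);         /* + 3696*8 = 31008 total */

    printf("stack_offset=%zu, raw size=%zu (rounded up to 32768)\n",
        stack_offset, size);
    return (0);
}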

After the tcache is created, each tbin's avail stack is initialized:
for (i = 0; i < nhbins; i++) {
    tcache->tbins[i].lg_fill_div = 1;
    tcache->tbins[i].avail = (void **)((uintptr_t)tcache +
        (uintptr_t)stack_offset);
    stack_offset += tcache_bin_info[i].ncached_max * sizeof(void *);
}
The memory layout is:
tcache_t header, tbins[0], tbins[1], ..., tbins[43], avail stack 0, avail stack 1, ..., avail stack 43

arena_malloc_large
    arena_run_t *run = arena_run_alloc_large(arena, usize + large_pad, zero);
    arena_chunk_map_misc_t *miscelm = arena_run_to_miscelm(run);
    tcache_t *tcache = (void *)((uintptr_t)arena_miscelm_to_rpages(miscelm) + random_offset);
    
    /*
     * Compute a uniformly distributed offset within the first page
     * that is a multiple of the cacheline size, e.g. [0 .. 63) * 64
     * for 4 KiB pages and 64-byte cachelines.
     */
    prng64(r, LG_PAGE - LG_CACHELINE, arena->offset_state,
        UINT64_C(6364136223846793009),
        UINT64_C(1442695040888963409));
        /* Within the first page, compute a uniformly distributed, cacheline-aligned offset: */
        r = arena->offset_state * 6364136223846793009 + 1442695040888963409
        arena->offset_state = r
        r >>= (64 - lg_range)
            /* lg_range = LG_PAGE - LG_CACHELINE = 12 - 6 = 6, so r is in [0, 64) */
    random_offset = ((uintptr_t)r) << LG_CACHELINE;
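The same LCG step as a runnable function. This is a condensed re-statement of the prng64() expansion above, with assumed LG_PAGE = 12 and LG_CACHELINE = 6:

#include <stdint.h>
#include <stdio.h>

#define LG_PAGE      12  /* 4 KiB pages (assumed) */
#define LG_CACHELINE 6   /* 64-byte cachelines (assumed) */

/* One step of the LCG behind prng64(), keeping the top lg_range bits
 * (the high bits of an LCG are the best-mixed bits). */
static uint64_t prng64_step(uint64_t *state, unsigned lg_range) {
    uint64_t r = *state * UINT64_C(6364136223846793009) +
        UINT64_C(1442695040888963409);
    *state = r;
    return (r >> (64 - lg_range));
}

int main(void) {
    uint64_t offset_state = 42;  /* arbitrary seed; each arena keeps its own */
    for (int i = 0; i < 4; i++) {
        uint64_t r = prng64_step(&offset_state, LG_PAGE - LG_CACHELINE);
        uintptr_t random_offset = (uintptr_t)r << LG_CACHELINE;
        /* r is in [0, 64), so random_offset is a multiple of 64 in [0, 4096). */
        printf("r=%2llu random_offset=%#lx\n", (unsigned long long)r,
            (unsigned long)random_offset);
    }
    return (0);
}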
    
    Since run is a field of arena_chunk_map_misc_t, the address of the enclosing miscelm can be recovered from the run pointer; by the structure's definition, one page corresponds to one miscelm.
    The arena_miscelm_to_rpages(arena_chunk_map_misc_t *miscelm) function:
        1. Mask off the low bits of miscelm to obtain the chunk address (2 MiB aligned).
        2. arena_miscelm_to_pageind(miscelm)
            arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(miscelm);
            size_t pageind =
                ((uintptr_t)miscelm - ((uintptr_t)chunk + map_misc_offset))
                / sizeof(arena_chunk_map_misc_t)
                + map_bias;
                
                map_misc_offset = 4096: page 1 of the chunk (counting from 0) holds the miscelm array, and element i of that array corresponds to page (i + map_bias).
        3. The final address is ((void *)((uintptr_t)chunk + (pageind << LG_PAGE))), i.e. the start of that page.
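The pointer arithmetic end to end, using the constants from this walkthrough (map_misc_offset = 4096, map_bias = 13, 96-byte miscelms) and a made-up chunk address:

#include <stdint.h>
#include <stdio.h>

#define LG_PAGE          12
#define CHUNK_SIZE       ((uintptr_t)2 << 20)            /* 2 MiB chunks */
#define CHUNK_ADDR2BASE(a) ((uintptr_t)(a) & ~(CHUNK_SIZE - 1))

/* Values from the text above: */
#define MAP_MISC_OFFSET  4096
#define MAP_BIAS         13
#define MISCELM_SIZE     96   /* sizeof(arena_chunk_map_misc_t), per the text */

int main(void) {
    uintptr_t chunk = 0x40200000;  /* hypothetical 2 MiB aligned chunk */
    /* Address of miscelm #5, which manages page 5 + MAP_BIAS = 18: */
    uintptr_t miscelm = chunk + MAP_MISC_OFFSET + 5 * MISCELM_SIZE;

    uintptr_t base = CHUNK_ADDR2BASE(miscelm);           /* recovers chunk */
    size_t pageind = (miscelm - (base + MAP_MISC_OFFSET)) / MISCELM_SIZE
        + MAP_BIAS;
    void *rpages = (void *)(base + (pageind << LG_PAGE));

    printf("chunk=%#lx pageind=%zu rpages=%p\n",
        (unsigned long)base, pageind, rpages);
    return (0);
}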
        
        Why is map_bias 13? It follows from sizeof(arena_chunk_map_misc_t), which is 96, and the fact that a 2 MiB chunk has 512 pages, each of which needs an arena_chunk_map_misc_t.
        499 * 96 = 47904 bytes, i.e. 47904 / 4096 ≈ 11.7 pages, so managing ~500 pages costs about 12 pages of metadata; page 0 must additionally hold arena_chunk_s itself with its trailing map_bits array.
        arena_boot computes this precisely:
        /*
         * Compute the header size such that it is large enough to contain the
         * page map.  The page map is biased to omit entries for the header
         * itself, so some iteration is necessary to compute the map bias.
         *
         * 1) Compute safe header_size and map_bias values that include enough
         *    space for an unbiased page map.  (I.e. how many pages of metadata
         *    -- map_bits plus miscelm entries -- would it take to cover all
         *    chunk_npages pages?)
         * 2) Refine map_bias based on (1) to omit the header pages in the page
         *    map.  The resulting map_bias may be one too small.  (There are
         *    only chunk_npages pages in total, so not all of them can be
         *    managed; try covering just (chunk_npages - map_bias) pages.)
         * 3) Refine map_bias based on (2).  The result will be >= the result
         *    from (2), and will always be correct.  (In practice the second
         *    pass already fits; this pass recomputes header_size and confirms
         *    the final map_bias.)
         */
        map_bias = 0;
        for (i = 0; i < 3; i++) {
            size_t header_size = offsetof(arena_chunk_t, map_bits) +
                ((sizeof(arena_chunk_map_bits_t) + sizeof(arena_chunk_map_misc_t)) * (chunk_npages-map_bias));
            map_bias = (header_size + PAGE_MASK) >> LG_PAGE;
        }
        Pass 1: header_size = 53352, map_bias = 14
        Pass 2: header_size = 51896, map_bias = 13
        Pass 3: header_size = 52000, map_bias = 13
        map_misc_offset = offsetof(arena_chunk_t, map_bits) + sizeof(arena_chunk_map_bits_t) * (chunk_npages-map_bias);
            ((size_t)&(((arena_chunk_t *)0)->map_bits)) = 104    /* offsetof(arena_chunk_t, map_bits) */
            sizeof(arena_chunk_map_bits_t) * (chunk_npages - map_bias) = 8 * 499 = 3992
            104 + 3992 = 4096 exactly: no alignment fixup needed at all.
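The whole computation as a standalone program, using the sizes reported above (offsetof = 104, 8-byte map_bits entries, 96-byte miscelms); running it reproduces map_bias 14 -> 13 -> 13 and map_misc_offset == 4096:

#include <stdio.h>

#define LG_PAGE      12
#define PAGE_MASK    ((size_t)((1 << LG_PAGE) - 1))
#define CHUNK_NPAGES 512   /* 2 MiB chunk / 4 KiB pages */

/* Sizes reported in the text for this build: */
#define MAP_BITS_OFFSET 104  /* offsetof(arena_chunk_t, map_bits) */
#define BITS_SIZE       8    /* sizeof(arena_chunk_map_bits_t) */
#define MISC_SIZE       96   /* sizeof(arena_chunk_map_misc_t) */

int main(void) {
    size_t map_bias = 0;
    for (int i = 0; i < 3; i++) {
        size_t header_size = MAP_BITS_OFFSET +
            (BITS_SIZE + MISC_SIZE) * (CHUNK_NPAGES - map_bias);
        map_bias = (header_size + PAGE_MASK) >> LG_PAGE;  /* round up */
        printf("pass %d: header_size=%zu map_bias=%zu\n",
            i + 1, header_size, map_bias);
    }
    size_t map_misc_offset = MAP_BITS_OFFSET +
        BITS_SIZE * (CHUNK_NPAGES - map_bias);
    printf("map_misc_offset=%zu\n", map_misc_offset);  /* 104 + 3992 = 4096 */
    return (0);
}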
        
    tcache address = start of that page + random_offset
    
Note that while one miscelm corresponds to one page, the run it heads may span more than one page.

chunk = arena_chunk_alloc(arena);
if (chunk != NULL) {
    run = &arena_miscelm_get(chunk, map_bias)->run;
    if (arena_run_split_large(arena, run, size, zero))
        run = NULL;
    return (run);
}
    arena_chunk_alloc(arena) returns an arena_chunk_t pointer whose address is 2 MiB aligned.
    The first usable page of a fresh chunk is page (0 + map_bias), so we take that page's miscelm and return its run field; this is the arena_run_t used above.

chunk = arena_chunk_init_hard(arena);
    chunk = arena_chunk_alloc_internal(arena, &zero, &commit);
        chunk = chunk_alloc_cache(arena, &chunk_hooks, NULL, chunksize, chunksize, zero, true);
            chunk_recycle(...&arena->chunks_szad_cached,&arena->chunks_ad_cached...)
                try recycling first; only on failure allocate for real. When recycling, commit = true; in the other cases zero and commit both start out false
        chunk = arena_chunk_alloc_internal_hard(arena, &chunk_hooks, zero, commit);
            chunk = (arena_chunk_t *)chunk_alloc_wrapper(arena, chunk_hooks, NULL, chunksize, chunksize, zero, commit);
                chunk_hooks->alloc(new_addr, size, alignment, zero, commit, arena->ind);
                    uses chunk_alloc_default, the callback installed in chunk_hooks_default
                    arena = chunk_arena_get(arena_ind);
                    ret = chunk_alloc_core(arena, new_addr, size, alignment, zero, commit, arena->dss_prec);
                        ret = chunk_recycle(arena, &chunk_hooks, &arena->chunks_szad_retained, &arena->chunks_ad_retained...
                            try recycling first; on failure, allocate for real
                        ret = chunk_alloc_mmap(size, alignment, zero, commit)
                            allocate with mmap: map directly first and check whether the result is 2 MiB aligned; if not, unmap it and
                            re-map a (4 MiB - 4 KiB) region, inside which a 2 MiB aligned contiguous span can always be found
            arena_chunk_register(arena, chunk, *zero)
                extent_node_init(&chunk->node, arena, chunk, chunksize, zero, true);
                    fill in the fields of the chunk's node: the owning arena (memory created at bootstrap), the node address (the chunk address itself),
                    the size (2 MiB), zeroed = false, committed = true
                extent_node_achunk_set(&chunk->node, true);    /* achunk = true */
                chunk_register(chunk, &chunk->node);
                    rtree_set(&chunks_rtree, (uintptr_t)chunk, node)
            
    arena_mapbits_unallocated_set(chunk, map_bias, arena_maxrun, flag_unzeroed | flag_decommitted);
        marks page 13 (counting from 0) with size arena_maxrun = 0x1f3000 (1996 KiB = 0x1f3 = 499 pages); a 2 MiB chunk has 0x200 = 512 pages, so 13 pages are ceded to the header
    arena_mapbits_unallocated_set(chunk, chunk_npages-1, arena_maxrun, flag_unzeroed);
        marks page 511 with size arena_maxrun as well
arena_avail_insert(arena, chunk, map_bias, chunk_npages-map_bias);    /* insert one run covering pages [map_bias, chunk_npages) of the chunk */
    arena_avail_tree_insert(&arena->runs_avail, arena_miscelm_get(chunk, pageind));    /* what actually gets inserted is the miscelm */
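The alignment dance inside chunk_alloc_mmap above is worth a concrete sketch. This is my reconstruction of the strategy described there (map optimistically; if the address is not chunk-aligned, over-map by alignment - page and trim), not jemalloc's exact code, and it assumes a Linux-style mmap:

#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>

#define CHUNK_SIZE  ((size_t)2 << 20)   /* 2 MiB */
#define PAGE        ((size_t)4096)

/* Map `size` bytes aligned to `alignment` (both powers of two >= PAGE). */
static void *chunk_mmap_aligned(size_t size, size_t alignment) {
    void *p = mmap(NULL, size, PROT_READ | PROT_WRITE,
        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED)
        return (NULL);
    if (((uintptr_t)p & (alignment - 1)) == 0)
        return (p);                      /* lucky: already aligned */

    munmap(p, size);
    /* Oversize by (alignment - PAGE): such a span must contain an aligned
     * start, which is why 4 MiB - 4 KiB suffices for a 2 MiB chunk. */
    size_t alloc_size = size + alignment - PAGE;
    p = mmap(NULL, alloc_size, PROT_READ | PROT_WRITE,
        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED)
        return (NULL);

    uintptr_t addr = (uintptr_t)p;
    uintptr_t aligned = (addr + alignment - 1) & ~(uintptr_t)(alignment - 1);
    if (aligned != addr)                 /* trim leading slack */
        munmap(p, aligned - addr);
    size_t trail = (addr + alloc_size) - (aligned + size);
    if (trail != 0)                      /* trim trailing slack */
        munmap((void *)(aligned + size), trail);
    return ((void *)aligned);
}

int main(void) {
    void *chunk = chunk_mmap_aligned(CHUNK_SIZE, CHUNK_SIZE);
    printf("chunk at %p (2 MiB aligned: %s)\n", chunk,
        ((uintptr_t)chunk & (CHUNK_SIZE - 1)) ? "no" : "yes");
    return (0);
}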

rtree and arena_run_split_large(arena, run, size, zero) will be covered in a later part.
