前言:
写这个文章的初衷是因为今天手写了一个字典树,然后写字典树以后忽然想到了之前看的技术文章,linux kernel 之前的pid 申请方式已经从 bitmap 变成了 基数树,所以打算写文章再回顾一下这种数据结构算法
一、内核中pid的申请
旧的内核中pid的申请就不多说了,是使用的是bitmap去实现的,新版的换为了基数树。申请pid的内核代码在:
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
size_t set_tid_size)
{
.......
}
继续看代码,申请函数为:
if (tid) {
nr = idr_alloc(&tmp->idr, NULL, tid,
tid + 1, GFP_ATOMIC);
/*
* If ENOSPC is returned it means that the PID is
* alreay in use. Return EEXIST in that case.
*/
if (nr == -ENOSPC)
nr = -EEXIST;
} else {
int pid_min = 1;
/*
* init really needs pid 1, but after reaching the
* maximum wrap back to RESERVED_PIDS
*/
if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
pid_min = RESERVED_PIDS;
/*
* Store a null pointer so find_pid_ns does not find
* a partially initialized PID (see below).
*/
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
pid_max, GFP_ATOMIC);
}
进程最大的pid号为:
int pid_max = PID_MAX_DEFAULT;
/*
* This controls the default maximum pid allocated to a process
*/
#define PID_MAX_DEFAULT (CONFIG_BASE_SMALL ? 0x1000 : 0x8000)
这里我发现了一个问题,linux 最大的pid 是 0x8000 而不是 0xFFFF,这是为了和老的内核兼容,我们可以通过
cat /proc/sys/kernel/pid_max
查询当前内核的最大pid。
申请pid 函数我们聚焦在
idr_alloc 和 idr_alloc_cyclic
再进去看 idr_alloc_cyclic 的实现
int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp)
{
u32 id = idr->idr_next;
int err, max = end > 0 ? end - 1 : INT_MAX;
if ((int)id < start)
id = start;
err = idr_alloc_u32(idr, ptr, &id, max, gfp);
if ((err == -ENOSPC) && (id > start)) {
id = start;
err = idr_alloc_u32(idr, ptr, &id, max, gfp);
}
if (err)
return err;
idr->idr_next = id + 1;
return id;
}
具体实现pid 申请的函数在:
int idr_alloc_u32(struct idr *idr, void *ptr, u32 *nextid,
unsigned long max, gfp_t gfp)
{
struct radix_tree_iter iter;
void __rcu **slot;
unsigned int base = idr->idr_base;
unsigned int id = *nextid;
if (WARN_ON_ONCE(!(idr->idr_rt.xa_flags & ROOT_IS_IDR)))
idr->idr_rt.xa_flags |= IDR_RT_MARKER;
id = (id < base) ? 0 : id - base;
radix_tree_iter_init(&iter, id);
slot = idr_get_free(&idr->idr_rt, &iter, gfp, max - base);
if (IS_ERR(slot))
return PTR_ERR(slot);
*nextid = iter.index + base;
/* there is a memory barrier inside radix_tree_iter_replace() */
radix_tree_iter_replace(&idr->idr_rt, &iter, slot, ptr);
radix_tree_iter_tag_clear(&idr->idr_rt, &iter, IDR_FREE);
return 0;
}
我们需要重点分析这个函数,重要的数据结构radix_tree_iter,我们看一下这个数据结构:
这是一个基数树的迭代器
/**
* struct radix_tree_iter - radix tree iterator state
*
* @index: index of current slot
* @next_index: one beyond the last index for this chunk
* @tags: bit-mask for tag-iterating
* @node: node that contains current slot
*
* This radix tree iterator works in terms of "chunks" of slots. A chunk is a
* subinterval of slots contained within one radix tree leaf node. It is
* described by a pointer to its first slot and a struct radix_tree_iter
* which holds the chunk's position in the tree and its size. For tagged
* iteration radix_tree_iter also holds the slots' bit-mask for one chosen
* radix tree tag.
*/
struct radix_tree_iter {
unsigned long index;
unsigned long next_index;
unsigned long tags;
struct radix_tree_node *node;
};
迭代器中有重要的数据结构
#define radix_tree_node xa_node
也就是替换后 变为了
struct xa_node*
我们再继续看 xa_node* 这个数据结构,这是基数树的节点
/*
* @count is the count of every non-NULL element in the ->slots array
* whether that is a value entry, a retry entry, a user pointer,
* a sibling entry or a pointer to the next level of the tree.
* @nr_values is the count of every element in ->slots which is
* either a value entry or a sibling of a value entry.
*/
struct xa_node {
unsigned char shift; /* Bits remaining in each slot */
unsigned char offset; /* Slot offset in parent */
unsigned char count; /* Total entry count */
unsigned char nr_values; /* Value entry count */
struct xa_node __rcu *parent; /* NULL at top of tree */
struct xarray *array; /* The array we belong to */
union {
struct list_head private_list; /* For tree user */
struct rcu_head rcu_head; /* Used when freeing node */
};
void __rcu *slots[XA_CHUNK_SIZE];
union {
unsigned long tags[XA_MAX_MARKS][XA_MARK_LONGS];
unsigned long marks[XA_MAX_MARKS][XA_MARK_LONGS];
};
};
这些宏:
#ifndef XA_CHUNK_SHIFT
#define XA_CHUNK_SHIFT (CONFIG_BASE_SMALL ? 4 : 6)
#endif
#define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT)
XA_CHUNK_SIZE 是 64 或者16.
#define XA_MAX_MARKS 3
对应的CHUNK是6个位或者 4个位
然后我们看到pid最大是0x8000, 4 *4 为 16 个 槽,这意味着基数树高最大为3((16 / 6 ) + 1)
#if defined __x86_64__ && !defined __ILP32__
# define __WORDSIZE 64
#else
# define __WORDSIZE 32
#define __WORDSIZE32_SIZE_ULONG 0
#define __WORDSIZE32_PTRDIFF_LONG 0
#endif
#ifndef BITS_PER_LONG
# define BITS_PER_LONG __WORDSIZE
#endif
BITS_PER_LONG 为64 或者32
#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
/*
* The IDR API does not expose the tagging functionality of the radix tree
* to users. Use tag 0 to track whether a node has free space below it.
*/
#define IDR_FREE 0
带入计算就是:
(64 + 64 -1) / 64 = 1
对应的宏
#define XA_MAX_MARKS 3
#define XA_MARK_LONGS DIV_ROUND_UP(XA_CHUNK_SIZE, BITS_PER_LONG)
初始化基数树的迭代器:
/**
* radix_tree_iter_init - initialize radix tree iterator
*
* @iter: pointer to iterator state
* @start: iteration starting index
* Returns: NULL
*/
static __always_inline void __rcu **
radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start)
{
/*
* Leave iter->tags uninitialized. radix_tree_next_chunk() will fill it
* in the case of a successful tagged chunk lookup. If the lookup was
* unsuccessful or non-tagged then nobody cares about ->tags.
*
* Set index to zero to bypass next_index overflow protection.
* See the comment in radix_tree_next_chunk() for details.
*/
iter->index = 0;
iter->next_index = start;
return NULL;
}
主要看idr_get_free,这个函数要看懂
shift -= RADIX_TREE_MAP_SHIFT;
shift 最大是16, 每次我们移动是6位,对应的是CHUNKSIZE
if (!tag_get(node, IDR_FREE, offset)) {
offset = radix_tree_find_next_bit(node, IDR_FREE,
offset + 1);
start = next_index(start, node, offset);
if (start > max || start == 0)
return ERR_PTR(-ENOSPC);
while (offset == RADIX_TREE_MAP_SIZE) {
offset = node->offset + 1;
node = node->parent;
if (!node)
goto grow;
shift = node->shift;
}
child = rcu_dereference_raw(node->slots[offset]);
}
每次循环如果循环到tag不是IDR_FREE,就会继续找下一个tag,直到找到是IDR_FREE的solt。
在这里退出循环
else if (!radix_tree_is_internal_node(child))
break;
最后返回空闲槽
iter->index = start;
if (node)
iter->next_index = 1 + min(max, (start | node_maxindex(node)));
else
iter->next_index = 1;
iter->node = node;
set_iter_tags(iter, node, offset, IDR_FREE);
return slot;
上面的start 就应该是要申请的pid,我们最后看next_index
/*
* The maximum index which can be stored in a radix tree
*/
static inline unsigned long shift_maxindex(unsigned int shift)
{
return (RADIX_TREE_MAP_SIZE << shift) - 1;
}
static inline unsigned long node_maxindex(const struct radix_tree_node *node)
{
return shift_maxindex(node->shift);
}
static unsigned long next_index(unsigned long index,
const struct radix_tree_node *node,
unsigned long offset)
{
return (index & ~node_maxindex(node)) + (offset << node->shift);
}
所以start 就是要申请的pid。