Python中对于字典的实现是根据key进行hash生成散列表,算法为“开放定址法”
1.PyDictEntry(K, V对)
字典中每一个kv对,实际上就是一个entry对象
entry的状态存在3种状态 Active, Unused, Dummy
其含义显而易见,值得注意的是Dummy状态,该状态实际上是因为散列表的“开放定址法”缘故,某entry恰巧在碰撞链中时,它不可删除,以免找不到真正的key而直接返回查询失败。
[dictobject.h]
typedef struct {
Py_ssize_t me_hash; /* cached hash code of me_key */
PyObject *me_key;
PyObject *me_value;
} PyDictEntry;
2. PyDictObject对象结构(关联容器dict)
由源码可知,该对象是一个定长对象,初始时会分配一个8个entry的数组ma_smalltable
#define PyDict_MINSIZE 8
typedef struct _dictobject PyDictObject;
struct _dictobject {
PyObject_HEAD
Py_ssize_t ma_fill; //entry个数: Active + Dummy
Py_ssize_t ma_used; //entry个数: Active
Py_ssize_t ma_mask; //ma_table能容纳元素数,搜索时用以对hash值做与操作
PyDictEntry *ma_table; //entry超过8个时会分配较大数组,指针指向该数组
PyDictEntry *(*ma_lookup)(PyDictObject *mp, PyObject *key, long hash);//查询函数
PyDictEntry ma_smalltable[PyDict_MINSIZE];//默认存在的小entry数组避免频繁分配内存
};
3.PyDictObject对象的创建
typedef PyDictEntry dictentry;
typedef PyDictObject dictobject;
/* 将ma_table指向ma_smalltable 并初始化ma_mask */
#define INIT_NONZERO_DICT_SLOTS(mp) do { \
(mp)->ma_table = (mp)->ma_smalltable; \
(mp)->ma_mask = PyDict_MINSIZE - 1; \
} while(0)
/* 将ma_smalltable清零,重置ma_used和ma_fill并调用INIT_NONZERO_DICT_SLOTS */
#define EMPTY_TO_MINSIZE(mp) do { \
memset((mp)->ma_smalltable, 0, sizeof((mp)->ma_smalltable)); \
(mp)->ma_used = (mp)->ma_fill = 0; \
INIT_NONZERO_DICT_SLOTS(mp); \
} while(0)
PyObject *
PyDict_New(void)
{
register PyDictObject *mp;
if (dummy == NULL) { /* Auto-initialize dummy */
dummy = PyString_FromString("");
if (dummy == NULL)
return NULL;
#ifdef SHOW_CONVERSION_COUNTS
Py_AtExit(show_counts);
#endif
#ifdef SHOW_ALLOC_COUNT
Py_AtExit(show_alloc);
#endif
#ifdef SHOW_TRACK_COUNT
Py_AtExit(show_track);
#endif
}
if (numfree) { /* 判断dict缓冲池是否可用 */
mp = free_list[--numfree];
assert (mp != NULL);
assert (Py_TYPE(mp) == &PyDict_Type);
_Py_NewReference((PyObject *)mp);
if (mp->ma_fill) { /* 检查ma_fill判断是否需要EMPTY_TO_MINSIZE */
EMPTY_TO_MINSIZE(mp);
} else {
/* 否则至少需要进行INIT_NONZERO_DICT_SLOTS操作 */
INIT_NONZERO_DICT_SLOTS(mp);
}
assert (mp->ma_used == 0);
assert (mp->ma_table == mp->ma_smalltable);
assert (mp->ma_mask == PyDict_MINSIZE - 1);
#ifdef SHOW_ALLOC_COUNT
count_reuse++;
#endif
/* 缓冲池不可用时,进行内存分配操作 */
} else {
mp = PyObject_GC_New(PyDictObject, &PyDict_Type);
if (mp == NULL)
return NULL;
EMPTY_TO_MINSIZE(mp);
#ifdef SHOW_ALLOC_COUNT
count_alloc++;
#endif
}
mp->ma_lookup = lookdict_string;
#ifdef SHOW_TRACK_COUNT
count_untracked++;
#endif
#ifdef SHOW_CONVERSION_COUNTS
++created;
#endif
return (PyObject *)mp;
}
4.entry的搜索
大致流程如下
首先寻找第一个entry:
- 通过hash & mask获取索引i,在ma_table[i]处取出该entry对象
- 根据该entry的key产生2种可能:
- ep->key==NULL 搜索失败返回该entry
- ep->key == key 搜索成功返回该entry
- 如果me_key == dummy,令freeslot = ep
- 检查active态的entry,判断hash是否相同,若相同则继续比较key值是否相同
- 失败的话继续寻找下一个散列位置,这样迭代下去
static PyDictEntry *
lookdict(PyDictObject *mp, PyObject *key, register long hash)
{
register size_t i;
register size_t perturb;
register PyDictEntry *freeslot;
register size_t mask = (size_t)mp->ma_mask;
PyDictEntry *ep0 = mp->ma_table;
register PyDictEntry *ep;
register int cmp;
PyObject *startkey;
i = (size_t)hash & mask; //与运算防止溢出
ep = &ep0[i];
if (ep->me_key == NULL || ep->me_key == key)//搜索成功(引用相同)或搜索失败(unused)
return ep;
if (ep->me_key == dummy) //key为dummy,置位freeslot,便于服用内存
freeslot = ep;
else {
/* active态的查询 */
if (ep->me_hash == hash) {//先判断hash
startkey = ep->me_key;
Py_INCREF(startkey);
cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);//再判断key值是否相同
Py_DECREF(startkey);
if (cmp < 0)
return NULL;
if (ep0 == mp->ma_table && ep->me_key == startkey) {
if (cmp > 0)
return ep;
}
else {
/* The compare did major nasty stuff to the
* dict: start over.
* XXX A clever adversary could prevent this
* XXX from terminating.
*/
return lookdict(mp, key, hash);
}
}
freeslot = NULL;
}
/* In the loop, me_key == dummy is by far (factor of 100s) the
least likely outcome, so test for that last. */
for (perturb = hash; ; perturb >>= PERTURB_SHIFT) {
i = (i << 2) + i + perturb + 1;//重定位下一位置
ep = &ep0[i & mask];
if (ep->me_key == NULL)
return freeslot == NULL ? ep : freeslot;
if (ep->me_key == key)
return ep;
if (ep->me_hash == hash && ep->me_key != dummy) {
startkey = ep->me_key;
Py_INCREF(startkey);
cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
Py_DECREF(startkey);
if (cmp < 0)
return NULL;
if (ep0 == mp->ma_table && ep->me_key == startkey) {
if (cmp > 0)
return ep;
}
else {
/* The compare did major nasty stuff to the
* dict: start over.
* XXX A clever adversary could prevent this
* XXX from terminating.
*/
return lookdict(mp, key, hash);
}
}
else if (ep->me_key == dummy && freeslot == NULL)
freeslot = ep;
}
assert(0); /* NOT REACHED */
return 0;
}
lookdict_string默认搜索方法
[dictobject.c]
static dictentry* lookdict_string(dictobject *mp, PyObject *key, register long hash)
{
register int i;
register unsigned int perturb;
register dictentry *freeslot;
register unsigned int mask = mp->ma_mask;
dictentry *ep0 = mp->ma_table;
register dictentry *ep;
if (!PyString_CheckExact(key)) { //判断key是否为string类型,若非则返回传统搜索方式
mp->ma_lookup = lookdict;
return lookdict(mp, key, hash);
}
i = hash & mask;
ep = &ep0[i];
if (ep->me_key == NULL || ep->me_key == key)
return ep;
if (ep->me_key == dummy)
freeslot = ep;
else
{
//string默认策略主要不同在此处,判断函数较为轻量
if (ep->me_hash == hash && _PyString_Eq(ep->me_key, key))
{
return ep;
}
freeslot = NULL;
}
//搜索第二阶段:遍历冲突链,检查每一个entry
for (perturb = hash; ; perturb >>= PERTURB_SHIFT)
{
i = (i << 2) + i + perturb + 1;
ep = &ep0[i & mask];
if (ep->me_key == NULL)
return freeslot == NULL ? ep : freeslot;
if (ep->me_key == key
|| (ep->me_hash == hash && ep->me_key != dummy &&
_PyString_Eq(ep->me_key, key)))
return ep;
if (ep->me_key == dummy && freeslot == NULL)
freeslot = ep;
}
}
5.元素插入
insertdict函数:
该函数关心ma_lookup返回的对象类型,决定插入的策略
[dictobject.c]
static void
insertdict(register dictobject *mp, PyObject *key, long hash, PyObject *value)
{
PyObject *old_value;
register dictentry *ep;
ep = mp->ma_lookup(mp, key, hash);
/*搜索成功*/
if (ep->me_value != NULL) {
old_value = ep->me_value;
ep->me_value = value;
Py_DECREF(old_value);
Py_DECREF(key);
}
/* 搜索失败,返回的值可能是unused或dummy*/
else {
if (ep->me_key == NULL) //为unused时ma_fill++
mp->ma_fill++;
else //否则为dummy
Py_DECREF(ep->me_key);
ep->me_key = key;
ep->me_hash = hash;
ep->me_value = value;
mp->ma_used++;
}
}
PyDict_SetItem函数:
insertdict函数被该函数调用,该函数主要关心取得hash值
[dictobject.c]
int PyDict_SetItem(register PyObject *op, PyObject *key, PyObject *value)
{
register dictobject *mp;
register long hash;
register Py_ssize_t n_used;
mp = (dictobject *)op;
//[1]:计算hash值
if (PyString_CheckExact(key)) {
hash = ((PyStringObject *)key)->ob_shash;
if (hash == -1)
hash = PyObject_Hash(key);
}
else {
hash = PyObject_Hash(key);
if (hash == -1)
return -1;
}
//[2]:插入(key, value)元素对
n_used = mp->ma_used;
insertdict(mp, key, hash, value);
//[3]:必要时调整dict的内存空间,实际上为判断装填率是否大于2/3
if (!(mp->ma_used > n_used && mp->ma_fill*3 >= (mp->ma_mask+1)*2))
return 0;
return dictresize(mp, mp->ma_used * (mp->ma_used>50000 ? 2 : 4));
}
dictresize函数
该函数关心字典列表的调整,根据需求使用ma_smalltable或重新分配新的空间,旧空间的Acttive entry依次插入新空间中,dummy趁机释放掉
[dictobject.c]
static int dictresize(dictobject *mp, int minused)
{
Py_ssize_t newsize;
dictentry *oldtable, *newtable, *ep;
Py_ssize_t i;
int is_oldtable_malloced;
dictentry small_copy[PyDict_MINSIZE];
//[1]:确定新的table的大小
for(newsize = PyDict_MINSIZE; newsize <= minused && newsize > 0; newsize <<= 1)
;
oldtable = mp->ma_table;
is_oldtable_malloced = (oldtable != mp->ma_smalltable);
//[2]: 新的table可以使用mp->ma_smalltable
if (newsize == PyDict_MINSIZE) {
newtable = mp->ma_smalltable;
if (newtable == oldtable) {
if (mp->ma_fill == mp->ma_used) {
//没有任何Dummy态entry,直接返回
return 0;
}
//将旧table拷贝,进行备份
memcpy(small_copy, oldtable, sizeof(small_copy));
oldtable = small_copy;
}
}
//[3]: 新的table不能使用mp->ma_smalltable,需要在系统堆上申请
else {
newtable = PyMem_NEW(dictentry, newsize);
}
//[4]:设置新table
mp->ma_table = newtable;
mp->ma_mask = newsize - 1;
memset(newtable, 0, sizeof(dictentry) * newsize);
mp->ma_used = 0;
i = mp->ma_fill;
mp->ma_fill = 0;
//[5]:处理旧table中的entry:
// 1、Active态entry,搬移到新table中
// 2、Dummy态entry,调整key的引用计数,丢弃该entry
for (ep = oldtable; i > 0; ep++) {
if (ep->me_value != NULL) { /* active entry */
--i;
insertdict(mp, ep->me_key, ep->me_hash, ep->me_value);
}
else if (ep->me_key != NULL) { /* dummy entry */
--i;
assert(ep->me_key == dummy);
Py_DECREF(ep->me_key);
}
}
//[6]:必要时释放旧table所维护的内存空间
if (is_oldtable_malloced)
PyMem_DEL(oldtable);
return 0;
}
6.删除元素
[dictobject.c]
int PyDict_DelItem(PyObject *op, PyObject *key)
{
register dictobject *mp;
register long hash;
register dictentry *ep;
PyObject *old_value, *old_key;
//[1]:获得hash值
if (!PyString_CheckExact(key) ||
(hash = ((PyStringObject *) key)->ob_shash) == -1) {
hash = PyObject_Hash(key);
if (hash == -1)
return -1;
}
//[2]:搜索entry
mp = (dictobject *)op;
ep = (mp->ma_lookup)(mp, key, hash);
if (ep->me_value == NULL) { //搜索失败,entry不存在
return -1;
}
//[3]:删除entry所维护的元素,将entry的状态转为dummy态
old_key = ep->me_key;
ep->me_key = dummy;
old_value = ep->me_value;
ep->me_value = NULL;
mp->ma_used--;
Py_DECREF(old_value);
Py_DECREF(old_key);
return 0;
}
7.字典缓冲池
与列表对象相似,也是待删除该字典对象时尝试加入到缓冲区。加入前进行一系列的清理动作。
[dictobject.c]
static void dict_dealloc(register dictobject *mp)
{
register dictentry *ep;
Py_ssize_t fill = mp->ma_fill;
//[1]:调整dict中对象的引用计数
for (ep = mp->ma_table; fill > 0; ep++) {
if (ep->me_key) {
--fill;
Py_DECREF(ep->me_key);
Py_XDECREF(ep->me_value);
}
}
//[2] :释放从系统堆中申请的内存空间
if (mp->ma_table != mp->ma_smalltable)
PyMem_DEL(mp->ma_table);
//[3] :将被销毁的PyDictObject对象放入缓冲池
if (num_free_dicts < MAXFREEDICTS && mp->ob_type == &PyDict_Type)
free_dicts[num_free_dicts++] = mp;
else
mp->ob_type->tp_free((PyObject *)mp);
}