Python源码学习笔记 5 字典对象

Python中对于字典的实现是根据key进行hash生成散列表,算法为“开放定址法”

1.PyDictEntry(K, V对)

字典中每一个kv对,实际上就是一个entry对象
entry的状态存在3种状态 Active, Unused, Dummy
其含义显而易见,值得注意的是Dummy状态,该状态实际上是因为散列表的“开放定址法”缘故,某entry恰巧在碰撞链中时,它不可删除,以免找不到真正的key而直接返回查询失败。

[dictobject.h]
typedef struct {
    Py_ssize_t me_hash;      /* cached hash code of me_key */
    PyObject *me_key;
    PyObject *me_value;
} PyDictEntry;
Python源码学习笔记 5 字典对象_第1张图片
entry

2. PyDictObject对象结构(关联容器dict)

由源码可知,该对象是一个定长对象,初始时会分配一个8个entry的数组ma_smalltable

#define PyDict_MINSIZE 8
typedef struct _dictobject PyDictObject;
struct _dictobject {
    PyObject_HEAD
    Py_ssize_t ma_fill;  //entry个数: Active + Dummy
    Py_ssize_t ma_used;  //entry个数: Active 
    Py_ssize_t ma_mask; //ma_table能容纳元素数,搜索时用以对hash值做与操作
    PyDictEntry *ma_table; //entry超过8个时会分配较大数组,指针指向该数组
    PyDictEntry *(*ma_lookup)(PyDictObject *mp, PyObject *key, long hash);//查询函数
    PyDictEntry ma_smalltable[PyDict_MINSIZE];//默认存在的小entry数组避免频繁分配内存
};

3.PyDictObject对象的创建

typedef PyDictEntry dictentry;
typedef PyDictObject dictobject;

/* 将ma_table指向ma_smalltable 并初始化ma_mask */
#define INIT_NONZERO_DICT_SLOTS(mp) do {          \
    (mp)->ma_table = (mp)->ma_smalltable;     \
    (mp)->ma_mask = PyDict_MINSIZE - 1;       \
    } while(0)

/* 将ma_smalltable清零,重置ma_used和ma_fill并调用INIT_NONZERO_DICT_SLOTS */
#define EMPTY_TO_MINSIZE(mp) do {               \
    memset((mp)->ma_smalltable, 0, sizeof((mp)->ma_smalltable));    \
    (mp)->ma_used = (mp)->ma_fill = 0;        \
    INIT_NONZERO_DICT_SLOTS(mp);                \
    } while(0)

PyObject *
PyDict_New(void)
{
    register PyDictObject *mp;
    if (dummy == NULL) { /* Auto-initialize dummy */
        dummy = PyString_FromString("");
        if (dummy == NULL)
            return NULL;
#ifdef SHOW_CONVERSION_COUNTS
        Py_AtExit(show_counts);
#endif
#ifdef SHOW_ALLOC_COUNT
        Py_AtExit(show_alloc);
#endif
#ifdef SHOW_TRACK_COUNT
        Py_AtExit(show_track);
#endif
    }
    if (numfree) { /* 判断dict缓冲池是否可用 */
        mp = free_list[--numfree];
        assert (mp != NULL);
        assert (Py_TYPE(mp) == &PyDict_Type);
        _Py_NewReference((PyObject *)mp);
        if (mp->ma_fill) { /* 检查ma_fill判断是否需要EMPTY_TO_MINSIZE */
            EMPTY_TO_MINSIZE(mp);
        } else {
            /* 否则至少需要进行INIT_NONZERO_DICT_SLOTS操作 */
            INIT_NONZERO_DICT_SLOTS(mp);
        }
        assert (mp->ma_used == 0);
        assert (mp->ma_table == mp->ma_smalltable);
        assert (mp->ma_mask == PyDict_MINSIZE - 1);
#ifdef SHOW_ALLOC_COUNT
        count_reuse++;
#endif
    /* 缓冲池不可用时,进行内存分配操作 */
    } else { 
        mp = PyObject_GC_New(PyDictObject, &PyDict_Type);
        if (mp == NULL)
            return NULL;
        EMPTY_TO_MINSIZE(mp);
#ifdef SHOW_ALLOC_COUNT
        count_alloc++;
#endif
    }
    mp->ma_lookup = lookdict_string;
#ifdef SHOW_TRACK_COUNT
    count_untracked++;
#endif
#ifdef SHOW_CONVERSION_COUNTS
    ++created;
#endif
    return (PyObject *)mp;
}

4.entry的搜索

大致流程如下

首先寻找第一个entry:
  • 通过hash & mask获取索引i,在ma_table[i]处取出该entry对象
  • 根据该entry的key产生2种可能:
  • ep->key==NULL 搜索失败返回该entry
  • ep->key == key 搜索成功返回该entry
  • 如果me_key == dummy,令freeslot = ep
  • 检查active态的entry,判断hash是否相同,若相同则继续比较key值是否相同
  • 失败的话继续寻找下一个散列位置,这样迭代下去
static PyDictEntry *
lookdict(PyDictObject *mp, PyObject *key, register long hash)
{
    register size_t i;
    register size_t perturb;
    register PyDictEntry *freeslot;
    register size_t mask = (size_t)mp->ma_mask;
    PyDictEntry *ep0 = mp->ma_table;
    register PyDictEntry *ep;
    register int cmp;
    PyObject *startkey;

    i = (size_t)hash & mask; //与运算防止溢出
    ep = &ep0[i];
    if (ep->me_key == NULL || ep->me_key == key)//搜索成功(引用相同)或搜索失败(unused)
        return ep;

    if (ep->me_key == dummy) //key为dummy,置位freeslot,便于服用内存
        freeslot = ep;
    else {
        /* active态的查询 */
        if (ep->me_hash == hash) {//先判断hash
            startkey = ep->me_key;
            Py_INCREF(startkey);
            cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);//再判断key值是否相同
            Py_DECREF(startkey);
            if (cmp < 0)
                return NULL;
            if (ep0 == mp->ma_table && ep->me_key == startkey) {
                if (cmp > 0)
                    return ep;
            }
            else {
                /* The compare did major nasty stuff to the
                 * dict:  start over.
                 * XXX A clever adversary could prevent this
                 * XXX from terminating.
                 */
                return lookdict(mp, key, hash);
            }
        }
        freeslot = NULL;
    }

    /* In the loop, me_key == dummy is by far (factor of 100s) the
       least likely outcome, so test for that last. */
    for (perturb = hash; ; perturb >>= PERTURB_SHIFT) {
        i = (i << 2) + i + perturb + 1;//重定位下一位置
        ep = &ep0[i & mask];
        if (ep->me_key == NULL)
            return freeslot == NULL ? ep : freeslot;
        if (ep->me_key == key)
            return ep;
        if (ep->me_hash == hash && ep->me_key != dummy) {
            startkey = ep->me_key;
            Py_INCREF(startkey);
            cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
            Py_DECREF(startkey);
            if (cmp < 0)
                return NULL;
            if (ep0 == mp->ma_table && ep->me_key == startkey) {
                if (cmp > 0)
                    return ep;
            }
            else {
                /* The compare did major nasty stuff to the
                 * dict:  start over.
                 * XXX A clever adversary could prevent this
                 * XXX from terminating.
                 */
                return lookdict(mp, key, hash);
            }
        }
        else if (ep->me_key == dummy && freeslot == NULL)
            freeslot = ep;
    }
    assert(0);          /* NOT REACHED */
    return 0;
}

lookdict_string默认搜索方法

[dictobject.c]
static dictentry* lookdict_string(dictobject *mp, PyObject *key, register long hash)
{
    register int i;
    register unsigned int perturb;
    register dictentry *freeslot;
    register unsigned int mask = mp->ma_mask;
    dictentry *ep0 = mp->ma_table;
    register dictentry *ep;
  
    if (!PyString_CheckExact(key)) { //判断key是否为string类型,若非则返回传统搜索方式
        mp->ma_lookup = lookdict;
        return lookdict(mp, key, hash);
    }

    i = hash & mask;
    ep = &ep0[i];


    if (ep->me_key == NULL || ep->me_key == key)
        return ep;

    if (ep->me_key == dummy)
        freeslot = ep;
    else 
    {
        //string默认策略主要不同在此处,判断函数较为轻量
        if (ep->me_hash == hash && _PyString_Eq(ep->me_key, key)) 
        {
            return ep;
        }
        freeslot = NULL;
    }

    //搜索第二阶段:遍历冲突链,检查每一个entry
    for (perturb = hash; ; perturb >>= PERTURB_SHIFT) 
    {
        i = (i << 2) + i + perturb + 1;
        ep = &ep0[i & mask];
        if (ep->me_key == NULL)
            return freeslot == NULL ? ep : freeslot;
        if (ep->me_key == key
            || (ep->me_hash == hash && ep->me_key != dummy &&
                  _PyString_Eq(ep->me_key, key)))
            return ep;
        if (ep->me_key == dummy && freeslot == NULL)
            freeslot = ep;
    }
}

5.元素插入

insertdict函数:
该函数关心ma_lookup返回的对象类型,决定插入的策略

[dictobject.c]
static void 
insertdict(register dictobject *mp, PyObject *key, long hash, PyObject *value)
{
    PyObject *old_value;
    register dictentry *ep;
    
    ep = mp->ma_lookup(mp, key, hash);
    /*搜索成功*/
    if (ep->me_value != NULL) {
        old_value = ep->me_value;
        ep->me_value = value;
        Py_DECREF(old_value); 
        Py_DECREF(key);
    }
    /* 搜索失败,返回的值可能是unused或dummy*/
    else {
        if (ep->me_key == NULL) //为unused时ma_fill++
            mp->ma_fill++;
        else  //否则为dummy
            Py_DECREF(ep->me_key);
        ep->me_key = key;
        ep->me_hash = hash;
        ep->me_value = value;
        mp->ma_used++;
    }
}

PyDict_SetItem函数:
insertdict函数被该函数调用,该函数主要关心取得hash值

[dictobject.c]
int PyDict_SetItem(register PyObject *op, PyObject *key, PyObject *value)
{
    register dictobject *mp;
    register long hash;
    register Py_ssize_t n_used;

    mp = (dictobject *)op;
    //[1]:计算hash值
    if (PyString_CheckExact(key)) {
        hash = ((PyStringObject *)key)->ob_shash;
        if (hash == -1)
            hash = PyObject_Hash(key);
    }
    else {
        hash = PyObject_Hash(key);
        if (hash == -1)
            return -1;
    }
    //[2]:插入(key, value)元素对
    n_used = mp->ma_used;
    insertdict(mp, key, hash, value);
    
    //[3]:必要时调整dict的内存空间,实际上为判断装填率是否大于2/3
    if (!(mp->ma_used > n_used && mp->ma_fill*3 >= (mp->ma_mask+1)*2))
        return 0;
    return dictresize(mp, mp->ma_used * (mp->ma_used>50000 ? 2 : 4));
}

dictresize函数
该函数关心字典列表的调整,根据需求使用ma_smalltable或重新分配新的空间,旧空间的Acttive entry依次插入新空间中,dummy趁机释放掉

[dictobject.c]
static int dictresize(dictobject *mp, int minused)
{
    Py_ssize_t newsize;
    dictentry *oldtable, *newtable, *ep;
    Py_ssize_t i;
    int is_oldtable_malloced;
    dictentry small_copy[PyDict_MINSIZE];
    //[1]:确定新的table的大小
    for(newsize = PyDict_MINSIZE; newsize <= minused && newsize > 0; newsize <<= 1)
        ;
    oldtable = mp->ma_table;
    is_oldtable_malloced = (oldtable != mp->ma_smalltable);

    //[2]: 新的table可以使用mp->ma_smalltable
    if (newsize == PyDict_MINSIZE) {
        newtable = mp->ma_smalltable;
        if (newtable == oldtable) {
            if (mp->ma_fill == mp->ma_used) {
                //没有任何Dummy态entry,直接返回
                return 0;
            }
            //将旧table拷贝,进行备份
            memcpy(small_copy, oldtable, sizeof(small_copy));
            oldtable = small_copy;
        }
    }
    //[3]: 新的table不能使用mp->ma_smalltable,需要在系统堆上申请
    else {
        newtable = PyMem_NEW(dictentry, newsize);
    }

    //[4]:设置新table
    mp->ma_table = newtable;
    mp->ma_mask = newsize - 1;
    memset(newtable, 0, sizeof(dictentry) * newsize);
    mp->ma_used = 0;
    i = mp->ma_fill;
    mp->ma_fill = 0;

    //[5]:处理旧table中的entry:
    //    1、Active态entry,搬移到新table中
    //    2、Dummy态entry,调整key的引用计数,丢弃该entry
    for (ep = oldtable; i > 0; ep++) {
        if (ep->me_value != NULL) { /* active entry */
            --i;
            insertdict(mp, ep->me_key, ep->me_hash, ep->me_value);
        }
        else if (ep->me_key != NULL) {  /* dummy entry */
            --i;
            assert(ep->me_key == dummy);
            Py_DECREF(ep->me_key);
        }
    }
    //[6]:必要时释放旧table所维护的内存空间
    if (is_oldtable_malloced)
        PyMem_DEL(oldtable);
    return 0;
}

6.删除元素

[dictobject.c]
int PyDict_DelItem(PyObject *op, PyObject *key)
{
    register dictobject *mp;
    register long hash;
    register dictentry *ep;
    PyObject *old_value, *old_key;
    //[1]:获得hash值
    if (!PyString_CheckExact(key) ||
        (hash = ((PyStringObject *) key)->ob_shash) == -1) {
        hash = PyObject_Hash(key);
        if (hash == -1)
            return -1;
    }
    //[2]:搜索entry
    mp = (dictobject *)op;
    ep = (mp->ma_lookup)(mp, key, hash);
    if (ep->me_value == NULL) { //搜索失败,entry不存在
        return -1;
    }
    //[3]:删除entry所维护的元素,将entry的状态转为dummy态
    old_key = ep->me_key;
    ep->me_key = dummy;
    old_value = ep->me_value;
    ep->me_value = NULL;
    mp->ma_used--;
    Py_DECREF(old_value);
    Py_DECREF(old_key);
    return 0;
}
 

7.字典缓冲池

与列表对象相似,也是待删除该字典对象时尝试加入到缓冲区。加入前进行一系列的清理动作。

[dictobject.c]
static void dict_dealloc(register dictobject *mp)
{
    register dictentry *ep;
    Py_ssize_t fill = mp->ma_fill;
    //[1]:调整dict中对象的引用计数
    for (ep = mp->ma_table; fill > 0; ep++) {
        if (ep->me_key) {
            --fill;
            Py_DECREF(ep->me_key);
            Py_XDECREF(ep->me_value);
        }
    }
    //[2] :释放从系统堆中申请的内存空间
    if (mp->ma_table != mp->ma_smalltable)
        PyMem_DEL(mp->ma_table);
    //[3] :将被销毁的PyDictObject对象放入缓冲池
    if (num_free_dicts < MAXFREEDICTS && mp->ob_type == &PyDict_Type)
        free_dicts[num_free_dicts++] = mp;
    else 
        mp->ob_type->tp_free((PyObject *)mp);
}

你可能感兴趣的:(Python源码学习笔记 5 字典对象)