CPython版本 2.7.14
参考: http://guilload.com/python-string-interning/
最近在读 Python 源码,发现 string 类型的实现中有一个很有趣的东西:interned。它是一个 dict 对象,记录了被驻留(intern)的 string 对象,以此来保证内容相同的字符串在内存中只保留一个实例,让字符串比较更加高效。
最初认为任何字符串都会被加到interned中,但是发现。。。
以下交互输出来自一个修改过的解释器:在打印 str 对象时,会额外打印它的 ob_sstate 和地址(ob_sstate 为 1 表示已加入 interned,为 0 表示未加入)
>>> a = "123"
>>> a
ob_sstate: 1
addr: 0153F458
'123'
>>> b = "+=_"
>>> b
ob_sstate: 0
addr: 0153F470
'+=_'
>>>
>>> a = "123" + "456"
>>> a
ob_sstate: 1
addr: 0153A2C0
'123456'
>>>
>>> a = "123"
>>> b = "456"
>>> c = a + b
>>> c
ob_sstate: 0
addr: 0153A300
'123456'
晕,这到底是什么时候字符串会被加入到interned中啊
// Excerpt from the implementation of PyString_FromString.
// Only the allocation and short-string caching part is shown; `str` and
// `size` are set up by code that is not part of this excerpt.
/* Allocate PyStringObject_SIZE + size bytes for the new string object. */
op = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + size);
if (op == NULL)
return PyErr_NoMemory();
(void)PyObject_INIT_VAR(op, &PyString_Type, size);
op->ob_shash = -1;
/* Every freshly created string starts out marked as NOT interned. */
op->ob_sstate = SSTATE_NOT_INTERNED;
/* Copies size+1 bytes from str (presumably the extra byte is the trailing NUL). */
Py_MEMCPY(op->ob_sval, str, size+1);
/* share short strings */
if (size == 0) {
/* Empty string: intern it and remember it in `nullstring`. */
PyObject *t = (PyObject *)op;
PyString_InternInPlace(&t);// the string is added to interned here
op = (PyStringObject *)t;
nullstring = op;
Py_INCREF(op);
} else if (size == 1) {
/* One-character string: intern it and remember it in `characters`,
   indexed by the value of its single byte. */
PyObject *t = (PyObject *)op;
PyString_InternInPlace(&t);// the string is added to interned here
op = (PyStringObject *)t;
characters[*str & UCHAR_MAX] = op;
Py_INCREF(op);
}
/* Any other length falls through to here without being interned. */
return (PyObject *) op;
可以看到,在创建字符串的过程中,如果长度不为0或1 就直接返回了,并未将字符串加入interned
而在创建代码对象时(例如编译代码或导入模块),PyCode_New 会把代码中用到的名字和一部分字符串常量加入到 interned 中:
/*
 * Create a new code object from the given components.
 *
 * The arguments are validated first: argcount and nlocals must be
 * non-negative; consts, names, varnames, freevars and cellvars must be
 * non-NULL tuples; name, filename and lnotab must be non-NULL strings;
 * code must be non-NULL and pass PyObject_CheckReadBuffer(). If any check
 * fails, PyErr_BadInternalCall() is invoked and NULL is returned.
 *
 * Before the object is allocated, the strings in names, varnames, freevars
 * and cellvars are passed to intern_strings(), and consts is passed to
 * intern_string_constants(). This is the step that puts strings appearing
 * in source code into the interned dict.
 *
 * Returns the new code object, or NULL if PyObject_NEW() returned NULL.
 * On success every stored object argument has had Py_INCREF applied.
 */
PyCodeObject *
PyCode_New(int argcount, int nlocals, int stacksize, int flags,
PyObject *code, PyObject *consts, PyObject *names,
PyObject *varnames, PyObject *freevars, PyObject *cellvars,
PyObject *filename, PyObject *name, int firstlineno,
PyObject *lnotab)
{
PyCodeObject *co;
/* Check argument types */
if (argcount < 0 || nlocals < 0 ||
code == NULL ||
consts == NULL || !PyTuple_Check(consts) ||
names == NULL || !PyTuple_Check(names) ||
varnames == NULL || !PyTuple_Check(varnames) ||
freevars == NULL || !PyTuple_Check(freevars) ||
cellvars == NULL || !PyTuple_Check(cellvars) ||
name == NULL || !PyString_Check(name) ||
filename == NULL || !PyString_Check(filename) ||
lnotab == NULL || !PyString_Check(lnotab) ||
!PyObject_CheckReadBuffer(code)) {
PyErr_BadInternalCall();
return NULL;
}
/* Pay attention here: this is where strings get interned.
   intern_strings() is not shown in this excerpt; presumably it interns
   every element of the given tuple. The constants tuple goes through the
   more selective intern_string_constants() instead. */
intern_strings(names);
intern_strings(varnames);
intern_strings(freevars);
intern_strings(cellvars);
intern_string_constants(consts);
co = PyObject_NEW(PyCodeObject, &PyCode_Type);
if (co != NULL) {
/* Plain integer fields are copied; each object field takes its own
   reference via Py_INCREF before being stored. */
co->co_argcount = argcount;
co->co_nlocals = nlocals;
co->co_stacksize = stacksize;
co->co_flags = flags;
Py_INCREF(code);
co->co_code = code;
Py_INCREF(consts);
co->co_consts = consts;
Py_INCREF(names);
co->co_names = names;
Py_INCREF(varnames);
co->co_varnames = varnames;
Py_INCREF(freevars);
co->co_freevars = freevars;
Py_INCREF(cellvars);
co->co_cellvars = cellvars;
Py_INCREF(filename);
co->co_filename = filename;
Py_INCREF(name);
co->co_name = name;
co->co_firstlineno = firstlineno;
Py_INCREF(lnotab);
co->co_lnotab = lnotab;
co->co_zombieframe = NULL;
co->co_weakreflist = NULL;
}
/* co is NULL here if the allocation above failed. */
return co;
}
python会把代码里的字符串加入到interned中
/* The set of characters a string constant may contain and still be
   interned by intern_string_constants(): digits, ASCII letters and '_'. */
#define NAME_CHARS \
"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz"
/* all_name_chars(s): true iff all chars in s are valid NAME_CHARS */
/* NOTE: the definition of all_name_chars() is not reproduced in this excerpt. */

/*
 * Intern the string constants stored in `tuple`, visiting the items from
 * the last index down to 0.
 *
 * - An exact string is interned only when all_name_chars() accepts it.
 *   If interning substituted a different object, the tuple slot is
 *   updated to point at it.
 * - An exact tuple is processed recursively; the recursive call's return
 *   value is ignored.
 * - An exact frozenset is copied into a temporary tuple, which is processed
 *   recursively. If that reports a modification, a new frozenset built from
 *   the temporary tuple replaces the original item in `tuple`.
 *
 * Failures while handling a frozenset are cleared with PyErr_Clear() and
 * the item is left as it was.
 *
 * Returns 1 if at least one slot of `tuple` itself was replaced, else 0.
 */
static int
intern_string_constants(PyObject *tuple)
{
int modified = 0;
Py_ssize_t i;
for (i = PyTuple_GET_SIZE(tuple); --i >= 0; ) {
PyObject *v = PyTuple_GET_ITEM(tuple, i);
if (PyString_CheckExact(v)) {
if (all_name_chars(v)) {/* Pay attention here: only strings made up entirely of NAME_CHARS get interned */
PyObject *w = v;
PyString_InternInPlace(&v);
/* v differs from w when interning swapped in another object
   (presumably an already-interned equal string). */
if (w != v) {
PyTuple_SET_ITEM(tuple, i, v);
modified = 1;
}
}
}
else if (PyTuple_CheckExact(v)) {
intern_string_constants(v);
}
else if (PyFrozenSet_CheckExact(v)) {
PyObject *w = v;
PyObject *tmp = PySequence_Tuple(v);
if (tmp == NULL) {
/* Could not build the temporary tuple: skip this item. */
PyErr_Clear();
continue;
}
if (intern_string_constants(tmp)) {
v = PyFrozenSet_New(tmp);
if (v == NULL) {
PyErr_Clear();
}
else {
/* Install the rebuilt frozenset and release the old one (w). */
PyTuple_SET_ITEM(tuple, i, v);
Py_DECREF(w);
modified = 1;
}
}
Py_DECREF(tmp);
}
}
return modified;
}
不幸的是,如果代码中的字符串,含有一个不在NAME_CHARS 的字符时,它不会被加入interned中
这就是为什么 "123" 被加入 interned,而 "+=_" 没有被加入的原因:'+' 和 '=' 都不在 NAME_CHARS 中
而 "123" + "456" 被加入 interned,a = "123"; b = "456"; a + b 的结果却没有被加入,则是编译器优化(常量折叠)造成的
>>> import dis
>>> def test():
... return "123" + "456"
...
>>> dis.dis(test)
2 0 LOAD_CONST 3 ('123456')
3 RETURN_VALUE
>>>
可以看到,在编译期 "123" + "456" 就已经被折叠成常量 "123456" 了;它作为代码对象的常量,在创建代码对象时被加入 interned
>>> def f():
... return "".join(("123","456"))
...
>>> dis.dis(f)
2 0 LOAD_CONST 1 ('')
3 LOAD_ATTR 0 (join)
6 LOAD_CONST 4 (('123', '456'))
9 CALL_FUNCTION 1
12 RETURN_VALUE
>>> "123456" is "".join(("123","456"))
False
>>>
如果换成 join,结果字符串是在运行时才创建的,并不是代码对象中的常量,所以就不会被放入 interned 了