从 Python 3 开始,字符串统一使用 Unicode 编码
1.字符串加法
#源码
a="dddddddddd"
b="sssssssssss"
a = a+b
#字节码
0 LOAD_NAME 0 (a)
2 LOAD_NAME 1 (b)
4 BINARY_ADD
6 STORE_NAME 0 (a)
8 LOAD_CONST 0 (None)
10 RETURN_VALUE
【BINARY_ADD】#源码有删减
PyObject *right = POP();#出栈right
PyObject *left = TOP();#指向栈顶left
PyObject *sum;#新对象sum
#针对字符串加法运算的优化
if (PyUnicode_CheckExact(left) &&PyUnicode_CheckExact(right)) {
sum = unicode_concatenate(tstate, left, right, f, next_instr);
}
else {
'''
}
SET_TOP(sum);#重新设置栈顶
//源码有删减
/*
 * Concatenate two exact str objects on behalf of BINARY_ADD (abridged excerpt).
 *
 * v is the left operand and w the right operand; f is the executing frame and
 * next_instr lets NEXTOPARG() peek at the instruction that follows the add.
 *
 * Returns whatever PyUnicode_Append() leaves in `res`, or NULL when the
 * STORE_NAME pre-step fails (the reference to v is released on that path).
 */
static PyObject *
unicode_concatenate(PyThreadState *tstate, PyObject *v, PyObject *w,
                    PyFrameObject *f, const _Py_CODEUNIT *next_instr)
{
    PyObject *res;

    if (Py_REFCNT(v) == 2) {
        /*
         * Exactly two references to v exist.  If the next instruction stores
         * the result back into a variable that is still bound to v, unbind
         * that variable first.
         * NOTE(review): presumably this lowers v's refcount to 1 so that
         * PyUnicode_Append() can take its resize-in-place branch -- confirm
         * against unicode_modifiable(), which is not shown here.
         */
        int opcode, oparg;      /* next instruction and its argument */
        NEXTOPARG();            /* decode the following instruction */
        switch (opcode) {
        case STORE_FAST:
        {
            /* ... body omitted from this excerpt ... */
            break;              /* must not fall through into STORE_NAME */
        }
        case STORE_DEREF:
        {
            /* ... body omitted from this excerpt ... */
            break;              /* must not fall through into STORE_NAME */
        }
        case STORE_NAME:        /* the case hit by the module-level `a = a + b` example */
        {
            PyObject *names = f->f_code->co_names;    /* names tuple of the code object */
            PyObject *name = GETITEM(names, oparg);   /* name being stored to */
            PyObject *locals = f->f_locals;           /* locals mapping of the frame */
            if (locals && PyDict_CheckExact(locals)) {
                /* Renamed from `w` so it no longer shadows the parameter. */
                PyObject *bound = PyDict_GetItemWithError(locals, name);
                /*
                 * Bail out in two situations: the name is bound to v but
                 * deleting it failed, or the lookup itself raised.
                 */
                if ((bound == v && PyDict_DelItem(locals, name) != 0) ||
                    (bound == NULL && _PyErr_Occurred(tstate)))
                {
                    Py_DECREF(v);
                    return NULL;
                }
            }
            break;
        }
        }
    }
    res = v;
    PyUnicode_Append(&res, w);  /* does the actual concatenation; may replace res */
    return res;
}
//源码有删减
/*
 * Append `right` to the string referenced by *p_left (abridged excerpt).
 *
 * On success *p_left refers to the concatenated string: either the original
 * object resized in place, or a newly created object (in which case the
 * reference to the old left object is released).  On failure *p_left is
 * cleared to NULL.
 */
void
PyUnicode_Append(PyObject **p_left, PyObject *right)
{
    PyObject *left, *res;
    Py_UCS4 maxchar, maxchar2;
    Py_ssize_t left_len, right_len, new_len;

    /* ... argument validation omitted from this excerpt ... */
    left = *p_left;     /* every use of `left` below relies on this assignment */

    /* Shortcuts */
    if (left == unicode_empty) {
        /* '' + right: hand back right itself with a new reference. */
        Py_DECREF(left);
        Py_INCREF(right);
        *p_left = right;
        return;
    }
    if (right == unicode_empty) {
        /* left + '': nothing to do. */
        return;
    }

    /* Compute the combined length, guarding against Py_ssize_t overflow. */
    left_len = PyUnicode_GET_LENGTH(left);
    right_len = PyUnicode_GET_LENGTH(right);
    if (left_len > PY_SSIZE_T_MAX - right_len) {
        /*
         * PY_SSIZE_T_MAX is the largest value a Py_ssize_t can hold (it is
         * platform dependent, not a fixed 0x7FFFFFFF).
         * NOTE(review): no exception is set before jumping to `error` in this
         * excerpt -- confirm how the full source reports the overflow.
         */
        goto error;
    }
    new_len = left_len + right_len;

    if (unicode_modifiable(left)                /* left may be mutated */
        && PyUnicode_CheckExact(right)          /* right is an exact str */
        /* right's kind (1, 2 or 4 bytes per character) fits left's storage */
        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
        /* excluded: left is ASCII while right is not */
        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
    {
        /* In-place path: grow left, then copy right's characters behind it. */
        if (unicode_resize(p_left, new_len) != 0)
            goto error;
        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
    }
    else {
        /* Copy path: build a new string wide enough for both operands. */
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
        maxchar = Py_MAX(maxchar, maxchar2);

        res = PyUnicode_New(new_len, maxchar);
        if (res == NULL)
            goto error;
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
        /*
         * Release the caller's reference to the old left object before
         * overwriting the slot (same pattern as the '' shortcut above);
         * the object is freed by reference counting, not by the cyclic GC.
         */
        Py_DECREF(left);
        *p_left = res;
    }
    return;

error:
    Py_CLEAR(*p_left);
}
//源码有删减
/*
 * Resize the string referenced by *p_unicode to `length` characters
 * (abridged excerpt).
 *
 * Depending on the object, *p_unicode may end up pointing at a different
 * object (the empty-string singleton, a fresh copy, or a reallocated compact
 * object).  Returns 0 on success and -1 on failure.
 */
static int
unicode_resize(PyObject **p_unicode, Py_ssize_t length)
{
    PyObject *unicode;
    Py_ssize_t old_length;

    unicode = *p_unicode;

    /* Current length; PyUnicode_WCHAR_KIND objects report it via the wstr length. */
    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
        old_length = PyUnicode_WSTR_LENGTH(unicode);
    else
        old_length = PyUnicode_GET_LENGTH(unicode);

    /* Already the requested size: nothing to do. */
    if (old_length == length)
        return 0;

    /* Shrinking to zero: switch to the shared unicode_empty object. */
    if (length == 0) {
        _Py_INCREF_UNICODE_EMPTY();
        if (!unicode_empty)
            return -1;
        Py_SETREF(*p_unicode, unicode_empty);
        return 0;
    }

    /* The object must not be mutated: resize into a fresh copy instead. */
    if (!unicode_modifiable(unicode)) {
        PyObject *copy = resize_copy(unicode, length);
        if (copy == NULL)
            return -1;
        /*
         * Publish the copy and drop the reference to the old object;
         * without this the copy leaks and the caller keeps the unresized
         * string while being told the resize succeeded.
         */
        Py_SETREF(*p_unicode, copy);
        return 0;
    }

    /*
     * Compact object: resize_compact() reallocates the object itself through
     * PyObject_REALLOC, so the object pointer may change.
     */
    if (PyUnicode_IS_COMPACT(unicode)) {
        PyObject *new_unicode = resize_compact(unicode, length);
        if (new_unicode == NULL)
            return -1;
        *p_unicode = new_unicode;
        return 0;
    }

    /* Non-compact object: only its buffers are reallocated, the object stays. */
    return resize_inplace(unicode, length);
}
//有删减
// Resize a compact unicode object to `length` characters by reallocating the
// object itself (abridged excerpt).
// Returns the (possibly moved) object on success; on failure calls
// PyErr_NoMemory() and returns NULL.
static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{
Py_ssize_t char_size;
Py_ssize_t struct_size;
Py_ssize_t new_size;
int share_wstr;
PyObject *new_unicode;
// The kind value is used below as the per-character byte size.
char_size = PyUnicode_KIND(unicode);
// ASCII strings: header size is that of PyASCIIObject
if (PyUnicode_IS_ASCII(unicode))
struct_size = sizeof(PyASCIIObject);
// all other strings: header size is that of PyCompactUnicodeObject
else
struct_size = sizeof(PyCompactUnicodeObject);
// Remember whether wstr currently shares the data buffer (used after the realloc).
share_wstr = _PyUnicode_SHARE_WSTR(unicode);
// Refuse lengths whose total byte size would not fit in Py_ssize_t.
if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
PyErr_NoMemory();
return NULL;
}
// New allocation size: header plus (length + 1) characters; the extra slot
// holds the terminator written at the end of this function.
new_size = (struct_size + (length + 1) * char_size);
// A separately allocated UTF-8 buffer is freed and its fields are reset.
if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
PyObject_DEL(_PyUnicode_UTF8(unicode));
_PyUnicode_UTF8(unicode) = NULL;
_PyUnicode_UTF8_LENGTH(unicode) = 0;
}
// Reallocate the whole object through PyObject_REALLOC.
new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
if (new_unicode == NULL) {
// NOTE(review): this excerpt shows _Py_NewReference() only on the failure
// path, with no matching bookkeeping before the realloc or after success;
// the counterpart calls were presumably removed when the code was abridged --
// confirm against the full source.
_Py_NewReference(unicode);
PyErr_NoMemory();
return NULL;
}
// From here on only the pointer returned by the realloc is used.
unicode = new_unicode;
// Record the new character count.
_PyUnicode_LENGTH(unicode) = length;
// wstr shared the data buffer: point it at the data of the reallocated object
// and, for non-ASCII strings, update the wstr length as well.
if (share_wstr) {
_PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
if (!PyUnicode_IS_ASCII(unicode))
_PyUnicode_WSTR_LENGTH(unicode) = length;
}
// wstr has its own allocation: free it and reset the wstr fields.
else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
PyObject_DEL(_PyUnicode_WSTR(unicode));
_PyUnicode_WSTR(unicode) = NULL;
if (!PyUnicode_IS_ASCII(unicode))
_PyUnicode_WSTR_LENGTH(unicode) = 0;
}
// PyUnicode_WRITE stores `value` at `index`, using the element width selected
// by `kind`. In this excerpt the macro definition sits inside the function body.
#define PyUnicode_WRITE(kind, data, index, value) \
do { \
switch ((kind)) { \
case PyUnicode_1BYTE_KIND: { \
((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
break; \
} \
case PyUnicode_2BYTE_KIND: { \
((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
break; \
} \
default: { \
assert((kind) == PyUnicode_4BYTE_KIND); \
((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
} \
} \
} while (0)
// Write the terminating 0 character right after the last character.
PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
length, 0);
return unicode;
}
/*
 * Resize a non-compact unicode object to `length` characters by reallocating
 * its buffers while keeping the object itself (abridged excerpt).
 *
 * Returns 0 on success; on failure calls PyErr_NoMemory() and returns -1.
 */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    Py_ssize_t new_size;

    /* Ready object: resize the canonical data buffer first. */
    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        int share_wstr, share_utf8;
        void *data;

        data = _PyUnicode_DATA_ANY(unicode);            /* current data buffer */
        char_size = PyUnicode_KIND(unicode);            /* bytes per character */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);    /* wstr aliases data? */
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);    /* utf8 aliases data? */

        /* Refuse lengths whose byte size would not fit in Py_ssize_t. */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        /* (length + 1) characters: one extra slot for the terminator. */
        new_size = (length + 1) * char_size;

        /* A UTF-8 buffer that is separately allocated (not shared) is freed. */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        /*
         * Reallocate the data buffer.  The result must be checked before it
         * is stored: on failure the object still refers to the old, valid
         * buffer, and writing the terminator through NULL would crash.
         */
        data = PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;

        /* Buffers that alias the data buffer follow it to its new address. */
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }

        /* Record the new length and terminate the string. */
        _PyUnicode_LENGTH(unicode) = length;
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);

        /* Done unless a separate wstr buffer still has to be resized below. */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }

    /* Reallocate the separate wstr buffer (the object itself is kept). */
    new_size = sizeof(wchar_t) * (length + 1);
    wstr = _PyUnicode_WSTR(unicode);
    wstr = PyObject_REALLOC(wstr, new_size);
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    return 0;
}
/*
 * Create a new unicode object of `length` characters and copy into it as many
 * characters of `unicode` as fit (abridged excerpt).
 *
 * Returns the new object, or NULL if the allocation failed.
 */
static PyObject*
resize_copy(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t copy_length;

    if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
        /* Not a wstr-kind object: allocate with the source's max character value. */
        PyObject *copy;

        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
        /*
         * Allocation can fail; the caller tests for NULL and the wstr branch
         * below already checks, so check here too instead of copying into NULL.
         */
        if (copy == NULL)
            return NULL;

        /* Copy min(length, current length) characters. */
        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
        return copy;
    }
    else {
        /* wstr-kind object: allocate via _PyUnicode_New and copy the wchar_t data. */
        PyObject *w;

        w = (PyObject*)_PyUnicode_New(length);
        if (w == NULL)
            return NULL;
        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
        copy_length = Py_MIN(copy_length, length);
        memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
               copy_length * sizeof(wchar_t));
        return w;
    }
}