python2
py2默认的编码是ascii, ascii只包含128个字符(英文字母、数字、常用标点和控制字符), 无法表示中文
In [2]: sys.getdefaultencoding()
Out[2]: 'ascii'
In [4]: a = '你好'
In [5]: a
Out[5]: '\xe4\xbd\xa0\xe5\xa5\xbd' # 输出的不是内存地址, 而是"你好"按终端编码(这里是utf-8)编码后, 每个字节的十六进制转义表示; 本质上是bytes类型
In [6]: type(a) # 类型是str, 其实就是bytes
Out[6]: str
In [7]: b = 'hello'
In [8]: b
Out[8]: 'hello'
In [9]: type(b)
Out[9]: str
In [11]: a1 = a.decode('utf-8')
In [12]: a1
Out[12]: u'\u4f60\u597d' # 将bytes按utf-8编码规则解码, 得到unicode对象(显示的是每个字符的unicode码点)
In [13]: type(a1)
Out[13]: unicode # 变成了py2独有的unicode类型
In [16]: sys.getsizeof(a) # bytes类型的你好
Out[16]: 43 # 整个对象占用的内存大小(单位: 字节), 包含对象头等额外开销, 并不是字符串内容本身的字节长度(内容只有6个字节)
In [17]: sys.getsizeof(a1) # 解码后的 你好
Out[17]: 54
In [19]: b1 = b.decode('utf-8')
In [20]: b1
Out[20]: u'hello'
In [22]: sys.getsizeof(b) # bytes类型占的字节数少
Out[22]: 42
In [23]: sys.getsizeof(b1) # unicode占字节数多
Out[23]: 60
注意: python2的basestring和str是不同的, basestring是str(即bytes)和unicode的共同抽象基类, 不能被实例化, 通常只用于isinstance判断; 而str就是bytes类型.
print isinstance(u'aa', basestring) # True
print isinstance('aa', basestring) # True
print isinstance(u'aa', str) # False
print isinstance('aa', str) # True
python3
py3默认的编码是utf-8; py3的str保存的是unicode文本(unicode是字符集, utf-8是它的一种编码方式)
In [1]: import sys
In [2]: sys.getdefaultencoding()
Out[2]: 'utf-8'
In [3]: a = '你好'
In [4]: a
Out[4]: '你好'
In [5]: type(a) # py3的str是unicode文本类型, 相当于py2的unicode, 而不是bytes
Out[5]: str
In [6]: a1 = a.encode('utf-8')
In [7]: a1
Out[7]: b'\xe4\xbd\xa0\xe5\xa5\xbd' # str按utf-8(同时也是默认编码)编码(encode)之后, 得到的才是bytes类型
In [8]: type(a1)
Out[8]: bytes