目前,python实现的国密算法库主要是python-gmssl库和snowland-smx(pysmx)库,二者都对SM2(仅公钥加解密和数字签名)、SM3、SM4进行了细致而优雅的实现。
PyCryptodome. https://www.pycryptodome.org
最近用python做加密系统开发时发现,上述两个库的SM4加解密效率比国外更成熟的AES库低1-3个数量级!下图是ECB模式下SM4(gmssl库)与AES(PyCryptodome库)的加解密耗时对比:
上图中,短报文是28字节,长报文是253字节,实验数据取20次执行的平均值。gmssl库的SM4比pysmx库快一些,pysmx库的耗时约为gmssl库的1.5倍。
当我想进入Crypto.Cipher.AES一探究竟,竟找不到加密的源代码,顺着函数调用一路找,看到:
基本上明白了,人家的加解密核心算法不是python代码,而是已经编译好的链接库。python作为解释型语言,效率跟C这样的编译后执行的机器代码自然没法比。咱自己的python国密SM4加解密短报文也不超过1ms,偶尔加密个短指令问题不大,但交互长数据或加解密大文件的时候,速率会严重不足(按之前的测试结果,算出加解密速率为150KB/s)。虽然其他编程语言实现的SM4与AES效率差别没有那么大,但毕竟目前用python的小伙伴越来越多,这对于国密算法的推广使用可不是什么好事!所以着手实现一份更高效的国密SM4的python代码。如下:
from array import array
from struct import pack, unpack
# FK constants: XORed word-by-word with the four master-key words at the start
# of key expansion (see SM4.set_key). Typecode 'L' is an unsigned long, which
# is at least 32 bits wide and therefore holds every word below.
_SM4_FK = array('L', [0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc])
# CK constants: 32 words, one consumed per round-key derivation step in
# SM4.set_key.
_SM4_CK = array('L', [
0x00070e15, 0x1c232a31, 0x383f464d, 0x545b6269, 0x70777e85, 0x8c939aa1, 0xa8afb6bd, 0xc4cbd2d9,
0xe0e7eef5, 0xfc030a11, 0x181f262d, 0x343b4249, 0x50575e65, 0x6c737a81, 0x888f969d, 0xa4abb2b9,
0xc0c7ced5, 0xdce3eaf1, 0xf8ff060d, 0x141b2229, 0x30373e45, 0x4c535a61, 0x686f767d, 0x848b9299,
0xa0a7aeb5, 0xbcc3cad1, 0xd8dfe6ed, 0xf4fb0209, 0x10171e25, 0x2c333a41, 0x484f565d, 0x646b7279])
# SM4 S-box: 256-entry byte substitution table. It is stored as `bytes`, so
# both indexing and iteration yield plain ints in the range 0..255.
_SM4_S_BOX = bytes([
0xd6, 0x90, 0xe9, 0xfe, 0xcc, 0xe1, 0x3d, 0xb7, 0x16, 0xb6, 0x14, 0xc2, 0x28, 0xfb, 0x2c, 0x05,
0x2b, 0x67, 0x9a, 0x76, 0x2a, 0xbe, 0x04, 0xc3, 0xaa, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99,
0x9c, 0x42, 0x50, 0xf4, 0x91, 0xef, 0x98, 0x7a, 0x33, 0x54, 0x0b, 0x43, 0xed, 0xcf, 0xac, 0x62,
0xe4, 0xb3, 0x1c, 0xa9, 0xc9, 0x08, 0xe8, 0x95, 0x80, 0xdf, 0x94, 0xfa, 0x75, 0x8f, 0x3f, 0xa6,
0x47, 0x07, 0xa7, 0xfc, 0xf3, 0x73, 0x17, 0xba, 0x83, 0x59, 0x3c, 0x19, 0xe6, 0x85, 0x4f, 0xa8,
0x68, 0x6b, 0x81, 0xb2, 0x71, 0x64, 0xda, 0x8b, 0xf8, 0xeb, 0x0f, 0x4b, 0x70, 0x56, 0x9d, 0x35,
0x1e, 0x24, 0x0e, 0x5e, 0x63, 0x58, 0xd1, 0xa2, 0x25, 0x22, 0x7c, 0x3b, 0x01, 0x21, 0x78, 0x87,
0xd4, 0x00, 0x46, 0x57, 0x9f, 0xd3, 0x27, 0x52, 0x4c, 0x36, 0x02, 0xe7, 0xa0, 0xc4, 0xc8, 0x9e,
0xea, 0xbf, 0x8a, 0xd2, 0x40, 0xc7, 0x38, 0xb5, 0xa3, 0xf7, 0xf2, 0xce, 0xf9, 0x61, 0x15, 0xa1,
0xe0, 0xae, 0x5d, 0xa4, 0x9b, 0x34, 0x1a, 0x55, 0xad, 0x93, 0x32, 0x30, 0xf5, 0x8c, 0xb1, 0xe3,
0x1d, 0xf6, 0xe2, 0x2e, 0x82, 0x66, 0xca, 0x60, 0xc0, 0x29, 0x23, 0xab, 0x0d, 0x53, 0x4e, 0x6f,
0xd5, 0xdb, 0x37, 0x45, 0xde, 0xfd, 0x8e, 0x2f, 0x03, 0xff, 0x6a, 0x72, 0x6d, 0x6c, 0x5b, 0x51,
0x8d, 0x1b, 0xaf, 0x92, 0xbb, 0xdd, 0xbc, 0x7f, 0x11, 0xd9, 0x5c, 0x41, 0x1f, 0x10, 0x5a, 0xd8,
0x0a, 0xc1, 0x31, 0x88, 0xa5, 0xcd, 0x7b, 0xbd, 0x2d, 0x74, 0xd0, 0x12, 0xb8, 0xe5, 0xb4, 0xb0,
0x89, 0x69, 0x97, 0x4a, 0x0c, 0x96, 0x77, 0x7e, 0x65, 0xb9, 0xf1, 0x09, 0xc5, 0x6e, 0xc6, 0x84,
0x18, 0xf0, 0x7d, 0xec, 0x3a, 0xdc, 0x4d, 0x20, 0x79, 0xee, 0x5f, 0x3e, 0xd7, 0xcb, 0x39, 0x48])
# Two-byte S-box lookup table with 65536 unsigned 16-bit entries.
# Entry number (hi << 8 | lo) holds S(hi) << 8 | S(lo), so a 16-bit half-word
# is substituted with one index operation instead of two byte lookups.
_SM4_S_BOX_FAST = array(
    'H',
    (sub_hi << 8 | sub_lo for sub_hi in _SM4_S_BOX for sub_lo in _SM4_S_BOX),
)
def _T_key(ka):
    """Apply the key-schedule transform T' to the 32-bit word ``ka``.

    The word is substituted through the two-byte S-box table (upper and lower
    16-bit halves separately) and then mixed with the linear transform
    ``B ^ (B <<< 13) ^ (B <<< 23)``, where ``<<<`` is a 32-bit left rotation.

    Returns the transformed word as an int in the range 0..0xffffffff.
    """
    table = _SM4_S_BOX_FAST
    B = (table[ka >> 16] << 16) | table[ka & 0xffff]
    # The shifts overflow past bit 31; the final mask turns each shift pair
    # into a true 32-bit rotation.
    rot13 = B << 13 | B >> 19
    rot23 = B << 23 | B >> 9
    return (B ^ rot13 ^ rot23) & 0xffffffff
def _T(x):
    """Apply the round transform T to the 32-bit word ``x``.

    The word is substituted through the two-byte S-box table and then mixed
    with the linear transform
    ``B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24)``.

    Returns the transformed word as an int in the range 0..0xffffffff.
    """
    table = _SM4_S_BOX_FAST
    B = (table[x >> 16] << 16) | table[x & 0xffff]
    # Rotations are written as shift pairs; the single mask at the end drops
    # the bits that were shifted past bit 31.
    mixed = (B ^ (B << 2 | B >> 30) ^ (B << 10 | B >> 22)
             ^ (B << 18 | B >> 14) ^ (B << 24 | B >> 8))
    return mixed & 0xffffffff
def _one_round(rK, X):
    """Process one 128-bit block through every round key in ``rK``.

    Despite the name, this runs the full round loop (one iteration per entry
    of ``rK``) and then returns the four state words in reversed order.

    rK: iterable of 32-bit round keys, in the order they should be applied.
    X:  sequence of exactly four 32-bit words (one block).

    Returns a new list of four words; ``X`` itself is not modified.
    """
    w0, w1, w2, w3 = X
    for round_key in rK:
        # Slide the window by one word; the new last word mixes the old first
        # word with T applied to the other three words and the round key.
        w0, w1, w2, w3 = w1, w2, w3, w0 ^ _T(w1 ^ w2 ^ w3 ^ round_key)
    return [w3, w2, w1, w0]
def _ecb_base(rK, X):
    """Transform ``X`` block by block in ECB mode and return it.

    rK: round keys (encryption order to encrypt, reversed order to decrypt).
    X:  mutable list of 32-bit words; every four words form one block.

    The list is overwritten in place and the same object is returned.
    """
    start = 0
    total = len(X)
    while start < total:
        stop = start + 4  # four words per block
        X[start:stop] = _one_round(rK, X[start:stop])
        start = stop
    return X
def _cbc_enc(rK, iv, X):
    """Encrypt ``X`` in CBC mode, in place, and return it.

    rK: encryption round keys.
    iv: four 32-bit words used as the initial chaining value.
    X:  mutable list of 32-bit plaintext words; every four words form a block.
    """
    chain = iv
    for offset in range(0, len(X), 4):
        # XOR the plaintext block with the previous ciphertext block (or the
        # IV for the first block) before running it through the cipher.
        mixed = [X[offset + j] ^ chain[j] for j in range(4)]
        chain = _one_round(rK, mixed)
        X[offset:offset + 4] = chain
    return X
def _cbc_dec(rK, iv, X):
    """Decrypt ``X`` in CBC mode, in place, and return it.

    rK: decryption round keys (the encryption keys in reverse order).
    iv: four 32-bit words used as the initial chaining value.
    X:  mutable list of 32-bit ciphertext words; every four words form a block.
    """
    previous = iv
    for offset in range(0, len(X), 4):
        # Keep a copy of the ciphertext block: it is overwritten below but is
        # still needed as the chaining value for the next block.
        cipher_block = X[offset:offset + 4]
        decrypted = _one_round(rK, cipher_block)
        X[offset:offset + 4] = [d ^ p for d, p in zip(decrypted, previous)]
        previous = cipher_block
    return X
# Convert bytes into a list of words (4 bytes per word)
def byte2array(data):
    """Split ``data`` into big-endian unsigned 32-bit words.

    Returns a list of ints. ``struct.error`` is raised by ``unpack`` when
    ``len(data)`` is not a multiple of 4.
    """
    word_count = len(data) // 4
    layout = '>{}I'.format(word_count)
    return list(unpack(layout, data))
# Convert a sequence of words back into bytes
def array2byte(data):
    """Serialize the unsigned 32-bit words in ``data`` as big-endian bytes."""
    layout = '>{}I'.format(len(data))
    return pack(layout, *data)
class SM4:
    """SM4 block cipher offering ECB and CBC modes over pre-padded data.

    No padding is applied here: every ``data`` argument must already be a
    multiple of 16 bytes long, and keys and IVs must be exactly 16 bytes.
    Violations raise ``ValueError`` instead of surfacing as low-level
    ``struct.error``/``IndexError`` or, for a wrong-length IV during CBC
    decryption, as silently truncated output.

    Attributes:
        e_rk: the 32 round keys in encryption order.
        d_rk: the same round keys in reverse order, used for decryption.
    """

    def __init__(self, key):
        """Create a cipher and expand the 16-byte ``key`` into round keys."""
        self.set_key(key)

    @staticmethod
    def _check_data(data):
        """Raise ValueError unless ``data`` consists of whole 16-byte blocks."""
        if len(data) % 16:
            raise ValueError('SM4 data length must be a multiple of 16 bytes, got %d' % len(data))

    @staticmethod
    def _check_iv(iv):
        """Raise ValueError unless ``iv`` is exactly one block (16 bytes)."""
        if len(iv) != 16:
            raise ValueError('SM4 IV must be exactly 16 bytes, got %d' % len(iv))

    def set_key(self, key):
        """Derive and store the round keys for the 16-byte ``key``.

        Raises:
            ValueError: if ``key`` is not exactly 16 bytes long.
        """
        if len(key) != 16:
            raise ValueError('SM4 key must be exactly 16 bytes, got %d' % len(key))
        MK = unpack('>4I', key)
        K = array('L', (MK[i] ^ _SM4_FK[i] for i in range(4)))
        # The generator reads K while extend() is appending to it, so each new
        # word is computed from the words appended just before it.
        K.extend(K[i] ^ _T_key(K[i + 1] ^ K[i + 2] ^ K[i + 3] ^ _SM4_CK[i]) for i in range(32))
        # The first four words are only the seed; the 32 that follow are the
        # round keys.
        self.e_rk = K[4:]
        self.d_rk = array('L', reversed(self.e_rk))

    def ecb_enc(self, data):
        """Encrypt pre-padded ``data`` in ECB mode and return the ciphertext bytes."""
        self._check_data(data)
        return array2byte(_ecb_base(self.e_rk, byte2array(data)))

    def ecb_dec(self, data):
        """Decrypt ``data`` in ECB mode and return the (still padded) plaintext bytes."""
        self._check_data(data)
        return array2byte(_ecb_base(self.d_rk, byte2array(data)))

    def cbc_enc(self, iv, data):
        """Encrypt pre-padded ``data`` in CBC mode with the 16-byte ``iv``."""
        self._check_iv(iv)
        self._check_data(data)
        return array2byte(_cbc_enc(self.e_rk, byte2array(iv), byte2array(data)))

    def cbc_dec(self, iv, data):
        """Decrypt ``data`` in CBC mode with the 16-byte ``iv``; padding is kept."""
        self._check_iv(iv)
        self._check_data(data)
        return array2byte(_cbc_dec(self.d_rk, byte2array(iv), byte2array(data)))
from Crypto.Cipher import AES
from pysmx.SM4 import Sm4, ENCRYPT, DECRYPT
from gmssl.sm4 import CryptSM4, SM4_ENCRYPT, SM4_DECRYPT
from Crypto.Hash import MD5
from pysmx.SM3 import digest as SM3_pysmx
from gmssl.sm3 import sm3_hash as SM3_gmssl
import time, os
# Pad bytes up to a multiple of 16 bytes
def pad(s: bytes) -> bytes:
    """Append 1 to 16 padding bytes, each equal to the number of bytes added.

    A full 16-byte block of padding is appended when ``len(s)`` is already a
    multiple of 16, so the padding can always be removed unambiguously.
    """
    fill = 16 - len(s) % 16
    return s + bytes([fill]) * fill
# Strip the padding added by pad()
def unpad(s: bytes) -> bytes:
    """Remove the padding appended by ``pad`` and return the original bytes.

    The last byte gives the padding length ``n``; the final ``n`` bytes must
    all equal ``n`` and ``n`` must lie in 1..16.

    Raises:
        ValueError: if ``s`` is empty or does not end in valid padding. Without
            this check an empty input would raise IndexError, and a trailing
            zero byte would make the slice ``s[0:-0]`` return ``b''``.
    """
    if not s:
        raise ValueError('cannot unpad empty data')
    n = s[-1]
    if not 1 <= n <= 16 or n > len(s) or s[-n:] != bytes([n]) * n:
        raise ValueError('invalid padding')
    return s[:-n]
# Crypto (PyCryptodome) - AES-128 cipher wrapper
class AES_Cipher:
    """AES wrapper whose key is the MD5 digest of the supplied key material.

    Plaintext is padded with ``pad`` before encryption and stripped with
    ``unpad`` after decryption.
    """

    def __init__(self, key):
        digest = MD5.new(key).digest()
        self._key = digest
        # The ECB cipher object is created once and reused; CBC needs a fresh
        # object per call because it is built with the caller's IV.
        self._ecb = AES.new(digest, AES.MODE_ECB)

    # AES encryption (ECB mode)
    def enc(self, data):
        padded = pad(data)
        return self._ecb.encrypt(padded)

    # AES decryption (ECB mode)
    def dec(self, data):
        assert len(data) % 16 == 0
        padded = self._ecb.decrypt(data)
        return unpad(padded)

    # AES encryption (CBC mode)
    def enc_cbc(self, iv, data):
        assert len(iv) == 16
        cipher = AES.new(self._key, AES.MODE_CBC, iv)
        return cipher.encrypt(pad(data))

    # AES decryption (CBC mode)
    def dec_cbc(self, iv, data):
        assert len(iv) == 16
        assert len(data) % 16 == 0
        cipher = AES.new(self._key, AES.MODE_CBC, iv)
        return unpad(cipher.decrypt(data))
# pysmx - SM4 cipher wrapper
class SM4_pysmx:
    """SM4 wrapper around pysmx, keyed with the pysmx SM3 digest of ``raw_key``.

    Separate ``Sm4`` objects are kept for encryption and decryption. Results
    are converted with ``bytes(...)`` before being returned or unpadded.
    """

    def __init__(self, raw_key):
        key = SM3_pysmx(raw_key)
        self._enc, self._dec = Sm4(), Sm4()
        self._enc.sm4_setkey(key, ENCRYPT)
        self._dec.sm4_setkey(key, DECRYPT)

    # SM4 encryption (ECB mode)
    def enc(self, data):
        ciphertext = self._enc.sm4_crypt_ecb(pad(data))
        return bytes(ciphertext)

    # SM4 decryption (ECB mode)
    def dec(self, data):
        assert len(data) % 16 == 0
        padded = bytes(self._dec.sm4_crypt_ecb(data))
        return unpad(padded)

    # SM4 encryption (CBC mode)
    def enc_cbc(self, iv, data):
        assert len(iv) == 16
        ciphertext = self._enc.sm4_crypt_cbc(iv, pad(data))
        return bytes(ciphertext)

    # SM4 decryption (CBC mode)
    def dec_cbc(self, iv, data):
        assert len(iv) == 16
        assert len(data) % 16 == 0
        padded = bytes(self._dec.sm4_crypt_cbc(iv, data))
        return unpad(padded)
# gmssl - SM4 cipher wrapper
class SM4_gmssl:
    """SM4 wrapper around gmssl's ``CryptSM4``.

    The key is derived with the pysmx SM3 digest, the same derivation the
    other SM4 wrappers in this file use. Unlike those wrappers, no
    ``pad``/``unpad`` is applied here.
    NOTE(review): presumably gmssl pads and unpads internally - confirm
    against the installed gmssl version.
    """

    def __init__(self, raw_key):
        key = SM3_pysmx(raw_key)
        self._enc, self._dec = CryptSM4(), CryptSM4()
        self._enc.set_key(key, SM4_ENCRYPT)
        self._dec.set_key(key, SM4_DECRYPT)

    # SM4 encryption (ECB mode)
    def enc(self, data):
        ciphertext = self._enc.crypt_ecb(data)
        return ciphertext

    # SM4 decryption (ECB mode)
    def dec(self, data):
        assert len(data) % 16 == 0
        plaintext = self._dec.crypt_ecb(data)
        return plaintext

    # SM4 encryption (CBC mode)
    def enc_cbc(self, iv, data):
        assert len(iv) == 16
        ciphertext = self._enc.crypt_cbc(iv, data)
        return ciphertext

    # SM4 decryption (CBC mode)
    def dec_cbc(self, iv, data):
        assert len(iv) == 16
        assert len(data) % 16 == 0
        plaintext = self._dec.crypt_cbc(iv, data)
        return plaintext
# my - SM4 cipher wrapper
class SM4_my:
    """Wrapper around this file's ``SM4`` class that adds padding.

    The cipher key is the first 16 bytes of the pysmx SM3 digest of ``key``.
    """

    def __init__(self, key):
        digest = SM3_pysmx(key)
        self.sm4 = SM4(digest[:16])

    # SM4 encryption (ECB mode)
    def enc(self, data):
        padded = pad(data)
        return self.sm4.ecb_enc(padded)

    # SM4 decryption (ECB mode)
    def dec(self, data):
        assert len(data) % 16 == 0
        padded = self.sm4.ecb_dec(data)
        return unpad(padded)

    # SM4 encryption (CBC mode)
    def enc_cbc(self, iv, data):
        assert len(iv) == 16
        padded = pad(data)
        return self.sm4.cbc_enc(iv, padded)

    # SM4 decryption (CBC mode)
    def dec_cbc(self, iv, data):
        assert len(iv) == 16
        assert len(data) % 16 == 0
        padded = self.sm4.cbc_dec(iv, data)
        return unpad(padded)
def prof_test():
    """Profile SM4_my on one ECB and one CBC round trip and print the stats.

    A 5,000,000-byte random message is encrypted and decrypted in each mode
    under cProfile; both round trips are asserted to restore the message.
    """
    from cProfile import Profile

    long_data = os.urandom(5000000)
    KA = os.urandom(16)
    iv = os.urandom(16)
    sm4_my = SM4_my(KA)

    def prof_func():
        # ECB round trip
        assert sm4_my.dec(sm4_my.enc(long_data)) == long_data
        # CBC round trip
        assert sm4_my.dec_cbc(iv, sm4_my.enc_cbc(iv, long_data)) == long_data

    profiler = Profile()
    profiler.runcall(prof_func)
    profiler.print_stats(1)
    # To keep the raw profile for later inspection:
    # profiler.dump_stats('test.prof')
def _timed_init(message, factory, raw_key):
    """Build ``factory(raw_key)`` and print ``message`` with the elapsed microseconds."""
    started = time.perf_counter()
    cipher = factory(raw_key)
    print(message % ((time.perf_counter() - started) * 1000000))
    return cipher


def _bench_cipher(row_format, enc, dec, short_msgs, long_msgs, prefix=()):
    """Time one cipher on the short and long message sets and print one table row.

    ``enc`` and ``dec`` are called as ``enc(*prefix, message)``; ``prefix`` is
    empty for ECB and ``(iv,)`` for CBC. Both round trips are asserted to
    restore the original messages.

    Returns ``(total_seconds, long_ciphertexts)``.
    """
    t1 = time.perf_counter()
    short_ct = [enc(*prefix, msg) for msg in short_msgs]  # encrypt short messages
    t2 = time.perf_counter()
    short_pt = [dec(*prefix, ct) for ct in short_ct]  # decrypt short messages
    t3 = time.perf_counter()
    long_ct = [enc(*prefix, msg) for msg in long_msgs]  # encrypt long messages
    t4 = time.perf_counter()
    long_pt = [dec(*prefix, ct) for ct in long_ct]  # decrypt long messages
    t5 = time.perf_counter()
    assert short_pt == short_msgs
    assert long_pt == long_msgs
    print(row_format % ((t2 - t1) * 1000, (t3 - t2) * 1000, (t4 - t3) * 1000, (t5 - t4) * 1000))
    return t5 - t1, long_ct


def compare_test():
    """Benchmark AES (PyCryptodome) against the pysmx, gmssl and local SM4 ciphers.

    Prints the construction time of each cipher, then one timing table for
    ECB and one for CBC. Each table cell is the total time in milliseconds for
    ``test_num`` messages. The three SM4 implementations are also asserted to
    produce identical ciphertexts for the long messages.
    """
    # Random 128-bit key shared by all ciphers
    KA = os.urandom(16)

    # Build each cipher, reporting how long construction takes
    aes = _timed_init('Crypto - AES-128 初始化耗时:%d us', AES_Cipher, KA)
    sm4_smx = _timed_init('pysmx - SM4 初始化耗时:%d us', SM4_pysmx, KA)
    sm4_ssl = _timed_init('gmssl - SM4 初始化耗时:%d us', SM4_gmssl, KA)
    sm4_my = _timed_init('my - SM4 初始化耗时:%d us', SM4_my, KA)

    # NOTE: the original author observed that the first encryption call of a
    # cipher is much slower than later calls; no warm-up call is made here.

    test_num = 50  # number of messages per measurement
    # Every buffer is sized from test_num, so changing it cannot desynchronise
    # the result lists from the message lists.
    short_data = [os.urandom(28) for _ in range(test_num)]  # short messages
    long_data = [os.urandom(1128) for _ in range(test_num)]  # long messages

    print('——————————————————————加解密测试——————————————————————')
    print('短消息长度:%dB 长消息长度:%dB 测试次数:%d 单位:ms' % (len(short_data[0]), len(long_data[0]), test_num))

    rows = (
        ('aes', 'Crypto-AES\t%.2f\t\t%.2f\t\t%.2f\t\t%.2f', aes),
        ('pysmx', 'pysmx-SM4\t%.2f\t\t%.2f\t\t%.2f\t\t%.2f', sm4_smx),
        ('gmssl', 'gmssl-SM4\t%.2f\t\t%.2f\t\t%.2f\t\t%.2f', sm4_ssl),
        ('my', 'my-SM4\t\t%.2f\t\t%.2f\t\t%.2f\t\t%.2f', sm4_my),
    )

    def run_mode(header, methods, prefix=()):
        """Print one timing table; ``methods(cipher)`` returns its (enc, dec) pair."""
        print(header)
        elapsed = {}
        long_ct = {}
        for name, row_format, cipher in rows:
            enc, dec = methods(cipher)
            elapsed[name], long_ct[name] = _bench_cipher(
                row_format, enc, dec, short_data, long_data, prefix)
        print('总耗时为gmssl的%.2f%%、pysmx的%.2f%%'
              % (elapsed['my'] / elapsed['gmssl'] * 100, elapsed['my'] / elapsed['pysmx'] * 100))
        # All three SM4 implementations must agree on the long-message ciphertexts.
        assert long_ct['pysmx'] == long_ct['gmssl'] == long_ct['my']

    run_mode('\nECB模式:\n算法库名\t\t加密短消息\t解密短消息\t加密长消息\t解密长消息',
             lambda cipher: (cipher.enc, cipher.dec))
    iv = os.urandom(16)
    run_mode('\nCBC模式:\n算法库名\t\t加密短消息\t解密短消息\t加密长消息\t解密长消息',
             lambda cipher: (cipher.enc_cbc, cipher.dec_cbc), (iv,))
if __name__ == "__main__":
    # Script entry point: run the comparison benchmark.
    # Call prof_test() here instead to profile the local SM4 implementation.
    compare_test()
SM4的核心代码是前100行,后面是测试用的。关于本SM4代码的用法,跟着测试代码学就行。上述代码保证可以运行,前提是先安装好依赖的库。
测试代码构造了四个密码算法(Crypto-AES、pysmx-SM4、gmssl-SM4和my-SM4)加密器,并比较四者初始化、ECB和CBC两种模式下加解密的耗时,执行结果详见下图(表中的结果数值不是平均值,是50次执行的总耗时):
不同的计算机配置下结果可能有细微差别,我的配置是:
可见,本SM4代码的加解密效率比另两个国密算法库高了不少(耗时约为其1/3 ~ 1/4),我做了哪些改进呢?
1.构造S盒快表。S盒转换是高频操作,传统做法是用256字节的S盒表逐个字节转换,我构造了一个S盒快表(见代码中_SM4_S_BOX_FAST的构造部分),两字节合并转换,使得S盒查表操作少了一半,配套的“移位”和“按位与”运算也减少一半多。S盒快表大小为65536×2=131072B=128KB,构造的速度也很快,这种以空间换时间的做法是很合算的。
2.减少函数调用。代码复用减少了代码量,看起来更清爽,但调用频次过高的代码不宜写成函数,因为函数调用也是有开销的。当函数调用的开销和函数本身的计算耗时相差无几的时候,调用就成了负担,而且合并以后,发现有些计算是冗余的。这主要体现在L线性变换,把循环移位的函数去掉,效率提升不少。
3.减少类型转换。在前两个库的SM4代码中,各函数间传递的数据类型在bytes和列表之间反复转换,其实没有必要。函数的中间值统一用一种数据类型,我尝试统一成bytes或int,经过反复对比测试,int比较快且实现简单一点(我想吐槽一下python的bytes类型不支持位运算!(¬︿̫̿¬☆))。
上述三点针对所有编程语言都是适用的。还有很多针对python的优化,比如用array代替list、用生成器代替列表推导、能改原数组就不要生成新数组等等。Profile库是个不错的调优工具,可得知每个中间操作的运行时间,以找到最耗时的操作并集中精力去攻克它,以免错把精力放在没有多少油水可压榨的地方(我就做了很多这样的无用功/(ㄒoㄒ)/~~)。
这回真切地体会到先把书读厚再读薄的感觉,各种实现方式的测试代码写了又改,增了又删,可能不下2000行,最后浓缩成这100行自认为在目前python下较为接近最优解的代码(*^_^*)!
但是,处理短消息耗时离AES还差一个数量级,长消息差更多!毕竟机器码在python面前是降维打击。所以下一步将使用numpy+numba对上述代码进行改写。numba库用来把最耗时的操作(主要是循环)先编译成机器码再执行(代码受到诸多限制,不能再像Pythonic 那么天马行空,需配套使用numpy),引入多线程并行,以达到甚至部分超越PyCryptodome库的AES!
奋战数日,有些疲惫,下次分享加入numba的SM4代码,效果先看图:
可见,这样的效率硬刚AES已经不吃亏了。
自主创新等不来、买不来、要不来,只能脚踏实地一点一滴创造出来。支持国密,研究国密,使用国密,将网络信息安全牢牢掌握在自己手中!