本文的 SHA-256 SIMD 优化思路,来自我在网上看到的一篇关于 SHA-1 优化的文章:我把那篇文章的思想移植到了 SHA-256 上。那篇文章我已经放到了网盘里,有需要的同学可以下载:
点我下载文章
提取码:4l7x
SHA-256 算法本身的原理这里不再介绍,网上可以搜到不少资料;下面只贴出实现代码(使用 SSE2 指令,一次同时计算 4 条消息的哈希),供同学参考:
#include
#include
#include
#include
#include
#include
#include
#include
// Payload size of the global `text` buffer, which is declared with NUM + 1 bytes.
#define NUM 2
// MSVC only: silence C4996 ("this function may be unsafe", raised for sprintf).
// Guarded so other compilers do not see an unknown pragma; no trailing ';'.
#ifdef _MSC_VER
#pragma warning(disable:4996)
#endif
// CRT "secure" deprecation switches, defined as empty macros (the previous
// definitions expanded to a stray ';').
// NOTE(review): these only influence the CRT headers when they are defined
// before those headers are included; at this position the pragma above is what
// actually suppresses the warning.
#ifndef _CRT_SECURE_NO_DEPRECATE
#define _CRT_SECURE_NO_DEPRECATE
#endif
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS
#endif
// Lane-wise SHA-256 primitives: every macro operates on all four 32-bit lanes
// of an __m128i at once. Arguments are fully parenthesised, but they are still
// evaluated more than once, so pass side-effect-free expressions only.

// Rotate each lane of `a` left by `b` bits (0 < b < 32). _mm_srli_epi32 is a
// logical shift that already zero-fills, so no extra masking is required.
#define SHA256_ROTL(a,b) (_mm_or_si128(_mm_srli_epi32((a), 32 - (b)), _mm_slli_epi32((a), (b))))
// Logical shift of each lane of `a` right by `b` bits.
#define SHA256_SR(a,b) (_mm_srli_epi32((a), (b)))
// Ch(x,y,z) = (x & y) ^ (~x & z); _mm_andnot_si128(x, z) computes (~x & z).
#define SHA256_Ch(x,y,z) (_mm_xor_si128(_mm_and_si128((x), (y)), _mm_andnot_si128((x), (z))))
// Maj(x,y,z) = (x & y) ^ (x & z) ^ (y & z)
#define SHA256_Maj(x,y,z) (_mm_xor_si128(_mm_xor_si128(_mm_and_si128((x), (y)), _mm_and_si128((x), (z))), _mm_and_si128((y), (z))))
// Big sigma 0: ROTR2 ^ ROTR13 ^ ROTR22, written as left rotations by 30/19/10.
#define SHA256_E0(x) (_mm_xor_si128(_mm_xor_si128(SHA256_ROTL((x), 30), SHA256_ROTL((x), 19)), SHA256_ROTL((x), 10)))
// Big sigma 1: ROTR6 ^ ROTR11 ^ ROTR25, written as left rotations by 26/21/7.
#define SHA256_E1(x) (_mm_xor_si128(_mm_xor_si128(SHA256_ROTL((x), 26), SHA256_ROTL((x), 21)), SHA256_ROTL((x), 7)))
// Small sigma 0: ROTR7 ^ ROTR18 ^ SHR3.
#define SHA256_O0(x) (_mm_xor_si128(_mm_xor_si128(SHA256_ROTL((x), 25), SHA256_ROTL((x), 14)), SHA256_SR((x), 3)))
// Small sigma 1: ROTR17 ^ ROTR19 ^ SHR10.
#define SHA256_O1(x) (_mm_xor_si128(_mm_xor_si128(SHA256_ROTL((x), 15), SHA256_ROTL((x), 13)), SHA256_SR((x), 10)))
// NOTE(review): file-wide using-directive; none of the functions visible in
// this file rely on an unqualified std:: name.
using namespace std;
// Global buffer of NUM + 1 bytes. It is not referenced by init, StrSHA256 or
// main as shown here; presumably kept for code outside this view - confirm
// before removing.
char text[NUM + 1];
// Builds the single padded SHA-256 message block for each of four short
// messages.
//
// ss      four input messages, one per row; only the first `length` bytes of
//         each row are read (rows are 4 bytes wide, no terminator required).
// length  number of message bytes per row, 0..4.
// M       output: M[k][0..15] receives the sixteen 32-bit words of the padded
//         block of message k. Message bytes are packed most-significant byte
//         first, followed by the 0x80 terminator, zero fill, and the message
//         length in bits in words 14 (high part) and 15 (low part). Each word
//         is stored as its 32-bit pattern converted through `int`.
//
// Returns 0 on success. Returns -1, leaving M untouched, when a pointer is
// null or `length` does not fit a 4-byte row.
//
// The block is assembled arithmetically in 32-bit words instead of writing
// bytes into a heap buffer and reinterpreting it as `long*`; that approach
// overran the buffer wherever `long` is wider than 32 bits. This code assumes
// `unsigned int` is 32 bits wide.
int init(char ss[4][4], int length, long M[4][16])
{
    if (!ss || !M || length < 0 || length > 4) return -1;

    for (int k = 0; k < 4; ++k)
    {
        unsigned int words[16] = { 0 };

        // Pack the message bytes big-endian: byte i lands in word i / 4 at
        // bit offset 24 - 8 * (i % 4).
        for (int i = 0; i < length; ++i)
        {
            const unsigned int byte = static_cast<unsigned char>(ss[k][i]);
            words[i / 4] |= byte << (24 - 8 * (i % 4));
        }

        // Mandatory 0x80 terminator directly after the last message byte.
        words[length / 4] |= 0x80u << (24 - 8 * (length % 4));

        // 64-bit message length in bits, split over the last two words.
        words[14] = static_cast<unsigned int>(length) >> 29;
        words[15] = static_cast<unsigned int>(length) << 3;

        for (int i = 0; i < 16; ++i)
        {
            M[k][i] = static_cast<long>(static_cast<int>(words[i]));
        }
    }
    return 0;
}
// ---- Four-lane SHA-256 helpers (one message per 32-bit lane of an __m128i) ----

// Rotates every 32-bit lane of `v` right by N bits (0 < N < 32).
template <int N>
static inline __m128i sha256x4_rotr(__m128i v)
{
    return _mm_or_si128(_mm_srli_epi32(v, N), _mm_slli_epi32(v, 32 - N));
}

// Broadcasts one 32-bit constant into all four lanes.
static inline __m128i sha256x4_splat(unsigned int w)
{
    return _mm_set1_epi32(static_cast<int>(w));
}

// Reduces a message word held in a `long` to its low 32 bits.
static inline int sha256x4_word(long v)
{
    return static_cast<int>(static_cast<unsigned int>(v));
}

// Big sigma 0: ROTR2 ^ ROTR13 ^ ROTR22.
static inline __m128i sha256x4_big_sigma0(__m128i x)
{
    return _mm_xor_si128(_mm_xor_si128(sha256x4_rotr<2>(x), sha256x4_rotr<13>(x)), sha256x4_rotr<22>(x));
}

// Big sigma 1: ROTR6 ^ ROTR11 ^ ROTR25.
static inline __m128i sha256x4_big_sigma1(__m128i x)
{
    return _mm_xor_si128(_mm_xor_si128(sha256x4_rotr<6>(x), sha256x4_rotr<11>(x)), sha256x4_rotr<25>(x));
}

// Small sigma 0: ROTR7 ^ ROTR18 ^ SHR3.
static inline __m128i sha256x4_small_sigma0(__m128i x)
{
    return _mm_xor_si128(_mm_xor_si128(sha256x4_rotr<7>(x), sha256x4_rotr<18>(x)), _mm_srli_epi32(x, 3));
}

// Small sigma 1: ROTR17 ^ ROTR19 ^ SHR10.
static inline __m128i sha256x4_small_sigma1(__m128i x)
{
    return _mm_xor_si128(_mm_xor_si128(sha256x4_rotr<17>(x), sha256x4_rotr<19>(x)), _mm_srli_epi32(x, 10));
}

// Ch(x,y,z) = (x & y) ^ (~x & z).
static inline __m128i sha256x4_ch(__m128i x, __m128i y, __m128i z)
{
    return _mm_xor_si128(_mm_and_si128(x, y), _mm_andnot_si128(x, z));
}

// Maj(x,y,z) = (x & y) ^ (x & z) ^ (y & z).
static inline __m128i sha256x4_maj(__m128i x, __m128i y, __m128i z)
{
    return _mm_xor_si128(_mm_xor_si128(_mm_and_si128(x, y), _mm_and_si128(x, z)), _mm_and_si128(y, z));
}

// Computes the SHA-256 digests of four single-block messages in parallel,
// one message per 32-bit SSE2 lane.
//
// M        four already padded 512-bit blocks, M[k][0..15]; only the low
//          32 bits of every element are used.
// length   unused; the message length is already encoded in the padded blocks.
//          Kept for interface compatibility.
// sha2560..sha2563
//          output buffers of at least 65 bytes each; every one receives 64
//          upper-case hex digits plus a terminating NUL. A null output
//          pointer is skipped.
//
// Lane/output mapping: M[0] is loaded into the highest lane and M[3] into the
// lowest, and output buffer j is written from lane j. So sha2563 holds the
// digest of M[0], sha2562 of M[1], sha2561 of M[2] and sha2560 of M[3].
void StrSHA256(long M[4][16], long long length, char* sha2560,char* sha2561,char* sha2562,char* sha2563){
    (void)length;
    if (!M) return;

    static const unsigned int kInitialHash[8] = {
        0x6a09e667u, 0xbb67ae85u, 0x3c6ef372u, 0xa54ff53au,
        0x510e527fu, 0x9b05688cu, 0x1f83d9abu, 0x5be0cd19u,
    };
    static const unsigned int kRoundConstants[64] = {
        0x428a2f98u, 0x71374491u, 0xb5c0fbcfu, 0xe9b5dba5u, 0x3956c25bu, 0x59f111f1u, 0x923f82a4u, 0xab1c5ed5u,
        0xd807aa98u, 0x12835b01u, 0x243185beu, 0x550c7dc3u, 0x72be5d74u, 0x80deb1feu, 0x9bdc06a7u, 0xc19bf174u,
        0xe49b69c1u, 0xefbe4786u, 0x0fc19dc6u, 0x240ca1ccu, 0x2de92c6fu, 0x4a7484aau, 0x5cb0a9dcu, 0x76f988dau,
        0x983e5152u, 0xa831c66du, 0xb00327c8u, 0xbf597fc7u, 0xc6e00bf3u, 0xd5a79147u, 0x06ca6351u, 0x14292967u,
        0x27b70a85u, 0x2e1b2138u, 0x4d2c6dfcu, 0x53380d13u, 0x650a7354u, 0x766a0abbu, 0x81c2c92eu, 0x92722c85u,
        0xa2bfe8a1u, 0xa81a664bu, 0xc24b8b70u, 0xc76c51a3u, 0xd192e819u, 0xd6990624u, 0xf40e3585u, 0x106aa070u,
        0x19a4c116u, 0x1e376c08u, 0x2748774cu, 0x34b0bcb5u, 0x391c0cb3u, 0x4ed8aa4au, 0x5b9cca4fu, 0x682e6ff3u,
        0x748f82eeu, 0x78a5636fu, 0x84c87814u, 0x8cc70208u, 0x90befffau, 0xa4506cebu, 0xbef9a3f7u, 0xc67178f2u,
    };

    // Message schedule: the first 16 words come straight from the blocks,
    // interleaved so that lane 3 carries M[0] and lane 0 carries M[3].
    __m128i W[64];
    for (int t = 0; t < 16; ++t) {
        W[t] = _mm_set_epi32(sha256x4_word(M[0][t]), sha256x4_word(M[1][t]),
                             sha256x4_word(M[2][t]), sha256x4_word(M[3][t]));
    }
    for (int t = 16; t < 64; ++t) {
        const __m128i recent = _mm_add_epi32(sha256x4_small_sigma1(W[t - 2]), W[t - 7]);
        const __m128i older = _mm_add_epi32(sha256x4_small_sigma0(W[t - 15]), W[t - 16]);
        W[t] = _mm_add_epi32(recent, older);
    }

    // Working variables a..h start from the initial hash value.
    __m128i a = sha256x4_splat(kInitialHash[0]);
    __m128i b = sha256x4_splat(kInitialHash[1]);
    __m128i c = sha256x4_splat(kInitialHash[2]);
    __m128i d = sha256x4_splat(kInitialHash[3]);
    __m128i e = sha256x4_splat(kInitialHash[4]);
    __m128i f = sha256x4_splat(kInitialHash[5]);
    __m128i g = sha256x4_splat(kInitialHash[6]);
    __m128i h = sha256x4_splat(kInitialHash[7]);

    for (int t = 0; t < 64; ++t) {
        // T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t]
        __m128i t1 = _mm_add_epi32(sha256x4_splat(kRoundConstants[t]), W[t]);
        t1 = _mm_add_epi32(t1, sha256x4_ch(e, f, g));
        t1 = _mm_add_epi32(t1, sha256x4_big_sigma1(e));
        t1 = _mm_add_epi32(t1, h);
        // T2 = Sigma0(a) + Maj(a,b,c)
        const __m128i t2 = _mm_add_epi32(sha256x4_big_sigma0(a), sha256x4_maj(a, b, c));

        h = g;
        g = f;
        f = e;
        e = _mm_add_epi32(d, t1);
        d = c;
        c = b;
        b = a;
        a = _mm_add_epi32(t1, t2);
    }

    // Final hash = initial hash + working variables. The lanes are stored into
    // 32-bit elements so that lanes[i][j] is hash word i of lane j.
    const __m128i state[8] = { a, b, c, d, e, f, g, h };
    unsigned int lanes[8][4];
    for (int i = 0; i < 8; ++i) {
        const __m128i sum = _mm_add_epi32(sha256x4_splat(kInitialHash[i]), state[i]);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(lanes[i]), sum);
    }

    char* const outputs[4] = { sha2560, sha2561, sha2562, sha2563 };
    for (int j = 0; j < 4; ++j) {
        if (!outputs[j]) continue;
        sprintf(outputs[j], "%08X%08X%08X%08X%08X%08X%08X%08X",
                lanes[0][j], lanes[1][j], lanes[2][j], lanes[3][j],
                lanes[4][j], lanes[5][j], lanes[6][j], lanes[7][j]);
    }
}
int main() {
char ss[4][4] = { "abc","ABC","bcd","efg" };
long M[4][16];
init(ss, 3, M);
char sha2560[65],sha2561[65],sha2562[65],sha2563[65];
for (int i = 0; i < 1; i++)
{
LARGE_INTEGER num;
long long start, end, freq;
QueryPerformanceFrequency(&num);
freq = num.QuadPart;
QueryPerformanceCounter(&num);
start = num.QuadPart;
for (int j = 0; j < 1; j++)
{
StrSHA256(M,3,sha2560,sha2561,sha2562,sha2563);
}
QueryPerformanceCounter(&num);
end = num.QuadPart;
printf("time=%d 毫秒\n", (end - start) * 1000 / freq);
puts(sha2563);
puts(sha2562);
puts(sha2561);
puts(sha2560);
}
return 0;
}