download the code
昨天实现了基于int类型的strstr函数,可以获得1~2X左右的加速。今天按 lstrstr的流程实现了基于SSE2的STRSTR函数。可以得到2~4X左右的加速。
昨天实现了基于int类型的strstr函数,可以获得1~2X左右的加速。今天按 lstrstr的流程实现了基于SSE2的STRSTR函数。可以得到2~4X左右的加速。
1
char
*
lstrstrsse(
char
*
text,
char
*
pattern)
2 {
3 __m128i * sseiPtr = (__m128i * ) text;
4 unsigned char * chPtrAligned = (unsigned char * )text;
5 __m128i sseiWord0 ; // = *sseiPtr ;
6 __m128i sseiWord1 ; // = *sseiPtr ;
7 __m128i sseiZero = _mm_set1_epi8( 0 );
8 char chara = pattern[ 0 ];
9 char charb = pattern[ 1 ];
10 register __m128i byte16a;
11 register __m128i byte16b;
12 char * bytePtr = text;
13 if (pattern == NULL) return NULL;
14 if (pattern[ 0 ] == 0 ) return NULL;
15 if (pattern[ 1 ] == 0 ) return lstrchr(text,pattern[ 0 ]);
16 byte16a = _mm_set1_epi8(chara);
17 byte16b = _mm_set1_epi8(charb);
18 // process the unaligned bytes
19
20 // the aligned bytes
21 alignStart:
22 sseiWord0 = * sseiPtr;
23 sseiWord1 = * (sseiPtr + 1 );
24 while ( haszeroByte(sseiWord0,sseiWord1,sseiZero) == 0 )
25 {
26 unsigned int reta ;
27 searcha:
28 reta = hasByteC(sseiWord0,sseiWord1, byte16a);
29 if (reta != 0 ) {
30 unsigned int retb ;
31 findouta:
32 retb = hasByteC(sseiWord0,sseiWord1, byte16b);
33 findoutb:
34 if (((reta << 1 ) & retb)){
35 // have ab
36 int i = 1 ;
37 char * bytePtr0 = ( char * ) ( sseiPtr );
38 int j;
39 // printf("test::%0x,%d\n",reta ,bytePtr0 -text);
40 bytePtr = ( char * ) ( sseiPtr );
41 for (j = 0 ;j < 8 ;j ++ ){
42 if (reta & 0xff ) {
43 if (bytePtr0[ 0 ] == chara){
44 i = 1 ;
45 bytePtr = bytePtr0 ;
46 while ((pattern[i] ) && (bytePtr[i] == pattern[i])) i ++ ;
47 if (pattern[i] == 0 ) return bytePtr;
48 }
49 if (bytePtr0[ 1 ] == chara){
50 i = 1 ;
51 bytePtr = bytePtr0 + 1 ;
52 while ((pattern[i] ) && (bytePtr[i] == pattern[i])) i ++ ;
53 if (pattern[i] == 0 ) return bytePtr;
54 }
55 if (bytePtr0[ 2 ] == chara){
56 i = 1 ;
57 bytePtr = bytePtr0 + 2 ;
58 while ((pattern[i] ) && (bytePtr[i] == pattern[i])) i ++ ;
59 if (pattern[i] == 0 ) return bytePtr;
60 }
61 if (bytePtr0[ 3 ] == chara){
62 i = 1 ;
63 bytePtr = bytePtr0 + 3 ;
64 while ((pattern[i] ) && (bytePtr[i] == pattern[i])) i ++ ;
65 if (pattern[i] == 0 ) return bytePtr;
66 }
67 }
68 reta = reta >> 4 ;
69 bytePtr0 += 4 ;
70 }
71 }
72 // search b
73 sseiPtr += 2 ;
74 sseiWord0 = * sseiPtr;
75 sseiWord1 = * (sseiPtr + 1 );
76
77 while ( haszeroByte(sseiWord0,sseiWord1,sseiZero) == 0 ){
78 retb = hasByteC(sseiWord0,sseiWord1, byte16b);
79 if (retb != 0 ){
80 // findout b
81 if (( * (( char * ) sseiPtr)) == charb){
82 // b000
83 char * bytePtr = (( char * ) ( sseiPtr )) - 1 ;
84 if (bytePtr[ 0 ] == chara){
85 int i = 1 ;
86 while ((pattern[i] ) && (bytePtr[i] == pattern[i])) i ++ ;
87 if (pattern[i] == 0 ) return bytePtr;
88 if (bytePtr[i] == 0 ) return NULL;
89 }
90
91 }
92 reta = hasByteC(sseiWord0,sseiWord1, byte16a);
93 if (reta != 0 )
94 goto findoutb;
95 else {
96 goto nextWord;
97 }
98 }
99 sseiPtr += 2 ;
100 sseiWord0 = * sseiPtr;
101 sseiWord1 = * (sseiPtr + 1 );
102 }
103 // search from (char*)sseiPtr
104 char * bytePtr = (( char * ) ( sseiPtr )) - 1 ;
105 if (bytePtr[ 0 ] == chara){
106 int i = 1 ;
107 while ((pattern[i] ) && (bytePtr[i] == pattern[i])) i ++ ;
108 if (pattern[i] == 0 ) return bytePtr;
109 }
110
111 goto prePareForEnd;
112 }
113 nextWord:
114 sseiPtr += 2 ;
115 sseiWord0 = * sseiPtr;
116 sseiWord1 = * (sseiPtr + 1 );
117 }
118 prePareForEnd:
119 {
120 unsigned int reta;
121 unsigned int retb;
122 reta = hasByteC(sseiWord0,sseiWord1, byte16a);
123 retb = hasByteC(sseiWord0,sseiWord1, byte16b);
124 if (((reta << 1 ) & retb)){
125 bytePtr = ( char * )sseiPtr;
126 while ( * bytePtr){
127 if ( * bytePtr == chara) {
128 int i = 1 ;
129 while ((pattern[i] ) && (bytePtr[i] == pattern[i])) i ++ ;
130 if (pattern[i] == 0 ) return bytePtr;
131 if (bytePtr[i] == 0 ) return NULL;
132
133 }
134 bytePtr ++ ;
135 }
136 }
137 }
138 return NULL;
139 }
140
2 {
3 __m128i * sseiPtr = (__m128i * ) text;
4 unsigned char * chPtrAligned = (unsigned char * )text;
5 __m128i sseiWord0 ; // = *sseiPtr ;
6 __m128i sseiWord1 ; // = *sseiPtr ;
7 __m128i sseiZero = _mm_set1_epi8( 0 );
8 char chara = pattern[ 0 ];
9 char charb = pattern[ 1 ];
10 register __m128i byte16a;
11 register __m128i byte16b;
12 char * bytePtr = text;
13 if (pattern == NULL) return NULL;
14 if (pattern[ 0 ] == 0 ) return NULL;
15 if (pattern[ 1 ] == 0 ) return lstrchr(text,pattern[ 0 ]);
16 byte16a = _mm_set1_epi8(chara);
17 byte16b = _mm_set1_epi8(charb);
18 // process the unaligned bytes
19
20 // the aligned bytes
21 alignStart:
22 sseiWord0 = * sseiPtr;
23 sseiWord1 = * (sseiPtr + 1 );
24 while ( haszeroByte(sseiWord0,sseiWord1,sseiZero) == 0 )
25 {
26 unsigned int reta ;
27 searcha:
28 reta = hasByteC(sseiWord0,sseiWord1, byte16a);
29 if (reta != 0 ) {
30 unsigned int retb ;
31 findouta:
32 retb = hasByteC(sseiWord0,sseiWord1, byte16b);
33 findoutb:
34 if (((reta << 1 ) & retb)){
35 // have ab
36 int i = 1 ;
37 char * bytePtr0 = ( char * ) ( sseiPtr );
38 int j;
39 // printf("test::%0x,%d\n",reta ,bytePtr0 -text);
40 bytePtr = ( char * ) ( sseiPtr );
41 for (j = 0 ;j < 8 ;j ++ ){
42 if (reta & 0xff ) {
43 if (bytePtr0[ 0 ] == chara){
44 i = 1 ;
45 bytePtr = bytePtr0 ;
46 while ((pattern[i] ) && (bytePtr[i] == pattern[i])) i ++ ;
47 if (pattern[i] == 0 ) return bytePtr;
48 }
49 if (bytePtr0[ 1 ] == chara){
50 i = 1 ;
51 bytePtr = bytePtr0 + 1 ;
52 while ((pattern[i] ) && (bytePtr[i] == pattern[i])) i ++ ;
53 if (pattern[i] == 0 ) return bytePtr;
54 }
55 if (bytePtr0[ 2 ] == chara){
56 i = 1 ;
57 bytePtr = bytePtr0 + 2 ;
58 while ((pattern[i] ) && (bytePtr[i] == pattern[i])) i ++ ;
59 if (pattern[i] == 0 ) return bytePtr;
60 }
61 if (bytePtr0[ 3 ] == chara){
62 i = 1 ;
63 bytePtr = bytePtr0 + 3 ;
64 while ((pattern[i] ) && (bytePtr[i] == pattern[i])) i ++ ;
65 if (pattern[i] == 0 ) return bytePtr;
66 }
67 }
68 reta = reta >> 4 ;
69 bytePtr0 += 4 ;
70 }
71 }
72 // search b
73 sseiPtr += 2 ;
74 sseiWord0 = * sseiPtr;
75 sseiWord1 = * (sseiPtr + 1 );
76
77 while ( haszeroByte(sseiWord0,sseiWord1,sseiZero) == 0 ){
78 retb = hasByteC(sseiWord0,sseiWord1, byte16b);
79 if (retb != 0 ){
80 // findout b
81 if (( * (( char * ) sseiPtr)) == charb){
82 // b000
83 char * bytePtr = (( char * ) ( sseiPtr )) - 1 ;
84 if (bytePtr[ 0 ] == chara){
85 int i = 1 ;
86 while ((pattern[i] ) && (bytePtr[i] == pattern[i])) i ++ ;
87 if (pattern[i] == 0 ) return bytePtr;
88 if (bytePtr[i] == 0 ) return NULL;
89 }
90
91 }
92 reta = hasByteC(sseiWord0,sseiWord1, byte16a);
93 if (reta != 0 )
94 goto findoutb;
95 else {
96 goto nextWord;
97 }
98 }
99 sseiPtr += 2 ;
100 sseiWord0 = * sseiPtr;
101 sseiWord1 = * (sseiPtr + 1 );
102 }
103 // search from (char*)sseiPtr
104 char * bytePtr = (( char * ) ( sseiPtr )) - 1 ;
105 if (bytePtr[ 0 ] == chara){
106 int i = 1 ;
107 while ((pattern[i] ) && (bytePtr[i] == pattern[i])) i ++ ;
108 if (pattern[i] == 0 ) return bytePtr;
109 }
110
111 goto prePareForEnd;
112 }
113 nextWord:
114 sseiPtr += 2 ;
115 sseiWord0 = * sseiPtr;
116 sseiWord1 = * (sseiPtr + 1 );
117 }
118 prePareForEnd:
119 {
120 unsigned int reta;
121 unsigned int retb;
122 reta = hasByteC(sseiWord0,sseiWord1, byte16a);
123 retb = hasByteC(sseiWord0,sseiWord1, byte16b);
124 if (((reta << 1 ) & retb)){
125 bytePtr = ( char * )sseiPtr;
126 while ( * bytePtr){
127 if ( * bytePtr == chara) {
128 int i = 1 ;
129 while ((pattern[i] ) && (bytePtr[i] == pattern[i])) i ++ ;
130 if (pattern[i] == 0 ) return bytePtr;
131 if (bytePtr[i] == 0 ) return NULL;
132
133 }
134 bytePtr ++ ;
135 }
136 }
137 }
138 return NULL;
139 }
140