Sunday 算法是据我所知目前最快的单模式字符串匹配算法。由于请求包体中可能含有二进制数据, 所以我把 Sunday 算法改造成了支持二进制串匹配的版本。
sunday算法的原理不多说,网上一搜一大把, 下面贴下我的实现:
/*
 * @desc   : Sunday string pattern matching algorithm. Works on raw byte
 *           ranges, so both text and pattern may contain embedded NUL bytes.
 * @author : nemozhang
 */
#ifndef __SUNDAY_H_20111203__
#define __SUNDAY_H_20111203__

#include <stddef.h>
#include <stdio.h>
#include <vector>

// NOTE: this only guards against a previous *macro* named u_char. Kept for
// backward compatibility with code that relies on it; the class itself uses
// plain `unsigned char`.
#ifndef u_char
#define u_char unsigned char
#endif

/*
 * Single-pattern searcher.
 *
 * Usage: call SetPatten() once, then Search() any number of times.
 *
 * Lifetime: the pattern bytes are NOT copied. The range handed to SetPatten()
 * must stay valid and unmodified for as long as Search() may be called.
 */
class SundayAlgo
{
public:
    enum
    {
        JUMP_TABLE_LEN = 256    // one shift entry per possible byte value
    };

    enum
    {
        MATCH_RULE_STEP_ONE_CHAR   = 0,    // after a hit, resume one byte past the hit start (overlapping hits)
        MATCH_RULE_STEP_ONE_PATTEN = 1,    // after a hit, resume one pattern length past the hit start (no overlap)
    };

public:
    SundayAlgo():
        _jump_table_inited(false),
        _pat_start(0),
        _pat_end(0),
        _match_rule(MATCH_RULE_STEP_ONE_CHAR)
    {}

public:
    /*
     * Finds the first occurrence of the current pattern in [text_start, text_end).
     *
     * @return offset of the match relative to text_start, or -1 when there is
     *         no match, the text is empty, or no valid pattern has been set.
     *         The offset is returned as int, so texts larger than INT_MAX
     *         bytes are not supported.
     */
    int Search(const char *text_start, const char *text_end) const
    {
        if (!_jump_table_inited || text_start == 0 || text_start >= text_end)
        {
            return -1;
        }

        const ptrdiff_t text_len = text_end - text_start;
        const ptrdiff_t pat_len  = _pat_end - _pat_start;
        if (pat_len > text_len)
        {
            return -1;
        }

        ptrdiff_t i = 0;
        while (i + pat_len <= text_len)
        {
            // Compare the current window against the pattern, back to front.
            ptrdiff_t k = pat_len - 1;
            while (k >= 0 && text_start[i + k] == _pat_start[k])
            {
                --k;
            }
            if (k < 0)
            {
                return static_cast<int>(i);
            }

            // Mismatch in the last possible window: there is no byte after
            // the window to drive the shift, and reading it would run past
            // text_end.
            if (i + pat_len >= text_len)
            {
                break;
            }

            // Sunday shift: decided by the byte just past the window.
            const unsigned char next_c = static_cast<unsigned char>(text_start[i + pat_len]);
            i += _jump_table[next_c];
        }

        return -1;
    }

    /*
     * Appends the offset (relative to text_start) of every match in
     * [text_start, text_end) to pos_vec. Existing elements of pos_vec are
     * kept. Whether overlapping matches are reported depends on the rule set
     * via SetMatchRule().
     */
    void Search(const char *text_start, const char *text_end, std::vector<int> &pos_vec) const
    {
        const ptrdiff_t pat_len = _pat_end - _pat_start;
        // Any rule other than STEP_ONE_CHAR skips a whole pattern length.
        const ptrdiff_t step = (MATCH_RULE_STEP_ONE_CHAR == _match_rule) ? 1 : pat_len;

        const char *cursor = text_start;
        for (;;)
        {
            const int pos = Search(cursor, text_end);
            if (pos < 0)
            {
                break;
            }
            pos_vec.push_back(static_cast<int>((cursor - text_start) + pos));
            // A hit implies pat_len >= 1, so step >= 1 and the cursor always
            // advances; it never moves beyond text_end.
            cursor += pos + step;
        }
    }

    /*
     * Sets the pattern to [pat_start, pat_end) (pat_end is exclusive) and
     * rebuilds the shift table. An empty or invalid range clears the current
     * pattern, after which Search() reports no match.
     */
    void SetPatten(const char* pat_start, const char* pat_end)
    {
        // Drop any previous pattern first so a rejected range cannot leave a
        // stale shift table marked as valid.
        _jump_table_inited = false;
        _pat_start = 0;
        _pat_end = 0;

        if (pat_start == 0 || pat_start >= pat_end)
        {
            return;
        }

        _pat_start = pat_start;
        _pat_end = pat_end;
        PreCompute(pat_start, pat_end);
    }

    /*
     * Selects how the multi-match Search() continues after a hit.
     * Example with text "aaaaaa" and pattern "aaa":
     *   MATCH_RULE_STEP_ONE_CHAR   -> 4 matches (offsets 0, 1, 2, 3)
     *   MATCH_RULE_STEP_ONE_PATTEN -> 2 matches (offsets 0, 3)
     */
    void SetMatchRule(int rule)
    {
        _match_rule = rule;
    }

private:
    // Builds the shift table for the pattern [pat_start, pat_end).
    void PreCompute(const char* pat_start, const char* pat_end)
    {
        if (pat_start >= pat_end)
        {
            return;
        }

        const ptrdiff_t pat_len = pat_end - pat_start;

        // Bytes that do not occur in the pattern let the window jump
        // completely past the byte that follows it.
        for (int i = 0; i < JUMP_TABLE_LEN; ++i)
        {
            _jump_table[i] = pat_len + 1;
        }

        // For bytes in the pattern the shift is the distance from their
        // last occurrence to the pattern end; later occurrences overwrite
        // earlier ones.
        for (const char* p = pat_start; p != pat_end; ++p)
        {
            _jump_table[static_cast<unsigned char>(*p)] = pat_end - p;
        }

        _jump_table_inited = true;
    }

private:
    // Shift per byte value. Wide enough for any pattern length (a one-byte
    // entry would wrap for patterns of 255 bytes or more). Only meaningful
    // while _jump_table_inited is true.
    ptrdiff_t _jump_table[JUMP_TABLE_LEN];
    bool _jump_table_inited;

    const char *_pat_start;    // borrowed, not owned
    const char *_pat_end;      // one past the last pattern byte

    int _match_rule;
};

#endif
// by nemozhang #include <gtest/gtest.h> #include "sunday.h" #include <unistd.h> using namespace std; TEST(autorun_SundayAlgo, test_ascii_str) { const char *text = "sunhello world !\n taday is sunday, i feel good now.\nthis is a text for sunday algo test program.day, sunhow,dslasun.sdslsunday" ; const char *pat = "sunday"; int text_len = strlen(text); int pat_len = strlen(pat); SundayAlgo sunday; const char * pat_start = (const char*)pat; const char * pat_end = pat_start + pat_len; sunday.SetPatten(pat_start, pat_end); vector<int> pos_vec; sunday.Search(text, text + text_len, pos_vec); printf("hit times : %d\n", pos_vec.size()); for (size_t i=0; i<pos_vec.size(); ++i) { printf("the %u time : %d\n", i, pos_vec[i]); for (int j=pos_vec[i]; j<pos_vec[i]+pat_len+5; ++j) { if (j >= text_len) { break; } printf("%c",text[j]); } printf("\n"); } } TEST(autorun_SundayAlgo, test_binary_str) { u_char text[] = {1,2,255,253,0,255,0,253,0,3,4,5,6,7,8,9,0,0,1,2,3,4,0,0,1,2,0,4,5,0,9,6,4,2,0,0,0,0,0,0,3,2,1,1,2,3,4,5,6,7,0,3,4,6,55,4,2,3,4,234,12,111,255,253,0,255,253,0,255,253,0}; //u_char pat[] = {255,253,0}; u_char pat[] = {0,0,0}; int text_len = sizeof(text); int pat_len = sizeof(pat); SundayAlgo sunday; const char * pat_start = (const char*)pat; const char * pat_end = pat_start + pat_len; sunday.SetPatten(pat_start, pat_end); vector<int> pos_vec; sunday.SetMatchRule(SundayAlgo::MATCH_RULE_STEP_ONE_PATTEN); sunday.Search((const char*)text, (const char*)text + text_len, pos_vec); printf("\n"); printf("\n"); printf("hit times : %d\n", pos_vec.size()); for (size_t i=0; i<pos_vec.size(); ++i) { printf("the %u time : %d\n", i, pos_vec[i]); for (int j=pos_vec[i]; j<pos_vec[i]+pat_len+5; ++j) { if (j >= text_len) { break; } printf("%d,",text[j]); } printf("\n"); } }