sunday 字符串匹配算法的实现(支持二进制匹配)

之前在解析 multipart/form-data 格式的 HTTP 请求包时, 需要用字符串匹配的方式在包体中查找 boundary 标记. 这就涉及到字符串匹配算法, 最后选择了 Sunday 算法.

Sunday 是我目前所知最快的单模式字符串匹配算法. 由于请求包体中可能含有二进制数据, 所以把 Sunday 算法改造成了支持二进制串匹配的版本.

sunday算法的原理不多说,网上一搜一大把, 下面贴下我的实现:

/*
 * @desc    : Sunday string pattern matching algorithm (also supports matching binary buffers)
 * @author  : nemozhang
 *
 */
#ifndef __SUNDAY_H_20111203__
#define __SUNDAY_H_20111203__

#include <stdio.h>
#include <vector>

#ifndef u_char
#define u_char unsigned char
#endif

/**
 * Single-pattern matcher implementing the Sunday algorithm over raw bytes.
 *
 * Text and pattern are treated as byte ranges (not NUL-terminated strings), so
 * buffers containing 0x00 or bytes >= 0x80 can be searched.
 *
 * The pattern is NOT copied: SetPatten() only stores the two pointers, so the
 * pattern buffer must stay valid and unchanged for as long as Search() is used.
 */
class SundayAlgo
{
public:
        enum
        {
                JUMP_TABLE_LEN                          = 256    // one shift entry per possible byte value
        };

        enum
        {
                MATCH_RULE_STEP_ONE_CHAR        = 0,    // after a hit, resume 1 byte past the hit start (overlapping hits are reported)
                MATCH_RULE_STEP_ONE_PATTEN      = 1,    // after a hit, resume one pattern length past the hit start (no overlapping hits)
                MATCH_RULE_STEP_ONE_PATTERN     = MATCH_RULE_STEP_ONE_PATTEN    // correctly spelled alias of the value above
        };

public:

        // Creates a matcher with no pattern; Search() reports "not found" until
        // SetPatten() has been called with a non-empty pattern.
        SundayAlgo():
                _jump_table(),
                _jump_table_inited(false),
                _pat_start(0),
                _pat_end(0),
                _match_rule(MATCH_RULE_STEP_ONE_CHAR)
        {}

public:
        // Finds the first occurrence of the pattern in the text.
        //   text : [text_start, text_end)
        // @return -1 if there is no match (also when no non-empty pattern has been
        //         set or the text is empty), otherwise the offset of the match
        //         relative to text_start.
        // Lengths are handled as int, so ranges are expected to fit in an int.
        int Search(const char *text_start, const char *text_end) const
        {
                if (!_jump_table_inited || text_start >= text_end)
                {
                        return -1;
                }

                const int text_len = static_cast<int>(text_end - text_start);
                const int pat_len = static_cast<int>(_pat_end - _pat_start);

                // Last offset at which the pattern still fits; negative when the
                // pattern is longer than the text, in which case the loop never runs.
                const int last_start = text_len - pat_len;

                int i = 0;
                while (i <= last_start)
                {
                        // Compare the current window against the pattern, back to front.
                        int k = pat_len - 1;
                        while (k >= 0 && text_start[i + k] == _pat_start[k])
                        {
                                --k;
                        }

                        if (k < 0)
                        {
                                // Every byte matched.
                                return i;
                        }

                        if (i == last_start)
                        {
                                // This was the last possible window. The byte that would
                                // drive the shift lies at text_end, outside the buffer,
                                // so stop here instead of reading it.
                                return -1;
                        }

                        // Sunday shift: the byte just past the window decides how far
                        // the window can move.
                        i += _jump_table[static_cast<unsigned char>(text_start[i + pat_len])];
                }

                return -1;
        }

        // Finds every occurrence of the pattern in [text_start, text_end) and appends
        // each offset (relative to text_start) to pos_vec. pos_vec is not cleared.
        // Where the scan resumes after a hit is controlled by SetMatchRule().
        void Search(const char *text_start, const char *text_end, std::vector<int> &pos_vec) const
        {
                const int pat_len = static_cast<int>(_pat_end - _pat_start);

                // Distance from the start of a hit to the point where scanning resumes.
                const int step = (MATCH_RULE_STEP_ONE_CHAR == _match_rule) ? 1 : pat_len;

                const char *cursor = text_start;
                for (;;)
                {
                        const int pos = Search(cursor, text_end);
                        if (pos == -1)
                        {
                                break;
                        }

                        // pos is relative to cursor; convert it to an offset from text_start.
                        pos_vec.push_back(static_cast<int>(cursor - text_start) + pos);
                        cursor += pos + step;
                }
        }

        // Sets the pattern to [pat_start, pat_end) (pat_end excluded) and rebuilds
        // the shift table. Only the pointers are stored, the bytes are not copied.
        // An empty range leaves the matcher without a usable pattern.
        void SetPatten(const char* pat_start, const char* pat_end)
        {
                _pat_start = pat_start;
                _pat_end = pat_end;
                PreCompute(pat_start, pat_end);
        }

        // Selects how the multi-hit Search() continues after a hit.
        // Example: text "aaaaaa", pattern "aaa"
        //   MATCH_RULE_STEP_ONE_CHAR    -> 4 hits (offsets 0, 1, 2, 3)
        //   MATCH_RULE_STEP_ONE_PATTERN -> 2 hits (offsets 0, 3)
        // Any value other than MATCH_RULE_STEP_ONE_CHAR is treated as the
        // pattern-length step.
        void SetMatchRule(int rule)
        {
                _match_rule = rule;
        }

private:
        // Builds the shift table for the pattern [pat_start, pat_end).
        void PreCompute(const char* pat_start, const char* pat_end)
        {
                if (pat_start >= pat_end)
                {
                        // No usable pattern. Drop any table built for a previous
                        // pattern, otherwise Search() would report a zero-length
                        // match and the multi-hit Search() might never advance.
                        _jump_table_inited = false;
                        return;
                }

                const int pat_len = static_cast<int>(pat_end - pat_start);

                // A byte that does not occur in the pattern lets the window jump
                // completely past that byte.
                for (int i = 0; i < JUMP_TABLE_LEN; ++i)
                {
                        _jump_table[i] = pat_len + 1;
                }

                // For bytes that do occur, the shift is the distance from the
                // occurrence to the pattern end; later occurrences overwrite
                // earlier ones, so the last occurrence wins.
                for (const char* p = pat_start; p != pat_end; ++p)
                {
                        _jump_table[static_cast<unsigned char>(*p)] = static_cast<int>(pat_end - p);
                }

                _jump_table_inited = true;
        }

private:
        // Shift per byte value. Stored as int: a narrower unsigned char would wrap
        // for patterns of 255 bytes or more and could produce a shift of 0.
        int     _jump_table[JUMP_TABLE_LEN];
        bool    _jump_table_inited;     // true while _jump_table matches a non-empty pattern

        const char    *_pat_start;      // pattern range, not owned
        const char    *_pat_end;

        int             _match_rule;    // one of the MATCH_RULE_* values
};

#endif


测试用例:
// by nemozhang

#include <gtest/gtest.h>
#include "sunday.h"
#include <unistd.h>

using namespace std;


// Searches an ASCII text for every occurrence of "sunday", prints each hit with
// a little trailing context, and checks the reported offsets.
TEST(autorun_SundayAlgo, test_ascii_str) {

    // "sunday" occurs at byte offsets 27, 71 and 120 of this text.
    const char text[] = "sunhello world !\n taday is sunday, i feel good now.\nthis is a text for sunday algo test program.day, sunhow,dslasun.sdslsunday" ;
    const char pat[] = "sunday";

    // sizeof counts the terminating NUL, which is not part of the data.
    const int text_len = static_cast<int>(sizeof(text)) - 1;
    const int pat_len = static_cast<int>(sizeof(pat)) - 1;

    SundayAlgo sunday;
    sunday.SetPatten(pat, pat + pat_len);

    std::vector<int> pos_vec;
    sunday.Search(text, text + text_len, pos_vec);

    // size_t values are printed with %zu.
    printf("hit times : %zu\n", pos_vec.size());
    for (size_t i=0; i<pos_vec.size(); ++i)
    {
        printf("the %zu time : %d\n", i, pos_vec[i]);

        // Print the hit plus up to 5 following bytes, clipped to the text end.
        for (int j=pos_vec[i]; j<pos_vec[i]+pat_len+5 && j<text_len; ++j)
        {
            printf("%c",text[j]);
        }
        printf("\n");
    }

    ASSERT_EQ(3u, pos_vec.size());
    EXPECT_EQ(27, pos_vec[0]);
    EXPECT_EQ(71, pos_vec[1]);
    EXPECT_EQ(120, pos_vec[2]);
}

// Searches a binary buffer (containing zero bytes and bytes >= 0x80) for the
// pattern {0,0,0} with non-overlapping hits, prints each hit with a little
// trailing context, and checks the reported offsets.
TEST(autorun_SundayAlgo, test_binary_str) {

        const unsigned char text[] = {1,2,255,253,0,255,0,253,0,3,4,5,6,7,8,9,0,0,1,2,3,4,0,0,1,2,0,4,5,0,9,6,4,2,0,0,0,0,0,0,3,2,1,1,2,3,4,5,6,7,0,3,4,6,55,4,2,3,4,234,12,111,255,253,0,255,253,0,255,253,0};
        // Alternative pattern to try: {255,253,0}
        const unsigned char pat[] = {0,0,0};

        const int text_len = static_cast<int>(sizeof(text));
        const int pat_len = static_cast<int>(sizeof(pat));

        SundayAlgo sunday;
        const char * pat_start = reinterpret_cast<const char*>(pat);
        const char * pat_end = pat_start + pat_len;
        sunday.SetPatten(pat_start, pat_end);

        std::vector<int> pos_vec;

        // Step a whole pattern length after each hit, so the run of six zero
        // bytes starting at offset 34 yields exactly two hits (34 and 37).
        sunday.SetMatchRule(SundayAlgo::MATCH_RULE_STEP_ONE_PATTEN);
        const char * text_start = reinterpret_cast<const char*>(text);
        sunday.Search(text_start, text_start + text_len, pos_vec);

        printf("\n");
        printf("\n");
        // size_t values are printed with %zu.
        printf("hit times : %zu\n", pos_vec.size());
        for (size_t i=0; i<pos_vec.size(); ++i)
        {
                printf("the %zu time : %d\n", i, pos_vec[i]);

                // Print the hit plus up to 5 following bytes, clipped to the buffer end.
                for (int j=pos_vec[i]; j<pos_vec[i]+pat_len+5 && j<text_len; ++j)
                {
                        printf("%d,",text[j]);
                }
                printf("\n");
        }

        ASSERT_EQ(2u, pos_vec.size());
        EXPECT_EQ(34, pos_vec[0]);
        EXPECT_EQ(37, pos_vec[1]);
}

输出如下:
nemo@vm04_sles10:[unittest]$ ./sunday_unittest 
Running main() from gtest_main.cc
[==========] Running 2 tests from 1 test case.
[----------] Global test environment set-up.
[----------] 2 tests from autorun_SundayAlgo
[ RUN      ] autorun_SundayAlgo.test_ascii_str
hit times : 3
the 0 time : 27
sunday, i f
the 1 time : 71
sunday algo
the 2 time : 120
sunday
[       OK ] autorun_SundayAlgo.test_ascii_str (0 ms)
[ RUN      ] autorun_SundayAlgo.test_binary_str




hit times : 2
the 0 time : 34
0,0,0,0,0,0,3,2,
the 1 time : 37
0,0,0,3,2,1,1,2,
[       OK ] autorun_SundayAlgo.test_binary_str (0 ms)
[----------] 2 tests from autorun_SundayAlgo (0 ms total)


[----------] Global test environment tear-down
[==========] 2 tests from 1 test case ran. (0 ms total)
[  PASSED  ] 2 tests.

你可能感兴趣的:(sunday 字符串匹配算法的实现(支持二进制匹配))