BM 算法 Cpp 实现

介绍

BM(Boyer-Moore)算法在文本模式匹配方面的效率不用多说,它在 Linux 文件搜索工具中的使用也证明了其实用价值。关于算法原理,可以参考这篇介绍:《BM算法介绍》。

Show you the code
#include 
#include 
#include "BM.h"


// FS_MATCH_NOCASE selects case-insensitive matching: when it is defined,
// bm_tolower(c) folds the characters 'A'..'Z' to lower case in place by adding
// 32; every other value is left untouched. When it is not defined, bm_tolower
// does nothing.
//
// Both variants are wrapped in do { } while (0) so that `bm_tolower(c);` is
// exactly one statement and stays correct inside an unbraced if/else. The
// argument is parenthesized so that expressions such as `*p` are safe. The
// argument must be a modifiable lvalue and is evaluated more than once.
#define  FS_MATCH_NOCASE
#ifdef FS_MATCH_NOCASE
#define bm_tolower(c) \
    do { \
        if ((c) >= BDS_TEXT('A') && (c) <= BDS_TEXT('Z')) (c) += 32; \
    } while (0)
#else
#define  bm_tolower(c) do { } while (0)
#endif

namespace BaiduService
{
namespace FileSearch
{


// Creates a matcher with no pattern set.
//
// Allocates the 65536-entry bad-character table (one char slot per 16-bit
// character code) and fills every slot with -1. The pattern buffer and the
// suffix table start out null and the pattern length starts at 0.
//
// If the allocation fails the table pointer stays null instead of being passed
// to memset, which would be undefined behaviour.
CMBMMatch::CBMMatch::CBMMatch() :
           m_pattern(NULL),
           m_arraySuffixTable(NULL),
           m_nPatternLen(0)
{
    m_pArrayBadCharTable = (char *)malloc(1 << 16);
    if (m_pArrayBadCharTable != NULL)
    {
        memset(m_pArrayBadCharTable, -1, 1 << 16);
    }
}


// Releases the three malloc'ed buffers owned by the matcher: the bad-character
// table, the lower-cased pattern copy and the suffix shift table.
// NOTE(review): the class declaration is not visible here; if CBMMatch is
// copyable, two copies would free the same buffers - confirm in BM.h.
CMBMMatch::CBMMatch::~CBMMatch()
{
    // free() is a no-op for a null pointer, so no guards are required.
    free(m_pArrayBadCharTable);
    free(m_pattern);
    free(m_arraySuffixTable);
}


bool CMBMMatch::CBMMatch::SetPattern(const BDS_TCHAR *pattern)
{

    // bad char table
    m_nPatternLen  = bdststrlen(pattern);
    if (m_nPatternLen < 1)
    {
        return false;
    }

    if (m_pattern != NULL)
    {
        free(m_pattern);
    }

    m_pattern = (BDS_TCHAR *)malloc((m_nPatternLen + 1) * sizeof(BDS_TCHAR));
    memset(m_pattern, 0, (m_nPatternLen + 1) * sizeof(BDS_TCHAR));
#ifndef FS_MATCH_NOCASE 
    bdsstrcpy(m_pattern, pattern);
#else // 


    for (size_t i = 0; i < m_nPatternLen; i++)
    {
        BDS_TCHAR c = pattern[i];
        bm_tolower( c );
        m_pattern[i] = c;
    }
#endif



    for (size_t i = 0; i < m_nPatternLen; i++)
    {
        m_pArrayBadCharTable[(USHORT)m_pattern[i]] = i;
    }


    // build good suffix table, from right to left
    m_arraySuffixTable = (char *)malloc(m_nPatternLen);

    char cLastGoodSuffix = 1;
    for (size_t i = m_nPatternLen - 1; i > 0; i--)
    {
        // compare the prefix
        size_t j = 0;
        for (; j < m_nPatternLen - i; j++)
        {
            if ( m_pattern[j] != m_pattern[j + i] )
            {
                break;
            }
        }

        if (j == m_nPatternLen - i)
        {
            m_arraySuffixTable[i] = i;
            cLastGoodSuffix = i;
        }
        else
        {
            m_arraySuffixTable[i] = cLastGoodSuffix;
        }

    }

    // ajust the last subffix
    m_arraySuffixTable[0] = cLastGoodSuffix;
    m_arraySuffixTable[m_nPatternLen - 1] = 1;


    // find the good suffix
    for (size_t i = m_nPatternLen - 1; i > 0; i--)
    {

        int nLocation = 0;
        nLocation = FindSubString( &m_pattern[i], &m_pattern[0] );

        if (nLocation != 0)
        {
            m_arraySuffixTable[i-1] = i - nLocation;
        }

    }

    return true;
}


bool CMBMMatch::CBMMatch::Match(const BDS_TCHAR *text)
{
    // do search
    bool bFound = false;
    size_t nTextLen = bdststrlen(text);
    if (nTextLen < m_nPatternLen)
    {
        return false;
    }

    size_t i = 0;
    for ( ; i < nTextLen;)
    {
        int j = m_nPatternLen - 1;
        for ( ; j >= 0; j--)
        {
            if (i + j > nTextLen - 1)
            {
                return false;
            }

            BDS_TCHAR c = text[i + j];
            bm_tolower(c);
            if (m_pattern[j] != c )
            {
                // find the next jump
                if ((j - m_pArrayBadCharTable[(USHORT)c]) > m_arraySuffixTable[j])
                {
                    i += (j - m_pArrayBadCharTable[(USHORT)c]);
                }
                else
                {
                    i += m_arraySuffixTable[j];
                }

                break;
            }
        }

        if (j == -1)
        {
            bFound = true;
            break;
        }
    }

    return bFound;

}


// Scans szText for occurrences of szPattern and returns the start index of the
// second-to-last occurrence.
//
// Returns 0 when fewer than two occurrences exist. Because both trackers start
// at 0, that result cannot be told apart from a second-to-last occurrence
// located at index 0.
int CMBMMatch::CBMMatch::FindSubString(const BDS_TCHAR *szPattern, const BDS_TCHAR *szText)
{
    const size_t cchText = bdststrlen( szText );
    const size_t cchPattern = bdststrlen( szPattern );

    int nNewest = 0;    // start of the most recent occurrence seen so far
    int nPrevious = 0;  // start of the occurrence seen before nNewest

    for (size_t nStart = 0; nStart < cchText; ++nStart)
    {
        // Count how many leading pattern characters agree at this offset. The
        // text's terminator stops the comparison before it can run past the end.
        size_t nMatched = 0;
        while (nMatched < cchPattern && szPattern[nMatched] == szText[nStart + nMatched])
        {
            ++nMatched;
        }

        if (nMatched != cchPattern)
        {
            continue;
        }

        nPrevious = nNewest;
        nNewest = nStart;
    }

    return nPrevious;
}


// Nothing to set up explicitly; the members are default-constructed.
CMBMMatch::CMBMMatch() = default;


// Tears the object down by delegating to Reset(), which deletes every matcher
// still held in m_vecMatch.
CMBMMatch::~CMBMMatch()
{
    Reset();
}


bool CMBMMatch::Reset()
{
    std::for_each(m_vecMatch.begin(), m_vecMatch.end(), [&](CBMMatch *match) {

         delete match;

    });

    m_vecMatch.clear();
    m_vecPatterns.clear();

    return true;
}


bool CMBMMatch::SetPatterns(std::vector vecPatterns)
{
    Reset();

    m_vecPatterns = vecPatterns;

    std::for_each(m_vecPatterns.begin(), m_vecPatterns.end(), [&](bdststring pattern) {

         CBMMatch *match = new CBMMatch();

         if (match->SetPattern(pattern.c_str()))
         {
             m_vecMatch.push_back(match);
         }
         else
         {
             delete match;
         }
    });

    return true;
}


bool CMBMMatch::Match(const BDS_TCHAR *text)
{
    bool bFound = true;

    //bFound = m_vecMatch[0]->Match(text);

    std::all_of(m_vecMatch.begin(), m_vecMatch.end(), [&](CBMMatch *match) -> bool {
        if (!match->Match(text))
        {
            bFound = false;
            return false;
        }

        return true;
    });

    return bFound;
}


}
}

说明:以上代码没有经过优化和详细 review,写好之后就没有再看过,难免存在问题,请读者自行修正。

你可能感兴趣的:(算法小把戏)