关于聊天过滤词算法,一直困扰着我,了解到很多算法,比如:KMP, 正则循环匹配等,然后在http://www.dewen.org/q/41/%E5%A6%82%E4%BD%95%E8%AE%BE%E8%AE%A1%E9%AB%98%E6%95%88%E7%9A%84%E8%81%8A%E5%A4%A9%E8%BF%87%E6%BB%A4%E8%AF%8D%E7%AE%97%E6%B3%95%EF%BC%9F看到了一篇文章,现摘要几种相对好的答案,以备不时之需。
1. trie树算法
我们的解决方法是构造一个trie树。每个节点最多有256个子节点(对应一个字节的256种取值)。
用脏词字典来构造这个树。
具体实现代码如下:
namespace KGame { class WordFilter { public: WordFilter() {} ~WordFilter() { Clean(&m_Filter); } void AddWord(const char* word) { UInt32 len = (UInt32)strlen(word); Filter* filter = &m_Filter; for (UInt32 i = 0; i < len; i++) { unsigned char c = word[i]; if (i == len - 1) { filter->m_NodeArray[c].m_Flag |= FilterNode::NODE_IS_END; break; } else { filter->m_NodeArray[c].m_Flag |= FilterNode::NODE_HAS_NEXT; } if (filter->m_NodeArray[c].m_NextFilter == NULL) { Filter* tmpFilter = XNEW (Filter)(); filter->m_NodeArray[c].m_NextFilter = tmpFilter; } filter = (Filter *)filter->m_NodeArray[c].m_NextFilter; } } void AddWords(const std::set<std::string>& wordList) { for (std::set<std::string>::const_iterator it = wordList.begin(); it != wordList.end(); it++) { AddWord(it->c_str()); } } void AddWords(const std::vector<std::string>& wordList) { for (std::vector<std::string>::const_iterator it = wordList.begin(); it != wordList.end(); it++) { AddWord(it->c_str()); } } void AddWords(const KGame::Set<std::string>& worldList) { for (KGame::Set<std::string>::Iter* iter = worldList.Begin(); iter != worldList.End(); iter = worldList.Next(iter)) { AddWord(iter->m_Value.c_str()); } } Int32 Check(const char* str) { Filter* filter = NULL; for (Int32 i = 0; i < (int)strlen(str) - 1; i++) { filter = &m_Filter; for (UInt32 j = i; j < strlen(str); j++) { unsigned char c = str[j]; if ((c >= 'A' && c <= 'Z')) { c += 32; } if (filter->m_NodeArray[c].m_Flag == FilterNode::NODE_IS_NULL) { break; } else if (filter->m_NodeArray[c].m_Flag & FilterNode::NODE_IS_END) { return i; } else // NODE_HAS_NEXT { filter = (Filter*)filter->m_NodeArray[c].m_NextFilter; } } } return -1; } void CheckAndModify(char* str, const char replace = '*') { Filter* filter = NULL; for (Int32 i = 0; i < (int)strlen(str) - 1; i++) { filter = &m_Filter; for (UInt32 j = i; j < strlen(str); j++) { unsigned char c = str[j]; if ((c >= 'A' && c <= 'Z')) { c += 32; } if (filter->m_NodeArray[c].m_Flag == FilterNode::NODE_IS_NULL) { 
break; } else if (filter->m_NodeArray[c].m_Flag & FilterNode::NODE_IS_END) { for (UInt32 k = i; k <= j; k++) { str[k] = replace; } if (filter->m_NodeArray[c].m_Flag & FilterNode::NODE_HAS_NEXT) { filter = (Filter*)filter->m_NodeArray[c].m_NextFilter; } else { continue; } } else // NODE_HAS_NEXT { filter = (Filter*)filter->m_NodeArray[c].m_NextFilter; } } } } void CheckAndModify(std::string& str, const char replace = '*') { Filter* filter = NULL; for (Int32 i = 0; i < (int)str.size() - 1; i++) { filter = &m_Filter; for (UInt32 j = i; j < str.size(); j++) { unsigned char c = str[j]; if ((c >= 'A' && c <= 'Z')) { c += 32; } if (filter->m_NodeArray[c].m_Flag == FilterNode::NODE_IS_NULL) { break; } else if (filter->m_NodeArray[c].m_Flag & FilterNode::NODE_IS_END) { for (UInt32 k = i; k <= j; k++) { str[k] = replace; } if (filter->m_NodeArray[c].m_Flag & FilterNode::NODE_HAS_NEXT) { filter = (Filter*)filter->m_NodeArray[c].m_NextFilter; } else { continue; } } else // NODE_HAS_NEXT { filter = (Filter*)filter->m_NodeArray[c].m_NextFilter; } } } } private: struct FilterNode { char m_Flag; void* m_NextFilter; enum Flag { NODE_IS_NULL = 0x00, NODE_HAS_NEXT = 0x01, NODE_IS_END = 0x10, }; FilterNode() : m_Flag(NODE_IS_NULL), m_NextFilter(NULL) {} }; struct Filter { FilterNode m_NodeArray[256]; } m_Filter; void Clean(Filter* filter) { for (UInt32 i = 0; i < 256; i++) { if (filter->m_NodeArray[i].m_NextFilter) { Clean((Filter *)filter->m_NodeArray[i].m_NextFilter); XDELETE((Filter*)filter->m_NodeArray[i].m_NextFilter); } } } }; } // namespace KGame
聊天过滤词算法的解决思路
提高过滤的算法个人认为主要从两个方面考虑:(1)尽量减少内存、IO的次数。(2)增加串内查找的速度。
基于这两点,我想采用连续的内存片,以减少内存地址跳跃的次数;采用静态的内存,这就解决了(1)的问题。第二点是增加串内查找的速度,这方面比较公认的是KMP算法。
/** Folds ASCII 'A'-'Z' to lower case; other bytes are returned unchanged, so string lengths never change. */
static inline char LowerAscii(char c)
{
    return (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
}

/**
 * Fills the KMP failure table of a pattern.
 *
 * fail[k] is the length of the longest proper prefix of pattern[0..k] that is
 * also a suffix of it. `fail` must have room for `len` entries.
 */
static void BuildFailureTable(const char* pattern, int len, int* fail)
{
    if (len <= 0)
    {
        return;
    }

    fail[0] = 0;
    int matched = 0;
    for (int k = 1; k < len; ++k)
    {
        while (matched > 0 && pattern[k] != pattern[matched])
        {
            matched = fail[matched - 1];
        }
        if (pattern[k] == pattern[matched])
        {
            ++matched;
        }
        fail[k] = matched;
    }
}

/**
 * KMP search of `pattern` in text[pos, textLen).
 *
 * @return 0-based index of the first occurrence, or -1 when there is none.
 */
static int KmpSearch(const char* text, int textLen, const char* pattern, int patLen, const int* fail, int pos)
{
    if (patLen <= 0 || pos < 0)
    {
        return -1;
    }

    int matched = 0;
    for (int i = pos; i < textLen; ++i)
    {
        while (matched > 0 && text[i] != pattern[matched])
        {
            matched = fail[matched - 1];
        }
        if (text[i] == pattern[matched])
        {
            ++matched;
        }
        if (matched == patLen)
        {
            return i - patLen + 1;
        }
    }
    return -1;
}

/**
 * Dirty-word filter that runs one KMP search per blocked word.
 *
 * Usage: register words with AddWord(), call Init() once to build the lookup
 * tables, then call FilterWord() on each message. Words added after Init() are
 * only taken into account after Init() is called again.
 *
 * Matching is case-insensitive for ASCII letters and is done on raw bytes.
 */
class WordFilter
{
public:
    WordFilter();
    WordFilter(const WordFilter& other);
    WordFilter& operator=(const WordFilter& other);
    ~WordFilter();

    /** Registers a blocked word; NULL and empty words are ignored. */
    void AddWord(const char* word);

    /** (Re)builds the word list and failure tables from the registered words. */
    void Init();

    /** Replaces every occurrence of every blocked word in `word` with '*'. */
    void FilterWord(std::string& word);

    /**
     * Finds pattern T in NUL-terminated text S, starting at byte offset pos.
     *
     * @return 0-based byte index of the first occurrence at or after pos, or
     *         -1 if there is none (also for NULL arguments, an empty pattern
     *         or a negative pos).
     */
    int Index_KMP(const char* S, const char* T, int pos);

private:
    /** One blocked word plus its precomputed failure table. */
    struct Pattern
    {
        const char* text;  // points into m_storage
        int len;
        const int* fail;   // points into m_failure
    };

    void Release();

    std::set<std::string> m_storage;  // owns the (lower-cased) words
    Pattern* m_words;                 // contiguous array, m_count entries
    int* m_failure;                   // all failure tables, back to back
    size_t m_count;
};

WordFilter::WordFilter() : m_words(NULL), m_failure(NULL), m_count(0)
{
}

WordFilter::WordFilter(const WordFilter& other)
    : m_storage(other.m_storage), m_words(NULL), m_failure(NULL), m_count(0)
{
    // The tables hold pointers into m_storage, so they are rebuilt against
    // this object's own copy instead of being copied.
    if (other.m_words != NULL)
    {
        Init();
    }
}

WordFilter& WordFilter::operator=(const WordFilter& other)
{
    if (this != &other)
    {
        const bool wasInitialised = (other.m_words != NULL);
        Release();  // drop pointers into m_storage before it is replaced
        m_storage = other.m_storage;
        if (wasInitialised)
        {
            Init();
        }
    }
    return *this;
}

WordFilter::~WordFilter()
{
    Release();
}

void WordFilter::Release()
{
    free(m_words);
    free(m_failure);
    m_words = NULL;
    m_failure = NULL;
    m_count = 0;
}

void WordFilter::AddWord(const char* word)
{
    if (word == NULL || *word == '\0')
    {
        return;
    }

    // Stored lower-cased because FilterWord() searches a lower-cased copy of
    // the text.
    std::string lowered(word);
    for (size_t i = 0; i < lowered.size(); ++i)
    {
        lowered[i] = LowerAscii(lowered[i]);
    }
    m_storage.insert(lowered);
}

void WordFilter::Init()
{
    // All blocked words must already be in m_storage (see AddWord()).
    Release();  // a second Init() must not leak the previous tables
    if (m_storage.empty())
    {
        return;
    }

    size_t totalChars = 0;
    for (std::set<std::string>::const_iterator it = m_storage.begin(); it != m_storage.end(); ++it)
    {
        totalChars += it->size();
    }

    Pattern* words = static_cast<Pattern*>(malloc(sizeof(Pattern) * m_storage.size()));
    int* failure = static_cast<int*>(malloc(sizeof(int) * totalChars));
    if (words == NULL || failure == NULL)
    {
        // Allocation failed: leave the filter empty rather than half-built.
        free(words);
        free(failure);
        return;
    }

    int* cursor = failure;
    size_t n = 0;
    for (std::set<std::string>::const_iterator it = m_storage.begin(); it != m_storage.end(); ++it, ++n)
    {
        words[n].text = it->c_str();
        words[n].len = static_cast<int>(it->size());
        words[n].fail = cursor;
        BuildFailureTable(words[n].text, words[n].len, cursor);
        cursor += it->size();
    }

    m_words = words;
    m_failure = failure;
    m_count = n;
}

void WordFilter::FilterWord(std::string& word)
{
    if (m_count == 0 || word.empty())
    {
        return;
    }

    // Search a lower-cased copy and mask the original at the same offsets;
    // this is valid because LowerAscii never changes the length.
    std::string lowered(word);
    for (size_t i = 0; i < lowered.size(); ++i)
    {
        lowered[i] = LowerAscii(lowered[i]);
    }

    const char* text = lowered.c_str();
    const int textLen = static_cast<int>(lowered.size());
    for (size_t w = 0; w < m_count; ++w)
    {
        const Pattern& pattern = m_words[w];
        int pos = 0;
        int hit;
        while ((hit = KmpSearch(text, textLen, pattern.text, pattern.len, pattern.fail, pos)) >= 0)
        {
            for (int k = 0; k < pattern.len; ++k)
            {
                word[hit + k] = '*';
            }
            // Resume one byte later so overlapping occurrences are masked too.
            pos = hit + 1;
        }
    }
}

int WordFilter::Index_KMP(const char* S, const char* T, int pos)
{
    if (S == NULL || T == NULL || pos < 0)
    {
        return -1;
    }

    const int textLen = static_cast<int>(strlen(S));
    const int patLen = static_cast<int>(strlen(T));
    if (patLen == 0 || pos > textLen - patLen)
    {
        return -1;
    }

    int* fail = static_cast<int*>(malloc(sizeof(int) * patLen));
    if (fail == NULL)
    {
        return -1;
    }

    BuildFailureTable(T, patLen, fail);
    const int hit = KmpSearch(S, textLen, T, patLen, fail, pos);
    free(fail);
    return hit;
}