一个字符串搜索的Aho-Corasick算法

阅读更多

Aho和Corasick将KMP算法(Knuth–Morris–Pratt algorithm)推广到多关键字匹配:Aho-Corasick算法(Aho-Corasick algorithm)先用全部关键字构建一棵关键字树并计算失败函数(fail function),之后只需扫描文本一遍即可找出所有关键字,搜索的时间复杂度是O(n + z)(n为文本长度,z为匹配出现的次数)。原理图如下(摘自Aho-Corasick string matching in C#):

一个字符串搜索的Aho-Corasick算法_第1张图片一个字符串搜索的Aho-Corasick算法_第2张图片

Building of the keyword tree (figure 1 - after the first step, figure 2 - tree with the fail function)

C#版本的实现代码可以从Aho-Corasick string matching in C#得到,也可以点击这里获得该算法的PDF文档。

这是一个应用示例:

一个字符串搜索的Aho-Corasick算法_第3张图片

它能将载入的RTF文档中的搜索关键字高亮,检索速度较快,示例没有实现全字匹配,算法代码简要如下:

/* Aho-Corasick text search algorithm implementation
 *
 * For more information visit
 *   - http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf
 */
using System;
using System.Collections;
using System.Collections.Generic;

namespace EeekSoft.Text
{
    /// <summary>
    /// Interface containing all methods to be implemented
    /// by a string search algorithm.
    /// </summary>
    public interface IStringSearchAlgorithm
    {
        #region Methods & Properties

        /// <summary>
        /// Ignore case of letters.
        /// </summary>
        bool IgnoreCase { get; set; }

        /// <summary>
        /// List of keywords to search for.
        /// </summary>
        string[] Keywords { get; set; }

        /// <summary>
        /// Searches passed text and returns all occurrences of any keyword.
        /// </summary>
        /// <param name="text">Text to search</param>
        /// <returns>Array of occurrences</returns>
        StringSearchResult[] FindAll(string text);

        /// <summary>
        /// Searches passed text and returns first occurrence of any keyword.
        /// </summary>
        /// <param name="text">Text to search</param>
        /// <returns>First occurrence of any keyword (or StringSearchResult.Empty if text doesn't contain any keyword)</returns>
        StringSearchResult FindFirst(string text);

        /// <summary>
        /// Searches passed text and returns true if text contains any keyword.
        /// </summary>
        /// <param name="text">Text to search</param>
        /// <returns>True when text contains any keyword</returns>
        bool ContainsAny(string text);

        #endregion
    }

    /// <summary>
    /// Structure containing results of search
    /// (keyword and position in original text).
    /// </summary>
    public struct StringSearchResult
    {
        #region Members

        private readonly int _index;
        private readonly string _keyword;

        /// <summary>
        /// Initialize string search result.
        /// </summary>
        /// <param name="index">Index in text</param>
        /// <param name="keyword">Found keyword</param>
        public StringSearchResult(int index, string keyword)
        {
            _index = index;
            _keyword = keyword;
        }

        /// <summary>
        /// Returns index of found keyword in original text.
        /// </summary>
        public int Index
        {
            get { return _index; }
        }

        /// <summary>
        /// Returns keyword found by this result.
        /// </summary>
        public string Keyword
        {
            get { return _keyword; }
        }

        /// <summary>
        /// Returns empty search result (index -1, empty keyword).
        /// </summary>
        public static StringSearchResult Empty
        {
            get { return new StringSearchResult(-1, ""); }
        }

        #endregion
    }

    /// <summary>
    /// Class for searching a string for one or multiple
    /// keywords using the Aho-Corasick search algorithm.
    /// </summary>
    public class StringSearch : IStringSearchAlgorithm
    {
        #region Objects

        /// <summary>
        /// Tree node representing a character together with its
        /// transition and failure function.
        /// </summary>
        private sealed class TreeNode
        {
            private readonly char _char;
            private readonly TreeNode _parent;
            private readonly List<string> _results = new List<string>();
            private readonly Dictionary<char, TreeNode> _transitions = new Dictionary<char, TreeNode>();
            private TreeNode _failure;

            /// <summary>
            /// Initialize tree node with specified character.
            /// </summary>
            /// <param name="parent">Parent node (null for the root)</param>
            /// <param name="c">Character as written in the keyword</param>
            public TreeNode(TreeNode parent, char c)
            {
                _parent = parent;
                _char = c;
            }

            /// <summary>
            /// Adds a pattern ending in this node (duplicates are ignored).
            /// </summary>
            /// <param name="result">Pattern</param>
            public void AddResult(string result)
            {
                if (!_results.Contains(result))
                {
                    _results.Add(result);
                }
            }

            /// <summary>
            /// Adds a transition node under the given lookup key.
            /// </summary>
            /// <param name="key">Lookup character (already case-normalized by the caller)</param>
            /// <param name="node">Child node</param>
            public void AddTransition(char key, TreeNode node)
            {
                _transitions.Add(key, node);
            }

            /// <summary>
            /// Returns transition for the specified lookup key (if it exists).
            /// </summary>
            /// <param name="key">Lookup character (already case-normalized by the caller)</param>
            /// <returns>Child node or null</returns>
            public TreeNode GetTransition(char key)
            {
                TreeNode node;
                return _transitions.TryGetValue(key, out node) ? node : null;
            }

            /// <summary>
            /// Character as it was written in the keyword that created this node.
            /// </summary>
            public char Char
            {
                get { return _char; }
            }

            /// <summary>
            /// Parent tree node.
            /// </summary>
            public TreeNode Parent
            {
                get { return _parent; }
            }

            /// <summary>
            /// Failure function - node to continue from when no transition matches.
            /// </summary>
            public TreeNode Failure
            {
                get { return _failure; }
                set { _failure = value; }
            }

            /// <summary>
            /// Transition function - collection of child nodes.
            /// </summary>
            public ICollection<TreeNode> Transitions
            {
                get { return _transitions.Values; }
            }

            /// <summary>
            /// Patterns ending at this node, in the order they were added.
            /// </summary>
            public IList<string> Results
            {
                get { return _results; }
            }
        }

        #endregion

        #region Local fields

        /// <summary>
        /// Root of keyword tree.
        /// </summary>
        private TreeNode _root;

        /// <summary>
        /// Keywords to search for.
        /// </summary>
        private string[] _keywords;

        /// <summary>
        /// Whether letters are compared case-insensitively.
        /// </summary>
        private bool _ignoreCase;

        #endregion

        #region Initialization

        /// <summary>
        /// Initialize search algorithm (build keyword tree).
        /// </summary>
        /// <param name="keywords">Keywords to search for</param>
        /// <param name="ignoreCase">Ignore case of letters (the default is false)</param>
        public StringSearch(string[] keywords, bool ignoreCase)
        {
            // The case mode must be known BEFORE the tree is built, because the
            // transition keys stored in the tree depend on it.
            _ignoreCase = ignoreCase;
            Keywords = keywords;
        }

        /// <summary>
        /// Initialize search algorithm (build keyword tree), case-sensitive.
        /// </summary>
        /// <param name="keywords">Keywords to search for</param>
        public StringSearch(string[] keywords)
            : this(keywords, false)
        {
        }

        /// <summary>
        /// Initialize search algorithm with no keywords
        /// (use the Keywords property). Searches return no matches until
        /// keywords are assigned.
        /// </summary>
        public StringSearch()
            : this(null, false)
        {
        }

        #endregion

        #region Implementation

        /// <summary>
        /// Maps a character to the key used for transition lookups.
        /// </summary>
        private char Normalize(char c)
        {
            // Invariant folding keeps the tree and the search consistent
            // regardless of the current thread culture.
            return _ignoreCase ? char.ToLowerInvariant(c) : c;
        }

        /// <summary>
        /// Build tree from the current keywords. Null or empty keywords are skipped.
        /// </summary>
        private void BuildTree()
        {
            // Build keyword tree and transition function
            _root = new TreeNode(null, ' ');

            if (_keywords != null)
            {
                foreach (string keyword in _keywords)
                {
                    // An empty keyword would end at the root and "match" at
                    // every position; a null keyword cannot be enumerated.
                    if (string.IsNullOrEmpty(keyword))
                    {
                        continue;
                    }

                    TreeNode node = _root;
                    foreach (char c in keyword)
                    {
                        char key = Normalize(c);
                        TreeNode child = node.GetTransition(key);
                        if (child == null)
                        {
                            child = new TreeNode(node, c);
                            node.AddTransition(key, child);
                        }
                        node = child;
                    }
                    node.AddResult(keyword);
                }
            }

            // Find failure functions, breadth first so that every node's
            // failure target (always a shallower node) is complete before use.
            Queue<TreeNode> pending = new Queue<TreeNode>();

            // level 1 nodes - fail to root node
            foreach (TreeNode child in _root.Transitions)
            {
                child.Failure = _root;
                foreach (TreeNode grandChild in child.Transitions)
                {
                    pending.Enqueue(grandChild);
                }
            }

            // other nodes
            while (pending.Count > 0)
            {
                TreeNode node = pending.Dequeue();
                char key = Normalize(node.Char);

                // The root's Failure is still null here, which terminates this walk.
                TreeNode fallback = node.Parent.Failure;
                while (fallback != null && fallback.GetTransition(key) == null)
                {
                    fallback = fallback.Failure;
                }

                if (fallback == null)
                {
                    node.Failure = _root;
                }
                else
                {
                    node.Failure = fallback.GetTransition(key);
                    // Keywords ending at the failure node also end here.
                    foreach (string result in node.Failure.Results)
                    {
                        node.AddResult(result);
                    }
                }

                foreach (TreeNode child in node.Transitions)
                {
                    pending.Enqueue(child);
                }
            }

            // Set last: the loop above relies on the root having no failure link.
            _root.Failure = _root;
        }

        /// <summary>
        /// Advances the automaton by one input character.
        /// </summary>
        /// <param name="state">Current node</param>
        /// <param name="c">Next character of the searched text</param>
        /// <returns>Node reached after consuming the character (the root when nothing matches)</returns>
        private TreeNode NextState(TreeNode state, char c)
        {
            char key = Normalize(c);
            TreeNode next = state.GetTransition(key);
            while (next == null && state != _root)
            {
                state = state.Failure;
                next = state.GetTransition(key);
            }
            return next ?? _root;
        }

        #endregion

        #region Methods & Properties

        /// <summary>
        /// Ignore case of letters (changing this property rebuilds the
        /// keyword tree so that it stays consistent with the search).
        /// </summary>
        public bool IgnoreCase
        {
            get { return _ignoreCase; }
            set
            {
                if (_ignoreCase == value)
                {
                    return;
                }
                _ignoreCase = value;
                BuildTree();
            }
        }

        /// <summary>
        /// Keywords to search for (setting this property is slow, because
        /// it requires rebuilding of the keyword tree).
        /// </summary>
        public string[] Keywords
        {
            get { return _keywords; }
            set
            {
                _keywords = value;
                BuildTree();
            }
        }

        /// <summary>
        /// Searches passed text and returns all occurrences of any keyword.
        /// </summary>
        /// <param name="text">Text to search</param>
        /// <returns>Array of occurrences, ordered by the position where each match ends</returns>
        /// <exception cref="ArgumentNullException">text is null</exception>
        public StringSearchResult[] FindAll(string text)
        {
            if (text == null)
            {
                throw new ArgumentNullException("text");
            }

            List<StringSearchResult> matches = new List<StringSearchResult>();
            TreeNode state = _root;
            for (int index = 0; index < text.Length; index++)
            {
                state = NextState(state, text[index]);
                foreach (string found in state.Results)
                {
                    // index is the last character of the match
                    matches.Add(new StringSearchResult(index - found.Length + 1, found));
                }
            }
            return matches.ToArray();
        }

        /// <summary>
        /// Searches passed text and returns first occurrence of any keyword
        /// (the first keyword recorded at the earliest position where a match ends).
        /// </summary>
        /// <param name="text">Text to search</param>
        /// <returns>First occurrence of any keyword (or StringSearchResult.Empty if text doesn't contain any keyword)</returns>
        /// <exception cref="ArgumentNullException">text is null</exception>
        public StringSearchResult FindFirst(string text)
        {
            if (text == null)
            {
                throw new ArgumentNullException("text");
            }

            TreeNode state = _root;
            for (int index = 0; index < text.Length; index++)
            {
                state = NextState(state, text[index]);
                if (state.Results.Count > 0)
                {
                    string found = state.Results[0];
                    return new StringSearchResult(index - found.Length + 1, found);
                }
            }
            return StringSearchResult.Empty;
        }

        /// <summary>
        /// Searches passed text and returns true if text contains any keyword.
        /// </summary>
        /// <param name="text">Text to search</param>
        /// <returns>True when text contains any keyword</returns>
        /// <exception cref="ArgumentNullException">text is null</exception>
        public bool ContainsAny(string text)
        {
            if (text == null)
            {
                throw new ArgumentNullException("text");
            }

            TreeNode state = _root;
            for (int index = 0; index < text.Length; index++)
            {
                state = NextState(state, text[index]);
                if (state.Results.Count > 0)
                {
                    return true;
                }
            }
            return false;
        }

        #endregion
    }
}

示例下载页面:http://www.uushare.com/user/m2nlight/file/2722093

StringSearch.7z
类型:7Z 压缩文件
大小:32.5 KB

你可能感兴趣的:(算法,C,C++,C#)