今天有点空闲,想想用Ruby写个NFA试试。从正则表达式构造NFA采用经典的Thompson算法:正则表达式 -> 后缀表达式 -> 构造NFA。构造了NFA后,用之匹配字符串。一句话,写了个玩具级的正则表达式引擎,支持concatenation、alternation以及 *、?、+ 量词,不支持反向引用和转义符。测试了下与Ruby自带的正则表达式引擎的性能对比,慢了3倍。构造NFA没什么问题,主要是匹配运行部分写得烂,有空再改改。
nfa.rb
# Toy regular-expression engine built with Thompson's construction:
# regexp -> postfix expression -> NFA, which is then simulated one input byte
# at a time.
#
# Supported syntax: concatenation, alternation (|), grouping with parentheses
# and the quantifiers *, + and ?. There is no escape character and no
# back-reference support. Because the postfix form uses "." as the explicit
# concatenation operator, a literal "." cannot appear in a pattern and is
# rejected. Patterns and inputs are processed as raw bytes, and a pattern must
# match the entire input.
module NFA
  # Label carried by transitions that consume no input.
  EPSILON = -1

  # A compiled automaton, identified by its start state.
  class NFA
    # @param state [State] start state of the automaton
    def initialize(state)
      @state = state
    end

    # Consumes one input byte.
    #
    # The incoming transitions are first expanded through every epsilon move;
    # only then are the transitions labelled with +c+ followed. Keeping the
    # two phases separate guarantees that every transition in the returned
    # list belongs to a path that really consumed +c+.
    #
    # @param clist [Array<Transition, nil>] transitions currently available
    # @param c [Integer] input byte
    # @return [Array<Transition>] transitions available after consuming +c+
    #   (empty when the automaton is stuck)
    def step(clist, c)
      return clist if clist.empty?

      nlist = []
      closure(clist).each do |t|
        next unless t.c == c && t.end

        [t.end.out1, t.end.out2].each do |out|
          # Transitions without a target are the unused second slot of a
          # literal state; they can never lead anywhere.
          nlist.push(out) if out && out.end
        end
      end
      nlist
    end

    # @param s [String] input to test
    # @return [Boolean] true when the whole of +s+ matches the pattern
    def test?(s)
      match(@state, s)
    end

    # Runs the simulation from +state+ over every byte of +s+.
    #
    # @param state [State] state to start from
    # @param s [String] input to test
    # @return [Boolean] true when the whole of +s+ is accepted
    def match(state, s)
      clist = [state.out1, state.out2]
      s.each_byte do |c|
        clist = step(clist, c)
        return false if clist.empty?
      end
      is_match?(clist)
    end

    # @param clist [Array<Transition, nil>] transitions currently available
    # @return [Boolean] true when the accepting state is reachable from
    #   +clist+ without consuming further input
    def is_match?(clist)
      closure(clist).any? do |t|
        t.c == EPSILON && t.end && t.end.is_matched?
      end
    end

    private

    # Collects every transition reachable from +clist+ through epsilon moves,
    # including the members of +clist+ themselves. Each transition is visited
    # once, so epsilon cycles (as produced by nested quantifiers such as
    # "a**") terminate.
    def closure(clist)
      seen = {}.compare_by_identity
      reached = []
      pending = clist.dup
      until pending.empty?
        t = pending.pop
        next if t.nil? || seen.key?(t)

        seen[t] = true
        reached << t
        next unless t.c == EPSILON && t.end

        # The accepting state has nil outputs; they are skipped when popped.
        pending.push(t.end.out1, t.end.out2)
      end
      reached
    end
  end

  # Counters saved by re2post when it enters a parenthesised group.
  class Paren
    attr_accessor :n_alt, :n_atom
  end

  # NFA state with up to two outgoing transitions. +type+ is 1 for an
  # ordinary state and 0 for the accepting state.
  class State
    attr_accessor :out1, :out2, :type

    def initialize(out1, out2)
      @out1 = out1
      @out2 = out2
      @type = 1
    end

    # @return [Boolean] true for the accepting state
    def is_matched?
      @type == 0
    end
  end

  # Edge of the automaton. +c+ is the byte it consumes, or EPSILON; +end+ is
  # the target state, nil until the edge has been linked.
  class Transition
    attr_accessor :c, :end

    def initialize(c)
      @c = c
      @end = nil
    end
  end

  # Partially built automaton: its start state plus the transitions that
  # still have to be linked to whatever follows.
  class Frame
    attr_accessor :start, :outs

    def initialize(start, outs)
      @start = start
      @outs = outs
    end
  end

  # Helpers for lists of transitions.
  class ListUitls
    # Points every transition of +list+ at +state+.
    def self.link(list, state)
      list.each { |t| t.end = state }
    end

    # @return [Array] new list holding the elements of both lists
    def self.append(list1, list2)
      list1 + list2
    end

    # @return [Array] new single-element list
    def self.new_list(out)
      [out]
    end
  end

  # Compiles a pattern into an automaton.
  #
  # @param re [String] pattern
  # @return [NFA] compiled automaton
  # @raise [ArgumentError] when the pattern is malformed
  # @raise [RuntimeError] when the postfix form cannot be turned into an NFA
  def self.compile(re)
    post = re2post(re)
    raise ArgumentError, "bad regexp!" if post.nil?

    state = post2nfa(post)
    raise RuntimeError, "construct nfa from postfix fail!" if state.nil?

    NFA.new(state)
  end

  # Builds the automaton for a postfix expression as produced by re2post.
  #
  # @param postfix [String, nil] postfix expression
  # @return [State, nil] start state, or nil when +postfix+ is nil or is not
  #   a well-formed postfix expression
  def self.post2nfa(postfix)
    return nil if postfix.nil?

    stack = []
    postfix.each_byte do |byte|
      case byte.chr
      when '.'
        # Concatenation: the first operand's loose ends enter the second.
        return nil if stack.size < 2

        e2 = stack.pop
        e1 = stack.pop
        ListUitls.link(e1.outs, e2.start)
        stack.push(Frame.new(e1.start, e2.outs))
      when '|'
        # Alternation: a new state branches into both operands.
        return nil if stack.size < 2

        e2 = stack.pop
        e1 = stack.pop
        t1 = Transition.new(EPSILON)
        t2 = Transition.new(EPSILON)
        t1.end = e1.start
        t2.end = e2.start
        s = State.new(t1, t2)
        stack.push(Frame.new(s, ListUitls.append(e1.outs, e2.outs)))
      when '?'
        # Zero or one: enter the operand or bypass it.
        return nil if stack.empty?

        e = stack.pop
        t1 = Transition.new(EPSILON)
        t2 = Transition.new(EPSILON)
        t1.end = e.start
        s = State.new(t1, t2)
        stack.push(Frame.new(s, ListUitls.append(e.outs, ListUitls.new_list(t2))))
      when '*'
        # Zero or more: the loop state is both the entry and the exit point.
        return nil if stack.empty?

        e = stack.pop
        t1 = Transition.new(EPSILON)
        t2 = Transition.new(EPSILON)
        t1.end = e.start
        s = State.new(t1, t2)
        ListUitls.link(e.outs, s)
        stack.push(Frame.new(s, ListUitls.new_list(s.out2)))
      when '+'
        # One or more: like "*", but entered through the operand itself.
        return nil if stack.empty?

        e = stack.pop
        t1 = Transition.new(EPSILON)
        t2 = Transition.new(EPSILON)
        t1.end = e.start
        s = State.new(t1, t2)
        ListUitls.link(e.outs, s)
        stack.push(Frame.new(e.start, ListUitls.new_list(t2)))
      else
        # Literal byte. The second output slot stays unlinked.
        t = Transition.new(byte)
        s = State.new(t, Transition.new(EPSILON))
        stack.push(Frame.new(s, ListUitls.new_list(s.out1)))
      end
    end
    # Exactly one frame must remain, otherwise the expression was malformed.
    return nil unless stack.size == 1

    e = stack.pop
    end_state = State.new(nil, nil)
    end_state.type = 0
    e.outs.each do |tran|
      if tran.c != EPSILON
        # A consuming transition reaches the accepting state through an
        # intermediate state, so acceptance is always signalled by an epsilon
        # transition into the accepting state.
        s = State.new(Transition.new(EPSILON), Transition.new(EPSILON))
        tran.end = s
        s.out1.end = end_state
        s.out2.end = end_state
      else
        tran.end = end_state
      end
    end
    e.start
  end

  # Converts an infix pattern to postfix, inserting "." as the explicit
  # concatenation operator.
  #
  # @param re [String] pattern
  # @return [String, nil] postfix expression (binary string), or nil when the
  #   pattern is malformed: empty pattern or group, dangling "|", quantifier
  #   without operand, unbalanced parentheses, or a literal "."
  def self.re2post(re)
    n_alt = 0
    n_atom = 0
    # Binary buffer: appending an Integer then adds exactly that byte, so
    # bytes above 127 are not re-encoded as multi-byte characters.
    result = String.new
    paren = []
    re.each_byte do |c|
      case c.chr
      when '('
        if n_atom > 1
          n_atom -= 1
          result << '.'
        end
        saved = Paren.new
        saved.n_alt = n_alt
        saved.n_atom = n_atom
        paren.push(saved)
        n_alt = 0
        n_atom = 0
      when '|'
        return nil if n_atom.zero?

        (n_atom - 1).times { result << '.' }
        n_atom = 0
        n_alt += 1
      when ')'
        return nil if paren.empty? || n_atom.zero?

        (n_atom - 1).times { result << '.' }
        n_alt.times { result << '|' }
        saved = paren.pop
        n_alt = saved.n_alt
        n_atom = saved.n_atom + 1
      when '*', '+', '?'
        return nil if n_atom.zero?

        result << c
      when '.'
        # Would be indistinguishable from the concatenation operator.
        return nil
      else
        if n_atom > 1
          n_atom -= 1
          result << '.'
        end
        result << c
        n_atom += 1
      end
    end
    # n_atom is zero here for an empty pattern or a trailing "|".
    return nil if !paren.empty? || n_atom.zero?

    (n_atom - 1).times { result << '.' }
    n_alt.times { result << '|' }
    result
  end
end
使用的话:
# Usage example: compile a pattern once, then probe it with several inputs.
# Each row is [input, whether the whole input is expected to match].
# NOTE(review): `assert` is presumably supplied by the surrounding test
# framework; it is not defined in this snippet.
nfa = NFA.compile("a(bb)+a(cdf)*")
[
  ["abba", true],
  ["abbbba", true],
  ["a", false],
  ["aa", false],
  ["abbacdf", true],
  ["abbbbacdfcdf", true],
  ["bbbbacdfcdf", false],
  ["abbbacdfcdf", false],
  ["abbbbacdfdf", false],
  ["abbbbacdfdfg", false]
].each do |input, expected|
  assert(expected ? nfa.test?(input) : !nfa.test?(input))
end