今天有点空闲,想想用Ruby写个NFA试试。从正则表达式构造NFA采用经典的Thompson算法:正则表达式 -> 后缀表达式 -> 构造NFA。构造了NFA后,用之匹配字符串。一句话,写了个玩具的正则表达式引擎,支持concatenation、alternation以及*、?、+量词,不支持反向引用和转义符。测试了下与Ruby自带的正则表达式引擎的性能对比,慢了3倍
。构造NFA没什么问题,主要是匹配运行写的烂,有空再改改。
nfa.rb
使用:
nfa.rb
module
NFA
class NFA
def initialize(state)
@state = state
end
def step(clist,c)
return clist if clist.size == 0;
nlist = []
allNull = true
matched = false
clist.each do | t |
if !t.nil?
allNull = false if t.c !=- 1
if t.c == c && t.end.type == 1 then
matched = true
nlist.push(t.end.out1) if !t.end.out1.end.nil?
nlist.push(t.end.out2) if !t.end.out2.end.nil?
elsif (t.c == c && t.end.type == 0) then
matched = true;
return ListUitls.new_list(t);
elsif (t.c == - 1 && !t.end.nil?) then
nlist.push(t.end.out1);
nlist.push(t.end.out2);
end
end
end
return step(nlist, c) if (allNull)
return step(nlist, c) if (!matched)
nlist
end
def test?(s)
match(@state,s)
end
def match(state,s)
clist = []
clist.push(state.out1);
clist.push(state.out2);
s.each_byte do | c |
c = c & 0xFF ;
clist = step(clist, c);
return false if clist.size == 0
end
return is_match?(clist)
end
def is_match?(clist)
clist.each do | t |
return true if !t.nil? and t.c ==- 1 and t.end and t.end.is_matched?
end
false
end
end
class Paren
attr_accessor:n_alt,:n_atom
end
class State
attr_accessor :out1,:out2,:type
def initialize(out1,out2)
@out1 = out1
@out2 = out2
@type = 1
end
def is_matched?
return @type == 0
end
end
class Transition
attr_accessor :c,:end
def initialize(c)
@c = c
end
end
class Frame
attr_accessor :start,:outs
def initialize(start,outs)
@start = start
@outs = outs
end
end
class ListUitls
def self.link(list,state)
list.each{ | t | t.end = state}
end
def self.append(list1,list2)
list1 + list2
end
def self.new_list(out)
result = []
result.push(out)
result
end
end
def self.compile(re)
post = re2post(re)
raise ArgumentError.new, " bad regexp! " if post.nil?
state = post2nfa(post);
raise RuntimeError.new, " construct nfa from postfix fail! " if state.nil?
return NFA.new(state);
end
def self.post2nfa(postfix)
stack = []
s = nil
t = t1 = t2 = nil
e1 = e2 = e = nil
return nil if postfix.nil?
postfix.each_byte do | p |
case p.chr
when ' . ' :
e2 = stack.pop()
e1 = stack.pop()
ListUitls.link(e1.outs, e2.start)
stack.push(Frame.new(e1.start, e2.outs))
when ' | ' :
e2 = stack.pop()
e1 = stack.pop()
t1 = Transition.new( - 1 )
t2 = Transition.new( - 1 )
t1.end = e1.start
t2.end = e2.start
s = State.new(t1, t2)
stack.push(Frame.new(s, ListUitls.append(e1.outs, e2.outs)))
when ' ? ' :
e = stack.pop()
t1 = Transition.new( - 1 )
t2 = Transition.new( - 1 )
t1.end = e.start
s = State.new(t1, t2)
stack.push(Frame.new(s, ListUitls.append(e.outs, ListUitls.new_list(t2))))
when ' * ' :
e = stack.pop()
t1 = Transition.new( - 1 )
t2 = Transition.new( - 1 )
t1.end = e.start
s = State.new(t1, t2)
ListUitls.link(e.outs, s)
stack.push(Frame.new(s, ListUitls.new_list(s.out2)))
when ' + ' :
e = stack.pop()
t1 = Transition.new( - 1 )
t2 = Transition.new( - 1 )
t1.end = e.start
s = State.new(t1, t2)
ListUitls.link(e.outs, s)
stack.push(Frame.new(e.start, ListUitls.new_list(t2)))
else
t = Transition.new(p)
s = State.new(t, Transition.new( - 1 ))
stack.push(Frame.new(s, ListUitls.new_list(s.out1)))
end
end
e = stack.pop()
return nil if stack.size() > 0
end_state = State.new(nil, nil)
end_state.type = 0
e.outs.each do | tran |
if tran.c !=- 1
t1 = Transition.new( - 1 )
t2 = Transition.new( - 1 )
s = State.new(t1,t2)
tran.end = s
s.out1.end = end_state
s.out2.end = end_state
else
tran.end = end_state
end
end
start = e.start
return start
end
def self.re2post(re)
n_alt = n_atom = 0
result = ""
paren = []
re.each_byte do | c |
case c.chr
when ' ( ' then
if (n_atom > 1 ) then
n_atom -= 1
result << " . "
end
p = Paren.new
p.n_alt = n_alt
p.n_atom = n_atom
paren.push(p)
n_alt = n_atom = 0
when ' | ' then
if (n_atom == 0)
return nil
end
while (n_atom -= 1 ) > 0
result << " . "
end
n_alt += 1
when ' ) ' then
if (paren.size() == 0)
return nil
end
if (n_atom == 0)
return nil
end
while (n_atom -= 1 ) > 0
result << " . "
end
while (n_alt > 0)
result << " | "
n_alt -= 1
end
p = paren.pop()
n_alt = p.n_alt
n_atom = p.n_atom
n_atom += 1
when ' * ' , ' + ' , ' ? ' :
if (n_atom == 0)
return nil
end
result << c
else
if (n_atom > 1 )
n_atom -= 1
result << " . "
end
result << c
n_atom += 1
end
end
return nil if paren.size() > 0
while ( (n_atom -= 1 ) > 0)
result << " . "
end
while (n_alt > 0)
n_alt -= 1
result << " | "
end
result
end
end
class NFA
def initialize(state)
@state = state
end
def step(clist,c)
return clist if clist.size == 0;
nlist = []
allNull = true
matched = false
clist.each do | t |
if !t.nil?
allNull = false if t.c !=- 1
if t.c == c && t.end.type == 1 then
matched = true
nlist.push(t.end.out1) if !t.end.out1.end.nil?
nlist.push(t.end.out2) if !t.end.out2.end.nil?
elsif (t.c == c && t.end.type == 0) then
matched = true;
return ListUitls.new_list(t);
elsif (t.c == - 1 && !t.end.nil?) then
nlist.push(t.end.out1);
nlist.push(t.end.out2);
end
end
end
return step(nlist, c) if (allNull)
return step(nlist, c) if (!matched)
nlist
end
def test?(s)
match(@state,s)
end
def match(state,s)
clist = []
clist.push(state.out1);
clist.push(state.out2);
s.each_byte do | c |
c = c & 0xFF ;
clist = step(clist, c);
return false if clist.size == 0
end
return is_match?(clist)
end
def is_match?(clist)
clist.each do | t |
return true if !t.nil? and t.c ==- 1 and t.end and t.end.is_matched?
end
false
end
end
class Paren
attr_accessor:n_alt,:n_atom
end
class State
attr_accessor :out1,:out2,:type
def initialize(out1,out2)
@out1 = out1
@out2 = out2
@type = 1
end
def is_matched?
return @type == 0
end
end
class Transition
attr_accessor :c,:end
def initialize(c)
@c = c
end
end
class Frame
attr_accessor :start,:outs
def initialize(start,outs)
@start = start
@outs = outs
end
end
class ListUitls
def self.link(list,state)
list.each{ | t | t.end = state}
end
def self.append(list1,list2)
list1 + list2
end
def self.new_list(out)
result = []
result.push(out)
result
end
end
def self.compile(re)
post = re2post(re)
raise ArgumentError.new, " bad regexp! " if post.nil?
state = post2nfa(post);
raise RuntimeError.new, " construct nfa from postfix fail! " if state.nil?
return NFA.new(state);
end
def self.post2nfa(postfix)
stack = []
s = nil
t = t1 = t2 = nil
e1 = e2 = e = nil
return nil if postfix.nil?
postfix.each_byte do | p |
case p.chr
when ' . ' :
e2 = stack.pop()
e1 = stack.pop()
ListUitls.link(e1.outs, e2.start)
stack.push(Frame.new(e1.start, e2.outs))
when ' | ' :
e2 = stack.pop()
e1 = stack.pop()
t1 = Transition.new( - 1 )
t2 = Transition.new( - 1 )
t1.end = e1.start
t2.end = e2.start
s = State.new(t1, t2)
stack.push(Frame.new(s, ListUitls.append(e1.outs, e2.outs)))
when ' ? ' :
e = stack.pop()
t1 = Transition.new( - 1 )
t2 = Transition.new( - 1 )
t1.end = e.start
s = State.new(t1, t2)
stack.push(Frame.new(s, ListUitls.append(e.outs, ListUitls.new_list(t2))))
when ' * ' :
e = stack.pop()
t1 = Transition.new( - 1 )
t2 = Transition.new( - 1 )
t1.end = e.start
s = State.new(t1, t2)
ListUitls.link(e.outs, s)
stack.push(Frame.new(s, ListUitls.new_list(s.out2)))
when ' + ' :
e = stack.pop()
t1 = Transition.new( - 1 )
t2 = Transition.new( - 1 )
t1.end = e.start
s = State.new(t1, t2)
ListUitls.link(e.outs, s)
stack.push(Frame.new(e.start, ListUitls.new_list(t2)))
else
t = Transition.new(p)
s = State.new(t, Transition.new( - 1 ))
stack.push(Frame.new(s, ListUitls.new_list(s.out1)))
end
end
e = stack.pop()
return nil if stack.size() > 0
end_state = State.new(nil, nil)
end_state.type = 0
e.outs.each do | tran |
if tran.c !=- 1
t1 = Transition.new( - 1 )
t2 = Transition.new( - 1 )
s = State.new(t1,t2)
tran.end = s
s.out1.end = end_state
s.out2.end = end_state
else
tran.end = end_state
end
end
start = e.start
return start
end
def self.re2post(re)
n_alt = n_atom = 0
result = ""
paren = []
re.each_byte do | c |
case c.chr
when ' ( ' then
if (n_atom > 1 ) then
n_atom -= 1
result << " . "
end
p = Paren.new
p.n_alt = n_alt
p.n_atom = n_atom
paren.push(p)
n_alt = n_atom = 0
when ' | ' then
if (n_atom == 0)
return nil
end
while (n_atom -= 1 ) > 0
result << " . "
end
n_alt += 1
when ' ) ' then
if (paren.size() == 0)
return nil
end
if (n_atom == 0)
return nil
end
while (n_atom -= 1 ) > 0
result << " . "
end
while (n_alt > 0)
result << " | "
n_alt -= 1
end
p = paren.pop()
n_alt = p.n_alt
n_atom = p.n_atom
n_atom += 1
when ' * ' , ' + ' , ' ? ' :
if (n_atom == 0)
return nil
end
result << c
else
if (n_atom > 1 )
n_atom -= 1
result << " . "
end
result << c
n_atom += 1
end
end
return nil if paren.size() > 0
while ( (n_atom -= 1 ) > 0)
result << " . "
end
while (n_alt > 0)
n_alt -= 1
result << " | "
end
result
end
end
使用:
nfa
=
NFA::compile(
"
a(bb)+a(cdf)*
"
)
assert nfa.test?( " abba " )
assert nfa.test?( " abbbba " )
assert !nfa.test?( " a " )
assert !nfa.test?( " aa " )
assert nfa.test?( " abbacdf " )
assert nfa.test?( " abbbbacdfcdf " )
assert !nfa.test?( " bbbbacdfcdf " )
assert !nfa.test?( " abbbacdfcdf " )
assert !nfa.test?( " abbbbacdfdf " )
assert !nfa.test?( " abbbbacdfdfg " )
assert nfa.test?( " abba " )
assert nfa.test?( " abbbba " )
assert !nfa.test?( " a " )
assert !nfa.test?( " aa " )
assert nfa.test?( " abbacdf " )
assert nfa.test?( " abbbbacdfcdf " )
assert !nfa.test?( " bbbbacdfcdf " )
assert !nfa.test?( " abbbacdfcdf " )
assert !nfa.test?( " abbbbacdfdf " )
assert !nfa.test?( " abbbbacdfdfg " )