用Ruby写了个NFA
今天有点空闲,想想用Ruby写个NFA试试。从正则表达式构造NFA采用经典的Thompson算法:正则表达式 -> 后缀表达式 -> 构造NFA。构造了NFA后,用之匹配字符串。一句话,写了个玩具的正则表达式引擎,支持concatenation、alternation以及*、?、+量词,不支持反向引用和转义符。测试了下与Ruby自带的正则表达式引擎的性能对比,慢了3倍
nfa.rb
module NFA
class NFA
def initialize(state)
@state=state
end
def step(clist,c)
return clist if clist.size==0;
nlist=[]
allNull = true
matched = false
clist.each do |t|
if !t.nil?
allNull = false if t.c!=-1
if t.c == c && t.end.type ==1 then
matched = true
nlist.push(t.end.out1) if !t.end.out1.end.nil?
nlist.push(t.end.out2) if !t.end.out2.end.nil?
elsif (t.c == c && t.end.type == 0) then
matched = true;
return ListUitls.new_list(t);
elsif (t.c == -1 && !t.end.nil?) then
nlist.push(t.end.out1);
nlist.push(t.end.out2);
end
end
end
return step(nlist, c) if (allNull)
return step(nlist, c) if (!matched)
nlist
end
def test?(s)
match(@state,s)
end
def match(state,s)
clist =[]
clist.push(state.out1);
clist.push(state.out2);
s.each_byte do |c|
c =c&0xFF;
clist = step(clist, c);
return false if clist.size==0
end
return is_match?(clist)
end
def is_match?(clist)
clist.each do |t|
return true if !t.nil? and t.c==-1 and t.end and t.end.is_matched?
end
false
end
end
class Paren
attr_accessor:n_alt,:n_atom
end
class State
attr_accessor :out1,:out2,:type
def initialize(out1,out2)
@out1=out1
@out2=out2
@type=1
end
def is_matched?
return @type==0
end
end
class Transition
attr_accessor :c,:end
def initialize(c)
@c=c
end
end
class Frame
attr_accessor :start,:outs
def initialize(start,outs)
@start=start
@outs=outs
end
end
class ListUitls
def self.link(list,state)
list.each{|t| t.end=state}
end
def self.append(list1,list2)
list1+list2
end
def self.new_list(out)
result=[]
result.push(out)
result
end
end
def self.compile(re)
post = re2post(re)
raise ArgumentError.new,"bad regexp!" if post.nil?
state = post2nfa(post);
raise RuntimeError.new,"construct nfa from postfix fail!" if state.nil?
return NFA.new(state);
end
def self.post2nfa(postfix)
stack=[]
s=nil
t=t1=t2=nil
e1=e2=e=nil
return nil if postfix.nil?
postfix.each_byte do |p|
case p.chr
when '.':
e2 = stack.pop()
e1 = stack.pop()
ListUitls.link(e1.outs, e2.start)
stack.push(Frame.new(e1.start, e2.outs))
when '|':
e2 = stack.pop()
e1 = stack.pop()
t1 = Transition.new(-1)
t2 = Transition.new(-1)
t1.end = e1.start
t2.end = e2.start
s = State.new(t1, t2)
stack.push(Frame.new(s, ListUitls.append(e1.outs, e2.outs)))
when '?':
e = stack.pop()
t1 = Transition.new(-1)
t2 = Transition.new(-1)
t1.end = e.start
s = State.new(t1, t2)
stack.push(Frame.new(s, ListUitls.append(e.outs, ListUitls.new_list(t2))))
when '*':
e = stack.pop()
t1 = Transition.new(-1)
t2 = Transition.new(-1)
t1.end = e.start
s = State.new(t1, t2)
ListUitls.link(e.outs, s)
stack.push(Frame.new(s, ListUitls.new_list(s.out2)))
when '+':
e = stack.pop()
t1 = Transition.new(-1)
t2 = Transition.new(-1)
t1.end = e.start
s = State.new(t1, t2)
ListUitls.link(e.outs, s)
stack.push(Frame.new(e.start, ListUitls.new_list(t2)))
else
t = Transition.new(p)
s = State.new(t, Transition.new(-1))
stack.push(Frame.new(s, ListUitls.new_list(s.out1)))
end
end
e = stack.pop()
return nil if stack.size()>0
end_state = State.new(nil, nil)
end_state.type=0
e.outs.each do |tran|
if tran.c!=-1
t1 = Transition.new(-1)
t2 = Transition.new(-1)
s=State.new(t1,t2)
tran.end=s
s.out1.end=end_state
s.out2.end=end_state
else
tran.end=end_state
end
end
start = e.start
return start
end
def self.re2post(re)
n_alt = n_atom = 0
result=""
paren=[]
re.each_byte do |c|
case c.chr
when '(' then
if (n_atom > 1) then
n_atom-=1
result<<"."
end
p =Paren.new
p.n_alt = n_alt
p.n_atom = n_atom
paren.push(p)
n_alt = n_atom = 0
when '|' then
if (n_atom == 0)
return nil
end
while (n_atom-=1) > 0
result<<"."
end
n_alt+=1
when ')' then
if (paren.size() == 0)
return nil
end
if (n_atom == 0)
return nil
end
while (n_atom-=1)>0
result<<"."
end
while(n_alt>0)
result<<"|"
n_alt-=1
end
p = paren.pop()
n_alt = p.n_alt
n_atom = p.n_atom
n_atom+=1
when '*','+','?':
if (n_atom == 0)
return nil
end
result<<c
else
if (n_atom > 1)
n_atom-=1
result<<"."
end
result<<c
n_atom+=1
end
end
return nil if paren.size()>0
while ( (n_atom-=1)> 0)
result<<"."
end
while(n_alt>0)
n_alt-=1
result<<"|"
end
result
end
end
class NFA
def initialize(state)
@state=state
end
def step(clist,c)
return clist if clist.size==0;
nlist=[]
allNull = true
matched = false
clist.each do |t|
if !t.nil?
allNull = false if t.c!=-1
if t.c == c && t.end.type ==1 then
matched = true
nlist.push(t.end.out1) if !t.end.out1.end.nil?
nlist.push(t.end.out2) if !t.end.out2.end.nil?
elsif (t.c == c && t.end.type == 0) then
matched = true;
return ListUitls.new_list(t);
elsif (t.c == -1 && !t.end.nil?) then
nlist.push(t.end.out1);
nlist.push(t.end.out2);
end
end
end
return step(nlist, c) if (allNull)
return step(nlist, c) if (!matched)
nlist
end
def test?(s)
match(@state,s)
end
def match(state,s)
clist =[]
clist.push(state.out1);
clist.push(state.out2);
s.each_byte do |c|
c =c&0xFF;
clist = step(clist, c);
return false if clist.size==0
end
return is_match?(clist)
end
def is_match?(clist)
clist.each do |t|
return true if !t.nil? and t.c==-1 and t.end and t.end.is_matched?
end
false
end
end
class Paren
attr_accessor:n_alt,:n_atom
end
class State
attr_accessor :out1,:out2,:type
def initialize(out1,out2)
@out1=out1
@out2=out2
@type=1
end
def is_matched?
return @type==0
end
end
class Transition
attr_accessor :c,:end
def initialize(c)
@c=c
end
end
class Frame
attr_accessor :start,:outs
def initialize(start,outs)
@start=start
@outs=outs
end
end
class ListUitls
def self.link(list,state)
list.each{|t| t.end=state}
end
def self.append(list1,list2)
list1+list2
end
def self.new_list(out)
result=[]
result.push(out)
result
end
end
def self.compile(re)
post = re2post(re)
raise ArgumentError.new,"bad regexp!" if post.nil?
state = post2nfa(post);
raise RuntimeError.new,"construct nfa from postfix fail!" if state.nil?
return NFA.new(state);
end
def self.post2nfa(postfix)
stack=[]
s=nil
t=t1=t2=nil
e1=e2=e=nil
return nil if postfix.nil?
postfix.each_byte do |p|
case p.chr
when '.':
e2 = stack.pop()
e1 = stack.pop()
ListUitls.link(e1.outs, e2.start)
stack.push(Frame.new(e1.start, e2.outs))
when '|':
e2 = stack.pop()
e1 = stack.pop()
t1 = Transition.new(-1)
t2 = Transition.new(-1)
t1.end = e1.start
t2.end = e2.start
s = State.new(t1, t2)
stack.push(Frame.new(s, ListUitls.append(e1.outs, e2.outs)))
when '?':
e = stack.pop()
t1 = Transition.new(-1)
t2 = Transition.new(-1)
t1.end = e.start
s = State.new(t1, t2)
stack.push(Frame.new(s, ListUitls.append(e.outs, ListUitls.new_list(t2))))
when '*':
e = stack.pop()
t1 = Transition.new(-1)
t2 = Transition.new(-1)
t1.end = e.start
s = State.new(t1, t2)
ListUitls.link(e.outs, s)
stack.push(Frame.new(s, ListUitls.new_list(s.out2)))
when '+':
e = stack.pop()
t1 = Transition.new(-1)
t2 = Transition.new(-1)
t1.end = e.start
s = State.new(t1, t2)
ListUitls.link(e.outs, s)
stack.push(Frame.new(e.start, ListUitls.new_list(t2)))
else
t = Transition.new(p)
s = State.new(t, Transition.new(-1))
stack.push(Frame.new(s, ListUitls.new_list(s.out1)))
end
end
e = stack.pop()
return nil if stack.size()>0
end_state = State.new(nil, nil)
end_state.type=0
e.outs.each do |tran|
if tran.c!=-1
t1 = Transition.new(-1)
t2 = Transition.new(-1)
s=State.new(t1,t2)
tran.end=s
s.out1.end=end_state
s.out2.end=end_state
else
tran.end=end_state
end
end
start = e.start
return start
end
def self.re2post(re)
n_alt = n_atom = 0
result=""
paren=[]
re.each_byte do |c|
case c.chr
when '(' then
if (n_atom > 1) then
n_atom-=1
result<<"."
end
p =Paren.new
p.n_alt = n_alt
p.n_atom = n_atom
paren.push(p)
n_alt = n_atom = 0
when '|' then
if (n_atom == 0)
return nil
end
while (n_atom-=1) > 0
result<<"."
end
n_alt+=1
when ')' then
if (paren.size() == 0)
return nil
end
if (n_atom == 0)
return nil
end
while (n_atom-=1)>0
result<<"."
end
while(n_alt>0)
result<<"|"
n_alt-=1
end
p = paren.pop()
n_alt = p.n_alt
n_atom = p.n_atom
n_atom+=1
when '*','+','?':
if (n_atom == 0)
return nil
end
result<<c
else
if (n_atom > 1)
n_atom-=1
result<<"."
end
result<<c
n_atom+=1
end
end
return nil if paren.size()>0
while ( (n_atom-=1)> 0)
result<<"."
end
while(n_alt>0)
n_alt-=1
result<<"|"
end
result
end
end
使用:
nfa = NFA::compile("a(bb)+a(cdf)*")
assert nfa.test?("abba")
assert nfa.test?("abbbba")
assert !nfa.test?("a")
assert !nfa.test?("aa")
assert nfa.test?("abbacdf")
assert nfa.test?("abbbbacdfcdf")
assert !nfa.test?("bbbbacdfcdf")
assert !nfa.test?("abbbacdfcdf")
assert !nfa.test?("abbbbacdfdf")
assert !nfa.test?("abbbbacdfdfg")
assert nfa.test?("abba")
assert nfa.test?("abbbba")
assert !nfa.test?("a")
assert !nfa.test?("aa")
assert nfa.test?("abbacdf")
assert nfa.test?("abbbbacdfcdf")
assert !nfa.test?("bbbbacdfcdf")
assert !nfa.test?("abbbacdfcdf")
assert !nfa.test?("abbbbacdfdf")
assert !nfa.test?("abbbbacdfdfg")
文章转自庄周梦蝶 ,原文发布时间2008-02-25
最后更新:2017-05-17 17:32:02