用Ruby寫了個NFA
今天有點空閑,想想用Ruby寫個NFA試試。從正則表達式構造NFA采用經典的Thompson算法:正則表達式 -> 後綴表達式 -> 構造NFA。構造了NFA後,用之匹配字符串。一句話,寫了個玩具的正則表達式引擎,支持concatenation、alternation以及*、?、+量詞,不支持反向引用和轉義符。測試了下與Ruby自帶的正則表達式引擎的性能對比,慢了3倍
nfa.rb
module NFA
class NFA
def initialize(state)
@state=state
end
def step(clist,c)
return clist if clist.size==0;
nlist=[]
allNull = true
matched = false
clist.each do |t|
if !t.nil?
allNull = false if t.c!=-1
if t.c == c && t.end.type ==1 then
matched = true
nlist.push(t.end.out1) if !t.end.out1.end.nil?
nlist.push(t.end.out2) if !t.end.out2.end.nil?
elsif (t.c == c && t.end.type == 0) then
matched = true;
return ListUitls.new_list(t);
elsif (t.c == -1 && !t.end.nil?) then
nlist.push(t.end.out1);
nlist.push(t.end.out2);
end
end
end
return step(nlist, c) if (allNull)
return step(nlist, c) if (!matched)
nlist
end
def test?(s)
match(@state,s)
end
def match(state,s)
clist =[]
clist.push(state.out1);
clist.push(state.out2);
s.each_byte do |c|
c =c&0xFF;
clist = step(clist, c);
return false if clist.size==0
end
return is_match?(clist)
end
def is_match?(clist)
clist.each do |t|
return true if !t.nil? and t.c==-1 and t.end and t.end.is_matched?
end
false
end
end
class Paren
attr_accessor:n_alt,:n_atom
end
class State
attr_accessor :out1,:out2,:type
def initialize(out1,out2)
@out1=out1
@out2=out2
@type=1
end
def is_matched?
return @type==0
end
end
class Transition
attr_accessor :c,:end
def initialize(c)
@c=c
end
end
class Frame
attr_accessor :start,:outs
def initialize(start,outs)
@start=start
@outs=outs
end
end
class ListUitls
def self.link(list,state)
list.each{|t| t.end=state}
end
def self.append(list1,list2)
list1+list2
end
def self.new_list(out)
result=[]
result.push(out)
result
end
end
def self.compile(re)
post = re2post(re)
raise ArgumentError.new,"bad regexp!" if post.nil?
state = post2nfa(post);
raise RuntimeError.new,"construct nfa from postfix fail!" if state.nil?
return NFA.new(state);
end
def self.post2nfa(postfix)
stack=[]
s=nil
t=t1=t2=nil
e1=e2=e=nil
return nil if postfix.nil?
postfix.each_byte do |p|
case p.chr
when '.':
e2 = stack.pop()
e1 = stack.pop()
ListUitls.link(e1.outs, e2.start)
stack.push(Frame.new(e1.start, e2.outs))
when '|':
e2 = stack.pop()
e1 = stack.pop()
t1 = Transition.new(-1)
t2 = Transition.new(-1)
t1.end = e1.start
t2.end = e2.start
s = State.new(t1, t2)
stack.push(Frame.new(s, ListUitls.append(e1.outs, e2.outs)))
when '?':
e = stack.pop()
t1 = Transition.new(-1)
t2 = Transition.new(-1)
t1.end = e.start
s = State.new(t1, t2)
stack.push(Frame.new(s, ListUitls.append(e.outs, ListUitls.new_list(t2))))
when '*':
e = stack.pop()
t1 = Transition.new(-1)
t2 = Transition.new(-1)
t1.end = e.start
s = State.new(t1, t2)
ListUitls.link(e.outs, s)
stack.push(Frame.new(s, ListUitls.new_list(s.out2)))
when '+':
e = stack.pop()
t1 = Transition.new(-1)
t2 = Transition.new(-1)
t1.end = e.start
s = State.new(t1, t2)
ListUitls.link(e.outs, s)
stack.push(Frame.new(e.start, ListUitls.new_list(t2)))
else
t = Transition.new(p)
s = State.new(t, Transition.new(-1))
stack.push(Frame.new(s, ListUitls.new_list(s.out1)))
end
end
e = stack.pop()
return nil if stack.size()>0
end_state = State.new(nil, nil)
end_state.type=0
e.outs.each do |tran|
if tran.c!=-1
t1 = Transition.new(-1)
t2 = Transition.new(-1)
s=State.new(t1,t2)
tran.end=s
s.out1.end=end_state
s.out2.end=end_state
else
tran.end=end_state
end
end
start = e.start
return start
end
def self.re2post(re)
n_alt = n_atom = 0
result=""
paren=[]
re.each_byte do |c|
case c.chr
when '(' then
if (n_atom > 1) then
n_atom-=1
result<<"."
end
p =Paren.new
p.n_alt = n_alt
p.n_atom = n_atom
paren.push(p)
n_alt = n_atom = 0
when '|' then
if (n_atom == 0)
return nil
end
while (n_atom-=1) > 0
result<<"."
end
n_alt+=1
when ')' then
if (paren.size() == 0)
return nil
end
if (n_atom == 0)
return nil
end
while (n_atom-=1)>0
result<<"."
end
while(n_alt>0)
result<<"|"
n_alt-=1
end
p = paren.pop()
n_alt = p.n_alt
n_atom = p.n_atom
n_atom+=1
when '*','+','?':
if (n_atom == 0)
return nil
end
result<<c
else
if (n_atom > 1)
n_atom-=1
result<<"."
end
result<<c
n_atom+=1
end
end
return nil if paren.size()>0
while ( (n_atom-=1)> 0)
result<<"."
end
while(n_alt>0)
n_alt-=1
result<<"|"
end
result
end
end
class NFA
def initialize(state)
@state=state
end
def step(clist,c)
return clist if clist.size==0;
nlist=[]
allNull = true
matched = false
clist.each do |t|
if !t.nil?
allNull = false if t.c!=-1
if t.c == c && t.end.type ==1 then
matched = true
nlist.push(t.end.out1) if !t.end.out1.end.nil?
nlist.push(t.end.out2) if !t.end.out2.end.nil?
elsif (t.c == c && t.end.type == 0) then
matched = true;
return ListUitls.new_list(t);
elsif (t.c == -1 && !t.end.nil?) then
nlist.push(t.end.out1);
nlist.push(t.end.out2);
end
end
end
return step(nlist, c) if (allNull)
return step(nlist, c) if (!matched)
nlist
end
def test?(s)
match(@state,s)
end
def match(state,s)
clist =[]
clist.push(state.out1);
clist.push(state.out2);
s.each_byte do |c|
c =c&0xFF;
clist = step(clist, c);
return false if clist.size==0
end
return is_match?(clist)
end
def is_match?(clist)
clist.each do |t|
return true if !t.nil? and t.c==-1 and t.end and t.end.is_matched?
end
false
end
end
class Paren
attr_accessor:n_alt,:n_atom
end
class State
attr_accessor :out1,:out2,:type
def initialize(out1,out2)
@out1=out1
@out2=out2
@type=1
end
def is_matched?
return @type==0
end
end
class Transition
attr_accessor :c,:end
def initialize(c)
@c=c
end
end
class Frame
attr_accessor :start,:outs
def initialize(start,outs)
@start=start
@outs=outs
end
end
class ListUitls
def self.link(list,state)
list.each{|t| t.end=state}
end
def self.append(list1,list2)
list1+list2
end
def self.new_list(out)
result=[]
result.push(out)
result
end
end
def self.compile(re)
post = re2post(re)
raise ArgumentError.new,"bad regexp!" if post.nil?
state = post2nfa(post);
raise RuntimeError.new,"construct nfa from postfix fail!" if state.nil?
return NFA.new(state);
end
def self.post2nfa(postfix)
stack=[]
s=nil
t=t1=t2=nil
e1=e2=e=nil
return nil if postfix.nil?
postfix.each_byte do |p|
case p.chr
when '.':
e2 = stack.pop()
e1 = stack.pop()
ListUitls.link(e1.outs, e2.start)
stack.push(Frame.new(e1.start, e2.outs))
when '|':
e2 = stack.pop()
e1 = stack.pop()
t1 = Transition.new(-1)
t2 = Transition.new(-1)
t1.end = e1.start
t2.end = e2.start
s = State.new(t1, t2)
stack.push(Frame.new(s, ListUitls.append(e1.outs, e2.outs)))
when '?':
e = stack.pop()
t1 = Transition.new(-1)
t2 = Transition.new(-1)
t1.end = e.start
s = State.new(t1, t2)
stack.push(Frame.new(s, ListUitls.append(e.outs, ListUitls.new_list(t2))))
when '*':
e = stack.pop()
t1 = Transition.new(-1)
t2 = Transition.new(-1)
t1.end = e.start
s = State.new(t1, t2)
ListUitls.link(e.outs, s)
stack.push(Frame.new(s, ListUitls.new_list(s.out2)))
when '+':
e = stack.pop()
t1 = Transition.new(-1)
t2 = Transition.new(-1)
t1.end = e.start
s = State.new(t1, t2)
ListUitls.link(e.outs, s)
stack.push(Frame.new(e.start, ListUitls.new_list(t2)))
else
t = Transition.new(p)
s = State.new(t, Transition.new(-1))
stack.push(Frame.new(s, ListUitls.new_list(s.out1)))
end
end
e = stack.pop()
return nil if stack.size()>0
end_state = State.new(nil, nil)
end_state.type=0
e.outs.each do |tran|
if tran.c!=-1
t1 = Transition.new(-1)
t2 = Transition.new(-1)
s=State.new(t1,t2)
tran.end=s
s.out1.end=end_state
s.out2.end=end_state
else
tran.end=end_state
end
end
start = e.start
return start
end
def self.re2post(re)
n_alt = n_atom = 0
result=""
paren=[]
re.each_byte do |c|
case c.chr
when '(' then
if (n_atom > 1) then
n_atom-=1
result<<"."
end
p =Paren.new
p.n_alt = n_alt
p.n_atom = n_atom
paren.push(p)
n_alt = n_atom = 0
when '|' then
if (n_atom == 0)
return nil
end
while (n_atom-=1) > 0
result<<"."
end
n_alt+=1
when ')' then
if (paren.size() == 0)
return nil
end
if (n_atom == 0)
return nil
end
while (n_atom-=1)>0
result<<"."
end
while(n_alt>0)
result<<"|"
n_alt-=1
end
p = paren.pop()
n_alt = p.n_alt
n_atom = p.n_atom
n_atom+=1
when '*','+','?':
if (n_atom == 0)
return nil
end
result<<c
else
if (n_atom > 1)
n_atom-=1
result<<"."
end
result<<c
n_atom+=1
end
end
return nil if paren.size()>0
while ( (n_atom-=1)> 0)
result<<"."
end
while(n_alt>0)
n_alt-=1
result<<"|"
end
result
end
end
使用:
nfa = NFA::compile("a(bb)+a(cdf)*")
assert nfa.test?("abba")
assert nfa.test?("abbbba")
assert !nfa.test?("a")
assert !nfa.test?("aa")
assert nfa.test?("abbacdf")
assert nfa.test?("abbbbacdfcdf")
assert !nfa.test?("bbbbacdfcdf")
assert !nfa.test?("abbbacdfcdf")
assert !nfa.test?("abbbbacdfdf")
assert !nfa.test?("abbbbacdfdfg")
assert nfa.test?("abba")
assert nfa.test?("abbbba")
assert !nfa.test?("a")
assert !nfa.test?("aa")
assert nfa.test?("abbacdf")
assert nfa.test?("abbbbacdfcdf")
assert !nfa.test?("bbbbacdfcdf")
assert !nfa.test?("abbbacdfcdf")
assert !nfa.test?("abbbbacdfdf")
assert !nfa.test?("abbbbacdfdfg")
文章轉自莊周夢蝶 ,原文發布時間2008-02-25
最後更新:2017-05-17 17:32:02