debian-python-whoosh/tests/test_automata.py

373 lines
9.5 KiB
Python

import gzip
import os.path
from bisect import bisect_left
import pytest
from whoosh.compat import permutations
from whoosh.compat import xrange
from whoosh.automata import fsa, glob, lev, reg
from whoosh.support.levenshtein import levenshtein
from whoosh.util import random_bytes
def test_nfa():
nfa = fsa.NFA(0)
nfa.add_transition(0, "a", 1)
nfa.add_transition(0, fsa.EPSILON, 4)
nfa.add_transition(0, "b", 1)
nfa.add_transition(1, "c", 4)
nfa.add_final_state(4)
assert nfa.accept("")
assert nfa.accept("ac")
assert nfa.accept("bc")
assert not nfa.accept("c")
def test_empty_string():
nfa = fsa.NFA(1)
nfa.add_final_state(1)
assert nfa.accept("")
assert not nfa.accept("a")
dfa = nfa.to_dfa()
assert dfa.accept("")
assert not dfa.accept("a")
def test_nfa2():
nfa = fsa.NFA(1)
nfa.add_transition(1, "a", 2)
nfa.add_transition(1, "c", 4)
nfa.add_transition(2, "b", 3)
nfa.add_transition(2, fsa.EPSILON, 1)
nfa.add_transition(3, "a", 2)
nfa.add_transition(4, "c", 3)
nfa.add_transition(4, fsa.EPSILON, 3)
nfa.add_final_state(3)
assert nfa.accept("ab")
assert nfa.accept("abab")
assert nfa.accept("cc")
assert nfa.accept("c")
assert nfa.accept("ccab")
assert nfa.accept("ccacc")
assert nfa.accept("ccac")
assert nfa.accept("abacab")
assert not nfa.accept("b")
assert not nfa.accept("a")
assert not nfa.accept("cb")
assert not nfa.accept("caa")
dfa = nfa.to_dfa()
assert dfa.accept("ab")
assert dfa.accept("abab")
assert dfa.accept("cc")
assert dfa.accept("c")
assert dfa.accept("ccab")
assert dfa.accept("ccacc")
assert dfa.accept("ccac")
assert dfa.accept("abacab")
assert not dfa.accept("b")
assert not dfa.accept("a")
assert not dfa.accept("cb")
assert not dfa.accept("caa")
def test_insert():
nfa1 = fsa.NFA(1)
nfa1.add_transition(1, "a", 2)
nfa1.add_transition(2, "b", 3)
nfa1.add_final_state(3)
nfa2 = fsa.NFA(4)
nfa2.add_transition(4, "x", 5)
nfa2.add_transition(4, "y", 5)
nfa2.insert(4, nfa1, 5)
nfa2.add_final_state(5)
assert nfa2.accept("x")
assert nfa2.accept("y")
assert nfa2.accept("ab")
assert not nfa2.accept("a")
def test_to_dfa():
nfa = fsa.NFA(0)
nfa.add_transition(0, "a", 1)
nfa.add_transition(0, fsa.EPSILON, 4)
nfa.add_transition(0, "b", 1)
nfa.add_transition(1, "c", 4)
nfa.add_final_state(4)
assert nfa.accept("")
dfa = nfa.to_dfa()
assert dfa.accept("")
assert dfa.accept("ac")
assert dfa.accept("bc")
assert not dfa.accept("c")
def test_glob_star():
nfa = glob.glob_automaton("a*c")
assert not nfa.accept("a")
assert not nfa.accept("c")
assert nfa.accept("ac")
assert nfa.accept("abc")
assert nfa.accept("abcc")
assert nfa.accept("abcac")
assert nfa.accept("aaaaaaaaaac")
assert not nfa.accept("abb")
dfa = nfa.to_dfa()
assert not dfa.accept("a")
assert not dfa.accept("c")
assert dfa.accept("ac")
assert dfa.accept("abc")
assert dfa.accept("abcc")
assert dfa.accept("abcac")
assert not dfa.accept("abb")
def test_glob_question():
nfa = glob.glob_automaton("?")
assert not nfa.accept("")
assert nfa.accept("a")
assert not nfa.accept("aa")
nfa = glob.glob_automaton("a?c")
assert not nfa.accept("a")
assert not nfa.accept("ac")
assert nfa.accept("abc")
assert not nfa.accept("aba")
def test_glob_range():
nfa = glob.glob_automaton("[ab][cd]")
assert not nfa.accept("")
assert not nfa.accept("a")
assert not nfa.accept("c")
assert nfa.accept("ac")
assert nfa.accept("bc")
assert nfa.accept("ad")
assert nfa.accept("bd")
assert not nfa.accept("acc")
# def test_glob_negate_range():
# nfa = glob.glob_automaton("a[!ab]a")
# assert not nfa.accept("aaa")
# assert not nfa.accept("aba")
# assert nfa.accept("aca")
# assert not nfa.accept("bcb")
class Skipper(object):
def __init__(self, data):
self.data = data
self.i = 0
def __call__(self, w):
if self.data[self.i] == w:
return w
self.i += 1
pos = bisect_left(self.data, w, self.i)
if pos < len(self.data):
return self.data[pos]
else:
return None
def test_levenshtein():
path = os.path.join(os.path.dirname(__file__), "english-words.10.gz")
wordfile = gzip.open(path, "rb")
words = sorted(line.decode("latin1").strip().lower() for line in wordfile)
def find_brute(target, k):
for w in words:
if levenshtein(w, target, k) <= k:
yield w
def find_auto(target, k):
dfa = lev.levenshtein_automaton(target, k).to_dfa()
sk = Skipper(words)
return fsa.find_all_matches(dfa, sk)
assert set(find_brute("look", 2)) == set(find_auto("look", 2))
assert set(find_brute("bend", 1)) == set(find_auto("bend", 1))
assert set(find_brute("puck", 1)) == set(find_auto("puck", 1))
assert set(find_brute("zero", 1)) == set(find_auto("zero", 1))
def test_levenshtein_prefix():
path = os.path.join(os.path.dirname(__file__), "english-words.10.gz")
wordfile = gzip.open(path, "rb")
words = sorted(line.decode("latin1").strip().lower() for line in wordfile)
prefixlen = 1
def find_brute(target, k):
for w in words:
d = levenshtein(w, target, k)
if d <= k and w[:prefixlen] == target[:prefixlen]:
yield w
def find_auto(target, k):
dfa = lev.levenshtein_automaton(target, k, prefix=prefixlen).to_dfa()
sk = Skipper(words)
return fsa.find_all_matches(dfa, sk)
assert set(find_brute("look", 2)) == set(find_auto("look", 2))
assert set(find_brute("bend", 1)) == set(find_auto("bend", 1))
assert set(find_brute("puck", 1)) == set(find_auto("puck", 1))
assert set(find_brute("zero", 1)) == set(find_auto("zero", 1))
def test_basics():
n = fsa.epsilon_nfa()
assert n.accept("")
assert not n.accept("a")
n = fsa.basic_nfa("a")
assert not n.accept("")
assert n.accept("a")
assert not n.accept("b")
n = fsa.dot_nfa()
assert not n.accept("")
assert n.accept("a")
assert n.accept("b")
def test_concat():
n = fsa.concat_nfa(fsa.basic_nfa("a"), fsa.basic_nfa("b"))
assert not n.accept("")
assert not n.accept("a")
assert not n.accept("aa")
assert not n.accept("b")
assert not n.accept("bb")
assert not n.accept("ba")
assert not n.accept("abc")
assert n.accept("ab")
def test_choice():
n = fsa.choice_nfa(fsa.basic_nfa("a"),
fsa.choice_nfa(fsa.basic_nfa("b"),
fsa.basic_nfa("c")))
assert not n.accept("")
assert n.accept("a")
assert n.accept("b")
assert n.accept("c")
assert not n.accept("d")
assert not n.accept("aa")
assert not n.accept("ab")
assert not n.accept("abc")
def test_star():
n = fsa.star_nfa(fsa.basic_nfa("a"))
assert n.accept("")
assert n.accept("a")
assert n.accept("aaaaaa")
assert not n.accept("b")
assert not n.accept("ab")
def test_optional():
n = fsa.concat_nfa(fsa.basic_nfa("a"), fsa.optional_nfa(fsa.basic_nfa("b")))
assert n.accept("a")
assert n.accept("ab")
assert not n.accept("")
assert not n.accept("b")
assert not n.accept("ba")
assert not n.accept("bab")
def test_reverse_nfa():
n = fsa.concat_nfa(fsa.basic_nfa("a"), fsa.basic_nfa("b"))
r = fsa.reverse_nfa(n)
assert not r.accept("")
assert not r.accept("a")
assert not r.accept("aa")
assert not r.accept("b")
assert not r.accept("bb")
assert not r.accept("ab")
assert not r.accept("abc")
assert r.accept("ba")
def test_regular():
ex = fsa.star_nfa(fsa.choice_nfa(fsa.basic_nfa("a"), fsa.basic_nfa("b")))
assert ex.accept("")
assert ex.accept("a")
assert ex.accept("aaaa")
assert ex.accept("b")
assert ex.accept("bbbb")
assert ex.accept("abab")
assert ex.accept("babb")
ex = fsa.concat_nfa(
fsa.basic_nfa("a"),
fsa.concat_nfa(
fsa.optional_nfa(fsa.basic_nfa("b")),
fsa.basic_nfa("c")
)
)
assert ex.accept("ac")
assert ex.accept("abc")
assert not ex.accept("ab")
assert not ex.accept("bc")
def test_minimize_dfa():
# Example from www.cs.odu.edu/~toida/nerzic/390teched/regular/fa/min-fa.html
dfa = fsa.DFA(1)
dfa.add_transition(1, "a", 3)
dfa.add_transition(1, "b", 2)
dfa.add_transition(2, "a", 4)
dfa.add_transition(2, "b", 1)
dfa.add_transition(3, "a", 5)
dfa.add_transition(3, "b", 4)
dfa.add_transition(4, "a", 4)
dfa.add_transition(4, "b", 4)
dfa.add_transition(5, "a", 3)
dfa.add_transition(5, "b", 2)
dfa.add_final_state(1)
dfa.add_final_state(5)
good = fsa.DFA(1)
good.add_transition(1, "a", 3)
good.add_transition(1, "b", 2)
good.add_transition(2, "b", 1)
good.add_transition(3, "a", 1)
good.add_final_state(1)
dfa.minimize()
assert dfa == good
def test_strings_dfa():
strings = "able alfa alpha apple bar bear beat boom boot".split()
dfa = fsa.strings_dfa(strings)
output = list(dfa.generate_all())
assert output == strings
domain = "abcd"
words = set()
for i in xrange(1, len(domain) + 1):
words.update("".join(p) for p in permutations(domain[:i]))
words = sorted(words)
dfa = fsa.strings_dfa(words)
assert list(dfa.generate_all()) == words