  * Returns a (deterministic) automaton that accepts the intersection of the language of <code>a1
  * </code> and the complement of the language of <code>a2</code>. As a side-effect, the automata
  * may be determinized, if not already deterministic.
  * <p>Complexity: quadratic in number of states (if already deterministic).
 public static Automaton minus(Automaton a1, Automaton a2) {
   if (BasicOperations.isEmpty(a1) || a1 == a2) return BasicAutomata.makeEmpty();
   if (BasicOperations.isEmpty(a2)) return a1.cloneIfRequired();
   if (a1.isSingleton()) {
     if (BasicOperations.run(a2, a1.singleton)) return BasicAutomata.makeEmpty();
     else return a1.cloneIfRequired();
   return intersection(a1, a2.complement());
예제 #2
  public void testOverlappedTokensSausage() throws Exception {

    // Two tokens on top of each other (sausage):
    final TokenStream ts =
        new CannedTokenStream(new Token[] {token("abc", 1, 1), token("xyz", 0, 1)});
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton a1 = BasicAutomata.makeString("abc");
    final Automaton a2 = BasicAutomata.makeString("xyz");
    final Automaton expected = BasicOperations.union(a1, a2);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
예제 #3
 public void testSynHangingOverEnd() throws Exception {
   final TokenStream ts =
       new CannedTokenStream(
           new Token[] {
             token("a", 1, 1), token("X", 0, 10),
   final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
   final Automaton expected =
       BasicOperations.union(BasicAutomata.makeString("a"), BasicAutomata.makeString("X"));
   assertTrue(BasicOperations.sameLanguage(expected, actual));
예제 #4
 public void testSynOverMultipleHoles() throws Exception {
   final TokenStream ts =
       new CannedTokenStream(
           new Token[] {
             token("a", 1, 1), token("x", 0, 3), token("b", 3, 1),
   final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
   final Automaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
   final Automaton a2 = join(s2a("x"), SEP_A, s2a("b"));
   final Automaton expected = BasicOperations.union(a1, a2);
   assertTrue(BasicOperations.sameLanguage(expected, actual));
예제 #5
  public void testSynOverHole2() throws Exception {

    final TokenStream ts =
        new CannedTokenStream(
            new Token[] {
              token("xyz", 1, 1), token("abc", 0, 3), token("def", 2, 1),
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected =
            join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")), BasicAutomata.makeString("abc"));
    assertTrue(BasicOperations.sameLanguage(expected, actual));
예제 #6
  public void testOverlappedTokensLattice2() throws Exception {

    final TokenStream ts =
        new CannedTokenStream(
            new Token[] {
              token("abc", 1, 1), token("xyz", 0, 3), token("def", 1, 1), token("ghi", 1, 1),
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton a1 = BasicAutomata.makeString("xyz");
    final Automaton a2 = join("abc", "def", "ghi");
    final Automaton expected = BasicOperations.union(a1, a2);
    // toDot(actual);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
예제 #7
  public void testSynOverHole() throws Exception {

    final TokenStream ts =
        new CannedTokenStream(
            new Token[] {
              token("a", 1, 1), token("X", 0, 2), token("b", 2, 1),
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton a1 =
        BasicOperations.union(join(s2a("a"), SEP_A, HOLE_A), BasicAutomata.makeString("X"));
    final Automaton expected = BasicOperations.concatenate(a1, join(SEP_A, s2a("b")));
    // toDot(actual);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  * Returns an automaton that accepts the intersection of the languages of the given automata.
  * Never modifies the input automata languages.
  * <p>Complexity: quadratic in number of states.
 public static Automaton intersection(Automaton a1, Automaton a2) {
   if (a1.isSingleton()) {
     if (BasicOperations.run(a2, a1.singleton)) return a1.cloneIfRequired();
     else return BasicAutomata.makeEmpty();
   if (a2.isSingleton()) {
     if (BasicOperations.run(a1, a2.singleton)) return a2.cloneIfRequired();
     else return BasicAutomata.makeEmpty();
   if (a1 == a2) return a1.cloneIfRequired();
   Transition[][] transitions1 = a1.getSortedTransitions();
   Transition[][] transitions2 = a2.getSortedTransitions();
   Automaton c = new Automaton();
   LinkedList<StatePair> worklist = new LinkedList<StatePair>();
   HashMap<StatePair, StatePair> newstates = new HashMap<StatePair, StatePair>();
   StatePair p = new StatePair(c.initial, a1.initial, a2.initial);
   newstates.put(p, p);
   while (worklist.size() > 0) {
     p = worklist.removeFirst();
     p.s.accept = p.s1.accept && p.s2.accept;
     Transition[] t1 = transitions1[p.s1.number];
     Transition[] t2 = transitions2[p.s2.number];
     for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) {
       while (b2 < t2.length && t2[b2].max < t1[n1].min) b2++;
       for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++)
         if (t2[n2].max >= t1[n1].min) {
           StatePair q = new StatePair(t1[n1].to, t2[n2].to);
           StatePair r = newstates.get(q);
           if (r == null) {
             q.s = new State();
             newstates.put(q, q);
             r = q;
           int min = t1[n1].min > t2[n2].min ? t1[n1].min : t2[n2].min;
           int max = t1[n1].max < t2[n2].max ? t1[n1].max : t2[n2].max;
           p.s.addTransition(new Transition(min, max, r.s));
   c.deterministic = a1.deterministic && a2.deterministic;
   return c;
예제 #9
  /** tests intersect: TODO start at a random term! */
  public void testIntersect() throws Exception {
    for (int i = 0; i < numIterations; i++) {
      String reg = AutomatonTestUtil.randomRegexp(random());
      Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton();
      CompiledAutomaton ca =
          new CompiledAutomaton(automaton, SpecialOperations.isFinite(automaton), false);
      TermsEnum te = MultiFields.getTerms(reader, "field").intersect(ca, null);
      Automaton expected = BasicOperations.intersection(termsAutomaton, automaton);
      TreeSet<BytesRef> found = new TreeSet<BytesRef>();
      while (te.next() != null) {

      Automaton actual = BasicAutomata.makeStringUnion(found);
      assertTrue(BasicOperations.sameLanguage(expected, actual));
예제 #10
 private Automaton join(String... strings) {
   List<Automaton> as = new ArrayList<Automaton>();
   for (String s : strings) {
   as.remove(as.size() - 1);
   return BasicOperations.concatenate(as);
예제 #11
     * Create a automaton for a given context query this automaton will be used to find the matching
     * paths with the fst
     * @param preserveSep set an additional char (<code>XAnalyzingSuggester.SEP_LABEL</code>)
     *     between each context query
     * @param queries list of {@link ContextQuery} defining the lookup context
     * @return Automaton matching the given Query
    public static Automaton toAutomaton(boolean preserveSep, Iterable<ContextQuery> queries) {
      Automaton a = BasicAutomata.makeEmptyString();

      Automaton gap = BasicAutomata.makeChar(ContextMapping.SEPARATOR);
      if (preserveSep) {
        // if separators are preserved the fst contains a SEP_LABEL
        // behind each gap. To have a matching automaton, we need to
        // include the SEP_LABEL in the query as well
        gap =
            BasicOperations.concatenate(gap, BasicAutomata.makeChar(XAnalyzingSuggester.SEP_LABEL));

      for (ContextQuery query : queries) {
        a = Automaton.concatenate(Arrays.asList(query.toAutomaton(), gap, a));
      return a;
예제 #12
 public void testStartsWithHole() throws Exception {
   final TokenStream ts =
       new CannedTokenStream(
           new Token[] {
             token("abc", 2, 1),
   final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
   final Automaton expected = join(HOLE_A, SEP_A, s2a("abc"));
   // toDot(actual);
   assertTrue(BasicOperations.sameLanguage(expected, actual));
예제 #13
  public void testSingleToken() throws Exception {

    final TokenStream ts =
        new CannedTokenStream(
            new Token[] {
              token("abc", 1, 1),
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected = BasicAutomata.makeString("abc");
    assertTrue(BasicOperations.sameLanguage(expected, actual));
예제 #14
  public void testTwoTokens() throws Exception {

    final TokenStream ts =
        new CannedTokenStream(
            new Token[] {
              token("abc", 1, 1), token("def", 1, 1),
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected = join("abc", "def");

    // toDot(actual);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
예제 #15
 /** Test a configuration that behaves a lot like KeepWordFilter */
 public void testKeep() throws Exception {
   // NOTE(review): this expression is truncated. Four closing parentheses follow
   // makeString("bar") but only one opening call (the constructor) is visible, so three
   // enclosing calls are missing -- presumably a complement of a union of the two strings.
   // TODO restore them from the original test before compiling.
   CharacterRunAutomaton keepWords =
       new CharacterRunAutomaton(
                       BasicAutomata.makeString("foo"), BasicAutomata.makeString("bar")))));
   // Analyzer configured with keepWords as its filter automaton.
   Analyzer a = new MockAnalyzer(MockTokenizer.SIMPLE, true, keepWords, true);
   // NOTE(review): the head of the assertion call that takes the arguments below is missing.
   // The arguments are the input text, the expected terms and (presumably) the expected
   // position increments -- confirm against the original test.
       "quick foo brown bar bar fox foo",
       new String[] {"foo", "bar", "bar", "foo"},
       new int[] {2, 2, 1, 2});
   // NOTE(review): the closing brace of this method is missing.
   * Returns true if the language of <code>a1</code> is a subset of the language of <code>a2</code>.
   * As a side-effect, <code>a2</code> is determinized if not already marked as deterministic.
   * <p>Complexity: quadratic in number of states.
  public static boolean subsetOf(Automaton a1, Automaton a2) {
    if (a1 == a2) return true;
    if (a1.isSingleton()) {
      if (a2.isSingleton()) return a1.singleton.equals(a2.singleton);
      return BasicOperations.run(a2, a1.singleton);
    Transition[][] transitions1 = a1.getSortedTransitions();
    Transition[][] transitions2 = a2.getSortedTransitions();
    LinkedList<StatePair> worklist = new LinkedList<StatePair>();
    HashSet<StatePair> visited = new HashSet<StatePair>();
    StatePair p = new StatePair(a1.initial, a2.initial);
    while (worklist.size() > 0) {
      p = worklist.removeFirst();
      if (p.s1.accept && !p.s2.accept) {
        return false;
      Transition[] t1 = transitions1[p.s1.number];
      Transition[] t2 = transitions2[p.s2.number];
      for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) {
        while (b2 < t2.length && t2[b2].max < t1[n1].min) b2++;
        int min1 = t1[n1].min, max1 = t1[n1].max;

        for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) {
          if (t2[n2].min > min1) {
            return false;
          if (t2[n2].max < Character.MAX_CODE_POINT) min1 = t2[n2].max + 1;
          else {
            min1 = Character.MAX_CODE_POINT;
            max1 = Character.MIN_CODE_POINT;
          StatePair q = new StatePair(t1[n1].to, t2[n2].to);
          if (!visited.contains(q)) {
        if (min1 <= max1) {
          return false;
    return true;
  * Returns an automaton that accepts the concatenation of the languages of the given automata.
  * <p>Complexity: linear in total number of states.
 public static Automaton concatenate(List<Automaton> l) {
   if (l.isEmpty()) return BasicAutomata.makeEmptyString();
   boolean all_singleton = true;
   for (Automaton a : l)
     if (!a.isSingleton()) {
       all_singleton = false;
   if (all_singleton) {
     StringBuilder b = new StringBuilder();
     for (Automaton a : l) b.append(a.singleton);
     return BasicAutomata.makeString(b.toString());
   } else {
     for (Automaton a : l) if (BasicOperations.isEmpty(a)) return BasicAutomata.makeEmpty();
     Set<Integer> ids = new HashSet<Integer>();
     for (Automaton a : l) ids.add(System.identityHashCode(a));
     boolean has_aliases = ids.size() != l.size();
     Automaton b = l.get(0);
     if (has_aliases) b = b.cloneExpanded();
     else b = b.cloneExpandedIfRequired();
     Set<State> ac = b.getAcceptStates();
     boolean first = true;
     for (Automaton a : l)
       if (first) first = false;
       else {
         if (a.isEmptyString()) continue;
         Automaton aa = a;
         if (has_aliases) aa = aa.cloneExpanded();
         else aa = aa.cloneExpandedIfRequired();
         Set<State> ns = aa.getAcceptStates();
         for (State s : ac) {
           s.accept = false;
           if (s.accept) ns.add(s);
         ac = ns;
     b.deterministic = false;
     // b.clearHashCode();
     return b;
  * Returns an automaton that accepts the union of the languages of the given automata.
  * <p>Complexity: linear in number of states.
 public static Automaton union(Collection<Automaton> l) {
   Set<Integer> ids = new HashSet<Integer>();
   for (Automaton a : l) ids.add(System.identityHashCode(a));
   boolean has_aliases = ids.size() != l.size();
   State s = new State();
   for (Automaton b : l) {
     if (BasicOperations.isEmpty(b)) continue;
     Automaton bb = b;
     if (has_aliases) bb = bb.cloneExpanded();
     else bb = bb.cloneExpandedIfRequired();
   Automaton a = new Automaton();
   a.initial = s;
   a.deterministic = false;
   // a.clearHashCode();
   return a;
예제 #19
  /** tests a pre-intersected automaton against the original */
  public void testFiniteVersusInfinite() throws Exception {
    for (int i = 0; i < numIterations; i++) {
      // Build an automaton from a random regular expression.
      String reg = AutomatonTestUtil.randomRegexp(random());
      Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton();
      final List<BytesRef> matchedTerms = new ArrayList<BytesRef>();
      for (BytesRef t : terms) {
        // NOTE(review): the body of this if is missing, so matchedTerms is never filled.
        // Presumably each accepted term should be added -- confirm against the original test.
        if (BasicOperations.run(automaton, t.utf8ToString())) {

      // Finite automaton built from the collected terms.
      Automaton alternate = BasicAutomata.makeStringUnion(matchedTerms);
      // System.out.println("match " + matchedTerms.size() + " " + alternate.getNumberOfStates() + "
      // states, sigma=" + alternate.getStartPoints().length);
      // AutomatonTestUtil.minimizeSimple(alternate);
      // System.out.println("minmize done");
      AutomatonQuery a1 = new AutomatonQuery(new Term("field", ""), automaton);
      AutomatonQuery a2 = new AutomatonQuery(new Term("field", ""), alternate);
      // NOTE(review): the head of the call that receives these arguments is missing; it looks
      // like a helper comparing the top-25 hits of both queries. Closing braces are missing too.
          a1, searcher.search(a1, 25).scoreDocs, searcher.search(a2, 25).scoreDocs);
예제 #20
  /** seeks to every term accepted by some automata */
  public void testSeeking() throws Exception {
    for (int i = 0; i < numIterations; i++) {
      String reg = AutomatonTestUtil.randomRegexp(random());
      Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton();
      TermsEnum te = MultiFields.getTerms(reader, "field").iterator(null);
      ArrayList<BytesRef> unsortedTerms = new ArrayList<BytesRef>(terms);
      Collections.shuffle(unsortedTerms, random());

      for (BytesRef term : unsortedTerms) {
        if (BasicOperations.run(automaton, term.utf8ToString())) {
          // term is accepted
          if (random().nextBoolean()) {
            // seek exact
            assertTrue(te.seekExact(term, random().nextBoolean()));
          } else {
            // seek ceil
            assertEquals(SeekStatus.FOUND, te.seekCeil(term, random().nextBoolean()));
            assertEquals(term, te.term());
  /** return a random NFA/DFA for testing */
  public static Automaton randomAutomaton(Random random) {
    // get two random Automata from regexps
    Automaton a1 = new RegExp(AutomatonTestUtil.randomRegexp(random), RegExp.NONE).toAutomaton();
    if (random.nextBoolean()) a1 = BasicOperations.complement(a1);

    Automaton a2 = new RegExp(AutomatonTestUtil.randomRegexp(random), RegExp.NONE).toAutomaton();
    if (random.nextBoolean()) a2 = BasicOperations.complement(a2);

    // combine them in random ways
    switch (random.nextInt(4)) {
      case 0:
        return BasicOperations.concatenate(a1, a2);
      case 1:
        return BasicOperations.union(a1, a2);
      case 2:
        return BasicOperations.intersection(a1, a2);
        return BasicOperations.minus(a1, a2);
 * A tokenfilter for testing that removes terms accepted by a DFA.
 * <ul>
 *   <li>Union a list of singletons to act like a stopfilter.
 *   <li>Use the complement to act like a keepwordfilter
 *   <li>Use a regex like <code>.{12,}</code> to act like a lengthfilter
 * </ul>
// NOTE(review): the lines above are the body of the class Javadoc; its opening and closing
// comment delimiters are missing, as are the closing braces throughout this class.
public final class MockTokenFilter extends TokenFilter {
  /** Empty set of stopwords: wraps the empty automaton. */
  public static final CharacterRunAutomaton EMPTY_STOPSET = new CharacterRunAutomaton(makeEmpty());

  /** Set of common english stopwords */
  // NOTE(review): the initializer is truncated after the constructor's opening parenthesis; the
  // automaton holding the stop words is missing and cannot be recovered from this file.
  public static final CharacterRunAutomaton ENGLISH_STOPSET =
      new CharacterRunAutomaton(

  // Automaton deciding which terms are dropped: a term it accepts is removed.
  private final CharacterRunAutomaton filter;
  // When true, increments of dropped tokens are added to the next emitted token.
  private boolean enablePositionIncrements = false;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  // NOTE(review): the initializer of this field is missing.
  private final PositionIncrementAttribute posIncrAtt =

   * Create a new MockTokenFilter.
   * @param input TokenStream to filter
   * @param filter DFA representing the terms that should be removed.
   * @param enablePositionIncrements true if the removal should accumulate position increments.
  // NOTE(review): `input` is never used in the visible constructor body; a call passing it to
  // the superclass constructor appears to be missing.
  public MockTokenFilter(
      TokenStream input, CharacterRunAutomaton filter, boolean enablePositionIncrements) {
    this.filter = filter;
    this.enablePositionIncrements = enablePositionIncrements;

  // Advances to the next token that the filter automaton does not accept.
  public boolean incrementToken() throws IOException {
    // return the first non-stop word found
    int skippedPositions = 0;
    while (input.incrementToken()) {
      if (!filter.run(termAtt.buffer(), 0, termAtt.length())) {
        if (enablePositionIncrements) {
          // carry over the increments of the tokens dropped since the last emitted token
          posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
        return true;
      // token rejected: remember its increment for the next emitted token
      skippedPositions += posIncrAtt.getPositionIncrement();
    // reached EOS -- return false
    return false;

  /** @see #setEnablePositionIncrements(boolean) */
  public boolean getEnablePositionIncrements() {
    return enablePositionIncrements;

   * If <code>true</code>, this Filter will preserve positions of the incoming tokens (ie,
   * accumulate and set position increments of the removed stop tokens).
  public void setEnablePositionIncrements(boolean enable) {
    this.enablePositionIncrements = enable;
   * Extracts all MultiTermQueries for {@code field}, and returns equivalent automata that will
   * match terms.
  static CharacterRunAutomaton[] extractAutomata(Query query, String field) {
    List<CharacterRunAutomaton> list = new ArrayList<>();
    if (query instanceof BooleanQuery) {
      BooleanClause clauses[] = ((BooleanQuery) query).getClauses();
      for (BooleanClause clause : clauses) {
        if (!clause.isProhibited()) {
          list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), field)));
    } else if (query instanceof DisjunctionMaxQuery) {
      for (Query sub : ((DisjunctionMaxQuery) query).getDisjuncts()) {
        list.addAll(Arrays.asList(extractAutomata(sub, field)));
    } else if (query instanceof SpanOrQuery) {
      for (Query sub : ((SpanOrQuery) query).getClauses()) {
        list.addAll(Arrays.asList(extractAutomata(sub, field)));
    } else if (query instanceof SpanNearQuery) {
      for (Query sub : ((SpanNearQuery) query).getClauses()) {
        list.addAll(Arrays.asList(extractAutomata(sub, field)));
    } else if (query instanceof SpanNotQuery) {
      list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), field)));
    } else if (query instanceof SpanPositionCheckQuery) {
          Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), field)));
    } else if (query instanceof SpanMultiTermQueryWrapper) {
              extractAutomata(((SpanMultiTermQueryWrapper<?>) query).getWrappedQuery(), field)));
    } else if (query instanceof AutomatonQuery) {
      final AutomatonQuery aq = (AutomatonQuery) query;
      if (aq.getField().equals(field)) {
            new CharacterRunAutomaton(aq.getAutomaton()) {
              public String toString() {
                return aq.toString();
    } else if (query instanceof PrefixQuery) {
      final PrefixQuery pq = (PrefixQuery) query;
      Term prefix = pq.getPrefix();
      if (prefix.field().equals(field)) {
            new CharacterRunAutomaton(
                    BasicAutomata.makeString(prefix.text()), BasicAutomata.makeAnyString())) {
              public String toString() {
                return pq.toString();
    } else if (query instanceof FuzzyQuery) {
      final FuzzyQuery fq = (FuzzyQuery) query;
      if (fq.getField().equals(field)) {
        String utf16 = fq.getTerm().text();
        int termText[] = new int[utf16.codePointCount(0, utf16.length())];
        for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
          termText[j++] = cp = utf16.codePointAt(i);
        int termLength = termText.length;
        int prefixLength = Math.min(fq.getPrefixLength(), termLength);
        String suffix =
            UnicodeUtil.newString(termText, prefixLength, termText.length - prefixLength);
        LevenshteinAutomata builder = new LevenshteinAutomata(suffix, fq.getTranspositions());
        Automaton automaton = builder.toAutomaton(fq.getMaxEdits());
        if (prefixLength > 0) {
          Automaton prefix =
              BasicAutomata.makeString(UnicodeUtil.newString(termText, 0, prefixLength));
          automaton = BasicOperations.concatenate(prefix, automaton);
            new CharacterRunAutomaton(automaton) {
              public String toString() {
                return fq.toString();
    } else if (query instanceof TermRangeQuery) {
      final TermRangeQuery tq = (TermRangeQuery) query;
      if (tq.getField().equals(field)) {
        final CharsRef lowerBound;
        if (tq.getLowerTerm() == null) {
          lowerBound = null;
        } else {
          lowerBound = new CharsRef(tq.getLowerTerm().utf8ToString());

        final CharsRef upperBound;
        if (tq.getUpperTerm() == null) {
          upperBound = null;
        } else {
          upperBound = new CharsRef(tq.getUpperTerm().utf8ToString());

        final boolean includeLower = tq.includesLower();
        final boolean includeUpper = tq.includesUpper();
        final CharsRef scratch = new CharsRef();
        final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator();

        // this is *not* an automaton, but its very simple
            new CharacterRunAutomaton(BasicAutomata.makeEmpty()) {
              public boolean run(char[] s, int offset, int length) {
                scratch.chars = s;
                scratch.offset = offset;
                scratch.length = length;

                if (lowerBound != null) {
                  int cmp = comparator.compare(scratch, lowerBound);
                  if (cmp < 0 || (!includeLower && cmp == 0)) {
                    return false;

                if (upperBound != null) {
                  int cmp = comparator.compare(scratch, upperBound);
                  if (cmp > 0 || (!includeUpper && cmp == 0)) {
                    return false;
                return true;

              public String toString() {
                return tq.toString();
    return list.toArray(new CharacterRunAutomaton[list.size()]);
예제 #24
 private Automaton join(Automaton... as) {
   return BasicOperations.concatenate(Arrays.asList(as));