Exemplos de BasicOperations em Java, exemplos de org.apache.lucene.util.automaton.BasicOperations em Java

Exemplo n.º 1

0

Exibir arquivo

Arquivo: BasicOperations.java Projeto: erikhatcher/lucene-solr-svn

 /**
  * Returns a (deterministic) automaton that accepts the intersection of the language of <code>a1
  * </code> and the complement of the language of <code>a2</code>. As a side-effect, the automata
  * may be determinized, if not already deterministic.
  *
  * <p>Complexity: quadratic in number of states (if already deterministic).
  */
 public static Automaton minus(Automaton a1, Automaton a2) {
   if (BasicOperations.isEmpty(a1) || a1 == a2) return BasicAutomata.makeEmpty();
   if (BasicOperations.isEmpty(a2)) return a1.cloneIfRequired();
   if (a1.isSingleton()) {
     if (BasicOperations.run(a2, a1.singleton)) return BasicAutomata.makeEmpty();
     else return a1.cloneIfRequired();
   }
   return intersection(a1, a2.complement());
 }

Exemplo n.º 2

0

Exibir arquivo

Arquivo: TestGraphTokenizers.java Projeto: kwatters/heliosearch

  public void testOverlappedTokensSausage() throws Exception {

    // Two tokens on top of each other (sausage):
    final TokenStream ts =
        new CannedTokenStream(new Token[] {token("abc", 1, 1), token("xyz", 0, 1)});
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton a1 = BasicAutomata.makeString("abc");
    final Automaton a2 = BasicAutomata.makeString("xyz");
    final Automaton expected = BasicOperations.union(a1, a2);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

Exemplo n.º 3

0

Exibir arquivo

Arquivo: TestGraphTokenizers.java Projeto: kwatters/heliosearch

 public void testSynHangingOverEnd() throws Exception {
   final TokenStream ts =
       new CannedTokenStream(
           new Token[] {
             token("a", 1, 1), token("X", 0, 10),
           });
   final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
   final Automaton expected =
       BasicOperations.union(BasicAutomata.makeString("a"), BasicAutomata.makeString("X"));
   assertTrue(BasicOperations.sameLanguage(expected, actual));
 }

Exemplo n.º 4

0

Exibir arquivo

Arquivo: TestGraphTokenizers.java Projeto: kwatters/heliosearch

 public void testSynOverMultipleHoles() throws Exception {
   final TokenStream ts =
       new CannedTokenStream(
           new Token[] {
             token("a", 1, 1), token("x", 0, 3), token("b", 3, 1),
           });
   final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
   final Automaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
   final Automaton a2 = join(s2a("x"), SEP_A, s2a("b"));
   final Automaton expected = BasicOperations.union(a1, a2);
   assertTrue(BasicOperations.sameLanguage(expected, actual));
 }

Exemplo n.º 5

0

Exibir arquivo

Arquivo: TestGraphTokenizers.java Projeto: kwatters/heliosearch

  public void testSynOverHole2() throws Exception {

    final TokenStream ts =
        new CannedTokenStream(
            new Token[] {
              token("xyz", 1, 1), token("abc", 0, 3), token("def", 2, 1),
            });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected =
        BasicOperations.union(
            join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")), BasicAutomata.makeString("abc"));
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

Exemplo n.º 6

0

Exibir arquivo

Arquivo: TestGraphTokenizers.java Projeto: kwatters/heliosearch

  public void testOverlappedTokensLattice2() throws Exception {

    final TokenStream ts =
        new CannedTokenStream(
            new Token[] {
              token("abc", 1, 1), token("xyz", 0, 3), token("def", 1, 1), token("ghi", 1, 1),
            });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton a1 = BasicAutomata.makeString("xyz");
    final Automaton a2 = join("abc", "def", "ghi");
    final Automaton expected = BasicOperations.union(a1, a2);
    // toDot(actual);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

Exemplo n.º 7

0

Exibir arquivo

Arquivo: TestGraphTokenizers.java Projeto: kwatters/heliosearch

  public void testSynOverHole() throws Exception {

    final TokenStream ts =
        new CannedTokenStream(
            new Token[] {
              token("a", 1, 1), token("X", 0, 2), token("b", 2, 1),
            });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton a1 =
        BasicOperations.union(join(s2a("a"), SEP_A, HOLE_A), BasicAutomata.makeString("X"));
    final Automaton expected = BasicOperations.concatenate(a1, join(SEP_A, s2a("b")));
    // toDot(actual);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

Exemplo n.º 8

0

Exibir arquivo

Arquivo: BasicOperations.java Projeto: erikhatcher/lucene-solr-svn

 /**
  * Returns an automaton that accepts the intersection of the languages of the given automata.
  * Never modifies the input automata languages.
  *
  * <p>Complexity: quadratic in number of states.
  */
 public static Automaton intersection(Automaton a1, Automaton a2) {
   if (a1.isSingleton()) {
     if (BasicOperations.run(a2, a1.singleton)) return a1.cloneIfRequired();
     else return BasicAutomata.makeEmpty();
   }
   if (a2.isSingleton()) {
     if (BasicOperations.run(a1, a2.singleton)) return a2.cloneIfRequired();
     else return BasicAutomata.makeEmpty();
   }
   if (a1 == a2) return a1.cloneIfRequired();
   Transition[][] transitions1 = a1.getSortedTransitions();
   Transition[][] transitions2 = a2.getSortedTransitions();
   Automaton c = new Automaton();
   LinkedList<StatePair> worklist = new LinkedList<StatePair>();
   HashMap<StatePair, StatePair> newstates = new HashMap<StatePair, StatePair>();
   StatePair p = new StatePair(c.initial, a1.initial, a2.initial);
   worklist.add(p);
   newstates.put(p, p);
   while (worklist.size() > 0) {
     p = worklist.removeFirst();
     p.s.accept = p.s1.accept && p.s2.accept;
     Transition[] t1 = transitions1[p.s1.number];
     Transition[] t2 = transitions2[p.s2.number];
     for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) {
       while (b2 < t2.length && t2[b2].max < t1[n1].min) b2++;
       for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++)
         if (t2[n2].max >= t1[n1].min) {
           StatePair q = new StatePair(t1[n1].to, t2[n2].to);
           StatePair r = newstates.get(q);
           if (r == null) {
             q.s = new State();
             worklist.add(q);
             newstates.put(q, q);
             r = q;
           }
           int min = t1[n1].min > t2[n2].min ? t1[n1].min : t2[n2].min;
           int max = t1[n1].max < t2[n2].max ? t1[n1].max : t2[n2].max;
           p.s.addTransition(new Transition(min, max, r.s));
         }
     }
   }
   c.deterministic = a1.deterministic && a2.deterministic;
   c.removeDeadTransitions();
   c.checkMinimizeAlways();
   return c;
 }

Exemplo n.º 9

0

Exibir arquivo

Arquivo: TestTermsEnum2.java Projeto: shaie/lucene-solr

  /** tests intersect: TODO start at a random term! */
  public void testIntersect() throws Exception {
    for (int i = 0; i < numIterations; i++) {
      String reg = AutomatonTestUtil.randomRegexp(random());
      Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton();
      CompiledAutomaton ca =
          new CompiledAutomaton(automaton, SpecialOperations.isFinite(automaton), false);
      TermsEnum te = MultiFields.getTerms(reader, "field").intersect(ca, null);
      Automaton expected = BasicOperations.intersection(termsAutomaton, automaton);
      TreeSet<BytesRef> found = new TreeSet<BytesRef>();
      while (te.next() != null) {
        found.add(BytesRef.deepCopyOf(te.term()));
      }

      Automaton actual = BasicAutomata.makeStringUnion(found);
      assertTrue(BasicOperations.sameLanguage(expected, actual));
    }
  }

Exemplo n.º 10

0

Exibir arquivo

Arquivo: TestGraphTokenizers.java Projeto: kwatters/heliosearch

 private Automaton join(String... strings) {
   List<Automaton> as = new ArrayList<Automaton>();
   for (String s : strings) {
     as.add(BasicAutomata.makeString(s));
     as.add(SEP_A);
   }
   as.remove(as.size() - 1);
   return BasicOperations.concatenate(as);
 }

Exemplo n.º 11

0

Exibir arquivo

Arquivo: ContextMapping.java Projeto: GAX-Team/elasticsearch

    /**
     * Create a automaton for a given context query this automaton will be used to find the matching
     * paths with the fst
     *
     * @param preserveSep set an additional char (<code>XAnalyzingSuggester.SEP_LABEL</code>)
     *     between each context query
     * @param queries list of {@link ContextQuery} defining the lookup context
     * @return Automaton matching the given Query
     */
    public static Automaton toAutomaton(boolean preserveSep, Iterable<ContextQuery> queries) {
      Automaton a = BasicAutomata.makeEmptyString();

      Automaton gap = BasicAutomata.makeChar(ContextMapping.SEPARATOR);
      if (preserveSep) {
        // if separators are preserved the fst contains a SEP_LABEL
        // behind each gap. To have a matching automaton, we need to
        // include the SEP_LABEL in the query as well
        gap =
            BasicOperations.concatenate(gap, BasicAutomata.makeChar(XAnalyzingSuggester.SEP_LABEL));
      }

      for (ContextQuery query : queries) {
        a = Automaton.concatenate(Arrays.asList(query.toAutomaton(), gap, a));
      }
      BasicOperations.determinize(a);
      return a;
    }

Exemplo n.º 12

0

Exibir arquivo

Arquivo: TestGraphTokenizers.java Projeto: kwatters/heliosearch

 public void testStartsWithHole() throws Exception {
   final TokenStream ts =
       new CannedTokenStream(
           new Token[] {
             token("abc", 2, 1),
           });
   final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
   final Automaton expected = join(HOLE_A, SEP_A, s2a("abc"));
   // toDot(actual);
   assertTrue(BasicOperations.sameLanguage(expected, actual));
 }

Exemplo n.º 13

0

Exibir arquivo

Arquivo: TestGraphTokenizers.java Projeto: kwatters/heliosearch

  public void testSingleToken() throws Exception {

    final TokenStream ts =
        new CannedTokenStream(
            new Token[] {
              token("abc", 1, 1),
            });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected = BasicAutomata.makeString("abc");
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

Exemplo n.º 14

0

Exibir arquivo

Arquivo: TestGraphTokenizers.java Projeto: kwatters/heliosearch

  public void testTwoTokens() throws Exception {

    final TokenStream ts =
        new CannedTokenStream(
            new Token[] {
              token("abc", 1, 1), token("def", 1, 1),
            });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected = join("abc", "def");

    // toDot(actual);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

Exemplo n.º 15

0

Exibir arquivo

Arquivo: TestMockAnalyzer.java Projeto: simplegeo/lucene-solr

 /** Test a configuration that behaves a lot like KeepWordFilter */
 public void testKeep() throws Exception {
   CharacterRunAutomaton keepWords =
       new CharacterRunAutomaton(
           BasicOperations.complement(
               Automaton.union(
                   Arrays.asList(
                       BasicAutomata.makeString("foo"), BasicAutomata.makeString("bar")))));
   Analyzer a = new MockAnalyzer(MockTokenizer.SIMPLE, true, keepWords, true);
   assertAnalyzesTo(
       a,
       "quick foo brown bar bar fox foo",
       new String[] {"foo", "bar", "bar", "foo"},
       new int[] {2, 2, 1, 2});
 }

Exemplo n.º 16

0

Exibir arquivo

Arquivo: BasicOperations.java Projeto: erikhatcher/lucene-solr-svn

  /**
   * Returns true if the language of <code>a1</code> is a subset of the language of <code>a2</code>.
   * As a side-effect, <code>a2</code> is determinized if not already marked as deterministic.
   *
   * <p>Complexity: quadratic in number of states.
   */
  public static boolean subsetOf(Automaton a1, Automaton a2) {
    if (a1 == a2) return true;
    if (a1.isSingleton()) {
      if (a2.isSingleton()) return a1.singleton.equals(a2.singleton);
      return BasicOperations.run(a2, a1.singleton);
    }
    a2.determinize();
    Transition[][] transitions1 = a1.getSortedTransitions();
    Transition[][] transitions2 = a2.getSortedTransitions();
    LinkedList<StatePair> worklist = new LinkedList<StatePair>();
    HashSet<StatePair> visited = new HashSet<StatePair>();
    StatePair p = new StatePair(a1.initial, a2.initial);
    worklist.add(p);
    visited.add(p);
    while (worklist.size() > 0) {
      p = worklist.removeFirst();
      if (p.s1.accept && !p.s2.accept) {
        return false;
      }
      Transition[] t1 = transitions1[p.s1.number];
      Transition[] t2 = transitions2[p.s2.number];
      for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) {
        while (b2 < t2.length && t2[b2].max < t1[n1].min) b2++;
        int min1 = t1[n1].min, max1 = t1[n1].max;

        for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) {
          if (t2[n2].min > min1) {
            return false;
          }
          if (t2[n2].max < Character.MAX_CODE_POINT) min1 = t2[n2].max + 1;
          else {
            min1 = Character.MAX_CODE_POINT;
            max1 = Character.MIN_CODE_POINT;
          }
          StatePair q = new StatePair(t1[n1].to, t2[n2].to);
          if (!visited.contains(q)) {
            worklist.add(q);
            visited.add(q);
          }
        }
        if (min1 <= max1) {
          return false;
        }
      }
    }
    return true;
  }

Exemplo n.º 17

0

Exibir arquivo

Arquivo: BasicOperations.java Projeto: erikhatcher/lucene-solr-svn

 /**
  * Returns an automaton that accepts the concatenation of the languages of the given automata.
  *
  * <p>Complexity: linear in total number of states.
  */
 public static Automaton concatenate(List<Automaton> l) {
   if (l.isEmpty()) return BasicAutomata.makeEmptyString();
   boolean all_singleton = true;
   for (Automaton a : l)
     if (!a.isSingleton()) {
       all_singleton = false;
       break;
     }
   if (all_singleton) {
     StringBuilder b = new StringBuilder();
     for (Automaton a : l) b.append(a.singleton);
     return BasicAutomata.makeString(b.toString());
   } else {
     for (Automaton a : l) if (BasicOperations.isEmpty(a)) return BasicAutomata.makeEmpty();
     Set<Integer> ids = new HashSet<Integer>();
     for (Automaton a : l) ids.add(System.identityHashCode(a));
     boolean has_aliases = ids.size() != l.size();
     Automaton b = l.get(0);
     if (has_aliases) b = b.cloneExpanded();
     else b = b.cloneExpandedIfRequired();
     Set<State> ac = b.getAcceptStates();
     boolean first = true;
     for (Automaton a : l)
       if (first) first = false;
       else {
         if (a.isEmptyString()) continue;
         Automaton aa = a;
         if (has_aliases) aa = aa.cloneExpanded();
         else aa = aa.cloneExpandedIfRequired();
         Set<State> ns = aa.getAcceptStates();
         for (State s : ac) {
           s.accept = false;
           s.addEpsilon(aa.initial);
           if (s.accept) ns.add(s);
         }
         ac = ns;
       }
     b.deterministic = false;
     // b.clearHashCode();
     b.clearNumberedStates();
     b.checkMinimizeAlways();
     return b;
   }
 }

Exemplo n.º 18

0

Exibir arquivo

Arquivo: BasicOperations.java Projeto: erikhatcher/lucene-solr-svn

 /**
  * Returns an automaton that accepts the union of the languages of the given automata.
  *
  * <p>Complexity: linear in number of states.
  */
 public static Automaton union(Collection<Automaton> l) {
   Set<Integer> ids = new HashSet<Integer>();
   for (Automaton a : l) ids.add(System.identityHashCode(a));
   boolean has_aliases = ids.size() != l.size();
   State s = new State();
   for (Automaton b : l) {
     if (BasicOperations.isEmpty(b)) continue;
     Automaton bb = b;
     if (has_aliases) bb = bb.cloneExpanded();
     else bb = bb.cloneExpandedIfRequired();
     s.addEpsilon(bb.initial);
   }
   Automaton a = new Automaton();
   a.initial = s;
   a.deterministic = false;
   // a.clearHashCode();
   a.clearNumberedStates();
   a.checkMinimizeAlways();
   return a;
 }

Exemplo n.º 19

0

Exibir arquivo

Arquivo: TestTermsEnum2.java Projeto: shaie/lucene-solr

  /** tests a pre-intersected automaton against the original */
  public void testFiniteVersusInfinite() throws Exception {
    for (int i = 0; i < numIterations; i++) {
      String reg = AutomatonTestUtil.randomRegexp(random());
      Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton();
      final List<BytesRef> matchedTerms = new ArrayList<BytesRef>();
      for (BytesRef t : terms) {
        if (BasicOperations.run(automaton, t.utf8ToString())) {
          matchedTerms.add(t);
        }
      }

      Automaton alternate = BasicAutomata.makeStringUnion(matchedTerms);
      // System.out.println("match " + matchedTerms.size() + " " + alternate.getNumberOfStates() + "
      // states, sigma=" + alternate.getStartPoints().length);
      // AutomatonTestUtil.minimizeSimple(alternate);
      // System.out.println("minmize done");
      AutomatonQuery a1 = new AutomatonQuery(new Term("field", ""), automaton);
      AutomatonQuery a2 = new AutomatonQuery(new Term("field", ""), alternate);
      CheckHits.checkEqual(
          a1, searcher.search(a1, 25).scoreDocs, searcher.search(a2, 25).scoreDocs);
    }
  }

Exemplo n.º 20

0

Exibir arquivo

Arquivo: TestTermsEnum2.java Projeto: shaie/lucene-solr

  /** seeks to every term accepted by some automata */
  public void testSeeking() throws Exception {
    for (int i = 0; i < numIterations; i++) {
      String reg = AutomatonTestUtil.randomRegexp(random());
      Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton();
      TermsEnum te = MultiFields.getTerms(reader, "field").iterator(null);
      ArrayList<BytesRef> unsortedTerms = new ArrayList<BytesRef>(terms);
      Collections.shuffle(unsortedTerms, random());

      for (BytesRef term : unsortedTerms) {
        if (BasicOperations.run(automaton, term.utf8ToString())) {
          // term is accepted
          if (random().nextBoolean()) {
            // seek exact
            assertTrue(te.seekExact(term, random().nextBoolean()));
          } else {
            // seek ceil
            assertEquals(SeekStatus.FOUND, te.seekCeil(term, random().nextBoolean()));
            assertEquals(term, te.term());
          }
        }
      }
    }
  }

Exemplo n.º 21

0

Exibir arquivo

Arquivo: AutomatonTestUtil.java Projeto: ankit-zettata/solr-analytics

  /** return a random NFA/DFA for testing */
  public static Automaton randomAutomaton(Random random) {
    // get two random Automata from regexps
    Automaton a1 = new RegExp(AutomatonTestUtil.randomRegexp(random), RegExp.NONE).toAutomaton();
    if (random.nextBoolean()) a1 = BasicOperations.complement(a1);

    Automaton a2 = new RegExp(AutomatonTestUtil.randomRegexp(random), RegExp.NONE).toAutomaton();
    if (random.nextBoolean()) a2 = BasicOperations.complement(a2);

    // combine them in random ways
    switch (random.nextInt(4)) {
      case 0:
        return BasicOperations.concatenate(a1, a2);
      case 1:
        return BasicOperations.union(a1, a2);
      case 2:
        return BasicOperations.intersection(a1, a2);
      default:
        return BasicOperations.minus(a1, a2);
    }
  }

Exemplo n.º 22

0

Exibir arquivo

Arquivo: MockTokenFilter.java Projeto: ieure/lucene-solr-snapshot

/**
 * A tokenfilter for testing that removes terms accepted by a DFA.
 *
 * <ul>
 *   <li>Union a list of singletons to act like a stopfilter.
 *   <li>Use the complement to act like a keepwordfilter
 *   <li>Use a regex like <code>.{12,}</code> to act like a lengthfilter
 * </ul>
 */
public final class MockTokenFilter extends TokenFilter {
  /** Empty set of stopwords */
  public static final CharacterRunAutomaton EMPTY_STOPSET = new CharacterRunAutomaton(makeEmpty());

  /** Set of common english stopwords */
  public static final CharacterRunAutomaton ENGLISH_STOPSET =
      new CharacterRunAutomaton(
          BasicOperations.union(
              Arrays.asList(
                  makeString("a"),
                  makeString("an"),
                  makeString("and"),
                  makeString("are"),
                  makeString("as"),
                  makeString("at"),
                  makeString("be"),
                  makeString("but"),
                  makeString("by"),
                  makeString("for"),
                  makeString("if"),
                  makeString("in"),
                  makeString("into"),
                  makeString("is"),
                  makeString("it"),
                  makeString("no"),
                  makeString("not"),
                  makeString("of"),
                  makeString("on"),
                  makeString("or"),
                  makeString("such"),
                  makeString("that"),
                  makeString("the"),
                  makeString("their"),
                  makeString("then"),
                  makeString("there"),
                  makeString("these"),
                  makeString("they"),
                  makeString("this"),
                  makeString("to"),
                  makeString("was"),
                  makeString("will"),
                  makeString("with"))));

  private final CharacterRunAutomaton filter;
  private boolean enablePositionIncrements = false;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncrAtt =
      addAttribute(PositionIncrementAttribute.class);

  /**
   * Create a new MockTokenFilter.
   *
   * @param input TokenStream to filter
   * @param filter DFA representing the terms that should be removed.
   * @param enablePositionIncrements true if the removal should accumulate position increments.
   */
  public MockTokenFilter(
      TokenStream input, CharacterRunAutomaton filter, boolean enablePositionIncrements) {
    super(input);
    this.filter = filter;
    this.enablePositionIncrements = enablePositionIncrements;
  }

  @Override
  public boolean incrementToken() throws IOException {
    // return the first non-stop word found
    int skippedPositions = 0;
    while (input.incrementToken()) {
      if (!filter.run(termAtt.buffer(), 0, termAtt.length())) {
        if (enablePositionIncrements) {
          posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
        }
        return true;
      }
      skippedPositions += posIncrAtt.getPositionIncrement();
    }
    // reached EOS -- return false
    return false;
  }

  /** @see #setEnablePositionIncrements(boolean) */
  public boolean getEnablePositionIncrements() {
    return enablePositionIncrements;
  }

  /**
   * If <code>true</code>, this Filter will preserve positions of the incoming tokens (ie,
   * accumulate and set position increments of the removed stop tokens).
   */
  public void setEnablePositionIncrements(boolean enable) {
    this.enablePositionIncrements = enable;
  }
}

Exemplo n.º 23

0

Exibir arquivo

Arquivo: MultiTermHighlighting.java Projeto: kushal256/heliosearch

  /**
   * Extracts all MultiTermQueries for {@code field}, and returns equivalent automata that will
   * match terms.
   */
  static CharacterRunAutomaton[] extractAutomata(Query query, String field) {
    List<CharacterRunAutomaton> list = new ArrayList<>();
    if (query instanceof BooleanQuery) {
      BooleanClause clauses[] = ((BooleanQuery) query).getClauses();
      for (BooleanClause clause : clauses) {
        if (!clause.isProhibited()) {
          list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), field)));
        }
      }
    } else if (query instanceof DisjunctionMaxQuery) {
      for (Query sub : ((DisjunctionMaxQuery) query).getDisjuncts()) {
        list.addAll(Arrays.asList(extractAutomata(sub, field)));
      }
    } else if (query instanceof SpanOrQuery) {
      for (Query sub : ((SpanOrQuery) query).getClauses()) {
        list.addAll(Arrays.asList(extractAutomata(sub, field)));
      }
    } else if (query instanceof SpanNearQuery) {
      for (Query sub : ((SpanNearQuery) query).getClauses()) {
        list.addAll(Arrays.asList(extractAutomata(sub, field)));
      }
    } else if (query instanceof SpanNotQuery) {
      list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), field)));
    } else if (query instanceof SpanPositionCheckQuery) {
      list.addAll(
          Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), field)));
    } else if (query instanceof SpanMultiTermQueryWrapper) {
      list.addAll(
          Arrays.asList(
              extractAutomata(((SpanMultiTermQueryWrapper<?>) query).getWrappedQuery(), field)));
    } else if (query instanceof AutomatonQuery) {
      final AutomatonQuery aq = (AutomatonQuery) query;
      if (aq.getField().equals(field)) {
        list.add(
            new CharacterRunAutomaton(aq.getAutomaton()) {
              @Override
              public String toString() {
                return aq.toString();
              }
            });
      }
    } else if (query instanceof PrefixQuery) {
      final PrefixQuery pq = (PrefixQuery) query;
      Term prefix = pq.getPrefix();
      if (prefix.field().equals(field)) {
        list.add(
            new CharacterRunAutomaton(
                BasicOperations.concatenate(
                    BasicAutomata.makeString(prefix.text()), BasicAutomata.makeAnyString())) {
              @Override
              public String toString() {
                return pq.toString();
              }
            });
      }
    } else if (query instanceof FuzzyQuery) {
      final FuzzyQuery fq = (FuzzyQuery) query;
      if (fq.getField().equals(field)) {
        String utf16 = fq.getTerm().text();
        int termText[] = new int[utf16.codePointCount(0, utf16.length())];
        for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
          termText[j++] = cp = utf16.codePointAt(i);
        }
        int termLength = termText.length;
        int prefixLength = Math.min(fq.getPrefixLength(), termLength);
        String suffix =
            UnicodeUtil.newString(termText, prefixLength, termText.length - prefixLength);
        LevenshteinAutomata builder = new LevenshteinAutomata(suffix, fq.getTranspositions());
        Automaton automaton = builder.toAutomaton(fq.getMaxEdits());
        if (prefixLength > 0) {
          Automaton prefix =
              BasicAutomata.makeString(UnicodeUtil.newString(termText, 0, prefixLength));
          automaton = BasicOperations.concatenate(prefix, automaton);
        }
        list.add(
            new CharacterRunAutomaton(automaton) {
              @Override
              public String toString() {
                return fq.toString();
              }
            });
      }
    } else if (query instanceof TermRangeQuery) {
      final TermRangeQuery tq = (TermRangeQuery) query;
      if (tq.getField().equals(field)) {
        final CharsRef lowerBound;
        if (tq.getLowerTerm() == null) {
          lowerBound = null;
        } else {
          lowerBound = new CharsRef(tq.getLowerTerm().utf8ToString());
        }

        final CharsRef upperBound;
        if (tq.getUpperTerm() == null) {
          upperBound = null;
        } else {
          upperBound = new CharsRef(tq.getUpperTerm().utf8ToString());
        }

        final boolean includeLower = tq.includesLower();
        final boolean includeUpper = tq.includesUpper();
        final CharsRef scratch = new CharsRef();
        final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator();

        // this is *not* an automaton, but its very simple
        list.add(
            new CharacterRunAutomaton(BasicAutomata.makeEmpty()) {
              @Override
              public boolean run(char[] s, int offset, int length) {
                scratch.chars = s;
                scratch.offset = offset;
                scratch.length = length;

                if (lowerBound != null) {
                  int cmp = comparator.compare(scratch, lowerBound);
                  if (cmp < 0 || (!includeLower && cmp == 0)) {
                    return false;
                  }
                }

                if (upperBound != null) {
                  int cmp = comparator.compare(scratch, upperBound);
                  if (cmp > 0 || (!includeUpper && cmp == 0)) {
                    return false;
                  }
                }
                return true;
              }

              @Override
              public String toString() {
                return tq.toString();
              }
            });
      }
    }
    return list.toArray(new CharacterRunAutomaton[list.size()]);
  }

Exemplo n.º 24

0

Exibir arquivo

Arquivo: TestGraphTokenizers.java Projeto: kwatters/heliosearch

 private Automaton join(Automaton... as) {
   return BasicOperations.concatenate(Arrays.asList(as));
 }