/** Test a configuration that behaves a lot like KeepWordFilter */
public void testKeep() throws Exception {
  CharacterRunAutomaton keepWords =
      new CharacterRunAutomaton(
          BasicOperations.complement(
              Automaton.union(
                  Arrays.asList(
                      BasicAutomata.makeString("foo"), BasicAutomata.makeString("bar")))));
  Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords, true);
  assertAnalyzesTo(
      a,
      "quick foo brown bar bar fox foo",
      new String[] {"foo", "bar", "bar", "foo"},
      new int[] {2, 2, 1, 2});
}
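// Hedged aside (not part of the test above): MockAnalyzer drops tokens that
// the filter automaton accepts, which is why the union of "foo" and "bar" is
// complemented. A quick check of that polarity:
CharacterRunAutomaton keepWords =
    new CharacterRunAutomaton(
        BasicOperations.complement(
            Automaton.union(
                Arrays.asList(
                    BasicAutomata.makeString("foo"), BasicAutomata.makeString("bar")))));
assert !keepWords.run("foo"); // not accepted by the filter, so the analyzer keeps it
assert !keepWords.run("bar");
assert keepWords.run("quick"); // accepted by the filter, so the analyzer removes it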
@Override
public void setUp() throws Exception {
  super.setUp();
  numIterations = atLeast(50);
  dir = newDirectory();
  RandomIndexWriter writer =
      new RandomIndexWriter(
          random(),
          dir,
          newIndexWriterConfig(
                  TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.KEYWORD, false))
              .setMaxBufferedDocs(_TestUtil.nextInt(random(), 50, 1000)));
  Document doc = new Document();
  Field field = newStringField("field", "", Field.Store.YES);
  doc.add(field);
  terms = new TreeSet<BytesRef>();
  int num = atLeast(200);
  for (int i = 0; i < num; i++) {
    String s = _TestUtil.randomUnicodeString(random());
    field.setStringValue(s);
    terms.add(new BytesRef(s));
    writer.addDocument(doc);
  }
  termsAutomaton = BasicAutomata.makeStringUnion(terms);
  reader = writer.getReader();
  searcher = newSearcher(reader);
  writer.close();
}
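// Hedged aside (not part of the original setUp): BasicAutomata.makeStringUnion
// builds a minimal automaton from a *sorted* collection of terms, which is why
// a TreeSet is used above. A minimal standalone use:
SortedSet<BytesRef> sample = new TreeSet<BytesRef>();
sample.add(new BytesRef("bar"));
sample.add(new BytesRef("foo"));
Automaton union = BasicAutomata.makeStringUnion(sample); // accepts exactly {"bar", "foo"}
assert BasicOperations.run(union, "foo");
assert !BasicOperations.run(union, "baz");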
/**
 * Create an automaton for the given context queries. The automaton is used to find the matching
 * paths in the FST.
 *
 * @param preserveSep set an additional char (<code>XAnalyzingSuggester.SEP_LABEL</code>) between
 *     each context query
 * @param queries list of {@link ContextQuery} defining the lookup context
 * @return Automaton matching the given queries
 */
public static Automaton toAutomaton(boolean preserveSep, Iterable<ContextQuery> queries) {
  Automaton a = BasicAutomata.makeEmptyString();
  Automaton gap = BasicAutomata.makeChar(ContextMapping.SEPARATOR);
  if (preserveSep) {
    // if separators are preserved the fst contains a SEP_LABEL
    // behind each gap. To have a matching automaton, we need to
    // include the SEP_LABEL in the query as well
    gap = BasicOperations.concatenate(gap, BasicAutomata.makeChar(XAnalyzingSuggester.SEP_LABEL));
  }
  for (ContextQuery query : queries) {
    a = Automaton.concatenate(Arrays.asList(query.toAutomaton(), gap, a));
  }
  BasicOperations.determinize(a);
  return a;
}
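// Hedged illustration of the structure built above, with plain string automata
// standing in for ContextQuery.toAutomaton() results ("ctx1"/"ctx2" are
// hypothetical context values). Each iteration prepends to the previous
// result, so for queries [q1, q2] the accepted language is
// L(q2) SEPARATOR L(q1) SEPARATOR:
Automaton gap = BasicAutomata.makeChar(ContextMapping.SEPARATOR);
Automaton q1 = BasicAutomata.makeString("ctx1");
Automaton q2 = BasicAutomata.makeString("ctx2");
Automaton a = BasicAutomata.makeEmptyString();
for (Automaton q : new Automaton[] {q1, q2}) {
  a = Automaton.concatenate(Arrays.asList(q, gap, a));
}
// a now accepts "ctx2" + SEPARATOR + "ctx1" + SEPARATOR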
/** tests intersect: TODO start at a random term! */
public void testIntersect() throws Exception {
  for (int i = 0; i < numIterations; i++) {
    String reg = AutomatonTestUtil.randomRegexp(random());
    Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton();
    CompiledAutomaton ca =
        new CompiledAutomaton(automaton, SpecialOperations.isFinite(automaton), false);
    TermsEnum te = MultiFields.getTerms(reader, "field").intersect(ca, null);
    Automaton expected = BasicOperations.intersection(termsAutomaton, automaton);
    TreeSet<BytesRef> found = new TreeSet<BytesRef>();
    while (te.next() != null) {
      found.add(BytesRef.deepCopyOf(te.term()));
    }
    Automaton actual = BasicAutomata.makeStringUnion(found);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }
}
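// Hedged sketch of the same intersect call outside the test loop (the regexp
// is hypothetical). Passing simplify=false keeps the CompiledAutomaton in its
// general (NORMAL) form, which is what Terms.intersect expects; the enum then
// only visits index terms the automaton accepts:
Automaton auto = new RegExp("fo.*", RegExp.NONE).toAutomaton();
CompiledAutomaton compiled =
    new CompiledAutomaton(auto, SpecialOperations.isFinite(auto), false);
TermsEnum matched = MultiFields.getTerms(reader, "field").intersect(compiled, null);
BytesRef term;
while ((term = matched.next()) != null) {
  // term is accepted by both the automaton and the terms dictionary
}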
/** tests a pre-intersected automaton against the original */
public void testFiniteVersusInfinite() throws Exception {
  for (int i = 0; i < numIterations; i++) {
    String reg = AutomatonTestUtil.randomRegexp(random());
    Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton();
    final List<BytesRef> matchedTerms = new ArrayList<BytesRef>();
    for (BytesRef t : terms) {
      if (BasicOperations.run(automaton, t.utf8ToString())) {
        matchedTerms.add(t);
      }
    }
    Automaton alternate = BasicAutomata.makeStringUnion(matchedTerms);
    // System.out.println("match " + matchedTerms.size() + " " + alternate.getNumberOfStates()
    //     + " states, sigma=" + alternate.getStartPoints().length);
    // AutomatonTestUtil.minimizeSimple(alternate);
    // System.out.println("minimize done");
    AutomatonQuery a1 = new AutomatonQuery(new Term("field", ""), automaton);
    AutomatonQuery a2 = new AutomatonQuery(new Term("field", ""), alternate);
    CheckHits.checkEqual(
        a1, searcher.search(a1, 25).scoreDocs, searcher.search(a2, 25).scoreDocs);
  }
}
public class TestGraphTokenizers extends BaseTokenStreamTestCase {

  // Makes a graph TokenStream from the string; separate
  // positions with single space, multiple tokens at the same
  // position with /, and add optional position length with
  // :.  EG "a b c" is a simple chain, "a/x b c" adds 'x'
  // over 'a' at position 0 with posLen=1, "a/x:3 b c" adds
  // 'x' over 'a' with posLen=3.  Tokens are in normal-form!
  // So, offsets are computed based on the first token at a
  // given position.  NOTE: each token must be a single
  // character!  We assume this when computing offsets...

  // NOTE: all input tokens must be length 1!!!  This means
  // you cannot turn on MockCharFilter when random
  // testing...

  private static class GraphTokenizer extends Tokenizer {
    private List<Token> tokens;
    private int upto;
    private int inputLength;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final PositionIncrementAttribute posIncrAtt =
        addAttribute(PositionIncrementAttribute.class);
    private final PositionLengthAttribute posLengthAtt =
        addAttribute(PositionLengthAttribute.class);

    @Override
    public void reset() throws IOException {
      super.reset();
      tokens = null;
      upto = 0;
    }

    @Override
    public boolean incrementToken() throws IOException {
      if (tokens == null) {
        fillTokens();
      }
      // System.out.println("graphTokenizer: incr upto=" + upto + " vs " + tokens.size());
      if (upto == tokens.size()) {
        // System.out.println("  END @ " + tokens.size());
        return false;
      }
      final Token t = tokens.get(upto++);
      // System.out.println("  return token=" + t);
      clearAttributes();
      termAtt.append(t.toString());
      offsetAtt.setOffset(t.startOffset(), t.endOffset());
      posIncrAtt.setPositionIncrement(t.getPositionIncrement());
      posLengthAtt.setPositionLength(t.getPositionLength());
      return true;
    }

    @Override
    public void end() throws IOException {
      super.end();
      // NOTE: somewhat... hackish, but we need this to
      // satisfy BTSTC:
      final int lastOffset;
      if (tokens != null && !tokens.isEmpty()) {
        lastOffset = tokens.get(tokens.size() - 1).endOffset();
      } else {
        lastOffset = 0;
      }
      offsetAtt.setOffset(correctOffset(lastOffset), correctOffset(inputLength));
    }

    private void fillTokens() throws IOException {
      final StringBuilder sb = new StringBuilder();
      final char[] buffer = new char[256];
      while (true) {
        final int count = input.read(buffer);
        if (count == -1) {
          break;
        }
        sb.append(buffer, 0, count);
        // System.out.println("got count=" + count);
      }
      // System.out.println("fillTokens: " + sb);

      inputLength = sb.length();

      final String[] parts = sb.toString().split(" ");

      tokens = new ArrayList<Token>();
      int pos = 0;
      int maxPos = -1;
      int offset = 0;
      // System.out.println("again");
      for (String part : parts) {
        final String[] overlapped = part.split("/");
        boolean firstAtPos = true;
        int minPosLength = Integer.MAX_VALUE;
        for (String part2 : overlapped) {
          final int colonIndex = part2.indexOf(':');
          final String token;
          final int posLength;
          if (colonIndex != -1) {
            token = part2.substring(0, colonIndex);
            posLength = Integer.parseInt(part2.substring(1 + colonIndex));
          } else {
            token = part2;
            posLength = 1;
          }
          maxPos = Math.max(maxPos, pos + posLength);
          minPosLength = Math.min(minPosLength, posLength);
          final Token t = new Token(token, offset, offset + 2 * posLength - 1);
          t.setPositionLength(posLength);
          t.setPositionIncrement(firstAtPos ? 1 : 0);
          firstAtPos = false;
          // System.out.println("  add token=" + t + " startOff=" + t.startOffset()
          //     + " endOff=" + t.endOffset());
          tokens.add(t);
        }
        pos += minPosLength;
        offset = 2 * pos;
      }
      assert maxPos <= pos : "input string mal-formed: posLength>1 tokens hang over the end";
    }
  }

  public void testMockGraphTokenFilterBasic() throws Exception {
    for (int iter = 0; iter < 10 * RANDOM_MULTIPLIER; iter++) {

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Make new analyzer each time, because MGTF has fixed
      // seed:
      final Analyzer a =
          new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
              final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
              final TokenStream t2 = new MockGraphTokenFilter(random(), t);
              return new TokenStreamComponents(t, t2);
            }
          };

      checkAnalysisConsistency(random(), a, false, "a b c d e f g h i j k");
    }
  }

  public void testMockGraphTokenFilterOnGraphInput() throws Exception {
    for (int iter = 0; iter < 100 * RANDOM_MULTIPLIER; iter++) {

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Make new analyzer each time, because MGTF has fixed
      // seed:
      final Analyzer a =
          new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
              final Tokenizer t = new GraphTokenizer();
              final TokenStream t2 = new MockGraphTokenFilter(random(), t);
              return new TokenStreamComponents(t, t2);
            }
          };

      checkAnalysisConsistency(random(), a, false, "a/x:3 c/y:2 d e f/z:4 g h i j k");
    }
  }

  // Just deletes (leaving hole) token 'a':
  private static final class RemoveATokens extends TokenFilter {
    private int pendingPosInc;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncAtt =
        addAttribute(PositionIncrementAttribute.class);

    public RemoveATokens(TokenStream in) {
      super(in);
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      pendingPosInc = 0;
    }

    @Override
    public void end() throws IOException {
      super.end();
      posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement());
    }

    @Override
    public boolean incrementToken() throws IOException {
      while (true) {
        final boolean gotOne = input.incrementToken();
        if (!gotOne) {
          return false;
        } else if (termAtt.toString().equals("a")) {
          pendingPosInc += posIncAtt.getPositionIncrement();
        } else {
          posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement());
          pendingPosInc = 0;
          return true;
        }
      }
    }
  }

  public void testMockGraphTokenFilterBeforeHoles() throws Exception {
    for (int iter = 0; iter < 100 * RANDOM_MULTIPLIER; iter++) {

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Make new analyzer each time, because MGTF has fixed
      // seed:
      final Analyzer a =
          new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
              final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
              final TokenStream t2 = new MockGraphTokenFilter(random(), t);
              final TokenStream t3 = new RemoveATokens(t2);
              return new TokenStreamComponents(t, t3);
            }
          };

      Random random = random();
      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k");
      checkAnalysisConsistency(random, a, false, "x y a b c d e f g h i j k");
      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a");
      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a x y");
    }
  }

  public void testMockGraphTokenFilterAfterHoles() throws Exception {
    for (int iter = 0; iter < 100 * RANDOM_MULTIPLIER; iter++) {

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Make new analyzer each time, because MGTF has fixed
      // seed:
      final Analyzer a =
          new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
              final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
              final TokenStream t2 = new RemoveATokens(t);
              final TokenStream t3 = new MockGraphTokenFilter(random(), t2);
              return new TokenStreamComponents(t, t3);
            }
          };

      Random random = random();
      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k");
      checkAnalysisConsistency(random, a, false, "x y a b c d e f g h i j k");
      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a");
      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a x y");
    }
  }

  public void testMockGraphTokenFilterRandom() throws Exception {
    for (int iter = 0; iter < 10 * RANDOM_MULTIPLIER; iter++) {

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Make new analyzer each time, because MGTF has fixed
      // seed:
      final Analyzer a =
          new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
              final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
              final TokenStream t2 = new MockGraphTokenFilter(random(), t);
              return new TokenStreamComponents(t, t2);
            }
          };

      Random random = random();
      checkRandomData(random, a, 5, atLeast(100));
    }
  }

  // Two MockGraphTokenFilters
  public void testDoubleMockGraphTokenFilterRandom() throws Exception {
    for (int iter = 0; iter < 10 * RANDOM_MULTIPLIER; iter++) {

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Make new analyzer each time, because MGTF has fixed
      // seed:
      final Analyzer a =
          new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
              final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
              final TokenStream t1 = new MockGraphTokenFilter(random(), t);
              final TokenStream t2 = new MockGraphTokenFilter(random(), t1);
              return new TokenStreamComponents(t, t2);
            }
          };

      Random random = random();
      checkRandomData(random, a, 5, atLeast(100));
    }
  }

  public void testMockGraphTokenFilterBeforeHolesRandom() throws Exception {
    for (int iter = 0; iter < 10 * RANDOM_MULTIPLIER; iter++) {

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Make new analyzer each time, because MGTF has fixed
      // seed:
      final Analyzer a =
          new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
              final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
              final TokenStream t1 = new MockGraphTokenFilter(random(), t);
              final TokenStream t2 = new MockHoleInjectingTokenFilter(random(), t1);
              return new TokenStreamComponents(t, t2);
            }
          };

      Random random = random();
      checkRandomData(random, a, 5, atLeast(100));
    }
  }

  public void testMockGraphTokenFilterAfterHolesRandom() throws Exception {
    for (int iter = 0; iter < 10 * RANDOM_MULTIPLIER; iter++) {

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter);
      }

      // Make new analyzer each time, because MGTF has fixed
      // seed:
      final Analyzer a =
          new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
              final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
              final TokenStream t1 = new MockHoleInjectingTokenFilter(random(), t);
              final TokenStream t2 = new MockGraphTokenFilter(random(), t1);
              return new TokenStreamComponents(t, t2);
            }
          };

      Random random = random();
      checkRandomData(random, a, 5, atLeast(100));
    }
  }

  private static Token token(String term, int posInc, int posLength) {
    final Token t = new Token(term, 0, 0);
    t.setPositionIncrement(posInc);
    t.setPositionLength(posLength);
    return t;
  }

  private static Token token(
      String term, int posInc, int posLength, int startOffset, int endOffset) {
    final Token t = new Token(term, startOffset, endOffset);
    t.setPositionIncrement(posInc);
    t.setPositionLength(posLength);
    return t;
  }

  public void testSingleToken() throws Exception {
    final TokenStream ts =
        new CannedTokenStream(
            new Token[] {
              token("abc", 1, 1),
            });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected = BasicAutomata.makeString("abc");
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

  public void testMultipleHoles() throws Exception {
    final TokenStream ts =
        new CannedTokenStream(
            new Token[] {
              token("a", 1, 1), token("b", 3, 1),
            });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

  public void testSynOverMultipleHoles() throws Exception {
    final TokenStream ts =
        new CannedTokenStream(
            new Token[] {
              token("a", 1, 1), token("x", 0, 3), token("b", 3, 1),
            });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
    final Automaton a2 = join(s2a("x"), SEP_A, s2a("b"));
    final Automaton expected = BasicOperations.union(a1, a2);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

  // for debugging!
  /*
  private static void toDot(Automaton a) throws IOException {
    final String s = a.toDot();
    Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot"));
    w.write(s);
    w.close();
    System.out.println("TEST: saved to /x/tmp/out.dot");
  }
  */

  private static final Automaton SEP_A = BasicAutomata.makeChar(TokenStreamToAutomaton.POS_SEP);
  private static final Automaton HOLE_A = BasicAutomata.makeChar(TokenStreamToAutomaton.HOLE);

  private Automaton join(String... strings) {
    List<Automaton> as = new ArrayList<Automaton>();
    for (String s : strings) {
      as.add(BasicAutomata.makeString(s));
      as.add(SEP_A);
    }
    as.remove(as.size() - 1);
    return BasicOperations.concatenate(as);
  }

  private Automaton join(Automaton... as) {
    return BasicOperations.concatenate(Arrays.asList(as));
  }

  private Automaton s2a(String s) {
    return BasicAutomata.makeString(s);
  }

  public void testTwoTokens() throws Exception {
    final TokenStream ts =
        new CannedTokenStream(
            new Token[] {
              token("abc", 1, 1), token("def", 1, 1),
            });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected = join("abc", "def");
    // toDot(actual);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

  public void testHole() throws Exception {
    final TokenStream ts =
        new CannedTokenStream(
            new Token[] {
              token("abc", 1, 1), token("def", 2, 1),
            });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected = join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"));
    // toDot(actual);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

  public void testOverlappedTokensSausage() throws Exception {
    // Two tokens on top of each other (sausage):
    final TokenStream ts =
        new CannedTokenStream(new Token[] {token("abc", 1, 1), token("xyz", 0, 1)});
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton a1 = BasicAutomata.makeString("abc");
    final Automaton a2 = BasicAutomata.makeString("xyz");
    final Automaton expected = BasicOperations.union(a1, a2);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

  public void testOverlappedTokensLattice() throws Exception {
    final TokenStream ts =
        new CannedTokenStream(
            new Token[] {
              token("abc", 1, 1), token("xyz", 0, 2), token("def", 1, 1),
            });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton a1 = BasicAutomata.makeString("xyz");
    final Automaton a2 = join("abc", "def");
    final Automaton expected = BasicOperations.union(a1, a2);
    // toDot(actual);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

  public void testSynOverHole() throws Exception {
    final TokenStream ts =
        new CannedTokenStream(
            new Token[] {
              token("a", 1, 1), token("X", 0, 2), token("b", 2, 1),
            });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton a1 =
        BasicOperations.union(join(s2a("a"), SEP_A, HOLE_A), BasicAutomata.makeString("X"));
    final Automaton expected = BasicOperations.concatenate(a1, join(SEP_A, s2a("b")));
    // toDot(actual);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

  public void testSynOverHole2() throws Exception {
    final TokenStream ts =
        new CannedTokenStream(
            new Token[] {
              token("xyz", 1, 1), token("abc", 0, 3), token("def", 2, 1),
            });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected =
        BasicOperations.union(
            join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")), BasicAutomata.makeString("abc"));
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

  public void testOverlappedTokensLattice2() throws Exception {
    final TokenStream ts =
        new CannedTokenStream(
            new Token[] {
              token("abc", 1, 1), token("xyz", 0, 3), token("def", 1, 1), token("ghi", 1, 1),
            });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton a1 = BasicAutomata.makeString("xyz");
    final Automaton a2 = join("abc", "def", "ghi");
    final Automaton expected = BasicOperations.union(a1, a2);
    // toDot(actual);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

  public void testToDot() throws Exception {
    final TokenStream ts = new CannedTokenStream(new Token[] {token("abc", 1, 1, 0, 4)});
    StringWriter w = new StringWriter();
    new TokenStreamToDot("abcd", ts, new PrintWriter(w)).toDot();
    assertTrue(w.toString().indexOf("abc / abcd") != -1);
  }

  public void testStartsWithHole() throws Exception {
    final TokenStream ts =
        new CannedTokenStream(
            new Token[] {
              token("abc", 2, 1),
            });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected = join(HOLE_A, SEP_A, s2a("abc"));
    // toDot(actual);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

  // TODO: testEndsWithHole... but we need posInc to set in TS.end()

  public void testSynHangingOverEnd() throws Exception {
    final TokenStream ts =
        new CannedTokenStream(
            new Token[] {
              token("a", 1, 1), token("X", 0, 10),
            });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected =
        BasicOperations.union(BasicAutomata.makeString("a"), BasicAutomata.makeString("X"));
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }
}
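// Hedged worked example of the GraphTokenizer syntax documented at the top of
// TestGraphTokenizers: for the input "a/x:3 b c", fillTokens() produces four
// tokens. Positions advance by the minimum posLength at each slot, and offsets
// follow offset = 2 * pos, endOffset = offset + 2 * posLength - 1. The
// equivalent canned stream, using the five-argument token() helper above:
final TokenStream equivalent =
    new CannedTokenStream(
        new Token[] {
          token("a", 1, 1, 0, 1),
          token("x", 0, 3, 0, 5),
          token("b", 1, 1, 2, 3),
          token("c", 1, 1, 4, 5),
        });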
/**
 * Extracts all MultiTermQueries for {@code field}, and returns equivalent automata that will
 * match terms.
 */
static CharacterRunAutomaton[] extractAutomata(Query query, String field) {
  List<CharacterRunAutomaton> list = new ArrayList<>();
  if (query instanceof BooleanQuery) {
    BooleanClause clauses[] = ((BooleanQuery) query).getClauses();
    for (BooleanClause clause : clauses) {
      if (!clause.isProhibited()) {
        list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), field)));
      }
    }
  } else if (query instanceof DisjunctionMaxQuery) {
    for (Query sub : ((DisjunctionMaxQuery) query).getDisjuncts()) {
      list.addAll(Arrays.asList(extractAutomata(sub, field)));
    }
  } else if (query instanceof SpanOrQuery) {
    for (Query sub : ((SpanOrQuery) query).getClauses()) {
      list.addAll(Arrays.asList(extractAutomata(sub, field)));
    }
  } else if (query instanceof SpanNearQuery) {
    for (Query sub : ((SpanNearQuery) query).getClauses()) {
      list.addAll(Arrays.asList(extractAutomata(sub, field)));
    }
  } else if (query instanceof SpanNotQuery) {
    list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), field)));
  } else if (query instanceof SpanPositionCheckQuery) {
    list.addAll(
        Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), field)));
  } else if (query instanceof SpanMultiTermQueryWrapper) {
    list.addAll(
        Arrays.asList(
            extractAutomata(((SpanMultiTermQueryWrapper<?>) query).getWrappedQuery(), field)));
  } else if (query instanceof AutomatonQuery) {
    final AutomatonQuery aq = (AutomatonQuery) query;
    if (aq.getField().equals(field)) {
      list.add(
          new CharacterRunAutomaton(aq.getAutomaton()) {
            @Override
            public String toString() {
              return aq.toString();
            }
          });
    }
  } else if (query instanceof PrefixQuery) {
    final PrefixQuery pq = (PrefixQuery) query;
    Term prefix = pq.getPrefix();
    if (prefix.field().equals(field)) {
      list.add(
          new CharacterRunAutomaton(
              BasicOperations.concatenate(
                  BasicAutomata.makeString(prefix.text()), BasicAutomata.makeAnyString())) {
            @Override
            public String toString() {
              return pq.toString();
            }
          });
    }
  } else if (query instanceof FuzzyQuery) {
    final FuzzyQuery fq = (FuzzyQuery) query;
    if (fq.getField().equals(field)) {
      String utf16 = fq.getTerm().text();
      int termText[] = new int[utf16.codePointCount(0, utf16.length())];
      for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
        termText[j++] = cp = utf16.codePointAt(i);
      }
      int termLength = termText.length;
      int prefixLength = Math.min(fq.getPrefixLength(), termLength);
      String suffix =
          UnicodeUtil.newString(termText, prefixLength, termText.length - prefixLength);
      LevenshteinAutomata builder = new LevenshteinAutomata(suffix, fq.getTranspositions());
      Automaton automaton = builder.toAutomaton(fq.getMaxEdits());
      if (prefixLength > 0) {
        Automaton prefix =
            BasicAutomata.makeString(UnicodeUtil.newString(termText, 0, prefixLength));
        automaton = BasicOperations.concatenate(prefix, automaton);
      }
      list.add(
          new CharacterRunAutomaton(automaton) {
            @Override
            public String toString() {
              return fq.toString();
            }
          });
    }
  } else if (query instanceof TermRangeQuery) {
    final TermRangeQuery tq = (TermRangeQuery) query;
    if (tq.getField().equals(field)) {
      final CharsRef lowerBound;
      if (tq.getLowerTerm() == null) {
        lowerBound = null;
      } else {
        lowerBound = new CharsRef(tq.getLowerTerm().utf8ToString());
      }
      final CharsRef upperBound;
      if (tq.getUpperTerm() == null) {
        upperBound = null;
      } else {
        upperBound = new CharsRef(tq.getUpperTerm().utf8ToString());
      }
      final boolean includeLower = tq.includesLower();
      final boolean includeUpper = tq.includesUpper();
      final CharsRef scratch = new CharsRef();
      final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator();

      // this is *not* an automaton, but it's very simple
      list.add(
          new CharacterRunAutomaton(BasicAutomata.makeEmpty()) {
            @Override
            public boolean run(char[] s, int offset, int length) {
              scratch.chars = s;
              scratch.offset = offset;
              scratch.length = length;
              if (lowerBound != null) {
                int cmp = comparator.compare(scratch, lowerBound);
                if (cmp < 0 || (!includeLower && cmp == 0)) {
                  return false;
                }
              }
              if (upperBound != null) {
                int cmp = comparator.compare(scratch, upperBound);
                if (cmp > 0 || (!includeUpper && cmp == 0)) {
                  return false;
                }
              }
              return true;
            }

            @Override
            public String toString() {
              return tq.toString();
            }
          });
    }
  }
  return list.toArray(new CharacterRunAutomaton[list.size()]);
}
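// Hedged usage sketch (field name and terms are hypothetical): collect the
// automata for the MultiTermQueries inside a BooleanQuery, then run candidate
// terms through them directly, the same way the highlighter matches terms
// without rewriting the queries:
BooleanQuery bq = new BooleanQuery();
bq.add(new PrefixQuery(new Term("body", "lucen")), BooleanClause.Occur.SHOULD);
bq.add(new FuzzyQuery(new Term("body", "search")), BooleanClause.Occur.SHOULD);
CharacterRunAutomaton[] automata = extractAutomata(bq, "body");
for (CharacterRunAutomaton cra : automata) {
  System.out.println(cra + " matches 'lucene': " + cra.run("lucene"));
}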