/** * Returns a function that filters a document map based on the given include and exclude rules. * * @see #filter(Map, String[], String[]) for details */ public static Function<Map<String, ?>, Map<String, Object>> filter( String[] includes, String[] excludes) { CharacterRunAutomaton matchAllAutomaton = new CharacterRunAutomaton(Automata.makeAnyString()); CharacterRunAutomaton include; if (includes == null || includes.length == 0) { include = matchAllAutomaton; } else { Automaton includeA = Regex.simpleMatchToAutomaton(includes); includeA = makeMatchDotsInFieldNames(includeA); include = new CharacterRunAutomaton(includeA); } Automaton excludeA; if (excludes == null || excludes.length == 0) { excludeA = Automata.makeEmpty(); } else { excludeA = Regex.simpleMatchToAutomaton(excludes); excludeA = makeMatchDotsInFieldNames(excludeA); } CharacterRunAutomaton exclude = new CharacterRunAutomaton(excludeA); // NOTE: We cannot use Operations.minus because of the special case that // we want all sub properties to match as soon as an object matches return (map) -> filter(map, include, 0, exclude, 0, matchAllAutomaton); }
/** Test a configuration that behaves a lot like KeepWordFilter */ public void testKeep() throws Exception { CharacterRunAutomaton keepWords = new CharacterRunAutomaton( Operations.complement( Operations.union( Arrays.asList(Automata.makeString("foo"), Automata.makeString("bar"))), DEFAULT_MAX_DETERMINIZED_STATES)); Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords); assertAnalyzesTo( a, "quick foo brown bar bar fox foo", new String[] {"foo", "bar", "bar", "foo"}, new int[] {2, 2, 1, 2}); }
private Automaton toAutomaton() { Automaton a = null; if (include != null) { a = include.toAutomaton(); } else if (includeValues != null) { a = Automata.makeStringUnion(includeValues); } else { a = Automata.makeAnyString(); } if (exclude != null) { a = Operations.minus(a, exclude.toAutomaton(), Operations.DEFAULT_MAX_DETERMINIZED_STATES); } else if (excludeValues != null) { a = Operations.minus( a, Automata.makeStringUnion(excludeValues), Operations.DEFAULT_MAX_DETERMINIZED_STATES); } return a; }
public static Automaton toAutomaton( BytesRef lowerTerm, BytesRef upperTerm, boolean includeLower, boolean includeUpper) { if (lowerTerm == null) { // makeBinaryInterval is more picky than we are: includeLower = true; } if (upperTerm == null) { // makeBinaryInterval is more picky than we are: includeUpper = true; } return Automata.makeBinaryInterval(lowerTerm, includeLower, upperTerm, includeUpper); }
/** * Build the {@link CharacterRunAutomaton} that represents the reindex-from-remote whitelist and * make sure that it doesn't whitelist the world. */ static CharacterRunAutomaton buildRemoteWhitelist(List<String> whitelist) { if (whitelist.isEmpty()) { return new CharacterRunAutomaton(Automata.makeEmpty()); } Automaton automaton = Regex.simpleMatchToAutomaton(whitelist.toArray(Strings.EMPTY_ARRAY)); automaton = MinimizationOperations.minimize(automaton, Operations.DEFAULT_MAX_DETERMINIZED_STATES); if (Operations.isTotal(automaton)) { throw new IllegalArgumentException( "Refusing to start because whitelist " + whitelist + " accepts all addresses. " + "This would allow users to reindex-from-remote any URL they like effectively having Elasticsearch make HTTP GETs " + "for them."); } return new CharacterRunAutomaton(automaton); }
@BeforeClass public static void beforeClass() throws Exception { Random random = random(); directory = newDirectory(); stopword = "" + randomChar(); CharacterRunAutomaton stopset = new CharacterRunAutomaton(Automata.makeString(stopword)); analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false, stopset); RandomIndexWriter iw = new RandomIndexWriter(random, directory, analyzer); Document doc = new Document(); Field id = new StringField("id", "", Field.Store.NO); Field field = new TextField("field", "", Field.Store.NO); doc.add(id); doc.add(field); // index some docs int numDocs = atLeast(1000); for (int i = 0; i < numDocs; i++) { id.setStringValue(Integer.toString(i)); field.setStringValue(randomFieldContents()); iw.addDocument(doc); } // delete some docs int numDeletes = numDocs / 20; for (int i = 0; i < numDeletes; i++) { Term toDelete = new Term("id", Integer.toString(random.nextInt(numDocs))); if (random.nextBoolean()) { iw.deleteDocuments(toDelete); } else { iw.deleteDocuments(new TermQuery(toDelete)); } } reader = iw.getReader(); s1 = newSearcher(reader); s2 = newSearcher(reader); iw.close(); }
/** * Extracts all MultiTermQueries for {@code field}, and returns equivalent automata that will * match terms. */ static CharacterRunAutomaton[] extractAutomata(Query query, String field) { List<CharacterRunAutomaton> list = new ArrayList<>(); if (query instanceof BooleanQuery) { for (BooleanClause clause : (BooleanQuery) query) { if (!clause.isProhibited()) { list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), field))); } } } else if (query instanceof ConstantScoreQuery) { list.addAll(Arrays.asList(extractAutomata(((ConstantScoreQuery) query).getQuery(), field))); } else if (query instanceof DisjunctionMaxQuery) { for (Query sub : ((DisjunctionMaxQuery) query).getDisjuncts()) { list.addAll(Arrays.asList(extractAutomata(sub, field))); } } else if (query instanceof SpanOrQuery) { for (Query sub : ((SpanOrQuery) query).getClauses()) { list.addAll(Arrays.asList(extractAutomata(sub, field))); } } else if (query instanceof SpanNearQuery) { for (Query sub : ((SpanNearQuery) query).getClauses()) { list.addAll(Arrays.asList(extractAutomata(sub, field))); } } else if (query instanceof SpanNotQuery) { list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), field))); } else if (query instanceof SpanPositionCheckQuery) { list.addAll( Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), field))); } else if (query instanceof SpanMultiTermQueryWrapper) { list.addAll( Arrays.asList( extractAutomata(((SpanMultiTermQueryWrapper<?>) query).getWrappedQuery(), field))); } else if (query instanceof AutomatonQuery) { final AutomatonQuery aq = (AutomatonQuery) query; if (aq.getField().equals(field)) { list.add( new CharacterRunAutomaton(aq.getAutomaton()) { @Override public String toString() { return aq.toString(); } }); } } else if (query instanceof PrefixQuery) { final PrefixQuery pq = (PrefixQuery) query; Term prefix = pq.getPrefix(); if (prefix.field().equals(field)) { list.add( new CharacterRunAutomaton( Operations.concatenate( Automata.makeString(prefix.text()), Automata.makeAnyString())) { @Override public String toString() { return pq.toString(); } }); } } else if (query instanceof FuzzyQuery) { final FuzzyQuery fq = (FuzzyQuery) query; if (fq.getField().equals(field)) { String utf16 = fq.getTerm().text(); int termText[] = new int[utf16.codePointCount(0, utf16.length())]; for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) { termText[j++] = cp = utf16.codePointAt(i); } int termLength = termText.length; int prefixLength = Math.min(fq.getPrefixLength(), termLength); String suffix = UnicodeUtil.newString(termText, prefixLength, termText.length - prefixLength); LevenshteinAutomata builder = new LevenshteinAutomata(suffix, fq.getTranspositions()); String prefix = UnicodeUtil.newString(termText, 0, prefixLength); Automaton automaton = builder.toAutomaton(fq.getMaxEdits(), prefix); list.add( new CharacterRunAutomaton(automaton) { @Override public String toString() { return fq.toString(); } }); } } else if (query instanceof TermRangeQuery) { final TermRangeQuery tq = (TermRangeQuery) query; if (tq.getField().equals(field)) { final CharsRef lowerBound; if (tq.getLowerTerm() == null) { lowerBound = null; } else { lowerBound = new CharsRef(tq.getLowerTerm().utf8ToString()); } final CharsRef upperBound; if (tq.getUpperTerm() == null) { upperBound = null; } else { upperBound = new CharsRef(tq.getUpperTerm().utf8ToString()); } final boolean includeLower = tq.includesLower(); final boolean includeUpper = tq.includesUpper(); final CharsRef scratch = new CharsRef(); final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator(); // this is *not* an automaton, but it's very simple list.add( new CharacterRunAutomaton(Automata.makeEmpty()) { @Override public boolean run(char[] s, int offset, int length) { scratch.chars = s; scratch.offset = offset; scratch.length = length; if (lowerBound != null) { int cmp = comparator.compare(scratch, lowerBound); if (cmp < 0 || (!includeLower && cmp == 0)) { return false; } } if (upperBound != null) { int cmp = comparator.compare(scratch, upperBound); if (cmp > 0 || (!includeUpper && cmp == 0)) { return false; } } return true; } @Override public String toString() { return tq.toString(); } }); } } return list.toArray(new CharacterRunAutomaton[list.size()]); }
/** * Make matches on objects also match dots in field names. For instance, if the original simple * regex is `foo`, this will translate it into `foo` OR `foo.*`. */ private static Automaton makeMatchDotsInFieldNames(Automaton automaton) { return Operations.union( automaton, Operations.concatenate( Arrays.asList(automaton, Automata.makeChar('.'), Automata.makeAnyString()))); }