/**
 * Advances to the next live document by scanning the plain-text postings lines.
 *
 * <p>Each call reads lines until it has consumed a full doc record (DOC, optional FREQ, then
 * POS/offset/payload lines). When the line that terminates the record is seen (the next DOC,
 * TERM, FIELD, or END line), the reader is rewound to that line so the following call re-reads
 * it, and the doc id is returned — unless the doc is deleted per {@code liveDocs}, in which
 * case scanning continues.
 *
 * @return the next live doc id, or NO_MORE_DOCS when the term's postings are exhausted
 */
@Override
public int nextDoc() throws IOException {
  if (docID == NO_MORE_DOCS) {
    return docID;
  }
  boolean first = true;
  int termFreq = 0;
  while (true) {
    // remember where this line starts so we can rewind once we overshoot the record
    final long lineStart = in.getFilePointer();
    SimpleTextUtil.readLine(in, scratch);
    if (StringHelper.startsWith(scratch, DOC)) {
      // a new DOC line terminates the previous doc's record
      if (!first && (liveDocs == null || liveDocs.get(docID))) {
        in.seek(lineStart); // rewind so the next call re-reads this DOC line
        if (!omitTF) {
          tf = termFreq;
        }
        return docID;
      }
      UnicodeUtil.UTF8toUTF16(
          scratch.bytes,
          scratch.offset + DOC.length,
          scratch.length - DOC.length,
          scratchUTF16);
      docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
      termFreq = 0;
      first = false;
    } else if (StringHelper.startsWith(scratch, FREQ)) {
      UnicodeUtil.UTF8toUTF16(
          scratch.bytes,
          scratch.offset + FREQ.length,
          scratch.length - FREQ.length,
          scratchUTF16);
      termFreq = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
    } else if (StringHelper.startsWith(scratch, POS)) {
      // skip termFreq++;
    } else if (StringHelper.startsWith(scratch, START_OFFSET)) {
      // skip
    } else if (StringHelper.startsWith(scratch, END_OFFSET)) {
      // skip
    } else if (StringHelper.startsWith(scratch, PAYLOAD)) {
      // skip
    } else {
      // end of this term's postings: next TERM/FIELD or end-of-file marker
      assert StringHelper.startsWith(scratch, TERM)
              || StringHelper.startsWith(scratch, FIELD)
              || StringHelper.startsWith(scratch, END)
          : "scratch=" + scratch.utf8ToString();
      if (!first && (liveDocs == null || liveDocs.get(docID))) {
        in.seek(lineStart);
        if (!omitTF) {
          tf = termFreq;
        }
        return docID;
      }
      return docID = NO_MORE_DOCS;
    }
  }
}
@Override public List<LookupResult> lookup(CharSequence key, boolean onlyMorePopular, int num) { assert num > 0; BytesRef scratch = new BytesRef(key); int prefixLength = scratch.length; Arc<Long> arc = new Arc<Long>(); // match the prefix portion exactly Long prefixOutput = null; try { prefixOutput = lookupPrefix(scratch, arc); } catch (IOException bogus) { throw new RuntimeException(bogus); } if (prefixOutput == null) { return Collections.<LookupResult>emptyList(); } List<LookupResult> results = new ArrayList<LookupResult>(num); CharsRef spare = new CharsRef(); if (exactFirst && arc.isFinal()) { spare.grow(scratch.length); UnicodeUtil.UTF8toUTF16(scratch, spare); results.add( new LookupResult(spare.toString(), decodeWeight(prefixOutput + arc.nextFinalOutput))); if (--num == 0) { return results; // that was quick } } // complete top-N MinResult<Long> completions[] = null; try { completions = Util.shortestPaths(fst, arc, weightComparator, num); } catch (IOException bogus) { throw new RuntimeException(bogus); } BytesRef suffix = new BytesRef(8); for (MinResult<Long> completion : completions) { scratch.length = prefixLength; // append suffix Util.toBytesRef(completion.input, suffix); scratch.append(suffix); spare.grow(scratch.length); UnicodeUtil.UTF8toUTF16(scratch, spare); results.add( new LookupResult(spare.toString(), decodeWeight(prefixOutput + completion.output))); } return results; }
/**
 * Advances to the next live document (positions-aware variant).
 *
 * <p>Scans line-by-line starting at {@code nextDocStart}. When the line after a doc's record is
 * reached, {@code nextDocStart} is saved so the next call resumes there, and the reader is
 * seeked back to {@code posStart} so {@code nextPosition()} can read this doc's POS lines.
 *
 * @return the next live doc id, or NO_MORE_DOCS when the term's postings are exhausted
 */
@Override
public int nextDoc() throws IOException {
  boolean first = true;
  in.seek(nextDocStart);
  long posStart = 0;
  while (true) {
    final long lineStart = in.getFilePointer();
    SimpleTextUtil.readLine(in, scratch);
    // System.out.println("NEXT DOC: " + scratch.utf8ToString());
    if (StringHelper.startsWith(scratch, DOC)) {
      // a new DOC line terminates the previous doc's record
      if (!first && (liveDocs == null || liveDocs.get(docID))) {
        nextDocStart = lineStart; // resume the scan here on the next call
        in.seek(posStart); // rewind so nextPosition() reads this doc's POS lines
        return docID;
      }
      UnicodeUtil.UTF8toUTF16(
          scratch.bytes,
          scratch.offset + DOC.length,
          scratch.length - DOC.length,
          scratchUTF16);
      docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
      tf = 0;
      first = false;
    } else if (StringHelper.startsWith(scratch, FREQ)) {
      UnicodeUtil.UTF8toUTF16(
          scratch.bytes,
          scratch.offset + FREQ.length,
          scratch.length - FREQ.length,
          scratchUTF16);
      tf = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
      // this doc's position data starts right after its FREQ line
      posStart = in.getFilePointer();
    } else if (StringHelper.startsWith(scratch, POS)) {
      // skip
    } else if (StringHelper.startsWith(scratch, START_OFFSET)) {
      // skip
    } else if (StringHelper.startsWith(scratch, END_OFFSET)) {
      // skip
    } else if (StringHelper.startsWith(scratch, PAYLOAD)) {
      // skip
    } else {
      // end of this term's postings: next TERM/FIELD or end-of-file marker
      assert StringHelper.startsWith(scratch, TERM)
          || StringHelper.startsWith(scratch, FIELD)
          || StringHelper.startsWith(scratch, END);
      if (!first && (liveDocs == null || liveDocs.get(docID))) {
        nextDocStart = lineStart;
        in.seek(posStart);
        return docID;
      }
      return docID = NO_MORE_DOCS;
    }
  }
}
/**
 * Reads the next position record for the current doc: a POS line (when positions are stored),
 * then START_OFFSET/END_OFFSET lines (when offsets are stored), then an optional PAYLOAD line.
 *
 * @return the position, or -1 when this enum was opened without positions
 */
@Override
public int nextPosition() throws IOException {
  final int pos;
  if (readPositions) {
    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch, POS) : "got line=" + scratch.utf8ToString();
    UnicodeUtil.UTF8toUTF16(
        scratch.bytes,
        scratch.offset + POS.length,
        scratch.length - POS.length,
        scratchUTF16_2);
    pos = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
  } else {
    pos = -1;
  }
  if (readOffsets) {
    // offsets are written as a START_OFFSET line immediately followed by an END_OFFSET line
    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch, START_OFFSET) : "got line=" + scratch.utf8ToString();
    UnicodeUtil.UTF8toUTF16(
        scratch.bytes,
        scratch.offset + START_OFFSET.length,
        scratch.length - START_OFFSET.length,
        scratchUTF16_2);
    startOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch, END_OFFSET) : "got line=" + scratch.utf8ToString();
    UnicodeUtil.UTF8toUTF16(
        scratch.bytes,
        scratch.offset + END_OFFSET.length,
        scratch.length - END_OFFSET.length,
        scratchUTF16_2);
    endOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
  }
  // peek the next line: a PAYLOAD line is optional, so rewind if it isn't one
  final long fp = in.getFilePointer();
  SimpleTextUtil.readLine(in, scratch);
  if (StringHelper.startsWith(scratch, PAYLOAD)) {
    final int len = scratch.length - PAYLOAD.length;
    if (scratch2.bytes.length < len) {
      scratch2.grow(len);
    }
    // NOTE(review): copies from index PAYLOAD.length without adding scratch.offset —
    // assumes scratch.offset == 0 after readLine; confirm against SimpleTextUtil.readLine
    System.arraycopy(scratch.bytes, PAYLOAD.length, scratch2.bytes, 0, len);
    scratch2.length = len;
    payload = scratch2;
  } else {
    payload = null;
    in.seek(fp);
  }
  return pos;
}
private void assertAutomaton(Automaton automaton) throws Exception { CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton); ByteRunAutomaton bra = new ByteRunAutomaton(automaton); final AutomatonTestUtil.RandomAcceptedStrings ras = new AutomatonTestUtil.RandomAcceptedStrings(automaton); int num = 1000 * RANDOM_MULTIPLIER; for (int i = 0; i < num; i++) { final String string; if (random.nextBoolean()) { // likely not accepted string = _TestUtil.randomUnicodeString(random); } else { // will be accepted int[] codepoints = ras.getRandomAcceptedString(random); try { string = UnicodeUtil.newString(codepoints, 0, codepoints.length); } catch (Exception e) { System.out.println(codepoints.length + " codepoints:"); for (int j = 0; j < codepoints.length; j++) { System.out.println(" " + Integer.toHexString(codepoints[j])); } throw e; } } byte bytes[] = string.getBytes("UTF-8"); assertEquals(cra.run(string), bra.run(bytes, 0, bytes.length)); } }
private static TInfo parseTerm(FunctionQParser fp) throws SyntaxError { TInfo tinfo = new TInfo(); tinfo.indexedField = tinfo.field = fp.parseArg(); tinfo.val = fp.parseArg(); tinfo.indexedBytes = new BytesRef(); FieldType ft = fp.getReq().getSchema().getFieldTypeNoEx(tinfo.field); if (ft == null) ft = new StrField(); if (ft instanceof TextField) { // need to do analysis on the term String indexedVal = tinfo.val; Query q = ft.getFieldQuery(fp, fp.getReq().getSchema().getFieldOrNull(tinfo.field), tinfo.val); if (q instanceof TermQuery) { Term term = ((TermQuery) q).getTerm(); tinfo.indexedField = term.field(); indexedVal = term.text(); } UnicodeUtil.UTF16toUTF8(indexedVal, 0, indexedVal.length(), tinfo.indexedBytes); } else { ft.readableToIndexed(tinfo.val, tinfo.indexedBytes); } return tinfo; }
/** Returns the document's term value as a String, or null when the doc has no value (ord 0). */
@Override
public String strVal(int doc) {
  final int ord = termsIndex.getOrd(doc);
  if (ord != 0) {
    termsIndex.lookup(ord, spare);
    UnicodeUtil.UTF8toUTF16(spare, spareChars);
    return spareChars.toString();
  }
  return null;
}
@Override public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) { // TODO: this could be more efficient, but the sortable types should be deprecated instead UnicodeUtil.UTF8toUTF16(input, charsRef); final char[] indexedToReadable = indexedToReadable(charsRef.toString()).toCharArray(); charsRef.copyChars(indexedToReadable, 0, indexedToReadable.length); return charsRef; }
/** Decodes this content as a UTF-8 string; returns the empty string when there are no bytes. */
@Override
public String toUtf8() {
  if (length() == 0) {
    return "";
  }
  byte[] bytes = toBytes();
  final CharsRef ref = new CharsRef(length);
  // NOTE(review): 'offset' is applied to the array returned by toBytes(); this is only correct
  // if toBytes() hands back the backing array un-compacted — confirm toBytes()'s contract
  UnicodeUtil.UTF8toUTF16(bytes, offset, length, ref);
  return ref.toString();
}
/** Returns random string, including full unicode range. */ public static String randomRegexp(Random r) { while (true) { String regexp = randomRegexpString(r); // we will also generate some undefined unicode queries if (!UnicodeUtil.validUTF16String(regexp)) continue; try { new RegExp(regexp, RegExp.NONE); return regexp; } catch (Exception e) { } } }
/**
 * Constructor for enumeration of all terms whose fuzzy similarity to the target term exceeds
 * the configured minimum, sharing the configured common prefix.
 *
 * <p>NOTE(review): the previous javadoc referenced <code>reader</code>/<code>term</code>
 * parameters, but this is a no-arg constructor — the prefix length and term text are read from
 * the enclosing instance's fields (<code>termText</code>, <code>termLength</code>,
 * <code>realPrefixLength</code>) instead.
 *
 * <p>After calling the constructor the enumeration is already pointing to the first valid term
 * if such a term exists.
 *
 * @throws IOException If there is a low-level I/O error.
 */
public LinearFuzzyTermsEnum() throws IOException {
  super(terms.iterator());
  // code points of the target term after the shared prefix
  this.text = new int[termLength - realPrefixLength];
  System.arraycopy(termText, realPrefixLength, text, 0, text.length);
  final String prefix = UnicodeUtil.newString(termText, 0, realPrefixLength);
  prefixBytesRef = new BytesRef(prefix);
  // distance-computation rows, one slot per suffix code point plus one
  this.d = new int[this.text.length + 1];
  this.p = new int[this.text.length + 1];
  // only enumerate terms at or after the prefix
  setInitialSeekTerm(prefixBytesRef);
}
/**
 * Looks up the top {@code num} completions for {@code key}, choosing between the two FSTs
 * depending on whether the caller wants higher weights returned first.
 */
@Override
public List<LookupResult> lookup(CharSequence key, boolean higherWeightsFirst, int num) {
  // pick the completion FST ordered the way the caller asked for
  final List<Completion> completions =
      higherWeightsFirst
          ? higherWeightsCompletion.lookup(key, num)
          : normalCompletion.lookup(key, num);

  final ArrayList<LookupResult> out = new ArrayList<LookupResult>(completions.size());
  final CharsRef utf16 = new CharsRef();
  for (Completion completion : completions) {
    // convert the stored UTF-8 key back into a java String
    utf16.grow(completion.utf8.length);
    UnicodeUtil.UTF8toUTF16(completion.utf8, utf16);
    out.add(new LookupResult(utf16.toString(), completion.bucket));
  }
  return out;
}
/**
 * Builds a shared index of random unicode terms, one term per document, indexed untokenized
 * (KEYWORD tokenizer) into a single field whose name is randomly "field" or "".
 */
@Override
public void setUp() throws Exception {
  super.setUp();
  dir = newDirectory();
  fieldName = random().nextBoolean() ? "field" : ""; // sometimes use an empty string as field name
  RandomIndexWriter writer =
      new RandomIndexWriter(
          random(),
          dir,
          newIndexWriterConfig(
                  TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.KEYWORD, false))
              .setMaxBufferedDocs(_TestUtil.nextInt(random(), 50, 1000)));
  Document doc = new Document();
  Field field = newStringField(fieldName, "", Field.Store.NO);
  doc.add(field);
  List<String> terms = new ArrayList<String>();
  int num = atLeast(200);
  for (int i = 0; i < num; i++) {
    String s = _TestUtil.randomUnicodeString(random());
    // reuse the same Document/Field instance, swapping just the value
    field.setStringValue(s);
    terms.add(s);
    writer.addDocument(doc);
  }
  if (VERBOSE) {
    // utf16 order
    Collections.sort(terms);
    System.out.println("UTF16 order:");
    for (String s : terms) {
      System.out.println(" " + UnicodeUtil.toHexString(s));
    }
  }
  reader = writer.getReader();
  searcher1 = newSearcher(reader);
  searcher2 = newSearcher(reader);
  writer.close();
}
/**
 * Builds a shared index of random unicode terms, one term per document, indexed NOT_ANALYZED
 * into a single "field" field (pre-4.0 Field API variant).
 */
@Override
public void setUp() throws Exception {
  super.setUp();
  dir = newDirectory();
  RandomIndexWriter writer =
      new RandomIndexWriter(
          random,
          dir,
          newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.KEYWORD, false))
              .setMaxBufferedDocs(_TestUtil.nextInt(random, 50, 1000)));
  Document doc = new Document();
  Field field = newField("field", "", Field.Store.NO, Field.Index.NOT_ANALYZED);
  doc.add(field);
  List<String> terms = new ArrayList<String>();
  int num = 2000 * RANDOM_MULTIPLIER;
  for (int i = 0; i < num; i++) {
    String s = _TestUtil.randomUnicodeString(random);
    // reuse the same Document/Field instance, swapping just the value
    field.setValue(s);
    terms.add(s);
    writer.addDocument(doc);
  }
  if (VERBOSE) {
    // utf16 order
    Collections.sort(terms);
    System.out.println("UTF16 order:");
    for (String s : terms) {
      System.out.println(" " + UnicodeUtil.toHexString(s));
    }
  }
  reader = writer.getReader();
  searcher = new IndexSearcher(reader);
  writer.close();
}
@Override public void build(InputIterator tfit) throws IOException { if (tfit.hasPayloads()) { throw new IllegalArgumentException("this suggester doesn't support payloads"); } root = new TernaryTreeNode(); // buffer first if (tfit.getComparator() != BytesRef.getUTF8SortedAsUTF16Comparator()) { // make sure it's sorted and the comparator uses UTF16 sort order tfit = new SortedInputIterator(tfit, BytesRef.getUTF8SortedAsUTF16Comparator()); } ArrayList<String> tokens = new ArrayList<String>(); ArrayList<Number> vals = new ArrayList<Number>(); BytesRef spare; CharsRef charsSpare = new CharsRef(); while ((spare = tfit.next()) != null) { charsSpare.grow(spare.length); UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare); tokens.add(charsSpare.toString()); vals.add(Long.valueOf(tfit.weight())); } autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root); }
static {
  // MAX_TERM holds the UTF-8 encoding of U+10FFFF, the largest Unicode code point.
  MAX_TERM = new BytesRef();
  final char[] chars = Character.toChars(Character.MAX_CODE_POINT);
  UnicodeUtil.UTF16toUTF8(chars, 0, chars.length, MAX_TERM);
}
/** Accepts a term iff its UTF-16 form is matched by the character-level automaton. */
@Override
protected AcceptStatus accept(BytesRef term) throws IOException {
  // decode the raw term bytes into the reusable UTF-16 scratch, then run the automaton
  UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
  if (runAutomaton.run(utf16.result, 0, utf16.length)) {
    return AcceptStatus.YES;
  }
  return AcceptStatus.NO;
}
/**
 * Scans the plain-text terms data once and builds the in-memory FST index mapping each term to
 * (docs-start file pointer, (docFreq, totalTermFreq)); also accumulates the field statistics
 * (sumDocFreq, sumTotalTermFreq, termCount, docCount).
 */
private void loadTerms() throws IOException {
  PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
  final Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>> b;
  final PairOutputs<Long, Long> outputsInner =
      new PairOutputs<Long, Long>(posIntOutputs, posIntOutputs);
  final PairOutputs<Long, PairOutputs.Pair<Long, Long>> outputs =
      new PairOutputs<Long, PairOutputs.Pair<Long, Long>>(posIntOutputs, outputsInner);
  b =
      new Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>>(
          FST.INPUT_TYPE.BYTE1, outputs);
  // clone so this scan doesn't disturb the shared reader's position
  IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
  in.seek(termsStart);
  final BytesRef lastTerm = new BytesRef(10);
  long lastDocsStart = -1; // -1 means no pending term to flush
  int docFreq = 0;
  long totalTermFreq = 0;
  OpenBitSet visitedDocs = new OpenBitSet();
  final IntsRef scratchIntsRef = new IntsRef();
  while (true) {
    SimpleTextUtil.readLine(in, scratch);
    if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) {
      // end of this field: flush the pending term before stopping
      if (lastDocsStart != -1) {
        b.add(
            Util.toIntsRef(lastTerm, scratchIntsRef),
            outputs.newPair(lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq)));
        sumTotalTermFreq += totalTermFreq;
      }
      break;
    } else if (StringHelper.startsWith(scratch, DOC)) {
      docFreq++;
      sumDocFreq++;
      UnicodeUtil.UTF8toUTF16(
          scratch.bytes,
          scratch.offset + DOC.length,
          scratch.length - DOC.length,
          scratchUTF16);
      int docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
      visitedDocs.set(docID);
    } else if (StringHelper.startsWith(scratch, FREQ)) {
      UnicodeUtil.UTF8toUTF16(
          scratch.bytes,
          scratch.offset + FREQ.length,
          scratch.length - FREQ.length,
          scratchUTF16);
      totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
    } else if (StringHelper.startsWith(scratch, TERM)) {
      // a new TERM line: flush the previous term's entry into the FST
      if (lastDocsStart != -1) {
        b.add(
            Util.toIntsRef(lastTerm, scratchIntsRef),
            outputs.newPair(lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq)));
      }
      // the new term's postings start right after its TERM line
      lastDocsStart = in.getFilePointer();
      final int len = scratch.length - TERM.length;
      if (len > lastTerm.length) {
        lastTerm.grow(len);
      }
      // NOTE(review): copies from index TERM.length without adding scratch.offset —
      // assumes scratch.offset == 0 after readLine; confirm against SimpleTextUtil.readLine
      System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len);
      lastTerm.length = len;
      docFreq = 0;
      sumTotalTermFreq += totalTermFreq;
      totalTermFreq = 0;
      termCount++;
    }
  }
  docCount = (int) visitedDocs.cardinality();
  fst = b.finish();
  /*
  PrintStream ps = new PrintStream("out.dot");
  fst.toDot(ps);
  ps.close();
  System.out.println("SAVED out.dot");
  */
  // System.out.println("FST " + fst.sizeInBytes());
}
/** Decodes the indexed bytes into {@code charsRef} and appends the first char of Z_ARRAY. */
@Override
public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) {
  UnicodeUtil.UTF8toUTF16(input, charsRef);
  // NOTE(review): appends one char from Z_ARRAY — presumably restoring a trailing char
  // removed at index time; confirm against the matching readableToIndexed implementation
  charsRef.append(Z_ARRAY, 0, 1);
  return charsRef;
}
/**
 * Runs the completion suggester: decodes the suggest text, performs a lookup per index segment,
 * de-duplicates options by key (keeping the highest score), sorts by score, and returns the top
 * {@code size} options in a single suggestion entry.
 *
 * @throws ElasticsearchException if the target field is not a completion suggest field
 */
@Override
protected Suggest.Suggestion<
        ? extends Suggest.Suggestion.Entry<? extends Suggest.Suggestion.Entry.Option>>
    innerExecute(
        String name,
        CompletionSuggestionContext suggestionContext,
        IndexReader indexReader,
        CharsRef spare)
        throws IOException {
  if (suggestionContext.mapper() == null
      || !(suggestionContext.mapper() instanceof CompletionFieldMapper)) {
    throw new ElasticsearchException(
        "Field [" + suggestionContext.getField() + "] is not a completion suggest field");
  }

  CompletionSuggestion completionSuggestion =
      new CompletionSuggestion(name, suggestionContext.getSize());
  // decode the UTF-8 suggest text into the shared char scratch; it doubles as the lookup key
  UnicodeUtil.UTF8toUTF16(suggestionContext.getText(), spare);

  CompletionSuggestion.Entry completionSuggestEntry =
      new CompletionSuggestion.Entry(new StringText(spare.toString()), 0, spare.length());
  completionSuggestion.addTerm(completionSuggestEntry);

  String fieldName = suggestionContext.getField();
  // de-duplicate options across segments by key, keeping the best-scoring occurrence
  Map<String, CompletionSuggestion.Entry.Option> results =
      Maps.newHashMapWithExpectedSize(indexReader.leaves().size() * suggestionContext.getSize());
  for (AtomicReaderContext atomicReaderContext : indexReader.leaves()) {
    AtomicReader atomicReader = atomicReaderContext.reader();
    Terms terms = atomicReader.fields().terms(fieldName);
    if (terms instanceof Completion090PostingsFormat.CompletionTerms) {
      final Completion090PostingsFormat.CompletionTerms lookupTerms =
          (Completion090PostingsFormat.CompletionTerms) terms;
      final Lookup lookup = lookupTerms.getLookup(suggestionContext.mapper(), suggestionContext);
      if (lookup == null) {
        // we don't have a lookup for this segment.. this might be possible if a merge dropped
        // all docs from the segment that had a value in this segment.
        continue;
      }
      List<Lookup.LookupResult> lookupResults =
          lookup.lookup(spare, false, suggestionContext.getSize());
      for (Lookup.LookupResult res : lookupResults) {
        final String key = res.key.toString();
        final float score = res.value;
        final Option value = results.get(key);
        if (value == null) {
          final Option option =
              new CompletionSuggestion.Entry.Option(
                  new StringText(key),
                  score,
                  res.payload == null ? null : new BytesArray(res.payload));
          results.put(key, option);
        } else if (value.getScore() < score) {
          // same key seen in another segment with a higher score: keep the better one
          value.setScore(score);
          value.setPayload(res.payload == null ? null : new BytesArray(res.payload));
        }
      }
    }
  }

  final List<CompletionSuggestion.Entry.Option> options =
      new ArrayList<CompletionSuggestion.Entry.Option>(results.values());
  // order by score before truncating to the requested size
  CollectionUtil.introSort(options, scoreComparator);

  int optionCount = Math.min(suggestionContext.getSize(), options.size());
  for (int i = 0; i < optionCount; i++) {
    completionSuggestEntry.addOption(options.get(i));
  }

  return completionSuggestion;
}
/** Returns true iff the byte automaton accepts the UTF-8 encoding of the given code point. */
private boolean matches(ByteRunAutomaton a, int code) {
  // encode the single code point into the shared scratch ref, then run the byte automaton
  final char[] utf16 = Character.toChars(code);
  UnicodeUtil.UTF16toUTF8(utf16, 0, utf16.length, b);
  return a.run(b.bytes, 0, b.length);
}
/**
 * Extracts all MultiTermQueries for {@code field}, and returns equivalent automata that will
 * match terms.
 *
 * <p>Compound queries (boolean, dismax, span variants, wrappers) are recursed into; leaf query
 * types that target {@code field} are converted to a {@link CharacterRunAutomaton} whose
 * {@code toString} reports the originating query for debugging.
 */
static CharacterRunAutomaton[] extractAutomata(Query query, String field) {
  List<CharacterRunAutomaton> list = new ArrayList<>();
  if (query instanceof BooleanQuery) {
    // recurse into every non-prohibited clause
    for (BooleanClause clause : (BooleanQuery) query) {
      if (!clause.isProhibited()) {
        list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), field)));
      }
    }
  } else if (query instanceof ConstantScoreQuery) {
    list.addAll(Arrays.asList(extractAutomata(((ConstantScoreQuery) query).getQuery(), field)));
  } else if (query instanceof DisjunctionMaxQuery) {
    for (Query sub : ((DisjunctionMaxQuery) query).getDisjuncts()) {
      list.addAll(Arrays.asList(extractAutomata(sub, field)));
    }
  } else if (query instanceof SpanOrQuery) {
    for (Query sub : ((SpanOrQuery) query).getClauses()) {
      list.addAll(Arrays.asList(extractAutomata(sub, field)));
    }
  } else if (query instanceof SpanNearQuery) {
    for (Query sub : ((SpanNearQuery) query).getClauses()) {
      list.addAll(Arrays.asList(extractAutomata(sub, field)));
    }
  } else if (query instanceof SpanNotQuery) {
    list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), field)));
  } else if (query instanceof SpanPositionCheckQuery) {
    list.addAll(
        Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), field)));
  } else if (query instanceof SpanMultiTermQueryWrapper) {
    list.addAll(
        Arrays.asList(
            extractAutomata(((SpanMultiTermQueryWrapper<?>) query).getWrappedQuery(), field)));
  } else if (query instanceof AutomatonQuery) {
    final AutomatonQuery aq = (AutomatonQuery) query;
    if (aq.getField().equals(field)) {
      // already automaton-backed; just adapt it
      list.add(
          new CharacterRunAutomaton(aq.getAutomaton()) {
            @Override
            public String toString() {
              return aq.toString();
            }
          });
    }
  } else if (query instanceof PrefixQuery) {
    final PrefixQuery pq = (PrefixQuery) query;
    Term prefix = pq.getPrefix();
    if (prefix.field().equals(field)) {
      // prefix query == the prefix string followed by any string
      list.add(
          new CharacterRunAutomaton(
              Operations.concatenate(
                  Automata.makeString(prefix.text()), Automata.makeAnyString())) {
            @Override
            public String toString() {
              return pq.toString();
            }
          });
    }
  } else if (query instanceof FuzzyQuery) {
    final FuzzyQuery fq = (FuzzyQuery) query;
    if (fq.getField().equals(field)) {
      // expand the term into code points; edits apply to the suffix past the exact prefix
      String utf16 = fq.getTerm().text();
      int termText[] = new int[utf16.codePointCount(0, utf16.length())];
      for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
        termText[j++] = cp = utf16.codePointAt(i);
      }
      int termLength = termText.length;
      int prefixLength = Math.min(fq.getPrefixLength(), termLength);
      String suffix =
          UnicodeUtil.newString(termText, prefixLength, termText.length - prefixLength);
      LevenshteinAutomata builder = new LevenshteinAutomata(suffix, fq.getTranspositions());
      String prefix = UnicodeUtil.newString(termText, 0, prefixLength);
      Automaton automaton = builder.toAutomaton(fq.getMaxEdits(), prefix);
      list.add(
          new CharacterRunAutomaton(automaton) {
            @Override
            public String toString() {
              return fq.toString();
            }
          });
    }
  } else if (query instanceof TermRangeQuery) {
    final TermRangeQuery tq = (TermRangeQuery) query;
    if (tq.getField().equals(field)) {
      final CharsRef lowerBound;
      if (tq.getLowerTerm() == null) {
        lowerBound = null;
      } else {
        lowerBound = new CharsRef(tq.getLowerTerm().utf8ToString());
      }
      final CharsRef upperBound;
      if (tq.getUpperTerm() == null) {
        upperBound = null;
      } else {
        upperBound = new CharsRef(tq.getUpperTerm().utf8ToString());
      }
      final boolean includeLower = tq.includesLower();
      final boolean includeUpper = tq.includesUpper();
      final CharsRef scratch = new CharsRef();
      final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator();
      // this is *not* an automaton, but it's very simple
      list.add(
          new CharacterRunAutomaton(Automata.makeEmpty()) {
            @Override
            public boolean run(char[] s, int offset, int length) {
              // point the scratch ref at the candidate term (no copying)
              scratch.chars = s;
              scratch.offset = offset;
              scratch.length = length;
              if (lowerBound != null) {
                int cmp = comparator.compare(scratch, lowerBound);
                if (cmp < 0 || (!includeLower && cmp == 0)) {
                  return false;
                }
              }
              if (upperBound != null) {
                int cmp = comparator.compare(scratch, upperBound);
                if (cmp > 0 || (!includeUpper && cmp == 0)) {
                  return false;
                }
              }
              return true;
            }

            @Override
            public String toString() {
              return tq.toString();
            }
          });
    }
  }
  return list.toArray(new CharacterRunAutomaton[list.size()]);
}
/**
 * Indexes random unicode terms across random docs (with random commits and deletes), then
 * verifies that seeking a DocsEnum for each term returns exactly the live docs recorded for it.
 */
public void testRandom() throws Exception {
  int num = atLeast(2);
  for (int iter = 0; iter < num; iter++) {
    if (VERBOSE) {
      System.out.println("TEST: iter=" + iter);
    }
    Directory dir = newDirectory();
    IndexWriter w =
        new IndexWriter(
            dir,
            newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
                .setMergePolicy(NoMergePolicy.COMPOUND_FILES));
    // keep segments alive even when fully deleted, so liveDocs checks still see them
    _TestUtil.keepFullyDeletedSegments(w);

    Map<BytesRef, List<Integer>> docs = new HashMap<BytesRef, List<Integer>>();
    Set<Integer> deleted = new HashSet<Integer>();
    List<BytesRef> terms = new ArrayList<BytesRef>();

    int numDocs = _TestUtil.nextInt(random(), 1, 100 * RANDOM_MULTIPLIER);
    Document doc = new Document();
    Field f = newStringField("field", "", Field.Store.NO);
    doc.add(f);
    Field id = newStringField("id", "", Field.Store.NO);
    doc.add(id);

    boolean onlyUniqueTerms = random().nextBoolean();
    if (VERBOSE) {
      System.out.println("TEST: onlyUniqueTerms=" + onlyUniqueTerms + " numDocs=" + numDocs);
    }
    Set<BytesRef> uniqueTerms = new HashSet<BytesRef>();
    for (int i = 0; i < numDocs; i++) {
      if (!onlyUniqueTerms && random().nextBoolean() && terms.size() > 0) {
        // re-use existing term
        BytesRef term = terms.get(random().nextInt(terms.size()));
        docs.get(term).add(i);
        f.setStringValue(term.utf8ToString());
      } else {
        String s = _TestUtil.randomUnicodeString(random(), 10);
        BytesRef term = new BytesRef(s);
        if (!docs.containsKey(term)) {
          docs.put(term, new ArrayList<Integer>());
        }
        docs.get(term).add(i);
        terms.add(term);
        uniqueTerms.add(term);
        f.setStringValue(s);
      }
      id.setStringValue("" + i);
      w.addDocument(doc);
      if (random().nextInt(4) == 1) {
        w.commit();
      }
      // occasionally delete an earlier doc by id and record it as gone
      if (i > 0 && random().nextInt(20) == 1) {
        int delID = random().nextInt(i);
        deleted.add(delID);
        w.deleteDocuments(new Term("id", "" + delID));
        if (VERBOSE) {
          System.out.println("TEST: delete " + delID);
        }
      }
    }

    if (VERBOSE) {
      List<BytesRef> termsList = new ArrayList<BytesRef>(uniqueTerms);
      Collections.sort(termsList, BytesRef.getUTF8SortedAsUTF16Comparator());
      System.out.println("TEST: terms in UTF16 order:");
      for (BytesRef b : termsList) {
        System.out.println(" " + UnicodeUtil.toHexString(b.utf8ToString()) + " " + b);
        for (int docID : docs.get(b)) {
          if (deleted.contains(docID)) {
            System.out.println(" " + docID + " (deleted)");
          } else {
            System.out.println(" " + docID);
          }
        }
      }
    }

    IndexReader reader = w.getReader();
    w.close();
    if (VERBOSE) {
      System.out.println("TEST: reader=" + reader);
    }

    // every deleted doc must be marked dead in liveDocs
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (int delDoc : deleted) {
      assertFalse(liveDocs.get(delDoc));
    }

    for (int i = 0; i < 100; i++) {
      BytesRef term = terms.get(random().nextInt(terms.size()));
      if (VERBOSE) {
        System.out.println(
            "TEST: seek term=" + UnicodeUtil.toHexString(term.utf8ToString()) + " " + term);
      }

      DocsEnum docsEnum = _TestUtil.docs(random(), reader, "field", term, liveDocs, null, 0);
      assertNotNull(docsEnum);

      // the enum must return exactly the live docs recorded for this term, in order
      for (int docID : docs.get(term)) {
        if (!deleted.contains(docID)) {
          assertEquals(docID, docsEnum.nextDoc());
        }
      }
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsEnum.nextDoc());
    }

    reader.close();
    dir.close();
  }
}
/**
 * Encodes {@code charSequence} as UTF-8, using {@code spare} as scratch, and returns a freshly
 * allocated exact-size byte array with the encoded bytes.
 */
public static byte[] toUTF8Bytes(CharSequence charSequence, BytesRef spare) {
  // encode into the scratch ref first, then copy out an exact-size array
  UnicodeUtil.UTF16toUTF8(charSequence, 0, charSequence.length(), spare);
  final byte[] out = new byte[spare.length];
  System.arraycopy(spare.bytes, spare.offset, out, 0, out.length);
  return out;
}