/**
 * Writes the configured include and exclude clauses to {@code builder}.
 *
 * <p>For the include clause the first configured form wins: the original string of {@code
 * include}, then an array built from {@code includeValues}, then (when partition based) an object
 * holding the partition number and the number of partitions. The exclude clause is written as the
 * original string of {@code exclude} or, failing that, an array built from {@code excludeValues}.
 * A clause with nothing configured is omitted.
 *
 * @param builder destination of the serialized fields
 * @param params not consulted by this implementation
 * @return {@code builder}
 * @throws IOException if writing to the builder fails
 */
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
  if (include != null) {
    builder.field(INCLUDE_FIELD.getPreferredName(), include.getOriginalString());
  } else if (includeValues != null) {
    writeUtf8Array(builder, INCLUDE_FIELD.getPreferredName(), includeValues);
  } else if (isPartitionBased()) {
    builder.startObject(INCLUDE_FIELD.getPreferredName());
    builder.field(PARTITION_FIELD.getPreferredName(), incZeroBasedPartition);
    builder.field(NUM_PARTITIONS_FIELD.getPreferredName(), incNumPartitions);
    builder.endObject();
  }

  if (exclude != null) {
    builder.field(EXCLUDE_FIELD.getPreferredName(), exclude.getOriginalString());
  } else if (excludeValues != null) {
    writeUtf8Array(builder, EXCLUDE_FIELD.getPreferredName(), excludeValues);
  }
  return builder;
}

/** Writes {@code values} as an array field called {@code name}, each entry decoded as UTF-8. */
private static void writeUtf8Array(
    XContentBuilder builder, String name, Iterable<BytesRef> values) throws IOException {
  builder.startArray(name);
  for (BytesRef entry : values) {
    builder.value(entry.utf8ToString());
  }
  builder.endArray();
}
/**
 * Checks that {@code SubstrFunction.substring} yields the expected slices and that the byte
 * arrays of the slices match the byte array of the source reference.
 */
@Test
public void testNoCopy() throws Exception {
  final BytesRef source = new BytesRef("i do not want to be copied!");

  final BytesRef head = SubstrFunction.substring(source, 0, 10);
  final BytesRef middle = SubstrFunction.substring(source, 5, 14);

  assertThat(head.utf8ToString(), is("i do not w"));
  assertThat(middle.utf8ToString(), is("not want "));
  // Neither slice should carry a byte array that differs from the source's.
  assertThat(source.bytes, allOf(is(middle.bytes), is(head.bytes)));
}
/**
 * Checks the tokens emitted for a {@code ContextSuggestField} with two contexts, first through a
 * plain analyzer and then through the same analyzer wrapped in a {@code CompletionAnalyzer}.
 */
@Test
public void testTokenStream() throws Exception {
  Analyzer analyzer = new MockAnalyzer(random());
  ContextSuggestField field = new ContextSuggestField("field", "input", 1, "context1", "context2");

  // Expected payload: length-prefixed surface form, the VInt 2, then the field type byte.
  BytesRef surfaceForm = new BytesRef("input");
  ByteArrayOutputStream payloadBytes = new ByteArrayOutputStream();
  try (OutputStreamDataOutput out = new OutputStreamDataOutput(payloadBytes)) {
    out.writeVInt(surfaceForm.length);
    out.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
    out.writeVInt(1 + 1);
    out.writeByte(ContextSuggestField.TYPE);
  }
  BytesRef payload = new BytesRef(payloadBytes.toByteArray());

  // One expected token per context: context, context separator, label separator, input.
  String[] contexts = {"context1", "context2"};
  String[] expectedOutputs = new String[contexts.length];
  CharsRefBuilder builder = new CharsRefBuilder();
  for (int i = 0; i < contexts.length; i++) {
    builder.clear();
    builder.append(contexts[i]);
    builder.append(((char) ContextSuggestField.CONTEXT_SEPARATOR));
    builder.append(((char) CompletionAnalyzer.SEP_LABEL));
    builder.append("input");
    expectedOutputs[i] = builder.toCharsRef().toString();
  }

  String payloadAsString = payload.utf8ToString();

  TokenStream stream =
      new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(field.tokenStream(analyzer, null));
  assertTokenStreamContents(
      stream,
      expectedOutputs,
      null,
      null,
      new String[] {payloadAsString, payloadAsString},
      new int[] {1, 1},
      null,
      null);

  CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer);
  stream =
      new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(
          field.tokenStream(completionAnalyzer, null));
  assertTokenStreamContents(
      stream,
      expectedOutputs,
      null,
      null,
      new String[] {payloadAsString, payloadAsString},
      new int[] {1, 1},
      null,
      null);
}
/**
 * Reads the next position entry from the input.
 *
 * <p>Depending on {@code readPositions} and {@code readOffsets}, this consumes a POS line and a
 * START_OFFSET / END_OFFSET pair of lines, storing the parsed offsets in {@code startOffset} and
 * {@code endOffset}. It then peeks at the following line: if it is a PAYLOAD line its content is
 * copied into {@code scratch2} and exposed through {@code payload}; otherwise {@code payload} is
 * cleared and the input is rewound to the start of that line.
 *
 * @return the parsed position, or {@code -1} when positions are not being read
 * @throws IOException if reading from the input fails
 */
@Override
public int nextPosition() throws IOException {
  final int pos;
  if (readPositions) {
    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch, POS) : "got line=" + scratch.utf8ToString();
    UnicodeUtil.UTF8toUTF16(
        scratch.bytes, scratch.offset + POS.length, scratch.length - POS.length, scratchUTF16_2);
    pos = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
  } else {
    pos = -1;
  }

  if (readOffsets) {
    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch, START_OFFSET) : "got line=" + scratch.utf8ToString();
    UnicodeUtil.UTF8toUTF16(
        scratch.bytes,
        scratch.offset + START_OFFSET.length,
        scratch.length - START_OFFSET.length,
        scratchUTF16_2);
    startOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);

    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch, END_OFFSET) : "got line=" + scratch.utf8ToString();
    UnicodeUtil.UTF8toUTF16(
        scratch.bytes,
        scratch.offset + END_OFFSET.length,
        scratch.length - END_OFFSET.length,
        scratchUTF16_2);
    endOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
  }

  // Remember where the next line starts so it can be "un-read" if it is not a payload.
  final long fp = in.getFilePointer();
  SimpleTextUtil.readLine(in, scratch);
  if (StringHelper.startsWith(scratch, PAYLOAD)) {
    final int len = scratch.length - PAYLOAD.length;
    if (scratch2.bytes.length < len) {
      scratch2.grow(len);
    }
    // Honor scratch.offset like every other read of scratch in this method; the previous code
    // copied from PAYLOAD.length and was only correct while the offset happened to be 0.
    System.arraycopy(scratch.bytes, scratch.offset + PAYLOAD.length, scratch2.bytes, 0, len);
    scratch2.length = len;
    payload = scratch2;
  } else {
    payload = null;
    in.seek(fp);
  }
  return pos;
}
/**
 * Adds a term-frequency map for the configured field to the hit, when this sub phase is needed.
 *
 * <p>The map (term text to the value of {@code freq()} on the term's postings) is appended to the
 * values of the hit field named {@code NAMES[0]}, which is created if the hit does not have it
 * yet. An {@link IOException} raised while walking the term vector is logged and swallowed.
 *
 * @param context the search context supplying the sub phase context and the index shard
 * @param hitContext the hit being decorated
 */
@Override
public void hitExecute(SearchContext context, HitContext hitContext) {
  if (!context.getFetchSubPhaseContext(CONTEXT_FACTORY).hitExecutionNeeded()) {
    return;
  }
  final String field = context.getFetchSubPhaseContext(CONTEXT_FACTORY).getField();
  final String hitFieldName = NAMES[0];

  // Make sure the hit has a fields map and an entry for this sub phase.
  if (hitContext.hit().fieldsOrNull() == null) {
    hitContext.hit().fields(new HashMap<>());
  }
  SearchHitField hitField = hitContext.hit().fields().get(hitFieldName);
  if (hitField == null) {
    hitField = new InternalSearchHitField(hitFieldName, new ArrayList<>(1));
    hitContext.hit().fields().put(hitFieldName, hitField);
  }

  final TermVectorsRequest request =
      new TermVectorsRequest(
          context.indexShard().shardId().getIndex().getName(),
          hitContext.hit().type(),
          hitContext.hit().id());
  final TermVectorsResponse termVector =
      TermVectorsService.getTermVectors(context.indexShard(), request);

  try {
    final Map<String, Integer> frequencies = new HashMap<>();
    final TermsEnum terms = termVector.getFields().terms(field).iterator();
    for (BytesRef term = terms.next(); term != null; term = terms.next()) {
      frequencies.put(term.utf8ToString(), terms.postings(null, PostingsEnum.ALL).freq());
    }
    hitField.values().add(frequencies);
  } catch (IOException e) {
    ESLoggerFactory.getLogger(FetchSubPhasePluginIT.class.getName())
        .info("Swallowed exception", e);
  }
}
/** * Find terms in the index based on a prefix. Useful for autocomplete. * * @param index the index * @param fieldName the field * @param prefix the prefix we're looking for (null or empty string for all terms) * @param sensitive match case-sensitively or not? * @param maxResults max. number of results to return (or -1 for all) * @return the matching terms */ public static List<String> findTermsByPrefix( LeafReader index, String fieldName, String prefix, boolean sensitive, int maxResults) { boolean allTerms = prefix == null || prefix.length() == 0; if (allTerms) { prefix = ""; sensitive = true; // don't do unnecessary work in this case } try { if (!sensitive) prefix = StringUtil.removeAccents(prefix).toLowerCase(); org.apache.lucene.index.Terms terms = index.terms(fieldName); List<String> results = new ArrayList<>(); TermsEnum termsEnum = terms.iterator(); BytesRef brPrefix = new BytesRef(prefix.getBytes(LUCENE_DEFAULT_CHARSET)); termsEnum.seekCeil(brPrefix); // find the prefix in the terms list while (maxResults < 0 || results.size() < maxResults) { BytesRef term = termsEnum.next(); if (term == null) break; String termText = term.utf8ToString(); String optDesensitized = termText; if (!sensitive) optDesensitized = StringUtil.removeAccents(termText).toLowerCase(); if (!allTerms && !optDesensitized.substring(0, prefix.length()).equalsIgnoreCase(prefix)) { // Doesn't match prefix or different field; no more matches break; } // Match, add term results.add(termText); } return results; } catch (IOException e) { throw new RuntimeException(e); } }
/**
 * Appends a human-readable form of an indexed value to {@code out}.
 *
 * <p>Without a field type the value is appended decoded as UTF-8. With a field type, the value is
 * converted through {@code ft.indexedToReadable}; if that conversion or the append throws, a
 * marker of the form {@code EXCEPTION(val=...)} carrying the UTF-8 decoded value is appended.
 *
 * @param val the indexed value
 * @param ft the field type used for conversion, or {@code null}
 * @param out destination of the text
 * @param flags not used by this method
 * @throws IOException if appending to {@code out} fails outside the guarded conversion
 */
static void writeFieldVal(BytesRef val, FieldType ft, Appendable out, int flags)
    throws IOException {
  if (ft == null) {
    out.append(val.utf8ToString());
    return;
  }
  try {
    final CharsRef readable = new CharsRef();
    ft.indexedToReadable(val, readable);
    out.append(readable);
  } catch (Exception e) {
    out.append("EXCEPTION(val=").append(val.utf8ToString()).append(")");
  }
}
@Override public int nextDoc() throws IOException { if (docID == NO_MORE_DOCS) { return docID; } boolean first = true; int termFreq = 0; while (true) { final long lineStart = in.getFilePointer(); SimpleTextUtil.readLine(in, scratch); if (StringHelper.startsWith(scratch, DOC)) { if (!first && (liveDocs == null || liveDocs.get(docID))) { in.seek(lineStart); if (!omitTF) { tf = termFreq; } return docID; } UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + DOC.length, scratch.length - DOC.length, scratchUTF16); docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); termFreq = 0; first = false; } else if (StringHelper.startsWith(scratch, FREQ)) { UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + FREQ.length, scratch.length - FREQ.length, scratchUTF16); termFreq = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); } else if (StringHelper.startsWith(scratch, POS)) { // skip termFreq++; } else if (StringHelper.startsWith(scratch, START_OFFSET)) { // skip } else if (StringHelper.startsWith(scratch, END_OFFSET)) { // skip } else if (StringHelper.startsWith(scratch, PAYLOAD)) { // skip } else { assert StringHelper.startsWith(scratch, TERM) || StringHelper.startsWith(scratch, FIELD) || StringHelper.startsWith(scratch, END) : "scratch=" + scratch.utf8ToString(); if (!first && (liveDocs == null || liveDocs.get(docID))) { in.seek(lineStart); if (!omitTF) { tf = termFreq; } return docID; } return docID = NO_MORE_DOCS; } } }
/**
 * Emits the next term of {@code termsEnum} as a token.
 *
 * @return {@code false} once the terms enum is exhausted, {@code true} otherwise
 * @throws IOException if advancing the terms enum fails
 */
@Override
public final boolean incrementToken() throws IOException {
  clearAttributes();
  final BytesRef next = termsEnum.next();
  if (next == null) {
    return false;
  }
  charTerm.setEmpty();
  charTerm.append(next.utf8ToString());
  return true;
}
// for debugging @SuppressWarnings("unused") static String brToString(BytesRef b) { try { return b.utf8ToString() + " " + b; } catch (Throwable t) { // If BytesRef isn't actually UTF8, or it's eg a // prefix of UTF8 that ends mid-unicode-char, we // fallback to hex: return b.toString(); } }
/**
 * Builds a {@link LongFilter} from this include/exclude configuration.
 *
 * <p>A partition-based configuration yields a {@code PartitionedLongFilter}. Otherwise each
 * include value and each exclude value is decoded as UTF-8, parsed with {@code format.parseLong}
 * and registered as accepted or rejected, respectively, in a {@code SetBackedLongFilter}.
 *
 * @param format the format used to parse the textual values into longs
 * @return the resulting filter
 */
public LongFilter convertToLongFilter(DocValueFormat format) {
  if (isPartitionBased()) {
    return new PartitionedLongFilter();
  }

  final int acceptCount = (includeValues != null) ? includeValues.size() : 0;
  final int rejectCount = (excludeValues != null) ? excludeValues.size() : 0;
  final SetBackedLongFilter filter = new SetBackedLongFilter(acceptCount, rejectCount);

  if (includeValues != null) {
    for (BytesRef accepted : includeValues) {
      filter.addAccept(format.parseLong(accepted.utf8ToString(), false, null));
    }
  }
  if (excludeValues != null) {
    for (BytesRef rejected : excludeValues) {
      filter.addReject(format.parseLong(rejected.utf8ToString(), false, null));
    }
  }
  return filter;
}
/**
 * Checks the single token emitted for a {@code SuggestField}, first through a plain analyzer and
 * then through the same analyzer wrapped in a {@code CompletionAnalyzer}.
 */
@Test
public void testTokenStream() throws Exception {
  Analyzer analyzer = new MockAnalyzer(random());
  SuggestField suggestField = new SuggestField("field", "input", 1);

  // Expected payload: length-prefixed surface form, the VInt 2, then the field type byte.
  BytesRef surfaceForm = new BytesRef("input");
  ByteArrayOutputStream payloadBytes = new ByteArrayOutputStream();
  try (OutputStreamDataOutput out = new OutputStreamDataOutput(payloadBytes)) {
    out.writeVInt(surfaceForm.length);
    out.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
    out.writeVInt(1 + 1);
    out.writeByte(SuggestField.TYPE);
  }
  BytesRef payload = new BytesRef(payloadBytes.toByteArray());
  String payloadAsString = payload.utf8ToString();

  TokenStream stream =
      new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(
          suggestField.tokenStream(analyzer, null));
  assertTokenStreamContents(
      stream,
      new String[] {"input"},
      null,
      null,
      new String[] {payloadAsString},
      new int[] {1},
      null,
      null);

  CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer);
  stream =
      new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(
          suggestField.tokenStream(completionAnalyzer, null));
  assertTokenStreamContents(
      stream,
      new String[] {"input"},
      null,
      null,
      new String[] {payloadAsString},
      new int[] {1},
      null,
      null);
}
/**
 * Explains the score of {@code doc} in terms of its join value.
 *
 * @param context the segment the document lives in
 * @param doc the segment-local document id
 * @return a match explanation valued {@code queryNorm} naming the join value, or a "Not a match"
 *     explanation when the document has no ordinal for the join field
 * @throws IOException if the doc values cannot be read
 */
@Override
public Explanation explain(LeafReaderContext context, int doc) throws IOException {
  final SortedDocValues values = DocValues.getSorted(context.reader(), joinField);
  if (values == null) {
    return Explanation.noMatch("Not a match");
  }
  final int segmentOrd = values.getOrd(doc);
  if (segmentOrd == -1) {
    return Explanation.noMatch("Not a match");
  }
  final BytesRef joinValue = values.lookupOrd(segmentOrd);
  return Explanation.match(queryNorm, "Score based on join value " + joinValue.utf8ToString());
}
/**
 * Creates the value processor.
 *
 * @param facets the facet counts map handed to the superclass
 * @param excluded values to exclude, each the UTF-8 text of a long; {@code null} or empty means
 *     nothing is excluded (the {@code excluded} field is then left {@code null})
 * @param script the script stored on this instance
 * @throws NumberFormatException if an excluded value is not a parsable long
 */
public AggregatorValueProc(
    LongIntOpenHashMap facets, Set<BytesRef> excluded, SearchScript script) {
  super(facets);
  this.script = script;
  if (excluded != null && !excluded.isEmpty()) {
    this.excluded = new LongOpenHashSet(excluded.size());
    for (BytesRef rawValue : excluded) {
      this.excluded.add(Long.parseLong(rawValue.utf8ToString()));
    }
  } else {
    this.excluded = null;
  }
}
/**
 * Builds a {@link LongFilter} for double-valued fields from this include/exclude configuration.
 *
 * <p>A partition-based configuration yields a {@code PartitionedLongFilter}. Otherwise each
 * include and exclude value is decoded as UTF-8, parsed as a double, converted with {@code
 * NumericUtils.doubleToSortableLong} and registered as accepted or rejected, respectively.
 *
 * @return the resulting filter
 * @throws NumberFormatException if a value is not a parsable double
 */
public LongFilter convertToDoubleFilter() {
  if (isPartitionBased()) {
    return new PartitionedLongFilter();
  }

  final int acceptCount = (includeValues != null) ? includeValues.size() : 0;
  final int rejectCount = (excludeValues != null) ? excludeValues.size() : 0;
  final SetBackedLongFilter filter = new SetBackedLongFilter(acceptCount, rejectCount);

  if (includeValues != null) {
    for (BytesRef accepted : includeValues) {
      final double parsed = Double.parseDouble(accepted.utf8ToString());
      filter.addAccept(NumericUtils.doubleToSortableLong(parsed));
    }
  }
  if (excludeValues != null) {
    for (BytesRef rejected : excludeValues) {
      final double parsed = Double.parseDouble(rejected.utf8ToString());
      filter.addReject(NumericUtils.doubleToSortableLong(parsed));
    }
  }
  return filter;
}
/**
 * Converts user-formatted values into the form produced by {@code format.parseBytesRef}.
 *
 * @param endUserFormattedValues the values as supplied by the user; may be {@code null}
 * @param format the format used to parse each value
 * @return the input set itself when it is {@code null} or the format is {@code
 *     DocValueFormat.RAW}; otherwise a new sorted set holding the parsed values
 */
private static SortedSet<BytesRef> parseForDocValues(
    SortedSet<BytesRef> endUserFormattedValues, DocValueFormat format) {
  if (endUserFormattedValues == null || format == DocValueFormat.RAW) {
    // Nothing to translate: hand back the caller's set untouched.
    return endUserFormattedValues;
  }
  final SortedSet<BytesRef> parsed = new TreeSet<>();
  for (BytesRef userValue : endUserFormattedValues) {
    parsed.add(format.parseBytesRef(userValue.utf8ToString()));
  }
  return parsed;
}
/**
 * Collects the terms of one shard.
 *
 * <p>Fields whose name starts with {@code '_'} are skipped. When the request names a field, only
 * that field is read; otherwise all remaining fields are. The work runs under {@code
 * termlistMutex}.
 *
 * @param request identifies the index, the shard and optionally the field
 * @return a response carrying the set of term texts found
 * @throws ElasticSearchException wrapping any {@link IOException} raised while reading terms
 */
@Override
protected ShardTermlistResponse shardOperation(ShardTermlistRequest request)
    throws ElasticSearchException {
  synchronized (termlistMutex) {
    InternalIndexShard indexShard =
        (InternalIndexShard)
            indicesService.indexServiceSafe(request.index()).shardSafe(request.shardId());
    Engine.Searcher searcher = indexShard.searcher();
    try {
      Set<String> set = new CompactHashSet();
      Fields fields = MultiFields.getFields(searcher.reader());
      if (fields != null) {
        for (Iterator<String> it = fields.iterator(); it.hasNext(); ) {
          String field = it.next();
          // startsWith() is also safe for an empty field name, unlike charAt(0).
          if (field.startsWith("_")) {
            continue;
          }
          if (request.getField() == null || field.equals(request.getField())) {
            Terms terms = fields.terms(field);
            if (terms != null) {
              TermsEnum termsEnum = terms.iterator(null);
              BytesRef text;
              while ((text = termsEnum.next()) != null) {
                set.add(text.utf8ToString());
              }
            }
          }
        }
      }
      return new ShardTermlistResponse(request.index(), request.shardId(), set);
    } catch (IOException ex) {
      throw new ElasticSearchException(ex.getMessage(), ex);
    } finally {
      // The searcher holds a reference on the underlying reader; always hand it back.
      searcher.release();
    }
  }
}
// for debugging String brToString(BytesRef b) { if (b == null) { return "null"; } else { try { return b.utf8ToString() + " " + b; } catch (Throwable t) { // If BytesRef isn't actually UTF8, or it's eg a // prefix of UTF8 that ends mid-unicode-char, we // fallback to hex: return b.toString(); } } }
/**
 * Updates a previous suggestion, matching the exact same text as before. Use this to change the
 * weight or payload of an already added suggestion. If you know this text is not already present
 * you can use {@link #add} instead. After adding or updating a batch of new suggestions, you must
 * call {@link #refresh} in the end in order to see the suggestions in {@link #lookup}.
 *
 * @param text the suggestion text; also the key identifying the document to replace
 * @param weight the weight stored with the suggestion
 * @param payload optional payload; nothing is stored for it when {@code null}
 * @throws IOException if the underlying writer fails
 */
public void update(BytesRef text, long weight, BytesRef payload) throws IOException {
  final String surface = text.utf8ToString();
  final FieldType textType = getTextFieldType();

  final Document suggestion = new Document();
  suggestion.add(new Field(TEXT_FIELD_NAME, surface, textType));
  suggestion.add(new Field("textgrams", surface, textType));
  suggestion.add(new StringField(EXACT_TEXT_FIELD_NAME, surface, Field.Store.NO));
  suggestion.add(new BinaryDocValuesField(TEXT_FIELD_NAME, text));
  suggestion.add(new NumericDocValuesField("weight", weight));
  if (payload != null) {
    suggestion.add(new BinaryDocValuesField("payloads", payload));
  }

  // Replaces whichever document currently carries this exact text.
  writer.updateDocument(new Term(EXACT_TEXT_FIELD_NAME, surface), suggestion);
}
/** * Create the results based on the search hits. Can be overridden by subclass to add particular * behavior (e.g. weight transformation) * * @throws IOException If there are problems reading fields from the underlying Lucene index. */ protected List<LookupResult> createResults( IndexSearcher searcher, TopFieldDocs hits, int num, CharSequence charSequence, boolean doHighlight, Set<String> matchedTokens, String prefixToken) throws IOException { BinaryDocValues textDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), TEXT_FIELD_NAME); // This will just be null if app didn't pass payloads to build(): // TODO: maybe just stored fields? they compress... BinaryDocValues payloadsDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), "payloads"); List<LookupResult> results = new ArrayList<>(); BytesRef scratch = new BytesRef(); for (int i = 0; i < hits.scoreDocs.length; i++) { FieldDoc fd = (FieldDoc) hits.scoreDocs[i]; textDV.get(fd.doc, scratch); String text = scratch.utf8ToString(); long score = (Long) fd.fields[0]; BytesRef payload; if (payloadsDV != null) { payload = new BytesRef(); payloadsDV.get(fd.doc, payload); } else { payload = null; } LookupResult result; if (doHighlight) { Object highlightKey = highlight(text, matchedTokens, prefixToken); result = new LookupResult(highlightKey.toString(), highlightKey, score, payload); } else { result = new LookupResult(text, score, payload); } results.add(result); } return results; }
/**
 * Indexes 36 single-character terms starting at character code 97 plus 128 two-character terms
 * of the form {@code "m" + (char) i}, force-merges to one segment, and checks that term ordinals
 * increase by one from zero while iterating before handing the enum to {@code testEnum}.
 */
public void testNonRootFloorBlocks() throws Exception {
  Directory directory = newDirectory();
  IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random()));
  IndexWriter writer = new IndexWriter(directory, config);
  List<String> terms = new ArrayList<>();

  for (int i = 0; i < 36; i++) {
    Document doc = new Document();
    String term = String.valueOf((char) (97 + i));
    terms.add(term);
    if (VERBOSE) {
      System.out.println("i=" + i + " term=" + term);
    }
    doc.add(newTextField("field", term, Field.Store.NO));
    writer.addDocument(doc);
  }

  for (int i = 0; i < 128; i++) {
    Document doc = new Document();
    String term = "m" + (char) i;
    terms.add(term);
    if (VERBOSE) {
      System.out.println("i=" + i + " term=" + term + " bytes=" + new BytesRef(term));
    }
    doc.add(newStringField("field", term, Field.Store.NO));
    writer.addDocument(doc);
  }

  writer.forceMerge(1);
  IndexReader reader = DirectoryReader.open(writer, true);
  TermsEnum termsEnum = MultiFields.getTerms(reader, "field").iterator(null);

  int expectedOrd = 0;
  for (BytesRef current = termsEnum.next(); current != null; current = termsEnum.next()) {
    if (VERBOSE) {
      System.out.println("TEST: " + termsEnum.ord() + ": " + current.utf8ToString());
    }
    assertEquals(expectedOrd, termsEnum.ord());
    expectedOrd++;
  }

  testEnum(termsEnum, terms);

  reader.close();
  writer.close();
  directory.close();
}
/**
 * Debug dump of the collected terms to {@code out}.
 *
 * <p>After a header line with the field name, each term (in {@code ords} order) is printed as
 * its string form and UTF-8 text, followed by its hex value with an {@code L} suffix when it
 * decodes as a prefix-coded long, or with an {@code i} suffix when it decodes as a prefix-coded
 * int, then its score and a blank line.
 *
 * @param out destination stream
 */
void dump(PrintStream out) {
  out.println(field + ":");
  final BytesRef term = new BytesRef();
  for (int i = 0; i < terms.size(); i++) {
    final int ord = ords[i];
    terms.get(ord, term);
    out.print(term + " " + term.utf8ToString() + " ");
    try {
      out.print(Long.toHexString(LegacyNumericUtils.prefixCodedToLong(term)) + "L");
    } catch (Exception notALong) {
      try {
        out.print(Integer.toHexString(LegacyNumericUtils.prefixCodedToInt(term)) + "i");
      } catch (Exception ignored) {
        // Neither a prefix-coded long nor int: print no numeric form.
      }
    }
    out.println(" score=" + scores[ord]);
    out.println("");
  }
}
/**
 * Explains the score of {@code doc} in terms of its join value.
 *
 * <p>The score is looked up in {@code collector} by global ordinal when {@code globalOrds} is
 * available, and by segment ordinal otherwise.
 *
 * @param context the segment the document lives in
 * @param doc the segment-local document id
 * @return a match explanation carrying the collected score and naming the join value, or a "Not
 *     a match" explanation when the document has no ordinal for the join field
 * @throws IOException if the doc values cannot be read
 */
@Override
public Explanation explain(LeafReaderContext context, int doc) throws IOException {
  final SortedDocValues values = DocValues.getSorted(context.reader(), joinField);
  if (values == null) {
    return Explanation.noMatch("Not a match");
  }
  final int segmentOrd = values.getOrd(doc);
  if (segmentOrd == -1) {
    return Explanation.noMatch("Not a match");
  }

  final int scoreOrd =
      (globalOrds != null)
          ? (int) globalOrds.getGlobalOrds(context.ord).get(segmentOrd)
          : segmentOrd;
  final float score = collector.score(scoreOrd);

  final BytesRef joinValue = values.lookupOrd(segmentOrd);
  return Explanation.match(score, "Score based on join value " + joinValue.utf8ToString());
}
// Produces a realistic unicode random string that // survives MockAnalyzer unchanged: private String getRandomTerm(String other) throws IOException { Analyzer a = new MockAnalyzer(random()); while (true) { String s = _TestUtil.randomRealisticUnicodeString(random()); if (other != null && s.equals(other)) { continue; } IOException priorException = null; TokenStream ts = a.tokenStream("foo", s); try { final TermToBytesRefAttribute termAtt = ts.getAttribute(TermToBytesRefAttribute.class); final BytesRef termBytes = termAtt.getBytesRef(); ts.reset(); int count = 0; boolean changed = false; while (ts.incrementToken()) { termAtt.fillBytesRef(); if (count == 0 && !termBytes.utf8ToString().equals(s)) { // The value was changed during analysis. Keep iterating so the // tokenStream is exhausted. changed = true; } count++; } ts.end(); // Did we iterate just once and the value was unchanged? if (!changed && count == 1) { return s; } } catch (IOException e) { priorException = e; } finally { IOUtils.closeWhileHandlingException(priorException, ts); } } }
/** tests a pre-intersected automaton against the original */ public void testFiniteVersusInfinite() throws Exception { for (int i = 0; i < numIterations; i++) { String reg = AutomatonTestUtil.randomRegexp(random()); Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton(); final List<BytesRef> matchedTerms = new ArrayList<BytesRef>(); for (BytesRef t : terms) { if (BasicOperations.run(automaton, t.utf8ToString())) { matchedTerms.add(t); } } Automaton alternate = BasicAutomata.makeStringUnion(matchedTerms); // System.out.println("match " + matchedTerms.size() + " " + alternate.getNumberOfStates() + " // states, sigma=" + alternate.getStartPoints().length); // AutomatonTestUtil.minimizeSimple(alternate); // System.out.println("minmize done"); AutomatonQuery a1 = new AutomatonQuery(new Term("field", ""), automaton); AutomatonQuery a2 = new AutomatonQuery(new Term("field", ""), alternate); CheckHits.checkEqual( a1, searcher.search(a1, 25).scoreDocs, searcher.search(a2, 25).scoreDocs); } }
/** seeks to every term accepted by some automata */ public void testSeeking() throws Exception { for (int i = 0; i < numIterations; i++) { String reg = AutomatonTestUtil.randomRegexp(random()); Automaton automaton = new RegExp(reg, RegExp.NONE).toAutomaton(); TermsEnum te = MultiFields.getTerms(reader, "field").iterator(null); ArrayList<BytesRef> unsortedTerms = new ArrayList<BytesRef>(terms); Collections.shuffle(unsortedTerms, random()); for (BytesRef term : unsortedTerms) { if (BasicOperations.run(automaton, term.utf8ToString())) { // term is accepted if (random().nextBoolean()) { // seek exact assertTrue(te.seekExact(term, random().nextBoolean())); } else { // seek ceil assertEquals(SeekStatus.FOUND, te.seekCeil(term, random().nextBoolean())); assertEquals(term, te.term()); } } } } }
/**
 * Indexes the 128 single-character terms with character codes 0..127, force-merges to one
 * segment, and checks seeking by term and by ordinal ("a" is 97, ordinal 98 is "b", "z" is 122).
 */
public void testFloorBlocks() throws Exception {
  Directory directory = newDirectory();
  IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random()));
  IndexWriter writer = new IndexWriter(directory, config);

  for (int i = 0; i < 128; i++) {
    Document doc = new Document();
    String term = String.valueOf((char) i);
    if (VERBOSE) {
      System.out.println("i=" + i + " term=" + term + " bytes=" + new BytesRef(term));
    }
    doc.add(newStringField("field", term, Field.Store.NO));
    writer.addDocument(doc);
  }

  writer.forceMerge(1);
  IndexReader reader = DirectoryReader.open(writer, true);
  TermsEnum termsEnum = MultiFields.getTerms(reader, "field").iterator(null);

  if (VERBOSE) {
    for (BytesRef current = termsEnum.next(); current != null; current = termsEnum.next()) {
      System.out.println(" " + termsEnum.ord() + ": " + current.utf8ToString());
    }
  }

  assertTrue(termsEnum.seekExact(new BytesRef("a")));
  assertEquals(97, termsEnum.ord());

  termsEnum.seekExact(98);
  assertEquals(new BytesRef("b"), termsEnum.term());

  assertTrue(termsEnum.seekExact(new BytesRef("z")));
  assertEquals(122, termsEnum.ord());

  reader.close();
  writer.close();
  directory.close();
}
/**
 * Visits every term of {@code fieldName} that starts with {@code prefixRef} and whose remainder
 * (the text after the first {@code prefix.length()} characters) fully matches {@code pattern}.
 *
 * <p>Iteration starts at the first term at or after the prefix and stops at the first term that
 * no longer starts with it. The matcher is reset on the way out.
 *
 * @param reader the index to read terms from
 * @param fieldName the field whose terms are examined
 * @param mtv receives each matching term
 * @throws IOException if the term dictionary cannot be read
 */
@Override
public void visitMatchingTerms(IndexReader reader, String fieldName, MatchingTermVisitor mtv)
    throws IOException {
  final int prefixLength = prefix.length();
  final Terms terms = MultiFields.getTerms(reader, fieldName);
  if (terms == null) {
    return;
  }

  final Matcher matcher = pattern.matcher("");
  try {
    final TermsEnum termsEnum = terms.iterator(null);
    final TermsEnum.SeekStatus status = termsEnum.seekCeil(prefixRef);

    // FOUND: the prefix itself is a term; NOT_FOUND: start at the ceiling term; END: no terms.
    BytesRef text =
        (status == TermsEnum.SeekStatus.FOUND)
            ? prefixRef
            : (status == TermsEnum.SeekStatus.NOT_FOUND ? termsEnum.term() : null);

    for (;
        text != null && StringHelper.startsWith(text, prefixRef);
        text = termsEnum.next()) {
      final String textString = text.utf8ToString();
      matcher.reset(textString.substring(prefixLength));
      if (matcher.matches()) {
        mtv.visitMatchingTerm(new Term(fieldName, textString));
      }
    }
  } finally {
    matcher.reset();
  }
}
/**
 * Randomized test of postings enumeration in the presence of deletions; runs at least two
 * independent iterations, each on a fresh directory and writer.
 *
 * <p>Per iteration, a random number of documents (1 up to 100 * RANDOM_MULTIPLIER) is indexed.
 * Every document carries its sequence number in "id" and one term in "field": either a fresh
 * random unicode string or, unless {@code onlyUniqueTerms} was drawn, a randomly re-used earlier
 * term. {@code docs} records, per term, the ids of the documents that contain it. After each add
 * the writer commits when {@code random().nextInt(4) == 1}, and for {@code i > 0} a random
 * earlier document is deleted by id when {@code random().nextInt(20) == 1}.
 *
 * <p>Verification: every deleted id must be reported as not live, and for 100 randomly picked
 * terms the docs enum must return exactly the recorded, non-deleted ids in recorded order,
 * followed by {@code NO_MORE_DOCS}.
 *
 * <p>NOTE(review): the statement order fixes the sequence of {@code random()} draws, so
 * reordering statements changes which data a given seed produces. The first VERBOSE message of
 * the "terms in UTF16 order" dump is split across two physical lines in this copy of the source.
 */
public void testRandom() throws Exception { int num = atLeast(2); for (int iter = 0; iter < num; iter++) { if (VERBOSE) { System.out.println("TEST: iter=" + iter); } Directory dir = newDirectory(); IndexWriter w = new IndexWriter( dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())) .setMergePolicy(NoMergePolicy.COMPOUND_FILES)); _TestUtil.keepFullyDeletedSegments(w); Map<BytesRef, List<Integer>> docs = new HashMap<BytesRef, List<Integer>>(); Set<Integer> deleted = new HashSet<Integer>(); List<BytesRef> terms = new ArrayList<BytesRef>(); int numDocs = _TestUtil.nextInt(random(), 1, 100 * RANDOM_MULTIPLIER); Document doc = new Document(); Field f = newStringField("field", "", Field.Store.NO); doc.add(f); Field id = newStringField("id", "", Field.Store.NO); doc.add(id); boolean onlyUniqueTerms = random().nextBoolean(); if (VERBOSE) { System.out.println("TEST: onlyUniqueTerms=" + onlyUniqueTerms + " numDocs=" + numDocs); } Set<BytesRef> uniqueTerms = new HashSet<BytesRef>(); for (int i = 0; i < numDocs; i++) { if (!onlyUniqueTerms && random().nextBoolean() && terms.size() > 0) { // re-use existing term BytesRef term = terms.get(random().nextInt(terms.size())); docs.get(term).add(i); f.setStringValue(term.utf8ToString()); } else { String s = _TestUtil.randomUnicodeString(random(), 10); BytesRef term = new BytesRef(s); if (!docs.containsKey(term)) { docs.put(term, new ArrayList<Integer>()); } docs.get(term).add(i); terms.add(term); uniqueTerms.add(term); f.setStringValue(s); } id.setStringValue("" + i); w.addDocument(doc); if (random().nextInt(4) == 1) { w.commit(); } if (i > 0 && random().nextInt(20) == 1) { int delID = random().nextInt(i); deleted.add(delID); w.deleteDocuments(new Term("id", "" + delID)); if (VERBOSE) { System.out.println("TEST: delete " + delID); } } } if (VERBOSE) { List<BytesRef> termsList = new ArrayList<BytesRef>(uniqueTerms); Collections.sort(termsList, BytesRef.getUTF8SortedAsUTF16Comparator()); System.out.println("TEST: 
terms in UTF16 order:"); for (BytesRef b : termsList) { System.out.println(" " + UnicodeUtil.toHexString(b.utf8ToString()) + " " + b); for (int docID : docs.get(b)) { if (deleted.contains(docID)) { System.out.println(" " + docID + " (deleted)"); } else { System.out.println(" " + docID); } } } } IndexReader reader = w.getReader(); w.close(); if (VERBOSE) { System.out.println("TEST: reader=" + reader); } Bits liveDocs = MultiFields.getLiveDocs(reader); for (int delDoc : deleted) { assertFalse(liveDocs.get(delDoc)); } for (int i = 0; i < 100; i++) { BytesRef term = terms.get(random().nextInt(terms.size())); if (VERBOSE) { System.out.println( "TEST: seek term=" + UnicodeUtil.toHexString(term.utf8ToString()) + " " + term); } DocsEnum docsEnum = _TestUtil.docs(random(), reader, "field", term, liveDocs, null, 0); assertNotNull(docsEnum); for (int docID : docs.get(term)) { if (!deleted.contains(docID)) { assertEquals(docID, docsEnum.nextDoc()); } } assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsEnum.nextDoc()); } reader.close(); dir.close(); } }
/**
 * Returns the current value as an object.
 *
 * @return the value decoded as a UTF-8 string, or {@code null} when {@code exists} is false
 */
@Override
public Object toObject() {
  if (!exists) {
    return null;
  }
  return value.utf8ToString();
}