/**
 * Sets the enum to operate in linear fashion, as we have found a looping transition at position:
 * we set an upper bound and act like a TermRangeQuery for this portion of the term space.
 */
private void setLinear(int position) {
  assert linear == false;

  int state = runAutomaton.getInitialState();
  int maxInterval = 0xff;
  for (int i = 0; i < position; i++) {
    state = runAutomaton.step(state, seekBytesRef.bytes[i] & 0xff);
    assert state >= 0 : "state=" + state;
  }
  for (int i = 0; i < allTransitions[state].length; i++) {
    Transition t = allTransitions[state][i];
    if (t.getMin() <= (seekBytesRef.bytes[position] & 0xff)
        && (seekBytesRef.bytes[position] & 0xff) <= t.getMax()) {
      maxInterval = t.getMax();
      break;
    }
  }
  // 0xff terms don't get the optimization... not worth the trouble.
  if (maxInterval != 0xff) {
    maxInterval++;
  }
  int length = position + 1; /* position + maxTransition */
  if (linearUpperBound.bytes.length < length) {
    linearUpperBound.bytes = new byte[length];
  }
  System.arraycopy(seekBytesRef.bytes, 0, linearUpperBound.bytes, 0, position);
  linearUpperBound.bytes[position] = (byte) maxInterval;
  linearUpperBound.length = length;

  linear = true;
}
private void process(int groupOrd, int facetOrd) {
  if (facetOrd < startFacetOrd || facetOrd >= endFacetOrd) {
    return;
  }

  int segmentGroupedFacetsIndex = groupOrd * (facetFieldNumTerms + 1) + facetOrd;
  if (segmentGroupedFacetHits.exists(segmentGroupedFacetsIndex)) {
    return;
  }

  segmentTotalCount++;
  segmentFacetCounts[facetOrd]++;

  segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);

  BytesRef groupKey;
  if (groupOrd == -1) {
    groupKey = null;
  } else {
    groupKey = BytesRef.deepCopyOf(groupFieldTermsIndex.lookupOrd(groupOrd));
  }

  final BytesRef facetValue;
  if (facetOrd == facetFieldNumTerms) {
    facetValue = null;
  } else {
    facetValue = BytesRef.deepCopyOf(facetFieldDocTermOrds.lookupOrd(facetOrd));
  }
  groupedFacetHits.add(new GroupedFacetHit(groupKey, facetValue));
}
@Override
public BytesRef getPayload() throws IOException {
  if (!payloadPending) {
    return null;
  }

  if (pendingPayloadBytes == 0) {
    return payload;
  }

  assert pendingPayloadBytes >= payloadLength;

  if (pendingPayloadBytes > payloadLength) {
    payloadIn.seek(payloadIn.getFilePointer() + (pendingPayloadBytes - payloadLength));
  }

  if (payload == null) {
    payload = new BytesRef();
    payload.bytes = new byte[payloadLength];
  } else if (payload.bytes.length < payloadLength) {
    payload.grow(payloadLength);
  }

  payloadIn.readBytes(payload.bytes, 0, payloadLength);
  payload.length = payloadLength;
  pendingPayloadBytes = 0;
  return payload;
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
  if (include != null) {
    builder.field(INCLUDE_FIELD.getPreferredName(), include.getOriginalString());
  } else if (includeValues != null) {
    builder.startArray(INCLUDE_FIELD.getPreferredName());
    for (BytesRef value : includeValues) {
      builder.value(value.utf8ToString());
    }
    builder.endArray();
  } else if (isPartitionBased()) {
    builder.startObject(INCLUDE_FIELD.getPreferredName());
    builder.field(PARTITION_FIELD.getPreferredName(), incZeroBasedPartition);
    builder.field(NUM_PARTITIONS_FIELD.getPreferredName(), incNumPartitions);
    builder.endObject();
  }
  if (exclude != null) {
    builder.field(EXCLUDE_FIELD.getPreferredName(), exclude.getOriginalString());
  } else if (excludeValues != null) {
    builder.startArray(EXCLUDE_FIELD.getPreferredName());
    for (BytesRef value : excludeValues) {
      builder.value(value.utf8ToString());
    }
    builder.endArray();
  }
  return builder;
}
// LUCENE-3870
public void testLengthPrefixAcrossTwoPages() throws Exception {
  Directory d = newDirectory();
  IndexWriter w =
      new IndexWriter(
          d, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
  Document doc = new Document();
  byte[] bytes = new byte[32764];
  BytesRef b = new BytesRef();
  b.bytes = bytes;
  b.length = bytes.length;
  doc.add(new SortedDocValuesField("field", b));
  w.addDocument(doc);
  bytes[0] = 1;
  w.addDocument(doc);
  w.forceMerge(1);
  DirectoryReader r = w.getReader();
  BinaryDocValues s = FieldCache.DEFAULT.getTerms(getOnlySegmentReader(r), "field");

  BytesRef bytes1 = new BytesRef();
  s.get(0, bytes1);
  assertEquals(bytes.length, bytes1.length);
  bytes[0] = 0;
  assertEquals(b, bytes1);

  s.get(1, bytes1);
  assertEquals(bytes.length, bytes1.length);
  bytes[0] = 1;
  assertEquals(b, bytes1);
  r.close();
  w.close();
  d.close();
}
@Override
public BytesRef binaryValue() {
  assertThread("Sorted doc values", creationThread);
  final BytesRef result = in.binaryValue();
  assert result.isValid();
  return result;
}
/**
 * Find terms in the index based on a prefix. Useful for autocomplete.
 *
 * @param index the index
 * @param fieldName the field
 * @param prefix the prefix we're looking for (null or empty string for all terms)
 * @param sensitive match case-sensitively or not?
 * @param maxResults max. number of results to return (or -1 for all)
 * @return the matching terms
 */
public static List<String> findTermsByPrefix(
    LeafReader index, String fieldName, String prefix, boolean sensitive, int maxResults) {
  boolean allTerms = prefix == null || prefix.length() == 0;
  if (allTerms) {
    prefix = "";
    sensitive = true; // don't do unnecessary work in this case
  }
  try {
    if (!sensitive) {
      prefix = StringUtil.removeAccents(prefix).toLowerCase();
    }
    List<String> results = new ArrayList<>();
    org.apache.lucene.index.Terms terms = index.terms(fieldName);
    if (terms == null) {
      return results; // field doesn't exist or has no indexed terms
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef brPrefix = new BytesRef(prefix.getBytes(LUCENE_DEFAULT_CHARSET));
    // find the first term at or after the prefix in the terms list
    if (termsEnum.seekCeil(brPrefix) == TermsEnum.SeekStatus.END) {
      return results;
    }
    // seekCeil already positioned us on the first candidate; don't skip it
    BytesRef term = termsEnum.term();
    while (term != null && (maxResults < 0 || results.size() < maxResults)) {
      String termText = term.utf8ToString();
      String optDesensitized = termText;
      if (!sensitive) {
        optDesensitized = StringUtil.removeAccents(termText).toLowerCase();
      }
      if (!allTerms
          && (optDesensitized.length() < prefix.length()
              || !optDesensitized.regionMatches(true, 0, prefix, 0, prefix.length()))) {
        // Doesn't match the prefix; terms are sorted, so there are no more matches
        break;
      }
      // Match, add term
      results.add(termText);
      term = termsEnum.next();
    }
    return results;
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
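// A hedged usage sketch of findTermsByPrefix above; the reader variable, field
// name, and arguments are illustrative, not taken from the original code.
static List<String> titleSuggestions(LeafReader reader) {
  // up to 10 completions for the prefix "app" in the "title" field, case-insensitively
  return findTermsByPrefix(reader, "title", "app", false, 10);
}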
@Override
public SeekStatus seekCeil(BytesRef target) throws IOException {
  // already here
  if (term != null && term.equals(target)) {
    return SeekStatus.FOUND;
  }

  int startIdx = Arrays.binarySearch(indexedTermsArray, target);

  if (startIdx >= 0) {
    // we hit the term exactly... lucky us!
    TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target);
    assert seekStatus == TermsEnum.SeekStatus.FOUND;
    ord = startIdx << indexIntervalBits;
    setTerm();
    assert term != null;
    return SeekStatus.FOUND;
  }

  // we didn't hit the term exactly
  startIdx = -startIdx - 1;

  if (startIdx == 0) {
    // our target occurs *before* the first term
    TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target);
    assert seekStatus == TermsEnum.SeekStatus.NOT_FOUND;
    ord = 0;
    setTerm();
    assert term != null;
    return SeekStatus.NOT_FOUND;
  }

  // back up to the start of the block
  startIdx--;

  if ((ord >> indexIntervalBits) == startIdx && term != null && term.compareTo(target) <= 0) {
    // we are already in the right block and the current term is before the term we want,
    // so we don't need to seek.
  } else {
    // seek to the right block
    TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(indexedTermsArray[startIdx]);
    assert seekStatus == TermsEnum.SeekStatus.FOUND;
    ord = startIdx << indexIntervalBits;
    setTerm();
    assert term != null; // should be non-null since it's in the index
  }

  while (term != null && term.compareTo(target) < 0) {
    next();
  }

  if (term == null) {
    return SeekStatus.END;
  } else if (term.compareTo(target) == 0) {
    return SeekStatus.FOUND;
  } else {
    return SeekStatus.NOT_FOUND;
  }
}
@Override
public BytesRef writeToBytes() {
  long start = System.nanoTime();
  int size = set.size();

  BytesRef bytes = new BytesRef(new byte[HEADER_SIZE + (int) bytesUsed.get()]);

  // Encode encoding type
  Bytes.writeInt(bytes, this.getEncoding().ordinal());

  // Encode flag
  bytes.bytes[bytes.offset++] = (byte) (this.isPruned() ? 1 : 0);

  // Encode size of the set
  Bytes.writeInt(bytes, size);

  // Encode longs
  BytesRef reusable = new BytesRef();
  for (int i = 0; i < this.set.size(); i++) {
    this.set.get(i, reusable);
    Bytes.writeBytesRef(reusable, bytes);
  }

  logger.debug(
      "Serialized {} terms - took {} ms", this.size(), (System.nanoTime() - start) / 1000000);

  bytes.length = bytes.offset;
  bytes.offset = 0;
  return bytes;
}
@Override
public void hitExecute(SearchContext context, HitContext hitContext) {
  if (context.getFetchSubPhaseContext(CONTEXT_FACTORY).hitExecutionNeeded() == false) {
    return;
  }
  String field = context.getFetchSubPhaseContext(CONTEXT_FACTORY).getField();
  if (hitContext.hit().fieldsOrNull() == null) {
    hitContext.hit().fields(new HashMap<>());
  }
  SearchHitField hitField = hitContext.hit().fields().get(NAMES[0]);
  if (hitField == null) {
    hitField = new InternalSearchHitField(NAMES[0], new ArrayList<>(1));
    hitContext.hit().fields().put(NAMES[0], hitField);
  }
  TermVectorsResponse termVector =
      TermVectorsService.getTermVectors(
          context.indexShard(),
          new TermVectorsRequest(
              context.indexShard().shardId().getIndex().getName(),
              hitContext.hit().type(),
              hitContext.hit().id()));
  try {
    Map<String, Integer> tv = new HashMap<>();
    TermsEnum terms = termVector.getFields().terms(field).iterator();
    BytesRef term;
    while ((term = terms.next()) != null) {
      tv.put(term.utf8ToString(), terms.postings(null, PostingsEnum.ALL).freq());
    }
    hitField.values().add(tv);
  } catch (IOException e) {
    ESLoggerFactory.getLogger(FetchSubPhasePluginIT.class.getName())
        .info("Swallowed exception", e);
  }
}
public void testIntStream() throws Exception {
  final NumericTokenStream stream = new NumericTokenStream().setIntValue(ivalue);
  // use getAttribute to test if attributes really exist; if not, an IAE will be thrown
  final TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  final TypeAttribute typeAtt = stream.getAttribute(TypeAttribute.class);
  final NumericTokenStream.NumericTermAttribute numericAtt =
      stream.getAttribute(NumericTokenStream.NumericTermAttribute.class);
  final BytesRef bytes = bytesAtt.getBytesRef();
  stream.reset();
  assertEquals(32, numericAtt.getValueSize());
  for (int shift = 0; shift < 32; shift += NumericUtils.PRECISION_STEP_DEFAULT) {
    assertTrue("New token is available", stream.incrementToken());
    assertEquals("Shift value wrong", shift, numericAtt.getShift());
    final int hash = bytesAtt.fillBytesRef();
    assertEquals("Hash incorrect", bytes.hashCode(), hash);
    assertEquals(
        "Term is incorrectly encoded",
        ivalue & ~((1 << shift) - 1),
        NumericUtils.prefixCodedToInt(bytes));
    assertEquals(
        "Term raw value is incorrectly encoded",
        ((long) ivalue) & ~((1L << shift) - 1L),
        numericAtt.getRawValue());
    assertEquals(
        "Type incorrect",
        (shift == 0)
            ? NumericTokenStream.TOKEN_TYPE_FULL_PREC
            : NumericTokenStream.TOKEN_TYPE_LOWER_PREC,
        typeAtt.type());
  }
  assertFalse("More tokens available", stream.incrementToken());
  stream.end();
  stream.close();
}
private static BytesRef readBytesRef(IndexInput in) throws IOException {
  BytesRef bytes = new BytesRef();
  bytes.length = in.readVInt();
  bytes.bytes = new byte[bytes.length];
  in.readBytes(bytes.bytes, 0, bytes.length);
  return bytes;
}
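// The write side is not shown in this section; as a hedged sketch, the inverse
// of readBytesRef above would emit the length as a VInt followed by the raw
// bytes. The helper name writeBytesRef is hypothetical.
private static void writeBytesRef(IndexOutput out, BytesRef bytes) throws IOException {
  out.writeVInt(bytes.length);
  out.writeBytes(bytes.bytes, bytes.offset, bytes.length);
}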
/**
 * Returns the next String in lexicographic order that will not put the machine into a reject
 * state.
 *
 * <p>This method traverses the DFA from the given position in the String, starting at the given
 * state.
 *
 * <p>If this cannot satisfy the machine, returns false. This method will walk the minimal path,
 * in lexicographic order, as long as possible.
 *
 * <p>If this method returns false, then there might still be more solutions; it is necessary to
 * backtrack to find out.
 *
 * @param state current non-reject state
 * @param position useful portion of the string
 * @return true if more possible solutions exist for the DFA from this position
 */
private boolean nextString(int state, int position) {
  /*
   * the next lexicographic character must be greater than the existing
   * character, if it exists.
   */
  int c = 0;
  if (position < seekBytesRef.length) {
    c = seekBytesRef.bytes[position] & 0xff;
    // if the next byte is 0xff and is not part of the useful portion,
    // then by definition it puts us in a reject state, and therefore this
    // path is dead. there cannot be any higher transitions. backtrack.
    if (c++ == 0xff) {
      return false;
    }
  }

  seekBytesRef.length = position;
  visited[state] = curGen;

  Transition[] transitions = allTransitions[state];

  // find the minimal path (lexicographic order) that is >= c
  for (int i = 0; i < transitions.length; i++) {
    Transition transition = transitions[i];
    if (transition.getMax() >= c) {
      int nextChar = Math.max(c, transition.getMin());
      // append either the next sequential char, or the minimum transition
      seekBytesRef.grow(seekBytesRef.length + 1);
      seekBytesRef.length++;
      seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) nextChar;
      state = transition.getDest().getNumber();
      /*
       * as long as is possible, continue down the minimal path in
       * lexicographic order. if a loop or accept state is encountered, stop.
       */
      while (visited[state] != curGen && !runAutomaton.isAccept(state)) {
        visited[state] = curGen;
        /*
         * Note: we work with a DFA with no transitions to dead states.
         * so the below is ok: if it is not an accept state,
         * then there MUST be at least one transition.
         */
        transition = allTransitions[state][0];
        state = transition.getDest().getNumber();

        // append the minimum transition
        seekBytesRef.grow(seekBytesRef.length + 1);
        seekBytesRef.length++;
        seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) transition.getMin();

        // we found a loop, record it for faster enumeration
        if (!finite && !linear && visited[state] == curGen) {
          setLinear(seekBytesRef.length - 1);
        }
      }
      return true;
    }
  }
  return false;
}
@Override
public BytesRef lookupOrd(long ord) {
  assertThread("Sorted set doc values", creationThread);
  assert ord >= 0 && ord < valueCount;
  final BytesRef result = in.lookupOrd(ord);
  assert result.isValid();
  return result;
}
@Override
public BytesRef term() throws IOException {
  assertThread("Terms enums", creationThread);
  assert state == State.POSITIONED : "term() called on unpositioned TermsEnum";
  BytesRef ret = super.term();
  assert ret == null || ret.isValid();
  return ret;
}
/**
 * Ensure we own term.bytes so that it's safe to modify. We detect this via a kluge in which
 * cellsByLevel[0].termBuf is non-null: a buffer pre-allocated for use to replace term.bytes.
 */
void ensureOwnTermBytes() {
  NRCell cell0 = cellsByLevel[0];
  if (cell0.termBuf == null) {
    return; // we already own the bytes
  }
  System.arraycopy(term.bytes, term.offset, cell0.termBuf, 0, term.length);
  term.bytes = cell0.termBuf;
  term.offset = 0;
  cell0.termBuf = null;
}
protected static void copy(BytesRef from, BytesRef to) {
  if (to.bytes.length < from.length) {
    to.bytes = new byte[ArrayUtil.oversize(from.length, RamUsageEstimator.NUM_BYTES_BYTE)];
  }
  to.offset = 0;
  to.length = from.length;
  System.arraycopy(from.bytes, from.offset, to.bytes, 0, from.length);
}
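// Hypothetical usage of copy(...) above, assuming it lives in the same class:
// reuse one scratch BytesRef across iterations so the target buffer is only
// reallocated when a longer term arrives.
static void printTerms(List<BytesRef> terms) {
  BytesRef scratch = new BytesRef();
  for (BytesRef term : terms) {
    copy(term, scratch); // grows scratch.bytes only when needed
    System.out.println(scratch.utf8ToString());
  }
}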
@Override
public void seekExact(BytesRef target, TermState otherState) {
  if (!target.equals(term)) {
    state.copyFrom(otherState);
    term = BytesRef.deepCopyOf(target);
    seekPending = true;
  }
}
@Override
public BytesRef getTokenBytesNoLeaf(BytesRef result) {
  if (result == null) {
    result = new BytesRef();
  }
  result.bytes = term.bytes;
  result.offset = term.offset;
  result.length = termLenByLevel[cellLevel];
  assert result.length <= term.length;
  return result;
}
@Test
public void testNoCopy() throws Exception {
  BytesRef ref = new BytesRef("i do not want to be copied!");
  BytesRef sub1 = SubstrFunction.substring(ref, 0, 10);
  BytesRef sub2 = SubstrFunction.substring(ref, 5, 14);

  assertThat(sub1.utf8ToString(), is("i do not w"));
  assertThat(sub2.utf8ToString(), is("not want "));
  assertThat(ref.bytes, allOf(is(sub2.bytes), is(sub1.bytes)));
}
@Override
public long lookupTerm(BytesRef key) {
  assertThread("Sorted set doc values", creationThread);
  assert key.isValid();
  long result = in.lookupTerm(key);
  assert result < valueCount;
  assert key.isValid();
  return result;
}
public static void fillQueue(TermsEnum termsEnum, TermStatsQueue tiq, String field)
    throws Exception {
  BytesRef term;
  while ((term = termsEnum.next()) != null) {
    BytesRef r = new BytesRef();
    r.copyBytes(term);
    tiq.insertWithOverflow(new TermStats(field, r, termsEnum.docFreq()));
  }
}
@Override
public int hashCode() {
  final int prime = 31;
  int result = super.hashCode();
  result = prime * result + (includeLower ? 1231 : 1237);
  result = prime * result + (includeUpper ? 1231 : 1237);
  result = prime * result + ((lowerTerm == null) ? 0 : lowerTerm.hashCode());
  result = prime * result + ((upperTerm == null) ? 0 : upperTerm.hashCode());
  return result;
}
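// The 1231/1237 constants are the conventional IDE-generated encoding of
// booleans; the equals() that pairs with such a hashCode() would compare the
// same four fields. A minimal sketch under that assumption: "RangeNode" is a
// hypothetical stand-in for the enclosing class, and java.util.Objects is used
// for null-safe comparison.
@Override
public boolean equals(Object obj) {
  if (this == obj) {
    return true;
  }
  if (!super.equals(obj) || getClass() != obj.getClass()) {
    return false;
  }
  RangeNode other = (RangeNode) obj; // hypothetical class name
  return includeLower == other.includeLower
      && includeUpper == other.includeUpper
      && Objects.equals(lowerTerm, other.lowerTerm)
      && Objects.equals(upperTerm, other.upperTerm);
}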
@Override
public final boolean incrementToken() throws IOException {
  clearAttributes();
  BytesRef bytes = termsEnum.next();
  if (bytes == null) {
    return false;
  }
  charTerm.setEmpty();
  charTerm.append(bytes.utf8ToString());
  return true;
}
@Override
public void collect(int doc) throws IOException {
  if (doc > facetFieldTermsIndex.docID()) {
    facetFieldTermsIndex.advance(doc);
  }

  int facetOrd;
  if (doc == facetFieldTermsIndex.docID()) {
    facetOrd = facetFieldTermsIndex.ordValue();
  } else {
    facetOrd = -1;
  }

  if (facetOrd < startFacetOrd || facetOrd >= endFacetOrd) {
    return;
  }

  if (doc > groupFieldTermsIndex.docID()) {
    groupFieldTermsIndex.advance(doc);
  }

  int groupOrd;
  if (doc == groupFieldTermsIndex.docID()) {
    groupOrd = groupFieldTermsIndex.ordValue();
  } else {
    groupOrd = -1;
  }

  int segmentGroupedFacetsIndex =
      groupOrd * (facetFieldTermsIndex.getValueCount() + 1) + facetOrd;
  if (segmentGroupedFacetHits.exists(segmentGroupedFacetsIndex)) {
    return;
  }

  segmentTotalCount++;
  segmentFacetCounts[facetOrd + 1]++;

  segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);

  BytesRef groupKey;
  if (groupOrd == -1) {
    groupKey = null;
  } else {
    groupKey = BytesRef.deepCopyOf(groupFieldTermsIndex.lookupOrd(groupOrd));
  }

  BytesRef facetKey;
  if (facetOrd == -1) {
    facetKey = null;
  } else {
    facetKey = BytesRef.deepCopyOf(facetFieldTermsIndex.lookupOrd(facetOrd));
  }

  groupedFacetHits.add(new GroupedFacetHit(groupKey, facetKey));
}
// for debugging
@SuppressWarnings("unused")
static String brToString(BytesRef b) {
  try {
    return b.utf8ToString() + " " + b;
  } catch (Throwable t) {
    // If BytesRef isn't actually UTF8, or it's e.g. a
    // prefix of UTF8 that ends mid-unicode-char, we
    // fall back to hex:
    return b.toString();
  }
}
@Test
public void testTokenStream() throws Exception {
  Analyzer analyzer = new MockAnalyzer(random());
  ContextSuggestField field =
      new ContextSuggestField("field", "input", 1, "context1", "context2");
  BytesRef surfaceForm = new BytesRef("input");
  ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
  try (OutputStreamDataOutput output = new OutputStreamDataOutput(byteArrayOutputStream)) {
    output.writeVInt(surfaceForm.length);
    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
    output.writeVInt(1 + 1);
    output.writeByte(ContextSuggestField.TYPE);
  }
  BytesRef payload = new BytesRef(byteArrayOutputStream.toByteArray());

  String[] expectedOutputs = new String[2];
  CharsRefBuilder builder = new CharsRefBuilder();
  builder.append("context1");
  builder.append((char) ContextSuggestField.CONTEXT_SEPARATOR);
  builder.append((char) CompletionAnalyzer.SEP_LABEL);
  builder.append("input");
  expectedOutputs[0] = builder.toCharsRef().toString();
  builder.clear();
  builder.append("context2");
  builder.append((char) ContextSuggestField.CONTEXT_SEPARATOR);
  builder.append((char) CompletionAnalyzer.SEP_LABEL);
  builder.append("input");
  expectedOutputs[1] = builder.toCharsRef().toString();

  TokenStream stream =
      new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(
          field.tokenStream(analyzer, null));
  assertTokenStreamContents(
      stream,
      expectedOutputs,
      null,
      null,
      new String[] {payload.utf8ToString(), payload.utf8ToString()},
      new int[] {1, 1},
      null,
      null);

  CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer);
  stream =
      new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(
          field.tokenStream(completionAnalyzer, null));
  assertTokenStreamContents(
      stream,
      expectedOutputs,
      null,
      null,
      new String[] {payload.utf8ToString(), payload.utf8ToString()},
      new int[] {1, 1},
      null,
      null);
}
private BytesRef bytesFromTokenStream(TokenStream stream) throws IOException {
  TermToBytesRefAttribute termAttr = stream.getAttribute(TermToBytesRefAttribute.class);
  BytesRef bytesRef = termAttr.getBytesRef();
  stream.reset();
  while (stream.incrementToken()) {
    termAttr.fillBytesRef();
  }
  stream.close();
  BytesRef copy = new BytesRef();
  copy.copyBytes(bytesRef);
  return copy;
}
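// A hedged caller sketch for bytesFromTokenStream above; the analyzer, field
// name, and input text are illustrative only. Note the helper consumes and
// closes the stream itself, so no try-with-resources is needed here.
BytesRef encodeSingleValue(Analyzer analyzer) throws IOException {
  return bytesFromTokenStream(analyzer.tokenStream("field", "Hello"));
}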
/** decodes the payload at the current position */
protected BytesRef decodePayload(BytesRef scratch, ByteArrayDataInput tmpInput) {
  tmpInput.reset(scratch.bytes);
  tmpInput.skipBytes(scratch.length - 2); // skip to payload size
  short payloadLength = tmpInput.readShort(); // read payload size
  tmpInput.setPosition(scratch.length - 2 - payloadLength); // setPosition to start of payload
  BytesRef payloadScratch = new BytesRef(payloadLength);
  tmpInput.readBytes(payloadScratch.bytes, 0, payloadLength); // read payload
  payloadScratch.length = payloadLength;
  scratch.length -= 2; // payload length info (short)
  scratch.length -= payloadLength; // payload
  return payloadScratch;
}
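// decodePayload above assumes the trailing layout [payload bytes][length as a
// big-endian short]. A hypothetical encoder matching that layout; the helper
// name encodePayload and the manually written big-endian bytes (mirroring what
// ByteArrayDataInput.readShort() decodes in this era of Lucene) are assumptions.
static BytesRef encodePayload(BytesRef surface, BytesRef payload) {
  byte[] buf = new byte[surface.length + payload.length + 2];
  System.arraycopy(surface.bytes, surface.offset, buf, 0, surface.length);
  System.arraycopy(payload.bytes, payload.offset, buf, surface.length, payload.length);
  buf[buf.length - 2] = (byte) (payload.length >>> 8); // high byte of the length
  buf[buf.length - 1] = (byte) payload.length; // low byte of the length
  return new BytesRef(buf);
}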
@Override
public int nextPosition() throws IOException {
  final int pos;
  if (readPositions) {
    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch, POS) : "got line=" + scratch.utf8ToString();
    UnicodeUtil.UTF8toUTF16(
        scratch.bytes,
        scratch.offset + POS.length,
        scratch.length - POS.length,
        scratchUTF16_2);
    pos = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
  } else {
    pos = -1;
  }

  if (readOffsets) {
    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch, START_OFFSET) : "got line=" + scratch.utf8ToString();
    UnicodeUtil.UTF8toUTF16(
        scratch.bytes,
        scratch.offset + START_OFFSET.length,
        scratch.length - START_OFFSET.length,
        scratchUTF16_2);
    startOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);

    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch, END_OFFSET) : "got line=" + scratch.utf8ToString();
    UnicodeUtil.UTF8toUTF16(
        scratch.bytes,
        scratch.offset + END_OFFSET.length,
        scratch.length - END_OFFSET.length,
        scratchUTF16_2);
    endOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
  }

  final long fp = in.getFilePointer();
  SimpleTextUtil.readLine(in, scratch);
  if (StringHelper.startsWith(scratch, PAYLOAD)) {
    final int len = scratch.length - PAYLOAD.length;
    if (scratch2.bytes.length < len) {
      scratch2.grow(len);
    }
    System.arraycopy(scratch.bytes, PAYLOAD.length, scratch2.bytes, 0, len);
    scratch2.length = len;
    payload = scratch2;
  } else {
    payload = null;
    in.seek(fp);
  }
  return pos;
}