@Override
public BytesRef indexedValueForSearch(Object value) {
  int intValue = NumericUtils.floatToSortableInt(parseValue(value));
  BytesRefBuilder bytesRef = new BytesRefBuilder();
  NumericUtils.intToPrefixCoded(intValue, 0, bytesRef); // 0 because of exact match
  return bytesRef.get();
}
static Query parseQueryString(
    ExtendedCommonTermsQuery query,
    Object queryString,
    String field,
    Analyzer analyzer,
    String lowFreqMinimumShouldMatch,
    String highFreqMinimumShouldMatch)
    throws IOException {
  // Logic similar to QueryParser#getFieldQuery
  int count = 0;
  try (TokenStream source = analyzer.tokenStream(field, queryString.toString())) {
    source.reset();
    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
    BytesRefBuilder builder = new BytesRefBuilder();
    while (source.incrementToken()) {
      // UTF-8
      builder.copyChars(termAtt);
      query.add(new Term(field, builder.toBytesRef()));
      count++;
    }
  }
  if (count == 0) {
    return null;
  }
  query.setLowFreqMinimumNumberShouldMatch(lowFreqMinimumShouldMatch);
  query.setHighFreqMinimumNumberShouldMatch(highFreqMinimumShouldMatch);
  return query;
}
@Override
public BytesRef indexedValueForSearch(Object value) {
  BytesRefBuilder bytesRef = new BytesRefBuilder();
  LegacyNumericUtils.intToPrefixCoded(parseValue(value), 0, bytesRef); // 0 because of exact match
  return bytesRef.get();
}
void processQuery(Query query, ParseContext context) {
  ParseContext.Document doc = context.doc();
  FieldType pft = (FieldType) this.fieldType();
  QueryAnalyzer.Result result;
  try {
    result = QueryAnalyzer.analyze(query);
  } catch (QueryAnalyzer.UnsupportedQueryException e) {
    doc.add(
        new Field(
            pft.extractionResultField.name(), EXTRACTION_FAILED, extractionResultField.fieldType()));
    return;
  }
  for (Term term : result.terms) {
    BytesRefBuilder builder = new BytesRefBuilder();
    builder.append(new BytesRef(term.field()));
    builder.append(FIELD_VALUE_SEPARATOR);
    builder.append(term.bytes());
    doc.add(new Field(queryTermsField.name(), builder.toBytesRef(), queryTermsField.fieldType()));
  }
  if (result.verified) {
    doc.add(
        new Field(
            extractionResultField.name(), EXTRACTION_COMPLETE, extractionResultField.fieldType()));
  } else {
    doc.add(
        new Field(
            extractionResultField.name(), EXTRACTION_PARTIAL, extractionResultField.fieldType()));
  }
}
/* Decodes only the term bytes of the next term. If the caller then asks for
   metadata, i.e. docFreq, totalTermFreq, or pulls a D&PEnum, we then (lazily)
   decode all metadata up to the current term. */
private BytesRef _next() throws IOException {
  // System.out.println("BTR._next seg=" + segment + " this=" + this + " termCount=" + state.termBlockOrd + " (vs " + blockTermCount + ")");
  if (state.termBlockOrd == blockTermCount && !nextBlock()) {
    // System.out.println("  eof");
    indexIsCurrent = false;
    return null;
  }

  // TODO: cutover to something better for these ints! simple64?
  final int suffix = termSuffixesReader.readVInt();
  // System.out.println("  suffix=" + suffix);

  term.setLength(termBlockPrefix + suffix);
  term.grow(term.length());
  termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
  state.termBlockOrd++;

  // NOTE: meaningless in the non-ord case
  state.ord++;

  // System.out.println("  return term=" + fieldInfo.name + ":" + term.utf8ToString() + " " + term + " tbOrd=" + state.termBlockOrd);
  return term.get();
}
@Override
public BytesRef indexedValueForSearch(Object value) {
  long longValue = NumericUtils.doubleToSortableLong(parseDoubleValue(value));
  BytesRefBuilder bytesRef = new BytesRefBuilder();
  NumericUtils.longToPrefixCoded(longValue, 0, bytesRef); // 0 because of exact match
  return bytesRef.get();
}
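// Hedged usage sketch (editor addition, not part of the sources above): the
// indexedValueForSearch variants all encode at shift 0, the full-precision
// term that exact-match queries hit. Assuming the Lucene 4/5-era NumericUtils
// API, the encoding round-trips like this:
static void exactMatchRoundTripExample() {
  BytesRefBuilder b = new BytesRefBuilder();
  long sortable = NumericUtils.doubleToSortableLong(42.5);
  NumericUtils.longToPrefixCoded(sortable, 0, b); // shift 0 => exact-match term
  long back = NumericUtils.prefixCodedToLong(b.get());
  assert NumericUtils.sortableLongToDouble(back) == 42.5;
}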
Query createCandidateQuery(IndexReader indexReader) throws IOException {
  List<Term> extractedTerms = new ArrayList<>();
  // Include extractionResultField:failed, because docs with this term have no
  // extractedTermsField and otherwise we would fail to return these docs.
  // Docs that failed query term extraction always need to be verified by MemoryIndex:
  extractedTerms.add(new Term(extractionResultField.name(), EXTRACTION_FAILED));

  LeafReader reader = indexReader.leaves().get(0).reader();
  Fields fields = reader.fields();
  for (String field : fields) {
    Terms terms = fields.terms(field);
    if (terms == null) {
      continue;
    }

    BytesRef fieldBr = new BytesRef(field);
    TermsEnum tenum = terms.iterator();
    for (BytesRef term = tenum.next(); term != null; term = tenum.next()) {
      BytesRefBuilder builder = new BytesRefBuilder();
      builder.append(fieldBr);
      builder.append(FIELD_VALUE_SEPARATOR);
      builder.append(term);
      extractedTerms.add(new Term(queryTermsField.name(), builder.toBytesRef()));
    }
  }
  return new TermsQuery(extractedTerms);
}
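// Hedged illustration (editor addition): the candidate terms built above pack
// "<field bytes> <separator byte> <term bytes>" into one BytesRef. With a
// hypothetical separator value, the encoding reduces to:
static BytesRef encodeFieldTermExample(String field, BytesRef term, byte separator) {
  BytesRefBuilder builder = new BytesRefBuilder();
  builder.append(new BytesRef(field)); // field name first
  builder.append(separator);           // separator byte, e.g. FIELD_VALUE_SEPARATOR
  builder.append(term);                // then the raw term bytes
  return builder.toBytesRef();         // independent copy; safe to keep around
}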
/** Unmarshals a string-based field value. */
protected static Object unmarshalStringSortValue(Object value) {
  if (null == value) {
    return null;
  }
  BytesRefBuilder spare = new BytesRefBuilder();
  String stringVal = (String) value;
  spare.copyChars(stringVal);
  return spare.get();
}
/**
 * Converts a list of long values to their {@link BytesRef} representation, as
 * performed by {@link org.apache.lucene.analysis.NumericTokenStream}.
 */
private BytesRef[] toBytesRefs(long[] values) {
  BytesRef[] bytesRefs = new BytesRef[values.length];
  for (int i = 0; i < values.length; i++) {
    BytesRefBuilder b = new BytesRefBuilder();
    NumericUtils.longToPrefixCoded(values[i], 0, b);
    bytesRefs[i] = b.toBytesRef();
  }
  return bytesRefs;
}
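// Editor's note (hedged sketch): toBytesRefs allocates a fresh builder per
// value, so get() or toBytesRef() would both work there. The distinction
// matters when a builder is reused: get() returns a live view over the
// builder's internal buffer, while toBytesRef() makes an independent copy.
static void viewVersusCopyExample() {
  BytesRefBuilder b = new BytesRefBuilder();
  b.copyChars("first");
  BytesRef copy = b.toBytesRef(); // detached copy, safe to store
  BytesRef view = b.get();        // live view; valid only until b changes
  b.copyChars("second");          // may overwrite (or reallocate) view's bytes
  assert copy.utf8ToString().equals("first"); // the copy is unaffected
  // 'view' must not be trusted here: its contents depend on how b grew.
}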
public void testRandomReads() throws IOException {
  int length = randomIntBetween(10, scaledRandomIntBetween(PAGE_SIZE * 2, PAGE_SIZE * 20));
  BytesReference pbr = newBytesReference(length);
  StreamInput streamInput = pbr.streamInput();
  BytesRefBuilder target = new BytesRefBuilder();
  while (target.length() < pbr.length()) {
    switch (randomIntBetween(0, 10)) {
      case 6:
      case 5:
        target.append(new BytesRef(new byte[] {streamInput.readByte()}));
        break;
      case 4:
      case 3:
        BytesRef bytesRef =
            streamInput.readBytesRef(scaledRandomIntBetween(1, pbr.length() - target.length()));
        target.append(bytesRef);
        break;
      default:
        byte[] buffer = new byte[scaledRandomIntBetween(1, pbr.length() - target.length())];
        int offset = scaledRandomIntBetween(0, buffer.length - 1);
        int read = streamInput.read(buffer, offset, buffer.length - offset);
        target.append(new BytesRef(buffer, offset, read));
        break;
    }
  }
  assertEquals(pbr.length(), target.length());
  BytesRef targetBytes = target.get();
  assertArrayEquals(
      pbr.toBytes(), Arrays.copyOfRange(targetBytes.bytes, targetBytes.offset, targetBytes.length));
}
public void testIterator() throws IOException {
  int length = randomIntBetween(10, PAGE_SIZE * randomIntBetween(2, 8));
  BytesReference pbr = newBytesReference(length);
  BytesRefIterator iterator = pbr.iterator();
  BytesRef ref;
  BytesRefBuilder builder = new BytesRefBuilder();
  while ((ref = iterator.next()) != null) {
    builder.append(ref);
  }
  assertArrayEquals(pbr.toBytes(), BytesRef.deepCopyOf(builder.toBytesRef()).bytes);
}
@Override
protected void doSetNextReader(LeafReaderContext context) throws IOException {
  if (segmentFacetCounts != null) {
    segmentResults.add(createSegmentResult());
  }

  groupFieldTermsIndex = DocValues.getSorted(context.reader(), groupField);
  facetFieldTermsIndex = DocValues.getSorted(context.reader(), facetField);

  // 1+ to allow for the -1 "not set":
  segmentFacetCounts = new int[facetFieldTermsIndex.getValueCount() + 1];
  segmentTotalCount = 0;

  segmentGroupedFacetHits.clear();
  for (GroupedFacetHit groupedFacetHit : groupedFacetHits) {
    int facetOrd =
        groupedFacetHit.facetValue == null
            ? -1
            : facetFieldTermsIndex.lookupTerm(groupedFacetHit.facetValue);
    if (groupedFacetHit.facetValue != null && facetOrd < 0) {
      continue;
    }

    int groupOrd =
        groupedFacetHit.groupValue == null
            ? -1
            : groupFieldTermsIndex.lookupTerm(groupedFacetHit.groupValue);
    if (groupedFacetHit.groupValue != null && groupOrd < 0) {
      continue;
    }

    int segmentGroupedFacetsIndex =
        groupOrd * (facetFieldTermsIndex.getValueCount() + 1) + facetOrd;
    segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
  }

  if (facetPrefix != null) {
    startFacetOrd = facetFieldTermsIndex.lookupTerm(facetPrefix);
    if (startFacetOrd < 0) {
      // Points to the ord one higher than facetPrefix
      startFacetOrd = -startFacetOrd - 1;
    }
    BytesRefBuilder facetEndPrefix = new BytesRefBuilder();
    facetEndPrefix.append(facetPrefix);
    facetEndPrefix.append(UnicodeUtil.BIG_TERM);
    endFacetOrd = facetFieldTermsIndex.lookupTerm(facetEndPrefix.get());
    assert endFacetOrd < 0;
    endFacetOrd = -endFacetOrd - 1; // Points to the ord one higher than facetEndPrefix
  } else {
    startFacetOrd = -1;
    endFacetOrd = facetFieldTermsIndex.getValueCount();
  }
}
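// Hedged sketch (editor addition): the prefix-range trick above generalizes.
// Appending UnicodeUtil.BIG_TERM (which sorts after any real UTF-8 term) to a
// prefix yields an exclusive upper bound for ord lookups. The helper below is
// hypothetical; exact lookupTerm signatures vary across Lucene versions.
static int[] prefixOrdRangeExample(SortedDocValues dv, BytesRef prefix) throws IOException {
  int start = dv.lookupTerm(prefix);
  if (start < 0) {
    start = -start - 1; // first ord at or after the prefix
  }
  BytesRefBuilder end = new BytesRefBuilder();
  end.append(prefix);
  end.append(UnicodeUtil.BIG_TERM);
  int stop = dv.lookupTerm(end.get());
  stop = stop < 0 ? -stop - 1 : stop; // first ord past any prefixed term
  return new int[] {start, stop};
}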
@Override
public void seekExact(long ord) throws IOException {
  // TODO: would be better to make this simpler and faster,
  // but we don't want to introduce a bug that corrupts our enum state!
  bytesReader.setPosition(0);
  fst.getFirstArc(firstArc);
  IntsRef output = Util.getByOutput(fst, ord, bytesReader, firstArc, scratchArc, scratchInts);
  BytesRefBuilder scratchBytes = new BytesRefBuilder();
  scratchBytes.clear();
  Util.toBytesRef(output, scratchBytes);
  // TODO: we could do this lazily, better to try to push into FSTEnum though?
  in.seekExact(scratchBytes.get());
}
public void testSliceIterator() throws IOException {
  int length = randomIntBetween(10, PAGE_SIZE * randomIntBetween(2, 8));
  BytesReference pbr = newBytesReference(length);
  int sliceOffset = randomIntBetween(0, pbr.length());
  // The original randomIntBetween(x, x) call always returned x, so just take
  // the remainder directly:
  int sliceLength = pbr.length() - sliceOffset;
  BytesReference slice = pbr.slice(sliceOffset, sliceLength);
  BytesRefIterator iterator = slice.iterator();
  BytesRef ref = null;
  BytesRefBuilder builder = new BytesRefBuilder();
  while ((ref = iterator.next()) != null) {
    builder.append(ref);
  }
  assertArrayEquals(slice.toBytes(), BytesRef.deepCopyOf(builder.toBytesRef()).bytes);
}
// We don't actually write a .fdx-like index; instead we read the stored
// fields file in its entirety up-front and save the offsets, so we can seek
// to the documents later.
private void readIndex(int size) throws IOException {
  ChecksumIndexInput input = new BufferedChecksumIndexInput(in);
  offsets = new long[size];
  int upto = 0;
  while (!scratch.get().equals(END)) {
    SimpleTextUtil.readLine(input, scratch);
    if (StringHelper.startsWith(scratch.get(), DOC)) {
      offsets[upto] = input.getFilePointer();
      upto++;
    }
  }
  SimpleTextUtil.checkFooter(input);
  assert upto == offsets.length;
}
// NOTE: while it's tempting to make this public, the caller's parser likely
// knows the numInput/numOutputWords; sneaky exceptions, much later on, will
// result if these values are wrong, so we always recompute ourselves to be safe:
private void add(
    CharsRef input, int numInputWords, CharsRef output, int numOutputWords, boolean includeOrig) {
  if (numInputWords <= 0) {
    throw new IllegalArgumentException("numInputWords must be > 0 (got " + numInputWords + ")");
  }
  if (input.length <= 0) {
    throw new IllegalArgumentException("input.length must be > 0 (got " + input.length + ")");
  }
  if (numOutputWords <= 0) {
    throw new IllegalArgumentException("numOutputWords must be > 0 (got " + numOutputWords + ")");
  }
  if (output.length <= 0) {
    throw new IllegalArgumentException("output.length must be > 0 (got " + output.length + ")");
  }

  assert !hasHoles(input) : "input has holes: " + input;
  assert !hasHoles(output) : "output has holes: " + output;

  // System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords);
  // first convert the output to UTF-8, then look it up in the hash
  utf8Scratch.copyChars(output.chars, output.offset, output.length);
  int ord = words.add(utf8Scratch.get());
  if (ord < 0) {
    // already exists in our hash
    ord = (-ord) - 1;
    // System.out.println("  output=" + output + " old ord=" + ord);
  } else {
    // System.out.println("  output=" + output + " new ord=" + ord);
  }

  MapEntry e = workingSet.get(input);
  if (e == null) {
    e = new MapEntry();
    // make a copy, since we will keep it around in our map:
    workingSet.put(CharsRef.deepCopyOf(input), e);
  }

  e.ords.add(ord);
  e.includeOrig |= includeOrig;
  maxHorizontalContext = Math.max(maxHorizontalContext, numInputWords);
  maxHorizontalContext = Math.max(maxHorizontalContext, numOutputWords);
}
@Override
public void seekExact(long ord) throws IOException {
  // System.out.println("BTR.seek by ord ord=" + ord);
  if (indexEnum == null) {
    throw new IllegalStateException("terms index was not loaded");
  }
  assert ord < numTerms;

  // TODO: if ord is in the same terms block and after the current ord, we
  // should avoid this seek just like we do in the seek(BytesRef) case
  in.seek(indexEnum.seek(ord));
  boolean result = nextBlock();

  // Block must exist since ord < numTerms:
  assert result;

  indexIsCurrent = true;
  didIndexNext = false;
  seekPending = false;

  state.ord = indexEnum.ord() - 1;
  assert state.ord >= -1 : "ord=" + state.ord;
  term.copyBytes(indexEnum.term());

  // Now, scan:
  int left = (int) (ord - state.ord);
  while (left > 0) {
    final BytesRef term = _next();
    assert term != null;
    left--;
    assert indexIsCurrent;
  }
}
@Override
public long ramBytesUsed() {
  return BASE_RAM_BYTES_USED
      + RamUsageEstimator.sizeOf(offsets)
      + RamUsageEstimator.sizeOf(scratch.bytes())
      + RamUsageEstimator.sizeOf(scratchUTF16.chars());
}
@Override
public BytesRef next() throws IOException {
  boolean success = false;
  if (done) {
    return null;
  }
  try {
    ByteArrayDataInput input = new ByteArrayDataInput();
    if (reader.read(scratch)) {
      final BytesRef bytes = scratch.get();
      weight = decode(bytes, input);
      if (hasPayloads) {
        payload = decodePayload(bytes, input);
      }
      if (hasContexts) {
        contexts = decodeContexts(bytes, input);
      }
      success = true;
      return bytes;
    }
    close();
    success = done = true;
    return null;
  } finally {
    if (!success) {
      done = true;
      close();
    }
  }
}
@Override
public BytesRef next() throws IOException {
  if (++curPos < entries.size()) {
    entries.get(spare, curPos);
    return spare.get();
  }
  return null;
}
@Override
public void reflectWith(AttributeReflector reflector) {
  fillBytesRef();
  reflector.reflect(TermToBytesRefAttribute.class, "bytes", bytes.toBytesRef());
  reflector.reflect(NumericTermAttribute.class, "shift", shift);
  reflector.reflect(NumericTermAttribute.class, "rawValue", getRawValue());
  reflector.reflect(NumericTermAttribute.class, "valueSize", valueSize);
}
@Override
public void visitDocument(int n, StoredFieldVisitor visitor) throws IOException {
  in.seek(offsets[n]);
  while (true) {
    readLine();
    if (StringHelper.startsWith(scratch.get(), FIELD) == false) {
      break;
    }
    int fieldNumber = parseIntAt(FIELD.length);
    FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);
    readLine();
    assert StringHelper.startsWith(scratch.get(), NAME);
    readLine();
    assert StringHelper.startsWith(scratch.get(), TYPE);

    final BytesRef type;
    if (equalsAt(TYPE_STRING, scratch.get(), TYPE.length)) {
      type = TYPE_STRING;
    } else if (equalsAt(TYPE_BINARY, scratch.get(), TYPE.length)) {
      type = TYPE_BINARY;
    } else if (equalsAt(TYPE_INT, scratch.get(), TYPE.length)) {
      type = TYPE_INT;
    } else if (equalsAt(TYPE_LONG, scratch.get(), TYPE.length)) {
      type = TYPE_LONG;
    } else if (equalsAt(TYPE_FLOAT, scratch.get(), TYPE.length)) {
      type = TYPE_FLOAT;
    } else if (equalsAt(TYPE_DOUBLE, scratch.get(), TYPE.length)) {
      type = TYPE_DOUBLE;
    } else {
      throw new RuntimeException("unknown field type");
    }

    switch (visitor.needsField(fieldInfo)) {
      case YES:
        readField(type, fieldInfo, visitor);
        break;
      case NO:
        readLine();
        assert StringHelper.startsWith(scratch.get(), VALUE);
        break;
      case STOP:
        return;
    }
  }
}
@Override
public BytesRef getBytesRef() {
  assert valueSize == 64 || valueSize == 32;
  if (valueSize == 64) {
    NumericUtils.longToPrefixCoded(value, shift, bytes);
  } else {
    NumericUtils.intToPrefixCoded((int) value, shift, bytes);
  }
  return bytes.get();
}
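// Hedged sketch (editor addition, Lucene 4/5-era trie encoding): nonzero
// shifts drop low-order bits, yielding the coarser terms that numeric range
// queries recurse over; only the shift-0 term carries full precision.
static void shiftExample() {
  BytesRefBuilder full = new BytesRefBuilder();
  BytesRefBuilder coarse = new BytesRefBuilder();
  NumericUtils.longToPrefixCoded(1234L, 0, full);    // exact-match term
  NumericUtils.longToPrefixCoded(1234L, 16, coarse); // covers a 2^16-value range
  assert !full.get().equals(coarse.get()); // distinct terms for the same value
}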
@Override public void seekExact(BytesRef target, TermState otherState) { // System.out.println("BTR.seekExact termState target=" + target.utf8ToString() + " " + // target + " this=" + this); assert otherState != null && otherState instanceof BlockTermState; assert !doOrd || ((BlockTermState) otherState).ord < numTerms; state.copyFrom(otherState); seekPending = true; indexIsCurrent = false; term.copyBytes(target); }
@Override
public void seekExact(BytesRef target, TermState otherState) {
  // if (DEBUG) {
  //   System.out.println("BTTR.seekExact termState seg=" + segment + " target=" + target.utf8ToString() + " " + target + " state=" + otherState);
  // }
  assert clearEOF();
  if (target.compareTo(term.get()) != 0 || !termExists) {
    assert otherState != null && otherState instanceof BlockTermState;
    currentFrame = staticFrame;
    currentFrame.state.copyFrom(otherState);
    term.copyBytes(target);
    currentFrame.metaDataUpto = currentFrame.getTermBlockOrd();
    assert currentFrame.metaDataUpto > 0;
    validIndexPrefix = 0;
  } else {
    // if (DEBUG) {
    //   System.out.println("  skip seek: already on target state=" + currentFrame.state);
    // }
  }
}
public static void displayPointRanges(Scanner in) {
  double[][] point = getPoints(in, 1);
  long hash, hashUpper;
  double lon, lat, lonUpper, latUpper;
  for (int i = 63; i >= 45; i -= GeoPointField.PRECISION_STEP) {
    BytesRefBuilder brb = new BytesRefBuilder();
    NumericUtils.longToPrefixCoded(
        GeoUtils.mortonHash(point[0][LON_INDEX], point[0][LAT_INDEX]), i, brb);
    BytesRef br = brb.get();
    hash = NumericUtils.prefixCodedToLong(br);
    hashUpper = hash | ((1L << i) - 1);
    lon = GeoUtils.mortonUnhashLon(hash);
    lat = GeoUtils.mortonUnhashLat(hash);
    lonUpper = GeoUtils.mortonUnhashLon(hashUpper);
    latUpper = GeoUtils.mortonUnhashLat(hashUpper);
    System.out.println(
        i + ": " + br + " " + hash + " (" + lon + "," + lat + ")"
            + " : " + "(" + lonUpper + "," + latUpper + ")");
  }
}
public static int sortAndDedup(final BytesRefArray bytes, final int[] indices) {
  final BytesRefBuilder scratch = new BytesRefBuilder();
  final BytesRefBuilder scratch1 = new BytesRefBuilder();
  final int numValues = bytes.size();
  assert indices.length >= numValues;
  if (numValues <= 1) {
    return numValues;
  }
  sort(scratch, scratch1, bytes, indices);
  int uniqueCount = 1;
  BytesRefBuilder previous = scratch;
  BytesRefBuilder current = scratch1;
  bytes.get(previous, indices[0]);
  for (int i = 1; i < numValues; ++i) {
    bytes.get(current, indices[i]);
    if (!previous.get().equals(current.get())) {
      indices[uniqueCount++] = indices[i];
    }
    BytesRefBuilder tmp = previous;
    previous = current;
    current = tmp;
  }
  return uniqueCount;
}
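// Hypothetical driver (editor addition) showing how sortAndDedup appears
// intended to be called: indices starts as the identity permutation, and
// after the call the first 'unique' entries index the sorted, de-duplicated
// values in order.
static void sortAndDedupExample(BytesRefArray bytes) {
  int[] indices = new int[bytes.size()];
  for (int i = 0; i < indices.length; i++) {
    indices[i] = i;
  }
  int unique = sortAndDedup(bytes, indices);
  BytesRefBuilder spare = new BytesRefBuilder();
  for (int i = 0; i < unique; i++) {
    BytesRef value = bytes.get(spare, indices[i]); // spare is reused per lookup
    System.out.println(value.utf8ToString());
  }
}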
@Override
public BytesRef getBytesRef() {
  return bytes.get();
}
/** Given the readable value, return the term value that will match it. */
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  final String internal = readableToIndexed(val.toString());
  result.copyChars(internal);
}
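// Hedged usage sketch (editor addition; the FieldType instance and the
// "price" field name are hypothetical): callers pass a reusable builder and
// then detach the result, e.g. to build a Term for an exact-match query.
static Term readableToTermExample(FieldType fieldType, String readable) {
  BytesRefBuilder indexed = new BytesRefBuilder();
  fieldType.readableToIndexed(readable, indexed);
  return new Term("price", indexed.toBytesRef()); // toBytesRef() detaches from the builder
}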
@SuppressWarnings("unused") private void printSeekState(PrintStream out) throws IOException { if (currentFrame == staticFrame) { out.println(" no prior seek"); } else { out.println(" prior seek state:"); int ord = 0; boolean isSeekFrame = true; while (true) { IDVersionSegmentTermsEnumFrame f = getFrame(ord); assert f != null; final BytesRef prefix = new BytesRef(term.bytes(), 0, f.prefix); if (f.nextEnt == -1) { out.println( " frame " + (isSeekFrame ? "(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + brToString(prefix) + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp << VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS : 0) + (f.isFloor ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR : 0)) + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd()); } else { out.println( " frame " + (isSeekFrame ? "(seek, loaded)" : "(next, loaded)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + brToString(prefix) + " nextEnt=" + f.nextEnt + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp << VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS : 0) + (f.isFloor ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR : 0)) + " lastSubFP=" + f.lastSubFP + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd()); } if (fr.index != null) { assert !isSeekFrame || f.arc != null : "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc; if (f.prefix > 0 && isSeekFrame && f.arc.label != (term.byteAt(f.prefix - 1) & 0xFF)) { out.println( " broken seek state: arc.label=" + (char) f.arc.label + " vs term byte=" + (char) (term.byteAt(f.prefix - 1) & 0xFF)); throw new RuntimeException("seek state is broken"); } Pair<BytesRef, Long> output = Util.get(fr.index, prefix); if (output == null) { out.println(" broken seek state: prefix is not final in index"); throw new RuntimeException("seek state is broken"); } else if (isSeekFrame && !f.isFloor) { final ByteArrayDataInput reader = new ByteArrayDataInput( output.output1.bytes, output.output1.offset, output.output1.length); final long codeOrig = reader.readVLong(); final long code = (f.fp << VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) | (f.hasTerms ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS : 0) | (f.isFloor ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR : 0); if (codeOrig != code) { out.println( " broken seek state: output code=" + codeOrig + " doesn't match frame code=" + code); throw new RuntimeException("seek state is broken"); } } } if (f == currentFrame) { break; } if (f.prefix == validIndexPrefix) { isSeekFrame = false; } ord++; } } }