// we don't actually write a .fdx-like index, instead we read the
// stored fields file in entirety up-front and save the offsets
// so we can seek to the documents later.
private void readIndex(int size) throws IOException {
  ChecksumIndexInput input = new BufferedChecksumIndexInput(in);
  offsets = new long[size];
  int upto = 0;
  while (!scratch.get().equals(END)) {
    SimpleTextUtil.readLine(input, scratch);
    if (StringHelper.startsWith(scratch.get(), DOC)) {
      offsets[upto] = input.getFilePointer();
      upto++;
    }
  }
  SimpleTextUtil.checkFooter(input);
  assert upto == offsets.length;
}
private void readField(BytesRef type, FieldInfo fieldInfo, StoredFieldVisitor visitor)
    throws IOException {
  readLine();
  assert StringHelper.startsWith(scratch.get(), VALUE);
  if (type == TYPE_STRING) {
    byte[] bytes = new byte[scratch.length() - VALUE.length];
    System.arraycopy(scratch.bytes(), VALUE.length, bytes, 0, bytes.length);
    visitor.stringField(fieldInfo, bytes);
  } else if (type == TYPE_BINARY) {
    byte[] copy = new byte[scratch.length() - VALUE.length];
    System.arraycopy(scratch.bytes(), VALUE.length, copy, 0, copy.length);
    visitor.binaryField(fieldInfo, copy);
  } else if (type == TYPE_INT) {
    scratchUTF16.copyUTF8Bytes(scratch.bytes(), VALUE.length, scratch.length() - VALUE.length);
    visitor.intField(fieldInfo, Integer.parseInt(scratchUTF16.toString()));
  } else if (type == TYPE_LONG) {
    scratchUTF16.copyUTF8Bytes(scratch.bytes(), VALUE.length, scratch.length() - VALUE.length);
    visitor.longField(fieldInfo, Long.parseLong(scratchUTF16.toString()));
  } else if (type == TYPE_FLOAT) {
    scratchUTF16.copyUTF8Bytes(scratch.bytes(), VALUE.length, scratch.length() - VALUE.length);
    visitor.floatField(fieldInfo, Float.parseFloat(scratchUTF16.toString()));
  } else if (type == TYPE_DOUBLE) {
    scratchUTF16.copyUTF8Bytes(scratch.bytes(), VALUE.length, scratch.length() - VALUE.length);
    visitor.doubleField(fieldInfo, Double.parseDouble(scratchUTF16.toString()));
  }
}
@Override
public BytesRef next() throws IOException {
  boolean success = false;
  if (done) {
    return null;
  }
  try {
    ByteArrayDataInput input = new ByteArrayDataInput();
    if (reader.read(scratch)) {
      final BytesRef bytes = scratch.get();
      weight = decode(bytes, input);
      if (hasPayloads) {
        payload = decodePayload(bytes, input);
      }
      if (hasContexts) {
        contexts = decodeContexts(bytes, input);
      }
      success = true;
      return bytes;
    }
    close();
    success = done = true;
    return null;
  } finally {
    if (!success) {
      done = true;
      close();
    }
  }
}
@Override
public BytesRef indexedValueForSearch(Object value) {
  long longValue = NumericUtils.doubleToSortableLong(parseDoubleValue(value));
  BytesRefBuilder bytesRef = new BytesRefBuilder();
  NumericUtils.longToPrefixCoded(longValue, 0, bytesRef); // 0 because of exact match
  return bytesRef.get();
}
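The sortable-bits conversion above is what makes the exact-match term well-behaved: NumericUtils.doubleToSortableLong maps doubles onto longs so that numeric order survives the bit-level conversion, and the prefix-coded bytes then sort the same way in the term dictionary. A minimal sketch of that invariant, assuming only Lucene's NumericUtils:

  // For any non-NaN doubles x < y, doubleToSortableLong(x) < doubleToSortableLong(y),
  // so term-dictionary byte order matches numeric order.
  long lo = NumericUtils.doubleToSortableLong(-1.5);
  long hi = NumericUtils.doubleToSortableLong(2.25);
  assert lo < hi;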
/* Decodes only the term bytes of the next term.  If caller
then asks for metadata, ie docFreq, totalTermFreq or pulls
a D&PEnum, we then (lazily) decode all metadata up to the
current term. */
private BytesRef _next() throws IOException {
  // System.out.println("BTR._next seg=" + segment + " this=" + this + " termCount=" +
  // state.termBlockOrd + " (vs " + blockTermCount + ")");
  if (state.termBlockOrd == blockTermCount && !nextBlock()) {
    // System.out.println("  eof");
    indexIsCurrent = false;
    return null;
  }

  // TODO: cutover to something better for these ints!  simple64?
  final int suffix = termSuffixesReader.readVInt();
  // System.out.println("  suffix=" + suffix);

  term.setLength(termBlockPrefix + suffix);
  term.grow(term.length());
  termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
  state.termBlockOrd++;

  // NOTE: meaningless in the non-ord case
  state.ord++;

  // System.out.println("  return term=" + fieldInfo.name + ":" + term.utf8ToString() + " " +
  // term + " tbOrd=" + state.termBlockOrd);
  return term.get();
}
@Override
public BytesRef indexedValueForSearch(Object value) {
  int intValue = NumericUtils.floatToSortableInt(parseValue(value));
  BytesRefBuilder bytesRef = new BytesRefBuilder();
  NumericUtils.intToPrefixCoded(intValue, 0, bytesRef); // 0 because of exact match
  return bytesRef.get();
}
@Override
public BytesRef indexedValueForSearch(Object value) {
  BytesRefBuilder bytesRef = new BytesRefBuilder();
  LegacyNumericUtils.intToPrefixCoded(parseValue(value), 0, bytesRef); // 0 because of exact match
  return bytesRef.get();
}
public void testRandomReads() throws IOException {
  int length = randomIntBetween(10, scaledRandomIntBetween(PAGE_SIZE * 2, PAGE_SIZE * 20));
  BytesReference pbr = newBytesReference(length);
  StreamInput streamInput = pbr.streamInput();
  BytesRefBuilder target = new BytesRefBuilder();
  while (target.length() < pbr.length()) {
    switch (randomIntBetween(0, 10)) {
      case 6:
      case 5:
        target.append(new BytesRef(new byte[] {streamInput.readByte()}));
        break;
      case 4:
      case 3:
        BytesRef bytesRef =
            streamInput.readBytesRef(scaledRandomIntBetween(1, pbr.length() - target.length()));
        target.append(bytesRef);
        break;
      default:
        byte[] buffer = new byte[scaledRandomIntBetween(1, pbr.length() - target.length())];
        int offset = scaledRandomIntBetween(0, buffer.length - 1);
        int read = streamInput.read(buffer, offset, buffer.length - offset);
        target.append(new BytesRef(buffer, offset, read));
        break;
    }
  }
  assertEquals(pbr.length(), target.length());
  BytesRef targetBytes = target.get();
  assertArrayEquals(
      pbr.toBytes(),
      Arrays.copyOfRange(targetBytes.bytes, targetBytes.offset, targetBytes.length));
}
@Override
public void visitDocument(int n, StoredFieldVisitor visitor) throws IOException {
  in.seek(offsets[n]);

  while (true) {
    readLine();
    if (StringHelper.startsWith(scratch.get(), FIELD) == false) {
      break;
    }
    int fieldNumber = parseIntAt(FIELD.length);
    FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);
    readLine();
    assert StringHelper.startsWith(scratch.get(), NAME);
    readLine();
    assert StringHelper.startsWith(scratch.get(), TYPE);

    final BytesRef type;
    if (equalsAt(TYPE_STRING, scratch.get(), TYPE.length)) {
      type = TYPE_STRING;
    } else if (equalsAt(TYPE_BINARY, scratch.get(), TYPE.length)) {
      type = TYPE_BINARY;
    } else if (equalsAt(TYPE_INT, scratch.get(), TYPE.length)) {
      type = TYPE_INT;
    } else if (equalsAt(TYPE_LONG, scratch.get(), TYPE.length)) {
      type = TYPE_LONG;
    } else if (equalsAt(TYPE_FLOAT, scratch.get(), TYPE.length)) {
      type = TYPE_FLOAT;
    } else if (equalsAt(TYPE_DOUBLE, scratch.get(), TYPE.length)) {
      type = TYPE_DOUBLE;
    } else {
      throw new RuntimeException("unknown field type");
    }

    switch (visitor.needsField(fieldInfo)) {
      case YES:
        readField(type, fieldInfo, visitor);
        break;
      case NO:
        readLine();
        assert StringHelper.startsWith(scratch.get(), VALUE);
        break;
      case STOP:
        return;
    }
  }
}
@Override
public BytesRef next() throws IOException {
  if (++curPos < entries.size()) {
    entries.get(spare, curPos);
    return spare.get();
  }
  return null;
}
/** Unmarshals a string-based field value. */
protected static Object unmarshalStringSortValue(Object value) {
  if (null == value) {
    return null;
  }
  BytesRefBuilder spare = new BytesRefBuilder();
  String stringVal = (String) value;
  spare.copyChars(stringVal);
  return spare.get();
}
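For reference, copyChars performs the UTF-16 to UTF-8 re-encoding, so the returned BytesRef round-trips back to the original string. A small standalone sketch:

  // copyChars encodes the String's UTF-16 chars as UTF-8 bytes;
  // utf8ToString() decodes them back, so the value round-trips.
  BytesRefBuilder spare = new BytesRefBuilder();
  spare.copyChars("café");
  assert "café".equals(spare.get().utf8ToString());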
@Override
public BytesRef getBytesRef() {
  assert valueSize == 64 || valueSize == 32;
  if (valueSize == 64) {
    NumericUtils.longToPrefixCoded(value, shift, bytes);
  } else {
    NumericUtils.intToPrefixCoded((int) value, shift, bytes);
  }
  return bytes.get();
}
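The shift argument controls precision in the legacy numeric-trie encoding: each term strips shift low-order bits before prefix coding, so shift 0 is the full-precision term used for exact match (as in the indexedValueForSearch methods above) while larger shifts yield coarser range terms. A hedged sketch against that legacy NumericUtils API:

  // Terms produced at different shifts differ (the shift is encoded and
  // low-order bits are dropped), which gives the trie multiple precision
  // levels for the same value.
  BytesRefBuilder exact = new BytesRefBuilder();
  BytesRefBuilder coarse = new BytesRefBuilder();
  NumericUtils.longToPrefixCoded(0x12345678L, 0, exact);   // full precision
  NumericUtils.longToPrefixCoded(0x12345678L, 16, coarse); // 16 low bits dropped
  assert !exact.get().equals(coarse.get());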
@Override
protected void doSetNextReader(LeafReaderContext context) throws IOException {
  if (segmentFacetCounts != null) {
    segmentResults.add(createSegmentResult());
  }

  groupFieldTermsIndex = DocValues.getSorted(context.reader(), groupField);
  facetFieldTermsIndex = DocValues.getSorted(context.reader(), facetField);

  // 1+ to allow for the -1 "not set":
  segmentFacetCounts = new int[facetFieldTermsIndex.getValueCount() + 1];
  segmentTotalCount = 0;

  segmentGroupedFacetHits.clear();
  for (GroupedFacetHit groupedFacetHit : groupedFacetHits) {
    int facetOrd =
        groupedFacetHit.facetValue == null
            ? -1
            : facetFieldTermsIndex.lookupTerm(groupedFacetHit.facetValue);
    if (groupedFacetHit.facetValue != null && facetOrd < 0) {
      continue;
    }

    int groupOrd =
        groupedFacetHit.groupValue == null
            ? -1
            : groupFieldTermsIndex.lookupTerm(groupedFacetHit.groupValue);
    if (groupedFacetHit.groupValue != null && groupOrd < 0) {
      continue;
    }

    int segmentGroupedFacetsIndex =
        groupOrd * (facetFieldTermsIndex.getValueCount() + 1) + facetOrd;
    segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
  }

  if (facetPrefix != null) {
    startFacetOrd = facetFieldTermsIndex.lookupTerm(facetPrefix);
    if (startFacetOrd < 0) {
      // Points to the ord one higher than facetPrefix
      startFacetOrd = -startFacetOrd - 1;
    }
    BytesRefBuilder facetEndPrefix = new BytesRefBuilder();
    facetEndPrefix.append(facetPrefix);
    facetEndPrefix.append(UnicodeUtil.BIG_TERM);
    endFacetOrd = facetFieldTermsIndex.lookupTerm(facetEndPrefix.get());
    assert endFacetOrd < 0;
    endFacetOrd = -endFacetOrd - 1; // Points to the ord one higher than facetEndPrefix
  } else {
    startFacetOrd = -1;
    endFacetOrd = facetFieldTermsIndex.getValueCount();
  }
}
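The segmentGroupedFacetsIndex computation flattens each (groupOrd, facetOrd) pair into a single int, using a row width of valueCount + 1 so the -1 "not set" ordinal still gets a slot of its own. A minimal sketch of the mapping (helper name hypothetical):

  // Row-major flattening of (groupOrd, facetOrd) into one int key.
  // width = valueCount + 1 reserves one slot per group for facetOrd == -1, so
  // group g's key range [g*width - 1, (g+1)*width - 2] never overlaps group g+1's.
  static int groupedFacetKey(int groupOrd, int facetOrd, int facetValueCount) {
    return groupOrd * (facetValueCount + 1) + facetOrd;
  }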
@Override
public void seekExact(long ord) throws IOException {
  // TODO: would be better to make this simpler and faster.
  // but we dont want to introduce a bug that corrupts our enum state!
  bytesReader.setPosition(0);
  fst.getFirstArc(firstArc);
  IntsRef output = Util.getByOutput(fst, ord, bytesReader, firstArc, scratchArc, scratchInts);
  BytesRefBuilder scratchBytes = new BytesRefBuilder();
  scratchBytes.clear();
  Util.toBytesRef(output, scratchBytes);
  // TODO: we could do this lazily, better to try to push into FSTEnum though?
  in.seekExact(scratchBytes.get());
}
// NOTE: while it's tempting to make this public, since
// caller's parser likely knows the
// numInput/numOutputWords, sneaky exceptions, much later
// on, will result if these values are wrong; so we always
// recompute ourselves to be safe:
private void add(
    CharsRef input, int numInputWords, CharsRef output, int numOutputWords, boolean includeOrig) {
  // first convert to UTF-8
  if (numInputWords <= 0) {
    throw new IllegalArgumentException("numInputWords must be > 0 (got " + numInputWords + ")");
  }
  if (input.length <= 0) {
    throw new IllegalArgumentException("input.length must be > 0 (got " + input.length + ")");
  }
  if (numOutputWords <= 0) {
    throw new IllegalArgumentException(
        "numOutputWords must be > 0 (got " + numOutputWords + ")");
  }
  if (output.length <= 0) {
    throw new IllegalArgumentException("output.length must be > 0 (got " + output.length + ")");
  }

  assert !hasHoles(input) : "input has holes: " + input;
  assert !hasHoles(output) : "output has holes: " + output;

  // System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords +
  // " output=" + output + " numOutputWords=" + numOutputWords);
  utf8Scratch.copyChars(output.chars, output.offset, output.length);
  // lookup in hash
  int ord = words.add(utf8Scratch.get());
  if (ord < 0) {
    // already exists in our hash
    ord = (-ord) - 1;
    // System.out.println("  output=" + output + " old ord=" + ord);
  } else {
    // System.out.println("  output=" + output + " new ord=" + ord);
  }

  MapEntry e = workingSet.get(input);
  if (e == null) {
    e = new MapEntry();
    // make a copy, since we will keep around in our map
    workingSet.put(CharsRef.deepCopyOf(input), e);
  }

  e.ords.add(ord);
  e.includeOrig |= includeOrig;
  maxHorizontalContext = Math.max(maxHorizontalContext, numInputWords);
  maxHorizontalContext = Math.max(maxHorizontalContext, numOutputWords);
}
public static int sortAndDedup(final BytesRefArray bytes, final int[] indices) {
  final BytesRefBuilder scratch = new BytesRefBuilder();
  final BytesRefBuilder scratch1 = new BytesRefBuilder();
  final int numValues = bytes.size();
  assert indices.length >= numValues;
  if (numValues <= 1) {
    return numValues;
  }
  sort(scratch, scratch1, bytes, indices);
  int uniqueCount = 1;
  BytesRefBuilder previous = scratch;
  BytesRefBuilder current = scratch1;
  bytes.get(previous, indices[0]);
  for (int i = 1; i < numValues; ++i) {
    bytes.get(current, indices[i]);
    if (!previous.get().equals(current.get())) {
      indices[uniqueCount++] = indices[i];
    }
    BytesRefBuilder tmp = previous;
    previous = current;
    current = tmp;
  }
  return uniqueCount;
}
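A hypothetical usage sketch, assuming Lucene's BytesRefArray and Counter and that indices starts out as the identity permutation; after the call, the first uniqueCount entries of indices point at the distinct values in sorted order:

  // Build a small array with a duplicate, then sort and dedup it.
  BytesRefArray values = new BytesRefArray(Counter.newCounter());
  values.append(new BytesRef("b"));
  values.append(new BytesRef("a"));
  values.append(new BytesRef("b"));
  int[] indices = new int[values.size()];
  for (int i = 0; i < indices.length; i++) {
    indices[i] = i;
  }
  int uniqueCount = sortAndDedup(values, indices); // 2: indices[0] -> "a", indices[1] -> "b"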
@Override
public void seekExact(BytesRef target, TermState otherState) {
  // if (DEBUG) {
  //   System.out.println("BTTR.seekExact termState seg=" + segment + " target=" +
  //   target.utf8ToString() + " " + target + " state=" + otherState);
  // }
  assert clearEOF();
  if (target.compareTo(term.get()) != 0 || !termExists) {
    assert otherState != null && otherState instanceof BlockTermState;
    currentFrame = staticFrame;
    currentFrame.state.copyFrom(otherState);
    term.copyBytes(target);
    currentFrame.metaDataUpto = currentFrame.getTermBlockOrd();
    assert currentFrame.metaDataUpto > 0;
    validIndexPrefix = 0;
  } else {
    // if (DEBUG) {
    //   System.out.println("  skip seek: already on target state=" + currentFrame.state);
    // }
  }
}
public static void displayPointRanges(Scanner in) {
  double[][] point = getPoints(in, 1);
  long hash, hashUpper;
  double lon, lat, lonUpper, latUpper;
  for (int i = 63; i >= 45; i -= GeoPointField.PRECISION_STEP) {
    BytesRefBuilder brb = new BytesRefBuilder();
    NumericUtils.longToPrefixCoded(
        GeoUtils.mortonHash(point[0][LON_INDEX], point[0][LAT_INDEX]), i, brb);
    BytesRef br = brb.get();
    hash = NumericUtils.prefixCodedToLong(br);
    hashUpper = hash | ((1L << i) - 1);
    lon = GeoUtils.mortonUnhashLon(hash);
    lat = GeoUtils.mortonUnhashLat(hash);
    lonUpper = GeoUtils.mortonUnhashLon(hashUpper);
    latUpper = GeoUtils.mortonUnhashLat(hashUpper);
    System.out.println(
        i + ": " + br + " " + hash + " (" + lon + "," + lat + ")"
            + " : " + "(" + lonUpper + "," + latUpper + ")");
  }
}
@Override
public Fields get(int doc) throws IOException {
  SortedMap<String, SimpleTVTerms> fields = new TreeMap<>();
  in.seek(offsets[doc]);
  readLine();
  assert StringHelper.startsWith(scratch.get(), NUMFIELDS);
  int numFields = parseIntAt(NUMFIELDS.length);
  if (numFields == 0) {
    return null; // no vectors for this doc
  }
  for (int i = 0; i < numFields; i++) {
    readLine();
    assert StringHelper.startsWith(scratch.get(), FIELD);
    // skip fieldNumber:
    parseIntAt(FIELD.length);

    readLine();
    assert StringHelper.startsWith(scratch.get(), FIELDNAME);
    String fieldName = readString(FIELDNAME.length, scratch);

    readLine();
    assert StringHelper.startsWith(scratch.get(), FIELDPOSITIONS);
    boolean positions = Boolean.parseBoolean(readString(FIELDPOSITIONS.length, scratch));

    readLine();
    assert StringHelper.startsWith(scratch.get(), FIELDOFFSETS);
    boolean offsets = Boolean.parseBoolean(readString(FIELDOFFSETS.length, scratch));

    readLine();
    assert StringHelper.startsWith(scratch.get(), FIELDPAYLOADS);
    boolean payloads = Boolean.parseBoolean(readString(FIELDPAYLOADS.length, scratch));

    readLine();
    assert StringHelper.startsWith(scratch.get(), FIELDTERMCOUNT);
    int termCount = parseIntAt(FIELDTERMCOUNT.length);

    SimpleTVTerms terms = new SimpleTVTerms(offsets, positions, payloads);
    fields.put(fieldName, terms);

    BytesRefBuilder term = new BytesRefBuilder();
    for (int j = 0; j < termCount; j++) {
      readLine();
      assert StringHelper.startsWith(scratch.get(), TERMTEXT);
      int termLength = scratch.length() - TERMTEXT.length;
      term.grow(termLength);
      term.setLength(termLength);
      System.arraycopy(scratch.bytes(), TERMTEXT.length, term.bytes(), 0, termLength);

      SimpleTVPostings postings = new SimpleTVPostings();
      terms.terms.put(term.toBytesRef(), postings);

      readLine();
      assert StringHelper.startsWith(scratch.get(), TERMFREQ);
      postings.freq = parseIntAt(TERMFREQ.length);

      if (positions || offsets) {
        if (positions) {
          postings.positions = new int[postings.freq];
          if (payloads) {
            postings.payloads = new BytesRef[postings.freq];
          }
        }

        if (offsets) {
          postings.startOffsets = new int[postings.freq];
          postings.endOffsets = new int[postings.freq];
        }

        for (int k = 0; k < postings.freq; k++) {
          if (positions) {
            readLine();
            assert StringHelper.startsWith(scratch.get(), POSITION);
            postings.positions[k] = parseIntAt(POSITION.length);
            if (payloads) {
              readLine();
              assert StringHelper.startsWith(scratch.get(), PAYLOAD);
              if (scratch.length() - PAYLOAD.length == 0) {
                postings.payloads[k] = null;
              } else {
                byte[] payloadBytes = new byte[scratch.length() - PAYLOAD.length];
                System.arraycopy(
                    scratch.bytes(), PAYLOAD.length, payloadBytes, 0, payloadBytes.length);
                postings.payloads[k] = new BytesRef(payloadBytes);
              }
            }
          }

          if (offsets) {
            readLine();
            assert StringHelper.startsWith(scratch.get(), STARTOFFSET);
            postings.startOffsets[k] = parseIntAt(STARTOFFSET.length);

            readLine();
            assert StringHelper.startsWith(scratch.get(), ENDOFFSET);
            postings.endOffsets[k] = parseIntAt(ENDOFFSET.length);
          }
        }
      }
    }
  }
  return new SimpleTVFields(fields);
}
@Override
public BytesRef term() {
  assert !eof;
  return term.get();
}
/* Decodes only the term bytes of the next term.  If caller
then asks for metadata, ie docFreq, totalTermFreq or pulls
a D&PEnum, we then (lazily) decode all metadata up to the
current term. */
@Override
public BytesRef next() throws IOException {
  if (in == null) {
    // Fresh TermsEnum; seek to first term:
    final FST.Arc<Pair<BytesRef, Long>> arc;
    if (fr.index != null) {
      arc = fr.index.getFirstArc(arcs[0]);
      // Empty string prefix must have an output in the index!
      assert arc.isFinal();
    } else {
      arc = null;
    }
    currentFrame = pushFrame(arc, fr.rootCode, 0);
    currentFrame.loadBlock();
  }

  targetBeforeCurrentLength = currentFrame.ord;

  assert !eof;
  // if (DEBUG) {
  //   System.out.println("\nBTTR.next seg=" + segment + " term=" + brToString(term) +
  //   " termExists?=" + termExists + " field=" + fieldInfo.name + " termBlockOrd=" +
  //   currentFrame.state.termBlockOrd + " validIndexPrefix=" + validIndexPrefix);
  //   printSeekState();
  // }

  if (currentFrame == staticFrame) {
    // If seek was previously called and the term was
    // cached, or seek(TermState) was called, usually
    // caller is just going to pull a D&PEnum or get
    // docFreq, etc.  But, if they then call next(),
    // this method catches up all internal state so next()
    // works properly:
    // if (DEBUG) System.out.println("  re-seek to pending term=" + term.utf8ToString() + " " + term);
    final boolean result = seekExact(term.get());
    assert result;
  }

  // Pop finished blocks
  while (currentFrame.nextEnt == currentFrame.entCount) {
    if (!currentFrame.isLastInFloor) {
      currentFrame.loadNextFloorBlock();
    } else {
      // if (DEBUG) System.out.println("  pop frame");
      if (currentFrame.ord == 0) {
        // if (DEBUG) System.out.println("  return null");
        assert setEOF();
        term.clear();
        validIndexPrefix = 0;
        currentFrame.rewind();
        termExists = false;
        return null;
      }
      final long lastFP = currentFrame.fpOrig;
      currentFrame = stack[currentFrame.ord - 1];

      if (currentFrame.nextEnt == -1 || currentFrame.lastSubFP != lastFP) {
        // We popped into a frame that's not loaded
        // yet or not scan'd to the right entry
        currentFrame.scanToFloorFrame(term.get());
        currentFrame.loadBlock();
        currentFrame.scanToSubBlock(lastFP);
      }

      // Note that the seek state (last seek) has been
      // invalidated beyond this depth
      validIndexPrefix = Math.min(validIndexPrefix, currentFrame.prefix);
      // if (DEBUG) {
      //   System.out.println("  reset validIndexPrefix=" + validIndexPrefix);
      // }
    }
  }

  while (true) {
    if (currentFrame.next()) {
      // Push to new block:
      // if (DEBUG) System.out.println("  push frame");
      currentFrame = pushFrame(null, currentFrame.lastSubFP, term.length());
      // This is a "next" frame -- even if it's
      // floor'd we must pretend it isn't so we don't
      // try to scan to the right floor frame:
      currentFrame.isFloor = false;
      // currentFrame.hasTerms = true;
      currentFrame.loadBlock();
    } else {
      // if (DEBUG) System.out.println("  return term=" + term.utf8ToString() + " " + term +
      // " currentFrame.ord=" + currentFrame.ord);
      return term.get();
    }
  }
}
// TODO: we may want an alternate mode here which is
// "if you are about to return NOT_FOUND I won't use
// the terms data from that"; eg FuzzyTermsEnum will
// (usually) just immediately call seek again if we
// return NOT_FOUND so it's a waste for us to fill in
// the term that was actually NOT_FOUND
@Override
public SeekStatus seekCeil(final BytesRef target) throws IOException {
  if (indexEnum == null) {
    throw new IllegalStateException("terms index was not loaded");
  }

  // System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" +
  // target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term()
  // + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending="
  // + seekPending + " divisor=" + indexReader.getDivisor() + " this=" + this);
  if (didIndexNext) {
    if (nextIndexTerm == null) {
      // System.out.println("  nextIndexTerm=null");
    } else {
      // System.out.println("  nextIndexTerm=" + nextIndexTerm.utf8ToString());
    }
  }

  boolean doSeek = true;

  // See if we can avoid seeking, because target term
  // is after current term but before next index term:
  if (indexIsCurrent) {
    final int cmp = BytesRef.getUTF8SortedAsUnicodeComparator().compare(term.get(), target);

    if (cmp == 0) {
      // Already at the requested term
      return SeekStatus.FOUND;
    } else if (cmp < 0) {
      // Target term is after current term
      if (!didIndexNext) {
        if (indexEnum.next() == -1) {
          nextIndexTerm = null;
        } else {
          nextIndexTerm = indexEnum.term();
        }
        // System.out.println("  now do index next() nextIndexTerm=" + (nextIndexTerm == null
        // ? "null" : nextIndexTerm.utf8ToString()));
        didIndexNext = true;
      }

      if (nextIndexTerm == null
          || BytesRef.getUTF8SortedAsUnicodeComparator().compare(target, nextIndexTerm) < 0) {
        // Optimization: requested term is within the
        // same term block we are now in; skip seeking
        // (but do scanning):
        doSeek = false;
        // System.out.println("  skip seek: nextIndexTerm=" + (nextIndexTerm == null ? "null"
        // : nextIndexTerm.utf8ToString()));
      }
    }
  }

  if (doSeek) {
    // System.out.println("  seek");

    // Ask terms index to find biggest indexed term (=
    // first term in a block) that's <= our text:
    in.seek(indexEnum.seek(target));
    boolean result = nextBlock();

    // Block must exist since, at least, the indexed term
    // is in the block:
    assert result;

    indexIsCurrent = true;
    didIndexNext = false;

    if (doOrd) {
      state.ord = indexEnum.ord() - 1;
    }

    term.copyBytes(indexEnum.term());
    // System.out.println("  seek: term=" + term.utf8ToString());
  } else {
    // System.out.println("  skip seek");
    if (state.termBlockOrd == blockTermCount && !nextBlock()) {
      indexIsCurrent = false;
      return SeekStatus.END;
    }
  }

  seekPending = false;

  int common = 0;

  // Scan within block.  We could do this by calling
  // _next() and testing the resulting term, but this
  // is wasteful.  Instead, we first confirm the
  // target matches the common prefix of this block,
  // and then we scan the term bytes directly from the
  // termSuffixesreader's byte[], saving a copy into
  // the BytesRef term per term.  Only when we return
  // do we then copy the bytes into the term.
  while (true) {

    // First, see if target term matches common prefix
    // in this block:
    if (common < termBlockPrefix) {
      final int cmp =
          (term.byteAt(common) & 0xFF) - (target.bytes[target.offset + common] & 0xFF);
      if (cmp < 0) {
        // TODO: maybe we should store common prefix
        // in block header?  (instead of relying on
        // last term of previous block)

        // Target's prefix is after the common block
        // prefix, so term cannot be in this block
        // but it could be in next block.  We
        // must scan to end-of-block to set common
        // prefix for next block:
        if (state.termBlockOrd < blockTermCount) {
          while (state.termBlockOrd < blockTermCount - 1) {
            state.termBlockOrd++;
            state.ord++;
            termSuffixesReader.skipBytes(termSuffixesReader.readVInt());
          }
          final int suffix = termSuffixesReader.readVInt();
          term.setLength(termBlockPrefix + suffix);
          term.grow(term.length());
          termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
        }
        state.ord++;

        if (!nextBlock()) {
          indexIsCurrent = false;
          return SeekStatus.END;
        }
        common = 0;
      } else if (cmp > 0) {
        // Target's prefix is before the common prefix
        // of this block, so we position to start of
        // block and return NOT_FOUND:
        assert state.termBlockOrd == 0;

        final int suffix = termSuffixesReader.readVInt();
        term.setLength(termBlockPrefix + suffix);
        term.grow(term.length());
        termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
        return SeekStatus.NOT_FOUND;
      } else {
        common++;
      }

      continue;
    }

    // Test every term in this block
    while (true) {
      state.termBlockOrd++;
      state.ord++;

      final int suffix = termSuffixesReader.readVInt();

      // We know the prefix matches, so just compare the new suffix:
      final int termLen = termBlockPrefix + suffix;
      int bytePos = termSuffixesReader.getPosition();

      boolean next = false;
      final int limit = target.offset + (termLen < target.length ? termLen : target.length);
      int targetPos = target.offset + termBlockPrefix;
      while (targetPos < limit) {
        final int cmp = (termSuffixes[bytePos++] & 0xFF) - (target.bytes[targetPos++] & 0xFF);
        if (cmp < 0) {
          // Current term is still before the target;
          // keep scanning
          next = true;
          break;
        } else if (cmp > 0) {
          // Done!  Current term is after target.  Stop
          // here, fill in real term, return NOT_FOUND.
          term.setLength(termBlockPrefix + suffix);
          term.grow(term.length());
          termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
          // System.out.println("  NOT_FOUND");
          return SeekStatus.NOT_FOUND;
        }
      }

      if (!next && target.length <= termLen) {
        term.setLength(termBlockPrefix + suffix);
        term.grow(term.length());
        termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);

        if (target.length == termLen) {
          // Done!  Exact match.  Stop here, fill in
          // real term, return FOUND.
          // System.out.println("  FOUND");
          return SeekStatus.FOUND;
        } else {
          // System.out.println("  NOT_FOUND");
          return SeekStatus.NOT_FOUND;
        }
      }

      if (state.termBlockOrd == blockTermCount) {
        // Must pre-fill term for next block's common prefix
        term.setLength(termBlockPrefix + suffix);
        term.grow(term.length());
        termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
        break;
      } else {
        termSuffixesReader.skipBytes(suffix);
      }
    }

    // The purpose of the terms dict index is to seek
    // the enum to the closest index term before the
    // term we are looking for.  So, we should never
    // cross another index term (besides the first
    // one) while we are scanning:
    assert indexIsCurrent;

    if (!nextBlock()) {
      // System.out.println("  END");
      indexIsCurrent = false;
      return SeekStatus.END;
    }
    common = 0;
  }
}
@Override
public BytesRef term() {
  return term.get();
}
public static NamedList<Integer> getCounts(
    SolrIndexSearcher searcher,
    DocSet docs,
    String fieldName,
    int offset,
    int limit,
    int mincount,
    boolean missing,
    String sort,
    String prefix,
    String contains,
    boolean ignoreCase)
    throws IOException {
  SchemaField schemaField = searcher.getSchema().getField(fieldName);
  FieldType ft = schemaField.getType();
  NamedList<Integer> res = new NamedList<>();

  // TODO: remove multiValuedFieldCache(), check dv type / uninversion type?
  final boolean multiValued = schemaField.multiValued() || ft.multiValuedFieldCache();

  final SortedSetDocValues si; // for term lookups only
  OrdinalMap ordinalMap = null; // for mapping per-segment ords to global ones
  if (multiValued) {
    si = searcher.getLeafReader().getSortedSetDocValues(fieldName);
    if (si instanceof MultiSortedSetDocValues) {
      ordinalMap = ((MultiSortedSetDocValues) si).mapping;
    }
  } else {
    SortedDocValues single = searcher.getLeafReader().getSortedDocValues(fieldName);
    si = single == null ? null : DocValues.singleton(single);
    if (single instanceof MultiSortedDocValues) {
      ordinalMap = ((MultiSortedDocValues) single).mapping;
    }
  }

  if (si == null) {
    return finalize(res, searcher, schemaField, docs, -1, missing);
  }
  if (si.getValueCount() >= Integer.MAX_VALUE) {
    throw new UnsupportedOperationException(
        "Currently this faceting method is limited to " + Integer.MAX_VALUE + " unique terms");
  }

  final BytesRefBuilder prefixRef;
  if (prefix == null) {
    prefixRef = null;
  } else if (prefix.length() == 0) {
    prefix = null;
    prefixRef = null;
  } else {
    prefixRef = new BytesRefBuilder();
    prefixRef.copyChars(prefix);
  }

  int startTermIndex, endTermIndex;
  if (prefix != null) {
    startTermIndex = (int) si.lookupTerm(prefixRef.get());
    if (startTermIndex < 0) startTermIndex = -startTermIndex - 1;
    prefixRef.append(UnicodeUtil.BIG_TERM);
    endTermIndex = (int) si.lookupTerm(prefixRef.get());
    assert endTermIndex < 0;
    endTermIndex = -endTermIndex - 1;
  } else {
    startTermIndex = -1;
    endTermIndex = (int) si.getValueCount();
  }

  final int nTerms = endTermIndex - startTermIndex;
  int missingCount = -1;
  final CharsRefBuilder charsRef = new CharsRefBuilder();
  if (nTerms > 0 && docs.size() >= mincount) {

    // count collection array only needs to be as big as the number of terms we are
    // going to collect counts for.
    final int[] counts = new int[nTerms];

    Filter filter = docs.getTopFilter();
    List<LeafReaderContext> leaves = searcher.getTopReaderContext().leaves();
    for (int subIndex = 0; subIndex < leaves.size(); subIndex++) {
      LeafReaderContext leaf = leaves.get(subIndex);
      // solr docsets already exclude any deleted docs
      DocIdSet dis = filter.getDocIdSet(leaf, null);
      DocIdSetIterator disi = null;
      if (dis != null) {
        disi = dis.iterator();
      }
      if (disi != null) {
        if (multiValued) {
          SortedSetDocValues sub = leaf.reader().getSortedSetDocValues(fieldName);
          if (sub == null) {
            sub = DocValues.emptySortedSet();
          }
          final SortedDocValues singleton = DocValues.unwrapSingleton(sub);
          if (singleton != null) {
            // some codecs may optimize SORTED_SET storage for single-valued fields
            accumSingle(counts, startTermIndex, singleton, disi, subIndex, ordinalMap);
          } else {
            accumMulti(counts, startTermIndex, sub, disi, subIndex, ordinalMap);
          }
        } else {
          SortedDocValues sub = leaf.reader().getSortedDocValues(fieldName);
          if (sub == null) {
            sub = DocValues.emptySorted();
          }
          accumSingle(counts, startTermIndex, sub, disi, subIndex, ordinalMap);
        }
      }
    }

    if (startTermIndex == -1) {
      missingCount = counts[0];
    }

    // IDEA: we could also maintain a count of "other"... everything that fell outside
    // of the top 'N'

    int off = offset;
    int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

    if (sort.equals(FacetParams.FACET_SORT_COUNT)
        || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
      int maxsize = limit > 0 ? offset + limit : Integer.MAX_VALUE - 1;
      maxsize = Math.min(maxsize, nTerms);
      LongPriorityQueue queue =
          new LongPriorityQueue(Math.min(maxsize, 1000), maxsize, Long.MIN_VALUE);

      int min = mincount - 1; // the smallest value in the top 'N' values
      for (int i = (startTermIndex == -1) ? 1 : 0; i < nTerms; i++) {
        int c = counts[i];
        if (contains != null) {
          final BytesRef term = si.lookupOrd(startTermIndex + i);
          if (!SimpleFacets.contains(term.utf8ToString(), contains, ignoreCase)) {
            continue;
          }
        }
        if (c > min) {
          // NOTE: we use c>min rather than c>=min as an optimization because we are going in
          // index order, so we already know that the keys are ordered.  This can be very
          // important if a lot of the counts are repeated (like zero counts would be).

          // smaller term numbers sort higher, so subtract the term number instead
          long pair = (((long) c) << 32) + (Integer.MAX_VALUE - i);
          boolean displaced = queue.insert(pair);
          if (displaced) min = (int) (queue.top() >>> 32);
        }
      }

      // if we are deep paging, we don't have to order the highest "offset" counts.
      int collectCount = Math.max(0, queue.size() - off);
      assert collectCount <= lim;

      // the start and end indexes of our list "sorted" (starting with the highest value)
      int sortedIdxStart = queue.size() - (collectCount - 1);
      int sortedIdxEnd = queue.size() + 1;
      final long[] sorted = queue.sort(collectCount);

      for (int i = sortedIdxStart; i < sortedIdxEnd; i++) {
        long pair = sorted[i];
        int c = (int) (pair >>> 32);
        int tnum = Integer.MAX_VALUE - (int) pair;
        final BytesRef term = si.lookupOrd(startTermIndex + tnum);
        ft.indexedToReadable(term, charsRef);
        res.add(charsRef.toString(), c);
      }

    } else {
      // add results in index order
      int i = (startTermIndex == -1) ? 1 : 0;
      if (mincount <= 0 && contains == null) {
        // if mincount<=0 and we're not examining the values for contains, then
        // we won't discard any terms and we know exactly where to start.
        i += off;
        off = 0;
      }

      for (; i < nTerms; i++) {
        int c = counts[i];
        if (c < mincount) continue;
        BytesRef term = null;
        if (contains != null) {
          term = si.lookupOrd(startTermIndex + i);
          if (!SimpleFacets.contains(term.utf8ToString(), contains, ignoreCase)) {
            continue;
          }
        }
        if (--off >= 0) continue;
        if (--lim < 0) break;
        if (term == null) {
          term = si.lookupOrd(startTermIndex + i);
        }
        ft.indexedToReadable(term, charsRef);
        res.add(charsRef.toString(), c);
      }
    }
  }

  return finalize(res, searcher, schemaField, docs, missingCount, missing);
}
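The priority-queue branch above packs each (count, term index) pair into one long so a single primitive comparison orders candidates by count descending, then term index ascending. A small sketch of the packing, with hypothetical helper names mirroring the code's own bit arithmetic:

  // High 32 bits: the count, so larger counts compare larger.
  // Low 32 bits: Integer.MAX_VALUE - i, so for equal counts a smaller
  // term index i yields a larger packed value and wins the tie.
  static long pack(int count, int i) {
    return (((long) count) << 32) + (Integer.MAX_VALUE - i);
  }

  static int unpackCount(long pair) {
    return (int) (pair >>> 32);
  }

  static int unpackTermIndex(long pair) {
    return Integer.MAX_VALUE - (int) pair;
  }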
@Override
public Directory getCompoundReader(Directory dir, SegmentInfo si, IOContext context)
    throws IOException {
  String dataFile = IndexFileNames.segmentFileName(si.name, "", DATA_EXTENSION);
  final IndexInput in = dir.openInput(dataFile, context);

  BytesRefBuilder scratch = new BytesRefBuilder();

  // first get to TOC:
  DecimalFormat df =
      new DecimalFormat(OFFSETPATTERN, DecimalFormatSymbols.getInstance(Locale.ROOT));
  long pos = in.length() - TABLEPOS.length - OFFSETPATTERN.length() - 1;
  in.seek(pos);
  SimpleTextUtil.readLine(in, scratch);
  assert StringHelper.startsWith(scratch.get(), TABLEPOS);
  long tablePos = -1;
  try {
    tablePos = df.parse(stripPrefix(scratch, TABLEPOS)).longValue();
  } catch (ParseException e) {
    throw new CorruptIndexException(
        "can't parse CFS trailer, got: " + scratch.get().utf8ToString(), in);
  }

  // seek to TOC and read it
  in.seek(tablePos);
  SimpleTextUtil.readLine(in, scratch);
  assert StringHelper.startsWith(scratch.get(), TABLE);
  int numEntries = Integer.parseInt(stripPrefix(scratch, TABLE));

  final String[] fileNames = new String[numEntries];
  final long[] startOffsets = new long[numEntries];
  final long[] endOffsets = new long[numEntries];

  for (int i = 0; i < numEntries; i++) {
    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch.get(), TABLENAME);
    fileNames[i] = si.name + IndexFileNames.stripSegmentName(stripPrefix(scratch, TABLENAME));

    if (i > 0) {
      // files must be unique and in sorted order
      assert fileNames[i].compareTo(fileNames[i - 1]) > 0;
    }

    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch.get(), TABLESTART);
    startOffsets[i] = Long.parseLong(stripPrefix(scratch, TABLESTART));

    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch.get(), TABLEEND);
    endOffsets[i] = Long.parseLong(stripPrefix(scratch, TABLEEND));
  }

  return new Directory() {

    private int getIndex(String name) throws IOException {
      int index = Arrays.binarySearch(fileNames, name);
      if (index < 0) {
        throw new FileNotFoundException(
            "No sub-file found (fileName=" + name + " files: " + Arrays.toString(fileNames) + ")");
      }
      return index;
    }

    @Override
    public String[] listAll() throws IOException {
      ensureOpen();
      return fileNames.clone();
    }

    @Override
    public long fileLength(String name) throws IOException {
      ensureOpen();
      int index = getIndex(name);
      return endOffsets[index] - startOffsets[index];
    }

    @Override
    public IndexInput openInput(String name, IOContext context) throws IOException {
      ensureOpen();
      int index = getIndex(name);
      return in.slice(name, startOffsets[index], endOffsets[index] - startOffsets[index]);
    }

    @Override
    public void close() throws IOException {
      in.close();
    }

    // write methods: disabled

    @Override
    public IndexOutput createOutput(String name, IOContext context) {
      throw new UnsupportedOperationException();
    }

    @Override
    public void sync(Collection<String> names) {
      throw new UnsupportedOperationException();
    }

    @Override
    public void deleteFile(String name) {
      throw new UnsupportedOperationException();
    }

    @Override
    public void renameFile(String source, String dest) {
      throw new UnsupportedOperationException();
    }

    @Override
    public Lock makeLock(String name) {
      throw new UnsupportedOperationException();
    }
  };
}
@Override
public BytesRef getBytesRef() {
  return bytes.get();
}
@Override
protected void doSetNextReader(LeafReaderContext context) throws IOException {
  if (segmentFacetCounts != null) {
    segmentResults.add(createSegmentResult());
  }

  groupFieldTermsIndex = DocValues.getSorted(context.reader(), groupField);
  facetFieldDocTermOrds = DocValues.getSortedSet(context.reader(), facetField);
  facetFieldNumTerms = (int) facetFieldDocTermOrds.getValueCount();
  if (facetFieldNumTerms == 0) {
    facetOrdTermsEnum = null;
  } else {
    facetOrdTermsEnum = facetFieldDocTermOrds.termsEnum();
  }
  // [facetFieldNumTerms() + 1] for all possible facet values and docs not containing facet field
  segmentFacetCounts = new int[facetFieldNumTerms + 1];
  segmentTotalCount = 0;

  segmentGroupedFacetHits.clear();
  for (GroupedFacetHit groupedFacetHit : groupedFacetHits) {
    int groupOrd =
        groupedFacetHit.groupValue == null
            ? -1
            : groupFieldTermsIndex.lookupTerm(groupedFacetHit.groupValue);
    if (groupedFacetHit.groupValue != null && groupOrd < 0) {
      continue;
    }

    int facetOrd;
    if (groupedFacetHit.facetValue != null) {
      if (facetOrdTermsEnum == null || !facetOrdTermsEnum.seekExact(groupedFacetHit.facetValue)) {
        continue;
      }
      facetOrd = (int) facetOrdTermsEnum.ord();
    } else {
      facetOrd = facetFieldNumTerms;
    }

    // (facetFieldDocTermOrds.numTerms() + 1) for all possible facet values and docs not
    // containing facet field
    int segmentGroupedFacetsIndex = groupOrd * (facetFieldNumTerms + 1) + facetOrd;
    segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
  }

  if (facetPrefix != null) {
    TermsEnum.SeekStatus seekStatus;
    if (facetOrdTermsEnum != null) {
      seekStatus = facetOrdTermsEnum.seekCeil(facetPrefix);
    } else {
      seekStatus = TermsEnum.SeekStatus.END;
    }

    if (seekStatus != TermsEnum.SeekStatus.END) {
      startFacetOrd = (int) facetOrdTermsEnum.ord();
    } else {
      startFacetOrd = 0;
      endFacetOrd = 0;
      return;
    }

    BytesRefBuilder facetEndPrefix = new BytesRefBuilder();
    facetEndPrefix.append(facetPrefix);
    facetEndPrefix.append(UnicodeUtil.BIG_TERM);
    seekStatus = facetOrdTermsEnum.seekCeil(facetEndPrefix.get());
    if (seekStatus != TermsEnum.SeekStatus.END) {
      endFacetOrd = (int) facetOrdTermsEnum.ord();
    } else {
      endFacetOrd = facetFieldNumTerms; // Don't include null...
    }
  } else {
    startFacetOrd = 0;
    endFacetOrd = facetFieldNumTerms + 1;
  }
}