/**
 * Estimates the heap used by this instance: the fixed base size plus whatever
 * {@link RamUsageEstimator} reports for the {@code offsets} field and for the backing arrays of
 * the byte and UTF-16 scratch buffers.
 */
@Override
public long ramBytesUsed() {
  long total = BASE_RAM_BYTES_USED;
  total += RamUsageEstimator.sizeOf(offsets);
  total += RamUsageEstimator.sizeOf(scratch.bytes());
  total += RamUsageEstimator.sizeOf(scratchUTF16.chars());
  return total;
}
/* Decodes only the term bytes of the next term. If caller then asks for metadata, ie docFreq, totalTermFreq or pulls a D/&PEnum, we then (lazily) decode all metadata up to the current term. */ private BytesRef _next() throws IOException { // System.out.println("BTR._next seg=" + segment + " this=" + this + " termCount=" + // state.termBlockOrd + " (vs " + blockTermCount + ")"); if (state.termBlockOrd == blockTermCount && !nextBlock()) { // System.out.println(" eof"); indexIsCurrent = false; return null; } // TODO: cutover to something better for these ints! simple64? final int suffix = termSuffixesReader.readVInt(); // System.out.println(" suffix=" + suffix); term.setLength(termBlockPrefix + suffix); term.grow(term.length()); termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix); state.termBlockOrd++; // NOTE: meaningless in the non-ord case state.ord++; // System.out.println(" return term=" + fieldInfo.name + ":" + term.utf8ToString() + " " + // term + " tbOrd=" + state.termBlockOrd); return term.get(); }
/**
 * Reads the next line, which is expected to start with {@code VALUE}, and passes everything after
 * that prefix to the visitor callback selected by {@code type}.
 *
 * <p>{@code type} is matched by reference against the {@code TYPE_*} constants. String and binary
 * values are handed over as a fresh byte[] copy; int, long, float and double values are decoded
 * from UTF-8 and parsed. A type matching none of the constants consumes the line and reports
 * nothing.
 */
private void readField(BytesRef type, FieldInfo fieldInfo, StoredFieldVisitor visitor)
    throws IOException {
  readLine();
  assert StringHelper.startsWith(scratch.get(), VALUE);

  final int valueStart = VALUE.length;
  final int valueLength = scratch.length() - valueStart;

  // Raw-bytes types: copy the payload out of the shared scratch buffer.
  if (type == TYPE_STRING || type == TYPE_BINARY) {
    final byte[] value = new byte[valueLength];
    System.arraycopy(scratch.bytes(), valueStart, value, 0, valueLength);
    if (type == TYPE_STRING) {
      visitor.stringField(fieldInfo, value);
    } else {
      visitor.binaryField(fieldInfo, value);
    }
    return;
  }

  final boolean numeric =
      type == TYPE_INT || type == TYPE_LONG || type == TYPE_FLOAT || type == TYPE_DOUBLE;
  if (!numeric) {
    return;
  }

  // Numeric types: decode the payload to text once, then parse it as the requested type.
  scratchUTF16.copyUTF8Bytes(scratch.bytes(), valueStart, valueLength);
  final String text = scratchUTF16.toString();
  if (type == TYPE_INT) {
    visitor.intField(fieldInfo, Integer.parseInt(text));
  } else if (type == TYPE_LONG) {
    visitor.longField(fieldInfo, Long.parseLong(text));
  } else if (type == TYPE_FLOAT) {
    visitor.floatField(fieldInfo, Float.parseFloat(text));
  } else {
    visitor.doubleField(fieldInfo, Double.parseDouble(text));
  }
}
/**
 * Debugging aid: prints to {@code out} one line per frame, from frame 0 up to and including
 * {@code currentFrame}, describing the state left behind by the previous seek. When
 * {@code fr.index} is non-null each frame is also cross-checked against it.
 *
 * @param out stream receiving the dump
 * @throws RuntimeException if a frame disagrees with the index (arc label, missing prefix, or
 *     mismatching block code)
 */
@SuppressWarnings("unused")
private void printSeekState(PrintStream out) throws IOException {
  if (currentFrame == staticFrame) {
    out.println(" no prior seek");
    return;
  }

  out.println(" prior seek state:");
  boolean isSeekFrame = true;
  for (int ord = 0; ; ord++) {
    final IDVersionSegmentTermsEnumFrame f = getFrame(ord);
    assert f != null;
    final BytesRef prefix = new BytesRef(term.bytes(), 0, f.prefix);

    // A frame whose nextEnt is still -1 is reported without the "loaded" details.
    final boolean loaded = f.nextEnt != -1;
    final long printedCode =
        (f.fp << VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS)
            + (f.hasTerms ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS : 0)
            + (f.isFloor ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR : 0);

    final StringBuilder line = new StringBuilder(" frame ");
    if (isSeekFrame) {
      line.append(loaded ? "(seek, loaded)" : "(seek)");
    } else {
      line.append(loaded ? "(next, loaded)" : "(next)");
    }
    line.append(" ord=").append(ord);
    line.append(" fp=").append(f.fp);
    if (f.isFloor) {
      line.append(" (fpOrig=").append(f.fpOrig).append(")");
    }
    line.append(" prefixLen=").append(f.prefix);
    line.append(" prefix=").append(brToString(prefix));
    if (loaded) {
      line.append(" nextEnt=").append(f.nextEnt);
      line.append(" (of ").append(f.entCount).append(")");
    }
    line.append(" hasTerms=").append(f.hasTerms);
    line.append(" isFloor=").append(f.isFloor);
    line.append(" code=").append(printedCode);
    if (loaded) {
      line.append(" lastSubFP=").append(f.lastSubFP);
    }
    line.append(" isLastInFloor=").append(f.isLastInFloor);
    line.append(" mdUpto=").append(f.metaDataUpto);
    line.append(" tbOrd=").append(f.getTermBlockOrd());
    out.println(line.toString());

    if (fr.index != null) {
      assert !isSeekFrame || f.arc != null : "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc;

      // The arc of a seek frame must be labelled with the last byte of that frame's prefix.
      if (f.prefix > 0 && isSeekFrame) {
        final int arcLabel = f.arc.label;
        final int lastPrefixByte = term.byteAt(f.prefix - 1) & 0xFF;
        if (arcLabel != lastPrefixByte) {
          out.println(
              " broken seek state: arc.label="
                  + (char) arcLabel
                  + " vs term byte="
                  + (char) lastPrefixByte);
          throw new RuntimeException("seek state is broken");
        }
      }

      final Pair<BytesRef, Long> output = Util.get(fr.index, prefix);
      if (output == null) {
        out.println(" broken seek state: prefix is not final in index");
        throw new RuntimeException("seek state is broken");
      }

      // For non-floor seek frames, the code stored in the index must equal the frame's own code.
      if (isSeekFrame && !f.isFloor) {
        final ByteArrayDataInput reader =
            new ByteArrayDataInput(
                output.output1.bytes, output.output1.offset, output.output1.length);
        final long codeOrig = reader.readVLong();
        final long code =
            (f.fp << VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS)
                | (f.hasTerms ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS : 0)
                | (f.isFloor ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR : 0);
        if (codeOrig != code) {
          out.println(
              " broken seek state: output code="
                  + codeOrig
                  + " doesn't match frame code="
                  + code);
          throw new RuntimeException("seek state is broken");
        }
      }
    }

    if (f == currentFrame) {
      break;
    }
    // Frames after the one whose prefix length equals validIndexPrefix are labelled "next".
    if (f.prefix == validIndexPrefix) {
      isSeekFrame = false;
    }
  }
}
/** Builds an {@link SynonymMap} and returns it. */ public SynonymMap build() throws IOException { ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); // TODO: are we using the best sharing options? org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE4, outputs); BytesRefBuilder scratch = new BytesRefBuilder(); ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput(); final Set<Integer> dedupSet; if (dedup) { dedupSet = new HashSet<>(); } else { dedupSet = null; } final byte[] spare = new byte[5]; Set<CharsRef> keys = workingSet.keySet(); CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]); Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator()); final IntsRefBuilder scratchIntsRef = new IntsRefBuilder(); // System.out.println("fmap.build"); for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) { CharsRef input = sortedKeys[keyIdx]; MapEntry output = workingSet.get(input); int numEntries = output.ords.size(); // output size, assume the worst case int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry scratch.grow(estimatedSize); scratchOutput.reset(scratch.bytes()); // now write our output data: int count = 0; for (int i = 0; i < numEntries; i++) { if (dedupSet != null) { // box once final Integer ent = output.ords.get(i); if (dedupSet.contains(ent)) { continue; } dedupSet.add(ent); } scratchOutput.writeVInt(output.ords.get(i)); count++; } final int pos = scratchOutput.getPosition(); scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 
0 : 1)); final int pos2 = scratchOutput.getPosition(); final int vIntLen = pos2 - pos; // Move the count + includeOrig to the front of the byte[]: System.arraycopy(scratch.bytes(), pos, spare, 0, vIntLen); System.arraycopy(scratch.bytes(), 0, scratch.bytes(), vIntLen, pos); System.arraycopy(spare, 0, scratch.bytes(), 0, vIntLen); if (dedupSet != null) { dedupSet.clear(); } scratch.setLength(scratchOutput.getPosition()); // System.out.println(" add input=" + input + " output=" + scratch + " offset=" + // scratch.offset + " length=" + scratch.length + " count=" + count); builder.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef()); } FST<BytesRef> fst = builder.finish(); return new SynonymMap(fst, words, maxHorizontalContext); }
/**
 * Parses an int from the bytes of {@code scratch} starting at {@code offset} and running to the
 * end of its current content. The bytes are first decoded from UTF-8 into {@code scratchUTF16},
 * which is overwritten as a side effect.
 */
private int parseIntAt(int offset) {
  final int byteCount = scratch.length() - offset;
  scratchUTF16.copyUTF8Bytes(scratch.bytes(), offset, byteCount);
  final char[] digits = scratchUTF16.chars();
  final int digitCount = scratchUTF16.length();
  return ArrayUtil.parseInt(digits, 0, digitCount);
}
// helper method to strip strip away 'prefix' from 'scratch' and return as String private String stripPrefix(BytesRefBuilder scratch, BytesRef prefix) throws IOException { return new String( scratch.bytes(), prefix.length, scratch.length() - prefix.length, StandardCharsets.UTF_8); }
// TODO: we may want an alternate mode here which is
// "if you are about to return NOT_FOUND I won't use
// the terms data from that"; eg FuzzyTermsEnum will
// (usually) just immediately call seek again if we
// return NOT_FOUND so it's a waste for us to fill in
// the term that was actually NOT_FOUND
/**
 * Positions this enum on {@code target} or, if it is absent, on the first term that compares
 * greater than it.
 *
 * <p>The method first decides whether a seek through the terms index is needed, or whether the
 * target can only lie between the current term and the next indexed term (in which case it keeps
 * scanning from the current position). It then scans term suffixes block by block, comparing
 * bytes directly against {@code termSuffixes} and only copying bytes into {@code term} when it is
 * about to return or to cross into the next block.
 *
 * @param target the term to look for
 * @return {@code FOUND} when a term equal to {@code target} is reached, {@code NOT_FOUND} when
 *     the scan stops on a term that compares greater than {@code target}, {@code END} when
 *     {@code nextBlock()} reports no more blocks
 * @throws IllegalStateException if {@code indexEnum} is null
 */
@Override
public SeekStatus seekCeil(final BytesRef target) throws IOException {
  if (indexEnum == null) {
    throw new IllegalStateException("terms index was not loaded");
  }

  // System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" +
  // target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term()
  // + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending="
  // + seekPending + " divisor=" + indexReader.getDivisor() + " this=" + this);

  // Both branches below are intentionally empty: they only hold disabled debug output.
  if (didIndexNext) {
    if (nextIndexTerm == null) {
      // System.out.println(" nextIndexTerm=null");
    } else {
      // System.out.println(" nextIndexTerm=" + nextIndexTerm.utf8ToString());
    }
  }

  boolean doSeek = true;

  // See if we can avoid seeking, because target term
  // is after current term but before next index term:
  if (indexIsCurrent) {

    final int cmp = BytesRef.getUTF8SortedAsUnicodeComparator().compare(term.get(), target);

    if (cmp == 0) {
      // Already at the requested term
      return SeekStatus.FOUND;
    } else if (cmp < 0) {

      // Target term is after current term
      if (!didIndexNext) {
        // Advance the index enum once and remember its next term (null when next() returns -1);
        // didIndexNext prevents doing this again until the next real seek.
        if (indexEnum.next() == -1) {
          nextIndexTerm = null;
        } else {
          nextIndexTerm = indexEnum.term();
        }
        // System.out.println(" now do index next() nextIndexTerm=" + (nextIndexTerm == null
        // ? "null" : nextIndexTerm.utf8ToString()));
        didIndexNext = true;
      }

      if (nextIndexTerm == null
          || BytesRef.getUTF8SortedAsUnicodeComparator().compare(target, nextIndexTerm) < 0) {
        // Optimization: requested term is within the
        // same term block we are now in; skip seeking
        // (but do scanning):
        doSeek = false;
        // System.out.println(" skip seek: nextIndexTerm=" + (nextIndexTerm == null ? "null"
        // : nextIndexTerm.utf8ToString()));
      }
    }
    // cmp > 0 (target is before the current term) falls through with doSeek still true.
  }

  if (doSeek) {
    // System.out.println(" seek");

    // Ask terms index to find biggest indexed term (=
    // first term in a block) that's <= our text:
    in.seek(indexEnum.seek(target));
    boolean result = nextBlock();

    // Block must exist since, at least, the indexed term
    // is in the block:
    assert result;

    indexIsCurrent = true;
    didIndexNext = false;

    if (doOrd) {
      // One less than the index term's ord: the scan below increments state.ord for every term
      // it passes, starting with the first term of the block.
      state.ord = indexEnum.ord() - 1;
    }

    // Seed 'term' with the index term so the common-prefix comparison below has bytes to read.
    term.copyBytes(indexEnum.term());
    // System.out.println(" seek: term=" + term.utf8ToString());
  } else {
    // System.out.println(" skip seek");
    // No seek: if the current block is already used up, move to the next one before scanning.
    if (state.termBlockOrd == blockTermCount && !nextBlock()) {
      indexIsCurrent = false;
      return SeekStatus.END;
    }
  }

  seekPending = false;

  // Number of leading bytes of the block prefix already verified equal to the target.
  int common = 0;

  // Scan within block. We could do this by calling
  // _next() and testing the resulting term, but this
  // is wasteful. Instead, we first confirm the
  // target matches the common prefix of this block,
  // and then we scan the term bytes directly from the
  // termSuffixesreader's byte[], saving a copy into
  // the BytesRef term per term. Only when we return
  // do we then copy the bytes into the term.

  while (true) {

    // First, see if target term matches common prefix
    // in this block:
    if (common < termBlockPrefix) {
      // NOTE(review): target.bytes is indexed at target.offset + common without comparing
      // 'common' against target.length; confirm a target shorter than the block prefix cannot
      // reach this line.
      final int cmp =
          (term.byteAt(common) & 0xFF) - (target.bytes[target.offset + common] & 0xFF);
      if (cmp < 0) {

        // TODO: maybe we should store common prefix
        // in block header? (instead of relying on
        // last term of previous block)

        // Target's prefix is after the common block
        // prefix, so term cannot be in this block
        // but it could be in next block. We
        // must scan to end-of-block to set common
        // prefix for next block:
        if (state.termBlockOrd < blockTermCount) {
          // Skip every remaining term except the last one...
          while (state.termBlockOrd < blockTermCount - 1) {
            state.termBlockOrd++;
            state.ord++;
            termSuffixesReader.skipBytes(termSuffixesReader.readVInt());
          }
          // ...and load the last term's bytes into 'term'.
          final int suffix = termSuffixesReader.readVInt();
          term.setLength(termBlockPrefix + suffix);
          term.grow(term.length());
          termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
        }
        state.ord++;

        if (!nextBlock()) {
          indexIsCurrent = false;
          return SeekStatus.END;
        }
        // New block: restart the prefix comparison from the first byte.
        common = 0;

      } else if (cmp > 0) {
        // Target's prefix is before the common prefix
        // of this block, so we position to start of
        // block and return NOT_FOUND:
        assert state.termBlockOrd == 0;

        final int suffix = termSuffixesReader.readVInt();
        term.setLength(termBlockPrefix + suffix);
        term.grow(term.length());
        termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
        return SeekStatus.NOT_FOUND;
      } else {
        common++;
      }

      continue;
    }

    // Test every term in this block
    while (true) {
      state.termBlockOrd++;
      state.ord++;

      final int suffix = termSuffixesReader.readVInt();

      // We know the prefix matches, so just compare the new suffix:
      final int termLen = termBlockPrefix + suffix;
      // Compare straight out of termSuffixes; the reader itself is not advanced by this loop.
      int bytePos = termSuffixesReader.getPosition();

      // Set when the current term sorts before the target and scanning must continue.
      boolean next = false;
      // Compare only up to the shorter of the two lengths.
      final int limit = target.offset + (termLen < target.length ? termLen : target.length);
      int targetPos = target.offset + termBlockPrefix;
      while (targetPos < limit) {
        final int cmp = (termSuffixes[bytePos++] & 0xFF) - (target.bytes[targetPos++] & 0xFF);
        if (cmp < 0) {
          // Current term is still before the target;
          // keep scanning
          next = true;
          break;
        } else if (cmp > 0) {
          // Done! Current term is after target. Stop
          // here, fill in real term, return NOT_FOUND.
          term.setLength(termBlockPrefix + suffix);
          term.grow(term.length());
          termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
          // System.out.println(" NOT_FOUND");
          return SeekStatus.NOT_FOUND;
        }
      }

      // All compared bytes were equal and the target is not longer than this term: the term is
      // either an exact match or the target extended by more bytes.
      if (!next && target.length <= termLen) {
        term.setLength(termBlockPrefix + suffix);
        term.grow(term.length());
        termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);

        if (target.length == termLen) {
          // Done! Exact match. Stop here, fill in
          // real term, return FOUND.
          // System.out.println(" FOUND");
          return SeekStatus.FOUND;
        } else {
          // System.out.println(" NOT_FOUND");
          return SeekStatus.NOT_FOUND;
        }
      }

      if (state.termBlockOrd == blockTermCount) {
        // Must pre-fill term for next block's common prefix
        term.setLength(termBlockPrefix + suffix);
        term.grow(term.length());
        termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
        break;
      } else {
        // Not a match and not the last term: step the reader over this suffix.
        termSuffixesReader.skipBytes(suffix);
      }
    }

    // The purpose of the terms dict index is to seek
    // the enum to the closest index term before the
    // term we are looking for. So, we should never
    // cross another index term (besides the first
    // one) while we are scanning:

    assert indexIsCurrent;

    if (!nextBlock()) {
      // System.out.println(" END");
      indexIsCurrent = false;
      return SeekStatus.END;
    }
    common = 0;
  }
}
/**
 * Decodes the UTF-8 bytes of the given {@code scratch} buffer, from {@code offset} to the end of
 * its current content, and returns them as a String. The shared {@code scratchUTF16} buffer is
 * used for the conversion and is overwritten.
 */
private String readString(int offset, BytesRefBuilder scratch) {
  final byte[] source = scratch.bytes();
  final int byteCount = scratch.length() - offset;
  scratchUTF16.copyUTF8Bytes(source, offset, byteCount);
  return scratchUTF16.toString();
}
@Override public Fields get(int doc) throws IOException { SortedMap<String, SimpleTVTerms> fields = new TreeMap<>(); in.seek(offsets[doc]); readLine(); assert StringHelper.startsWith(scratch.get(), NUMFIELDS); int numFields = parseIntAt(NUMFIELDS.length); if (numFields == 0) { return null; // no vectors for this doc } for (int i = 0; i < numFields; i++) { readLine(); assert StringHelper.startsWith(scratch.get(), FIELD); // skip fieldNumber: parseIntAt(FIELD.length); readLine(); assert StringHelper.startsWith(scratch.get(), FIELDNAME); String fieldName = readString(FIELDNAME.length, scratch); readLine(); assert StringHelper.startsWith(scratch.get(), FIELDPOSITIONS); boolean positions = Boolean.parseBoolean(readString(FIELDPOSITIONS.length, scratch)); readLine(); assert StringHelper.startsWith(scratch.get(), FIELDOFFSETS); boolean offsets = Boolean.parseBoolean(readString(FIELDOFFSETS.length, scratch)); readLine(); assert StringHelper.startsWith(scratch.get(), FIELDPAYLOADS); boolean payloads = Boolean.parseBoolean(readString(FIELDPAYLOADS.length, scratch)); readLine(); assert StringHelper.startsWith(scratch.get(), FIELDTERMCOUNT); int termCount = parseIntAt(FIELDTERMCOUNT.length); SimpleTVTerms terms = new SimpleTVTerms(offsets, positions, payloads); fields.put(fieldName, terms); BytesRefBuilder term = new BytesRefBuilder(); for (int j = 0; j < termCount; j++) { readLine(); assert StringHelper.startsWith(scratch.get(), TERMTEXT); int termLength = scratch.length() - TERMTEXT.length; term.grow(termLength); term.setLength(termLength); System.arraycopy(scratch.bytes(), TERMTEXT.length, term.bytes(), 0, termLength); SimpleTVPostings postings = new SimpleTVPostings(); terms.terms.put(term.toBytesRef(), postings); readLine(); assert StringHelper.startsWith(scratch.get(), TERMFREQ); postings.freq = parseIntAt(TERMFREQ.length); if (positions || offsets) { if (positions) { postings.positions = new int[postings.freq]; if (payloads) { postings.payloads = new 
BytesRef[postings.freq]; } } if (offsets) { postings.startOffsets = new int[postings.freq]; postings.endOffsets = new int[postings.freq]; } for (int k = 0; k < postings.freq; k++) { if (positions) { readLine(); assert StringHelper.startsWith(scratch.get(), POSITION); postings.positions[k] = parseIntAt(POSITION.length); if (payloads) { readLine(); assert StringHelper.startsWith(scratch.get(), PAYLOAD); if (scratch.length() - PAYLOAD.length == 0) { postings.payloads[k] = null; } else { byte payloadBytes[] = new byte[scratch.length() - PAYLOAD.length]; System.arraycopy( scratch.bytes(), PAYLOAD.length, payloadBytes, 0, payloadBytes.length); postings.payloads[k] = new BytesRef(payloadBytes); } } } if (offsets) { readLine(); assert StringHelper.startsWith(scratch.get(), STARTOFFSET); postings.startOffsets[k] = parseIntAt(STARTOFFSET.length); readLine(); assert StringHelper.startsWith(scratch.get(), ENDOFFSET); postings.endOffsets[k] = parseIntAt(ENDOFFSET.length); } } } } } return new SimpleTVFields(fields); }