@Override
public List<LuceneSegmentInputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
  Configuration configuration = context.getConfiguration();

  LuceneStorageConfiguration lucene2SeqConfiguration = new LuceneStorageConfiguration(configuration);

  List<LuceneSegmentInputSplit> inputSplits = new ArrayList<>();

  List<Path> indexPaths = lucene2SeqConfiguration.getIndexPaths();
  for (Path indexPath : indexPaths) {
    ReadOnlyFileSystemDirectory directory =
        new ReadOnlyFileSystemDirectory(FileSystem.get(configuration), indexPath, false, configuration);
    SegmentInfos segmentInfos = new SegmentInfos();
    segmentInfos.read(directory);

    for (SegmentCommitInfo segmentInfo : segmentInfos) {
      LuceneSegmentInputSplit inputSplit =
          new LuceneSegmentInputSplit(indexPath, segmentInfo.info.name, segmentInfo.sizeInBytes());
      inputSplits.add(inputSplit);
      LOG.info("Created {} byte input split for index '{}' segment {}",
          segmentInfo.sizeInBytes(), indexPath.toUri(), segmentInfo.info.name);
    }
  }

  return inputSplits;
}
/* Removes any BufferedDeletes that we no longer need to
 * store because all segments in the index have had the
 * deletes applied. */
public synchronized void prune(SegmentInfos segmentInfos) {
  assert checkDeleteStats();

  long minGen = Long.MAX_VALUE;
  for (SegmentCommitInfo info : segmentInfos) {
    minGen = Math.min(info.getBufferedDeletesGen(), minGen);
  }

  if (infoStream.isEnabled("BD")) {
    infoStream.message("BD", "prune sis=" + segmentInfos + " minGen=" + minGen + " packetCount=" + updates.size());
  }

  final int limit = updates.size();
  for (int delIDX = 0; delIDX < limit; delIDX++) {
    if (updates.get(delIDX).delGen() >= minGen) {
      prune(delIDX);
      assert checkDeleteStats();
      return;
    }
  }

  // All deletes pruned
  prune(limit);
  assert !any();
  assert checkDeleteStats();
}
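/*
 * Hypothetical, self-contained sketch (invented names and values, not the Lucene implementation)
 * of the pruning rule above: a packet can be dropped once its delGen is below the smallest
 * buffered-deletes generation of any live segment, because every segment has already seen it.
 */
static int firstPacketToKeep(long[] segmentGens, long[] packetDelGens) {
  long minGen = Long.MAX_VALUE;
  for (long gen : segmentGens) {
    minGen = Math.min(gen, minGen);          // smallest generation still needed by any segment
  }
  for (int i = 0; i < packetDelGens.length; i++) {
    if (packetDelGens[i] >= minGen) {
      return i;                              // everything before index i can be pruned
    }
  }
  return packetDelGens.length;               // all packets applied everywhere: prune them all
}
// e.g. firstPacketToKeep(new long[] {5, 7, 9}, new long[] {2, 4, 5, 8}) == 2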
/** Clones {@code this}, optionally also cloning the {@link SegmentInfo}. */
SegmentCommitInfo clone(boolean cloneSegmentInfo) {
  SegmentInfo otherInfo;
  if (cloneSegmentInfo) {
    otherInfo = info.clone();
  } else {
    otherInfo = info;
  }
  SegmentCommitInfo other = new SegmentCommitInfo(otherInfo, delCount, delGen, fieldInfosGen, docValuesGen);
  // Not clear that we need to carry over nextWriteDelGen
  // (i.e. do we ever clone after a failed write and
  // before the next successful write?), but just do it to
  // be safe:
  other.nextWriteDelGen = nextWriteDelGen;
  other.nextWriteFieldInfosGen = nextWriteFieldInfosGen;
  other.nextWriteDocValuesGen = nextWriteDocValuesGen;

  // deep clone
  for (Entry<Long, Set<String>> e : genUpdatesFiles.entrySet()) {
    other.genUpdatesFiles.put(e.getKey(), new HashSet<>(e.getValue()));
  }

  // deep clone
  for (Entry<Integer, Set<String>> e : dvUpdatesFiles.entrySet()) {
    other.dvUpdatesFiles.put(e.getKey(), new HashSet<>(e.getValue()));
  }

  other.fieldInfosFiles.addAll(fieldInfosFiles);

  return other;
}
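/*
 * Hypothetical sketch (not from the original source) of why clone(boolean) deep-copies the
 * per-generation file sets above: copying only the outer map would leave both copies sharing
 * the same inner Set instances, so a mutation through one would be visible through the other.
 * All class names and file strings below are invented for illustration.
 */
import java.util.*;

class DeepCopyDemo {
  public static void main(String[] args) {
    Map<Long, Set<String>> original = new HashMap<>();
    original.put(1L, new HashSet<>(Collections.singleton("_0_1.del")));

    // Shallow copy: the inner Set is shared with the original.
    Map<Long, Set<String>> shallow = new HashMap<>(original);

    // Deep copy: a fresh Set per generation, mirroring the loops in clone(boolean).
    Map<Long, Set<String>> deep = new HashMap<>();
    for (Map.Entry<Long, Set<String>> e : original.entrySet()) {
      deep.put(e.getKey(), new HashSet<>(e.getValue()));
    }

    shallow.get(1L).add("_0_2.del");
    System.out.println(original.get(1L).size()); // 2: the shared Set leaked the mutation
    System.out.println(deep.get(1L).size());     // 1: the deep copy is isolated
  }
}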
List<SegmentCommitInfo> createBackupSegmentInfos() {
  final List<SegmentCommitInfo> list = new ArrayList<>(size());
  for (final SegmentCommitInfo info : this) {
    assert info.info.getCodec() != null;
    list.add(info.clone());
  }
  return list;
}
@Override
public int compare(SegmentCommitInfo si1, SegmentCommitInfo si2) {
  final long cmp = si1.getBufferedDeletesGen() - si2.getBufferedDeletesGen();
  if (cmp > 0) {
    return 1;
  } else if (cmp < 0) {
    return -1;
  } else {
    return 0;
  }
}
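/*
 * Aside, a hedged sketch rather than a change to the comparator above: Long.compare expresses
 * the same ordering by buffered-deletes generation without the (theoretical) overflow risk of
 * subtracting two longs. The constant name is invented; java.util.Comparator is assumed imported.
 */
static final Comparator<SegmentCommitInfo> BY_DEL_GEN =
    (si1, si2) -> Long.compare(si1.getBufferedDeletesGen(), si2.getBufferedDeletesGen());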
/** Returns a readable description of this segment list. */
@Override
public String toString() {
  StringBuilder buffer = new StringBuilder();
  buffer.append(getSegmentsFileName()).append(": ");
  final int count = size();
  for (int i = 0; i < count; i++) {
    if (i > 0) {
      buffer.append(' ');
    }
    final SegmentCommitInfo info = info(i);
    buffer.append(info.toString(0));
  }
  return buffer.toString();
}
/**
 * Returns all file names referenced by SegmentInfo. The returned collection is recomputed on
 * each invocation.
 */
public Collection<String> files(boolean includeSegmentsFile) throws IOException {
  HashSet<String> files = new HashSet<>();
  if (includeSegmentsFile) {
    final String segmentFileName = getSegmentsFileName();
    if (segmentFileName != null) {
      files.add(segmentFileName);
    }
  }
  final int size = size();
  for (int i = 0; i < size; i++) {
    final SegmentCommitInfo info = info(i);
    files.addAll(info.files());
  }
  return files;
}
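/*
 * Hypothetical usage sketch (method name invented, not part of the original source): summing the
 * on-disk size of every file referenced by a commit. Beyond files(true) above, only
 * Directory.fileLength(String) is assumed.
 */
long commitSizeInBytes(SegmentInfos infos, Directory dir) throws IOException {
  long total = 0;
  for (String fileName : infos.files(true)) {   // include the segments_N file itself
    total += dir.fileLength(fileName);
  }
  return total;
}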
/** Returns a copy of this instance, also copying each SegmentInfo. */
@Override
public SegmentInfos clone() {
  try {
    final SegmentInfos sis = (SegmentInfos) super.clone();
    // deep clone, first recreate all collections:
    sis.segments = new ArrayList<>(size());
    for (final SegmentCommitInfo info : this) {
      assert info.info.getCodec() != null;
      // don't directly access segments, use add method!!!
      sis.add(info.clone());
    }
    sis.userData = new HashMap<>(userData);
    return sis;
  } catch (CloneNotSupportedException e) {
    throw new RuntimeException("should not happen", e);
  }
}
/**
 * Returns the byte size of the provided {@link SegmentCommitInfo}, pro-rated by the percentage
 * of non-deleted documents.
 */
protected long size(SegmentCommitInfo info) throws IOException {
  long byteSize = info.sizeInBytes();
  int delCount = writer.get().numDeletedDocs(info);
  double delRatio =
      (info.info.getDocCount() <= 0 ? 0.0f : ((float) delCount / (float) info.info.getDocCount()));
  assert delRatio <= 1.0;
  return (info.info.getDocCount() <= 0 ? byteSize : (long) (byteSize * (1.0 - delRatio)));
}
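// Worked example of the pro-rating above (all values invented for illustration):
//   byteSize = 100_000_000 bytes, docCount = 1_000_000, delCount = 250_000
//   delRatio = 250_000 / 1_000_000 = 0.25
//   pro-rated size = 100_000_000 * (1.0 - 0.25) = 75_000_000 bytes
// i.e. a 100 MB segment with 25% deleted documents is scored as roughly 75 MB.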
/**
 * Resolves the buffered deleted Term/Query/docIDs into actual deleted docIDs in the liveDocs
 * MutableBits for each SegmentReader.
 */
public synchronized ApplyDeletesResult applyDeletesAndUpdates(
    IndexWriter.ReaderPool readerPool, List<SegmentCommitInfo> infos) throws IOException {
  final long t0 = System.currentTimeMillis();

  if (infos.size() == 0) {
    return new ApplyDeletesResult(false, nextGen++, null);
  }

  assert checkDeleteStats();

  if (!any()) {
    if (infoStream.isEnabled("BD")) {
      infoStream.message("BD", "applyDeletes: no deletes; skipping");
    }
    return new ApplyDeletesResult(false, nextGen++, null);
  }

  if (infoStream.isEnabled("BD")) {
    infoStream.message("BD", "applyDeletes: infos=" + infos + " packetCount=" + updates.size());
  }

  final long gen = nextGen++;

  List<SegmentCommitInfo> infos2 = new ArrayList<SegmentCommitInfo>();
  infos2.addAll(infos);
  Collections.sort(infos2, sortSegInfoByDelGen);

  CoalescedUpdates coalescedUpdates = null;
  boolean anyNewDeletes = false;

  int infosIDX = infos2.size() - 1;
  int delIDX = updates.size() - 1;

  List<SegmentCommitInfo> allDeleted = null;

  while (infosIDX >= 0) {
    // System.out.println("BD: cycle delIDX=" + delIDX + " infoIDX=" + infosIDX);

    final FrozenBufferedUpdates packet = delIDX >= 0 ? updates.get(delIDX) : null;
    final SegmentCommitInfo info = infos2.get(infosIDX);
    final long segGen = info.getBufferedDeletesGen();

    if (packet != null && segGen < packet.delGen()) {
      // System.out.println("  coalesce");
      if (coalescedUpdates == null) {
        coalescedUpdates = new CoalescedUpdates();
      }
      if (!packet.isSegmentPrivate) {
        /*
         * Only coalesce if we are NOT on a segment private del packet: the segment private del packet
         * must only be applied to segments with the same delGen. Yet, if a segment is already deleted
         * from the SI since it had no more documents remaining after some del packets younger than
         * its segPrivate packet (higher delGen) have been applied, the segPrivate packet has not been
         * removed.
         */
        coalescedUpdates.update(packet);
      }

      delIDX--;
    } else if (packet != null && segGen == packet.delGen()) {
      assert packet.isSegmentPrivate
          : "Packet and Segments deletegen can only match on a segment private del packet gen=" + segGen;
      // System.out.println("  eq");

      // Lock order: IW -> BD -> RP
      assert readerPool.infoIsLive(info);
      final ReadersAndUpdates rld = readerPool.get(info, true);
      final SegmentReader reader = rld.getReader(IOContext.READ);
      int delCount = 0;
      final boolean segAllDeletes;
      try {
        Map<String, NumericFieldUpdates> fieldUpdates = null;
        if (coalescedUpdates != null) {
          // System.out.println("    del coalesced");
          delCount += applyTermDeletes(coalescedUpdates.termsIterable(), rld, reader);
          delCount += applyQueryDeletes(coalescedUpdates.queriesIterable(), rld, reader);
          fieldUpdates =
              applyNumericDocValuesUpdates(coalescedUpdates.numericDVUpdates, rld, reader, fieldUpdates);
        }
        // System.out.println("    del exact");
        // Don't delete by Term here; DocumentsWriterPerThread
        // already did that on flush:
        delCount += applyQueryDeletes(packet.queriesIterable(), rld, reader);
        fieldUpdates = applyNumericDocValuesUpdates(Arrays.asList(packet.updates), rld, reader, fieldUpdates);
        if (!fieldUpdates.isEmpty()) {
          rld.writeFieldUpdates(info.info.dir, fieldUpdates);
        }
        final int fullDelCount = rld.info.getDelCount() + rld.getPendingDeleteCount();
        assert fullDelCount <= rld.info.info.getDocCount();
        segAllDeletes = fullDelCount == rld.info.info.getDocCount();
      } finally {
        rld.release(reader);
        readerPool.release(rld);
      }
      anyNewDeletes |= delCount > 0;

      if (segAllDeletes) {
        if (allDeleted == null) {
          allDeleted = new ArrayList<SegmentCommitInfo>();
        }
        allDeleted.add(info);
      }

      if (infoStream.isEnabled("BD")) {
        infoStream.message("BD", "seg=" + info + " segGen=" + segGen + " segDeletes=[" + packet
            + "]; coalesced deletes=[" + (coalescedUpdates == null ? "null" : coalescedUpdates)
            + "] newDelCount=" + delCount + (segAllDeletes ? " 100% deleted" : ""));
      }

      if (coalescedUpdates == null) {
        coalescedUpdates = new CoalescedUpdates();
      }

      /*
       * Since we are on a segment private del packet we must not
       * update the coalescedDeletes here! We can simply advance to the
       * next packet and seginfo.
       */
      delIDX--;
      infosIDX--;
      info.setBufferedDeletesGen(gen);

    } else {
      // System.out.println("  gt");

      if (coalescedUpdates != null) {
        // Lock order: IW -> BD -> RP
        assert readerPool.infoIsLive(info);
        final ReadersAndUpdates rld = readerPool.get(info, true);
        final SegmentReader reader = rld.getReader(IOContext.READ);
        int delCount = 0;
        final boolean segAllDeletes;
        try {
          delCount += applyTermDeletes(coalescedUpdates.termsIterable(), rld, reader);
          delCount += applyQueryDeletes(coalescedUpdates.queriesIterable(), rld, reader);
          Map<String, NumericFieldUpdates> fieldUpdates =
              applyNumericDocValuesUpdates(coalescedUpdates.numericDVUpdates, rld, reader, null);
          if (!fieldUpdates.isEmpty()) {
            rld.writeFieldUpdates(info.info.dir, fieldUpdates);
          }
          final int fullDelCount = rld.info.getDelCount() + rld.getPendingDeleteCount();
          assert fullDelCount <= rld.info.info.getDocCount();
          segAllDeletes = fullDelCount == rld.info.info.getDocCount();
        } finally {
          rld.release(reader);
          readerPool.release(rld);
        }
        anyNewDeletes |= delCount > 0;

        if (segAllDeletes) {
          if (allDeleted == null) {
            allDeleted = new ArrayList<SegmentCommitInfo>();
          }
          allDeleted.add(info);
        }

        if (infoStream.isEnabled("BD")) {
          infoStream.message("BD", "seg=" + info + " segGen=" + segGen + " coalesced deletes=["
              + coalescedUpdates + "] newDelCount=" + delCount + (segAllDeletes ? " 100% deleted" : ""));
        }
      }
      info.setBufferedDeletesGen(gen);

      infosIDX--;
    }
  }

  assert checkDeleteStats();
  if (infoStream.isEnabled("BD")) {
    infoStream.message("BD", "applyDeletes took " + (System.currentTimeMillis() - t0) + " msec");
  }
  // assert infos != segmentInfos || !any() : "infos=" + infos + " segmentInfos=" + segmentInfos + " any=" + any;

  return new ApplyDeletesResult(anyNewDeletes, gen, allDeleted);
}
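/*
 * Simplified, hypothetical sketch (invented names; not the Lucene implementation) of the
 * control flow above: segments and packets are both walked from newest to oldest generation.
 * A packet strictly newer than the current segment is folded into the coalesced set; a packet
 * whose delGen equals the segment's generation is that segment's private flush packet; otherwise
 * only the already-coalesced updates apply and the segment pointer advances.
 */
static void applySketch(long[] segGens /* sorted ascending */, long[] packetGens /* sorted ascending */) {
  int segIdx = segGens.length - 1;
  int pktIdx = packetGens.length - 1;
  while (segIdx >= 0) {
    long segGen = segGens[segIdx];
    if (pktIdx >= 0 && segGen < packetGens[pktIdx]) {
      pktIdx--;                 // packet is newer than this segment: fold it into the coalesced set
    } else if (pktIdx >= 0 && segGen == packetGens[pktIdx]) {
      pktIdx--; segIdx--;       // segment-private packet: apply it (plus coalesced) to just this segment
    } else {
      segIdx--;                 // only the coalesced set applies to this (older) segment
    }
  }
}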
@Test
public void testRollingUpdates() throws Exception {
  Random random = new Random(random().nextLong());
  final BaseDirectoryWrapper dir = newDirectory();
  // test checks for no unref'ed files with the IW helper method, which isn't aware of "tried to delete files"
  if (dir instanceof MockDirectoryWrapper) {
    ((MockDirectoryWrapper) dir).setEnableVirusScanner(false);
  }
  final LineFileDocs docs = new LineFileDocs(random, true);

  // provider.register(new MemoryCodec());
  if (random().nextBoolean()) {
    Codec.setDefault(TestUtil.alwaysPostingsFormat(
        new MemoryPostingsFormat(random().nextBoolean(), random.nextFloat())));
  }

  MockAnalyzer analyzer = new MockAnalyzer(random());
  analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));

  final IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(analyzer));
  final int SIZE = atLeast(20);
  int id = 0;
  IndexReader r = null;
  IndexSearcher s = null;
  final int numUpdates =
      (int) (SIZE * (2 + (TEST_NIGHTLY ? 200 * random().nextDouble() : 5 * random().nextDouble())));
  if (VERBOSE) {
    System.out.println("TEST: numUpdates=" + numUpdates);
  }
  int updateCount = 0;

  // TODO: sometimes update ids not in order...
  for (int docIter = 0; docIter < numUpdates; docIter++) {
    final Document doc = docs.nextDoc();
    final String myID = Integer.toString(id);
    if (id == SIZE - 1) {
      id = 0;
    } else {
      id++;
    }
    if (VERBOSE) {
      System.out.println(" docIter=" + docIter + " id=" + id);
    }
    ((Field) doc.getField("docid")).setStringValue(myID);

    Term idTerm = new Term("docid", myID);

    final boolean doUpdate;
    if (s != null && updateCount < SIZE) {
      TopDocs hits = s.search(new TermQuery(idTerm), 1);
      assertEquals(1, hits.totalHits);
      doUpdate = !w.tryDeleteDocument(r, hits.scoreDocs[0].doc);
      if (VERBOSE) {
        if (doUpdate) {
          System.out.println(" tryDeleteDocument failed");
        } else {
          System.out.println(" tryDeleteDocument succeeded");
        }
      }
    } else {
      doUpdate = true;
      if (VERBOSE) {
        System.out.println(" no searcher: doUpdate=true");
      }
    }

    updateCount++;

    if (doUpdate) {
      if (random().nextBoolean()) {
        w.updateDocument(idTerm, doc);
      } else {
        // It's OK to not be atomic for this test (no separate thread reopening readers):
        w.deleteDocuments(new TermQuery(idTerm));
        w.addDocument(doc);
      }
    } else {
      w.addDocument(doc);
    }

    if (docIter >= SIZE && random().nextInt(50) == 17) {
      if (r != null) {
        r.close();
      }

      final boolean applyDeletions = random().nextBoolean();

      if (VERBOSE) {
        System.out.println("TEST: reopen applyDeletions=" + applyDeletions);
      }

      r = w.getReader(applyDeletions);
      if (applyDeletions) {
        s = newSearcher(r);
      } else {
        s = null;
      }
      assertTrue("applyDeletions=" + applyDeletions + " r.numDocs()=" + r.numDocs() + " vs SIZE=" + SIZE,
          !applyDeletions || r.numDocs() == SIZE);
      updateCount = 0;
    }
  }

  if (r != null) {
    r.close();
  }

  w.commit();
  assertEquals(SIZE, w.numDocs());

  w.close();

  TestIndexWriter.assertNoUnreferencedFiles(dir, "leftover files after rolling updates");

  docs.close();

  // LUCENE-4455:
  SegmentInfos infos = SegmentInfos.readLatestCommit(dir);
  long totalBytes = 0;
  for (SegmentCommitInfo sipc : infos) {
    totalBytes += sipc.sizeInBytes();
  }
  long totalBytes2 = 0;
  for (String fileName : dir.listAll()) {
    if (IndexFileNames.CODEC_FILE_PATTERN.matcher(fileName).matches()) {
      totalBytes2 += dir.fileLength(fileName);
    }
  }
  assertEquals(totalBytes2, totalBytes);
  dir.close();
}
private void write(Directory directory) throws IOException {
  long nextGeneration = getNextPendingGeneration();
  String segmentFileName =
      IndexFileNames.fileNameFromGeneration(IndexFileNames.PENDING_SEGMENTS, "", nextGeneration);

  // Always advance the generation on write:
  generation = nextGeneration;

  IndexOutput segnOutput = null;
  boolean success = false;

  try {
    segnOutput = directory.createOutput(segmentFileName, IOContext.DEFAULT);
    CodecUtil.writeIndexHeader(segnOutput, "segments", VERSION_CURRENT,
        StringHelper.randomId(), Long.toString(nextGeneration, Character.MAX_RADIX));
    segnOutput.writeVInt(Version.LATEST.major);
    segnOutput.writeVInt(Version.LATEST.minor);
    segnOutput.writeVInt(Version.LATEST.bugfix);

    segnOutput.writeLong(version);
    segnOutput.writeInt(counter); // write counter
    segnOutput.writeInt(size());

    if (size() > 0) {
      Version minSegmentVersion = null;

      // We do a separate loop up front so we can write the minSegmentVersion before
      // any SegmentInfo; this makes it cleaner to throw IndexFormatTooOldExc at read time:
      for (SegmentCommitInfo siPerCommit : this) {
        Version segmentVersion = siPerCommit.info.getVersion();
        if (minSegmentVersion == null || segmentVersion.onOrAfter(minSegmentVersion) == false) {
          minSegmentVersion = segmentVersion;
        }
      }

      segnOutput.writeVInt(minSegmentVersion.major);
      segnOutput.writeVInt(minSegmentVersion.minor);
      segnOutput.writeVInt(minSegmentVersion.bugfix);
    }

    // write infos
    for (SegmentCommitInfo siPerCommit : this) {
      SegmentInfo si = siPerCommit.info;
      segnOutput.writeString(si.name);
      byte segmentID[] = si.getId();
      // TODO: remove this in lucene 6, we don't need to include 4.x segments in commits anymore
      if (segmentID == null) {
        segnOutput.writeByte((byte) 0);
      } else {
        if (segmentID.length != StringHelper.ID_LENGTH) {
          throw new IllegalStateException("cannot write segment: invalid id segment=" + si.name
              + "id=" + StringHelper.idToString(segmentID));
        }
        segnOutput.writeByte((byte) 1);
        segnOutput.writeBytes(segmentID, segmentID.length);
      }
      segnOutput.writeString(si.getCodec().getName());
      segnOutput.writeLong(siPerCommit.getDelGen());
      int delCount = siPerCommit.getDelCount();
      if (delCount < 0 || delCount > si.maxDoc()) {
        throw new IllegalStateException("cannot write segment: invalid maxDoc segment=" + si.name
            + " maxDoc=" + si.maxDoc() + " delCount=" + delCount);
      }
      segnOutput.writeInt(delCount);
      segnOutput.writeLong(siPerCommit.getFieldInfosGen());
      segnOutput.writeLong(siPerCommit.getDocValuesGen());
      segnOutput.writeSetOfStrings(siPerCommit.getFieldInfosFiles());
      final Map<Integer, Set<String>> dvUpdatesFiles = siPerCommit.getDocValuesUpdatesFiles();
      segnOutput.writeInt(dvUpdatesFiles.size());
      for (Entry<Integer, Set<String>> e : dvUpdatesFiles.entrySet()) {
        segnOutput.writeInt(e.getKey());
        segnOutput.writeSetOfStrings(e.getValue());
      }
    }
    segnOutput.writeMapOfStrings(userData);
    CodecUtil.writeFooter(segnOutput);
    segnOutput.close();
    directory.sync(Collections.singleton(segmentFileName));
    success = true;
  } finally {
    if (success) {
      pendingCommit = true;
    } else {
      // We hit an exception above; try to close the file
      // but suppress any exception:
      IOUtils.closeWhileHandlingException(segnOutput);
      // Try not to leave a truncated segments_N file in
      // the index:
      IOUtils.deleteFilesIgnoringExceptions(directory, segmentFileName);
    }
  }
}
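// Worked detail of the generation encoding above (example values invented): the generation is
// written into the index-header suffix as a base-36 string via
// Long.toString(nextGeneration, Character.MAX_RADIX), e.g.
//   Long.toString(10, Character.MAX_RADIX) -> "a"
//   Long.toString(42, Character.MAX_RADIX) -> "16"
// so a pending commit at generation 42 carries the suffix "16"; the matching pending_segments_N
// file name presumably uses the same base-36 generation.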
/**
 * Read a particular segmentFileName. Note that this may throw an IOException if a commit is in
 * process.
 *
 * @param directory -- directory containing the segments file
 * @param segmentFileName -- segment file to load
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException if there is a low-level IO error
 */
public static final SegmentInfos readCommit(Directory directory, String segmentFileName) throws IOException {

  long generation = generationFromSegmentsFileName(segmentFileName);
  try (ChecksumIndexInput input = directory.openChecksumInput(segmentFileName, IOContext.READ)) {
    // NOTE: as long as we want to throw indexformattooold (vs corruptindexexception), we need
    // to read the magic ourselves.
    int magic = input.readInt();
    if (magic != CodecUtil.CODEC_MAGIC) {
      throw new IndexFormatTooOldException(input, magic, CodecUtil.CODEC_MAGIC, CodecUtil.CODEC_MAGIC);
    }
    // 4.0+
    int format = CodecUtil.checkHeaderNoMagic(input, "segments", VERSION_40, VERSION_CURRENT);
    // 5.0+
    byte id[] = null;
    if (format >= VERSION_50) {
      id = new byte[StringHelper.ID_LENGTH];
      input.readBytes(id, 0, id.length);
      CodecUtil.checkIndexHeaderSuffix(input, Long.toString(generation, Character.MAX_RADIX));
    }

    SegmentInfos infos = new SegmentInfos();
    infos.id = id;
    infos.generation = generation;
    infos.lastGeneration = generation;
    if (format >= VERSION_53) {
      // TODO: in the future (7.0? sigh) we can use this to throw IndexFormatTooOldException ... or just rely on the
      // minSegmentLuceneVersion check instead:
      infos.luceneVersion = Version.fromBits(input.readVInt(), input.readVInt(), input.readVInt());
    } else {
      // else compute the min version down below in the for loop
    }

    infos.version = input.readLong();
    infos.counter = input.readInt();
    int numSegments = input.readInt();
    if (numSegments < 0) {
      throw new CorruptIndexException("invalid segment count: " + numSegments, input);
    }

    if (format >= VERSION_53) {
      if (numSegments > 0) {
        infos.minSegmentLuceneVersion = Version.fromBits(input.readVInt(), input.readVInt(), input.readVInt());
        if (infos.minSegmentLuceneVersion.onOrAfter(Version.LUCENE_4_0_0_ALPHA) == false) {
          throw new IndexFormatTooOldException(input,
              "this index contains a too-old segment (version: " + infos.minSegmentLuceneVersion + ")");
        }
      } else {
        // else leave as null: no segments
      }
    } else {
      // else we recompute it below as we visit segments; it can't be used for throwing IndexFormatTooOldExc, but consumers of
      // SegmentInfos can maybe still use it for other reasons
    }

    long totalDocs = 0;
    for (int seg = 0; seg < numSegments; seg++) {
      String segName = input.readString();
      final byte segmentID[];
      if (format >= VERSION_50) {
        byte hasID = input.readByte();
        if (hasID == 1) {
          segmentID = new byte[StringHelper.ID_LENGTH];
          input.readBytes(segmentID, 0, segmentID.length);
        } else if (hasID == 0) {
          segmentID = null; // 4.x segment, doesn't have an ID
        } else {
          throw new CorruptIndexException("invalid hasID byte, got: " + hasID, input);
        }
      } else {
        segmentID = null;
      }
      Codec codec = readCodec(input, format < VERSION_53);
      SegmentInfo info = codec.segmentInfoFormat().read(directory, segName, segmentID, IOContext.READ);
      info.setCodec(codec);
      totalDocs += info.maxDoc();
      long delGen = input.readLong();
      int delCount = input.readInt();
      if (delCount < 0 || delCount > info.maxDoc()) {
        throw new CorruptIndexException(
            "invalid deletion count: " + delCount + " vs maxDoc=" + info.maxDoc(), input);
      }
      long fieldInfosGen = -1;
      if (format >= VERSION_46) {
        fieldInfosGen = input.readLong();
      }
      long dvGen = -1;
      if (format >= VERSION_49) {
        dvGen = input.readLong();
      } else {
        dvGen = fieldInfosGen;
      }
      SegmentCommitInfo siPerCommit = new SegmentCommitInfo(info, delCount, delGen, fieldInfosGen, dvGen);
      if (format >= VERSION_46) {
        if (format < VERSION_49) {
          // Recorded per-generation files, which were buggy (see
          // LUCENE-5636). We need to read and keep them so we continue to
          // reference those files. Unfortunately it means that the files will
          // be referenced even if the fields are updated again, until the
          // segment is merged.
          final int numGensUpdatesFiles = input.readInt();
          final Map<Long, Set<String>> genUpdatesFiles;
          if (numGensUpdatesFiles == 0) {
            genUpdatesFiles = Collections.emptyMap();
          } else {
            genUpdatesFiles = new HashMap<>(numGensUpdatesFiles);
            for (int i = 0; i < numGensUpdatesFiles; i++) {
              genUpdatesFiles.put(input.readLong(), input.readStringSet());
            }
          }
          siPerCommit.setGenUpdatesFiles(genUpdatesFiles);
        } else {
          if (format >= VERSION_51) {
            siPerCommit.setFieldInfosFiles(input.readSetOfStrings());
          } else {
            siPerCommit.setFieldInfosFiles(Collections.unmodifiableSet(input.readStringSet()));
          }
          final Map<Integer, Set<String>> dvUpdateFiles;
          final int numDVFields = input.readInt();
          if (numDVFields == 0) {
            dvUpdateFiles = Collections.emptyMap();
          } else {
            Map<Integer, Set<String>> map = new HashMap<>(numDVFields);
            for (int i = 0; i < numDVFields; i++) {
              if (format >= VERSION_51) {
                map.put(input.readInt(), input.readSetOfStrings());
              } else {
                map.put(input.readInt(), Collections.unmodifiableSet(input.readStringSet()));
              }
            }
            dvUpdateFiles = Collections.unmodifiableMap(map);
          }
          siPerCommit.setDocValuesUpdatesFiles(dvUpdateFiles);
        }
      }
      infos.add(siPerCommit);

      Version segmentVersion = info.getVersion();
      if (format < VERSION_53) {
        if (infos.minSegmentLuceneVersion == null || segmentVersion.onOrAfter(infos.minSegmentLuceneVersion) == false) {
          infos.minSegmentLuceneVersion = segmentVersion;
        }
      } else if (segmentVersion.onOrAfter(infos.minSegmentLuceneVersion) == false) {
        throw new CorruptIndexException("segments file recorded minSegmentLuceneVersion=" + infos.minSegmentLuceneVersion
            + " but segment=" + info + " has older version=" + segmentVersion, input);
      }
    }

    if (format >= VERSION_51) {
      infos.userData = input.readMapOfStrings();
    } else {
      infos.userData = Collections.unmodifiableMap(input.readStringStringMap());
    }

    if (format >= VERSION_48) {
      CodecUtil.checkFooter(input);
    } else {
      final long checksumNow = input.getChecksum();
      final long checksumThen = input.readLong();
      if (checksumNow != checksumThen) {
        throw new CorruptIndexException("checksum failed (hardware problem?) : expected="
            + Long.toHexString(checksumThen) + " actual=" + Long.toHexString(checksumNow), input);
      }
      CodecUtil.checkEOF(input);
    }

    // LUCENE-6299: check we are in bounds
    if (totalDocs > IndexWriter.getActualMaxDocs()) {
      throw new CorruptIndexException("Too many documents: an index cannot exceed "
          + IndexWriter.getActualMaxDocs() + " but readers have total maxDoc=" + totalDocs, input);
    }

    return infos;
  }
}
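/*
 * Hypothetical usage sketch (method name, index path and segments file name invented): reading
 * one specific commit point and printing per-segment delete stats. FSDirectory.open, Paths.get
 * and the SegmentCommitInfo accessors used here are assumed to come from the same Lucene version
 * as readCommit above.
 */
static void dumpCommit() throws IOException {
  try (Directory dir = FSDirectory.open(Paths.get("/path/to/index"))) {
    SegmentInfos infos = SegmentInfos.readCommit(dir, "segments_2");
    for (SegmentCommitInfo sci : infos) {
      System.out.println(sci.info.name + " delCount=" + sci.getDelCount() + " delGen=" + sci.getDelGen());
    }
  }
}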
/** Expert: scores one merge; subclasses can override. */
protected MergeScore score(List<SegmentCommitInfo> candidate, boolean hitTooLarge, long mergingBytes, IndexWriter writer) throws IOException {
  long totBeforeMergeBytes = 0;
  long totAfterMergeBytes = 0;
  long totAfterMergeBytesFloored = 0;
  for (SegmentCommitInfo info : candidate) {
    final long segBytes = size(info, writer);
    totAfterMergeBytes += segBytes;
    totAfterMergeBytesFloored += floorSize(segBytes);
    totBeforeMergeBytes += info.sizeInBytes();
  }

  // Roughly measure "skew" of the merge, i.e. how
  // "balanced" the merge is (whether the segments are
  // about the same size), which can range from
  // 1.0/numSegsBeingMerged (good) to 1.0 (poor). Heavily
  // lopsided merges (skew near 1.0) are no good; they mean
  // O(N^2) merge cost over time:
  final double skew;
  if (hitTooLarge) {
    // Pretend the merge has perfect skew; skew doesn't
    // matter in this case because this merge will not
    // "cascade" and so it cannot lead to N^2 merge cost
    // over time:
    skew = 1.0 / maxMergeAtOnce;
  } else {
    skew = ((double) floorSize(size(candidate.get(0), writer))) / totAfterMergeBytesFloored;
  }

  // Strongly favor merges with less skew (smaller
  // mergeScore is better):
  double mergeScore = skew;

  // Gently favor smaller merges over bigger ones. We
  // don't want to make this exponent too large else we
  // can end up doing poor merges of small segments in
  // order to avoid the large merges:
  mergeScore *= Math.pow(totAfterMergeBytes, 0.05);

  // Strongly favor merges that reclaim deletes:
  final double nonDelRatio = ((double) totAfterMergeBytes) / totBeforeMergeBytes;
  mergeScore *= Math.pow(nonDelRatio, reclaimDeletesWeight);

  final double finalMergeScore = mergeScore;

  return new MergeScore() {

    @Override
    public double getScore() {
      return finalMergeScore;
    }

    @Override
    public String getExplanation() {
      return "skew=" + String.format(Locale.ROOT, "%.3f", skew)
          + " nonDelRatio=" + String.format(Locale.ROOT, "%.3f", nonDelRatio);
    }
  };
}
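// Worked example of the scoring above (all numbers invented; assumes the candidate list is
// sorted largest-first, every segment is above the floor size, and reclaimDeletesWeight = 2.0):
//   pro-rated segment sizes: 50 MB, 30 MB, 20 MB; on-disk size before merge: 120 MB
//   totAfterMergeBytes = 100 MB,  totBeforeMergeBytes = 120 MB
//   skew         = 50 MB / 100 MB            = 0.5    (largest segment dominates)
//   size factor  = (100e6)^0.05              ~= 2.51
//   nonDelRatio  = 100 / 120                  = 0.833
//   mergeScore   ~= 0.5 * 2.51 * 0.833^2     ~= 0.87
// Lower scores are better, so a more balanced candidate or one reclaiming more deletes would win.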