/**
 * We assume that the initial indexing has been done and a set of reference objects has been found
 * and indexed in the separate directory. However, further documents were added and they now need
 * to get a ranked list of reference objects. So we (i) get all these new documents missing the
 * field "ro-order" and (ii) add this field.
 *
 * @param indexPath the index to update
 * @throws IOException
 */
public void updateIndex(String indexPath) throws IOException {
  IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
  int numDocs = reader.numDocs();
  boolean hasDeletions = reader.hasDeletions();
  int countUpdated = 0;

  IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro")));
  ImageSearcher searcher =
      new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName);
  Map<String, Analyzer> perField = new HashMap<String, Analyzer>(1);
  perField.put("ro-order", new WhitespaceAnalyzer(LuceneUtils.LUCENE_VERSION));
  PerFieldAnalyzerWrapper aWrapper =
      new PerFieldAnalyzerWrapper(new SimpleAnalyzer(LuceneUtils.LUCENE_VERSION), perField);

  IndexWriter iw =
      new IndexWriter(
          FSDirectory.open(new File(indexPath)),
          new IndexWriterConfig(LuceneUtils.LUCENE_VERSION, aWrapper)
              .setOpenMode(IndexWriterConfig.OpenMode.CREATE));
  StringBuilder sb = new StringBuilder(256);
  // Needed to check whether the document is deleted.
  Bits liveDocs = MultiFields.getLiveDocs(reader);
  for (int i = 0; i < numDocs; i++) {
    if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.

    Document document = reader.document(i);
    if (document.getField("ro-order") == null) { // if the field is not here we create it.
      ImageSearchHits hits = searcher.search(document, readerRo);
      sb.delete(0, sb.length());
      for (int j = 0; j < numReferenceObjectsUsed; j++) {
        sb.append(hits.doc(j).getValues("ro-id")[0]);
        sb.append(' ');
      }
      // System.out.println(sb.toString());
      document.add(new TextField("ro-order", sb.toString(), Field.Store.YES));
      iw.updateDocument(
          new Term(
              DocumentBuilder.FIELD_NAME_IDENTIFIER,
              document.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]),
          document);
      countUpdated++;
    }

    // progress report
    progress.setNumDocsProcessed(progress.getNumDocsProcessed() + 1);

    // debug: System.out.println("countUpdated = " + countUpdated);
  }
  iw.commit();
  iw.close();
}
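/*
 * Hedged usage sketch (not part of the original source): how createIndex and updateIndex are
 * meant to be combined according to the javadoc above. The method name, the placeholder index
 * path, and the idea of a small driver method are assumptions; the two calls themselves are the
 * methods defined in this file.
 */
public void exampleIncrementalUpdate() throws IOException {
  String indexPath = "image-index"; // placeholder path, adjust to your setup

  // One-time setup: picks reference objects into "<indexPath>-ro" and writes the
  // "ro-order" field for every document currently in the index.
  createIndex(indexPath);

  // ... more documents are added to indexPath by the regular indexing pipeline ...

  // Incremental pass: only documents still missing "ro-order" get the field added.
  updateIndex(indexPath);
}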
protected DirectoryReader indexDocsWithLucene(TestDoc[] testDocs) throws IOException {
  Map<String, Analyzer> mapping = new HashMap<String, Analyzer>();
  for (TestFieldSetting field : testDocs[0].fieldSettings) {
    if (field.storedPayloads) {
      mapping.put(
          field.name,
          new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
              Tokenizer tokenizer = new StandardTokenizer(Version.CURRENT.luceneVersion, reader);
              TokenFilter filter = new LowerCaseFilter(Version.CURRENT.luceneVersion, tokenizer);
              filter = new TypeAsPayloadTokenFilter(filter);
              return new TokenStreamComponents(tokenizer, filter);
            }
          });
    }
  }
  PerFieldAnalyzerWrapper wrapper =
      new PerFieldAnalyzerWrapper(
          new StandardAnalyzer(Version.CURRENT.luceneVersion, CharArraySet.EMPTY_SET), mapping);

  Directory dir = new RAMDirectory();
  IndexWriterConfig conf = new IndexWriterConfig(Version.CURRENT.luceneVersion, wrapper);
  conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
  IndexWriter writer = new IndexWriter(dir, conf);

  for (TestDoc doc : testDocs) {
    Document d = new Document();
    d.add(new Field("id", doc.id, StringField.TYPE_STORED));
    for (int i = 0; i < doc.fieldContent.length; i++) {
      FieldType type = new FieldType(TextField.TYPE_STORED);
      TestFieldSetting fieldSetting = doc.fieldSettings[i];
      type.setStoreTermVectorOffsets(fieldSetting.storedOffset);
      type.setStoreTermVectorPayloads(fieldSetting.storedPayloads);
      type.setStoreTermVectorPositions(
          fieldSetting.storedPositions || fieldSetting.storedPayloads || fieldSetting.storedOffset);
      type.setStoreTermVectors(true);
      type.freeze();
      d.add(new Field(fieldSetting.name, doc.fieldContent[i], type));
    }
    writer.updateDocument(new Term("id", doc.id), d);
    writer.commit();
  }
  writer.close();

  return DirectoryReader.open(dir);
}
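/*
 * Hedged sketch (not part of the original source): one way the DirectoryReader returned by
 * indexDocsWithLucene(...) could be consumed to check that TypeAsPayloadTokenFilter actually
 * stored payloads in the term vectors. The doc id 0, the field name parameter, and the printing
 * are placeholders; the term-vector calls are the standard Lucene 4.x API.
 */
protected void printStoredPayloads(DirectoryReader reader, String fieldName) throws IOException {
  Terms termVector = reader.getTermVector(0, fieldName);
  if (termVector == null) {
    return; // no term vector stored for this field
  }
  TermsEnum termsEnum = termVector.iterator(null);
  BytesRef term;
  while ((term = termsEnum.next()) != null) {
    DocsAndPositionsEnum positions = termsEnum.docsAndPositions(null, null);
    positions.nextDoc(); // a term vector contains exactly one (virtual) document
    for (int i = 0; i < positions.freq(); i++) {
      positions.nextPosition();
      BytesRef payload = positions.getPayload(); // token type written by TypeAsPayloadTokenFilter
      System.out.println(
          term.utf8ToString() + " -> " + (payload == null ? "no payload" : payload.utf8ToString()));
    }
  }
}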
public void testTermDocsEnum() throws Exception {
  Directory dir = newDirectory();
  IndexWriter w =
      new IndexWriter(
          dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
  Document d = new Document();
  d.add(newStringField("f", "j", Field.Store.NO));
  w.addDocument(d);
  w.commit();
  w.addDocument(d);
  IndexReader r = w.getReader();
  w.close();

  DocsEnum de = MultiFields.getTermDocsEnum(r, null, "f", new BytesRef("j"));
  assertEquals(0, de.nextDoc());
  assertEquals(1, de.nextDoc());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, de.nextDoc());

  r.close();
  dir.close();
}
public void testSeparateEnums() throws Exception {
  Directory dir = newDirectory();
  IndexWriter w =
      new IndexWriter(
          dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
  Document d = new Document();
  d.add(newStringField("f", "j", Field.Store.NO));
  w.addDocument(d);
  w.commit();
  w.addDocument(d);
  IndexReader r = w.getReader();
  w.close();

  DocsEnum d1 = _TestUtil.docs(random(), r, "f", new BytesRef("j"), null, null, 0);
  DocsEnum d2 = _TestUtil.docs(random(), r, "f", new BytesRef("j"), null, null, 0);
  assertEquals(0, d1.nextDoc());
  assertEquals(0, d2.nextDoc());

  r.close();
  dir.close();
}
public void testRandom() throws Exception {
  int num = atLeast(2);
  for (int iter = 0; iter < num; iter++) {
    if (VERBOSE) {
      System.out.println("TEST: iter=" + iter);
    }

    Directory dir = newDirectory();
    IndexWriter w =
        new IndexWriter(
            dir,
            newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
                .setMergePolicy(NoMergePolicy.COMPOUND_FILES));
    _TestUtil.keepFullyDeletedSegments(w);

    Map<BytesRef, List<Integer>> docs = new HashMap<BytesRef, List<Integer>>();
    Set<Integer> deleted = new HashSet<Integer>();
    List<BytesRef> terms = new ArrayList<BytesRef>();

    int numDocs = _TestUtil.nextInt(random(), 1, 100 * RANDOM_MULTIPLIER);
    Document doc = new Document();
    Field f = newStringField("field", "", Field.Store.NO);
    doc.add(f);
    Field id = newStringField("id", "", Field.Store.NO);
    doc.add(id);

    boolean onlyUniqueTerms = random().nextBoolean();
    if (VERBOSE) {
      System.out.println("TEST: onlyUniqueTerms=" + onlyUniqueTerms + " numDocs=" + numDocs);
    }
    Set<BytesRef> uniqueTerms = new HashSet<BytesRef>();
    for (int i = 0; i < numDocs; i++) {
      if (!onlyUniqueTerms && random().nextBoolean() && terms.size() > 0) {
        // re-use existing term
        BytesRef term = terms.get(random().nextInt(terms.size()));
        docs.get(term).add(i);
        f.setStringValue(term.utf8ToString());
      } else {
        String s = _TestUtil.randomUnicodeString(random(), 10);
        BytesRef term = new BytesRef(s);
        if (!docs.containsKey(term)) {
          docs.put(term, new ArrayList<Integer>());
        }
        docs.get(term).add(i);
        terms.add(term);
        uniqueTerms.add(term);
        f.setStringValue(s);
      }
      id.setStringValue("" + i);
      w.addDocument(doc);
      if (random().nextInt(4) == 1) {
        w.commit();
      }
      if (i > 0 && random().nextInt(20) == 1) {
        int delID = random().nextInt(i);
        deleted.add(delID);
        w.deleteDocuments(new Term("id", "" + delID));
        if (VERBOSE) {
          System.out.println("TEST: delete " + delID);
        }
      }
    }

    if (VERBOSE) {
      List<BytesRef> termsList = new ArrayList<BytesRef>(uniqueTerms);
      Collections.sort(termsList, BytesRef.getUTF8SortedAsUTF16Comparator());
      System.out.println("TEST: terms in UTF16 order:");
      for (BytesRef b : termsList) {
        System.out.println(" " + UnicodeUtil.toHexString(b.utf8ToString()) + " " + b);
        for (int docID : docs.get(b)) {
          if (deleted.contains(docID)) {
            System.out.println(" " + docID + " (deleted)");
          } else {
            System.out.println(" " + docID);
          }
        }
      }
    }

    IndexReader reader = w.getReader();
    w.close();
    if (VERBOSE) {
      System.out.println("TEST: reader=" + reader);
    }

    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (int delDoc : deleted) {
      assertFalse(liveDocs.get(delDoc));
    }

    for (int i = 0; i < 100; i++) {
      BytesRef term = terms.get(random().nextInt(terms.size()));
      if (VERBOSE) {
        System.out.println(
            "TEST: seek term=" + UnicodeUtil.toHexString(term.utf8ToString()) + " " + term);
      }

      DocsEnum docsEnum = _TestUtil.docs(random(), reader, "field", term, liveDocs, null, 0);
      assertNotNull(docsEnum);

      for (int docID : docs.get(term)) {
        if (!deleted.contains(docID)) {
          assertEquals(docID, docsEnum.nextDoc());
        }
      }
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, docsEnum.nextDoc());
    }

    reader.close();
    dir.close();
  }
}
/**
 * Assert that the {@code scoreType} operates as expected and parents are found in the expected
 * order.
 *
 * <p>This will use the test index's parent/child types to create parents with multiple children.
 * Each child will have a randomly generated score stored in {@link #CHILD_SCORE_NAME}, which is
 * used to score based on the {@code scoreType} by using a {@link MockScorer} to determine the
 * expected scores.
 *
 * @param scoreType The score type to use within the query to score parents relative to their
 *     children.
 * @throws IOException if any unexpected error occurs
 */
private void assertScoreType(ScoreType scoreType) throws IOException {
  SearchContext context = SearchContext.current();
  Directory directory = newDirectory();
  IndexWriter writer =
      new IndexWriter(directory, newIndexWriterConfig(new MockAnalyzer(random())));

  // calculates the expected score per parent
  MockScorer scorer = new MockScorer(scoreType);
  scorer.scores = new FloatArrayList(10);

  // number of parents to generate
  int parentDocs = scaledRandomIntBetween(2, 10);
  // unique child ID
  int childDocId = 0;

  // Parent ID to expected score
  Map<String, Float> parentScores = new TreeMap<>();

  // Add a few random parents to ensure that the children's score is appropriately taken into
  // account
  for (int parentDocId = 0; parentDocId < parentDocs; ++parentDocId) {
    String parent = Integer.toString(parentDocId);

    // Create the parent
    Document parentDocument = new Document();
    parentDocument.add(
        new StringField(UidFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.YES));
    parentDocument.add(new StringField(IdFieldMapper.NAME, parent, Field.Store.YES));
    parentDocument.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO));

    // add the parent to the index
    writer.addDocument(parentDocument);

    int numChildDocs = scaledRandomIntBetween(1, 10);

    // forget any parent's previous scores
    scorer.scores.clear();

    // associate children with the parent
    for (int i = 0; i < numChildDocs; ++i) {
      int childScore = random().nextInt(128);

      Document childDocument = new Document();
      childDocument.add(
          new StringField(
              UidFieldMapper.NAME,
              Uid.createUid("child", Integer.toString(childDocId++)),
              Field.Store.NO));
      childDocument.add(new StringField(TypeFieldMapper.NAME, "child", Field.Store.NO));
      // parent association:
      childDocument.add(
          new StringField(
              ParentFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.NO));
      childDocument.add(new DoubleField(CHILD_SCORE_NAME, childScore, Field.Store.NO));

      // remember the score to be calculated
      scorer.scores.add(childScore);

      // add the associated child to the index
      writer.addDocument(childDocument);
    }

    // this is the score that should be returned for this parent
    parentScores.put(parent, scorer.score());
  }

  writer.commit();

  IndexReader reader = DirectoryReader.open(writer, true);
  IndexSearcher searcher = new IndexSearcher(reader);

  // setup to read the parent/child map
  Engine.Searcher engineSearcher =
      new Engine.Searcher(ChildrenQueryTests.class.getSimpleName(), searcher);
  ((TestSearchContext) context).setSearcher(new ContextIndexSearcher(context, engineSearcher));

  // child query that returns the score as the value of "childScore" for each child document,
  // with the parent's score determined by the score type
  QueryBuilder childQueryBuilder =
      functionScoreQuery(typeFilter("child"))
          .add(new FieldValueFactorFunctionBuilder(CHILD_SCORE_NAME));
  QueryBuilder queryBuilder =
      hasChildQuery("child", childQueryBuilder)
          .scoreType(scoreType.name().toLowerCase(Locale.ENGLISH))
          .setShortCircuitCutoff(parentDocs);

  // Perform the search for the documents using the selected score type
  TopDocs docs = searcher.search(parseQuery(queryBuilder), parentDocs);
  assertThat("Expected all parents", docs.totalHits, is(parentDocs));

  // score should be descending (just a sanity check)
  float topScore = docs.scoreDocs[0].score;

  // ensure each score is returned as expected
  for (int i = 0; i < parentDocs; ++i) {
    ScoreDoc scoreDoc = docs.scoreDocs[i];
    // get the ID from the document to get its expected score;
    // remove it so we cannot double-count it
    float score = parentScores.remove(reader.document(scoreDoc.doc).get(IdFieldMapper.NAME));

    // expect exact match
    assertThat("Unexpected score", scoreDoc.score, is(score));
    assertThat("Not descending", score, lessThanOrEqualTo(topScore));

    // it had better keep descending
    topScore = score;
  }

  reader.close();
  writer.close();
  directory.close();
}
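/*
 * Hedged sketch (not part of the original source): plausible entry points that exercise
 * assertScoreType(...) for the individual score modes. The exact set of ScoreType constants
 * (MAX/SUM/AVG) is an assumption and should be checked against the ScoreType enum in use.
 */
@Test
public void testMaxScoreType() throws IOException {
  assertScoreType(ScoreType.MAX);
}

@Test
public void testSumScoreType() throws IOException {
  assertScoreType(ScoreType.SUM);
}

@Test
public void testAvgScoreType() throws IOException {
  assertScoreType(ScoreType.AVG);
}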
/**
 * Creates a set of reference objects and stores it in a new index (name "<indexPath>-ro"). Then
 * creates ordered lists of reference object positions for each data item in the index with given
 * feature. Finally a new index (name "<indexPath>-ms") is created where all the original
 * documents as well as the new data are stored.
 *
 * @param indexPath the path to the original index
 * @throws IOException
 */
public void createIndex(String indexPath) throws IOException {
  IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
  int numDocs = reader.numDocs();

  if (numDocs < numReferenceObjects) {
    throw new UnsupportedOperationException("Too few documents in index.");
  }

  // progress report
  progress.setNumDocsAll(numDocs);
  progress.setCurrentState(State.RoSelection);

  boolean hasDeletions = reader.hasDeletions();

  // init reference objects:
  IndexWriter iw = LuceneUtils.createIndexWriter(indexPath + "-ro", true);
  HashSet<Integer> referenceObjsIds = new HashSet<Integer>(numReferenceObjects);

  double numDocsDouble = (double) numDocs;
  while (referenceObjsIds.size() < numReferenceObjects) {
    referenceObjsIds.add((int) (numDocsDouble * Math.random()));
  }
  int count = 0;

  if (hasDeletions) {
    System.err.println(
        "WARNING: There are deleted docs in your index. You should "
            + "optimize your index before using this method.");
  }

  // progress report
  progress.setCurrentState(State.RoIndexing);

  // find them in the index and put them into a separate index:
  for (int i : referenceObjsIds) {
    count++;
    Document document = reader.document(i);
    document.add(new Field("ro-id", count + "", StringField.TYPE_STORED));
    iw.addDocument(document);
  }
  iw.commit();
  iw.close();

  // progress report
  progress.setCurrentState(State.Indexing);

  // now find the reference objects for each entry ;)
  IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro")));
  ImageSearcher searcher =
      new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName);
  Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
  analyzerPerField.put("ro-order", new WhitespaceAnalyzer(LuceneUtils.LUCENE_VERSION));
  PerFieldAnalyzerWrapper aWrapper =
      new PerFieldAnalyzerWrapper(
          new SimpleAnalyzer(LuceneUtils.LUCENE_VERSION), analyzerPerField);

  iw =
      new IndexWriter(
          FSDirectory.open(new File(indexPath)),
          new IndexWriterConfig(LuceneUtils.LUCENE_VERSION, aWrapper)
              .setOpenMode(IndexWriterConfig.OpenMode.CREATE));
  StringBuilder sb = new StringBuilder(256);
  // Needed to check whether the document is deleted.
  Bits liveDocs = MultiFields.getLiveDocs(reader);
  for (int i = 0; i < numDocs; i++) {
    if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.

    Document document = reader.document(i);
    ImageSearchHits hits = searcher.search(document, readerRo);
    sb.delete(0, sb.length());
    for (int j = 0; j < numReferenceObjectsUsed; j++) {
      sb.append(hits.doc(j).getValues("ro-id")[0]);
      sb.append(' ');
    }
    // System.out.println(sb.toString());
    document.add(new TextField("ro-order", sb.toString(), Field.Store.YES));
    iw.updateDocument(
        new Term(
            DocumentBuilder.FIELD_NAME_IDENTIFIER,
            document.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]),
        document);

    // progress report
    progress.setNumDocsProcessed(progress.getNumDocsProcessed() + 1);
  }
  iw.commit();
  iw.close();

  // progress report
  progress.setCurrentState(State.Idle);
}
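/*
 * Hedged sketch (not part of the original source): one plausible way to query the index built by
 * createIndex(...). The idea is to compute the query document's own reference-object ordering
 * against the "-ro" index, exactly as done at indexing time, and then match it against the stored
 * "ro-order" field as a whitespace-tokenized text query, so documents sharing many of the query's
 * top reference objects score higher. The method name, the result count of 10, and the use of the
 * classic QueryParser are assumptions, not part of the original class.
 */
public TopDocs searchByReferenceObjectOrder(String indexPath, Document queryDocument)
    throws IOException, ParseException {
  // rank the reference objects for the query document, as done at indexing time
  IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro")));
  ImageSearcher searcher =
      new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName);
  ImageSearchHits hits = searcher.search(queryDocument, readerRo);
  StringBuilder sb = new StringBuilder(256);
  for (int j = 0; j < numReferenceObjectsUsed; j++) {
    sb.append(hits.doc(j).getValues("ro-id")[0]);
    sb.append(' ');
  }
  readerRo.close();

  // turn the ordering into an OR query over the "ro-order" field
  QueryParser parser =
      new QueryParser(
          LuceneUtils.LUCENE_VERSION, "ro-order", new WhitespaceAnalyzer(LuceneUtils.LUCENE_VERSION));
  Query query = parser.parse(sb.toString().trim());
  IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
  IndexSearcher indexSearcher = new IndexSearcher(reader);
  return indexSearcher.search(query, 10); // caller is responsible for closing the reader
}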
@Test
public void testRecoveryDiff() throws IOException, InterruptedException {
  int numDocs = 2 + random().nextInt(100);
  List<Document> docs = new ArrayList<>();
  for (int i = 0; i < numDocs; i++) {
    Document doc = new Document();
    doc.add(
        new StringField("id", "" + i, random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
    doc.add(
        new TextField(
            "body",
            TestUtil.randomRealisticUnicodeString(random()),
            random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
    doc.add(
        new SortedDocValuesField(
            "dv", new BytesRef(TestUtil.randomRealisticUnicodeString(random()))));
    docs.add(doc);
  }
  long seed = random().nextLong();
  Store.MetadataSnapshot first;
  {
    Random random = new Random(seed);
    IndexWriterConfig iwc =
        new IndexWriterConfig(new MockAnalyzer(random)).setCodec(actualDefaultCodec());
    iwc.setMergePolicy(NoMergePolicy.INSTANCE);
    iwc.setUseCompoundFile(random.nextBoolean());
    iwc.setMaxThreadStates(1);
    final ShardId shardId = new ShardId(new Index("index"), 1);
    DirectoryService directoryService = new LuceneManagedDirectoryService(random);
    Store store =
        new Store(
            shardId,
            ImmutableSettings.EMPTY,
            directoryService,
            randomDistributor(random, directoryService),
            new DummyShardLock(shardId));
    IndexWriter writer = new IndexWriter(store.directory(), iwc);
    final boolean lotsOfSegments = rarely(random);
    for (Document d : docs) {
      writer.addDocument(d);
      if (lotsOfSegments && random.nextBoolean()) {
        writer.commit();
      } else if (rarely(random)) {
        writer.commit();
      }
    }
    writer.commit();
    writer.close();
    first = store.getMetadata();
    assertDeleteContent(store, directoryService);
    store.close();
  }
  long time = new Date().getTime();
  while (time == new Date().getTime()) {
    Thread.sleep(10); // bump the time
  }
  Store.MetadataSnapshot second;
  Store store;
  {
    Random random = new Random(seed);
    IndexWriterConfig iwc =
        new IndexWriterConfig(new MockAnalyzer(random)).setCodec(actualDefaultCodec());
    iwc.setMergePolicy(NoMergePolicy.INSTANCE);
    iwc.setUseCompoundFile(random.nextBoolean());
    iwc.setMaxThreadStates(1);
    final ShardId shardId = new ShardId(new Index("index"), 1);
    DirectoryService directoryService = new LuceneManagedDirectoryService(random);
    store =
        new Store(
            shardId,
            ImmutableSettings.EMPTY,
            directoryService,
            randomDistributor(random, directoryService),
            new DummyShardLock(shardId));
    IndexWriter writer = new IndexWriter(store.directory(), iwc);
    final boolean lotsOfSegments = rarely(random);
    for (Document d : docs) {
      writer.addDocument(d);
      if (lotsOfSegments && random.nextBoolean()) {
        writer.commit();
      } else if (rarely(random)) {
        writer.commit();
      }
    }
    writer.commit();
    writer.close();
    second = store.getMetadata();
  }
  Store.RecoveryDiff diff = first.recoveryDiff(second);
  assertThat(first.size(), equalTo(second.size()));
  for (StoreFileMetaData md : first) {
    assertThat(second.get(md.name()), notNullValue());
    // si files are different - containing timestamps etc
    assertThat(second.get(md.name()).isSame(md), equalTo(false));
  }
  assertThat(diff.different.size(), equalTo(first.size()));
  // in lucene 5 nothing is identical - we use random ids in file headers
  assertThat(diff.identical.size(), equalTo(0));
  assertThat(diff.missing, empty());

  // check the self diff
  Store.RecoveryDiff selfDiff = first.recoveryDiff(first);
  assertThat(selfDiff.identical.size(), equalTo(first.size()));
  assertThat(selfDiff.different, empty());
  assertThat(selfDiff.missing, empty());

  // lets add some deletes
  Random random = new Random(seed);
  IndexWriterConfig iwc =
      new IndexWriterConfig(new MockAnalyzer(random)).setCodec(actualDefaultCodec());
  iwc.setMergePolicy(NoMergePolicy.INSTANCE);
  iwc.setUseCompoundFile(random.nextBoolean());
  iwc.setMaxThreadStates(1);
  iwc.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
  IndexWriter writer = new IndexWriter(store.directory(), iwc);
  writer.deleteDocuments(new Term("id", Integer.toString(random().nextInt(numDocs))));
  writer.commit();
  writer.close();
  Store.MetadataSnapshot metadata = store.getMetadata();
  StoreFileMetaData delFile = null;
  for (StoreFileMetaData md : metadata) {
    if (md.name().endsWith(".liv")) {
      delFile = md;
      break;
    }
  }
  Store.RecoveryDiff afterDeleteDiff = metadata.recoveryDiff(second);
  if (delFile != null) {
    assertThat(afterDeleteDiff.identical.size(), equalTo(metadata.size() - 2)); // segments_N + del file
    assertThat(afterDeleteDiff.different.size(), equalTo(0));
    assertThat(afterDeleteDiff.missing.size(), equalTo(2));
  } else {
    // an entire segment must be missing (single doc segment got dropped)
    assertThat(afterDeleteDiff.identical.size(), greaterThan(0));
    assertThat(afterDeleteDiff.different.size(), equalTo(0));
    assertThat(afterDeleteDiff.missing.size(), equalTo(1)); // the commit file is different
  }

  // check the self diff
  selfDiff = metadata.recoveryDiff(metadata);
  assertThat(selfDiff.identical.size(), equalTo(metadata.size()));
  assertThat(selfDiff.different, empty());
  assertThat(selfDiff.missing, empty());

  // add a new commit
  iwc = new IndexWriterConfig(new MockAnalyzer(random)).setCodec(actualDefaultCodec());
  iwc.setMergePolicy(NoMergePolicy.INSTANCE);
  iwc.setUseCompoundFile(true); // force CFS - easier to test here since we know it will add 3 files
  iwc.setMaxThreadStates(1);
  iwc.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
  writer = new IndexWriter(store.directory(), iwc);
  writer.addDocument(docs.get(0));
  writer.close();

  Store.MetadataSnapshot newCommitMetaData = store.getMetadata();
  Store.RecoveryDiff newCommitDiff = newCommitMetaData.recoveryDiff(metadata);
  if (delFile != null) {
    // segments_N, del file, cfs, cfe, si for the new segment
    assertThat(newCommitDiff.identical.size(), equalTo(newCommitMetaData.size() - 5));
    assertThat(newCommitDiff.different.size(), equalTo(1)); // the del file must be different
    assertThat(newCommitDiff.different.get(0).name(), endsWith(".liv"));
    assertThat(newCommitDiff.missing.size(), equalTo(4)); // segments_N, cfs, cfe, si for the new segment
  } else {
    // segments_N, cfs, cfe, si for the new segment
    assertThat(newCommitDiff.identical.size(), equalTo(newCommitMetaData.size() - 4));
    assertThat(newCommitDiff.different.size(), equalTo(0));
    // an entire segment must be missing (single doc segment got dropped)
    // plus the commit is different
    assertThat(newCommitDiff.missing.size(), equalTo(4));
  }

  store.deleteContent();
  IOUtils.close(store);
}
@Test
public void testMixedChecksums() throws IOException {
  final ShardId shardId = new ShardId(new Index("index"), 1);
  DirectoryService directoryService = new LuceneManagedDirectoryService(random());
  Store store =
      new Store(
          shardId,
          ImmutableSettings.EMPTY,
          directoryService,
          randomDistributor(directoryService),
          new DummyShardLock(shardId));
  // this time random codec....
  IndexWriter writer =
      new IndexWriter(
          store.directory(),
          newIndexWriterConfig(random(), new MockAnalyzer(random()))
              .setCodec(actualDefaultCodec()));
  int docs = 1 + random().nextInt(100);
  for (int i = 0; i < docs; i++) {
    Document doc = new Document();
    doc.add(
        new TextField("id", "" + i, random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
    doc.add(
        new TextField(
            "body",
            TestUtil.randomRealisticUnicodeString(random()),
            random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
    doc.add(
        new SortedDocValuesField(
            "dv", new BytesRef(TestUtil.randomRealisticUnicodeString(random()))));
    writer.addDocument(doc);
  }
  if (random().nextBoolean()) {
    for (int i = 0; i < docs; i++) {
      if (random().nextBoolean()) {
        Document doc = new Document();
        doc.add(
            new TextField(
                "id", "" + i, random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
        doc.add(
            new TextField(
                "body",
                TestUtil.randomRealisticUnicodeString(random()),
                random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
        writer.updateDocument(new Term("id", "" + i), doc);
      }
    }
  }
  if (random().nextBoolean()) {
    DirectoryReader.open(writer, random().nextBoolean()).close(); // flush
  }
  Store.MetadataSnapshot metadata;
  // check before we committed
  try {
    store.getMetadata();
    fail("no index present - expected exception");
  } catch (IndexNotFoundException ex) {
    // expected
  }
  assertThat(store.getMetadataOrEmpty(), is(Store.MetadataSnapshot.EMPTY)); // nothing committed
  writer.commit();
  writer.close();
  Store.LegacyChecksums checksums = new Store.LegacyChecksums();
  metadata = store.getMetadata();
  assertThat(metadata.asMap().isEmpty(), is(false));
  for (StoreFileMetaData meta : metadata) {
    try (IndexInput input = store.directory().openInput(meta.name(), IOContext.DEFAULT)) {
      if (meta.checksum() == null) {
        String checksum = null;
        try {
          CodecUtil.retrieveChecksum(input);
          fail("expected a corrupt index - postings format has no checksums");
        } catch (CorruptIndexException
            | IndexFormatTooOldException
            | IndexFormatTooNewException ex) {
          try (ChecksumIndexInput checksumIndexInput =
              store.directory().openChecksumInput(meta.name(), IOContext.DEFAULT)) {
            checksumIndexInput.seek(meta.length());
            checksum = Store.digestToString(checksumIndexInput.getChecksum());
          }
          // fine - it's a postings format without checksums
          checksums.add(new StoreFileMetaData(meta.name(), meta.length(), checksum, null));
        }
      } else {
        String checksum = Store.digestToString(CodecUtil.retrieveChecksum(input));
        assertThat(
            "File: " + meta.name() + " has a different checksum",
            meta.checksum(),
            equalTo(checksum));
        assertThat(meta.hasLegacyChecksum(), equalTo(false));
        assertThat(meta.writtenBy(), equalTo(Version.LATEST));
      }
    }
  }
  assertConsistent(store, metadata);
  checksums.write(store);
  metadata = store.getMetadata();
  assertThat(metadata.asMap().isEmpty(), is(false));
  for (StoreFileMetaData meta : metadata) {
    assertThat(
        "file: " + meta.name() + " has a null checksum", meta.checksum(), not(nullValue()));
    if (meta.hasLegacyChecksum()) {
      try (ChecksumIndexInput checksumIndexInput =
          store.directory().openChecksumInput(meta.name(), IOContext.DEFAULT)) {
        checksumIndexInput.seek(meta.length());
        assertThat(
            meta.checksum(), equalTo(Store.digestToString(checksumIndexInput.getChecksum())));
      }
    } else {
      try (IndexInput input = store.directory().openInput(meta.name(), IOContext.DEFAULT)) {
        String checksum = Store.digestToString(CodecUtil.retrieveChecksum(input));
        assertThat(
            "File: " + meta.name() + " has a different checksum",
            meta.checksum(),
            equalTo(checksum));
        assertThat(meta.hasLegacyChecksum(), equalTo(false));
        assertThat(meta.writtenBy(), equalTo(Version.LATEST));
      }
    }
  }
  assertConsistent(store, metadata);
  TestUtil.checkIndex(store.directory());
  assertDeleteContent(store, directoryService);
  IOUtils.close(store);
}
@Test
public void testNewChecksums() throws IOException {
  final ShardId shardId = new ShardId(new Index("index"), 1);
  DirectoryService directoryService = new LuceneManagedDirectoryService(random());
  Store store =
      new Store(
          shardId,
          ImmutableSettings.EMPTY,
          directoryService,
          randomDistributor(directoryService),
          new DummyShardLock(shardId));
  // set default codec - all segments need checksums
  IndexWriter writer =
      new IndexWriter(
          store.directory(),
          newIndexWriterConfig(random(), new MockAnalyzer(random()))
              .setCodec(actualDefaultCodec()));
  int docs = 1 + random().nextInt(100);
  for (int i = 0; i < docs; i++) {
    Document doc = new Document();
    doc.add(
        new TextField("id", "" + i, random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
    doc.add(
        new TextField(
            "body",
            TestUtil.randomRealisticUnicodeString(random()),
            random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
    doc.add(
        new SortedDocValuesField(
            "dv", new BytesRef(TestUtil.randomRealisticUnicodeString(random()))));
    writer.addDocument(doc);
  }
  if (random().nextBoolean()) {
    for (int i = 0; i < docs; i++) {
      if (random().nextBoolean()) {
        Document doc = new Document();
        doc.add(
            new TextField(
                "id", "" + i, random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
        doc.add(
            new TextField(
                "body",
                TestUtil.randomRealisticUnicodeString(random()),
                random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
        writer.updateDocument(new Term("id", "" + i), doc);
      }
    }
  }
  if (random().nextBoolean()) {
    DirectoryReader.open(writer, random().nextBoolean()).close(); // flush
  }
  Store.MetadataSnapshot metadata;
  // check before we committed
  try {
    store.getMetadata();
    fail("no index present - expected exception");
  } catch (IndexNotFoundException ex) {
    // expected
  }
  assertThat(store.getMetadataOrEmpty(), is(Store.MetadataSnapshot.EMPTY)); // nothing committed
  writer.commit();
  writer.close();
  metadata = store.getMetadata();
  assertThat(metadata.asMap().isEmpty(), is(false));
  for (StoreFileMetaData meta : metadata) {
    try (IndexInput input = store.directory().openInput(meta.name(), IOContext.DEFAULT)) {
      String checksum = Store.digestToString(CodecUtil.retrieveChecksum(input));
      assertThat(
          "File: " + meta.name() + " has a different checksum",
          meta.checksum(),
          equalTo(checksum));
      assertThat(meta.hasLegacyChecksum(), equalTo(false));
      assertThat(meta.writtenBy(), equalTo(Version.LATEST));
      if (meta.name().endsWith(".si") || meta.name().startsWith("segments_")) {
        assertThat(meta.hash().length, greaterThan(0));
      }
    }
  }
  assertConsistent(store, metadata);
  TestUtil.checkIndex(store.directory());
  assertDeleteContent(store, directoryService);
  IOUtils.close(store);
}
@Test
public void testCleanupFromSnapshot() throws IOException {
  final ShardId shardId = new ShardId(new Index("index"), 1);
  DirectoryService directoryService = new LuceneManagedDirectoryService(random());
  Store store =
      new Store(
          shardId,
          ImmutableSettings.EMPTY,
          directoryService,
          randomDistributor(directoryService),
          new DummyShardLock(shardId));
  // this time random codec....
  IndexWriterConfig indexWriterConfig =
      newIndexWriterConfig(random(), new MockAnalyzer(random())).setCodec(actualDefaultCodec());
  // we keep all commits and that allows us to clean based on multiple snapshots
  indexWriterConfig.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE);
  IndexWriter writer = new IndexWriter(store.directory(), indexWriterConfig);
  int docs = 1 + random().nextInt(100);
  int numCommits = 0;
  for (int i = 0; i < docs; i++) {
    if (i > 0 && randomIntBetween(0, 10) == 0) {
      writer.commit();
      numCommits++;
    }
    Document doc = new Document();
    doc.add(
        new TextField("id", "" + i, random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
    doc.add(
        new TextField(
            "body",
            TestUtil.randomRealisticUnicodeString(random()),
            random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
    doc.add(
        new SortedDocValuesField(
            "dv", new BytesRef(TestUtil.randomRealisticUnicodeString(random()))));
    writer.addDocument(doc);
  }
  if (numCommits < 1) {
    writer.commit();
    Document doc = new Document();
    doc.add(
        new TextField(
            "id", "" + docs++, random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
    doc.add(
        new TextField(
            "body",
            TestUtil.randomRealisticUnicodeString(random()),
            random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
    doc.add(
        new SortedDocValuesField(
            "dv", new BytesRef(TestUtil.randomRealisticUnicodeString(random()))));
    writer.addDocument(doc);
  }

  Store.MetadataSnapshot firstMeta = store.getMetadata();

  if (random().nextBoolean()) {
    for (int i = 0; i < docs; i++) {
      if (random().nextBoolean()) {
        Document doc = new Document();
        doc.add(
            new TextField(
                "id", "" + i, random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
        doc.add(
            new TextField(
                "body",
                TestUtil.randomRealisticUnicodeString(random()),
                random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
        writer.updateDocument(new Term("id", "" + i), doc);
      }
    }
  }
  writer.commit();
  writer.close();

  Store.MetadataSnapshot secondMeta = store.getMetadata();

  Store.LegacyChecksums checksums = new Store.LegacyChecksums();
  Map<String, StoreFileMetaData> legacyMeta = new HashMap<>();
  for (String file : store.directory().listAll()) {
    if (file.equals("write.lock") || file.equals(IndexFileNames.OLD_SEGMENTS_GEN)) {
      continue;
    }
    BytesRef hash = new BytesRef();
    if (file.startsWith("segments")) {
      hash = Store.MetadataSnapshot.hashFile(store.directory(), file);
    }
    StoreFileMetaData storeFileMetaData =
        new StoreFileMetaData(
            file, store.directory().fileLength(file), file + "checksum", null, hash);
    legacyMeta.put(file, storeFileMetaData);
    checksums.add(storeFileMetaData);
  }
  // write one checksum file here - we expect it to survive all the cleanups
  checksums.write(store);

  if (randomBoolean()) {
    store.cleanupAndVerify("test", firstMeta);
    String[] strings = store.directory().listAll();
    int numChecksums = 0;
    int numNotFound = 0;
    for (String file : strings) {
      assertTrue(firstMeta.contains(file) || Store.isChecksum(file));
      if (Store.isChecksum(file)) {
        numChecksums++;
      } else if (secondMeta.contains(file) == false) {
        numNotFound++;
      }
    }
    assertTrue(
        "at least one file must not be in here since we have two commits?", numNotFound > 0);
    assertEquals(
        "we wrote one checksum but it's gone now? - checksums are supposed to be kept",
        numChecksums,
        1);
  } else {
    store.cleanupAndVerify("test", secondMeta);
    String[] strings = store.directory().listAll();
    int numChecksums = 0;
    int numNotFound = 0;
    for (String file : strings) {
      assertTrue(secondMeta.contains(file) || Store.isChecksum(file));
      if (Store.isChecksum(file)) {
        numChecksums++;
      } else if (firstMeta.contains(file) == false) {
        numNotFound++;
      }
    }
    assertTrue(
        "at least one file must not be in here since we have two commits?", numNotFound > 0);
    assertEquals(
        "we wrote one checksum but it's gone now? - checksums are supposed to be kept",
        numChecksums,
        1);
  }

  store.deleteContent();
  IOUtils.close(store);
}
public void runTest(String testName) throws Exception {

  failed.set(false);
  addCount.set(0);
  delCount.set(0);
  packCount.set(0);

  final long t0 = System.currentTimeMillis();

  Random random = new Random(random().nextLong());
  final LineFileDocs docs = new LineFileDocs(random, true);
  final Path tempDir = createTempDir(testName);
  dir = getDirectory(newMockFSDirectory(tempDir)); // some subclasses rely on this being MDW
  if (dir instanceof BaseDirectoryWrapper) {
    ((BaseDirectoryWrapper) dir)
        .setCheckIndexOnClose(false); // don't double-checkIndex, we do it ourselves.
  }
  MockAnalyzer analyzer = new MockAnalyzer(random());
  analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
  final IndexWriterConfig conf = newIndexWriterConfig(analyzer).setCommitOnClose(false);
  conf.setInfoStream(new FailOnNonBulkMergesInfoStream());
  if (conf.getMergePolicy() instanceof MockRandomMergePolicy) {
    ((MockRandomMergePolicy) conf.getMergePolicy()).setDoNonBulkMerges(false);
  }

  if (LuceneTestCase.TEST_NIGHTLY) {
    // newIWConfig makes smallish max seg size, which
    // results in tons and tons of segments for this test
    // when run nightly:
    MergePolicy mp = conf.getMergePolicy();
    if (mp instanceof TieredMergePolicy) {
      ((TieredMergePolicy) mp).setMaxMergedSegmentMB(5000.);
    } else if (mp instanceof LogByteSizeMergePolicy) {
      ((LogByteSizeMergePolicy) mp).setMaxMergeMB(1000.);
    } else if (mp instanceof LogMergePolicy) {
      ((LogMergePolicy) mp).setMaxMergeDocs(100000);
    }
  }

  conf.setMergedSegmentWarmer(
      new IndexWriter.IndexReaderWarmer() {
        @Override
        public void warm(LeafReader reader) throws IOException {
          if (VERBOSE) {
            System.out.println("TEST: now warm merged reader=" + reader);
          }
          warmed.put(((SegmentReader) reader).core, Boolean.TRUE);
          final int maxDoc = reader.maxDoc();
          final Bits liveDocs = reader.getLiveDocs();
          int sum = 0;
          final int inc = Math.max(1, maxDoc / 50);
          for (int docID = 0; docID < maxDoc; docID += inc) {
            if (liveDocs == null || liveDocs.get(docID)) {
              final StoredDocument doc = reader.document(docID);
              sum += doc.getFields().size();
            }
          }

          IndexSearcher searcher = newSearcher(reader);
          sum += searcher.search(new TermQuery(new Term("body", "united")), 10).totalHits;

          if (VERBOSE) {
            System.out.println("TEST: warm visited " + sum + " fields");
          }
        }
      });

  if (VERBOSE) {
    conf.setInfoStream(
        new PrintStreamInfoStream(System.out) {
          @Override
          public void message(String component, String message) {
            if ("TP".equals(component)) {
              return; // ignore test points!
            }
            super.message(component, message);
          }
        });
  }
  writer = new IndexWriter(dir, conf);
  TestUtil.reduceOpenFiles(writer);

  final ExecutorService es =
      random().nextBoolean()
          ? null
          : Executors.newCachedThreadPool(new NamedThreadFactory(testName));

  doAfterWriter(es);

  final int NUM_INDEX_THREADS = TestUtil.nextInt(random(), 2, 4);

  final int RUN_TIME_SEC = LuceneTestCase.TEST_NIGHTLY ? 300 : RANDOM_MULTIPLIER;

  final Set<String> delIDs = Collections.synchronizedSet(new HashSet<String>());
  final Set<String> delPackIDs = Collections.synchronizedSet(new HashSet<String>());
  final List<SubDocs> allSubDocs = Collections.synchronizedList(new ArrayList<SubDocs>());

  final long stopTime = System.currentTimeMillis() + RUN_TIME_SEC * 1000;

  final Thread[] indexThreads =
      launchIndexingThreads(docs, NUM_INDEX_THREADS, stopTime, delIDs, delPackIDs, allSubDocs);

  if (VERBOSE) {
    System.out.println(
        "TEST: DONE start "
            + NUM_INDEX_THREADS
            + " indexing threads ["
            + (System.currentTimeMillis() - t0)
            + " ms]");
  }

  // Let index build up a bit
  Thread.sleep(100);

  doSearching(es, stopTime);

  if (VERBOSE) {
    System.out.println("TEST: all searching done [" + (System.currentTimeMillis() - t0) + " ms]");
  }

  for (int thread = 0; thread < indexThreads.length; thread++) {
    indexThreads[thread].join();
  }

  if (VERBOSE) {
    System.out.println(
        "TEST: done join indexing threads ["
            + (System.currentTimeMillis() - t0)
            + " ms]; addCount="
            + addCount
            + " delCount="
            + delCount);
  }

  final IndexSearcher s = getFinalSearcher();
  if (VERBOSE) {
    System.out.println("TEST: finalSearcher=" + s);
  }

  assertFalse(failed.get());

  boolean doFail = false;

  // Verify: make sure delIDs are in fact deleted:
  for (String id : delIDs) {
    final TopDocs hits = s.search(new TermQuery(new Term("docid", id)), 1);
    if (hits.totalHits != 0) {
      System.out.println(
          "doc id="
              + id
              + " is supposed to be deleted, but got "
              + hits.totalHits
              + " hits; first docID="
              + hits.scoreDocs[0].doc);
      doFail = true;
    }
  }

  // Verify: make sure delPackIDs are in fact deleted:
  for (String id : delPackIDs) {
    final TopDocs hits = s.search(new TermQuery(new Term("packID", id)), 1);
    if (hits.totalHits != 0) {
      System.out.println(
          "packID=" + id + " is supposed to be deleted, but got " + hits.totalHits + " matches");
      doFail = true;
    }
  }

  // Verify: make sure each group of sub-docs are still in docID order:
  for (SubDocs subDocs : allSubDocs) {
    TopDocs hits = s.search(new TermQuery(new Term("packID", subDocs.packID)), 20);
    if (!subDocs.deleted) {
      // We sort by relevance but the scores should be identical so sort falls back to by docID:
      if (hits.totalHits != subDocs.subIDs.size()) {
        System.out.println(
            "packID="
                + subDocs.packID
                + ": expected "
                + subDocs.subIDs.size()
                + " hits but got "
                + hits.totalHits);
        doFail = true;
      } else {
        int lastDocID = -1;
        int startDocID = -1;
        for (ScoreDoc scoreDoc : hits.scoreDocs) {
          final int docID = scoreDoc.doc;
          if (lastDocID != -1) {
            assertEquals(1 + lastDocID, docID);
          } else {
            startDocID = docID;
          }
          lastDocID = docID;
          final StoredDocument doc = s.doc(docID);
          assertEquals(subDocs.packID, doc.get("packID"));
        }

        lastDocID = startDocID - 1;
        for (String subID : subDocs.subIDs) {
          hits = s.search(new TermQuery(new Term("docid", subID)), 1);
          assertEquals(1, hits.totalHits);
          final int docID = hits.scoreDocs[0].doc;
          if (lastDocID != -1) {
            assertEquals(1 + lastDocID, docID);
          }
          lastDocID = docID;
        }
      }
    } else {
      // Pack was deleted -- make sure its docs are
      // deleted.  We can't verify packID is deleted
      // because we can re-use packID for update:
      for (String subID : subDocs.subIDs) {
        assertEquals(0, s.search(new TermQuery(new Term("docid", subID)), 1).totalHits);
      }
    }
  }

  // Verify: make sure all not-deleted docs are in fact
  // not deleted:
  final int endID = Integer.parseInt(docs.nextDoc().get("docid"));
  docs.close();

  for (int id = 0; id < endID; id++) {
    String stringID = "" + id;
    if (!delIDs.contains(stringID)) {
      final TopDocs hits = s.search(new TermQuery(new Term("docid", stringID)), 1);
      if (hits.totalHits != 1) {
        System.out.println(
            "doc id="
                + stringID
                + " is not supposed to be deleted, but got hitCount="
                + hits.totalHits
                + "; delIDs="
                + delIDs);
        doFail = true;
      }
    }
  }
  assertFalse(doFail);

  assertEquals(
      "index=" + writer.segString() + " addCount=" + addCount + " delCount=" + delCount,
      addCount.get() - delCount.get(),
      s.getIndexReader().numDocs());
  releaseSearcher(s);

  writer.commit();

  assertEquals(
      "index=" + writer.segString() + " addCount=" + addCount + " delCount=" + delCount,
      addCount.get() - delCount.get(),
      writer.numDocs());

  doClose();

  try {
    writer.commit();
  } finally {
    writer.close();
  }

  // Cannot close until after writer is closed because
  // writer has merged segment warmer that uses IS to run
  // searches, and that IS may be using this es!
  if (es != null) {
    es.shutdown();
    es.awaitTermination(1, TimeUnit.SECONDS);
  }

  TestUtil.checkIndex(dir);
  dir.close();
  IOUtils.rm(tempDir);

  if (VERBOSE) {
    System.out.println("TEST: done [" + (System.currentTimeMillis() - t0) + " ms]");
  }
}