/**
 * Advances the given positions enum to {@code target} and verifies the result: the skip must not
 * read more than {@code maxCounter} bytes (tracked via the file-local {@code counter}), must land
 * exactly on {@code target}, and the single position there must carry a one-byte payload whose
 * value equals {@code (byte) target}.
 *
 * @param tp positions enum to advance
 * @param target document id to skip to
 * @param maxCounter upper bound on bytes the skip is allowed to read
 * @throws IOException if the underlying enum fails
 */
public void checkSkipTo(DocsAndPositionsEnum tp, int target, int maxCounter) throws IOException {
  tp.advance(target);

  // The skip list should let us reach the target without scanning too much data.
  if (counter > maxCounter) {
    fail("Too many bytes read: " + counter + " vs " + maxCounter);
  }

  final int doc = tp.docID();
  assertEquals("Wrong document " + doc + " after skipTo target " + target, target, doc);

  final int freq = tp.freq();
  assertEquals("Frequency is not 1: " + freq, 1, freq);

  // Single position per doc; its payload encodes the doc id as one byte.
  tp.nextPosition();
  final BytesRef payload = tp.getPayload();
  assertEquals(1, payload.length);
  final byte actual = payload.bytes[payload.offset];
  assertEquals("Wrong payload for the target " + target + ": " + actual, (byte) target, actual);
}
/** * Reconstruct document fields. * * @param docNum document number. If this document is deleted, but the index is not optimized yet, * the reconstruction process may still yield the reconstructed field content even from * deleted documents. * @return reconstructed document * @throws Exception */ public Reconstructed reconstruct(int docNum) throws Exception { if (docNum < 0 || docNum > reader.maxDoc()) { throw new Exception("Document number outside of valid range."); } Reconstructed res = new Reconstructed(); if (deleted != null && deleted.get(docNum)) { throw new Exception("Document is deleted."); } else { Document doc = reader.document(docNum); for (int i = 0; i < fieldNames.length; i++) { Field[] fs = doc.getFields(fieldNames[i]); if (fs != null && fs.length > 0) { res.getStoredFields().put(fieldNames[i], fs); } } } // collect values from unstored fields HashSet<String> fields = new HashSet<String>(Arrays.asList(fieldNames)); // try to use term vectors if available progress.maxValue = fieldNames.length; progress.curValue = 0; progress.minValue = 0; for (int i = 0; i < fieldNames.length; i++) { TermFreqVector tvf = reader.getTermFreqVector(docNum, fieldNames[i]); if (tvf != null && tvf.size() > 0 && (tvf instanceof TermPositionVector)) { TermPositionVector tpv = (TermPositionVector) tvf; progress.message = "Reading term vectors ..."; progress.curValue = i; setChanged(); notifyObservers(progress); BytesRef[] tv = tpv.getTerms(); for (int k = 0; k < tv.length; k++) { // do we have positions? 
int[] posArr = tpv.getTermPositions(k); if (posArr == null) { // only offsets TermVectorOffsetInfo[] offsets = tpv.getOffsets(k); if (offsets.length == 0) { continue; } // convert offsets into positions posArr = convertOffsets(offsets); } GrowableStringArray gsa = res.getReconstructedFields().get(fieldNames[i]); if (gsa == null) { gsa = new GrowableStringArray(); res.getReconstructedFields().put(fieldNames[i], gsa); } for (int m = 0; m < posArr.length; m++) { gsa.append(posArr[m], "|", tv[k].utf8ToString()); } } fields.remove(fieldNames[i]); // got what we wanted } } // this loop collects data only from left-over fields // not yet collected through term vectors progress.maxValue = fields.size(); progress.curValue = 0; progress.minValue = 0; for (String fld : fields) { progress.message = "Collecting terms in " + fld + " ..."; progress.curValue++; setChanged(); notifyObservers(progress); Terms terms = MultiFields.getTerms(reader, fld); if (terms == null) { // no terms in this field continue; } TermsEnum te = terms.iterator(); while (te.next() != null) { DocsAndPositionsEnum dpe = te.docsAndPositions(deleted, null); if (dpe == null) { // no position info for this field break; } int num = dpe.advance(docNum); if (num != docNum) { // either greater than or NO_MORE_DOCS continue; // no data for this term in this doc } String term = te.term().utf8ToString(); GrowableStringArray gsa = (GrowableStringArray) res.getReconstructedFields().get(fld); if (gsa == null) { gsa = new GrowableStringArray(); res.getReconstructedFields().put(fld, gsa); } for (int k = 0; k < dpe.freq(); k++) { int pos = dpe.nextPosition(); gsa.append(pos, "|", term); } } } progress.message = "Done."; progress.curValue = 100; setChanged(); notifyObservers(progress); return res; }
/**
 * Randomized round-trip test for long postings lists: indexes NUM_DOCS documents, each containing
 * one of two random terms (s1 or s2), then replays the resulting postings with a random mix of
 * nextDoc()/advance() calls, checking every returned docID, freq, and position against an
 * independently tracked expected value (the isS1 bitset).
 */
public void testLongPostings() throws Exception {
  // Don't use _TestUtil.getTempDir so that we own the
  // randomness (ie same seed will point to same dir):
  Directory dir = newFSDirectory(_TestUtil.getTempDir("longpostings" + "." + random().nextLong()));
  final int NUM_DOCS = atLeast(2000);
  if (VERBOSE) {
    System.out.println("TEST: NUM_DOCS=" + NUM_DOCS);
  }
  // Two distinct random terms; getRandomTerm(s1) presumably avoids returning s1 — TODO confirm.
  final String s1 = getRandomTerm(null);
  final String s2 = getRandomTerm(s1);
  if (VERBOSE) {
    System.out.println("\nTEST: s1=" + s1 + " s2=" + s2);
    /*
    for(int idx=0;idx<s1.length();idx++) {
      System.out.println("  s1 ch=0x" + Integer.toHexString(s1.charAt(idx)));
    }
    for(int idx=0;idx<s2.length();idx++) {
      System.out.println("  s2 ch=0x" + Integer.toHexString(s2.charAt(idx)));
    }
    */
  }
  // isS1.get(doc) == true means doc contains s1, otherwise it contains s2.
  final FixedBitSet isS1 = new FixedBitSet(NUM_DOCS);
  for (int idx = 0; idx < NUM_DOCS; idx++) {
    if (random().nextBoolean()) {
      isS1.set(idx);
    }
  }
  final IndexReader r;
  final IndexWriterConfig iwc =
      newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
          .setOpenMode(IndexWriterConfig.OpenMode.CREATE)
          .setMergePolicy(newLogMergePolicy());
  iwc.setRAMBufferSizeMB(16.0 + 16.0 * random().nextDouble());
  iwc.setMaxBufferedDocs(-1);
  final RandomIndexWriter riw = new RandomIndexWriter(random(), dir, iwc);
  for (int idx = 0; idx < NUM_DOCS; idx++) {
    final Document doc = new Document();
    String s = isS1.get(idx) ? s1 : s2;
    final Field f = newTextField("field", s, Field.Store.NO);
    // Add the same term 1..4 times so freq varies per doc.
    final int count = _TestUtil.nextInt(random(), 1, 4);
    for (int ct = 0; ct < count; ct++) {
      doc.add(f);
    }
    riw.addDocument(doc);
  }
  r = riw.getReader();
  riw.close();
  /*
  if (VERBOSE) {
    System.out.println("TEST: terms");
    TermEnum termEnum = r.terms();
    while(termEnum.next()) {
      System.out.println("  term=" + termEnum.term() + " len=" + termEnum.term().text().length());
      assertTrue(termEnum.docFreq() > 0);
      System.out.println("  s1?=" + (termEnum.term().text().equals(s1)) + " s1len=" + s1.length());
      System.out.println("  s2?=" + (termEnum.term().text().equals(s2)) + " s2len=" + s2.length());
      final String s = termEnum.term().text();
      for(int idx=0;idx<s.length();idx++) {
        System.out.println("  ch=0x" + Integer.toHexString(s.charAt(idx)));
      }
    }
  }
  */
  assertEquals(NUM_DOCS, r.numDocs());
  assertTrue(r.docFreq(new Term("field", s1)) > 0);
  assertTrue(r.docFreq(new Term("field", s2)) > 0);
  int num = atLeast(1000);
  for (int iter = 0; iter < num; iter++) {
    // Pick one of the two terms and walk its postings from the start.
    final String term;
    final boolean doS1;
    if (random().nextBoolean()) {
      term = s1;
      doS1 = true;
    } else {
      term = s2;
      doS1 = false;
    }
    if (VERBOSE) {
      System.out.println("\nTEST: iter=" + iter + " doS1=" + doS1);
    }
    final DocsAndPositionsEnum postings =
        MultiFields.getTermPositionsEnum(r, null, "field", new BytesRef(term));
    int docID = -1;
    while (docID < DocIdSetIterator.NO_MORE_DOCS) {
      // Randomly choose next-doc (1/3) vs advance (2/3) at each step.
      final int what = random().nextInt(3);
      if (what == 0) {
        if (VERBOSE) {
          System.out.println("TEST: docID=" + docID + "; do next()");
        }
        // nextDoc
        // Walk the bitset forward to find the next doc that should contain this term.
        int expected = docID + 1;
        while (true) {
          if (expected == NUM_DOCS) {
            // Ran off the end — enum must report NO_MORE_DOCS (== Integer.MAX_VALUE).
            expected = Integer.MAX_VALUE;
            break;
          } else if (isS1.get(expected) == doS1) {
            break;
          } else {
            expected++;
          }
        }
        docID = postings.nextDoc();
        if (VERBOSE) {
          System.out.println("  got docID=" + docID);
        }
        assertEquals(expected, docID);
        if (docID == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        // Occasionally also verify positions (0..freq-1) and exercise getPayload().
        if (random().nextInt(6) == 3) {
          final int freq = postings.freq();
          assertTrue(freq >= 1 && freq <= 4);
          for (int pos = 0; pos < freq; pos++) {
            assertEquals(pos, postings.nextPosition());
            if (random().nextBoolean()) {
              postings.getPayload();
              if (random().nextBoolean()) {
                postings.getPayload(); // get it again
              }
            }
          }
        }
      } else {
        // advance
        // Pick a random legal target strictly beyond the current doc.
        final int targetDocID;
        if (docID == -1) {
          targetDocID = random().nextInt(NUM_DOCS + 1);
        } else {
          targetDocID = docID + _TestUtil.nextInt(random(), 1, NUM_DOCS - docID);
        }
        if (VERBOSE) {
          System.out.println("TEST: docID=" + docID + "; do advance(" + targetDocID + ")");
        }
        // Expected result: first doc >= target that holds this term, else NO_MORE_DOCS.
        int expected = targetDocID;
        while (true) {
          if (expected == NUM_DOCS) {
            expected = Integer.MAX_VALUE;
            break;
          } else if (isS1.get(expected) == doS1) {
            break;
          } else {
            expected++;
          }
        }
        docID = postings.advance(targetDocID);
        if (VERBOSE) {
          System.out.println("  got docID=" + docID);
        }
        assertEquals(expected, docID);
        if (docID == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        // Occasionally also verify positions (0..freq-1) and exercise getPayload().
        if (random().nextInt(6) == 3) {
          final int freq = postings.freq();
          assertTrue(freq >= 1 && freq <= 4);
          for (int pos = 0; pos < freq; pos++) {
            assertEquals(pos, postings.nextPosition());
            if (random().nextBoolean()) {
              postings.getPayload();
              if (random().nextBoolean()) {
                postings.getPayload(); // get it again
              }
            }
          }
        }
      }
    }
  }
  r.close();
  dir.close();
}