public void checkSkipTo(DocsAndPositionsEnum tp, int target, int maxCounter) throws IOException {
    tp.advance(target);
    if (maxCounter < counter) {
      fail("Too many bytes read: " + counter + " vs " + maxCounter);
    }

    assertEquals(
        "Wrong document " + tp.docID() + " after skipTo target " + target, target, tp.docID());
    assertEquals("Frequency is not 1: " + tp.freq(), 1, tp.freq());
    tp.nextPosition();
    BytesRef b = tp.getPayload();
    assertEquals(1, b.length);
    assertEquals(
        "Wrong payload for the target " + target + ": " + b.bytes[b.offset],
        (byte) target,
        b.bytes[b.offset]);
  }
Beispiel #2
0
 /**
  * Reconstruct document fields.
  *
  * @param docNum document number. If this document is deleted, but the index is not optimized yet,
  *     the reconstruction process may still yield the reconstructed field content even from
  *     deleted documents.
  * @return reconstructed document
  * @throws Exception
  */
 public Reconstructed reconstruct(int docNum) throws Exception {
   if (docNum < 0 || docNum > reader.maxDoc()) {
     throw new Exception("Document number outside of valid range.");
   }
   Reconstructed res = new Reconstructed();
   if (deleted != null && deleted.get(docNum)) {
     throw new Exception("Document is deleted.");
   } else {
     Document doc = reader.document(docNum);
     for (int i = 0; i < fieldNames.length; i++) {
       Field[] fs = doc.getFields(fieldNames[i]);
       if (fs != null && fs.length > 0) {
         res.getStoredFields().put(fieldNames[i], fs);
       }
     }
   }
   // collect values from unstored fields
   HashSet<String> fields = new HashSet<String>(Arrays.asList(fieldNames));
   // try to use term vectors if available
   progress.maxValue = fieldNames.length;
   progress.curValue = 0;
   progress.minValue = 0;
   for (int i = 0; i < fieldNames.length; i++) {
     TermFreqVector tvf = reader.getTermFreqVector(docNum, fieldNames[i]);
     if (tvf != null && tvf.size() > 0 && (tvf instanceof TermPositionVector)) {
       TermPositionVector tpv = (TermPositionVector) tvf;
       progress.message = "Reading term vectors ...";
       progress.curValue = i;
       setChanged();
       notifyObservers(progress);
       BytesRef[] tv = tpv.getTerms();
       for (int k = 0; k < tv.length; k++) {
         // do we have positions?
         int[] posArr = tpv.getTermPositions(k);
         if (posArr == null) {
           // only offsets
           TermVectorOffsetInfo[] offsets = tpv.getOffsets(k);
           if (offsets.length == 0) {
             continue;
           }
           // convert offsets into positions
           posArr = convertOffsets(offsets);
         }
         GrowableStringArray gsa = res.getReconstructedFields().get(fieldNames[i]);
         if (gsa == null) {
           gsa = new GrowableStringArray();
           res.getReconstructedFields().put(fieldNames[i], gsa);
         }
         for (int m = 0; m < posArr.length; m++) {
           gsa.append(posArr[m], "|", tv[k].utf8ToString());
         }
       }
       fields.remove(fieldNames[i]); // got what we wanted
     }
   }
   // this loop collects data only from left-over fields
   // not yet collected through term vectors
   progress.maxValue = fields.size();
   progress.curValue = 0;
   progress.minValue = 0;
   for (String fld : fields) {
     progress.message = "Collecting terms in " + fld + " ...";
     progress.curValue++;
     setChanged();
     notifyObservers(progress);
     Terms terms = MultiFields.getTerms(reader, fld);
     if (terms == null) { // no terms in this field
       continue;
     }
     TermsEnum te = terms.iterator();
     while (te.next() != null) {
       DocsAndPositionsEnum dpe = te.docsAndPositions(deleted, null);
       if (dpe == null) { // no position info for this field
         break;
       }
       int num = dpe.advance(docNum);
       if (num != docNum) { // either greater than or NO_MORE_DOCS
         continue; // no data for this term in this doc
       }
       String term = te.term().utf8ToString();
       GrowableStringArray gsa = (GrowableStringArray) res.getReconstructedFields().get(fld);
       if (gsa == null) {
         gsa = new GrowableStringArray();
         res.getReconstructedFields().put(fld, gsa);
       }
       for (int k = 0; k < dpe.freq(); k++) {
         int pos = dpe.nextPosition();
         gsa.append(pos, "|", term);
       }
     }
   }
   progress.message = "Done.";
   progress.curValue = 100;
   setChanged();
   notifyObservers(progress);
   return res;
 }
  public void testLongPostings() throws Exception {
    // Don't use _TestUtil.getTempDir so that we own the
    // randomness (ie same seed will point to same dir):
    Directory dir =
        newFSDirectory(_TestUtil.getTempDir("longpostings" + "." + random().nextLong()));

    final int NUM_DOCS = atLeast(2000);

    if (VERBOSE) {
      System.out.println("TEST: NUM_DOCS=" + NUM_DOCS);
    }

    final String s1 = getRandomTerm(null);
    final String s2 = getRandomTerm(s1);

    if (VERBOSE) {
      System.out.println("\nTEST: s1=" + s1 + " s2=" + s2);
      /*
      for(int idx=0;idx<s1.length();idx++) {
        System.out.println("  s1 ch=0x" + Integer.toHexString(s1.charAt(idx)));
      }
      for(int idx=0;idx<s2.length();idx++) {
        System.out.println("  s2 ch=0x" + Integer.toHexString(s2.charAt(idx)));
      }
      */
    }

    final FixedBitSet isS1 = new FixedBitSet(NUM_DOCS);
    for (int idx = 0; idx < NUM_DOCS; idx++) {
      if (random().nextBoolean()) {
        isS1.set(idx);
      }
    }

    final IndexReader r;
    final IndexWriterConfig iwc =
        newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
            .setOpenMode(IndexWriterConfig.OpenMode.CREATE)
            .setMergePolicy(newLogMergePolicy());
    iwc.setRAMBufferSizeMB(16.0 + 16.0 * random().nextDouble());
    iwc.setMaxBufferedDocs(-1);
    final RandomIndexWriter riw = new RandomIndexWriter(random(), dir, iwc);

    for (int idx = 0; idx < NUM_DOCS; idx++) {
      final Document doc = new Document();
      String s = isS1.get(idx) ? s1 : s2;
      final Field f = newTextField("field", s, Field.Store.NO);
      final int count = _TestUtil.nextInt(random(), 1, 4);
      for (int ct = 0; ct < count; ct++) {
        doc.add(f);
      }
      riw.addDocument(doc);
    }

    r = riw.getReader();
    riw.close();

    /*
    if (VERBOSE) {
      System.out.println("TEST: terms");
      TermEnum termEnum = r.terms();
      while(termEnum.next()) {
        System.out.println("  term=" + termEnum.term() + " len=" + termEnum.term().text().length());
        assertTrue(termEnum.docFreq() > 0);
        System.out.println("    s1?=" + (termEnum.term().text().equals(s1)) + " s1len=" + s1.length());
        System.out.println("    s2?=" + (termEnum.term().text().equals(s2)) + " s2len=" + s2.length());
        final String s = termEnum.term().text();
        for(int idx=0;idx<s.length();idx++) {
          System.out.println("      ch=0x" + Integer.toHexString(s.charAt(idx)));
        }
      }
    }
    */

    assertEquals(NUM_DOCS, r.numDocs());
    assertTrue(r.docFreq(new Term("field", s1)) > 0);
    assertTrue(r.docFreq(new Term("field", s2)) > 0);

    int num = atLeast(1000);
    for (int iter = 0; iter < num; iter++) {

      final String term;
      final boolean doS1;
      if (random().nextBoolean()) {
        term = s1;
        doS1 = true;
      } else {
        term = s2;
        doS1 = false;
      }

      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter + " doS1=" + doS1);
      }

      final DocsAndPositionsEnum postings =
          MultiFields.getTermPositionsEnum(r, null, "field", new BytesRef(term));

      int docID = -1;
      while (docID < DocIdSetIterator.NO_MORE_DOCS) {
        final int what = random().nextInt(3);
        if (what == 0) {
          if (VERBOSE) {
            System.out.println("TEST: docID=" + docID + "; do next()");
          }
          // nextDoc
          int expected = docID + 1;
          while (true) {
            if (expected == NUM_DOCS) {
              expected = Integer.MAX_VALUE;
              break;
            } else if (isS1.get(expected) == doS1) {
              break;
            } else {
              expected++;
            }
          }
          docID = postings.nextDoc();
          if (VERBOSE) {
            System.out.println("  got docID=" + docID);
          }
          assertEquals(expected, docID);
          if (docID == DocIdSetIterator.NO_MORE_DOCS) {
            break;
          }

          if (random().nextInt(6) == 3) {
            final int freq = postings.freq();
            assertTrue(freq >= 1 && freq <= 4);
            for (int pos = 0; pos < freq; pos++) {
              assertEquals(pos, postings.nextPosition());
              if (random().nextBoolean()) {
                postings.getPayload();
                if (random().nextBoolean()) {
                  postings.getPayload(); // get it again
                }
              }
            }
          }
        } else {
          // advance
          final int targetDocID;
          if (docID == -1) {
            targetDocID = random().nextInt(NUM_DOCS + 1);
          } else {
            targetDocID = docID + _TestUtil.nextInt(random(), 1, NUM_DOCS - docID);
          }
          if (VERBOSE) {
            System.out.println("TEST: docID=" + docID + "; do advance(" + targetDocID + ")");
          }
          int expected = targetDocID;
          while (true) {
            if (expected == NUM_DOCS) {
              expected = Integer.MAX_VALUE;
              break;
            } else if (isS1.get(expected) == doS1) {
              break;
            } else {
              expected++;
            }
          }

          docID = postings.advance(targetDocID);
          if (VERBOSE) {
            System.out.println("  got docID=" + docID);
          }
          assertEquals(expected, docID);
          if (docID == DocIdSetIterator.NO_MORE_DOCS) {
            break;
          }

          if (random().nextInt(6) == 3) {
            final int freq = postings.freq();
            assertTrue(freq >= 1 && freq <= 4);
            for (int pos = 0; pos < freq; pos++) {
              assertEquals(pos, postings.nextPosition());
              if (random().nextBoolean()) {
                postings.getPayload();
                if (random().nextBoolean()) {
                  postings.getPayload(); // get it again
                }
              }
            }
          }
        }
      }
    }
    r.close();
    dir.close();
  }