public void testThreeBlocks() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    List<String> terms = new ArrayList<>();
    for (int i = 0; i < 36; i++) {
      Document doc = new Document();
      String term = "" + (char) (97 + i);
      terms.add(term);
      if (VERBOSE) {
        System.out.println("i=" + i + " term=" + term);
      }
      doc.add(newTextField("field", term, Field.Store.NO));
      w.addDocument(doc);
    }
    for (int i = 0; i < 36; i++) {
      Document doc = new Document();
      String term = "m" + (char) (97 + i);
      terms.add(term);
      if (VERBOSE) {
        System.out.println("i=" + i + " term=" + term);
      }
      doc.add(newTextField("field", term, Field.Store.NO));
      w.addDocument(doc);
    }
    for (int i = 0; i < 36; i++) {
      Document doc = new Document();
      String term = "mo" + (char) (97 + i);
      terms.add(term);
      if (VERBOSE) {
        System.out.println("i=" + i + " term=" + term);
      }
      doc.add(newTextField("field", term, Field.Store.NO));
      w.addDocument(doc);
    }
    w.forceMerge(1);
    IndexReader r = w.getReader();
    TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);

    if (VERBOSE) {
      while (te.next() != null) {
        System.out.println("TERM: " + te.ord() + " " + te.term().utf8ToString());
      }
    }

    assertTrue(te.seekExact(new BytesRef("mo")));
    assertEquals(27, te.ord());

    te.seekExact(90);
    assertEquals(new BytesRef("s"), te.term());

    testEnum(te, terms);

    r.close();
    w.close();
    dir.close();
  }
  public TermInfo collect(String term) throws IOException {
    TermInfo info = new TermInfo();
    // BytesRef(String) encodes UTF-8; term.getBytes() would use the platform default charset
    BytesRef luceneTerm = new BytesRef(term);
    // this gives documents in which the term is found, but no offset information can be retrieved
    PostingsEnum postings =
        MultiFields.getTermDocsEnum(indexReader, ngramInfoFieldname, luceneTerm);
    if (postings == null) {
      return info; // term does not occur in this field
    }
    // now go through each document
    int docId = postings.nextDoc();
    while (docId != PostingsEnum.NO_MORE_DOCS) {
      // get the term vector for that document.
      TermsEnum it = indexReader.getTermVector(docId, ngramInfoFieldname).iterator();
      // find the term of interest
      it.seekExact(luceneTerm);
      // get its posting info. this will contain offset info
      PostingsEnum postingsInDoc = it.postings(null, PostingsEnum.OFFSETS);
      postingsInDoc.nextDoc();

      Document doc = indexReader.document(docId);
      String id = doc.get(idFieldname);
      JATEDocument jd = new JATEDocument(id);
      Set<int[]> offsets = new HashSet<>();
      int totalFreq = postingsInDoc.freq();
      for (int i = 0; i < totalFreq; i++) {
        postingsInDoc.nextPosition();
        offsets.add(new int[] {postingsInDoc.startOffset(), postingsInDoc.endOffset()});
      }
      info.getOffsets().put(jd, offsets);

      docId = postings.nextDoc();
    }

    return info;
  }
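Note: the per-document getTermVector() lookup above only works if the field was indexed with term vectors that include positions and offsets. A minimal indexing-side sketch, assuming a Lucene 5.x/6.x API (ngramText and doc are hypothetical names):

  // Hypothetical field setup so indexReader.getTermVector(docId, ngramInfoFieldname)
  // can expose offsets via PostingsEnum.OFFSETS:
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorPositions(true);
  ft.setStoreTermVectorOffsets(true);
  doc.add(new Field(ngramInfoFieldname, ngramText, ft));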
  public void testDocsAndPositionsEnumStart() throws Exception {
    Analyzer analyzer = new MockAnalyzer(random());
    int numIters = atLeast(3);
    MemoryIndex memory = new MemoryIndex(true, false, random().nextInt(50) * 1024 * 1024);
    for (int i = 0; i < numIters; i++) { // check reuse
      memory.addField("foo", "bar", analyzer);
      LeafReader reader = (LeafReader) memory.createSearcher().getIndexReader();
      TestUtil.checkReader(reader);
      assertEquals(1, reader.terms("foo").getSumTotalTermFreq());
      PostingsEnum disi = reader.postings(new Term("foo", "bar"), PostingsEnum.ALL);
      int docid = disi.docID();
      assertEquals(-1, docid);
      assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertEquals(0, disi.nextPosition());
      assertEquals(0, disi.startOffset());
      assertEquals(3, disi.endOffset());

      // now reuse and check again
      TermsEnum te = reader.terms("foo").iterator();
      assertTrue(te.seekExact(new BytesRef("bar")));
      disi = te.postings(disi);
      docid = disi.docID();
      assertEquals(-1, docid);
      assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      reader.close();
      memory.reset();
    }
  }
 @Override
 protected void fillDocsAndScores(FixedBitSet matchingDocs, TermsEnum termsEnum)
     throws IOException {
   BytesRef spare = new BytesRef();
   PostingsEnum postingsEnum = null;
   for (int i = 0; i < terms.size(); i++) {
     if (termsEnum.seekExact(terms.get(ords[i], spare))) {
       postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
       float score = TermsIncludingScoreQuery.this.scores[ords[i]];
       for (int doc = postingsEnum.nextDoc();
           doc != DocIdSetIterator.NO_MORE_DOCS;
           doc = postingsEnum.nextDoc()) {
         // I prefer this:
         /*if (scores[doc] < score) {
           scores[doc] = score;
           matchingDocs.set(doc);
         }*/
         // But this behaves the same as MVInnerScorer and only then the tests will pass:
         if (!matchingDocs.get(doc)) {
           scores[doc] = score;
           matchingDocs.set(doc);
         }
       }
     }
   }
 }
 /* Copied from lucene 4.2.x core */
 private static long totalTermFreq(IndexReader r, String field, BytesRef text) throws IOException {
   final Terms terms = MultiFields.getTerms(r, field);
   if (terms != null) {
     final TermsEnum termsEnum = terms.iterator(null);
     if (termsEnum.seekExact(text, true)) {
       return termsEnum.totalTermFreq();
     }
   }
   return 0;
 }
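A possible call site for the helper above (reader, field name, and term are assumptions):

 // Hypothetical usage: total occurrences of "lucene" in field "body", or 0 if the term is absent.
 long ttf = totalTermFreq(reader, "body", new BytesRef("lucene"));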
 SegmentResult(int[] counts, int total, TermsEnum tenum, int startFacetOrd, int endFacetOrd)
     throws IOException {
   super(counts, total - counts[0], counts[0], endFacetOrd + 1);
   this.tenum = tenum;
   this.mergePos = startFacetOrd == -1 ? 1 : startFacetOrd + 1;
   if (mergePos < maxTermPos) {
     assert tenum != null;
     tenum.seekExact(startFacetOrd == -1 ? 0 : startFacetOrd);
     mergeTerm = tenum.term();
   }
 }
  private void testEnum(TermsEnum te, List<String> terms) throws IOException {
    Collections.sort(terms);
    for (int i = terms.size() - 1; i >= 0; i--) {
      if (VERBOSE) {
        System.out.println("TEST: seek to ord=" + i);
      }
      te.seekExact(i);
      assertEquals(i, te.ord());
      assertEquals(terms.get(i), te.term().utf8ToString());
    }

    int iters = atLeast(1000);
    for (int iter = 0; iter < iters; iter++) {
      int ord = random().nextInt(terms.size());
      if (random().nextBoolean()) {
        te.seekExact(ord);
        assertEquals(terms.get(ord), te.term().utf8ToString());
      } else {
        te.seekExact(new BytesRef(terms.get(ord)));
        assertEquals(ord, te.ord());
      }
    }
  }
  public void testFloorBlocks() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
    IndexWriter w = new IndexWriter(dir, iwc);
    for (int i = 0; i < 128; i++) {
      Document doc = new Document();
      String term = "" + (char) i;
      if (VERBOSE) {
        System.out.println("i=" + i + " term=" + term + " bytes=" + new BytesRef(term));
      }
      doc.add(newStringField("field", term, Field.Store.NO));
      w.addDocument(doc);
    }
    w.forceMerge(1);
    IndexReader r = DirectoryReader.open(w, true);
    TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);

    if (VERBOSE) {
      BytesRef term;
      while ((term = te.next()) != null) {
        System.out.println("  " + te.ord() + ": " + term.utf8ToString());
      }
    }

    assertTrue(te.seekExact(new BytesRef("a")));
    assertEquals(97, te.ord());

    te.seekExact(98);
    assertEquals(new BytesRef("b"), te.term());

    assertTrue(te.seekExact(new BytesRef("z")));
    assertEquals(122, te.ord());

    r.close();
    w.close();
    dir.close();
  }
 public PostingsEnum randomDocsEnum(
     String field, BytesRef term, List<LeafReaderContext> readers, Bits bits) throws IOException {
   if (random().nextInt(10) == 0) {
     return null;
   }
   LeafReader indexReader = readers.get(random().nextInt(readers.size())).reader();
   Terms terms = indexReader.terms(field);
   if (terms == null) {
     return null;
   }
   TermsEnum iterator = terms.iterator();
   if (iterator.seekExact(term)) {
     return iterator.postings(
         bits, null, random().nextBoolean() ? PostingsEnum.FREQS : PostingsEnum.NONE);
   }
   return null;
 }
 SegmentResult(
     int[] counts,
     int total,
     int missingCountIndex,
     TermsEnum tenum,
     int startFacetOrd,
     int endFacetOrd)
     throws IOException {
   super(
       counts,
       total - counts[missingCountIndex],
       counts[missingCountIndex],
       endFacetOrd == missingCountIndex + 1 ? missingCountIndex : endFacetOrd);
   this.tenum = tenum;
   this.mergePos = startFacetOrd;
   if (tenum != null) {
     tenum.seekExact(mergePos);
     mergeTerm = tenum.term();
   }
 }
 private Query newTermQuery(IndexReader reader, Term term) throws IOException {
   if (ignoreTF) {
     return new ConstantScoreQuery(new TermQuery(term));
   } else {
     // we build an artificial TermContext that will give an overall df and ttf
     // equal to 1
     TermContext context = new TermContext(reader.getContext());
     for (LeafReaderContext leafContext : reader.leaves()) {
       Terms terms = leafContext.reader().terms(term.field());
       if (terms != null) {
         TermsEnum termsEnum = terms.iterator();
         if (termsEnum.seekExact(term.bytes())) {
           int freq = 1 - context.docFreq(); // we want the total df and ttf to be 1
           context.register(termsEnum.termState(), leafContext.ord, freq, freq);
         }
       }
     }
     return new TermQuery(term, context);
   }
 }
 protected void fillDocsAndScores(FixedBitSet matchingDocs, TermsEnum termsEnum)
     throws IOException {
   BytesRef spare = new BytesRef();
   PostingsEnum postingsEnum = null;
   for (int i = 0; i < terms.size(); i++) {
     if (termsEnum.seekExact(terms.get(ords[i], spare))) {
       postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
       float score = TermsIncludingScoreQuery.this.scores[ords[i]];
       for (int doc = postingsEnum.nextDoc();
           doc != DocIdSetIterator.NO_MORE_DOCS;
           doc = postingsEnum.nextDoc()) {
         matchingDocs.set(doc);
          // If the same doc is also related to another doc, its score might be overwritten.
          // I think this can only happen in a many-to-many relation.
         scores[doc] = score;
       }
     }
   }
 }
  public void testSeveralNonRootBlocks() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
    IndexWriter w = new IndexWriter(dir, iwc);
    List<String> terms = new ArrayList<>();
    for (int i = 0; i < 30; i++) {
      for (int j = 0; j < 30; j++) {
        Document doc = new Document();
        String term = "" + (char) (97 + i) + (char) (97 + j);
        terms.add(term);
        if (VERBOSE) {
          System.out.println("term=" + term);
        }
        doc.add(newTextField("body", term, Field.Store.NO));
        w.addDocument(doc);
      }
    }
    w.forceMerge(1);
    IndexReader r = DirectoryReader.open(w, true);
    TermsEnum te = MultiFields.getTerms(r, "body").iterator(null);

    for (int i = 0; i < 30; i++) {
      for (int j = 0; j < 30; j++) {
        String term = "" + (char) (97 + i) + (char) (97 + j);
        if (VERBOSE) {
          System.out.println("TEST: check term=" + term);
        }
        assertEquals(term, te.next().utf8ToString());
        assertEquals(30 * i + j, te.ord());
      }
    }

    testEnum(te, terms);

    te.seekExact(0);
    assertEquals("aa", te.term().utf8ToString());

    r.close();
    w.close();
    dir.close();
  }
  public static DocSet createDocSet(SolrIndexSearcher searcher, Term term) throws IOException {
    DirectoryReader reader = searcher.getRawReader(); // raw reader to avoid extra wrapping overhead
    int maxDoc = searcher.getIndexReader().maxDoc();
    int smallSetSize = smallSetSize(maxDoc);

    String field = term.field();
    BytesRef termVal = term.bytes();

    int maxCount = 0;
    int firstReader = -1;
    List<LeafReaderContext> leaves = reader.leaves();
    // use an array: slightly higher scanning cost, but fewer memory allocations
    PostingsEnum[] postList = new PostingsEnum[leaves.size()];
    for (LeafReaderContext ctx : leaves) {
      assert leaves.get(ctx.ord) == ctx;
      LeafReader r = ctx.reader();
      Fields f = r.fields();
      Terms t = f.terms(field);
      if (t == null) continue; // field is missing
      TermsEnum te = t.iterator();
      if (te.seekExact(termVal)) {
        maxCount += te.docFreq();
        postList[ctx.ord] = te.postings(null, PostingsEnum.NONE);
        if (firstReader < 0) firstReader = ctx.ord;
      }
    }

    if (maxCount == 0) {
      return DocSet.EMPTY;
    }

    if (maxCount <= smallSetSize) {
      return createSmallSet(leaves, postList, maxCount, firstReader);
    }

    return createBigSet(leaves, postList, maxDoc, firstReader);
  }
  protected List<OffsetsEnum> createOffsetsEnumsFromReader(LeafReader atomicReader, int doc)
      throws IOException {
    // For strict positions, get a Map of term to Spans:
    //    note: PhraseHelper.NONE does the right thing for these method calls
    final Map<BytesRef, Spans> strictPhrasesTermToSpans =
        strictPhrases.getTermToSpans(atomicReader, doc);
    // Usually simply wraps terms in a List; but if willRewrite() then can be expanded
    final List<BytesRef> sourceTerms =
        strictPhrases.expandTermsIfRewrite(terms, strictPhrasesTermToSpans);

    final List<OffsetsEnum> offsetsEnums = new ArrayList<>(sourceTerms.size() + 1);

    Terms termsIndex =
        atomicReader == null || sourceTerms.isEmpty() ? null : atomicReader.terms(field);
    if (termsIndex != null) {
      TermsEnum termsEnum = termsIndex.iterator(); // does not return null
      for (BytesRef term : sourceTerms) {
        if (!termsEnum.seekExact(term)) {
          continue; // term not found
        }
        PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
        if (postingsEnum == null) {
          // no offsets or positions available
          throw new IllegalArgumentException(
              "field '" + field + "' was indexed without offsets, cannot highlight");
        }
        if (doc != postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted
          continue;
        }
        postingsEnum =
            strictPhrases.filterPostings(term, postingsEnum, strictPhrasesTermToSpans.get(term));
        if (postingsEnum == null) {
          continue; // completely filtered out
        }

        offsetsEnums.add(new OffsetsEnum(term, postingsEnum));
      }
    }
    return offsetsEnums;
  }
  public void testDocsEnumStart() throws Exception {
    Analyzer analyzer = new MockAnalyzer(random());
    MemoryIndex memory =
        new MemoryIndex(random().nextBoolean(), false, random().nextInt(50) * 1024 * 1024);
    memory.addField("foo", "bar", analyzer);
    LeafReader reader = (LeafReader) memory.createSearcher().getIndexReader();
    TestUtil.checkReader(reader);
    PostingsEnum disi =
        TestUtil.docs(random(), reader, "foo", new BytesRef("bar"), null, PostingsEnum.NONE);
    int docid = disi.docID();
    assertEquals(-1, docid);
    assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);

    // now reuse and check again
    TermsEnum te = reader.terms("foo").iterator();
    assertTrue(te.seekExact(new BytesRef("bar")));
    disi = te.postings(disi, PostingsEnum.NONE);
    docid = disi.docID();
    assertEquals(-1, docid);
    assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    reader.close();
  }
  protected int[] lookupDocIdByPK(final IndexSearcher searcher, final String... ids)
      throws IOException {
    final List<AtomicReaderContext> subReaders = searcher.getIndexReader().leaves();
    final TermsEnum[] termsEnums = new TermsEnum[subReaders.size()];
    final DocsEnum[] docsEnums = new DocsEnum[subReaders.size()];
    for (int subIDX = 0; subIDX < subReaders.size(); subIDX++) {
      termsEnums[subIDX] = subReaders.get(subIDX).reader().fields().terms("id").iterator(null);
    }

    int[] results = new int[ids.length];

    for (int i = 0; i < results.length; i++) {
      results[i] = -1;
    }

    for (int idx = 0; idx < ids.length; idx++) {
      int base = 0;
      final BytesRef id = new BytesRef(ids[idx]);
      for (int subIDX = 0; subIDX < subReaders.size(); subIDX++) {
        final AtomicReader sub = subReaders.get(subIDX).reader();
        final TermsEnum termsEnum = termsEnums[subIDX];
        if (termsEnum.seekExact(id, false)) {
          final DocsEnum docs =
              docsEnums[subIDX] = termsEnum.docs(sub.getLiveDocs(), docsEnums[subIDX], 0);
          if (docs != null) {
            final int docID = docs.nextDoc();
            if (docID != DocIdSetIterator.NO_MORE_DOCS) {
              results[idx] = base + docID;
              break;
            }
          }
        }
        base += sub.maxDoc();
      }
    }

    return results;
  }
  public void collectTermContext(
      IndexReader reader,
      List<LeafReaderContext> leaves,
      TermContext[] contextArray,
      Term[] queryTerms)
      throws IOException {
    TermsEnum termsEnum = null;
    for (LeafReaderContext context : leaves) {
      final Fields fields = context.reader().fields();
      for (int i = 0; i < queryTerms.length; i++) {
        Term term = queryTerms[i];
        TermContext termContext = contextArray[i];
        final Terms terms = fields.terms(term.field());
        if (terms == null) {
          // field does not exist
          continue;
        }
        termsEnum = terms.iterator();
        assert termsEnum != null;

        if (termsEnum == TermsEnum.EMPTY) continue;
        if (termsEnum.seekExact(term.bytes())) {
          if (termContext == null) {
            contextArray[i] =
                new TermContext(
                    reader.getContext(),
                    termsEnum.termState(),
                    context.ord,
                    termsEnum.docFreq(),
                    termsEnum.totalTermFreq());
          } else {
            termContext.register(
                termsEnum.termState(), context.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
          }
        }
      }
    }
  }
  public void testBasic() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    doc.add(newTextField("field", "a b c", Field.Store.NO));
    w.addDocument(doc);
    IndexReader r = w.getReader();
    TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);

    // Test next()
    assertEquals(new BytesRef("a"), te.next());
    assertEquals(0L, te.ord());
    assertEquals(new BytesRef("b"), te.next());
    assertEquals(1L, te.ord());
    assertEquals(new BytesRef("c"), te.next());
    assertEquals(2L, te.ord());
    assertNull(te.next());

    // Test seekExact by term
    assertTrue(te.seekExact(new BytesRef("b")));
    assertEquals(1, te.ord());
    assertTrue(te.seekExact(new BytesRef("a")));
    assertEquals(0, te.ord());
    assertTrue(te.seekExact(new BytesRef("c")));
    assertEquals(2, te.ord());

    // Test seekExact by ord
    te.seekExact(1);
    assertEquals(new BytesRef("b"), te.term());
    te.seekExact(0);
    assertEquals(new BytesRef("a"), te.term());
    te.seekExact(2);
    assertEquals(new BytesRef("c"), te.term());

    r.close();
    w.close();
    dir.close();
  }
 /** Returns the term ({@link BytesRef}) corresponding to the provided ordinal. */
 public BytesRef lookupTerm(TermsEnum termsEnum, int ord) throws IOException {
   termsEnum.seekExact(ord);
   return termsEnum.term();
 }
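A minimal round-trip sketch for the helper above (assumes the TermsEnum comes from a codec that supports ords; seekExact(long) otherwise throws UnsupportedOperationException):

 // Hypothetical round trip: ord -> term -> ord. Deep-copy because term() is owned by the enum.
 BytesRef term = BytesRef.deepCopyOf(lookupTerm(termsEnum, 5));
 assert termsEnum.seekExact(term); // seeking by the copied term finds it again
 assert termsEnum.ord() == 5;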
  public void testTwoBlocks() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    List<String> terms = new ArrayList<>();
    for (int i = 0; i < 36; i++) {
      Document doc = new Document();
      String term = "" + (char) (97 + i);
      terms.add(term);
      if (VERBOSE) {
        System.out.println("i=" + i + " term=" + term);
      }
      doc.add(newTextField("field", term, Field.Store.NO));
      w.addDocument(doc);
    }
    for (int i = 0; i < 36; i++) {
      Document doc = new Document();
      String term = "m" + (char) (97 + i);
      terms.add(term);
      if (VERBOSE) {
        System.out.println("i=" + i + " term=" + term);
      }
      doc.add(newTextField("field", term, Field.Store.NO));
      w.addDocument(doc);
    }
    if (VERBOSE) {
      System.out.println("TEST: now forceMerge");
    }
    w.forceMerge(1);
    IndexReader r = w.getReader();
    TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);

    assertTrue(te.seekExact(new BytesRef("mo")));
    assertEquals(27, te.ord());

    te.seekExact(54);
    assertEquals(new BytesRef("s"), te.term());

    Collections.sort(terms);

    for (int i = terms.size() - 1; i >= 0; i--) {
      te.seekExact(i);
      assertEquals(i, te.ord());
      assertEquals(terms.get(i), te.term().utf8ToString());
    }

    int iters = atLeast(1000);
    for (int iter = 0; iter < iters; iter++) {
      int ord = random().nextInt(terms.size());
      BytesRef term = new BytesRef(terms.get(ord));
      if (random().nextBoolean()) {
        if (VERBOSE) {
          System.out.println("TEST: iter=" + iter + " seek to ord=" + ord + " of " + terms.size());
        }
        te.seekExact(ord);
      } else {
        if (VERBOSE) {
          System.out.println(
              "TEST: iter="
                  + iter
                  + " seek to term="
                  + terms.get(ord)
                  + " ord="
                  + ord
                  + " of "
                  + terms.size());
        }
        te.seekExact(term);
      }
      assertEquals(ord, te.ord());
      assertEquals(term, te.term());
    }

    r.close();
    w.close();
    dir.close();
  }
  @Override
  public void execute(String[] args, PrintStream out) throws Exception {
    String field = null;
    String termVal = null;
    if (args.length > 0) {
      field = args[0];
    }

    if (field != null) {
      String[] parts = field.split(":");
      if (parts.length > 1) {
        field = parts[0];
        termVal = parts[1];
      }
    }

    if (field == null || termVal == null) {
      out.println("usage: field:term");
      out.flush();
      return;
    }

    IndexReader reader = ctx.getIndexReader();
    List<AtomicReaderContext> leaves = reader.leaves();
    int docBase = 0;
    int numPerPage = 20;
    for (AtomicReaderContext leaf : leaves) {
      AtomicReader atomicReader = leaf.reader();
      Terms terms = atomicReader.terms(field);
      if (terms == null) {
        continue;
      }
      boolean hasPositions = terms.hasPositions();
      if (terms != null && termVal != null) {
        TermsEnum te = terms.iterator(null);
        int count = 0;
        if (te.seekExact(new BytesRef(termVal), true)) {

          if (hasPositions) {
            DocsAndPositionsEnum iter = te.docsAndPositions(atomicReader.getLiveDocs(), null);
            int docid;
            while ((docid = iter.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
              count++;
              out.print("docid: " + (docid + docBase) + ", freq: " + iter.freq() + ", ");
              for (int i = 0; i < iter.freq(); ++i) {
                out.print("pos " + i + ": " + iter.nextPosition());
                BytesRef payload = iter.getPayload();
                if (payload != null) {
                  out.print(",payload: " + payload);
                }
                out.print(";");
              }
              out.println();
              if (ctx.isInteractiveMode()) {
                if (count % numPerPage == 0) {
                  out.println("Ctrl-D to break");
                  int ch = System.in.read();
                  if (ch == -1) {
                    out.flush();
                    return;
                  }
                }
              }
            }
          } else {
            DocsEnum iter = te.docs(atomicReader.getLiveDocs(), null);

            int docid;
            while ((docid = iter.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
              count++;
              out.println("docid: " + (docid + docBase));
              if (ctx.isInteractiveMode()) {
                if (count % numPerPage == 0) {
                  out.println("Ctrl-D to break");
                  int ch = System.in.read();
                  if (ch == -1) {
                    out.flush();
                    return;
                  }
                }
              }
            }
          }
        }
      }
      docBase += atomicReader.maxDoc();
    }
  }
  private void assertTermsSeeking(Terms leftTerms, Terms rightTerms) throws Exception {
    TermsEnum leftEnum = null;
    TermsEnum rightEnum = null;

    // just an upper bound
    int numTests = atLeast(20);
    Random random = random();

    // collect this number of terms from the left side
    HashSet<BytesRef> tests = new HashSet<BytesRef>();
    int numPasses = 0;
    while (numPasses < 10 && tests.size() < numTests) {
      leftEnum = leftTerms.iterator(leftEnum);
      BytesRef term = null;
      while ((term = leftEnum.next()) != null) {
        int code = random.nextInt(10);
        if (code == 0) {
          // the term
          tests.add(BytesRef.deepCopyOf(term));
        } else if (code == 1) {
          // truncated subsequence of term
          term = BytesRef.deepCopyOf(term);
          if (term.length > 0) {
            // truncate it
            term.length = random.nextInt(term.length);
          }
          tests.add(term); // without this, the truncated term is computed but never tested
        } else if (code == 2) {
          // term, but ensure a non-zero offset
          byte newbytes[] = new byte[term.length + 5];
          System.arraycopy(term.bytes, term.offset, newbytes, 5, term.length);
          tests.add(new BytesRef(newbytes, 5, term.length));
        }
      }
      numPasses++;
    }

    ArrayList<BytesRef> shuffledTests = new ArrayList<BytesRef>(tests);
    Collections.shuffle(shuffledTests, random);

    for (BytesRef b : shuffledTests) {
      leftEnum = leftTerms.iterator(leftEnum);
      rightEnum = rightTerms.iterator(rightEnum);

      assertEquals(leftEnum.seekExact(b), rightEnum.seekExact(b));
      assertEquals(leftEnum.seekExact(b), rightEnum.seekExact(b));

      SeekStatus leftStatus;
      SeekStatus rightStatus;

      leftStatus = leftEnum.seekCeil(b);
      rightStatus = rightEnum.seekCeil(b);
      assertEquals(leftStatus, rightStatus);
      if (leftStatus != SeekStatus.END) {
        assertEquals(leftEnum.term(), rightEnum.term());
      }

      leftStatus = leftEnum.seekCeil(b);
      rightStatus = rightEnum.seekCeil(b);
      assertEquals(leftStatus, rightStatus);
      if (leftStatus != SeekStatus.END) {
        assertEquals(leftEnum.term(), rightEnum.term());
      }
    }
  }
    @Override
    protected void doSetNextReader(LeafReaderContext context) throws IOException {
      if (segmentFacetCounts != null) {
        segmentResults.add(createSegmentResult());
      }

      groupFieldTermsIndex = DocValues.getSorted(context.reader(), groupField);
      facetFieldDocTermOrds = DocValues.getSortedSet(context.reader(), facetField);
      facetFieldNumTerms = (int) facetFieldDocTermOrds.getValueCount();
      if (facetFieldNumTerms == 0) {
        facetOrdTermsEnum = null;
      } else {
        facetOrdTermsEnum = facetFieldDocTermOrds.termsEnum();
      }
      // [facetFieldNumTerms() + 1] for all possible facet values and docs not containing facet
      // field
      segmentFacetCounts = new int[facetFieldNumTerms + 1];
      segmentTotalCount = 0;

      segmentGroupedFacetHits.clear();
      for (GroupedFacetHit groupedFacetHit : groupedFacetHits) {
        int groupOrd =
            groupedFacetHit.groupValue == null
                ? -1
                : groupFieldTermsIndex.lookupTerm(groupedFacetHit.groupValue);
        if (groupedFacetHit.groupValue != null && groupOrd < 0) {
          continue;
        }

        int facetOrd;
        if (groupedFacetHit.facetValue != null) {
          if (facetOrdTermsEnum == null
              || !facetOrdTermsEnum.seekExact(groupedFacetHit.facetValue)) {
            continue;
          }
          facetOrd = (int) facetOrdTermsEnum.ord();
        } else {
          facetOrd = facetFieldNumTerms;
        }

        // (facetFieldDocTermOrds.numTerms() + 1) for all possible facet values and docs not
        // containing facet field
        int segmentGroupedFacetsIndex = groupOrd * (facetFieldNumTerms + 1) + facetOrd;
        segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
      }

      if (facetPrefix != null) {
        TermsEnum.SeekStatus seekStatus;
        if (facetOrdTermsEnum != null) {
          seekStatus = facetOrdTermsEnum.seekCeil(facetPrefix);
        } else {
          seekStatus = TermsEnum.SeekStatus.END;
        }

        if (seekStatus != TermsEnum.SeekStatus.END) {
          startFacetOrd = (int) facetOrdTermsEnum.ord();
        } else {
          startFacetOrd = 0;
          endFacetOrd = 0;
          return;
        }

        BytesRefBuilder facetEndPrefix = new BytesRefBuilder();
        facetEndPrefix.append(facetPrefix);
        facetEndPrefix.append(UnicodeUtil.BIG_TERM);
        seekStatus = facetOrdTermsEnum.seekCeil(facetEndPrefix.get());
        if (seekStatus != TermsEnum.SeekStatus.END) {
          endFacetOrd = (int) facetOrdTermsEnum.ord();
        } else {
          endFacetOrd = facetFieldNumTerms; // Don't include null...
        }
      } else {
        startFacetOrd = 0;
        endFacetOrd = facetFieldNumTerms + 1;
      }
    }
  private IndexIterationContext createContext(
      int nDocs,
      RandomIndexWriter fromWriter,
      RandomIndexWriter toWriter,
      boolean multipleValuesPerDocument,
      boolean scoreDocsInOrder)
      throws IOException {
    IndexIterationContext context = new IndexIterationContext();
    int numRandomValues = nDocs / 2;
    context.randomUniqueValues = new String[numRandomValues];
    Set<String> trackSet = new HashSet<String>();
    context.randomFrom = new boolean[numRandomValues];
    for (int i = 0; i < numRandomValues; i++) {
      String uniqueRandomValue;
      do {
        uniqueRandomValue = _TestUtil.randomRealisticUnicodeString(random());
        //        uniqueRandomValue = _TestUtil.randomSimpleString(random);
      } while ("".equals(uniqueRandomValue) || trackSet.contains(uniqueRandomValue));
      // Generate unique values; empty strings aren't allowed.
      trackSet.add(uniqueRandomValue);
      context.randomFrom[i] = random().nextBoolean();
      context.randomUniqueValues[i] = uniqueRandomValue;
    }

    RandomDoc[] docs = new RandomDoc[nDocs];
    for (int i = 0; i < nDocs; i++) {
      String id = Integer.toString(i);
      int randomI = random().nextInt(context.randomUniqueValues.length);
      String value = context.randomUniqueValues[randomI];
      Document document = new Document();
      document.add(newTextField(random(), "id", id, Field.Store.NO));
      document.add(newTextField(random(), "value", value, Field.Store.NO));

      boolean from = context.randomFrom[randomI];
      int numberOfLinkValues = multipleValuesPerDocument ? 2 + random().nextInt(10) : 1;
      docs[i] = new RandomDoc(id, numberOfLinkValues, value, from);
      for (int j = 0; j < numberOfLinkValues; j++) {
        String linkValue =
            context.randomUniqueValues[random().nextInt(context.randomUniqueValues.length)];
        docs[i].linkValues.add(linkValue);
        if (from) {
          if (!context.fromDocuments.containsKey(linkValue)) {
            context.fromDocuments.put(linkValue, new ArrayList<RandomDoc>());
          }
          if (!context.randomValueFromDocs.containsKey(value)) {
            context.randomValueFromDocs.put(value, new ArrayList<RandomDoc>());
          }

          context.fromDocuments.get(linkValue).add(docs[i]);
          context.randomValueFromDocs.get(value).add(docs[i]);
          document.add(newTextField(random(), "from", linkValue, Field.Store.NO));
        } else {
          if (!context.toDocuments.containsKey(linkValue)) {
            context.toDocuments.put(linkValue, new ArrayList<RandomDoc>());
          }
          if (!context.randomValueToDocs.containsKey(value)) {
            context.randomValueToDocs.put(value, new ArrayList<RandomDoc>());
          }

          context.toDocuments.get(linkValue).add(docs[i]);
          context.randomValueToDocs.get(value).add(docs[i]);
          document.add(newTextField(random(), "to", linkValue, Field.Store.NO));
        }
      }

      final RandomIndexWriter w;
      if (from) {
        w = fromWriter;
      } else {
        w = toWriter;
      }

      w.addDocument(document);
      if (random().nextInt(10) == 4) {
        w.commit();
      }
      if (VERBOSE) {
        System.out.println("Added document[" + docs[i].id + "]: " + document);
      }
    }

    // Pre-compute all possible hits for all unique random values. On top of this, also compute
    // all possible scores for any ScoreMode.
    IndexSearcher fromSearcher = newSearcher(fromWriter.getReader());
    IndexSearcher toSearcher = newSearcher(toWriter.getReader());
    for (int i = 0; i < context.randomUniqueValues.length; i++) {
      String uniqueRandomValue = context.randomUniqueValues[i];
      final String fromField;
      final String toField;
      final Map<String, Map<Integer, JoinScore>> queryVals;
      if (context.randomFrom[i]) {
        fromField = "from";
        toField = "to";
        queryVals = context.fromHitsToJoinScore;
      } else {
        fromField = "to";
        toField = "from";
        queryVals = context.toHitsToJoinScore;
      }
      final Map<BytesRef, JoinScore> joinValueToJoinScores = new HashMap<BytesRef, JoinScore>();
      if (multipleValuesPerDocument) {
        fromSearcher.search(
            new TermQuery(new Term("value", uniqueRandomValue)),
            new Collector() {

              private Scorer scorer;
              private SortedSetDocValues docTermOrds;
              final BytesRef joinValue = new BytesRef();

              @Override
              public void collect(int doc) throws IOException {
                docTermOrds.setDocument(doc);
                long ord;
                while ((ord = docTermOrds.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
                  docTermOrds.lookupOrd(ord, joinValue);
                  JoinScore joinScore = joinValueToJoinScores.get(joinValue);
                  if (joinScore == null) {
                    joinValueToJoinScores.put(
                        BytesRef.deepCopyOf(joinValue), joinScore = new JoinScore());
                  }
                  joinScore.addScore(scorer.score());
                }
              }

              @Override
              public void setNextReader(AtomicReaderContext context) throws IOException {
                docTermOrds = FieldCache.DEFAULT.getDocTermOrds(context.reader(), fromField);
              }

              @Override
              public void setScorer(Scorer scorer) {
                this.scorer = scorer;
              }

              @Override
              public boolean acceptsDocsOutOfOrder() {
                return false;
              }
            });
      } else {
        fromSearcher.search(
            new TermQuery(new Term("value", uniqueRandomValue)),
            new Collector() {

              private Scorer scorer;
              private BinaryDocValues terms;
              private Bits docsWithField;
              private final BytesRef spare = new BytesRef();

              @Override
              public void collect(int doc) throws IOException {
                terms.get(doc, spare);
                BytesRef joinValue = spare;
                if (joinValue.length == 0 && !docsWithField.get(doc)) {
                  return;
                }

                JoinScore joinScore = joinValueToJoinScores.get(joinValue);
                if (joinScore == null) {
                  joinValueToJoinScores.put(
                      BytesRef.deepCopyOf(joinValue), joinScore = new JoinScore());
                }
                joinScore.addScore(scorer.score());
              }

              @Override
              public void setNextReader(AtomicReaderContext context) throws IOException {
                terms = FieldCache.DEFAULT.getTerms(context.reader(), fromField, true);
                docsWithField = FieldCache.DEFAULT.getDocsWithField(context.reader(), fromField);
              }

              @Override
              public void setScorer(Scorer scorer) {
                this.scorer = scorer;
              }

              @Override
              public boolean acceptsDocsOutOfOrder() {
                return false;
              }
            });
      }

      final Map<Integer, JoinScore> docToJoinScore = new HashMap<Integer, JoinScore>();
      if (multipleValuesPerDocument) {
        if (scoreDocsInOrder) {
          AtomicReader slowCompositeReader =
              SlowCompositeReaderWrapper.wrap(toSearcher.getIndexReader());
          Terms terms = slowCompositeReader.terms(toField);
          if (terms != null) {
            DocsEnum docsEnum = null;
            TermsEnum termsEnum = null;
            SortedSet<BytesRef> joinValues =
                new TreeSet<BytesRef>(BytesRef.getUTF8SortedAsUnicodeComparator());
            joinValues.addAll(joinValueToJoinScores.keySet());
            for (BytesRef joinValue : joinValues) {
              termsEnum = terms.iterator(termsEnum);
              if (termsEnum.seekExact(joinValue)) {
                docsEnum =
                    termsEnum.docs(slowCompositeReader.getLiveDocs(), docsEnum, DocsEnum.FLAG_NONE);
                JoinScore joinScore = joinValueToJoinScores.get(joinValue);

                for (int doc = docsEnum.nextDoc();
                    doc != DocIdSetIterator.NO_MORE_DOCS;
                    doc = docsEnum.nextDoc()) {
                  // First encountered join value determines the score.
                  // Something to keep in mind for many-to-many relations.
                  if (!docToJoinScore.containsKey(doc)) {
                    docToJoinScore.put(doc, joinScore);
                  }
                }
              }
            }
          }
        } else {
          toSearcher.search(
              new MatchAllDocsQuery(),
              new Collector() {

                private SortedSetDocValues docTermOrds;
                private final BytesRef scratch = new BytesRef();
                private int docBase;

                @Override
                public void collect(int doc) throws IOException {
                  docTermOrds.setDocument(doc);
                  long ord;
                  while ((ord = docTermOrds.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
                    docTermOrds.lookupOrd(ord, scratch);
                    JoinScore joinScore = joinValueToJoinScores.get(scratch);
                    if (joinScore == null) {
                      continue;
                    }
                    Integer basedDoc = docBase + doc;
                    // First encountered join value determines the score.
                    // Something to keep in mind for many-to-many relations.
                    if (!docToJoinScore.containsKey(basedDoc)) {
                      docToJoinScore.put(basedDoc, joinScore);
                    }
                  }
                }

                @Override
                public void setNextReader(AtomicReaderContext context) throws IOException {
                  docBase = context.docBase;
                  docTermOrds = FieldCache.DEFAULT.getDocTermOrds(context.reader(), toField);
                }

                @Override
                public boolean acceptsDocsOutOfOrder() {
                  return false;
                }

                @Override
                public void setScorer(Scorer scorer) {}
              });
        }
      } else {
        toSearcher.search(
            new MatchAllDocsQuery(),
            new Collector() {

              private BinaryDocValues terms;
              private int docBase;
              private final BytesRef spare = new BytesRef();

              @Override
              public void collect(int doc) {
                terms.get(doc, spare);
                JoinScore joinScore = joinValueToJoinScores.get(spare);
                if (joinScore == null) {
                  return;
                }
                docToJoinScore.put(docBase + doc, joinScore);
              }

              @Override
              public void setNextReader(AtomicReaderContext context) throws IOException {
                terms = FieldCache.DEFAULT.getTerms(context.reader(), toField, false);
                docBase = context.docBase;
              }

              @Override
              public boolean acceptsDocsOutOfOrder() {
                return false;
              }

              @Override
              public void setScorer(Scorer scorer) {}
            });
      }
      queryVals.put(uniqueRandomValue, docToJoinScore);
    }

    fromSearcher.getIndexReader().close();
    toSearcher.getIndexReader().close();

    return context;
  }
  // algorithm: treat sentence snippets as miniature documents
  // we can intersect these with the postings lists via BreakIterator.preceding(offset)
  // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
  private Passage[] highlightDoc(
      String field,
      BytesRef terms[],
      int contentLength,
      BreakIterator bi,
      int doc,
      TermsEnum termsEnum,
      DocsAndPositionsEnum[] postings,
      int n)
      throws IOException {
    PassageScorer scorer = getScorer(field);
    if (scorer == null) {
      throw new NullPointerException("PassageScorer cannot be null");
    }
    PriorityQueue<OffsetsEnum> pq = new PriorityQueue<>();
    float weights[] = new float[terms.length];
    // initialize postings
    for (int i = 0; i < terms.length; i++) {
      DocsAndPositionsEnum de = postings[i];
      int pDoc;
      if (de == EMPTY) {
        continue;
      } else if (de == null) {
        postings[i] = EMPTY; // initially
        if (!termsEnum.seekExact(terms[i])) {
          continue; // term not found
        }
        de =
            postings[i] = termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS);
        if (de == null) {
          // no positions available
          throw new IllegalArgumentException(
              "field '" + field + "' was indexed without offsets, cannot highlight");
        }
        pDoc = de.advance(doc);
      } else {
        pDoc = de.docID();
        if (pDoc < doc) {
          pDoc = de.advance(doc);
        }
      }

      if (doc == pDoc) {
        weights[i] = scorer.weight(contentLength, de.freq());
        de.nextPosition();
        pq.add(new OffsetsEnum(de, i));
      }
    }

    pq.add(new OffsetsEnum(EMPTY, Integer.MAX_VALUE)); // a sentinel for termination

    PriorityQueue<Passage> passageQueue =
        new PriorityQueue<>(
            n,
            new Comparator<Passage>() {
              @Override
              public int compare(Passage left, Passage right) {
                if (left.score < right.score) {
                  return -1;
                } else if (left.score > right.score) {
                  return 1;
                } else {
                  return left.startOffset - right.startOffset;
                }
              }
            });
    Passage current = new Passage();

    OffsetsEnum off;
    while ((off = pq.poll()) != null) {
      final DocsAndPositionsEnum dp = off.dp;
      int start = dp.startOffset();
      if (start == -1) {
        throw new IllegalArgumentException(
            "field '" + field + "' was indexed without offsets, cannot highlight");
      }
      int end = dp.endOffset();
      // LUCENE-5166: this hit would span the content limit... however more valid
      // hits may exist (they are sorted by start). so we pretend like we never
      // saw this term, it won't cause a passage to be added to passageQueue or anything.
      assert EMPTY.startOffset() == Integer.MAX_VALUE;
      if (start < contentLength && end > contentLength) {
        continue;
      }
      if (start >= current.endOffset) {
        if (current.startOffset >= 0) {
          // finalize current
          current.score *= scorer.norm(current.startOffset);
          // new sentence: first add 'current' to queue
          if (passageQueue.size() == n && current.score < passageQueue.peek().score) {
            current.reset(); // can't compete, just reset it
          } else {
            passageQueue.offer(current);
            if (passageQueue.size() > n) {
              current = passageQueue.poll();
              current.reset();
            } else {
              current = new Passage();
            }
          }
        }
        // if we exceed limit, we are done
        if (start >= contentLength) {
          Passage passages[] = new Passage[passageQueue.size()];
          passageQueue.toArray(passages);
          for (Passage p : passages) {
            p.sort();
          }
          // sort in ascending order
          Arrays.sort(
              passages,
              new Comparator<Passage>() {
                @Override
                public int compare(Passage left, Passage right) {
                  return left.startOffset - right.startOffset;
                }
              });
          return passages;
        }
        // advance breakiterator
        assert BreakIterator.DONE < 0;
        current.startOffset = Math.max(bi.preceding(start + 1), 0);
        current.endOffset = Math.min(bi.next(), contentLength);
      }
      int tf = 0;
      while (true) {
        tf++;
        BytesRef term = terms[off.id];
        if (term == null) {
          // multitermquery match, pull from payload
          term = off.dp.getPayload();
          assert term != null;
        }
        current.addMatch(start, end, term);
        if (off.pos == dp.freq()) {
          break; // removed from pq
        } else {
          off.pos++;
          dp.nextPosition();
          start = dp.startOffset();
          end = dp.endOffset();
        }
        if (start >= current.endOffset || end > contentLength) {
          pq.offer(off);
          break;
        }
      }
      current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset);
    }

    // Dead code but compiler disagrees:
    assert false;
    return null;
  }
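For intuition, the incremental scoring above implements the formula from the header comment; a standalone sketch (helper name and parameters are hypothetical):

  // Hedged sketch of: score(passage) = norm(passageStart) * sum_i weight[i] * tf(freq_i, passageLen)
  static float passageScore(PassageScorer scorer, int passageStart, int passageLen,
                            float[] weights, int[] freqs) {
    float sum = 0f;
    for (int i = 0; i < weights.length; i++) {
      sum += weights[i] * scorer.tf(freqs[i], passageLen);
    }
    return scorer.norm(passageStart) * sum;
  }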