@Override
 protected void fillDocsAndScores(FixedBitSet matchingDocs, TermsEnum termsEnum)
     throws IOException {
   BytesRef spare = new BytesRef();
   PostingsEnum postingsEnum = null;
   for (int i = 0; i < terms.size(); i++) {
     if (termsEnum.seekExact(terms.get(ords[i], spare))) {
       postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
       float score = TermsIncludingScoreQuery.this.scores[ords[i]];
       for (int doc = postingsEnum.nextDoc();
           doc != DocIdSetIterator.NO_MORE_DOCS;
           doc = postingsEnum.nextDoc()) {
         // I prefer this:
         /*if (scores[doc] < score) {
           scores[doc] = score;
           matchingDocs.set(doc);
         }*/
          // But this behaves the same as MVInnerScorer, and only then will the tests pass:
         if (!matchingDocs.get(doc)) {
           scores[doc] = score;
           matchingDocs.set(doc);
         }
       }
     }
   }
 }
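
All of the examples on this page follow the same core pattern: seek the TermsEnum to a term, ask it for a PostingsEnum with the flags you need, then walk the postings with nextDoc() until NO_MORE_DOCS. Below is a minimal, self-contained sketch of that pattern; the class, field, and term names are placeholders, not taken from the snippet above.

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

import java.io.IOException;

final class PostingsIterationSketch {
  // Sketch only: visit every document containing a single term; doc ids are segment-local.
  static void visitDocs(LeafReader reader, String field, String term) throws IOException {
    Terms terms = reader.terms(field);
    if (terms == null) {
      return; // field not indexed in this segment
    }
    TermsEnum termsEnum = terms.iterator();
    if (!termsEnum.seekExact(new BytesRef(term))) {
      return; // term does not occur in this segment
    }
    PostingsEnum postings = termsEnum.postings(null, PostingsEnum.NONE); // doc ids only
    for (int doc = postings.nextDoc();
        doc != DocIdSetIterator.NO_MORE_DOCS;
        doc = postings.nextDoc()) {
      // process doc
    }
  }
}
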
Example #2
  private static DocSet createBigSet(
      List<LeafReaderContext> leaves, PostingsEnum[] postList, int maxDoc, int firstReader)
      throws IOException {
    long[] bits = new long[FixedBitSet.bits2words(maxDoc)];
    int sz = 0;
    for (int i = firstReader; i < postList.length; i++) {
      PostingsEnum postings = postList[i];
      if (postings == null) continue;
      LeafReaderContext ctx = leaves.get(i);
      Bits liveDocs = ctx.reader().getLiveDocs();
      int base = ctx.docBase;
      for (; ; ) {
        int subId = postings.nextDoc();
        if (subId == DocIdSetIterator.NO_MORE_DOCS) break;
        if (liveDocs != null && !liveDocs.get(subId)) continue;
        int globalId = subId + base;
        bits[globalId >> 6] |= (1L << globalId);
        sz++;
      }
    }

    BitDocSet docSet = new BitDocSet(new FixedBitSet(bits, maxDoc), sz);

    int smallSetSize = smallSetSize(maxDoc);
    if (sz < smallSetSize) {
      // make this optional?
      DocSet smallSet = toSmallSet(docSet);
      // assert equals(docSet, smallSet);
      return smallSet;
    }

    return docSet;
  }
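
The shift-and-or above (bits[globalId >> 6] |= 1L << globalId) is the same word/bit arithmetic that FixedBitSet.set() performs internally. A small sketch of the equivalent, assuming the same maxDoc/docBase inputs; the class and parameter names are illustrative, not Solr's.

import org.apache.lucene.util.FixedBitSet;

final class GlobalIdBitSetSketch {
  // Sketch only: mark global doc ids (segment-local id + docBase) in a FixedBitSet.
  static FixedBitSet collect(int maxDoc, int docBase, int[] segmentLocalIds) {
    FixedBitSet bitSet = new FixedBitSet(maxDoc);
    for (int subId : segmentLocalIds) {
      bitSet.set(subId + docBase); // same as bits[globalId >> 6] |= 1L << globalId
    }
    return bitSet;
  }
}
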
  /**
   * Adds terms and frequencies found in vector into the Map termFreqMap
   *
   * @param termFreqMap a Map of terms and their frequencies
   * @param vector List of terms and their frequencies for a doc/field
   * @param fieldName Optional field name of the terms for skip terms
   */
  private void addTermFrequencies(
      Map<String, Int> termFreqMap, Terms vector, @Nullable String fieldName) throws IOException {
    final TermsEnum termsEnum = vector.iterator();
    final CharsRefBuilder spare = new CharsRefBuilder();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
      spare.copyUTF8Bytes(text);
      final String term = spare.toString();
      if (isNoiseWord(term)) {
        continue;
      }
      if (isSkipTerm(fieldName, term)) {
        continue;
      }

      final PostingsEnum docs = termsEnum.postings(null, null);
      int freq = 0;
      while (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        freq += docs.freq();
      }

      // increment frequency
      Int cnt = termFreqMap.get(term);
      if (cnt == null) {
        cnt = new Int();
        termFreqMap.put(term, cnt);
        cnt.x = freq;
      } else {
        cnt.x += freq;
      }
    }
  }
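
For reference, a per-document frequency map can also be built without the internal Int counter, because a single-document term vector's totalTermFreq() is the term's frequency within that one document. A hedged sketch (reader, doc id, and field name are assumptions; noise/skip-word filtering is omitted):

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

final class TermVectorFreqSketch {
  // Sketch only: count term frequencies of one document's term vector into a plain Map.
  static Map<String, Integer> countTermVector(IndexReader reader, int docId, String field)
      throws IOException {
    Map<String, Integer> freqs = new HashMap<>();
    Terms vector = reader.getTermVector(docId, field); // null if no term vector was stored
    if (vector == null) {
      return freqs;
    }
    TermsEnum termsEnum = vector.iterator();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
      // for a single-document term vector, totalTermFreq() is the in-document frequency
      freqs.merge(text.utf8ToString(), (int) termsEnum.totalTermFreq(), Integer::sum);
    }
    return freqs;
  }
}
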
Example #4
  /**
   * Add term frequencies for a single document to a frequency map.
   *
   * @param reader the index
   * @param doc doc id
   * @param luceneName the index field from which to use the term vector
   * @param freq where to add to the token frequencies
   */
  public static void getFrequenciesFromTermVector(
      IndexReader reader, int doc, String luceneName, Map<String, Integer> freq) {
    try {
      org.apache.lucene.index.Terms terms = reader.getTermVector(doc, luceneName);
      if (terms == null) {
        throw new IllegalArgumentException("Field " + luceneName + " has no Terms");
      }
      TermsEnum termsEnum = terms.iterator();

      // Collect concordance words from the term vector
      PostingsEnum postingsEnum = null;
      while (termsEnum.next() != null) {
        postingsEnum = termsEnum.postings(null, postingsEnum, PostingsEnum.FREQS);
        String term = termsEnum.term().utf8ToString();
        Integer n = freq.get(term);
        if (n == null) {
          n = 0;
        }
        while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          // use the within-document frequency; termsEnum.docFreq() would only count documents
          n += postingsEnum.freq();
        }
        freq.put(term, n);
      }
    } catch (Exception e) {
      throw ExUtil.wrapRuntimeException(e);
    }
  }
Example #5
  public TermInfo collect(String term) throws IOException {
    TermInfo info = new TermInfo();
    BytesRef luceneTerm = new BytesRef(term); // UTF-8 encode; String#getBytes() would use the platform charset
    // this gives documents in which the term is found, but no offset information can be retrieved
    PostingsEnum postings =
        MultiFields.getTermDocsEnum(indexReader, ngramInfoFieldname, luceneTerm);
    // now go through each document
    int docId = postings.nextDoc();
    while (docId != PostingsEnum.NO_MORE_DOCS) {
      // get the term vector for that document.
      TermsEnum it = indexReader.getTermVector(docId, ngramInfoFieldname).iterator();
      // find the term of interest
      it.seekExact(luceneTerm);
      // get its posting info. this will contain offset info
      PostingsEnum postingsInDoc = it.postings(null, PostingsEnum.OFFSETS);
      postingsInDoc.nextDoc();

      Document doc = indexReader.document(docId);
      String id = doc.get(idFieldname);
      JATEDocument jd = new JATEDocument(id);
      Set<int[]> offsets = new HashSet<>();
      int totalFreq = postingsInDoc.freq();
      for (int i = 0; i < totalFreq; i++) {
        postingsInDoc.nextPosition();
        offsets.add(new int[] {postingsInDoc.startOffset(), postingsInDoc.endOffset()});
      }
      info.getOffsets().put(jd, offsets);

      docId = postings.nextDoc();
    }

    return info;
  }
 protected OffsetsEnum createOffsetsEnumFromTokenStream(int doc, TokenStream tokenStream)
     throws IOException {
   // if there are automata (MTQ), we have to initialize the "fake" enum wrapping them.
   assert tokenStream != null;
   // TODO Opt: we sometimes evaluate the automata twice when this TS isn't the original; can we
   // avoid?
   PostingsEnum mtqPostingsEnum = MultiTermHighlighting.getDocsEnum(tokenStream, automata);
   assert mtqPostingsEnum instanceof Closeable; // FYI we propagate close() later.
   mtqPostingsEnum.advance(doc);
   return new OffsetsEnum(null, mtqPostingsEnum);
 }
  public void testDocsAndPositionsEnumStart() throws Exception {
    Analyzer analyzer = new MockAnalyzer(random());
    int numIters = atLeast(3);
    MemoryIndex memory = new MemoryIndex(true, false, random().nextInt(50) * 1024 * 1024);
    for (int i = 0; i < numIters; i++) { // check reuse
      memory.addField("foo", "bar", analyzer);
      LeafReader reader = (LeafReader) memory.createSearcher().getIndexReader();
      TestUtil.checkReader(reader);
      assertEquals(1, reader.terms("foo").getSumTotalTermFreq());
      PostingsEnum disi = reader.postings(new Term("foo", "bar"), PostingsEnum.ALL);
      int docid = disi.docID();
      assertEquals(-1, docid);
      assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertEquals(0, disi.nextPosition());
      assertEquals(0, disi.startOffset());
      assertEquals(3, disi.endOffset());

      // now reuse and check again
      TermsEnum te = reader.terms("foo").iterator();
      assertTrue(te.seekExact(new BytesRef("bar")));
      disi = te.postings(disi);
      docid = disi.docID();
      assertEquals(-1, docid);
      assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      reader.close();
      memory.reset();
    }
  }
  @Override
  public PostingsEnum postings(
      FieldInfo fieldInfo, BlockTermState termState, PostingsEnum reuse, int flags)
      throws IOException {
    SingleDocsEnum docsEnum;

    if (PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS)) {
      SinglePostingsEnum posEnum;

      if (reuse instanceof SinglePostingsEnum) {
        posEnum = (SinglePostingsEnum) reuse;
      } else {
        posEnum = new SinglePostingsEnum();
      }
      IDVersionTermState _termState = (IDVersionTermState) termState;
      posEnum.reset(_termState.docID, _termState.idVersion);
      return posEnum;
    }

    if (reuse instanceof SingleDocsEnum) {
      docsEnum = (SingleDocsEnum) reuse;
    } else {
      docsEnum = new SingleDocsEnum();
    }
    docsEnum.reset(((IDVersionTermState) termState).docID);

    return docsEnum;
  }
 private void buildTerm(
     XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter)
     throws IOException {
   // start term, optimized writing
   BytesRef term = termIter.next();
   spare.copyUTF8Bytes(term);
   builder.startObject(spare.toString());
   buildTermStatistics(builder, termIter);
   // finally write the term vectors
   PostingsEnum posEnum = termIter.postings(null, null, PostingsEnum.ALL);
   int termFreq = posEnum.freq();
   builder.field(FieldStrings.TERM_FREQ, termFreq);
   initMemory(curTerms, termFreq);
   initValues(curTerms, posEnum, termFreq);
   buildValues(builder, curTerms, termFreq);
   builder.endObject();
 }
Example #10
  public void testChangeGaps() throws Exception {
    // LUCENE-5324: check that it is possible to change the wrapper's gaps
    final int positionGap = random().nextInt(1000);
    final int offsetGap = random().nextInt(1000);
    final Analyzer delegate = new MockAnalyzer(random());
    final Analyzer a =
        new DelegatingAnalyzerWrapper(delegate.getReuseStrategy()) {
          @Override
          protected Analyzer getWrappedAnalyzer(String fieldName) {
            return delegate;
          }

          @Override
          public int getPositionIncrementGap(String fieldName) {
            return positionGap;
          }

          @Override
          public int getOffsetGap(String fieldName) {
            return offsetGap;
          }
        };

    final RandomIndexWriter writer = new RandomIndexWriter(random(), newDirectory(), a);
    final Document doc = new Document();
    final FieldType ft = new FieldType();
    ft.setIndexOptions(IndexOptions.DOCS);
    ft.setTokenized(true);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectorOffsets(true);
    doc.add(new Field("f", "a", ft));
    doc.add(new Field("f", "a", ft));
    writer.addDocument(doc);
    final LeafReader reader = getOnlySegmentReader(writer.getReader());
    final Fields fields = reader.getTermVectors(0);
    final Terms terms = fields.terms("f");
    final TermsEnum te = terms.iterator();
    assertEquals(new BytesRef("a"), te.next());
    final PostingsEnum dpe = te.postings(null, PostingsEnum.ALL);
    assertEquals(0, dpe.nextDoc());
    assertEquals(2, dpe.freq());
    assertEquals(0, dpe.nextPosition());
    assertEquals(0, dpe.startOffset());
    final int endOffset = dpe.endOffset();
    assertEquals(1 + positionGap, dpe.nextPosition());
    assertEquals(1 + endOffset + offsetGap, dpe.endOffset());
    assertEquals(null, te.next());
    reader.close();
    writer.close();
    writer.w.getDirectory().close();
  }
 private void initValues(Terms curTerms, PostingsEnum posEnum, int termFreq) throws IOException {
   for (int j = 0; j < termFreq; j++) {
     int nextPos = posEnum.nextPosition();
     if (curTerms.hasPositions()) {
       currentPositions[j] = nextPos;
     }
     if (curTerms.hasOffsets()) {
       currentStartOffset[j] = posEnum.startOffset();
       currentEndOffset[j] = posEnum.endOffset();
     }
     if (curTerms.hasPayloads()) {
       BytesRef curPayload = posEnum.getPayload();
       if (curPayload != null) {
         currentPayloads[j] = new BytesArray(curPayload.bytes, 0, curPayload.length);
       } else {
         currentPayloads[j] = null;
       }
     }
   }
 }
 protected void fillDocsAndScores(FixedBitSet matchingDocs, TermsEnum termsEnum)
     throws IOException {
   BytesRef spare = new BytesRef();
   PostingsEnum postingsEnum = null;
   for (int i = 0; i < terms.size(); i++) {
     if (termsEnum.seekExact(terms.get(ords[i], spare))) {
       postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
       float score = TermsIncludingScoreQuery.this.scores[ords[i]];
       for (int doc = postingsEnum.nextDoc();
           doc != DocIdSetIterator.NO_MORE_DOCS;
           doc = postingsEnum.nextDoc()) {
         matchingDocs.set(doc);
          // If the same doc is also related to another doc, a score might be overwritten.
          // I think this can only happen in a many-to-many relation.
         scores[doc] = score;
       }
     }
   }
 }
  protected List<OffsetsEnum> createOffsetsEnumsFromReader(LeafReader atomicReader, int doc)
      throws IOException {
    // For strict positions, get a Map of term to Spans:
    //    note: ScriptPhraseHelper.NONE does the right thing for these method calls
    final Map<BytesRef, Spans> strictPhrasesTermToSpans =
        strictPhrases.getTermToSpans(atomicReader, doc);
    // Usually simply wraps terms in a List; but if willRewrite() then can be expanded
    final List<BytesRef> sourceTerms =
        strictPhrases.expandTermsIfRewrite(terms, strictPhrasesTermToSpans);

    final List<OffsetsEnum> offsetsEnums = new ArrayList<>(sourceTerms.size() + 1);

    Terms termsIndex =
        atomicReader == null || sourceTerms.isEmpty() ? null : atomicReader.terms(field);
    if (termsIndex != null) {
      TermsEnum termsEnum = termsIndex.iterator(); // does not return null
      for (BytesRef term : sourceTerms) {
        if (!termsEnum.seekExact(term)) {
          continue; // term not found
        }
        PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
        if (postingsEnum == null) {
          // no offsets or positions available
          throw new IllegalArgumentException(
              "field '" + field + "' was indexed without offsets, cannot highlight");
        }
        if (doc != postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted
          continue;
        }
        postingsEnum =
            strictPhrases.filterPostings(term, postingsEnum, strictPhrasesTermToSpans.get(term));
        if (postingsEnum == null) {
          continue; // completely filtered out
        }

        offsetsEnums.add(new OffsetsEnum(term, postingsEnum));
      }
    }
    return offsetsEnums;
  }
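
The advance(doc) == doc check above is what positions the postings on the single document being highlighted; offsets are then read per position. A stripped-down sketch of just that step (class name, field, term, and doc id are placeholders):

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

import java.io.IOException;

final class TermOffsetsSketch {
  // Sketch only: print the start/end offsets of one term within one document.
  static void printOffsets(LeafReader reader, String field, BytesRef term, int doc)
      throws IOException {
    Terms terms = reader.terms(field);
    if (terms == null) {
      return;
    }
    TermsEnum termsEnum = terms.iterator();
    if (!termsEnum.seekExact(term)) {
      return;
    }
    PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
    if (postings.advance(doc) != doc) {
      return; // the term does not occur in this document
    }
    for (int i = 0; i < postings.freq(); i++) {
      postings.nextPosition(); // must be called before startOffset()/endOffset()
      // startOffset()/endOffset() return -1 if offsets were not indexed
      System.out.println(postings.startOffset() + "-" + postings.endOffset());
    }
  }
}
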
    @Override
    public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {

      if (PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS)) {
        SimpleTVPostings postings = current.getValue();
        if (postings.positions != null || postings.startOffsets != null) {
          // TODO: reuse
          SimpleTVPostingsEnum e = new SimpleTVPostingsEnum();
          e.reset(
              postings.positions, postings.startOffsets, postings.endOffsets, postings.payloads);
          return e;
        }
      }

      // TODO: reuse
      SimpleTVDocsEnum e = new SimpleTVDocsEnum();
      e.reset(
          PostingsEnum.featureRequested(flags, PostingsEnum.FREQS) == false
              ? 1
              : current.getValue().freq);
      return e;
    }
  public void testDocsEnumStart() throws Exception {
    Analyzer analyzer = new MockAnalyzer(random());
    MemoryIndex memory =
        new MemoryIndex(random().nextBoolean(), false, random().nextInt(50) * 1024 * 1024);
    memory.addField("foo", "bar", analyzer);
    LeafReader reader = (LeafReader) memory.createSearcher().getIndexReader();
    TestUtil.checkReader(reader);
    PostingsEnum disi =
        TestUtil.docs(random(), reader, "foo", new BytesRef("bar"), null, PostingsEnum.NONE);
    int docid = disi.docID();
    assertEquals(-1, docid);
    assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);

    // now reuse and check again
    TermsEnum te = reader.terms("foo").iterator();
    assertTrue(te.seekExact(new BytesRef("bar")));
    disi = te.postings(disi, PostingsEnum.NONE);
    docid = disi.docID();
    assertEquals(-1, docid);
    assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    reader.close();
  }
Example #16
  private static DocSet createSmallSet(
      List<LeafReaderContext> leaves, PostingsEnum[] postList, int maxPossible, int firstReader)
      throws IOException {
    int[] docs = new int[maxPossible];
    int sz = 0;
    for (int i = firstReader; i < postList.length; i++) {
      PostingsEnum postings = postList[i];
      if (postings == null) continue;
      LeafReaderContext ctx = leaves.get(i);
      Bits liveDocs = ctx.reader().getLiveDocs();
      int base = ctx.docBase;
      for (; ; ) {
        int subId = postings.nextDoc();
        if (subId == DocIdSetIterator.NO_MORE_DOCS) break;
        if (liveDocs != null && !liveDocs.get(subId)) continue;
        int globalId = subId + base;
        docs[sz++] = globalId;
      }
    }

    return new SortedIntDocSet(docs, sz);
  }
Example #17
      @Override
      public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {

        if (PostingsEnum.featureRequested(flags, DocsAndPositionsEnum.OLD_NULL_SEMANTICS)) {
          if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
              < 0) {
            // Positions were not indexed:
            return null;
          }
        }

        // System.out.println("BTR.docs this=" + this);
        decodeMetaData();
        // System.out.println("BTR.docs:  state.docFreq=" + state.docFreq);
        return postingsReader.postings(fieldInfo, state, reuse, flags);
      }
  @Test
  public void testRandom() throws Exception {
    Directory directory = newDirectory();
    final Random r = random();
    final IndexWriterConfig iwc =
        LuceneTestCase.newIndexWriterConfig(r, new MockAnalyzer(r))
            .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
            .setRAMBufferSizeMB(
                scaledRandomIntBetween(16, 64)); // we might index a lot - don't go crazy here
    RandomIndexWriter indexWriter = new RandomIndexWriter(r, directory, iwc);
    int numUniqueChildValues = scaledRandomIntBetween(100, 2000);
    String[] childValues = new String[numUniqueChildValues];
    for (int i = 0; i < numUniqueChildValues; i++) {
      childValues[i] = Integer.toString(i);
    }

    IntOpenHashSet filteredOrDeletedDocs = new IntOpenHashSet();
    int childDocId = 0;
    int numParentDocs = scaledRandomIntBetween(1, numUniqueChildValues);
    ObjectObjectOpenHashMap<String, NavigableSet<String>> childValueToParentIds =
        new ObjectObjectOpenHashMap<>();
    for (int parentDocId = 0; parentDocId < numParentDocs; parentDocId++) {
      boolean markParentAsDeleted = rarely();
      boolean filterMe = rarely();
      String parent = Integer.toString(parentDocId);
      Document document = new Document();
      document.add(
          new StringField(UidFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.YES));
      document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO));
      if (markParentAsDeleted) {
        filteredOrDeletedDocs.add(parentDocId);
        document.add(new StringField("delete", "me", Field.Store.NO));
      }
      if (filterMe) {
        filteredOrDeletedDocs.add(parentDocId);
        document.add(new StringField("filter", "me", Field.Store.NO));
      }
      indexWriter.addDocument(document);

      final int numChildDocs = scaledRandomIntBetween(0, 100);
      for (int i = 0; i < numChildDocs; i++) {
        boolean markChildAsDeleted = rarely();
        String childValue = childValues[random().nextInt(childValues.length)];

        document = new Document();
        document.add(
            new StringField(
                UidFieldMapper.NAME,
                Uid.createUid("child", Integer.toString(childDocId++)),
                Field.Store.NO));
        document.add(new StringField(TypeFieldMapper.NAME, "child", Field.Store.NO));
        document.add(
            new StringField(
                ParentFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.NO));
        document.add(new StringField("field1", childValue, Field.Store.NO));
        if (markChildAsDeleted) {
          document.add(new StringField("delete", "me", Field.Store.NO));
        }
        indexWriter.addDocument(document);

        if (!markChildAsDeleted) {
          NavigableSet<String> parentIds;
          if (childValueToParentIds.containsKey(childValue)) {
            parentIds = childValueToParentIds.lget();
          } else {
            childValueToParentIds.put(childValue, parentIds = new TreeSet<>());
          }
          if (!markParentAsDeleted && !filterMe) {
            parentIds.add(parent);
          }
        }
      }
    }

    // Delete docs that are marked to be deleted.
    indexWriter.deleteDocuments(new Term("delete", "me"));

    indexWriter.commit();
    IndexReader indexReader = DirectoryReader.open(directory);
    IndexSearcher searcher = new IndexSearcher(indexReader);
    Engine.Searcher engineSearcher =
        new Engine.Searcher(ChildrenConstantScoreQueryTests.class.getSimpleName(), searcher);
    ((TestSearchContext) SearchContext.current())
        .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher));

    int max = numUniqueChildValues / 4;
    for (int i = 0; i < max; i++) {
      // Simulate a parent update
      if (random().nextBoolean()) {
        final int numberOfUpdatableParents = numParentDocs - filteredOrDeletedDocs.size();
        int numberOfUpdates = scaledRandomIntBetween(0, numberOfUpdatableParents);
        for (int j = 0; j < numberOfUpdates; j++) {
          int parentId;
          do {
            parentId = random().nextInt(numParentDocs);
          } while (filteredOrDeletedDocs.contains(parentId));

          String parentUid = Uid.createUid("parent", Integer.toString(parentId));
          indexWriter.deleteDocuments(new Term(UidFieldMapper.NAME, parentUid));

          Document document = new Document();
          document.add(new StringField(UidFieldMapper.NAME, parentUid, Field.Store.YES));
          document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO));
          indexWriter.addDocument(document);
        }

        indexReader.close();
        indexReader = DirectoryReader.open(indexWriter.w, true);
        searcher = new IndexSearcher(indexReader);
        engineSearcher =
            new Engine.Searcher(ChildrenConstantScoreQueryTests.class.getSimpleName(), searcher);
        ((TestSearchContext) SearchContext.current())
            .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher));
      }

      String childValue = childValues[random().nextInt(numUniqueChildValues)];
      int shortCircuitParentDocSet = random().nextInt(numParentDocs);
      QueryBuilder queryBuilder;
      if (random().nextBoolean()) {
        queryBuilder =
            hasChildQuery("child", termQuery("field1", childValue))
                .setShortCircuitCutoff(shortCircuitParentDocSet);
      } else {
        queryBuilder =
            constantScoreQuery(
                hasChildFilter("child", termQuery("field1", childValue))
                    .setShortCircuitCutoff(shortCircuitParentDocSet));
      }
      // Using a FQ invokes / tests Scorer#advance(..) and also keeps the Weight#scorer from
      // receiving live docs as acceptedDocs
      queryBuilder = filteredQuery(queryBuilder, notFilter(termFilter("filter", "me")));
      Query query = parseQuery(queryBuilder);

      BitSetCollector collector = new BitSetCollector(indexReader.maxDoc());
      searcher.search(query, collector);
      FixedBitSet actualResult = collector.getResult();

      FixedBitSet expectedResult = new FixedBitSet(indexReader.maxDoc());
      if (childValueToParentIds.containsKey(childValue)) {
        LeafReader slowLeafReader = SlowCompositeReaderWrapper.wrap(indexReader);
        Terms terms = slowLeafReader.terms(UidFieldMapper.NAME);
        if (terms != null) {
          NavigableSet<String> parentIds = childValueToParentIds.lget();
          TermsEnum termsEnum = terms.iterator(null);
          PostingsEnum docsEnum = null;
          for (String id : parentIds) {
            TermsEnum.SeekStatus seekStatus =
                termsEnum.seekCeil(Uid.createUidAsBytes("parent", id));
            if (seekStatus == TermsEnum.SeekStatus.FOUND) {
              docsEnum =
                  termsEnum.postings(slowLeafReader.getLiveDocs(), docsEnum, PostingsEnum.NONE);
              expectedResult.set(docsEnum.nextDoc());
            } else if (seekStatus == TermsEnum.SeekStatus.END) {
              break;
            }
          }
        }
      }

      assertBitSet(actualResult, expectedResult, searcher);
    }

    indexWriter.close();
    indexReader.close();
    directory.close();
  }
 @Override
 public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException {
   if (postings.getPayload() != null) {
     payloads.add(BytesRef.deepCopyOf(postings.getPayload()));
   }
 }
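
collectLeaf() above only sees a PostingsEnum that is already positioned for it; payloads can also be read directly from a PostingsEnum requested with the PAYLOADS flag, as in this hedged sketch (class, field, and term names are placeholders; the deep copy is needed because the enum may reuse the payload bytes):

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

final class PayloadReadSketch {
  // Sketch only: collect every payload of a single term across all its documents.
  static List<BytesRef> payloadsForTerm(LeafReader reader, String field, BytesRef term)
      throws IOException {
    List<BytesRef> payloads = new ArrayList<>();
    Terms terms = reader.terms(field);
    if (terms == null) {
      return payloads;
    }
    TermsEnum termsEnum = terms.iterator();
    if (!termsEnum.seekExact(term)) {
      return payloads;
    }
    PostingsEnum postings = termsEnum.postings(null, PostingsEnum.PAYLOADS);
    while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      for (int i = 0; i < postings.freq(); i++) {
        postings.nextPosition(); // advance to the position before reading its payload
        BytesRef payload = postings.getPayload(); // null when the position has no payload
        if (payload != null) {
          payloads.add(BytesRef.deepCopyOf(payload));
        }
      }
    }
    return payloads;
  }
}
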
  private void duellReaders(CompositeReader other, LeafReader memIndexReader) throws IOException {
    Fields memFields = memIndexReader.fields();
    for (String field : MultiFields.getFields(other)) {
      Terms memTerms = memFields.terms(field);
      Terms iwTerms = memIndexReader.terms(field);
      if (iwTerms == null) {
        assertNull(memTerms);
      } else {
        NumericDocValues normValues = MultiDocValues.getNormValues(other, field);
        NumericDocValues memNormValues = memIndexReader.getNormValues(field);
        if (normValues != null) {
          // mem idx always computes norms on the fly
          assertNotNull(memNormValues);
          assertEquals(normValues.get(0), memNormValues.get(0));
        }

        assertNotNull(memTerms);
        assertEquals(iwTerms.getDocCount(), memTerms.getDocCount());
        assertEquals(iwTerms.getSumDocFreq(), memTerms.getSumDocFreq());
        assertEquals(iwTerms.getSumTotalTermFreq(), memTerms.getSumTotalTermFreq());
        TermsEnum iwTermsIter = iwTerms.iterator();
        TermsEnum memTermsIter = memTerms.iterator();
        if (iwTerms.hasPositions()) {
          final boolean offsets = iwTerms.hasOffsets() && memTerms.hasOffsets();

          while (iwTermsIter.next() != null) {
            assertNotNull(memTermsIter.next());
            assertEquals(iwTermsIter.term(), memTermsIter.term());
            PostingsEnum iwDocsAndPos = iwTermsIter.postings(null, PostingsEnum.ALL);
            PostingsEnum memDocsAndPos = memTermsIter.postings(null, PostingsEnum.ALL);
            while (iwDocsAndPos.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
              assertEquals(iwDocsAndPos.docID(), memDocsAndPos.nextDoc());
              assertEquals(iwDocsAndPos.freq(), memDocsAndPos.freq());
              for (int i = 0; i < iwDocsAndPos.freq(); i++) {
                assertEquals(
                    "term: " + iwTermsIter.term().utf8ToString(),
                    iwDocsAndPos.nextPosition(),
                    memDocsAndPos.nextPosition());
                if (offsets) {
                  assertEquals(iwDocsAndPos.startOffset(), memDocsAndPos.startOffset());
                  assertEquals(iwDocsAndPos.endOffset(), memDocsAndPos.endOffset());
                }

                if (iwTerms.hasPayloads()) {
                  assertEquals(iwDocsAndPos.getPayload(), memDocsAndPos.getPayload());
                }
              }
            }
          }
        } else {
          while (iwTermsIter.next() != null) {
            assertEquals(iwTermsIter.term(), memTermsIter.term());
            PostingsEnum iwDocsAndPos = iwTermsIter.postings(null);
            PostingsEnum memDocsAndPos = memTermsIter.postings(null);
            while (iwDocsAndPos.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
              assertEquals(iwDocsAndPos.docID(), memDocsAndPos.nextDoc());
              assertEquals(iwDocsAndPos.freq(), memDocsAndPos.freq());
            }
          }
        }
      }
    }
  }
  /**
   * Returns a list of terms in the specified field along with the corresponding count of documents
   * in the set that match that constraint. This method uses the FilterCache to get the intersection
   * count between <code>docs</code> and the DocSet for each term in the filter.
   *
   * @see FacetParams#FACET_LIMIT
   * @see FacetParams#FACET_ZEROS
   * @see FacetParams#FACET_MISSING
   */
  public NamedList<Integer> getFacetTermEnumCounts(
      SolrIndexSearcher searcher,
      DocSet docs,
      String field,
      int offset,
      int limit,
      int mincount,
      boolean missing,
      String sort,
      String prefix,
      String contains,
      boolean ignoreCase,
      SolrParams params)
      throws IOException {

    /* :TODO: potential optimization...
     * cache the Terms with the highest docFreq and try them first
     * don't enum if we get our max from them
     */

    // Minimum term docFreq in order to use the filterCache for that term.
    int minDfFilterCache = global.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0);

    // make sure we have a set that is fast for random access, if we will use it for that
    DocSet fastForRandomSet = docs;
    if (minDfFilterCache > 0 && docs instanceof SortedIntDocSet) {
      SortedIntDocSet sset = (SortedIntDocSet) docs;
      fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size());
    }

    IndexSchema schema = searcher.getSchema();
    LeafReader r = searcher.getLeafReader();
    FieldType ft = schema.getFieldType(field);

    boolean sortByCount = sort.equals("count") || sort.equals("true");
    final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1;
    final BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
        sortByCount ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize) : null;
    final NamedList<Integer> res = new NamedList<>();

    int min = mincount - 1; // the smallest value in the top 'N' values
    int off = offset;
    int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

    BytesRef prefixTermBytes = null;
    if (prefix != null) {
      String indexedPrefix = ft.toInternal(prefix);
      prefixTermBytes = new BytesRef(indexedPrefix);
    }

    Fields fields = r.fields();
    Terms terms = fields == null ? null : fields.terms(field);
    TermsEnum termsEnum = null;
    SolrIndexSearcher.DocsEnumState deState = null;
    BytesRef term = null;
    if (terms != null) {
      termsEnum = terms.iterator();

      // TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for
      // facet.offset when sorting by index order.

      if (prefixTermBytes != null) {
        if (termsEnum.seekCeil(prefixTermBytes) == TermsEnum.SeekStatus.END) {
          termsEnum = null;
        } else {
          term = termsEnum.term();
        }
      } else {
        // position termsEnum on first term
        term = termsEnum.next();
      }
    }

    PostingsEnum postingsEnum = null;
    CharsRefBuilder charsRef = new CharsRefBuilder();

    if (docs.size() >= mincount) {
      while (term != null) {

        if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes)) break;

        if (contains == null || contains(term.utf8ToString(), contains, ignoreCase)) {
          int df = termsEnum.docFreq();

          // If we are sorting, we can use df>min (rather than >=) since we
          // are going in index order.  For certain term distributions this can
          // make a large difference (for example, many terms with df=1).
          if (df > 0 && df > min) {
            int c;

            if (df >= minDfFilterCache) {
              // use the filter cache

              if (deState == null) {
                deState = new SolrIndexSearcher.DocsEnumState();
                deState.fieldName = field;
                deState.liveDocs = r.getLiveDocs();
                deState.termsEnum = termsEnum;
                deState.postingsEnum = postingsEnum;
              }

              c = searcher.numDocs(docs, deState);

              postingsEnum = deState.postingsEnum;
            } else {
              // iterate over TermDocs to calculate the intersection

              // TODO: specialize when base docset is a bitset or hash set (skipDocs)?  or does it
              // matter for this?
              // TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class
              // impl)
              // TODO: would passing deleted docs lead to better efficiency over checking the
              // fastForRandomSet?
              postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
              c = 0;

              if (postingsEnum instanceof MultiPostingsEnum) {
                MultiPostingsEnum.EnumWithSlice[] subs =
                    ((MultiPostingsEnum) postingsEnum).getSubs();
                int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs();
                for (int subindex = 0; subindex < numSubs; subindex++) {
                  MultiPostingsEnum.EnumWithSlice sub = subs[subindex];
                  if (sub.postingsEnum == null) continue;
                  int base = sub.slice.start;
                  int docid;
                  while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    if (fastForRandomSet.exists(docid + base)) c++;
                  }
                }
              } else {
                int docid;
                while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                  if (fastForRandomSet.exists(docid)) c++;
                }
              }
            }

            if (sortByCount) {
              if (c > min) {
                BytesRef termCopy = BytesRef.deepCopyOf(term);
                queue.add(new CountPair<>(termCopy, c));
                if (queue.size() >= maxsize) min = queue.last().val;
              }
            } else {
              if (c >= mincount && --off < 0) {
                if (--lim < 0) break;
                ft.indexedToReadable(term, charsRef);
                res.add(charsRef.toString(), c);
              }
            }
          }
        }
        term = termsEnum.next();
      }
    }

    if (sortByCount) {
      for (CountPair<BytesRef, Integer> p : queue) {
        if (--off >= 0) continue;
        if (--lim < 0) break;
        ft.indexedToReadable(p.key, charsRef);
        res.add(charsRef.toString(), p.val);
      }
    }

    if (missing) {
      res.add(null, getFieldMissingCount(searcher, docs, field));
    }

    return res;
  }
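
The non-cached branch of the facet loop above boils down to: walk the term's postings and count the doc ids that are also in the base set. A stripped-down sketch with a FixedBitSet standing in for Solr's fastForRandomSet (names are illustrative):

import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.FixedBitSet;

import java.io.IOException;

final class FacetIntersectionSketch {
  // Sketch only: count how many documents of the postings are also present in the base set.
  static int countIntersection(PostingsEnum postings, FixedBitSet baseSet) throws IOException {
    int count = 0;
    int docid;
    while ((docid = postings.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      if (baseSet.get(docid)) {
        count++;
      }
    }
    return count;
  }
}
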
  protected void compareTermVectors(Terms terms, Terms memTerms, String field_name)
      throws IOException {

    TermsEnum termEnum = terms.iterator();
    TermsEnum memTermEnum = memTerms.iterator();

    while (termEnum.next() != null) {
      assertNotNull(memTermEnum.next());
      assertThat(termEnum.totalTermFreq(), equalTo(memTermEnum.totalTermFreq()));

      PostingsEnum docsPosEnum = termEnum.postings(null, PostingsEnum.POSITIONS);
      PostingsEnum memDocsPosEnum = memTermEnum.postings(null, PostingsEnum.POSITIONS);
      String currentTerm = termEnum.term().utf8ToString();

      assertThat(
          "Token mismatch for field: " + field_name,
          currentTerm,
          equalTo(memTermEnum.term().utf8ToString()));

      docsPosEnum.nextDoc();
      memDocsPosEnum.nextDoc();

      int freq = docsPosEnum.freq();
      assertThat(freq, equalTo(memDocsPosEnum.freq()));
      for (int i = 0; i < freq; i++) {
        String failDesc = " (field:" + field_name + " term:" + currentTerm + ")";
        int memPos = memDocsPosEnum.nextPosition();
        int pos = docsPosEnum.nextPosition();
        assertThat("Position test failed" + failDesc, memPos, equalTo(pos));
        assertThat(
            "Start offset test failed" + failDesc,
            memDocsPosEnum.startOffset(),
            equalTo(docsPosEnum.startOffset()));
        assertThat(
            "End offset test failed" + failDesc,
            memDocsPosEnum.endOffset(),
            equalTo(docsPosEnum.endOffset()));
        assertThat(
            "Missing payload test failed" + failDesc,
            memDocsPosEnum.getPayload(),
            equalTo(docsPosEnum.getPayload()));
      }
    }
    assertNull("Still some tokens not processed", memTermEnum.next());
  }
  public void testSetPosition() throws Exception {
    Analyzer analyzer =
        new Analyzer() {
          @Override
          public TokenStreamComponents createComponents(String fieldName) {
            return new TokenStreamComponents(
                new Tokenizer() {
                  // TODO: use CannedTokenStream
                  private final String[] TOKENS = {"1", "2", "3", "4", "5"};
                  private final int[] INCREMENTS = {1, 2, 1, 0, 1};
                  private int i = 0;

                  PositionIncrementAttribute posIncrAtt =
                      addAttribute(PositionIncrementAttribute.class);
                  CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
                  OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

                  @Override
                  public boolean incrementToken() {
                    if (i == TOKENS.length) return false;
                    clearAttributes();
                    termAtt.append(TOKENS[i]);
                    offsetAtt.setOffset(i, i);
                    posIncrAtt.setPositionIncrement(INCREMENTS[i]);
                    i++;
                    return true;
                  }

                  @Override
                  public void reset() throws IOException {
                    super.reset();
                    this.i = 0;
                  }
                });
          }
        };
    Directory store = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), store, analyzer);
    Document d = new Document();
    d.add(newTextField("field", "bogus", Field.Store.YES));
    writer.addDocument(d);
    IndexReader reader = writer.getReader();
    writer.close();

    IndexSearcher searcher = newSearcher(reader);

    PostingsEnum pos =
        MultiFields.getTermPositionsEnum(searcher.getIndexReader(), "field", new BytesRef("1"));
    pos.nextDoc();
    // first token should be at position 0
    assertEquals(0, pos.nextPosition());

    pos = MultiFields.getTermPositionsEnum(searcher.getIndexReader(), "field", new BytesRef("2"));
    pos.nextDoc();
    // second token should be at position 2
    assertEquals(2, pos.nextPosition());

    PhraseQuery q;
    ScoreDoc[] hits;

    q = new PhraseQuery("field", "1", "2");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);

    // same as previous, using the builder with implicit positions
    PhraseQuery.Builder builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "1"));
    builder.add(new Term("field", "2"));
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);

    // same as previous, just specify positions explicitly.
    builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "1"), 0);
    builder.add(new Term("field", "2"), 1);
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);

    // specifying correct positions should find the phrase.
    builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "1"), 0);
    builder.add(new Term("field", "2"), 2);
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);

    q = new PhraseQuery("field", "2", "3");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);

    q = new PhraseQuery("field", "3", "4");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);

    // phrase query would find it when correct positions are specified.
    builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "3"), 0);
    builder.add(new Term("field", "4"), 0);
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);

    // phrase query should fail for a non-existing searched term
    // even if another searched term exists in the same position.
    builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "3"), 0);
    builder.add(new Term("field", "9"), 0);
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);

    // multi-phrase query should succeed for a non-existing searched term
    // because another searched term exists in the same position.
    MultiPhraseQuery mq = new MultiPhraseQuery();
    mq.add(new Term[] {new Term("field", "3"), new Term("field", "9")}, 0);
    hits = searcher.search(mq, 1000).scoreDocs;
    assertEquals(1, hits.length);

    q = new PhraseQuery("field", "2", "4");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);

    q = new PhraseQuery("field", "3", "5");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);

    q = new PhraseQuery("field", "4", "5");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);

    q = new PhraseQuery("field", "2", "5");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);

    reader.close();
    store.close();
  }
Example #24
  /**
   * Get all words between the specified start and end positions from the term vector.
   *
   * <p>NOTE: this may return an array of less than the size requested, if the document ends before
   * the requested end position.
   *
   * @param reader the index
   * @param doc doc id
   * @param luceneName the index field from which to use the term vector
   * @param start start position (first word we want to request)
   * @param end end position (last word we want to request)
   * @param partialOk is it okay if we're missing words in the middle, or do we need them all?
   *     (debug)
   * @return the words found, in order
   */
  public static String[] getWordsFromTermVector(
      IndexReader reader, int doc, String luceneName, int start, int end, boolean partialOk) {

    // Retrieve the term position vector of the contents of this document.
    // NOTE: might be faster to retrieve all term vectors at once

    try {
      org.apache.lucene.index.Terms terms = reader.getTermVector(doc, luceneName);
      if (terms == null) {
        throw new IllegalArgumentException("Field " + luceneName + " has no Terms");
      }
      if (!terms.hasPositions())
        throw new IllegalArgumentException(
            "Field " + luceneName + " has no character postion information");
      // String[] docTerms = new String[(int) terms.size()];
      // final List<BytesRef> termsList = new ArrayList<BytesRef>();
      TermsEnum termsEnum = terms.iterator();

      // Collect concordance words from the term vector
      PostingsEnum docPosEnum = null;
      int numFound = 0;
      String[] concordanceWords = new String[end - start + 1];
      while (termsEnum.next() != null) {
        docPosEnum = termsEnum.postings(null, docPosEnum, PostingsEnum.POSITIONS);
        while (docPosEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          // NOTE: .docID() will always return 0 in this case
          // if (docPosEnum.docID() != doc)
          //   throw new RuntimeException("Wrong doc id: " + docPosEnum.docID() + " (expected " + doc + ")");
          for (int i = 0; i < docPosEnum.freq(); i++) {
            int position = docPosEnum.nextPosition();
            if (position == -1)
              throw new RuntimeException(
                  "Unexpected missing position (i="
                      + i
                      + ", docPosEnum.freq() = "
                      + docPosEnum.freq()
                      + ")");
            if (position >= start && position <= end) {
              if (concordanceWords[position - start] == null)
                concordanceWords[position - start] = termsEnum.term().utf8ToString();
              else concordanceWords[position - start] += "|" + termsEnum.term().utf8ToString();
              numFound++;
            }
          }
          if (numFound == concordanceWords.length) return concordanceWords;
        }
      }

      if (numFound < concordanceWords.length && !partialOk) {
        // If we simply ran into the end of the document, that's okay;
        // but if words are missing in the middle, that's not.
        String[] partial = new String[numFound];
        for (int i = 0; i < numFound; i++) {
          partial[i] = concordanceWords[i];
          if (partial[i] == null) {
            throw new RuntimeException(
                "Not all words found ("
                    + numFound
                    + " out of "
                    + concordanceWords.length
                    + "); missing words in the middle of concordance!");
          }
        }
        return partial;
      }
      return concordanceWords;
    } catch (Exception e) {
      throw ExUtil.wrapRuntimeException(e);
    }
  }
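
A hedged usage sketch for the helper above; "LuceneUtil" stands for whatever class hosts it, and the reader, doc id, field name, and positions are illustrative values:

// Hypothetical call site for the helper above.
String[] words = LuceneUtil.getWordsFromTermVector(reader, docId, "contents", 10, 15, false);
System.out.println(String.join(" ", words)); // tokens at positions 10..15, "|"-joined on overlap
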
  private void checkAllInfo(
      int numDocs,
      String[] values,
      int[] freq,
      int[][] pos,
      int[][] startOffset,
      int[][] endOffset,
      int i)
      throws IOException {
    TermVectorsRequestBuilder resp =
        client()
            .prepareTermVectors("test", "type1", Integer.toString(i))
            .setPayloads(true)
            .setOffsets(true)
            .setPositions(true)
            .setFieldStatistics(true)
            .setTermStatistics(true)
            .setSelectedFields();
    assertThat(resp.request().fieldStatistics(), equalTo(true));
    TermVectorsResponse response = resp.execute().actionGet();
    assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
    Fields fields = response.getFields();
    assertThat(fields.size(), equalTo(1));
    Terms terms = fields.terms("field");
    assertThat(terms.size(), equalTo(8L));
    assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
    assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
    assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
    TermsEnum iterator = terms.iterator();
    for (int j = 0; j < values.length; j++) {
      String string = values[j];
      BytesRef next = iterator.next();
      assertThat(next, Matchers.notNullValue());
      assertThat("expected " + string, string, equalTo(next.utf8ToString()));
      assertThat(next, Matchers.notNullValue());
      if (string.equals("the")) {
        assertThat(
            "expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq()));
      } else {
        assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq()));
      }

      PostingsEnum docsAndPositions = iterator.postings(null, null, PostingsEnum.ALL);
      assertThat(docsAndPositions.nextDoc(), equalTo(0));
      assertThat(freq[j], equalTo(docsAndPositions.freq()));
      assertThat(iterator.docFreq(), equalTo(numDocs));
      int[] termPos = pos[j];
      int[] termStartOffset = startOffset[j];
      int[] termEndOffset = endOffset[j];
      assertThat(termPos.length, equalTo(freq[j]));
      assertThat(termStartOffset.length, equalTo(freq[j]));
      assertThat(termEndOffset.length, equalTo(freq[j]));
      for (int k = 0; k < freq[j]; k++) {
        int nextPosition = docsAndPositions.nextPosition();
        assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
        assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
        assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
        assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
      }
    }
    assertThat(iterator.next(), Matchers.nullValue());

    XContentBuilder xBuilder = XContentFactory.jsonBuilder();
    xBuilder.startObject();
    response.toXContent(xBuilder, ToXContent.EMPTY_PARAMS);
    xBuilder.endObject();
    BytesStream bytesStream = xBuilder.bytesStream();
    String utf8 = bytesStream.bytes().toUtf8().replaceFirst("\"took\":\\d+,", "");
    String expectedString =
        "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\""
            + i
            + "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
    assertThat(utf8, equalTo(expectedString));
  }
Example #26
  /** Call this only once (if you subclass!) */
  protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix)
      throws IOException {
    final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
    if (checkForDocValues && info != null && info.getDocValuesType() != DocValuesType.NONE) {
      throw new IllegalStateException(
          "Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
    }
    // System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
    final long startTime = System.nanoTime();
    prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix);

    final int maxDoc = reader.maxDoc();
    // immediate term numbers, or the index into the byte[] representing the last number
    final int[] index = new int[maxDoc];
    final int[] lastTerm = new int[maxDoc]; // last term we saw for this document
    final byte[][] bytes =
        new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

    final Terms terms = reader.terms(field);
    if (terms == null) {
      // No terms
      return;
    }

    final TermsEnum te = terms.iterator();
    final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
    // System.out.println("seekStart=" + seekStart.utf8ToString());
    if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) {
      // No terms match
      return;
    }

    // For our "term index wrapper"
    final List<BytesRef> indexedTerms = new ArrayList<>();
    final PagedBytes indexedTermsBytes = new PagedBytes(15);

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    byte[] tempArr = new byte[12];

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in its byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs

    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values.  This requires going over the field first to find the most
    // frequent terms ahead of time.

    int termNum = 0;
    postingsEnum = null;

    // Loop begins with te positioned to first term (we call
    // seek above):
    for (; ; ) {
      final BytesRef t = te.term();
      if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) {
        break;
      }
      // System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);

      visitTerm(te, termNum);

      if ((termNum & indexIntervalMask) == 0) {
        // Index this term
        sizeOfIndexedStrings += t.length;
        BytesRef indexedTerm = new BytesRef();
        indexedTermsBytes.copy(t, indexedTerm);
        // TODO: really should 1) strip off useless suffix,
        // and 2) use FST not array/PagedBytes
        indexedTerms.add(indexedTerm);
      }

      final int df = te.docFreq();
      if (df <= maxTermDocFreq) {

        postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE);

        // dF, but takes deletions into account
        int actualDF = 0;

        for (; ; ) {
          int doc = postingsEnum.nextDoc();
          if (doc == DocIdSetIterator.NO_MORE_DOCS) {
            break;
          }
          // System.out.println("  chunk=" + chunk + " docs");

          actualDF++;
          termInstances++;

          // System.out.println("    docID=" + doc);
          // add TNUM_OFFSET to the term number to make room for special reserved values:
          // 0 (end term) and 1 (index into byte array follows)
          int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
          lastTerm[doc] = termNum;
          int val = index[doc];

          if ((val & 0xff) == 1) {
            // index into byte array (actually the end of
            // the doc-specific byte[] when building)
            int pos = val >>> 8;
            int ilen = vIntSize(delta);
            byte[] arr = bytes[doc];
            int newend = pos + ilen;
            if (newend > arr.length) {
              // We avoid a doubling strategy to lower memory usage.
              // this faceting method isn't for docs with many terms.
              // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit
              // boundary.
              // TODO: figure out what array lengths we can round up to w/o actually using more
              // memory
              // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?
              // It should be safe to round up to the nearest 32 bits in any case.
              int newLen = (newend + 3) & 0xfffffffc; // 4 byte alignment
              byte[] newarr = new byte[newLen];
              System.arraycopy(arr, 0, newarr, 0, pos);
              arr = newarr;
              bytes[doc] = newarr;
            }
            pos = writeInt(delta, arr, pos);
            index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
          } else {
            // OK, this int has data in it... find the end (a zero starting byte - not
            // part of another number, hence not following a byte with the high bit set).
            int ipos;
            if (val == 0) {
              ipos = 0;
            } else if ((val & 0x0000ff80) == 0) {
              ipos = 1;
            } else if ((val & 0x00ff8000) == 0) {
              ipos = 2;
            } else if ((val & 0xff800000) == 0) {
              ipos = 3;
            } else {
              ipos = 4;
            }

            // System.out.println("      ipos=" + ipos);

            int endPos = writeInt(delta, tempArr, ipos);
            // System.out.println("      endpos=" + endPos);
            if (endPos <= 4) {
              // System.out.println("      fits!");
              // value will fit in the integer... move bytes back
              for (int j = ipos; j < endPos; j++) {
                val |= (tempArr[j] & 0xff) << (j << 3);
              }
              index[doc] = val;
            } else {
              // value won't fit... move integer into byte[]
              for (int j = 0; j < ipos; j++) {
                tempArr[j] = (byte) val;
                val >>>= 8;
              }
              // point at the end index in the byte[]
              index[doc] = (endPos << 8) | 1;
              bytes[doc] = tempArr;
              tempArr = new byte[12];
            }
          }
        }
        setActualDocFreq(termNum, actualDF);
      }

      termNum++;
      if (te.next() == null) {
        break;
      }
    }

    numTermsInField = termNum;

    long midPoint = System.nanoTime();

    if (termInstances == 0) {
      // we didn't invert anything
      // lower memory consumption.
      tnums = null;
    } else {

      this.index = index;

      //
      // transform intermediate form into the final form, building a single byte[]
      // at a time, and releasing the intermediate byte[]s as we go to avoid
      // increasing the memory footprint.
      //

      for (int pass = 0; pass < 256; pass++) {
        byte[] target = tnums[pass];
        int pos = 0; // end in target;
        if (target != null) {
          pos = target.length;
        } else {
          target = new byte[4096];
        }

        // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
        // where pp is the pass (which array we are building), and xx is all values.
        // each pass shares the same byte[] for termNumber lists.
        for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) {
          int lim = Math.min(docbase + (1 << 16), maxDoc);
          for (int doc = docbase; doc < lim; doc++) {
            // System.out.println("  pass="******" process docID=" + doc);
            int val = index[doc];
            if ((val & 0xff) == 1) {
              int len = val >>> 8;
              // System.out.println("    ptr pos=" + pos);
              index[doc] = (pos << 8) | 1; // change index to point to start of array
              if ((pos & 0xff000000) != 0) {
                // we only have 24 bits for the array index
                throw new IllegalStateException(
                    "Too many values for UnInvertedField faceting on field " + field);
              }
              byte[] arr = bytes[doc];
              /*
              for(byte b : arr) {
                //System.out.println("      b=" + Integer.toHexString((int) b));
              }
              */
              bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
              if (target.length <= pos + len) {
                int newlen = target.length;
                /*
                 * We don't have to worry about the array getting too large since the "pos"
                 * param will overflow first (only 24 bits available). The commented-out
                 * alternative checked for overflow explicitly:
                 *   if ((newlen << 1) <= 0) { newlen = Integer.MAX_VALUE;
                 *     if (newlen <= pos + len) throw new SolrException(400, "Too many terms to uninvert field!");
                 *   } else { while (newlen <= pos + len) newlen <<= 1; // doubling strategy }
                 */
                while (newlen <= pos + len) newlen <<= 1; // doubling strategy
                byte[] newtarget = new byte[newlen];
                System.arraycopy(target, 0, newtarget, 0, pos);
                target = newtarget;
              }
              System.arraycopy(arr, 0, target, pos, len);
              pos += len + 1; // skip single byte at end and leave it 0 for terminator
            }
          }
        }

        // shrink array
        if (pos < target.length) {
          byte[] newtarget = new byte[pos];
          System.arraycopy(target, 0, newtarget, 0, pos);
          target = newtarget;
        }

        tnums[pass] = target;

        if ((pass << 16) > maxDoc) break;
      }
    }
    indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]);

    long endTime = System.nanoTime();

    total_time = (int) TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS);
    phase1_time = (int) TimeUnit.MILLISECONDS.convert(midPoint - startTime, TimeUnit.NANOSECONDS);
  }
  public void testPayloadsPos0() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, new MockPayloadAnalyzer());
    Document doc = new Document();
    doc.add(new TextField("content", new StringReader("a a b c d e a f g h i j a b k k")));
    writer.addDocument(doc);

    final IndexReader readerFromWriter = writer.getReader();
    LeafReader r = SlowCompositeReaderWrapper.wrap(readerFromWriter);

    PostingsEnum tp = r.postings(new Term("content", "a"), PostingsEnum.ALL);

    int count = 0;
    assertTrue(tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    // "a" occurs 4 times
    assertEquals(4, tp.freq());
    assertEquals(0, tp.nextPosition());
    assertEquals(1, tp.nextPosition());
    assertEquals(3, tp.nextPosition());
    assertEquals(6, tp.nextPosition());

    // only one doc has "a"
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, tp.nextDoc());

    IndexSearcher is = newSearcher(readerFromWriter);

    SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
    SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
    SpanQuery[] sqs = {stq1, stq2};
    SpanNearQuery snq = new SpanNearQuery(sqs, 30, false);

    count = 0;
    boolean sawZero = false;
    if (VERBOSE) {
      System.out.println("\ngetPayloadSpans test");
    }
    PayloadSpanCollector collector = new PayloadSpanCollector();
    Spans pspans = MultiSpansWrapper.wrap(is.getIndexReader(), snq, SpanWeight.Postings.PAYLOADS);
    while (pspans.nextDoc() != Spans.NO_MORE_DOCS) {
      while (pspans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
        if (VERBOSE) {
          System.out.println(
              "doc "
                  + pspans.docID()
                  + ": span "
                  + pspans.startPosition()
                  + " to "
                  + pspans.endPosition());
        }
        collector.reset();
        pspans.collect(collector);
        sawZero |= pspans.startPosition() == 0;
        for (BytesRef payload : collector.payloads) {
          count++;
          if (VERBOSE) {
            System.out.println("  payload: " + Term.toString(payload));
          }
        }
      }
    }
    assertTrue(sawZero);
    assertEquals(8, count);

    // System.out.println("\ngetSpans test");
    Spans spans = MultiSpansWrapper.wrap(is.getIndexReader(), snq);
    count = 0;
    sawZero = false;
    while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
      while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
        count++;
        sawZero |= spans.startPosition() == 0;
        // System.out.println(spans.doc() + " - " + spans.start() + " - " +
        // spans.end());
      }
    }
    assertEquals(4, count);
    assertTrue(sawZero);

    writer.close();
    is.getIndexReader().close();
    dir.close();
  }