/**
 * Sizes the per-term scratch arrays before the values of a term are read.
 *
 * <p>Only the arrays for features that {@code curTerms} reports are touched: the position and
 * offset arrays are grown via {@code ArrayUtil.grow}, while the payload array is replaced by a
 * new one of exactly {@code termFreq} entries.
 *
 * @param curTerms terms of the current field, queried for positions/offsets/payloads support
 * @param termFreq number of occurrences the arrays must be able to hold
 */
private void initMemory(Terms curTerms, int termFreq) {
  final int minSize = termFreq;

  // Positions: keep the existing array when it is already large enough.
  if (curTerms.hasPositions()) {
    currentPositions = ArrayUtil.grow(currentPositions, minSize);
  }

  // Offsets: start and end arrays are always sized together.
  if (curTerms.hasOffsets()) {
    currentStartOffset = ArrayUtil.grow(currentStartOffset, minSize);
    currentEndOffset = ArrayUtil.grow(currentEndOffset, minSize);
  }

  // Payloads: a new array is allocated on every call.
  if (curTerms.hasPayloads()) {
    currentPayloads = new BytesArray[minSize];
  }
}
  /**
   * Writes the per-occurrence details of the current term as a {@code TOKENS} array.
   *
   * <p>Nothing is written when the field has neither positions, offsets nor payloads. Otherwise
   * one object is emitted per occurrence, containing the position, the start/end offsets and the
   * payload, each only when the field provides that feature. A payload is written only when it is
   * present and non-empty.
   *
   * @param builder target of the generated content
   * @param curTerms terms of the current field, queried for positions/offsets/payloads support
   * @param termFreq number of occurrences to emit
   * @throws IOException if the builder fails to write
   */
  private void buildValues(XContentBuilder builder, Terms curTerms, int termFreq)
      throws IOException {
    // The field's capabilities do not change while iterating; query them once.
    final boolean withPayloads = curTerms.hasPayloads();
    final boolean withOffsets = curTerms.hasOffsets();
    final boolean withPositions = curTerms.hasPositions();
    if (!(withPayloads || withOffsets || withPositions)) {
      return;
    }

    builder.startArray(FieldStrings.TOKENS);
    for (int i = 0; i < termFreq; i++) {
      builder.startObject();
      if (withPositions) {
        builder.field(FieldStrings.POS, currentPositions[i]);
      }
      if (withOffsets) {
        builder.field(FieldStrings.START_OFFSET, currentStartOffset[i]);
        builder.field(FieldStrings.END_OFFSET, currentEndOffset[i]);
      }
      // An entry is null for an occurrence that carries no payload, so guard before
      // dereferencing it.
      if (withPayloads && currentPayloads[i] != null && currentPayloads[i].length() > 0) {
        builder.field(FieldStrings.PAYLOAD, currentPayloads[i]);
      }
      builder.endObject();
    }
    builder.endArray();
  }
 /**
  * Reads the position, offsets and payload of every occurrence of the current term into the
  * scratch arrays.
  *
  * <p>{@code posEnum.nextPosition()} is called once per occurrence regardless of whether
  * positions are stored, because it also advances the enum to the next occurrence.
  *
  * @param curTerms terms of the current field, queried for positions/offsets/payloads support
  * @param posEnum postings positioned on the document whose occurrences are read
  * @param termFreq number of occurrences to read
  * @throws IOException if reading from the postings fails
  */
 private void initValues(Terms curTerms, PostingsEnum posEnum, int termFreq) throws IOException {
   for (int j = 0; j < termFreq; j++) {
     int nextPos = posEnum.nextPosition();
     if (curTerms.hasPositions()) {
       currentPositions[j] = nextPos;
     }
     if (curTerms.hasOffsets()) {
       currentStartOffset[j] = posEnum.startOffset();
       currentEndOffset[j] = posEnum.endOffset();
     }
     if (curTerms.hasPayloads()) {
       BytesRef curPayload = posEnum.getPayload();
       if (curPayload != null) {
         // A BytesRef is a slice of its backing array: the valid bytes start at 'offset',
         // which is not necessarily 0.
         currentPayloads[j] =
             new BytesArray(curPayload.bytes, curPayload.offset, curPayload.length);
       } else {
         currentPayloads[j] = null;
       }
     }
   }
 }
Beispiel #4
0
  /**
   * Get all words between the specified start and end positions from the term vector.
   *
   * <p>When several terms occur at the same position, they are joined with {@code "|"} in the
   * corresponding array element.
   *
   * <p>NOTE: this may return an array of less than the size requested, if the document ends before
   * the requested end position.
   *
   * <p>Any exception raised while reading the term vector (including the {@link
   * IllegalArgumentException}s thrown for a missing term vector or missing positions) is passed
   * through {@code ExUtil.wrapRuntimeException} and rethrown.
   *
   * @param reader the index
   * @param doc doc id
   * @param luceneName the index field from which to use the term vector
   * @param start start position (first word we want to request)
   * @param end end position (last word we want to request)
   * @param partialOk is it okay if we're missing words in the middle, or do we need them all?
   *     (debug)
   * @return the words found, in order
   */
  public static String[] getWordsFromTermVector(
      IndexReader reader, int doc, String luceneName, int start, int end, boolean partialOk) {

    // Retrieve the term position vector of the contents of this document.
    // NOTE: might be faster to retrieve all term vectors at once

    try {
      org.apache.lucene.index.Terms terms = reader.getTermVector(doc, luceneName);
      if (terms == null) {
        throw new IllegalArgumentException("Field " + luceneName + " has no Terms");
      }
      if (!terms.hasPositions()) {
        throw new IllegalArgumentException(
            "Field " + luceneName + " has no character postion information");
      }
      TermsEnum termsEnum = terms.iterator();

      // Collect the concordance words from the term vector.
      PostingsEnum docPosEnum = null;
      // Number of distinct positions in [start, end] that have received at least one term.
      int numFound = 0;
      String[] concordanceWords = new String[end - start + 1];
      while (termsEnum.next() != null) {
        docPosEnum = termsEnum.postings(null, docPosEnum, PostingsEnum.POSITIONS);
        while (docPosEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          // NOTE: .docId() will always return 0 in this case
          final int freq = docPosEnum.freq();
          for (int i = 0; i < freq; i++) {
            int position = docPosEnum.nextPosition();
            if (position == -1) {
              throw new RuntimeException(
                  "Unexpected missing position (i="
                      + i
                      + ", docPosEnum.freq() = "
                      + freq
                      + ")");
            }
            if (position >= start && position <= end) {
              final int slot = position - start;
              final String word = termsEnum.term().utf8ToString();
              if (concordanceWords[slot] == null) {
                concordanceWords[slot] = word;
                // Count each position once only; a term stacked onto an already filled
                // position must not make the result look more complete than it is.
                numFound++;
              } else {
                concordanceWords[slot] += "|" + word;
              }
            }
          }
          // Every requested position has a word: no need to scan the remaining terms.
          if (numFound == concordanceWords.length) {
            return concordanceWords;
          }
        }
      }

      if (numFound < concordanceWords.length && !partialOk) {
        // If we simply ran into the end of the document, that's okay;
        // but if words are missing in the middle, that's not.
        // The found words must therefore occupy exactly the first numFound slots.
        String[] partial = new String[numFound];
        for (int i = 0; i < numFound; i++) {
          partial[i] = concordanceWords[i];
          if (partial[i] == null) {
            throw new RuntimeException(
                "Not all words found ("
                    + numFound
                    + " out of "
                    + concordanceWords.length
                    + "); missing words in the middle of concordance!");
          }
        }
        return partial;
      }
      return concordanceWords;
    } catch (Exception e) {
      throw ExUtil.wrapRuntimeException(e);
    }
  }
 /**
  * Reports whether positions are available, as answered by the wrapped {@code delegateTerms}.
  *
  * @return the value of {@code delegateTerms.hasPositions()}
  */
 @Override
 public boolean hasPositions() {
   final boolean positionsAvailable = delegateTerms.hasPositions();
   return positionsAvailable;
 }
  /**
   * Prints the postings of a single term, given in {@code args[0]} as {@code field:term}.
   *
   * <p>Every leaf of the reader obtained from {@code ctx} is searched for the term. For each
   * matching document one line is printed with its index-wide doc id (leaf-local id plus the sum
   * of {@code maxDoc()} of all preceding leaves). When the field has positions, the frequency,
   * the positions and any payloads are printed as well. In interactive mode the output pauses
   * after every 20 documents of a leaf and stops when standard input reaches end-of-stream.
   *
   * <p>If the argument is missing or does not contain both a field and a term, a usage line is
   * printed instead.
   *
   * @param args command arguments; only {@code args[0]} is used
   * @param out stream the result is printed to
   * @throws Exception if reading the index or standard input fails
   */
  @Override
  public void execute(String[] args, PrintStream out) throws Exception {
    String field = (args != null && args.length > 0) ? args[0] : null;
    String termVal = null;

    if (field != null) {
      String[] parts = field.split(":");
      if (parts.length > 1) {
        field = parts[0];
        termVal = parts[1];
      }
    }

    if (field == null || termVal == null) {
      out.println("usage: field:term");
      out.flush();
      return;
    }

    IndexReader reader = ctx.getIndexReader();
    List<AtomicReaderContext> leaves = reader.leaves();
    final int numPerPage = 20;
    int docBase = 0;
    for (AtomicReaderContext leaf : leaves) {
      AtomicReader atomicReader = leaf.reader();
      // Advance the running base before any 'continue' so that leaves without the field or
      // the term still contribute their documents to the ids of the leaves that follow.
      final int leafDocBase = docBase;
      docBase += atomicReader.maxDoc();

      Terms terms = atomicReader.terms(field);
      if (terms == null) {
        continue;
      }
      TermsEnum te = terms.iterator(null);
      if (!te.seekExact(new BytesRef(termVal), true)) {
        continue;
      }

      int count = 0;
      if (terms.hasPositions()) {
        DocsAndPositionsEnum iter = te.docsAndPositions(atomicReader.getLiveDocs(), null);
        int docid;
        while ((docid = iter.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          count++;
          final int freq = iter.freq();
          out.print("docid: " + (docid + leafDocBase) + ", freq: " + freq + ", ");
          for (int i = 0; i < freq; ++i) {
            out.print("pos " + i + ": " + iter.nextPosition());
            BytesRef payload = iter.getPayload();
            if (payload != null) {
              out.print(",payload: " + payload);
            }
            out.print(";");
          }
          out.println();
          if (pagingStopped(count, numPerPage, out)) {
            out.flush();
            return;
          }
        }
      } else {
        DocsEnum iter = te.docs(atomicReader.getLiveDocs(), null);
        int docid;
        while ((docid = iter.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          count++;
          out.println("docid: " + (docid + leafDocBase));
          if (pagingStopped(count, numPerPage, out)) {
            out.flush();
            return;
          }
        }
      }
    }
  }

  /**
   * In interactive mode, prompts after each full page and waits for a byte on standard input.
   *
   * @return {@code true} if standard input reached end-of-stream and the output should stop
   */
  private boolean pagingStopped(int count, int numPerPage, PrintStream out)
      throws java.io.IOException {
    if (!ctx.isInteractiveMode() || count % numPerPage != 0) {
      return false;
    }
    out.println("Ctrl-D to break");
    return System.in.read() == -1;
  }
  /**
   * Asserts that the in-memory index reader exposes the same terms and postings as the reference
   * reader {@code other}, for every field of {@code other}.
   *
   * <p>For each field the norm of document 0 (when the reference has norms), the field
   * statistics, the term sequence and, per term, the documents and frequencies are compared. If
   * the reference field has positions, positions are compared too, plus offsets (when both sides
   * have them) and payloads (when the reference has them).
   *
   * @param other reference reader the memory index is checked against
   * @param memIndexReader reader over the memory index under test
   * @throws IOException if reading either index fails
   */
  private void duellReaders(CompositeReader other, LeafReader memIndexReader) throws IOException {
    Fields memFields = memIndexReader.fields();
    for (String field : MultiFields.getFields(other)) {
      Terms memTerms = memFields.terms(field);
      // The reference side has to come from 'other'; taking it from memIndexReader as well
      // would only compare the memory index with itself.
      Terms iwTerms = MultiFields.getTerms(other, field);
      if (iwTerms == null) {
        assertNull(memTerms);
      } else {
        NumericDocValues normValues = MultiDocValues.getNormValues(other, field);
        NumericDocValues memNormValues = memIndexReader.getNormValues(field);
        if (normValues != null) {
          // mem idx always computes norms on the fly
          assertNotNull(memNormValues);
          assertEquals(normValues.get(0), memNormValues.get(0));
        }

        assertNotNull(memTerms);
        assertEquals(iwTerms.getDocCount(), memTerms.getDocCount());
        assertEquals(iwTerms.getSumDocFreq(), memTerms.getSumDocFreq());
        assertEquals(iwTerms.getSumTotalTermFreq(), memTerms.getSumTotalTermFreq());
        TermsEnum iwTermsIter = iwTerms.iterator();
        TermsEnum memTermsIter = memTerms.iterator();
        if (iwTerms.hasPositions()) {
          final boolean offsets = iwTerms.hasOffsets() && memTerms.hasOffsets();

          while (iwTermsIter.next() != null) {
            assertNotNull(memTermsIter.next());
            assertEquals(iwTermsIter.term(), memTermsIter.term());
            PostingsEnum iwDocsAndPos = iwTermsIter.postings(null, PostingsEnum.ALL);
            PostingsEnum memDocsAndPos = memTermsIter.postings(null, PostingsEnum.ALL);
            while (iwDocsAndPos.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
              assertEquals(iwDocsAndPos.docID(), memDocsAndPos.nextDoc());
              assertEquals(iwDocsAndPos.freq(), memDocsAndPos.freq());
              for (int i = 0; i < iwDocsAndPos.freq(); i++) {
                assertEquals(
                    "term: " + iwTermsIter.term().utf8ToString(),
                    iwDocsAndPos.nextPosition(),
                    memDocsAndPos.nextPosition());
                if (offsets) {
                  assertEquals(iwDocsAndPos.startOffset(), memDocsAndPos.startOffset());
                  assertEquals(iwDocsAndPos.endOffset(), memDocsAndPos.endOffset());
                }

                if (iwTerms.hasPayloads()) {
                  assertEquals(iwDocsAndPos.getPayload(), memDocsAndPos.getPayload());
                }
              }
            }
          }
        } else {
          while (iwTermsIter.next() != null) {
            // Advance the memory-index enum in lock step; term() is only meaningful on a
            // positioned enum.
            assertNotNull(memTermsIter.next());
            assertEquals(iwTermsIter.term(), memTermsIter.term());
            PostingsEnum iwDocsAndPos = iwTermsIter.postings(null);
            PostingsEnum memDocsAndPos = memTermsIter.postings(null);
            while (iwDocsAndPos.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
              assertEquals(iwDocsAndPos.docID(), memDocsAndPos.nextDoc());
              assertEquals(iwDocsAndPos.freq(), memDocsAndPos.freq());
            }
          }
        }
      }
    }
  }