Пример #1
0
  /*
   *  Utility function to display a term vector.
   */
  static void termVectorDisplay(Terms terms) throws IOException {

    if ((terms == null) || (terms.size() == -1)) System.out.println("    The field is not stored.");
    else {
      /*
       *  The terms for this field are stored.
       */
      System.out.println("    Vocabulary size: " + terms.size() + " terms");

      TermsEnum ithTerm = terms.iterator(null);

      /*
       *  Iterate over the terms in this document.
       *  Information about a term's occurrences (tf and
       *  positions) is accessed via the indexing API, which
       *  returns inverted lists that describe (only) the
       *  current document.
       */
      while (ithTerm.next() != null) {
        System.out.format(
            "      %10d %-20s %d ",
            ithTerm.ord(), ithTerm.term().utf8ToString(), ithTerm.totalTermFreq());

        DocsAndPositionsEnum currDoc = ithTerm.docsAndPositions(null, null);
        currDoc.nextDoc();

        for (int jthPosition = 0; jthPosition < ithTerm.totalTermFreq(); jthPosition++)
          System.out.print(currDoc.nextPosition() + " ");

        System.out.println();
      }
      ;
    }
    ;
  }
Пример #2
0
  /*
   *  listTermDictionary displays the term dictionary for a field.
   */
  static void listTermDictionary(IndexReader reader, String fieldName) throws IOException {

    System.out.println("\nTerm Dictionary:  field " + fieldName);

    /*
      Grant says:
      MultiFields.getTerms(IndexReader, fieldName)
    */

    Terms terms = MultiFields.getTerms(reader, fieldName);

    if ((terms == null) || (terms.size() == -1))
      System.out.println("    The term dictionary is empty.");
    else {
      System.out.println("    Vocabulary size: " + terms.size() + " terms");

      TermsEnum ithTerm = terms.iterator(null);

      /*
       *  Iterate over the terms in this document.
       *  Information about a term's occurrences (tf and
       *  positions) is accessed via the indexing API, which
       *  returns inverted lists that describe (only) the
       *  current document.
       */
      while (ithTerm.next() != null) {
        System.out.format(
            "      %-30s %d %d\n",
            ithTerm.term().utf8ToString(), ithTerm.docFreq(), ithTerm.totalTermFreq());
      }
      ;
    }
    ;
  }
Пример #3
0
  // make sure we never reuse from another reader even if it is the same field & codec etc
  public void testReuseDocsEnumDifferentReader() throws IOException {
    Directory dir = newDirectory();
    Codec cp = TestUtil.alwaysPostingsFormat(new Lucene40RWPostingsFormat());
    MockAnalyzer analyzer = new MockAnalyzer(random());
    analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));

    RandomIndexWriter writer =
        new RandomIndexWriter(random(), dir, newIndexWriterConfig(analyzer).setCodec(cp));
    int numdocs = atLeast(20);
    createRandomIndex(numdocs, writer, random());
    writer.commit();

    DirectoryReader firstReader = DirectoryReader.open(dir);
    DirectoryReader secondReader = DirectoryReader.open(dir);
    List<LeafReaderContext> leaves = firstReader.leaves();
    List<LeafReaderContext> leaves2 = secondReader.leaves();

    for (LeafReaderContext ctx : leaves) {
      Terms terms = ctx.reader().terms("body");
      TermsEnum iterator = terms.iterator();
      IdentityHashMap<PostingsEnum, Boolean> enums = new IdentityHashMap<>();
      MatchNoBits bits = new Bits.MatchNoBits(firstReader.maxDoc());
      iterator = terms.iterator();
      PostingsEnum docs = null;
      BytesRef term = null;
      while ((term = iterator.next()) != null) {
        docs =
            iterator.postings(
                null,
                randomDocsEnum("body", term, leaves2, bits),
                random().nextBoolean() ? PostingsEnum.FREQS : PostingsEnum.NONE);
        enums.put(docs, true);
      }
      assertEquals(terms.size(), enums.size());

      iterator = terms.iterator();
      enums.clear();
      docs = null;
      while ((term = iterator.next()) != null) {
        docs =
            iterator.postings(
                bits,
                randomDocsEnum("body", term, leaves2, bits),
                random().nextBoolean() ? PostingsEnum.FREQS : PostingsEnum.NONE);
        enums.put(docs, true);
      }
      assertEquals(terms.size(), enums.size());
    }
    writer.close();
    IOUtils.close(firstReader, secondReader, dir);
  }
 /** checks collection-level statistics on Terms */
 public void assertTermsStatistics(Terms leftTerms, Terms rightTerms) throws Exception {
   if (leftTerms.getDocCount() != -1 && rightTerms.getDocCount() != -1) {
     assertEquals(leftTerms.getDocCount(), rightTerms.getDocCount());
   }
   if (leftTerms.getSumDocFreq() != -1 && rightTerms.getSumDocFreq() != -1) {
     assertEquals(leftTerms.getSumDocFreq(), rightTerms.getSumDocFreq());
   }
   if (leftTerms.getSumTotalTermFreq() != -1 && rightTerms.getSumTotalTermFreq() != -1) {
     assertEquals(leftTerms.getSumTotalTermFreq(), rightTerms.getSumTotalTermFreq());
   }
   if (leftTerms.size() != -1 && rightTerms.size() != -1) {
     assertEquals(leftTerms.size(), rightTerms.size());
   }
 }
Пример #5
0
  public void testReuseDocsEnumNoReuse() throws IOException {
    Directory dir = newDirectory();
    Codec cp = TestUtil.alwaysPostingsFormat(new Lucene40RWPostingsFormat());
    RandomIndexWriter writer =
        new RandomIndexWriter(
            random(), dir, newIndexWriterConfig(new MockAnalyzer(random())).setCodec(cp));
    int numdocs = atLeast(20);
    createRandomIndex(numdocs, writer, random());
    writer.commit();

    DirectoryReader open = DirectoryReader.open(dir);
    for (LeafReaderContext ctx : open.leaves()) {
      LeafReader indexReader = ctx.reader();
      Terms terms = indexReader.terms("body");
      TermsEnum iterator = terms.iterator();
      IdentityHashMap<PostingsEnum, Boolean> enums = new IdentityHashMap<>();
      MatchNoBits bits = new Bits.MatchNoBits(indexReader.maxDoc());
      while ((iterator.next()) != null) {
        PostingsEnum docs =
            iterator.postings(
                random().nextBoolean() ? bits : new Bits.MatchNoBits(indexReader.maxDoc()),
                null,
                random().nextBoolean() ? PostingsEnum.FREQS : PostingsEnum.NONE);
        enums.put(docs, true);
      }

      assertEquals(terms.size(), enums.size());
    }
    writer.commit();
    IOUtils.close(writer, open, dir);
  }
 private void buildField(
     XContentBuilder builder,
     final CharsRefBuilder spare,
     Fields theFields,
     Iterator<String> fieldIter)
     throws IOException {
   String fieldName = fieldIter.next();
   builder.startObject(fieldName);
   Terms curTerms = theFields.terms(fieldName);
   // write field statistics
   buildFieldStatistics(builder, curTerms);
   builder.startObject(FieldStrings.TERMS);
   TermsEnum termIter = curTerms.iterator(null);
   for (int i = 0; i < curTerms.size(); i++) {
     buildTerm(builder, spare, curTerms, termIter);
   }
   builder.endObject();
   builder.endObject();
 }
 @Override
 public long size() throws IOException {
   return delegateTerms.size();
 }
  private void checkAllInfo(
      int numDocs,
      String[] values,
      int[] freq,
      int[][] pos,
      int[][] startOffset,
      int[][] endOffset,
      int i)
      throws IOException {
    TermVectorsRequestBuilder resp =
        client()
            .prepareTermVectors("test", "type1", Integer.toString(i))
            .setPayloads(true)
            .setOffsets(true)
            .setPositions(true)
            .setFieldStatistics(true)
            .setTermStatistics(true)
            .setSelectedFields();
    assertThat(resp.request().fieldStatistics(), equalTo(true));
    TermVectorsResponse response = resp.execute().actionGet();
    assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
    Fields fields = response.getFields();
    assertThat(fields.size(), equalTo(1));
    Terms terms = fields.terms("field");
    assertThat(terms.size(), equalTo(8l));
    assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
    assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
    assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
    TermsEnum iterator = terms.iterator();
    for (int j = 0; j < values.length; j++) {
      String string = values[j];
      BytesRef next = iterator.next();
      assertThat(next, Matchers.notNullValue());
      assertThat("expected " + string, string, equalTo(next.utf8ToString()));
      assertThat(next, Matchers.notNullValue());
      if (string.equals("the")) {
        assertThat(
            "expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq()));
      } else {
        assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq()));
      }

      PostingsEnum docsAndPositions = iterator.postings(null, null, PostingsEnum.ALL);
      assertThat(docsAndPositions.nextDoc(), equalTo(0));
      assertThat(freq[j], equalTo(docsAndPositions.freq()));
      assertThat(iterator.docFreq(), equalTo(numDocs));
      int[] termPos = pos[j];
      int[] termStartOffset = startOffset[j];
      int[] termEndOffset = endOffset[j];
      assertThat(termPos.length, equalTo(freq[j]));
      assertThat(termStartOffset.length, equalTo(freq[j]));
      assertThat(termEndOffset.length, equalTo(freq[j]));
      for (int k = 0; k < freq[j]; k++) {
        int nextPosition = docsAndPositions.nextPosition();
        assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
        assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
        assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
        assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
      }
    }
    assertThat(iterator.next(), Matchers.nullValue());

    XContentBuilder xBuilder = XContentFactory.jsonBuilder();
    xBuilder.startObject();
    response.toXContent(xBuilder, ToXContent.EMPTY_PARAMS);
    xBuilder.endObject();
    BytesStream bytesStream = xBuilder.bytesStream();
    String utf8 = bytesStream.bytes().toUtf8().replaceFirst("\"took\":\\d+,", "");
    ;
    String expectedString =
        "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\""
            + i
            + "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
    assertThat(utf8, equalTo(expectedString));
  }
Пример #9
0
  public SparseInstances readIndex(String indexPath, String destFile, int threshold)
      throws Exception {

    if (indexPath == null || destFile == null) {
      System.out.println("error: indexPath or destFile is null\n");
      return null;
    }

    DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexPath)));
    Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(reviewKey);

    int capacity = (int) terms.size();
    HashMap<String, Integer> wordDict = new HashMap<>(capacity);
    capacity = capacity > 65535 ? 65535 : capacity;
    SparseInstances instData = new SparseInstances(capacity, reader.numDocs());
    TermsEnum termsEnum = terms.iterator();
    int index = 0;
    BytesRef term = null;
    String strTerm = null;
    while ((term = termsEnum.next()) != null) {
      strTerm = term.toString();
      if (termsEnum.totalTermFreq() < threshold) {
        continue;
      }
      if (strTerm.isEmpty()) {
        continue;
      }
      if (wordDict.get(strTerm) != null) {
        continue;
      }
      instData.addAttribute(strTerm);
      index++;
    }
    int numAtt = instData.numAttributes();
    int numInst = instData.numInstances();
    Integer attIndex = null;
    String id = null;
    int termIndex = 0;
    for (int docIndex = 0; docIndex < numInst; docIndex++) {
      id = reader.document(docIndex).getField(idKey).stringValue();
      Terms docTerms = reader.getTermVector(docIndex, reviewKey);
      if (docTerms == null) {
        continue;
      }
      int[] indices = new int[(int) docTerms.size()];
      double[] attValues = new double[(int) docTerms.size()];
      termsEnum = docTerms.iterator();
      termIndex = 0;
      while ((term = termsEnum.next()) != null) {
        strTerm = term.toString();
        attIndex = wordDict.get(strTerm);
        if (attIndex == null) {
          continue;
        }
        indices[termIndex] = attIndex.intValue();
        attValues[termIndex] = termsEnum.totalTermFreq();
      }
      ESparseInstance instance = new ESparseInstance(id, 1.0, attValues, indices, numAtt);
      instData.addInstance(instance);
    }

    return null;
  }
Пример #10
0
  /**
   * Safe (but, slowish) default method to write every vector field in the document. This default
   * implementation requires that the vectors implement both Fields.size and Terms.size.
   */
  protected final void addAllDocVectors(Fields vectors, FieldInfos fieldInfos) throws IOException {
    if (vectors == null) {
      startDocument(0);
      return;
    }

    final int numFields = vectors.size();
    if (numFields == -1) {
      throw new IllegalStateException("vectors.size() must be implemented (it returned -1)");
    }
    startDocument(numFields);

    final FieldsEnum fieldsEnum = vectors.iterator();
    String fieldName;
    String lastFieldName = null;

    while ((fieldName = fieldsEnum.next()) != null) {
      final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldName);

      assert lastFieldName == null || fieldName.compareTo(lastFieldName) > 0
          : "lastFieldName=" + lastFieldName + " fieldName=" + fieldName;
      lastFieldName = fieldName;

      final Terms terms = fieldsEnum.terms();
      if (terms == null) {
        // FieldsEnum shouldn't lie...
        continue;
      }
      final int numTerms = (int) terms.size();
      if (numTerms == -1) {
        throw new IllegalStateException("terms.size() must be implemented (it returned -1)");
      }
      final TermsEnum termsEnum = terms.iterator(null);

      DocsAndPositionsEnum docsAndPositionsEnum = null;

      boolean startedField = false;

      // NOTE: this is tricky, because TermVectors allow
      // indexing offsets but NOT positions.  So we must
      // lazily init the field by checking whether first
      // position we see is -1 or not.

      int termCount = 0;
      while (termsEnum.next() != null) {
        termCount++;

        final int freq = (int) termsEnum.totalTermFreq();

        if (startedField) {
          startTerm(termsEnum.term(), freq);
        }

        // TODO: we need a "query" API where we can ask (via
        // flex API) what this term was indexed with...
        // Both positions & offsets:
        docsAndPositionsEnum = termsEnum.docsAndPositions(null, null, true);
        final boolean hasOffsets;
        boolean hasPositions = false;
        if (docsAndPositionsEnum == null) {
          // Fallback: no offsets
          docsAndPositionsEnum = termsEnum.docsAndPositions(null, null, false);
          hasOffsets = false;
        } else {
          hasOffsets = true;
        }

        if (docsAndPositionsEnum != null) {
          final int docID = docsAndPositionsEnum.nextDoc();
          assert docID != DocIdSetIterator.NO_MORE_DOCS;
          assert docsAndPositionsEnum.freq() == freq;

          for (int posUpto = 0; posUpto < freq; posUpto++) {
            final int pos = docsAndPositionsEnum.nextPosition();
            if (!startedField) {
              assert numTerms > 0;
              hasPositions = pos != -1;
              startField(fieldInfo, numTerms, hasPositions, hasOffsets);
              startTerm(termsEnum.term(), freq);
              startedField = true;
            }
            final int startOffset;
            final int endOffset;
            if (hasOffsets) {
              startOffset = docsAndPositionsEnum.startOffset();
              endOffset = docsAndPositionsEnum.endOffset();
              assert startOffset != -1;
              assert endOffset != -1;
            } else {
              startOffset = -1;
              endOffset = -1;
            }
            assert !hasPositions || pos >= 0;
            addPosition(pos, startOffset, endOffset);
          }
        } else {
          if (!startedField) {
            assert numTerms > 0;
            startField(fieldInfo, numTerms, hasPositions, hasOffsets);
            startTerm(termsEnum.term(), freq);
            startedField = true;
          }
        }
      }
      assert termCount == numTerms;
    }
  }