public void testGetTermVector() throws IOException {
    createIndexWithAlias();
    assertAcked(
        client()
            .admin()
            .indices()
            .preparePutMapping("test")
            .setType("type1")
            .setSource("field", "type=text,term_vector=with_positions_offsets_payloads")
            .get());
    ensureYellow("test");

    client()
        .prepareIndex(indexOrAlias(), "type1", "1")
        .setSource("field", "the quick brown fox jumps over the lazy dog")
        .get();
    refresh();

    TermVectorsResponse termVectorsResponse =
        client().prepareTermVectors(indexOrAlias(), "type1", "1").get();
    assertThat(termVectorsResponse.getIndex(), equalTo("test"));
    assertThat(termVectorsResponse.isExists(), equalTo(true));
    Fields fields = termVectorsResponse.getFields();
    assertThat(fields.size(), equalTo(1));
    assertThat(fields.terms("field").size(), equalTo(8L));
  }
  private void checkAllInfo(
      int numDocs,
      String[] values,
      int[] freq,
      int[][] pos,
      int[][] startOffset,
      int[][] endOffset,
      int i)
      throws IOException {
    TermVectorsRequestBuilder resp =
        client()
            .prepareTermVectors("test", "type1", Integer.toString(i))
            .setPayloads(true)
            .setOffsets(true)
            .setPositions(true)
            .setFieldStatistics(true)
            .setTermStatistics(true)
            .setSelectedFields();
    assertThat(resp.request().fieldStatistics(), equalTo(true));
    TermVectorsResponse response = resp.execute().actionGet();
    assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
    Fields fields = response.getFields();
    assertThat(fields.size(), equalTo(1));
    Terms terms = fields.terms("field");
    assertThat(terms.size(), equalTo(8l));
    assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
    assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
    assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
    TermsEnum iterator = terms.iterator();
    for (int j = 0; j < values.length; j++) {
      String string = values[j];
      BytesRef next = iterator.next();
      assertThat(next, Matchers.notNullValue());
      assertThat("expected " + string, string, equalTo(next.utf8ToString()));
      assertThat(next, Matchers.notNullValue());
      if (string.equals("the")) {
        assertThat(
            "expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq()));
      } else {
        assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq()));
      }

      PostingsEnum docsAndPositions = iterator.postings(null, null, PostingsEnum.ALL);
      assertThat(docsAndPositions.nextDoc(), equalTo(0));
      assertThat(freq[j], equalTo(docsAndPositions.freq()));
      assertThat(iterator.docFreq(), equalTo(numDocs));
      int[] termPos = pos[j];
      int[] termStartOffset = startOffset[j];
      int[] termEndOffset = endOffset[j];
      assertThat(termPos.length, equalTo(freq[j]));
      assertThat(termStartOffset.length, equalTo(freq[j]));
      assertThat(termEndOffset.length, equalTo(freq[j]));
      for (int k = 0; k < freq[j]; k++) {
        int nextPosition = docsAndPositions.nextPosition();
        assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
        assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
        assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
        assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
      }
    }
    assertThat(iterator.next(), Matchers.nullValue());

    XContentBuilder xBuilder = XContentFactory.jsonBuilder();
    xBuilder.startObject();
    response.toXContent(xBuilder, ToXContent.EMPTY_PARAMS);
    xBuilder.endObject();
    BytesStream bytesStream = xBuilder.bytesStream();
    String utf8 = bytesStream.bytes().toUtf8().replaceFirst("\"took\":\\d+,", "");
    ;
    String expectedString =
        "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\""
            + i
            + "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
    assertThat(utf8, equalTo(expectedString));
  }
  public static void main(String[] args) throws IOException {

    IndexReader reader = null;

    /*
     *  Opening the index first simplifies the processing of the
     *  rest of the command line arguments.
     */
    for (int i = 0; i < args.length; i++) {
      if (("-index".equals(args[i])) && ((i + 1) < args.length)) {
        reader = DirectoryReader.open(FSDirectory.open(new File(args[i + 1])));

        if (reader == null) {
          System.err.println("Error:  Can't open index " + args[i + 1]);
          System.exit(1);
        }
        ;

        break;
      }
      ;
    }
    ;

    if (reader == null) {
      System.err.println(usage);
      System.exit(1);
    }
    ;

    /*
     *  Process the command line arguments sequentially.
     */
    for (int i = 0; i < args.length; i++) {

      if ("-index".equals(args[i])) {

        /*
         *  Handled in the previous loop, so just skip the argument.
         */
        i++;

      } else if ("-list-edocid".equals(args[i])) {

        System.out.println("-list-edocid:");

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        Document d = reader.document(Integer.parseInt(args[i + 1]));

        System.out.println(
            "Internal docid --> External docid: " + args[i + 1] + " --> " + d.get("externalId"));

        i += 1;
      } else if ("-list-docids".equals(args[i])) {

        System.out.println("-list-docids:");

        for (int j = 0; j < reader.numDocs(); j++) {
          Document d = reader.document(j);
          System.out.println("Internal --> external docid: " + j + " --> " + d.get("externalId"));
        }
        ;

      } else if ("-list-fields".equals(args[i])) {

        Fields fields = MultiFields.getFields(reader);

        System.out.print("\nNumber of fields:  ");

        if (fields == null) System.out.println("0");
        else {
          System.out.println(fields.size());

          Iterator<String> is = fields.iterator();

          while (is.hasNext()) {
            System.out.println("\t" + is.next());
          }
          ;
        }
        ;

      } else if ("-list-postings".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listPostings(reader, args[i + 1], args[i + 2], Integer.MAX_VALUE);
        i += 2;

      } else if ("-list-postings-sample".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listPostings(reader, args[i + 1], args[i + 2], 5);
        i += 2;

      } else if ("-list-stats".equals(args[i])) {

        System.out.println("Corpus statistics:");
        System.out.println("\tnumdocs\t\t" + reader.numDocs());
        System.out.println(
            "\turl:\t"
                + "\tnumdocs="
                + reader.getDocCount("url")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("url")
                + "\tavglen="
                + reader.getSumTotalTermFreq("url") / (float) reader.getDocCount("url"));

        System.out.println(
            "\tkeywords:"
                + "\tnumdocs="
                + reader.getDocCount("keywords")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("keywords")
                + "\tavglen="
                + reader.getSumTotalTermFreq("keywords") / (float) reader.getDocCount("keywords"));

        System.out.println(
            "\ttitle:\t"
                + "\tnumdocs="
                + reader.getDocCount("title")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("title")
                + "\tavglen="
                + reader.getSumTotalTermFreq("title") / (float) reader.getDocCount("title"));

        System.out.println(
            "\tbody:\t"
                + "\tnumdocs="
                + reader.getDocCount("body")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("body")
                + "\tavglen="
                + reader.getSumTotalTermFreq("body") / (float) reader.getDocCount("body"));

        System.out.println(
            "\tinlink:\t"
                + "\tnumdocs="
                + reader.getDocCount("inlink")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("inlink")
                + "\tavglen="
                + reader.getSumTotalTermFreq("inlink") / (float) reader.getDocCount("inlink"));

      } else if ("-list-terms".equals(args[i])) {

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listTermDictionary(reader, args[i + 1]);
        i += 1;

      } else if ("-list-termvector".equals(args[i])) {

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listTermVectors(reader, args[i + 1]);
        i += 1;

      } else if ("-list-termvector-field".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listTermVectorField(reader, args[i + 1], args[i + 2]);
        i += 2;

      } else System.err.println("\nWarning:  Unknown argument " + args[i] + " ignored.");
    }
    ;

    /*
     *  Close the index and exit gracefully.
     */
    reader.close();
  }
  public static void verifyEquals(Fields d1, Fields d2) throws IOException {
    if (d1 == null) {
      assertTrue(d2 == null || d2.size() == 0);
      return;
    }
    assertTrue(d2 != null);

    Iterator<String> fieldsEnum2 = d2.iterator();

    for (String field1 : d1) {
      String field2 = fieldsEnum2.next();
      assertEquals(field1, field2);

      Terms terms1 = d1.terms(field1);
      assertNotNull(terms1);
      TermsEnum termsEnum1 = terms1.iterator(null);

      Terms terms2 = d2.terms(field2);
      assertNotNull(terms2);
      TermsEnum termsEnum2 = terms2.iterator(null);

      DocsAndPositionsEnum dpEnum1 = null;
      DocsAndPositionsEnum dpEnum2 = null;
      DocsEnum dEnum1 = null;
      DocsEnum dEnum2 = null;

      BytesRef term1;
      while ((term1 = termsEnum1.next()) != null) {
        BytesRef term2 = termsEnum2.next();
        assertEquals(term1, term2);
        assertEquals(termsEnum1.totalTermFreq(), termsEnum2.totalTermFreq());

        dpEnum1 = termsEnum1.docsAndPositions(null, dpEnum1);
        dpEnum2 = termsEnum2.docsAndPositions(null, dpEnum2);
        if (dpEnum1 != null) {
          assertNotNull(dpEnum2);
          int docID1 = dpEnum1.nextDoc();
          dpEnum2.nextDoc();
          // docIDs are not supposed to be equal
          // int docID2 = dpEnum2.nextDoc();
          // assertEquals(docID1, docID2);
          assertTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);

          int freq1 = dpEnum1.freq();
          int freq2 = dpEnum2.freq();
          assertEquals(freq1, freq2);
          OffsetAttribute offsetAtt1 =
              dpEnum1.attributes().hasAttribute(OffsetAttribute.class)
                  ? dpEnum1.attributes().getAttribute(OffsetAttribute.class)
                  : null;
          OffsetAttribute offsetAtt2 =
              dpEnum2.attributes().hasAttribute(OffsetAttribute.class)
                  ? dpEnum2.attributes().getAttribute(OffsetAttribute.class)
                  : null;

          if (offsetAtt1 != null) {
            assertNotNull(offsetAtt2);
          } else {
            assertNull(offsetAtt2);
          }

          for (int posUpto = 0; posUpto < freq1; posUpto++) {
            int pos1 = dpEnum1.nextPosition();
            int pos2 = dpEnum2.nextPosition();
            assertEquals(pos1, pos2);
            if (offsetAtt1 != null) {
              assertEquals(offsetAtt1.startOffset(), offsetAtt2.startOffset());
              assertEquals(offsetAtt1.endOffset(), offsetAtt2.endOffset());
            }
          }
          assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum1.nextDoc());
          assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum2.nextDoc());
        } else {
          dEnum1 = TestUtil.docs(random(), termsEnum1, null, dEnum1, DocsEnum.FLAG_FREQS);
          dEnum2 = TestUtil.docs(random(), termsEnum2, null, dEnum2, DocsEnum.FLAG_FREQS);
          assertNotNull(dEnum1);
          assertNotNull(dEnum2);
          int docID1 = dEnum1.nextDoc();
          dEnum2.nextDoc();
          // docIDs are not supposed to be equal
          // int docID2 = dEnum2.nextDoc();
          // assertEquals(docID1, docID2);
          assertTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);
          int freq1 = dEnum1.freq();
          int freq2 = dEnum2.freq();
          assertEquals(freq1, freq2);
          assertEquals(DocIdSetIterator.NO_MORE_DOCS, dEnum1.nextDoc());
          assertEquals(DocIdSetIterator.NO_MORE_DOCS, dEnum2.nextDoc());
        }
      }

      assertNull(termsEnum2.next());
    }
    assertFalse(fieldsEnum2.hasNext());
  }
Beispiel #5
0
  /**
   * Safe (but, slowish) default method to write every vector field in the document. This default
   * implementation requires that the vectors implement both Fields.size and Terms.size.
   */
  protected final void addAllDocVectors(Fields vectors, FieldInfos fieldInfos) throws IOException {
    if (vectors == null) {
      startDocument(0);
      return;
    }

    final int numFields = vectors.size();
    if (numFields == -1) {
      throw new IllegalStateException("vectors.size() must be implemented (it returned -1)");
    }
    startDocument(numFields);

    final FieldsEnum fieldsEnum = vectors.iterator();
    String fieldName;
    String lastFieldName = null;

    while ((fieldName = fieldsEnum.next()) != null) {
      final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldName);

      assert lastFieldName == null || fieldName.compareTo(lastFieldName) > 0
          : "lastFieldName=" + lastFieldName + " fieldName=" + fieldName;
      lastFieldName = fieldName;

      final Terms terms = fieldsEnum.terms();
      if (terms == null) {
        // FieldsEnum shouldn't lie...
        continue;
      }
      final int numTerms = (int) terms.size();
      if (numTerms == -1) {
        throw new IllegalStateException("terms.size() must be implemented (it returned -1)");
      }
      final TermsEnum termsEnum = terms.iterator(null);

      DocsAndPositionsEnum docsAndPositionsEnum = null;

      boolean startedField = false;

      // NOTE: this is tricky, because TermVectors allow
      // indexing offsets but NOT positions.  So we must
      // lazily init the field by checking whether first
      // position we see is -1 or not.

      int termCount = 0;
      while (termsEnum.next() != null) {
        termCount++;

        final int freq = (int) termsEnum.totalTermFreq();

        if (startedField) {
          startTerm(termsEnum.term(), freq);
        }

        // TODO: we need a "query" API where we can ask (via
        // flex API) what this term was indexed with...
        // Both positions & offsets:
        docsAndPositionsEnum = termsEnum.docsAndPositions(null, null, true);
        final boolean hasOffsets;
        boolean hasPositions = false;
        if (docsAndPositionsEnum == null) {
          // Fallback: no offsets
          docsAndPositionsEnum = termsEnum.docsAndPositions(null, null, false);
          hasOffsets = false;
        } else {
          hasOffsets = true;
        }

        if (docsAndPositionsEnum != null) {
          final int docID = docsAndPositionsEnum.nextDoc();
          assert docID != DocIdSetIterator.NO_MORE_DOCS;
          assert docsAndPositionsEnum.freq() == freq;

          for (int posUpto = 0; posUpto < freq; posUpto++) {
            final int pos = docsAndPositionsEnum.nextPosition();
            if (!startedField) {
              assert numTerms > 0;
              hasPositions = pos != -1;
              startField(fieldInfo, numTerms, hasPositions, hasOffsets);
              startTerm(termsEnum.term(), freq);
              startedField = true;
            }
            final int startOffset;
            final int endOffset;
            if (hasOffsets) {
              startOffset = docsAndPositionsEnum.startOffset();
              endOffset = docsAndPositionsEnum.endOffset();
              assert startOffset != -1;
              assert endOffset != -1;
            } else {
              startOffset = -1;
              endOffset = -1;
            }
            assert !hasPositions || pos >= 0;
            addPosition(pos, startOffset, endOffset);
          }
        } else {
          if (!startedField) {
            assert numTerms > 0;
            startField(fieldInfo, numTerms, hasPositions, hasOffsets);
            startTerm(termsEnum.term(), freq);
            startedField = true;
          }
        }
      }
      assert termCount == numTerms;
    }
  }