Example #1
  /*
   *  listTermDictionary displays the term dictionary for a field.
   */
  static void listTermDictionary(IndexReader reader, String fieldName) throws IOException {

    System.out.println("\nTerm Dictionary:  field " + fieldName);

    /*
     *  MultiFields.getTerms returns a merged view of the
     *  field's term dictionary across all index segments.
     */

    Terms terms = MultiFields.getTerms(reader, fieldName);

    if ((terms == null) || (terms.size() == -1))
      System.out.println("    The term dictionary is empty.");
    else {
      System.out.println("    Vocabulary size: " + terms.size() + " terms");

      // Lucene 4.x API: the argument is a TermsEnum to reuse; null requests a new one.
      TermsEnum ithTerm = terms.iterator(null);

      /*
       *  Iterate over the terms in the dictionary for this
       *  field.  docFreq is the number of documents that
       *  contain the term; totalTermFreq is the term's total
       *  number of occurrences across the index.
       */
      while (ithTerm.next() != null) {
        System.out.format(
            "      %-30s %d %d\n",
            ithTerm.term().utf8ToString(), ithTerm.docFreq(), ithTerm.totalTermFreq());
      }
    }
  }
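For reference, a minimal sketch of how listTermDictionary might be called, assuming Lucene 4.x (matching the iterator(null) API above); the index path and field name are placeholders:

  import java.io.File;
  import org.apache.lucene.index.DirectoryReader;
  import org.apache.lucene.index.IndexReader;
  import org.apache.lucene.store.FSDirectory;

  // "/path/to/index" and "body" are hypothetical; substitute a real index and field.
  IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("/path/to/index")));
  listTermDictionary(reader, "body");
  reader.close();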
Example #2
  /*
   *  Utility function to display a term vector.
   */
  static void termVectorDisplay(Terms terms) throws IOException {

    if ((terms == null) || (terms.size() == -1))
      System.out.println("    The field is not stored.");
    else {
      /*
       *  The terms for this field are stored.
       */
      System.out.println("    Vocabulary size: " + terms.size() + " terms");

      TermsEnum ithTerm = terms.iterator(null);

      /*
       *  Iterate over the terms in this document.
       *  Information about a term's occurrences (tf and
       *  positions) is accessed via the indexing API, which
       *  returns inverted lists that describe (only) the
       *  current document.
       */
      while (ithTerm.next() != null) {
        System.out.format(
            "      %10d %-20s %d ",
            ithTerm.ord(), ithTerm.term().utf8ToString(), ithTerm.totalTermFreq());

        /*
         *  A term vector describes a single document, so one nextDoc()
         *  call positions the enum on that document.  docsAndPositions
         *  returns null if positions were not stored with the vector.
         */
        DocsAndPositionsEnum currDoc = ithTerm.docsAndPositions(null, null);

        if (currDoc != null) {
          currDoc.nextDoc();

          for (int jthPosition = 0; jthPosition < ithTerm.totalTermFreq(); jthPosition++)
            System.out.print(currDoc.nextPosition() + " ");
        }

        System.out.println();
      }
    }
  }
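The Terms argument typically comes from a document's term vector; a minimal sketch, assuming a reader opened as in the previous example and a hypothetical docId and field name:

  // getTermVector returns null if the field was not indexed with term vectors.
  Terms terms = reader.getTermVector(docId, "body");
  if (terms != null)
    termVectorDisplay(terms);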
Example #3
  public SparseInstances readIndex(String indexPath, String destFile, int threshold)
      throws Exception {

    if (indexPath == null || destFile == null) {
      System.out.println("error: indexPath or destFile is null\n");
      return null;
    }

    DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexPath)));
    Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(reviewKey);

    int capacity = (int) terms.size();
    if (capacity < 0) {
      capacity = 1024;  // Terms.size() may return -1 when unknown (e.g., composite views)
    }
    HashMap<String, Integer> wordDict = new HashMap<>(capacity);
    capacity = Math.min(capacity, 65535);
    SparseInstances instData = new SparseInstances(capacity, reader.numDocs());
    TermsEnum termsEnum = terms.iterator();
    int index = 0;
    BytesRef term = null;
    String strTerm = null;
    // First pass: build the vocabulary, skipping rare, empty, and duplicate terms.
    while ((term = termsEnum.next()) != null) {
      strTerm = term.utf8ToString();  // toString() would print raw bytes, not the term text
      if (termsEnum.totalTermFreq() < threshold) {
        continue;
      }
      if (strTerm.isEmpty()) {
        continue;
      }
      if (wordDict.containsKey(strTerm)) {
        continue;
      }
      instData.addAttribute(strTerm);
      wordDict.put(strTerm, index);  // record the attribute index for the second pass
      index++;
    }
    int numAtt = instData.numAttributes();
    int numInst = instData.numInstances();
    Integer attIndex = null;
    String id = null;
    int termIndex = 0;
    // Second pass: build one sparse instance per document from its term vector.
    for (int docIndex = 0; docIndex < numInst; docIndex++) {
      id = reader.document(docIndex).getField(idKey).stringValue();
      Terms docTerms = reader.getTermVector(docIndex, reviewKey);
      if (docTerms == null) {
        continue;
      }
      int[] indices = new int[(int) docTerms.size()];
      double[] attValues = new double[(int) docTerms.size()];
      termsEnum = docTerms.iterator();
      termIndex = 0;
      while ((term = termsEnum.next()) != null) {
        strTerm = term.utf8ToString();
        attIndex = wordDict.get(strTerm);
        if (attIndex == null) {
          continue;
        }
        indices[termIndex] = attIndex.intValue();
        attValues[termIndex] = termsEnum.totalTermFreq();
        termIndex++;  // move to the next slot in the sparse arrays
      }
      ESparseInstance instance = new ESparseInstance(id, 1.0, attValues, indices, numAtt);
      instData.addInstance(instance);
    }

    reader.close();
    return instData;
  }
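SparseInstances, ESparseInstance, reviewKey, and idKey are project-specific, so the call below is hypothetical; the paths and the frequency threshold are placeholders:

  SparseInstances data = readIndex("/path/to/index", "/path/to/out.dat", 5);
  if (data != null)
    System.out.println(data.numAttributes() + " attributes, " + data.numInstances() + " instances");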
Example #4
  public void testMerge() throws IOException {
    final Codec codec = Codec.getDefault();
    final SegmentInfo si =
        new SegmentInfo(
            mergedDir,
            Version.LATEST,
            mergedSegment,
            -1,
            false,
            codec,
            Collections.emptyMap(),
            StringHelper.randomId(),
            new HashMap<>());

    SegmentMerger merger =
        new SegmentMerger(
            Arrays.<CodecReader>asList(reader1, reader2),
            si,
            InfoStream.getDefault(),
            mergedDir,
            new FieldInfos.FieldNumbers(),
            newIOContext(random(), new IOContext(new MergeInfo(-1, -1, false, -1))));
    MergeState mergeState = merger.merge();
    int docsMerged = mergeState.segmentInfo.maxDoc();
    assertEquals(2, docsMerged);
    // Should be able to open a new SegmentReader against the new directory
    SegmentReader mergedReader =
        new SegmentReader(
            new SegmentCommitInfo(mergeState.segmentInfo, 0, -1L, -1L, -1L),
            newIOContext(random()));
    assertNotNull(mergedReader);
    assertEquals(2, mergedReader.numDocs());
    Document newDoc1 = mergedReader.document(0);
    assertNotNull(newDoc1);
    // There are 2 unstored fields on the document
    assertEquals(
        DocHelper.numFields(doc1) - DocHelper.unstored.size(), DocHelper.numFields(newDoc1));
    Document newDoc2 = mergedReader.document(1);
    assertNotNull(newDoc2);
    assertEquals(
        DocHelper.numFields(doc2) - DocHelper.unstored.size(), DocHelper.numFields(newDoc2));

    PostingsEnum termDocs =
        TestUtil.docs(
            random(), mergedReader, DocHelper.TEXT_FIELD_2_KEY, new BytesRef("field"), null, 0);
    assertNotNull(termDocs);
    assertTrue(termDocs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);

    int tvCount = 0;
    for (FieldInfo fieldInfo : mergedReader.getFieldInfos()) {
      if (fieldInfo.hasVectors()) {
        tvCount++;
      }
    }

    // System.out.println("stored size: " + stored.size());
    assertEquals("We do not have 3 fields that were indexed with term vector", 3, tvCount);

    Terms vector = mergedReader.getTermVectors(0).terms(DocHelper.TEXT_FIELD_2_KEY);
    assertNotNull(vector);
    assertEquals(3, vector.size());
    TermsEnum termsEnum = vector.iterator();

    int i = 0;
    while (termsEnum.next() != null) {
      String term = termsEnum.term().utf8ToString();
      int freq = (int) termsEnum.totalTermFreq();
      // System.out.println("Term: " + term + " Freq: " + freq);
      assertTrue(DocHelper.FIELD_2_TEXT.indexOf(term) != -1);
      assertEquals(DocHelper.FIELD_2_FREQS[i], freq);
      i++;
    }

    TestSegmentReader.checkNorms(mergedReader);
    mergedReader.close();
  }
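This test targets the newer Lucene API, where DocsAndPositionsEnum has been folded into PostingsEnum. A minimal sketch of reading positions from a term vector in that API, assuming the field was indexed with term vectors and positions; the field name is a placeholder:

  Terms tv = mergedReader.getTermVectors(0).terms("body");  // hypothetical field name
  TermsEnum te = tv.iterator();
  while (te.next() != null) {
    // Request positions explicitly; the default flags return only doc/freq.
    PostingsEnum pe = te.postings(null, PostingsEnum.POSITIONS);
    pe.nextDoc();  // a term vector describes exactly one document
    for (int j = 0; j < pe.freq(); j++)
      System.out.println(te.term().utf8ToString() + " @ " + pe.nextPosition());
  }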