/*
   *  Utility function to display a term vector.
   */
  static void termVectorDisplay(Terms terms) throws IOException {

    if ((terms == null) || (terms.size() == -1)) System.out.println("    The field is not stored.");
    else {
      /*
       *  The terms for this field are stored.
       */
      System.out.println("    Vocabulary size: " + terms.size() + " terms");

      TermsEnum ithTerm = terms.iterator(null);

      /*
       *  Iterate over the terms in this document.
       *  Information about a term's occurrences (tf and
       *  positions) is accessed via the indexing API, which
       *  returns inverted lists that describe (only) the
       *  current document.
       */
      while (ithTerm.next() != null) {
        System.out.format(
            "      %10d %-20s %d ",
            ithTerm.ord(), ithTerm.term().utf8ToString(), ithTerm.totalTermFreq());

        DocsAndPositionsEnum currDoc = ithTerm.docsAndPositions(null, null);
        currDoc.nextDoc();

        for (int jthPosition = 0; jthPosition < ithTerm.totalTermFreq(); jthPosition++)
          System.out.print(currDoc.nextPosition() + " ");

        System.out.println();
      }
      ;
    }
    ;
  }
  /*
   *  listTermDictionary displays the term dictionary for a field.
   */
  static void listTermDictionary(IndexReader reader, String fieldName) throws IOException {

    System.out.println("\nTerm Dictionary:  field " + fieldName);

    /*
      Grant says:
      MultiFields.getTerms(IndexReader, fieldName)
    */

    Terms terms = MultiFields.getTerms(reader, fieldName);

    if ((terms == null) || (terms.size() == -1))
      System.out.println("    The term dictionary is empty.");
    else {
      System.out.println("    Vocabulary size: " + terms.size() + " terms");

      TermsEnum ithTerm = terms.iterator(null);

      /*
       *  Iterate over the terms in this document.
       *  Information about a term's occurrences (tf and
       *  positions) is accessed via the indexing API, which
       *  returns inverted lists that describe (only) the
       *  current document.
       */
      while (ithTerm.next() != null) {
        System.out.format(
            "      %-30s %d %d\n",
            ithTerm.term().utf8ToString(), ithTerm.docFreq(), ithTerm.totalTermFreq());
      }
      ;
    }
    ;
  }
Exemple #3
0
  public SparseInstances readIndex(String indexPath, String destFile, int threshold)
      throws Exception {

    if (indexPath == null || destFile == null) {
      System.out.println("error: indexPath or destFile is null\n");
      return null;
    }

    DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexPath)));
    Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(reviewKey);

    int capacity = (int) terms.size();
    HashMap<String, Integer> wordDict = new HashMap<>(capacity);
    capacity = capacity > 65535 ? 65535 : capacity;
    SparseInstances instData = new SparseInstances(capacity, reader.numDocs());
    TermsEnum termsEnum = terms.iterator();
    int index = 0;
    BytesRef term = null;
    String strTerm = null;
    while ((term = termsEnum.next()) != null) {
      strTerm = term.toString();
      if (termsEnum.totalTermFreq() < threshold) {
        continue;
      }
      if (strTerm.isEmpty()) {
        continue;
      }
      if (wordDict.get(strTerm) != null) {
        continue;
      }
      instData.addAttribute(strTerm);
      index++;
    }
    int numAtt = instData.numAttributes();
    int numInst = instData.numInstances();
    Integer attIndex = null;
    String id = null;
    int termIndex = 0;
    for (int docIndex = 0; docIndex < numInst; docIndex++) {
      id = reader.document(docIndex).getField(idKey).stringValue();
      Terms docTerms = reader.getTermVector(docIndex, reviewKey);
      if (docTerms == null) {
        continue;
      }
      int[] indices = new int[(int) docTerms.size()];
      double[] attValues = new double[(int) docTerms.size()];
      termsEnum = docTerms.iterator();
      termIndex = 0;
      while ((term = termsEnum.next()) != null) {
        strTerm = term.toString();
        attIndex = wordDict.get(strTerm);
        if (attIndex == null) {
          continue;
        }
        indices[termIndex] = attIndex.intValue();
        attValues[termIndex] = termsEnum.totalTermFreq();
      }
      ESparseInstance instance = new ESparseInstance(id, 1.0, attValues, indices, numAtt);
      instData.addInstance(instance);
    }

    return null;
  }