Exemplo n.º 1
0
 public static Map<String, Integer> termFrequencies(
     IndexSearcher indexSearcher,
     Query documentFilterQuery,
     String fieldName,
     String propName,
     String altName) {
   try {
     String luceneField = ComplexFieldUtil.propertyField(fieldName, propName, altName);
     Weight weight = indexSearcher.createNormalizedWeight(documentFilterQuery, false);
     Map<String, Integer> freq = new HashMap<>();
     IndexReader indexReader = indexSearcher.getIndexReader();
     for (LeafReaderContext arc : indexReader.leaves()) {
       if (weight == null) throw new RuntimeException("weight == null");
       if (arc == null) throw new RuntimeException("arc == null");
       if (arc.reader() == null) throw new RuntimeException("arc.reader() == null");
       Scorer scorer = weight.scorer(arc, arc.reader().getLiveDocs());
       if (scorer != null) {
         while (scorer.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
           getFrequenciesFromTermVector(
               indexReader, scorer.docID() + arc.docBase, luceneField, freq);
         }
       }
     }
     return freq;
   } catch (IOException e) {
     throw ExUtil.wrapRuntimeException(e);
   }
 }
Exemplo n.º 2
0
  /**
   * Add term frequencies for a single document to a frequency map.
   *
   * @param reader the index
   * @param doc doc id
   * @param luceneName the index field from which to use the term vector
   * @param freq where to add to the token frequencies
   */
  public static void getFrequenciesFromTermVector(
      IndexReader reader, int doc, String luceneName, Map<String, Integer> freq) {
    try {
      org.apache.lucene.index.Terms terms = reader.getTermVector(doc, luceneName);
      if (terms == null) {
        throw new IllegalArgumentException("Field " + luceneName + " has no Terms");
      }
      TermsEnum termsEnum = terms.iterator();

      // Verzamel concordantiewoorden uit term vector
      PostingsEnum postingsEnum = null;
      while (termsEnum.next() != null) {
        postingsEnum = termsEnum.postings(null, postingsEnum, PostingsEnum.FREQS);
        String term = termsEnum.term().utf8ToString();
        Integer n = freq.get(term);
        if (n == null) {
          n = 0;
        }
        while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          n += termsEnum.docFreq();
        }
        freq.put(term, n);
      }
    } catch (Exception e) {
      throw ExUtil.wrapRuntimeException(e);
    }
  }
Exemplo n.º 3
0
  /**
   * Read Properties from the specified file
   *
   * @param file the file to read
   * @return the Properties read
   */
  public static Properties readFromFile(File file) {
    try {
      if (!file.isFile()) {
        throw new RuntimeException(
            "Property file "
                + file.getCanonicalPath()
                + " does not exist or is not a regular file!");
      }

      Reader in = new BufferedReader(new FileReader(file));
      try {
        Properties properties = new Properties();
        properties.load(in);
        return properties;
      } finally {
        in.close();
      }
    } catch (Exception e) {
      throw ExUtil.wrapRuntimeException(e);
    }
  }
Exemplo n.º 4
0
  /**
   * Get all words between the specified start and end positions from the term vector.
   *
   * <p>NOTE: this may return an array of less than the size requested, if the document ends before
   * the requested end position.
   *
   * @param reader the index
   * @param doc doc id
   * @param luceneName the index field from which to use the term vector
   * @param start start position (first word we want to request)
   * @param end end position (last word we want to request)
   * @param partialOk is it okay if we're missing words in the middle, or do we need them all?
   *     (debug)
   * @return the words found, in order
   */
  public static String[] getWordsFromTermVector(
      IndexReader reader, int doc, String luceneName, int start, int end, boolean partialOk) {

    // Retrieve the term position vector of the contents of this document.
    // NOTE: might be faster to retrieve all term vectors at once

    try {
      org.apache.lucene.index.Terms terms = reader.getTermVector(doc, luceneName);
      if (terms == null) {
        throw new IllegalArgumentException("Field " + luceneName + " has no Terms");
      }
      if (!terms.hasPositions())
        throw new IllegalArgumentException(
            "Field " + luceneName + " has no character postion information");
      // String[] docTerms = new String[(int) terms.size()];
      // final List<BytesRef> termsList = new ArrayList<BytesRef>();
      TermsEnum termsEnum = terms.iterator();

      // Verzamel concordantiewoorden uit term vector
      PostingsEnum docPosEnum = null;
      int numFound = 0;
      String[] concordanceWords = new String[end - start + 1];
      while (termsEnum.next() != null) {
        docPosEnum = termsEnum.postings(null, docPosEnum, PostingsEnum.POSITIONS);
        while (docPosEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          // NOTE: .docId() will always return 0 in this case
          // if (docPosEnum.docID() != doc)
          //	throw new RuntimeException("Wrong doc id: " + docPosEnum.docID() + " (expected " + doc
          // + ")");
          for (int i = 0; i < docPosEnum.freq(); i++) {
            int position = docPosEnum.nextPosition();
            if (position == -1)
              throw new RuntimeException(
                  "Unexpected missing position (i="
                      + i
                      + ", docPosEnum.freq() = "
                      + docPosEnum.freq()
                      + ")");
            if (position >= start && position <= end) {
              if (concordanceWords[position - start] == null)
                concordanceWords[position - start] = termsEnum.term().utf8ToString();
              else concordanceWords[position - start] += "|" + termsEnum.term().utf8ToString();
              numFound++;
            }
          }
          if (numFound == concordanceWords.length) return concordanceWords;
        }
      }

      if (numFound < concordanceWords.length && !partialOk) {
        // If we simply ran into the end of the document, that's okay;
        // but if words are missing in the middle, that's not.
        String[] partial = new String[numFound];
        for (int i = 0; i < numFound; i++) {
          partial[i] = concordanceWords[i];
          if (partial[i] == null) {
            throw new RuntimeException(
                "Not all words found ("
                    + numFound
                    + " out of "
                    + concordanceWords.length
                    + "); missing words in the middle of concordance!");
          }
        }
        return partial;
      }
      return concordanceWords;
    } catch (Exception e) {
      throw ExUtil.wrapRuntimeException(e);
    }
  }