Пример #1
0
  /**
   * Collects the term frequencies of one indexed document, field by field, and turns them into a
   * queue of scored terms.
   *
   * <p>For every name in {@code fieldNames} the field's term vector is used when the reader
   * returns one. Otherwise the document's fields of that name are fetched and each non-null string
   * value is handed to {@code addTermFrequencies} through a {@link StringReader}.
   *
   * @param docNum the id of the lucene document from which to find terms
   * @return the queue built by {@code createQueue} from the collected per-field term frequencies
   * @throws IOException if the index reader or one of the {@code addTermFrequencies} /
   *     {@code createQueue} helpers throws it
   */
  private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
    Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();

    // The term vectors are looked up by document only, so fetch them once instead of once per
    // field name.
    final Fields vectors = ir.getTermVectors(docNum);
    // Only needed for fields without a term vector; loaded on first use and then reused.
    Document storedDocument = null;

    for (String fieldName : fieldNames) {
      final Terms vector = (vectors != null) ? vectors.terms(fieldName) : null;

      if (vector != null) {
        addTermFrequencies(field2termFreqMap, vector, fieldName);
        continue;
      }

      // field does not store term vector info
      if (storedDocument == null) {
        storedDocument = ir.document(docNum);
      }
      IndexableField[] fields = storedDocument.getFields(fieldName);
      for (IndexableField field : fields) {
        final String stringValue = field.stringValue();
        if (stringValue != null) {
          addTermFrequencies(new StringReader(stringValue), field2termFreqMap, fieldName);
        }
      }
    }

    return createQueue(field2termFreqMap);
  }
Пример #2
0
 /**
  * Builds a query for documents similar to the text supplied by the given readers. Every reader
  * is counted under the same field name, which is how the values of a multi-value field are
  * handled.
  *
  * @param fieldName field name forwarded to {@code addTermFrequencies} for each reader
  * @param readers the text sources; they are processed in the order given
  * @return the query created by {@code createQuery} from the accumulated term frequencies
  * @throws IOException if one of the helpers processing the readers throws it
  */
 public Query like(String fieldName, Reader... readers) throws IOException {
   final Map<String, Map<String, Int>> termFreqsByField = new HashMap<>();
   for (int i = 0; i < readers.length; i++) {
     // All readers contribute to the same per-field frequency map.
     addTermFrequencies(readers[i], termFreqsByField, fieldName);
   }
   return createQuery(createQueue(termFreqsByField));
 }
Пример #3
0
 /**
  * Builds the queue of scored terms from field values supplied by the caller.
  *
  * <p>Only the names in {@code fieldNames} are considered. For each of them, the values stored
  * under that same name in {@code field2fieldValues} are converted with
  * {@link String#valueOf(Object)} and handed to {@code addTermFrequencies}. Names without an entry
  * and {@code null} values are skipped.
  *
  * @param field2fieldValues maps a field name to the values of that field
  * @return the queue built by {@code createQueue} from the collected per-field term frequencies
  * @throws IOException if {@code addTermFrequencies} or {@code createQueue} throws it
  */
 private PriorityQueue<ScoreTerm> retrieveTerms(Map<String, Collection<Object>> field2fieldValues)
     throws IOException {
   Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
   for (String fieldName : fieldNames) {
     // Use only the values that belong to this field. Iterating over every key of the map here
     // would count each field's values under every configured field name.
     Collection<Object> fieldValues = field2fieldValues.get(fieldName);
     if (fieldValues == null) {
       continue;
     }
     for (Object fieldValue : fieldValues) {
       if (fieldValue != null) {
         addTermFrequencies(
             new StringReader(String.valueOf(fieldValue)), field2termFreqMap, fieldName);
       }
     }
   }
   return createQueue(field2termFreqMap);
 }
Пример #4
0
 /**
  * Computes the scored terms for text supplied through a reader.
  *
  * <p>The term frequencies gathered by {@code addTermFrequencies} are kept in a map keyed by
  * field name, and that map is handed to {@code createQueue}. Callers that only need the terms
  * themselves may prefer {@link #retrieveInterestingTerms retrieveInterestingTerms()}.
  *
  * @param r the reader that has the content of the document
  * @param fieldName field name forwarded to {@code addTermFrequencies} together with the reader
  * @return the queue of {@link ScoreTerm} entries produced by {@code createQueue}
  * @throws IOException if {@code addTermFrequencies} or {@code createQueue} throws it
  * @see #retrieveInterestingTerms
  */
 private PriorityQueue<ScoreTerm> retrieveTerms(Reader r, String fieldName) throws IOException {
   final Map<String, Map<String, Int>> termFreqsByField = new HashMap<>();

   // Fill the per-field frequency table from the reader, then score it.
   addTermFrequencies(r, termFreqsByField, fieldName);

   return createQueue(termFreqsByField);
 }
Пример #5
0
 /**
  * Computes the scored terms for text supplied through a reader.
  *
  * <p>The term frequencies gathered by {@code addTermFrequencies} are collected in a single map
  * keyed by term, and that map is handed to {@code createQueue}. Callers that only need the terms
  * themselves may prefer {@link #retrieveInterestingTerms retrieveInterestingTerms()}.
  *
  * @param r the reader that has the content of the document
  * @param fieldName field name forwarded to {@code addTermFrequencies} together with the reader
  * @return the queue of {@link ScoreTerm} entries produced by {@code createQueue}
  * @throws IOException if {@code addTermFrequencies} or {@code createQueue} throws it
  * @see #retrieveInterestingTerms
  */
 private PriorityQueue<ScoreTerm> retrieveTerms(Reader r, String fieldName) throws IOException {
   final Map<String, Int> termFrequencies = new HashMap<>();

   // Count the terms read from the reader, then score them.
   addTermFrequencies(r, termFrequencies, fieldName);

   return createQueue(termFrequencies);
 }