/** * Find words for a more-like-this query former. * * @param docNum the id of the lucene document from which to find terms */ private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException { Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>(); for (String fieldName : fieldNames) { final Fields vectors = ir.getTermVectors(docNum); final Terms vector; if (vectors != null) { vector = vectors.terms(fieldName); } else { vector = null; } // field does not store term vector info if (vector == null) { Document d = ir.document(docNum); IndexableField[] fields = d.getFields(fieldName); for (IndexableField field : fields) { final String stringValue = field.stringValue(); if (stringValue != null) { addTermFrequencies(new StringReader(stringValue), field2termFreqMap, fieldName); } } } else { addTermFrequencies(field2termFreqMap, vector, fieldName); } } return createQueue(field2termFreqMap); }
/**
 * Builds a query matching documents similar to the content of the given readers. Accepting
 * several readers allows multi-value fields to be handled: each reader contributes its terms to
 * the same field.
 *
 * @param fieldName field name passed along with every reader when collecting term frequencies
 * @param readers the readers supplying the content to find similar documents for
 * @return a query that will return docs like the passed Readers.
 * @throws IOException if collecting term frequencies from a reader fails
 */
public Query like(String fieldName, Reader... readers) throws IOException {
  final Map<String, Map<String, Int>> termFreqsByField = new HashMap<>();
  // Accumulate every reader's terms into the same per-field frequency map.
  for (int i = 0; i < readers.length; i++) {
    addTermFrequencies(readers[i], termFreqsByField, fieldName);
  }
  return createQuery(createQueue(termFreqsByField));
}
/**
 * Find words for a more-like-this query former, taking the document content from an in-memory
 * map of field values rather than from the index.
 *
 * <p>Only the fields listed in {@code fieldNames} are considered. For each of them, the values
 * stored under that same name in {@code field2fieldValues} are converted with
 * {@link String#valueOf(Object)} and their term frequencies are recorded under that field name.
 * Fields with no entry (or a {@code null} collection) and {@code null} values are skipped.
 *
 * @param field2fieldValues the document content, keyed by field name
 * @return the queue built by {@code createQueue} from the collected per-field term frequencies
 * @throws IOException if collecting term frequencies from a value fails
 */
private PriorityQueue<ScoreTerm> retrieveTerms(Map<String, Collection<Object>> field2fieldValues)
    throws IOException {
  Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
  for (String fieldName : fieldNames) {
    // Look up the values of this field only. Iterating over every key of the map here would
    // attribute the values of all fields to each configured field name.
    Collection<Object> fieldValues = field2fieldValues.get(fieldName);
    if (fieldValues == null) {
      continue;
    }
    for (Object fieldValue : fieldValues) {
      if (fieldValue != null) {
        addTermFrequencies(
            new StringReader(String.valueOf(fieldValue)), field2termFreqMap, fieldName);
      }
    }
  }
  return createQueue(field2termFreqMap);
}
/**
 * Find words for a more-like-this query former, reading the document content from a reader.
 *
 * <p>The reader's content is analyzed under {@code fieldName}, the resulting term frequencies are
 * collected into a per-field map, and that map is handed to {@code createQueue} to produce the
 * queue of {@code ScoreTerm} entries.
 *
 * <p>For an easier method to call see {@link #retrieveInterestingTerms
 * retrieveInterestingTerms()}.
 *
 * @param r the reader that has the content of the document
 * @param fieldName field passed to the analyzer to use when analyzing the content
 * @return the most interesting words in the document ordered by score, with the highest scoring,
 *     or best entry, first
 * @throws IOException if collecting term frequencies from the reader fails
 * @see #retrieveInterestingTerms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(Reader r, String fieldName) throws IOException {
  final Map<String, Map<String, Int>> perFieldTermFreqs = new HashMap<>();
  addTermFrequencies(r, perFieldTermFreqs, fieldName);
  return createQueue(perFieldTermFreqs);
}
/**
 * Find words for a more-like-this query former, reading the document content from a reader.
 *
 * <p>The reader's content is analyzed under {@code fieldName}, the resulting term frequencies are
 * collected into a single word-to-frequency map, and that map is handed to {@code createQueue}
 * to produce the queue of {@code ScoreTerm} entries.
 *
 * <p>For an easier method to call see {@link #retrieveInterestingTerms
 * retrieveInterestingTerms()}.
 *
 * @param r the reader that has the content of the document
 * @param fieldName field passed to the analyzer to use when analyzing the content
 * @return the most interesting words in the document ordered by score, with the highest scoring,
 *     or best entry, first
 * @throws IOException if collecting term frequencies from the reader fails
 * @see #retrieveInterestingTerms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(Reader r, String fieldName) throws IOException {
  final Map<String, Int> termFreqs = new HashMap<>();
  addTermFrequencies(r, termFreqs, fieldName);
  return createQueue(termFreqs);
}