/**
 * Count term frequencies for a property field, over all documents matching a filter query.
 *
 * @param indexSearcher the index searcher
 * @param documentFilterQuery determines which documents to count term occurrences in
 * @param fieldName the complex field
 * @param propName the property of the complex field to count terms for
 * @param altName the property alternative to use (may be null)
 * @return map of term text to frequency
 */
public static Map<String, Integer> termFrequencies(
        IndexSearcher indexSearcher,
        Query documentFilterQuery,
        String fieldName,
        String propName,
        String altName) {
    try {
        String luceneField = ComplexFieldUtil.propertyField(fieldName, propName, altName);
        Weight weight = indexSearcher.createNormalizedWeight(documentFilterQuery, false);
        Map<String, Integer> freq = new HashMap<>();
        IndexReader indexReader = indexSearcher.getIndexReader();
        for (LeafReaderContext arc : indexReader.leaves()) {
            if (weight == null)
                throw new RuntimeException("weight == null");
            if (arc == null)
                throw new RuntimeException("arc == null");
            if (arc.reader() == null)
                throw new RuntimeException("arc.reader() == null");
            Scorer scorer = weight.scorer(arc, arc.reader().getLiveDocs());
            if (scorer != null) {
                // Add the term frequencies of each matching document to the map
                while (scorer.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                    getFrequenciesFromTermVector(
                            indexReader, scorer.docID() + arc.docBase, luceneField, freq);
                }
            }
        }
        return freq;
    } catch (IOException e) {
        throw ExUtil.wrapRuntimeException(e);
    }
}
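/**
 * Usage sketch for {@link #termFrequencies} (not part of the original API). Counts all
 * tokens of a property over the documents matching a filter query. The field/property
 * names ("contents", "word") and the metadata field/value ("author", "multatuli") are
 * assumptions for illustration only.
 */
public static void termFrequenciesExample(IndexSearcher searcher) {
    // Example filter: only count documents by a particular author (hypothetical field)
    Query filter = new org.apache.lucene.search.TermQuery(
            new org.apache.lucene.index.Term("author", "multatuli"));
    Map<String, Integer> freq = termFrequencies(searcher, filter, "contents", "word", null);
    for (Map.Entry<String, Integer> e : freq.entrySet())
        System.out.println(e.getKey() + "\t" + e.getValue());
}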
/**
 * Add term frequencies for a single document to a frequency map.
 *
 * @param reader the index
 * @param doc doc id
 * @param luceneName the index field from which to use the term vector
 * @param freq where to add to the token frequencies
 */
public static void getFrequenciesFromTermVector(
        IndexReader reader, int doc, String luceneName, Map<String, Integer> freq) {
    try {
        org.apache.lucene.index.Terms terms = reader.getTermVector(doc, luceneName);
        if (terms == null) {
            throw new IllegalArgumentException("Field " + luceneName + " has no Terms");
        }
        TermsEnum termsEnum = terms.iterator();

        // Gather token frequencies from the term vector
        PostingsEnum postingsEnum = null;
        while (termsEnum.next() != null) {
            postingsEnum = termsEnum.postings(null, postingsEnum, PostingsEnum.FREQS);
            String term = termsEnum.term().utf8ToString();
            Integer n = freq.get(term);
            if (n == null) {
                n = 0;
            }
            while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                // Add the within-document frequency of this term
                // (docFreq() would count each distinct term only once per document)
                n += postingsEnum.freq();
            }
            freq.put(term, n);
        }
    } catch (Exception e) {
        throw ExUtil.wrapRuntimeException(e);
    }
}
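/**
 * Usage sketch for {@link #getFrequenciesFromTermVector} (illustration only): tally the
 * tokens of a single document. The field/property names ("contents", "word") are
 * assumptions; the Lucene field must have been indexed with term vectors enabled.
 */
public static Map<String, Integer> singleDocFrequenciesExample(IndexReader reader, int docId) {
    // Derive the Lucene field name the same way termFrequencies() does
    String luceneField = ComplexFieldUtil.propertyField("contents", "word", null);
    Map<String, Integer> freq = new HashMap<>();
    getFrequenciesFromTermVector(reader, docId, luceneField, freq);
    return freq;
}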
/**
 * Read Properties from the specified file.
 *
 * @param file the file to read
 * @return the Properties read
 */
public static Properties readFromFile(File file) {
    try {
        if (!file.isFile()) {
            throw new RuntimeException(
                    "Property file " + file.getCanonicalPath()
                            + " does not exist or is not a regular file!");
        }
        Reader in = new BufferedReader(new FileReader(file));
        try {
            Properties properties = new Properties();
            properties.load(in);
            return properties;
        } finally {
            in.close();
        }
    } catch (Exception e) {
        throw ExUtil.wrapRuntimeException(e);
    }
}
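/**
 * Usage sketch for {@link #readFromFile} (illustration only; the file name, key and
 * default value are assumptions): load a settings file from the user's home directory
 * and read one value with a fallback.
 */
public static String indexDirExample() {
    File settingsFile = new File(System.getProperty("user.home"), ".blacklab.properties");
    Properties props = readFromFile(settingsFile);
    return props.getProperty("indexDir", "/data/index");
}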
/**
 * Get all words between the specified start and end positions from the term vector.
 *
 * <p>NOTE: this may return an array of less than the size requested, if the document ends before
 * the requested end position.
 *
 * @param reader the index
 * @param doc doc id
 * @param luceneName the index field from which to use the term vector
 * @param start start position (first word we want to request)
 * @param end end position (last word we want to request)
 * @param partialOk is it okay if we're missing words in the middle, or do we need them all?
 *     (debug)
 * @return the words found, in order
 */
public static String[] getWordsFromTermVector(
        IndexReader reader, int doc, String luceneName, int start, int end, boolean partialOk) {
    // Retrieve the term position vector of the contents of this document.
    // NOTE: might be faster to retrieve all term vectors at once
    try {
        org.apache.lucene.index.Terms terms = reader.getTermVector(doc, luceneName);
        if (terms == null) {
            throw new IllegalArgumentException("Field " + luceneName + " has no Terms");
        }
        if (!terms.hasPositions())
            throw new IllegalArgumentException(
                    "Field " + luceneName + " has no position information");
        TermsEnum termsEnum = terms.iterator();

        // Collect concordance words from the term vector
        PostingsEnum docPosEnum = null;
        int numFound = 0;
        String[] concordanceWords = new String[end - start + 1];
        while (termsEnum.next() != null) {
            docPosEnum = termsEnum.postings(null, docPosEnum, PostingsEnum.POSITIONS);
            while (docPosEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                // NOTE: docID() will always return 0 here, because a term vector is
                // effectively a single-document index.
                for (int i = 0; i < docPosEnum.freq(); i++) {
                    int position = docPosEnum.nextPosition();
                    if (position == -1)
                        throw new RuntimeException(
                                "Unexpected missing position (i=" + i + ", docPosEnum.freq() = "
                                        + docPosEnum.freq() + ")");
                    if (position >= start && position <= end) {
                        // Multiple terms at the same position are joined with "|"
                        if (concordanceWords[position - start] == null)
                            concordanceWords[position - start] = termsEnum.term().utf8ToString();
                        else
                            concordanceWords[position - start] += "|" + termsEnum.term().utf8ToString();
                        numFound++;
                    }
                }
                if (numFound == concordanceWords.length)
                    return concordanceWords;
            }
        }
        if (numFound < concordanceWords.length && !partialOk) {
            // If we simply ran into the end of the document, that's okay;
            // but if words are missing in the middle, that's not.
            String[] partial = new String[numFound];
            for (int i = 0; i < numFound; i++) {
                partial[i] = concordanceWords[i];
                if (partial[i] == null) {
                    throw new RuntimeException(
                            "Not all words found (" + numFound + " out of "
                                    + concordanceWords.length
                                    + "); missing words in the middle of concordance!");
                }
            }
            return partial;
        }
        return concordanceWords;
    } catch (Exception e) {
        throw ExUtil.wrapRuntimeException(e);
    }
}
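/**
 * Usage sketch for {@link #getWordsFromTermVector} (illustration only): fetch a few words
 * of context around a hit and join them into a snippet. The field/property names
 * ("contents", "word") and the context size are assumptions.
 */
public static String contextSnippetExample(IndexReader reader, int docId, int hitStart, int hitEnd) {
    int contextSize = 5; // words of context on either side (example value)
    int start = Math.max(0, hitStart - contextSize);
    int end = hitEnd + contextSize;
    String luceneField = ComplexFieldUtil.propertyField("contents", "word", null);
    // partialOk = true: tolerate gaps, e.g. when the snippet runs past the end of the document
    String[] words = getWordsFromTermVector(reader, docId, luceneField, start, end, true);
    StringBuilder snippet = new StringBuilder();
    for (String word : words) {
        if (snippet.length() > 0)
            snippet.append(' ');
        snippet.append(word == null ? "[?]" : word);
    }
    return snippet.toString();
}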