/** * Summarize text. * * @param sentences Tokenized sentences to summarize. * @param summarySentences Maximum number of sentences to return in the summary. * @return Summary of the input text. */ public <T extends Comparable> List<Integer> summarize( List<List<T>> sentences, int summarySentences) { // Get word counts ignoring stop words. Map<String, WordCountAndSentences> wordCounts = WordCountUtils.countWordsInSentences(sentences, new BuckleyAndSaltonStopWords()); // Sort the counts into descending // order by count. List<WordCountAndSentences> wcsData = new SortedArrayList<WordCountAndSentences>(); Iterator<String> iterator = wordCounts.keySet().iterator(); while (iterator.hasNext()) { wcsData.add(wordCounts.get(iterator.next())); } // Holds summary sentence indices. Set<Integer> summarySentencesSet = new TreeSet<Integer>(); // Use up to 100 most commonly used words. int maxWords = Math.min(100, wcsData.size()); // For each commonly word word, // find the first sentence in which // that word appears, and add it to the // summary sentences collection. for (int i = 0; (i < wcsData.size()) && (summarySentencesSet.size() < summarySentences); i++) { WordCountAndSentences wcs = wcsData.get(i); if (CharUtils.isNumber(wcs.word.toString())) continue; if (CharUtils.hasDigit(wcs.word.toString())) continue; Integer[] sentenceNumbers = (Integer[]) wcs.sentences.toArray(new Integer[wcs.sentences.size()]); summarySentencesSet.add(sentenceNumbers[0]); } // Return indices of selected // summary sentences. return new ArrayList<Integer>(summarySentencesSet); }