/** * Read stream into array of strings. * * @param inputStream The InputStream for the file. */ protected void openInputStream(InputStream inputStream) { String textLine; // Collect input lines in an array list. List<String> lines = ListFactory.createNewList(); BufferedReader bufferedReader = null; try { bufferedReader = new BufferedReader(new UnicodeReader(inputStream, textFileEncoding)); while ((textLine = bufferedReader.readLine()) != null) { lines.add(textLine); } textFileLoaded = true; } catch (IOException e) { } finally { try { if (bufferedReader != null) bufferedReader.close(); } catch (Exception e) { } } // Convert array list to array of strings. textFileLines = new String[lines.size()]; for (int i = 0; i < lines.size(); i++) { textFileLines[i] = lines.get(i); } }
/** * Count words in sentences. * * @param sentences The sentences. * @param stopWords Stop words. * @return Map of words to WordCountAndSentence objects. */ public static <W extends Comparable> Map<String, WordCountAndSentences> countWordsInSentences( List<List<W>> sentences, StopWords stopWords) { // Holds map between each word // and the word's count and appearance. Map<String, WordCountAndSentences> wordCounts = new TreeMap<String, WordCountAndSentences>(); // Note if we are filtering using // a stop word list. boolean checkStopWords = (stopWords != null); // Loop over sentences. for (int i = 0; i < sentences.size(); i++) { // Get next sentence. List<W> sentence = sentences.get(i); // Loop over words in sentence. for (int j = 0; j < sentence.size(); j++) { // Get next word. W word = sentence.get(j); // Get string version of word in // lower case. String lcWord = word.toString().toLowerCase(); // Ignore punctuation and symbols. if (CharUtils.isPunctuationOrSymbol(lcWord)) { } // Ignore stop words. else if (checkStopWords && stopWords.isStopWord(lcWord)) { } else { // Create/update count and appearance data // for this word. WordCountAndSentences wcs = wordCounts.get(lcWord); if (wcs == null) { wcs = new WordCountAndSentences(lcWord); wordCounts.put(lcWord, wcs); } wcs.count++; wcs.sentences.add(i); } } } return wordCounts; }
/** * Summarize text. * * @param sentences Tokenized sentences to summarize. * @param summarySentences Maximum number of sentences to return in the summary. * @return Summary of the input text. */ public <T extends Comparable> List<Integer> summarize( List<List<T>> sentences, int summarySentences) { // Get word counts ignoring stop words. Map<String, WordCountAndSentences> wordCounts = WordCountUtils.countWordsInSentences(sentences, new BuckleyAndSaltonStopWords()); // Sort the counts into descending // order by count. List<WordCountAndSentences> wcsData = new SortedArrayList<WordCountAndSentences>(); Iterator<String> iterator = wordCounts.keySet().iterator(); while (iterator.hasNext()) { wcsData.add(wordCounts.get(iterator.next())); } // Holds summary sentence indices. Set<Integer> summarySentencesSet = new TreeSet<Integer>(); // Use up to 100 most commonly used words. int maxWords = Math.min(100, wcsData.size()); // For each commonly word word, // find the first sentence in which // that word appears, and add it to the // summary sentences collection. for (int i = 0; (i < wcsData.size()) && (summarySentencesSet.size() < summarySentences); i++) { WordCountAndSentences wcs = wcsData.get(i); if (CharUtils.isNumber(wcs.word.toString())) continue; if (CharUtils.hasDigit(wcs.word.toString())) continue; Integer[] sentenceNumbers = (Integer[]) wcs.sentences.toArray(new Integer[wcs.sentences.size()]); summarySentencesSet.add(sentenceNumbers[0]); } // Return indices of selected // summary sentences. return new ArrayList<Integer>(summarySentencesSet); }