コード例 #1
0
  /**
   * Loads every line of text from a stream into {@code textFileLines}.
   *
   * <p>The stream is read line by line through a {@code UnicodeReader} constructed with
   * {@code textFileEncoding}. {@code textFileLoaded} is set to {@code true} only when the stream
   * is read to the end without an {@link IOException}; lines read before a failure are still
   * stored. The reader wrapping the stream is closed before this method returns.
   *
   * @param inputStream The InputStream for the file.
   */
  protected void openInputStream(InputStream inputStream) {
    // Accumulates the lines in the order they are read.

    List<String> collected = ListFactory.createNewList();
    BufferedReader reader = null;

    try {
      reader = new BufferedReader(new UnicodeReader(inputStream, textFileEncoding));

      for (String line = reader.readLine(); line != null; line = reader.readLine()) {
        collected.add(line);
      }

      textFileLoaded = true;
    } catch (IOException e) {
      // Read failure: textFileLoaded is left untouched and the lines gathered so far are kept.
    } finally {
      if (reader != null) {
        try {
          reader.close();
        } catch (Exception e) {
          // A failure to close is ignored; there is nothing further to do with the reader.
        }
      }
    }

    // Publish the collected lines as an array of strings.

    String[] loaded = new String[collected.size()];
    textFileLines = collected.toArray(loaded);
  }
コード例 #2
0
  /**
   * Tallies word occurrences across a list of tokenized sentences.
   *
   * <p>Each word is converted with {@code toString()} and lower-cased before counting. Words that
   * {@code CharUtils.isPunctuationOrSymbol} accepts are skipped, as are words the stop word list
   * reports as stop words. For every remaining word the count is incremented and the index of
   * the containing sentence is recorded.
   *
   * @param sentences The sentences, each a list of words.
   * @param stopWords Stop words to leave out, or null to apply no stop word filtering.
   * @return Map from lower-cased word to its WordCountAndSentences entry, ordered by word.
   */
  public static <W extends Comparable> Map<String, WordCountAndSentences> countWordsInSentences(
      List<List<W>> sentences, StopWords stopWords) {
    //	Stop word filtering only applies when
    //	a stop word list was supplied.

    final boolean filterStopWords = (stopWords != null);

    //	Tallies keyed by lower-cased word.

    Map<String, WordCountAndSentences> tallies = new TreeMap<String, WordCountAndSentences>();

    //	Index of the sentence being scanned.

    int sentenceIndex = 0;

    for (List<W> sentence : sentences) {
      for (W token : sentence) {
        String key = token.toString().toLowerCase();

        //	Punctuation and symbols are not counted.

        if (CharUtils.isPunctuationOrSymbol(key)) {
          continue;
        }

        //	Neither are stop words.

        if (filterStopWords && stopWords.isStopWord(key)) {
          continue;
        }

        //	The first sighting of a word creates its entry.

        WordCountAndSentences entry = tallies.get(key);

        if (entry == null) {
          entry = new WordCountAndSentences(key);
          tallies.put(key, entry);
        }

        entry.count++;
        entry.sentences.add(sentenceIndex);
      }

      sentenceIndex++;
    }

    return tallies;
  }
コード例 #3
0
  /**
   * Selects the sentences that make up a summary of tokenized text.
   *
   * <p>Words are counted with {@code WordCountUtils.countWordsInSentences}, leaving out the
   * {@code BuckleyAndSaltonStopWords} stop words, and are then visited in the order imposed by a
   * {@code SortedArrayList}. For each word that is neither a number nor contains a digit, the
   * first recorded sentence containing that word is added to the summary. Selection stops once
   * {@code summarySentences} distinct sentences have been chosen or the words run out.
   *
   * @param sentences Tokenized sentences to summarize.
   * @param summarySentences Maximum number of sentences to return in the summary.
   * @return Indices into {@code sentences} of the selected summary sentences, in ascending order.
   */
  public <T extends Comparable> List<Integer> summarize(
      List<List<T>> sentences, int summarySentences) {
    //  Get word counts ignoring stop words.

    Map<String, WordCountAndSentences> wordCounts =
        WordCountUtils.countWordsInSentences(sentences, new BuckleyAndSaltonStopWords());

    //  Sort the counts. Presumably this yields
    //  descending order by count; the ordering is
    //  defined by SortedArrayList and
    //  WordCountAndSentences -- TODO confirm.

    List<WordCountAndSentences> wcsData = new SortedArrayList<WordCountAndSentences>();

    for (WordCountAndSentences counted : wordCounts.values()) {
      wcsData.add(counted);
    }

    //  Holds summary sentence indices.
    //  The tree set drops duplicates and keeps
    //  the indices in ascending order.

    Set<Integer> summarySentencesSet = new TreeSet<Integer>();

    //  NOTE(review): a cap of 100 words used to be
    //  computed here but was never applied to the
    //  loop below, so every counted word remains a
    //  candidate. Confirm whether a cap is wanted.

    //  For each commonly used word,
    //  find the first sentence in which
    //  that word appears, and add it to the
    //  summary sentences collection.

    for (int i = 0; (i < wcsData.size()) && (summarySentencesSet.size() < summarySentences); i++) {
      WordCountAndSentences wcs = wcsData.get(i);
      String word = wcs.word.toString();

      //  Numbers and words containing digits
      //  never select a sentence.

      if (CharUtils.isNumber(word) || CharUtils.hasDigit(word)) {
        continue;
      }

      //  Only the first recorded sentence index
      //  is needed, so read it straight from the
      //  iterator instead of copying the whole
      //  collection into an array.

      Iterator<Integer> wordSentences = wcs.sentences.iterator();

      if (wordSentences.hasNext()) {
        summarySentencesSet.add(wordSentences.next());
      }
    }

    //  Return indices of selected
    //  summary sentences.

    return new ArrayList<Integer>(summarySentencesSet);
  }