/**
 * Reads every entry of {@code files}, splits its lower-cased text into words,
 * removes stop words, and hands the resulting list to {@code write} together
 * with the file's name.
 *
 * <p>Each line read is followed by a single space before the next one is
 * appended. {@link Scanner#nextLine()} strips the line terminator, so without
 * that separator the last word of one line would be glued to the first word of
 * the following line.
 *
 * <p>Empty tokens produced by the split (an empty file, or text that begins
 * with a delimiter) are skipped and never reach the stop-word filter.
 *
 * @throws IOException         if a file cannot be opened, or propagated from
 *                             {@code write}
 * @throws DictionaryException declared for the helpers called here; not thrown
 *                             directly by this method
 */
public void readFileAsList() throws IOException, DictionaryException {
    for (int i = 0; i < files.length; i++) {
        // Accumulate the whole file in a builder; repeated String += is quadratic.
        StringBuilder text = new StringBuilder();
        Scanner file = new Scanner(new FileReader(files[i]));
        try {
            while (file.hasNext()) {
                // The trailing space keeps words on adjacent lines apart; it is
                // itself a delimiter (\s) in the split pattern below.
                text.append(file.nextLine().toLowerCase()).append(' ');
            }
        } finally {
            // Release the file handle even if reading fails part-way.
            file.close();
        }

        String[] tokens = text.toString().split("[?!*,-.\"_\"\\;/:)(+\\s]+");
        List<String> words = new ArrayList<String>();
        for (String token : tokens) {
            // split() yields "" for empty input or a leading delimiter.
            if (token.length() > 0) {
                words.add(token);
            }
        }

        // NOTE(review): the instance is unused here, but the constructor may
        // initialise state that the static removeStopWords relies on -- confirm
        // before removing this line.
        StopWords stopWords = new StopWords();
        words = StopWords.removeStopWords(words);

        write(words, files[i].getName());
    }
}
/** * Count words in sentences. * * @param sentences The sentences. * @param stopWords Stop words. * @return Map of words to WordCountAndSentence objects. */ public static <W extends Comparable> Map<String, WordCountAndSentences> countWordsInSentences( List<List<W>> sentences, StopWords stopWords) { // Holds map between each word // and the word's count and appearance. Map<String, WordCountAndSentences> wordCounts = new TreeMap<String, WordCountAndSentences>(); // Note if we are filtering using // a stop word list. boolean checkStopWords = (stopWords != null); // Loop over sentences. for (int i = 0; i < sentences.size(); i++) { // Get next sentence. List<W> sentence = sentences.get(i); // Loop over words in sentence. for (int j = 0; j < sentence.size(); j++) { // Get next word. W word = sentence.get(j); // Get string version of word in // lower case. String lcWord = word.toString().toLowerCase(); // Ignore punctuation and symbols. if (CharUtils.isPunctuationOrSymbol(lcWord)) { } // Ignore stop words. else if (checkStopWords && stopWords.isStopWord(lcWord)) { } else { // Create/update count and appearance data // for this word. WordCountAndSentences wcs = wordCounts.get(lcWord); if (wcs == null) { wcs = new WordCountAndSentences(lcWord); wordCounts.put(lcWord, wcs); } wcs.count++; wcs.sentences.add(i); } } } return wordCounts; }