/**
 * Decides whether a word looks like a name, based on the case of the first character of its
 * whitespace-separated syllables.
 *
 * <p>A single-syllable word that starts with an upper-case letter is a name. Otherwise the word
 * is a name only if none of its non-empty syllables starts with a lower-case letter; as a
 * consequence an empty word is also reported as a name.
 *
 * @param word the word to test; its syllables are separated by whitespace
 * @return {@code true} if the word is considered a name, {@code false} otherwise
 */
private static boolean isName(String word) {
    final String[] parts = word.split("\\s+");

    // Fast path: a lone syllable beginning with an upper-case letter.
    // NOTE(review): CaseConverter.isValidUpper presumably covers non-ASCII upper-case
    // letters -- confirm against CaseConverter.
    if (parts.length == 1 && parts[0].length() > 0) {
        final char first = parts[0].charAt(0);
        final boolean upper = ('A' <= first && first <= 'Z') || CaseConverter.isValidUpper(first);
        if (upper) {
            return true;
        }
    }

    // Any non-empty syllable that begins with a lower-case letter disqualifies the word.
    for (int i = 0; i < parts.length; i++) {
        final String part = parts[i];
        if (part.length() == 0) {
            continue;
        }
        final char first = part.charAt(0);
        final boolean lower = ('a' <= first && first <= 'z') || CaseConverter.isValidLower(first);
        if (lower) {
            return false;
        }
    }
    return true;
}
/** * Gets the words of a tagged sentence. In the tagged sentence, word/tag pairs are separated by * space characters. * * @param taggedSentence a tagged sentence. * @return a set of words. */ public static Set<String> getWords(String taggedSentence) { Set<String> words = new HashSet<String>(); String[] pairs = taggedSentence.split("\\s+"); int slashIndex = -1; for (String pair : pairs) { slashIndex = pair.indexOf('/'); if (slashIndex > 0) { String word = pair.substring(0, slashIndex); // replace all the _ with spaces word = word.replaceAll("_", " ").trim(); if (PRUNE_NAME) { if (!containsStopwords(word) && !isName(word)) { // make the word lowercase words.add(CaseConverter.toLower(word)); } } else { words.add(word); } } } return words; }