/**
 * Populates this object's word, lemma and tag from a single delimited string.
 *
 * <p>The interpretation depends on how often {@code divider} occurs in {@code labelStr}:
 * <ul>
 *   <li><b>not at all</b> - the whole string becomes the word; lemma and tag are set to
 *       {@code null};</li>
 *   <li><b>once</b> - the text before the divider is the word and the text after it is the
 *       tag; the lemma is obtained from {@code Morphology.lemmaStatic(word, tag)};</li>
 *   <li><b>more than once</b> - the text before the first divider is the word, the text after
 *       the last divider is the tag, and everything in between is the lemma.</li>
 * </ul>
 *
 * @param labelStr the delimited string to parse; must not be {@code null}
 * @param divider  the separator between the fields; must not be {@code null}
 */
public void setFromString(String labelStr, String divider) {
    int first = labelStr.indexOf(divider);

    // The "no divider" case has to be tested first: indexOf and lastIndexOf both return -1
    // when the divider is absent, so a first == second comparison would otherwise match and
    // substring(0, -1) would throw StringIndexOutOfBoundsException.
    if (first < 0) {
        setWord(labelStr);
        setLemma(null);
        setTag(null);
        return;
    }

    int second = labelStr.lastIndexOf(divider);
    // Skip the whole divider, not just one character, so multi-character dividers do not
    // leak into the tag or lemma.
    int dividerLength = divider.length();
    String word = labelStr.substring(0, first);

    if (first == second) {
        // Exactly one divider: word + tag, lemma is computed.
        String tag = labelStr.substring(first + dividerLength);
        setWord(word);
        setTag(tag);
        setLemma(Morphology.lemmaStatic(word, tag));
    } else {
        // Two or more dividers: word + lemma + tag. When occurrences of a multi-character
        // divider overlap, the clamp yields an empty lemma instead of an invalid range.
        int lemmaStart = Math.min(first + dividerLength, second);
        setWord(word);
        setLemma(labelStr.substring(lemmaStart, second));
        setTag(labelStr.substring(second + dividerLength));
    }
}
public static ArrayList<TaggedWord> Preprocess(ArrayList<TaggedWord> taggedWords) { ArrayList<TaggedWord> newList = new ArrayList<TaggedWord>(); String[] punctuationsAndSpecialCharacters = { ",", ".", "?", "!", ":", ";", "\"", "-", "--", "'", "-LRB-", "-RRB-", "''", "``", "&" }; // , "/", "\\", "<", ">", "#", "&", "*", "(", ")", "{", "}", "[", "]", "~", "|"}; HashMap<String, String> punctuationMarks = new HashMap<String, String>(); for (int i = 0; i < punctuationsAndSpecialCharacters.length; i++) { punctuationMarks.put( punctuationsAndSpecialCharacters[i], punctuationsAndSpecialCharacters[i]); } for (int i = 0; i < taggedWords.size(); i++) { String word = taggedWords.get(i).word(); String posTag = taggedWords.get(i).tag(); if (!punctuationMarks.containsKey(word)) { if (!(posTag.length() > 2 && posTag.substring(0, 3).equals("NNP"))) { word = Morphology.lemmaStatic(word, posTag, true); word = word.replace('-', ' '); } String newWord, newPosTag; if (word.equals("n't")) newWord = "not"; else if (word.equals("'s")) newWord = "is"; else if (word.equals("'ll")) newWord = "will"; else if (word.equals("'m") || word.equals("m")) newWord = "am"; else if (word.equals("im")) newWord = "am"; else newWord = word; newPosTag = posTag; newList.add(new TaggedWord(newWord, newPosTag)); } } newList = StopWordRemoval(newList); return newList; }