// Imports used by the methods below (java.io/java.util and Stanford CoreNLP):
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.Morphology;

/**
 * Splits a sentence into naive "phrases" by cutting the tagged token stream at
 * punctuation boundaries (commas, quotes, dashes). Each chunk is run through
 * Preprocess before being added to the result.
 */
public static ArrayList<ArrayList<TaggedWord>> getPhrasesNaive(
        String sentence, LexicalizedParser lp, AbstractSequenceClassifier<CoreLabel> classifier) {
    ArrayList<ArrayList<TaggedWord>> phrases = new ArrayList<>();
    ArrayList<TaggedWord> taggedWords = StanfordNER.parse(sentence, lp, classifier);

    // Tokens that mark a phrase boundary.
    Set<String> phraseBoundaries = new HashSet<>(Arrays.asList(",", "\"", "''", "``", "--"));

    ArrayList<TaggedWord> current = new ArrayList<>();
    for (TaggedWord taggedWord : taggedWords) {
        if (phraseBoundaries.contains(taggedWord.word())) {
            // Close the current phrase at a boundary token.
            if (!current.isEmpty()) {
                phrases.add(Preprocess(new ArrayList<>(current)));
            }
            current.clear();
        } else {
            current.add(taggedWord);
        }
    }
    // Flush the trailing phrase, if any.
    if (!current.isEmpty()) {
        phrases.add(Preprocess(new ArrayList<>(current)));
    }
    return phrases;
}
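// A minimal usage sketch, not part of the original code: load a parser and a CRF NER
// model, then split a sentence into naive phrases. The model paths are assumptions --
// point them at whichever Stanford models your project ships with -- and the demo
// additionally assumes edu.stanford.nlp.ie.crf.CRFClassifier is imported.
public static void demoGetPhrasesNaive() {
    LexicalizedParser lp =
        LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    AbstractSequenceClassifier<CoreLabel> classifier =
        CRFClassifier.getClassifierNoExceptions("classifiers/english.all.3class.distsim.crf.ser.gz");

    ArrayList<ArrayList<TaggedWord>> phrases =
        getPhrasesNaive("The fox, they say, jumps over the lazy dog.", lp, classifier);
    for (ArrayList<TaggedWord> phrase : phrases) {
        System.out.println(phrase);  // each phrase is a list of word/tag pairs
    }
}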
/**
 * Removes stopwords from a tagged word list. The stoplist at data/nltk_stoplist.txt
 * is expected to be a single comma-separated line of lowercase words.
 */
public static ArrayList<TaggedWord> StopWordRemoval(ArrayList<TaggedWord> taggedWords) {
    ArrayList<TaggedWord> filtered = new ArrayList<>();
    try (BufferedReader br = new BufferedReader(new FileReader(new File("data/nltk_stoplist.txt")))) {
        // Read the single comma-separated stoplist line into a set.
        String stopwordsLine = br.readLine();
        Set<String> stopwords = new HashSet<>();
        if (stopwordsLine != null) {
            stopwords.addAll(Arrays.asList(stopwordsLine.split(",")));
        }
        for (TaggedWord taggedWord : taggedWords) {
            // Keep a word only if its lowercase form is not in the stoplist.
            if (!stopwords.contains(taggedWord.word().toLowerCase())) {
                filtered.add(new TaggedWord(taggedWord.word(), taggedWord.tag()));
            }
        }
    } catch (Exception ex) {
        ex.printStackTrace();
    }
    return filtered;
}
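// Hypothetical check of StopWordRemoval, not in the original source. Assuming the
// stoplist contains "the" and "over" (as the NLTK English stoplist does), those two
// tokens are dropped and the content words are kept.
public static void demoStopWordRemoval() {
    ArrayList<TaggedWord> tagged = new ArrayList<>(Arrays.asList(
        new TaggedWord("the", "DT"),
        new TaggedWord("fox", "NN"),
        new TaggedWord("jumps", "VBZ"),
        new TaggedWord("over", "IN")));
    System.out.println(StopWordRemoval(tagged));  // e.g. [fox/NN, jumps/VBZ]
}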
/**
 * Cleans a tagged phrase: drops punctuation, lemmatizes everything except proper
 * nouns, expands common contraction fragments, and finally removes stopwords.
 */
public static ArrayList<TaggedWord> Preprocess(ArrayList<TaggedWord> taggedWords) {
    ArrayList<TaggedWord> cleaned = new ArrayList<>();
    // Tokens treated as punctuation or special characters and discarded.
    Set<String> punctuationMarks = new HashSet<>(Arrays.asList(
        ",", ".", "?", "!", ":", ";", "\"", "-", "--", "'", "-LRB-", "-RRB-", "''", "``", "&"));

    for (TaggedWord taggedWord : taggedWords) {
        String word = taggedWord.word();
        String posTag = taggedWord.tag();
        if (punctuationMarks.contains(word)) {
            continue;
        }
        // Lemmatize everything except proper nouns (NNP/NNPS) and split hyphenated forms.
        if (!posTag.startsWith("NNP")) {
            word = Morphology.lemmaStatic(word, posTag, true);
            word = word.replace('-', ' ');
        }
        // Expand contraction fragments produced by the tokenizer.
        String newWord;
        if (word.equals("n't")) newWord = "not";
        else if (word.equals("'s")) newWord = "is";
        else if (word.equals("'ll")) newWord = "will";
        else if (word.equals("'m") || word.equals("m") || word.equals("im")) newWord = "am";
        else newWord = word;
        cleaned.add(new TaggedWord(newWord, posTag));
    }
    return StopWordRemoval(cleaned);
}
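// Hedged example of Preprocess, not in the original source: the trailing comma is
// dropped, "n't" expands to "not", and non-proper nouns are lemmatized
// ("Foxes" -> "fox"); the exact output also depends on which words the stoplist removes.
public static void demoPreprocess() {
    ArrayList<TaggedWord> tagged = new ArrayList<>(Arrays.asList(
        new TaggedWord("Foxes", "NNS"),
        new TaggedWord("do", "VBP"),
        new TaggedWord("n't", "RB"),
        new TaggedWord("sleep", "VB"),
        new TaggedWord(",", ",")));
    System.out.println(Preprocess(tagged));
}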