public static ArrayList<TaggedWord> StopWordRemoval(ArrayList<TaggedWord> taggedWords) {
  ArrayList<TaggedWord> newList = new ArrayList<TaggedWord>();
  try {
    // The stoplist is expected to be a single comma-separated line of words.
    String path = "data/nltk_stoplist.txt";
    File textFile = new File(path);
    BufferedReader br = new BufferedReader(new FileReader(textFile));
    String stopwordsLine = br.readLine();
    br.close();
    String[] stopwords = stopwordsLine.split(",");
    HashSet<String> stopwordSet = new HashSet<String>();
    for (int i = 0; i < stopwords.length; i++) {
      stopwordSet.add(stopwords[i]);
    }
    // Keep only the words that are not in the stoplist (lookup is case-insensitive).
    for (int i = 0; i < taggedWords.size(); i++) {
      String word = taggedWords.get(i).word();
      String posTag = taggedWords.get(i).tag();
      if (!stopwordSet.contains(word.toLowerCase())) {
        newList.add(new TaggedWord(word, posTag));
      }
    }
  } catch (Exception ex) {
    ex.printStackTrace();
  }
  return newList;
}
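/*
 * A minimal sketch (not part of the original code) of the stoplist format
 * StopWordRemoval expects: the entire list on a single comma-separated line.
 * The writer below fabricates such a file purely for illustration, assuming
 * the data/ directory exists; the real list is whatever ships in
 * data/nltk_stoplist.txt.
 */
public class StoplistFormatSketch {
  public static void main(String[] args) throws java.io.IOException {
    java.io.FileWriter fw = new java.io.FileWriter("data/nltk_stoplist.txt");
    // One line, comma-separated, lowercase (tokens are lowercased before lookup).
    fw.write("a,an,the,is,are,of,to,and");
    fw.close();
  }
}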
public static ArrayList<ArrayList<TaggedWord>> getPhrasesNaive(
    String sentence, LexicalizedParser lp, AbstractSequenceClassifier<CoreLabel> classifier) {
  ArrayList<ArrayList<TaggedWord>> newList = new ArrayList<ArrayList<TaggedWord>>();
  ArrayList<TaggedWord> taggedWords = StanfordNER.parse(sentence, lp, classifier);

  // Tokens that act as naive phrase boundaries: commas, quotes, and dashes.
  HashSet<String> phraseBoundaries = new HashSet<String>();
  phraseBoundaries.add(",");
  phraseBoundaries.add("\"");
  phraseBoundaries.add("''");
  phraseBoundaries.add("``");
  phraseBoundaries.add("--");

  // Split the tagged sentence at boundary tokens; preprocess and collect each chunk.
  ArrayList<TaggedWord> temp = new ArrayList<TaggedWord>();
  int index = 0;
  while (index < taggedWords.size()) {
    if (phraseBoundaries.contains(taggedWords.get(index).word())) {
      if (temp.size() > 0) {
        ArrayList<TaggedWord> tempCopy = new ArrayList<TaggedWord>(temp);
        newList.add(Preprocess(tempCopy));
      }
      temp.clear();
    } else {
      temp.add(taggedWords.get(index));
    }
    index += 1;
  }
  // Flush the final phrase, if any.
  if (temp.size() > 0) {
    ArrayList<TaggedWord> tempCopy = new ArrayList<TaggedWord>(temp);
    newList.add(Preprocess(tempCopy));
  }
  return newList;
}
public ArrayList<String> getNounsFromSentence(String sentence) {
  ArrayList<TaggedWord> tw = parseSentenceTD(sentence);
  ArrayList<String> nouns = new ArrayList<String>();
  for (TaggedWord t : tw) {
    if (t.tag().startsWith("N")) {
      nouns.add(t.value());
    }
  }
  return nouns;
}
public static double LexicalSimilarityScoreMin(
    ArrayList<TaggedWord> taggedWords1,
    ArrayList<TaggedWord> taggedWords2,
    DISCOSimilarity discoRAM,
    LexicalizedParser lp) {
  // Square matrix of word-to-word similarities (edge weights), zero-padded;
  // Java initializes double arrays to 0 by default.
  int length1 = taggedWords1.size();
  int length2 = taggedWords2.size();
  int arrSize = Math.max(length1, length2);
  double[][] array = new double[arrSize][arrSize];

  for (int i = 0; i < length1; i++) {
    for (int j = 0; j < length2; j++) {
      String word1 = taggedWords1.get(i).word();
      String word2 = taggedWords2.get(j).word();
      double edgeWeight = 0;
      try {
        // Identical words get full weight; otherwise fall back to DISCO similarity.
        // (An LSA alternative would be LSASimilarity.LSAWordSimilarity(word1, word2).)
        if (word1.compareToIgnoreCase(word2) == 0) edgeWeight = 1;
        else edgeWeight = discoRAM.similarity2(word1, word2);
      } catch (Exception ex) {
        ex.printStackTrace();
      }
      array[i][j] = edgeWeight;
    }
  }

  // Maximum-weight assignment, normalized by the shorter sentence length and
  // scaled to [0, 5]. Guard against empty input to avoid division by zero.
  int minLength = Math.min(length1, length2);
  if (minLength == 0) return 0;
  String sumType = "max";
  return HungarianAlgorithm.hgAlgorithm(array, sumType) / minLength * 5;
}
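/*
 * A hand-worked sketch (hypothetical values, no DISCO involved) of the scoring
 * convention used above: the Hungarian algorithm's assignment total is divided
 * by the shorter sentence length and scaled to a 0-5 range.
 */
public class HungarianScoreSketch {
  public static void main(String[] args) {
    // Suppose two 2-word phrases produced these pairwise similarities:
    double[][] weights = {
      {1.0, 0.2}, // word1a vs word2a (exact match), word1a vs word2b
      {0.3, 0.8}  // word1b vs word2a, word1b vs word2b
    };
    // The best assignment pairs (1a,2a) and (1b,2b) for a total of 1.8;
    // normalized: 1.8 / minLength * 5 = 1.8 / 2 * 5 = 4.5 on the 0-5 scale.
    double total = weights[0][0] + weights[1][1];
    int minLength = 2;
    System.out.println(total / minLength * 5); // prints 4.5
  }
}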
public ArrayList<String> getPairsFromSentence(String sentence) {
  Collection<TypedDependency> tdl = parseSentenceTDL(sentence);
  ArrayList<String> pairs = new ArrayList<String>();
  for (TypedDependency td : tdl) {
    StringBuilder sb = new StringBuilder();
    sb.append(td.gov().originalText());
    sb.append(" ");
    sb.append(td.dep().originalText());
    pairs.add(sb.toString());
  }
  return pairs;
}
public static ArrayList<TaggedWord> PreprocessPhrase(ArrayList<TaggedWord> taggedWords) {
  // After general preprocessing, keep only cardinal numbers (CD), nouns (NN*),
  // and verbs (VB*).
  ArrayList<TaggedWord> newList = new ArrayList<TaggedWord>();
  taggedWords = Preprocess(taggedWords);
  for (int i = 0; i < taggedWords.size(); i++) {
    String posTag = taggedWords.get(i).tag();
    if (posTag.length() >= 2) {
      String reducedTag = posTag.substring(0, 2);
      if (reducedTag.equals("CD") || reducedTag.equals("NN") || reducedTag.equals("VB")) {
        newList.add(taggedWords.get(i));
      }
    }
  }
  return newList;
}
public static ArrayList<ArrayList<TaggedWord>> getPhrases(Tree parse, int phraseSizeLimit) {
  // Recursively break the parse tree into phrases of at most phraseSizeLimit
  // leaves: a small-enough subtree becomes one preprocessed phrase; otherwise
  // recurse into its children.
  ArrayList<ArrayList<TaggedWord>> newList = new ArrayList<ArrayList<TaggedWord>>();
  List<Tree> leaves = parse.getLeaves();
  if (leaves.size() <= phraseSizeLimit) {
    // Alternative: PreprocessPhrase(parse.taggedYield()) to keep only CD/NN*/VB* tokens.
    ArrayList<TaggedWord> phraseElements = Preprocess(parse.taggedYield());
    if (phraseElements.size() > 0) newList.add(phraseElements);
  } else {
    Tree[] childrenNodes = parse.children();
    for (int i = 0; i < childrenNodes.length; i++) {
      newList.addAll(getPhrases(childrenNodes[i], phraseSizeLimit));
    }
  }
  return newList;
}
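/*
 * A minimal usage sketch for getPhrases, assuming the method lives in a class
 * here called PhraseUtils and that the parser model "englishPCFG.ser.gz" is on
 * the working path (both are assumptions for illustration, mirroring the usage
 * in the main method below).
 */
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.ling.TaggedWord;
import java.util.ArrayList;

public class PhraseDemo {
  public static void main(String[] args) {
    LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz");
    Tree parse = lp.apply("The quick brown fox jumps over the lazy dog.");
    // Phrases of at most 4 leaves each, already preprocessed.
    ArrayList<ArrayList<TaggedWord>> phrases = PhraseUtils.getPhrases(parse, 4);
    for (ArrayList<TaggedWord> phrase : phrases) {
      System.out.println(phrase);
    }
  }
}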
public static void main(String args[]) {
  // String sentence1 = "A large bird standing on a table picks up a plastic glass containing liquid and places it in a bowl of something.";
  // String sentence2 = "A bird picks up a plastic cup containing a liquid with it's beak and puts the cup into a bowl.";
  // LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz");
  // LeskWSD tm = new LeskWSD(lp);
  // WordNetSimilarity ws = new WordNetSimilarity();
  // System.out.println(LexicalSimilarityScoreWordNet(sentence1, sentence2, tm, lp, ws));

  String sentence =
      "The broader Standard & Poor's 500 Index <.SPX> shed 2.38 points, or 0.24 percent, at 995.10.";
  LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz");
  Tree parse = lp.apply(sentence);
  ArrayList<TaggedWord> taggedWords = parse.taggedYield();
  taggedWords = Preprocess(taggedWords);
  for (int i = 0; i < taggedWords.size(); i++) System.out.println(taggedWords.get(i).word());
}
public static double BestWordMatchEdgeWeight(
    ArrayList<TaggedWord> taggedWords1,
    ArrayList<TaggedWord> taggedWords2,
    DISCOSimilarity discoRAM) {
  // Return the highest pairwise similarity between any word of the first list
  // and any word of the second (1 for an exact match, DISCO otherwise).
  double bestMatchScore = 0;
  for (int i = 0; i < taggedWords1.size(); i++) {
    String word1 = taggedWords1.get(i).word();
    for (int j = 0; j < taggedWords2.size(); j++) {
      String word2 = taggedWords2.get(j).word();
      double currentScore;
      if (word1.equals(word2)) currentScore = 1;
      else currentScore = discoRAM.similarity2(word1, word2);
      if (currentScore > bestMatchScore) bestMatchScore = currentScore;
    }
  }
  return bestMatchScore;
}
private Distribution<Integer> getSegmentedWordLengthDistribution(Treebank tb) {
  // Build the empirical distribution of word lengths produced by our segmenter
  // when it re-segments the raw characters of the gold treebank.
  // CharacterLevelTagExtender ext = new CharacterLevelTagExtender();
  ClassicCounter<Integer> c = new ClassicCounter<Integer>();
  for (Iterator iterator = tb.iterator(); iterator.hasNext(); ) {
    Tree gold = (Tree) iterator.next();
    // Strip the gold segmentation by concatenating the words back into characters.
    StringBuilder goldChars = new StringBuilder();
    ArrayList goldYield = gold.yield();
    for (Iterator wordIter = goldYield.iterator(); wordIter.hasNext(); ) {
      Word word = (Word) wordIter.next();
      goldChars.append(word);
    }
    // Re-segment and count the length of every predicted word.
    List<HasWord> ourWords = segment(goldChars.toString());
    for (int i = 0; i < ourWords.size(); i++) {
      c.incrementCount(Integer.valueOf(ourWords.get(i).word().length()));
    }
  }
  return Distribution.getDistribution(c);
}
@SuppressWarnings("static-access") @Override public EventSet createEventSet(Document doc) { EventSet es = new EventSet(doc.getAuthor()); char[] text = doc.getProcessedText(); String stringText = new String(text); // use MaxentPOSTagsEventDriver's tagger // initialize tagger and return empty event set if encountered a problem if (tagger == null) { tagger = MaxentPOSTagsEventDriver.initTagger(); if (tagger == null) return es; } List<List<HasWord>> sentences = tagger.tokenizeText(new BufferedReader(new StringReader(stringText))); ArrayList<TaggedWord> tagged = new ArrayList<TaggedWord>(); for (List<HasWord> sentence : sentences) tagged.addAll(tagger.tagSentence(sentence)); int i, j, n; try { n = Integer.parseInt(getParameter("N")); } catch (NumberFormatException e) { n = 2; } String curr; for (i = 0; i < tagged.size() - n + 1; i++) { curr = "(" + tagged.get(i).tag() + ")"; for (j = 1; j < n; j++) { curr += "-(" + tagged.get(i + j).tag() + ")"; } es.addEvent(new Event(curr)); } sentences.clear(); sentences = null; return es; }
public static ArrayList<TaggedWord> Preprocess(ArrayList<TaggedWord> taggedWords) {
  ArrayList<TaggedWord> newList = new ArrayList<TaggedWord>();

  // Tokens dropped outright before any further normalization.
  String[] punctuationsAndSpecialCharacters = {
    ",", ".", "?", "!", ":", ";", "\"", "-", "--", "'", "-LRB-", "-RRB-", "''", "``", "&"
  }; // also candidates: "/", "\\", "<", ">", "#", "*", "(", ")", "{", "}", "[", "]", "~", "|"
  HashSet<String> punctuationMarks = new HashSet<String>();
  for (int i = 0; i < punctuationsAndSpecialCharacters.length; i++) {
    punctuationMarks.add(punctuationsAndSpecialCharacters[i]);
  }

  for (int i = 0; i < taggedWords.size(); i++) {
    String word = taggedWords.get(i).word();
    String posTag = taggedWords.get(i).tag();
    if (!punctuationMarks.contains(word)) {
      // Lemmatize everything except proper nouns (NNP*), and split hyphens.
      if (!(posTag.length() > 2 && posTag.substring(0, 3).equals("NNP"))) {
        word = Morphology.lemmaStatic(word, posTag, true);
        word = word.replace('-', ' ');
      }
      // Expand common contractions (naively; "'s" is always treated as "is").
      String newWord;
      if (word.equals("n't")) newWord = "not";
      else if (word.equals("'s")) newWord = "is";
      else if (word.equals("'ll")) newWord = "will";
      else if (word.equals("'m") || word.equals("m")) newWord = "am";
      else if (word.equals("im")) newWord = "am";
      else newWord = word;
      newList.add(new TaggedWord(newWord, posTag));
    }
  }

  // Finally, drop stop words.
  newList = StopWordRemoval(newList);
  return newList;
}
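/*
 * A minimal sketch of what the Preprocess/StopWordRemoval pipeline does to a
 * tagged sentence. The enclosing class name Preprocessor is assumed for
 * illustration, as is the presence of data/nltk_stoplist.txt on the working
 * path; the exact output therefore depends on the stoplist contents.
 */
import edu.stanford.nlp.ling.TaggedWord;
import java.util.ArrayList;

public class PreprocessDemo {
  public static void main(String[] args) {
    ArrayList<TaggedWord> tagged = new ArrayList<TaggedWord>();
    tagged.add(new TaggedWord("He", "PRP"));
    tagged.add(new TaggedWord("'s", "VBZ"));
    tagged.add(new TaggedWord("running", "VBG"));
    tagged.add(new TaggedWord(",", ","));
    tagged.add(new TaggedWord("quickly", "RB"));
    // Expected (modulo the stoplist): the comma is removed, "'s" is expanded
    // to "is", and "running" is lemmatized to "run".
    System.out.println(Preprocessor.Preprocess(tagged));
  }
}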
public static double LexicalSimilarityScoreWordNet(
    String sentence1, String sentence2, LeskWSD tm, LexicalizedParser lp, WordNetSimilarity ws) {
  // Parse and preprocess both sentences, then disambiguate word senses with Lesk.
  ArrayList<TaggedWord> taggedWordsPrev1 = Preprocess(StanfordParse(sentence1, lp));
  ArrayList<TaggedWord> taggedWordsPrev2 = Preprocess(StanfordParse(sentence2, lp));
  ArrayList<TaggedWord> taggedWords1 = new ArrayList<TaggedWord>();
  ArrayList<TaggedWord> taggedWords2 = new ArrayList<TaggedWord>();
  WordNetSense[] sensesPrev1 = tm.LeskJWI(sentence1);
  WordNetSense[] sensesPrev2 = tm.LeskJWI(sentence2);
  ArrayList<WordNetSense> senses1 = new ArrayList<WordNetSense>();
  ArrayList<WordNetSense> senses2 = new ArrayList<WordNetSense>();

  // Keep only nouns and verbs (collapsed to NN/VB) together with their senses.
  for (int i = 0; i < taggedWordsPrev1.size(); i++) {
    String word = taggedWordsPrev1.get(i).word();
    String posTag = taggedWordsPrev1.get(i).tag();
    if (posTag.length() >= 2 && posTag.substring(0, 2).equals("NN")) {
      taggedWords1.add(new TaggedWord(word, "NN"));
      senses1.add(sensesPrev1[i]);
    } else if (posTag.length() >= 2 && posTag.substring(0, 2).equals("VB")) {
      taggedWords1.add(new TaggedWord(word, "VB"));
      senses1.add(sensesPrev1[i]);
    }
  }
  for (int i = 0; i < taggedWordsPrev2.size(); i++) {
    String word = taggedWordsPrev2.get(i).word();
    String posTag = taggedWordsPrev2.get(i).tag();
    if (posTag.length() >= 2 && posTag.substring(0, 2).equals("NN")) {
      taggedWords2.add(new TaggedWord(word, "NN"));
      senses2.add(sensesPrev2[i]);
    } else if (posTag.length() >= 2 && posTag.substring(0, 2).equals("VB")) {
      taggedWords2.add(new TaggedWord(word, "VB"));
      senses2.add(sensesPrev2[i]);
    }
  }

  // Square matrix of sense-to-sense similarities (zero-padded by default).
  int length1 = taggedWords1.size();
  int length2 = taggedWords2.size();
  int arrSize = Math.max(length1, length2);
  double[][] array = new double[arrSize][arrSize];
  for (int i = 0; i < length1; i++) {
    for (int j = 0; j < length2; j++) {
      String word1 = taggedWords1.get(i).word();
      String word2 = taggedWords2.get(j).word();
      double edgeWeight = 0;
      try {
        if (word1.compareToIgnoreCase(word2) == 0) edgeWeight = 1;
        else {
          // Alternative: ws.wuPalmerSimilarity(senses1.get(i), senses2.get(j));
          edgeWeight = ws.linSimilarity(senses1.get(i), senses2.get(j));
        }
      } catch (Exception ex) {
        ex.printStackTrace();
      }
      array[i][j] = edgeWeight;
    }
  }

  // Maximum-weight assignment, normalized by the matrix size and scaled to [0, 5].
  if (arrSize == 0) return 0;
  String sumType = "max";
  return HungarianAlgorithm.hgAlgorithm(array, sumType) / arrSize * 5;
}
public static double LexicalSimilarityScoreMax(
    ArrayList<TaggedWord> taggedWords1,
    ArrayList<TaggedWord> taggedWords2,
    DISCOSimilarity discoRAM,
    LexicalizedParser lp) {
  int length1 = taggedWords1.size();
  int length2 = taggedWords2.size();
  int arrSize = Math.max(length1, length2);
  if (arrSize == 0) return 0; // guard against empty input, as in the WordNet variant
  double[][] array = new double[arrSize][arrSize]; // zero-padded by default

  for (int i = 0; i < length1; i++) {
    for (int j = 0; j < length2; j++) {
      String word1 = taggedWords1.get(i).word();
      String posTag1 = taggedWords1.get(i).tag();
      String word2 = taggedWords2.get(j).word();
      String posTag2 = taggedWords2.get(j).tag();

      // Proper nouns (NNP*) are kept whole; everything else is split on spaces
      // (hyphens were turned into spaces during preprocessing).
      ArrayList<TaggedWord> newList1 = new ArrayList<TaggedWord>();
      if (posTag1.length() >= 3 && posTag1.substring(0, 3).equals("NNP")) {
        newList1.add(taggedWords1.get(i));
      } else {
        String[] words = word1.split(" ");
        for (int k = 0; k < words.length; k++) newList1.add(new TaggedWord(words[k], posTag1));
      }
      ArrayList<TaggedWord> newList2 = new ArrayList<TaggedWord>();
      if (posTag2.length() >= 3 && posTag2.substring(0, 3).equals("NNP")) {
        newList2.add(taggedWords2.get(j));
      } else {
        String[] words = word2.split(" ");
        for (int k = 0; k < words.length; k++) newList2.add(new TaggedWord(words[k], posTag2));
      }

      // The edge weight between two tokens is itself an assignment score over
      // their sub-tokens.
      array[i][j] = LexicalSimilarityScoreMin(newList1, newList2, discoRAM, lp);
    }
  }

  // Maximum-weight assignment, normalized by the matrix size and scaled to [0, 5].
  String sumType = "max";
  return HungarianAlgorithm.hgAlgorithm(array, sumType) / arrSize * 5;
}
/**
 * Do max language model markov segmentation. Note that this algorithm inherently tags words as
 * it goes, but that we throw away the tags in the final result so that the segmented words are
 * untagged. (Note: for a couple of years till Aug 2007, a tagged result was returned, but this
 * messed up the parser, because it could use no tagging but the given tagging, which often
 * wasn't very good. Or in particular it was a subcategorized tagging which never worked with
 * the current forceTags option, which assumes that gold taggings are inherently basic
 * taggings.)
 *
 * @param s A String to segment
 * @return The list of segmented words.
 */
private ArrayList<HasWord> segmentWordsWithMarkov(String s) {
  int length = s.length();
  // Set<String> POSes = (Set<String>) POSDistribution.keySet(); // 1.5
  int numTags = POSes.size();
  // scores[start][end][tag]: score of span with initial word of this tag
  double[][][] scores = new double[length][length + 1][numTags];
  // splitBacktrace[start][end][tag]: best (length of) first word for this span with this tag
  int[][][] splitBacktrace = new int[length][length + 1][numTags];
  // POSbacktrace[start][end][tag]: best tag for second word over this span, if first is this tag
  int[][][] POSbacktrace = new int[length][length + 1][numTags];
  for (int i = 0; i < length; i++) {
    for (int j = 0; j < length + 1; j++) {
      Arrays.fill(scores[i][j], Double.NEGATIVE_INFINITY);
    }
  }
  // first fill in word probabilities (for words of up to 10 characters)
  for (int diff = 1; diff <= 10; diff++) {
    for (int start = 0; start + diff <= length; start++) {
      int end = start + diff;
      StringBuilder wordBuf = new StringBuilder();
      for (int pos = start; pos < end; pos++) {
        wordBuf.append(s.charAt(pos));
      }
      String word = wordBuf.toString();
      for (String tag : POSes) {
        IntTaggedWord itw = new IntTaggedWord(word, tag, wordIndex, tagIndex);
        double score = lex.score(itw, 0, word, null);
        if (start == 0) {
          score += Math.log(initialPOSDist.probabilityOf(tag));
        }
        scores[start][end][itw.tag()] = score;
        splitBacktrace[start][end][itw.tag()] = end;
      }
    }
  }
  // now fill in word combination probabilities
  for (int diff = 2; diff <= length; diff++) {
    for (int start = 0; start + diff <= length; start++) {
      int end = start + diff;
      for (int split = start + 1; split < end && split - start <= 10; split++) {
        for (String tag : POSes) {
          int tagNum = tagIndex.indexOf(tag, true);
          if (splitBacktrace[start][split][tagNum] != split) {
            continue;
          }
          Distribution<String> rTagDist = markovPOSDists.get(tag);
          if (rTagDist == null) {
            continue; // this happens with "*" POS
          }
          for (String rTag : POSes) {
            int rTagNum = tagIndex.indexOf(rTag, true);
            double newScore =
                scores[start][split][tagNum]
                    + scores[split][end][rTagNum]
                    + Math.log(rTagDist.probabilityOf(rTag));
            if (newScore > scores[start][end][tagNum]) {
              scores[start][end][tagNum] = newScore;
              splitBacktrace[start][end][tagNum] = split;
              POSbacktrace[start][end][tagNum] = rTagNum;
            }
          }
        }
      }
    }
  }
  // reconstruct the best segmentation by walking the backtraces from the left
  int nextPOS = ArrayMath.argmax(scores[0][length]);
  ArrayList<HasWord> words = new ArrayList<HasWord>();
  int start = 0;
  while (start < length) {
    int split = splitBacktrace[start][length][nextPOS];
    StringBuilder wordBuf = new StringBuilder();
    for (int i = start; i < split; i++) {
      wordBuf.append(s.charAt(i));
    }
    String word = wordBuf.toString();
    // String tag = tagIndex.get(nextPOS);
    // words.add(new TaggedWord(word, tag));
    words.add(new Word(word));
    if (split < length) {
      nextPOS = POSbacktrace[start][length][nextPOS];
    }
    start = split;
  }
  return words;
}
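/*
 * A toy, self-contained sketch of the span-based DP behind
 * segmentWordsWithMarkov, with the tag dimension removed: best[end] holds the
 * best log-probability of segmenting s[0:end), built from a per-word scorer
 * and a backpointer array. The LEXICON and wordLogProb below are stand-in
 * assumptions for illustration, not the real lexicon model.
 */
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class MarkovSegmentSketch {
  // Hypothetical word scorer: known words are cheap, unknown ones heavily penalized.
  static final Set<String> LEXICON = new HashSet<String>(Arrays.asList("th", "the", "cat", "sat"));

  static double wordLogProb(String w) {
    return LEXICON.contains(w) ? -1.0 : -100.0 * w.length();
  }

  static List<String> segment(String s) {
    int n = s.length();
    double[] best = new double[n + 1];
    int[] back = new int[n + 1];
    Arrays.fill(best, Double.NEGATIVE_INFINITY);
    best[0] = 0;
    for (int end = 1; end <= n; end++) {
      // Like the real code, only consider words of up to 10 characters.
      for (int start = Math.max(0, end - 10); start < end; start++) {
        double cand = best[start] + wordLogProb(s.substring(start, end));
        if (cand > best[end]) {
          best[end] = cand;
          back[end] = start;
        }
      }
    }
    // Recover the segmentation by walking the backpointers from the right.
    ArrayList<String> words = new ArrayList<String>();
    for (int end = n; end > 0; end = back[end]) {
      words.add(0, s.substring(back[end], end));
    }
    return words;
  }

  public static void main(String[] args) {
    System.out.println(segment("thecatsat")); // prints [the, cat, sat]
  }
}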