/** * Return information about the objects in this Tree. * * @param t The tree to examine. * @return A human-readable String */ public static String toDebugStructureString(Tree t) { StringBuilder sb = new StringBuilder(); String tCl = StringUtils.getShortClassName(t); String tfCl = StringUtils.getShortClassName(t.treeFactory()); String lCl = StringUtils.getShortClassName(t.label()); String lfCl = StringUtils.getShortClassName(t.label().labelFactory()); Set<String> otherClasses = new HashSet<String>(); for (Tree st : t) { String stCl = StringUtils.getShortClassName(st); String stfCl = StringUtils.getShortClassName(st.treeFactory()); String slCl = StringUtils.getShortClassName(st.label()); String slfCl = StringUtils.getShortClassName(st.label().labelFactory()); if (!tCl.equals(stCl)) { otherClasses.add(stCl); } if (!tfCl.equals(stfCl)) { otherClasses.add(stfCl); } if (!lCl.equals(slCl)) { otherClasses.add(slCl); } if (!lfCl.equals(slfCl)) { otherClasses.add(slfCl); } } sb.append("Tree with root of class ").append(tCl).append(" and factory ").append(tfCl); sb.append(" with label class ").append(lCl).append(" and factory ").append(lfCl); if (!otherClasses.isEmpty()) { sb.append(" with the following classes also found within the tree: ").append(otherClasses); } return sb.toString(); }
static String escape(String s) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < s.length(); i++) { char c = s.charAt(i); if (c == '^') sb.append('\\'); sb.append(c); if (c == '^') sb.append("{}"); } return sb.toString(); }
protected String historyToString(List history) { String str = (String) historyToString.get(history); if (str == null) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < history.size(); i++) { sb.append('^'); sb.append(history.get(i)); } str = sb.toString(); historyToString.put(history, str); } return str; }
public ArrayList<String> getPairsFromSentence(String sentence) { Collection<TypedDependency> tdl = parseSentenceTDL(sentence); ArrayList<String> pairs = new ArrayList<String>(); for (TypedDependency td : tdl) { StringBuilder sb = new StringBuilder(); sb.append(td.gov().originalText()); sb.append(" "); sb.append(td.dep().originalText()); pairs.add(sb.toString()); } return pairs; }
public String project(String tagStr) { StringBuilder sb = new StringBuilder(); boolean good = true; for (int pos = 0, len = tagStr.length(); pos < len; pos++) { char c = tagStr.charAt(pos); if (c == '-') { good = true; } else if (c == '^') { good = false; } if (good) { sb.append(c); } } String ret = sb.toString(); // System.err.println("TTP mapped " + tagStr + " to " + ret); return ret; }
private Distribution<Integer> getSegmentedWordLengthDistribution(Treebank tb) { // CharacterLevelTagExtender ext = new CharacterLevelTagExtender(); ClassicCounter<Integer> c = new ClassicCounter<Integer>(); for (Iterator iterator = tb.iterator(); iterator.hasNext(); ) { Tree gold = (Tree) iterator.next(); StringBuilder goldChars = new StringBuilder(); ArrayList goldYield = gold.yield(); for (Iterator wordIter = goldYield.iterator(); wordIter.hasNext(); ) { Word word = (Word) wordIter.next(); goldChars.append(word); } List<HasWord> ourWords = segment(goldChars.toString()); for (int i = 0; i < ourWords.size(); i++) { c.incrementCount(Integer.valueOf(ourWords.get(i).word().length())); } } return Distribution.getDistribution(c); }
protected String projectString(String str) { if (str.indexOf('@') == -1) { if (str.indexOf('^') == -1) { return str; } return str.substring(0, str.indexOf('^')); } StringBuilder sb = new StringBuilder(); sb.append(str.substring(0, str.indexOf(' '))); if (str.indexOf('^') > -1) { // sb.append(str.substring(str.indexOf('^'),str.length())); } int num = -2; for (int i = 0; i < str.length(); i++) { if (str.charAt(i) == ' ') { num++; } } sb.append(" w " + num); return sb.toString(); }
private static int treeToLatexEvenHelper( Tree t, StringBuilder c, StringBuilder h, int n, int nextN, int indent, int curDepth, int maxDepth) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < indent; i++) sb.append(" "); h.append('\n').append(sb); int tDepth = t.depth(); if (tDepth == 0 && tDepth + curDepth < maxDepth) { for (int pad = 0; pad < maxDepth - tDepth - curDepth; pad++) { h.append("{\\ntnode{pad}{}, "); } } h.append("{\\ntnode{z").append(n).append("}{").append(t.label()).append('}'); if (!t.isLeaf()) { for (int k = 0; k < t.children().length; k++) { h.append(", "); c.append("\\nodeconnect{z").append(n).append("}{z").append(nextN).append("}\n"); nextN = treeToLatexEvenHelper( t.children()[k], c, h, nextN, nextN + 1, indent + 1, curDepth + 1, maxDepth); } } if (tDepth == 0 && tDepth + curDepth < maxDepth) { for (int pad = 0; pad < maxDepth - tDepth - curDepth; pad++) { h.append('}'); } } h.append('}'); return nextN; }
private static int treeToLatexHelper( Tree t, StringBuilder c, StringBuilder h, int n, int nextN, int indent) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < indent; i++) sb.append(" "); h.append('\n').append(sb); h.append("{\\") .append(t.isLeaf() ? "" : "n") .append("tnode{z") .append(n) .append("}{") .append(t.label()) .append('}'); if (!t.isLeaf()) { for (int k = 0; k < t.children().length; k++) { h.append(", "); c.append("\\nodeconnect{z").append(n).append("}{z").append(nextN).append("}\n"); nextN = treeToLatexHelper(t.children()[k], c, h, nextN, nextN + 1, indent + 1); } } h.append('}'); return nextN; }
/** * Do max language model markov segmentation. Note that this algorithm inherently tags words as it * goes, but that we throw away the tags in the final result so that the segmented words are * untagged. (Note: for a couple of years till Aug 2007, a tagged result was returned, but this * messed up the parser, because it could use no tagging but the given tagging, which often wasn't * very good. Or in particular it was a subcategorized tagging which never worked with the current * forceTags option which assumes that gold taggings are inherently basic taggings.) * * @param s A String to segment * @return The list of segmented words. */ private ArrayList<HasWord> segmentWordsWithMarkov(String s) { int length = s.length(); // Set<String> POSes = (Set<String>) POSDistribution.keySet(); // 1.5 int numTags = POSes.size(); // score of span with initial word of this tag double[][][] scores = new double[length][length + 1][numTags]; // best (length of) first word for this span with this tag int[][][] splitBacktrace = new int[length][length + 1][numTags]; // best tag for second word over this span, if first is this tag int[][][] POSbacktrace = new int[length][length + 1][numTags]; for (int i = 0; i < length; i++) { for (int j = 0; j < length + 1; j++) { Arrays.fill(scores[i][j], Double.NEGATIVE_INFINITY); } } // first fill in word probabilities for (int diff = 1; diff <= 10; diff++) { for (int start = 0; start + diff <= length; start++) { int end = start + diff; StringBuilder wordBuf = new StringBuilder(); for (int pos = start; pos < end; pos++) { wordBuf.append(s.charAt(pos)); } String word = wordBuf.toString(); for (String tag : POSes) { IntTaggedWord itw = new IntTaggedWord(word, tag, wordIndex, tagIndex); double score = lex.score(itw, 0, word, null); if (start == 0) { score += Math.log(initialPOSDist.probabilityOf(tag)); } scores[start][end][itw.tag()] = score; splitBacktrace[start][end][itw.tag()] = end; } } } // now fill in word combination probabilities for (int diff = 2; diff <= length; diff++) { for (int start = 0; start + diff <= length; start++) { int end = start + diff; for (int split = start + 1; split < end && split - start <= 10; split++) { for (String tag : POSes) { int tagNum = tagIndex.indexOf(tag, true); if (splitBacktrace[start][split][tagNum] != split) { continue; } Distribution<String> rTagDist = markovPOSDists.get(tag); if (rTagDist == null) { continue; // this happens with "*" POS } for (String rTag : POSes) { int rTagNum = tagIndex.indexOf(rTag, true); double newScore = scores[start][split][tagNum] + scores[split][end][rTagNum] + Math.log(rTagDist.probabilityOf(rTag)); if (newScore > scores[start][end][tagNum]) { scores[start][end][tagNum] = newScore; splitBacktrace[start][end][tagNum] = split; POSbacktrace[start][end][tagNum] = rTagNum; } } } } } } int nextPOS = ArrayMath.argmax(scores[0][length]); ArrayList<HasWord> words = new ArrayList<HasWord>(); int start = 0; while (start < length) { int split = splitBacktrace[start][length][nextPOS]; StringBuilder wordBuf = new StringBuilder(); for (int i = start; i < split; i++) { wordBuf.append(s.charAt(i)); } String word = wordBuf.toString(); // String tag = tagIndex.get(nextPOS); // words.add(new TaggedWord(word, tag)); words.add(new Word(word)); if (split < length) { nextPOS = POSbacktrace[start][length][nextPOS]; } start = split; } return words; }
// CDM 2007: I wonder what this does differently from segmentWordsWithMarkov??? private ArrayList<TaggedWord> basicSegmentWords(String s) { int length = s.length(); // Set<String> POSes = (Set<String>) POSDistribution.keySet(); // 1.5 // best score of span double[][] scores = new double[length][length + 1]; // best (last index of) first word for this span int[][] splitBacktrace = new int[length][length + 1]; // best tag for word over this span int[][] POSbacktrace = new int[length][length + 1]; for (int i = 0; i < length; i++) { Arrays.fill(scores[i], Double.NEGATIVE_INFINITY); } // first fill in word probabilities for (int diff = 1; diff <= 10; diff++) { for (int start = 0; start + diff <= length; start++) { int end = start + diff; StringBuilder wordBuf = new StringBuilder(); for (int pos = start; pos < end; pos++) { wordBuf.append(s.charAt(pos)); } String word = wordBuf.toString(); // for (String tag : POSes) { // 1.5 for (Iterator<String> iter = POSes.iterator(); iter.hasNext(); ) { String tag = iter.next(); IntTaggedWord itw = new IntTaggedWord(word, tag, wordIndex, tagIndex); double newScore = lex.score(itw, 0, word, null) + Math.log(lex.getPOSDistribution().probabilityOf(tag)); if (newScore > scores[start][end]) { scores[start][end] = newScore; splitBacktrace[start][end] = end; POSbacktrace[start][end] = itw.tag(); } } } } // now fill in word combination probabilities for (int diff = 2; diff <= length; diff++) { for (int start = 0; start + diff <= length; start++) { int end = start + diff; for (int split = start + 1; split < end && split - start <= 10; split++) { if (splitBacktrace[start][split] != split) { continue; // only consider words on left } double newScore = scores[start][split] + scores[split][end]; if (newScore > scores[start][end]) { scores[start][end] = newScore; splitBacktrace[start][end] = split; } } } } List<TaggedWord> words = new ArrayList<TaggedWord>(); int start = 0; while (start < length) { int end = splitBacktrace[start][length]; StringBuilder wordBuf = new StringBuilder(); for (int pos = start; pos < end; pos++) { wordBuf.append(s.charAt(pos)); } String word = wordBuf.toString(); String tag = tagIndex.get(POSbacktrace[start][end]); words.add(new TaggedWord(word, tag)); start = end; } return new ArrayList<TaggedWord>(words); }