Пример #1
0
  /**
   * Return information about the objects in this Tree.
   *
   * @param t The tree to examine.
   * @return A human-readable String
   */
  public static String toDebugStructureString(Tree t) {
    StringBuilder sb = new StringBuilder();
    String tCl = StringUtils.getShortClassName(t);
    String tfCl = StringUtils.getShortClassName(t.treeFactory());
    String lCl = StringUtils.getShortClassName(t.label());
    String lfCl = StringUtils.getShortClassName(t.label().labelFactory());
    Set<String> otherClasses = new HashSet<String>();
    for (Tree st : t) {
      String stCl = StringUtils.getShortClassName(st);
      String stfCl = StringUtils.getShortClassName(st.treeFactory());
      String slCl = StringUtils.getShortClassName(st.label());
      String slfCl = StringUtils.getShortClassName(st.label().labelFactory());

      if (!tCl.equals(stCl)) {
        otherClasses.add(stCl);
      }
      if (!tfCl.equals(stfCl)) {
        otherClasses.add(stfCl);
      }
      if (!lCl.equals(slCl)) {
        otherClasses.add(slCl);
      }
      if (!lfCl.equals(slfCl)) {
        otherClasses.add(slfCl);
      }
    }
    sb.append("Tree with root of class ").append(tCl).append(" and factory ").append(tfCl);
    sb.append(" with label class ").append(lCl).append(" and factory ").append(lfCl);
    if (!otherClasses.isEmpty()) {
      sb.append(" with the following classes also found within the tree: ").append(otherClasses);
    }
    return sb.toString();
  }
Пример #2
0
 static String escape(String s) {
   StringBuilder sb = new StringBuilder();
   for (int i = 0; i < s.length(); i++) {
     char c = s.charAt(i);
     if (c == '^') sb.append('\\');
     sb.append(c);
     if (c == '^') sb.append("{}");
   }
   return sb.toString();
 }
Пример #3
0
 protected String historyToString(List history) {
   String str = (String) historyToString.get(history);
   if (str == null) {
     StringBuilder sb = new StringBuilder();
     for (int i = 0; i < history.size(); i++) {
       sb.append('^');
       sb.append(history.get(i));
     }
     str = sb.toString();
     historyToString.put(history, str);
   }
   return str;
 }
  public ArrayList<String> getPairsFromSentence(String sentence) {
    Collection<TypedDependency> tdl = parseSentenceTDL(sentence);
    ArrayList<String> pairs = new ArrayList<String>();

    for (TypedDependency td : tdl) {
      StringBuilder sb = new StringBuilder();
      sb.append(td.gov().originalText());
      sb.append(" ");
      sb.append(td.dep().originalText());
      pairs.add(sb.toString());
    }

    return pairs;
  }
Пример #5
0
 public String project(String tagStr) {
   StringBuilder sb = new StringBuilder();
   boolean good = true;
   for (int pos = 0, len = tagStr.length(); pos < len; pos++) {
     char c = tagStr.charAt(pos);
     if (c == '-') {
       good = true;
     } else if (c == '^') {
       good = false;
     }
     if (good) {
       sb.append(c);
     }
   }
   String ret = sb.toString();
   // System.err.println("TTP mapped " + tagStr + " to " + ret);
   return ret;
 }
 private Distribution<Integer> getSegmentedWordLengthDistribution(Treebank tb) {
   // CharacterLevelTagExtender ext = new CharacterLevelTagExtender();
   ClassicCounter<Integer> c = new ClassicCounter<Integer>();
   for (Iterator iterator = tb.iterator(); iterator.hasNext(); ) {
     Tree gold = (Tree) iterator.next();
     StringBuilder goldChars = new StringBuilder();
     ArrayList goldYield = gold.yield();
     for (Iterator wordIter = goldYield.iterator(); wordIter.hasNext(); ) {
       Word word = (Word) wordIter.next();
       goldChars.append(word);
     }
     List<HasWord> ourWords = segment(goldChars.toString());
     for (int i = 0; i < ourWords.size(); i++) {
       c.incrementCount(Integer.valueOf(ourWords.get(i).word().length()));
     }
   }
   return Distribution.getDistribution(c);
 }
Пример #7
0
 protected String projectString(String str) {
   if (str.indexOf('@') == -1) {
     if (str.indexOf('^') == -1) {
       return str;
     }
     return str.substring(0, str.indexOf('^'));
   }
   StringBuilder sb = new StringBuilder();
   sb.append(str.substring(0, str.indexOf(' ')));
   if (str.indexOf('^') > -1) {
     // sb.append(str.substring(str.indexOf('^'),str.length()));
   }
   int num = -2;
   for (int i = 0; i < str.length(); i++) {
     if (str.charAt(i) == ' ') {
       num++;
     }
   }
   sb.append(" w " + num);
   return sb.toString();
 }
Пример #8
0
 private static int treeToLatexEvenHelper(
     Tree t,
     StringBuilder c,
     StringBuilder h,
     int n,
     int nextN,
     int indent,
     int curDepth,
     int maxDepth) {
   StringBuilder sb = new StringBuilder();
   for (int i = 0; i < indent; i++) sb.append("  ");
   h.append('\n').append(sb);
   int tDepth = t.depth();
   if (tDepth == 0 && tDepth + curDepth < maxDepth) {
     for (int pad = 0; pad < maxDepth - tDepth - curDepth; pad++) {
       h.append("{\\ntnode{pad}{}, ");
     }
   }
   h.append("{\\ntnode{z").append(n).append("}{").append(t.label()).append('}');
   if (!t.isLeaf()) {
     for (int k = 0; k < t.children().length; k++) {
       h.append(", ");
       c.append("\\nodeconnect{z").append(n).append("}{z").append(nextN).append("}\n");
       nextN =
           treeToLatexEvenHelper(
               t.children()[k], c, h, nextN, nextN + 1, indent + 1, curDepth + 1, maxDepth);
     }
   }
   if (tDepth == 0 && tDepth + curDepth < maxDepth) {
     for (int pad = 0; pad < maxDepth - tDepth - curDepth; pad++) {
       h.append('}');
     }
   }
   h.append('}');
   return nextN;
 }
Пример #9
0
 private static int treeToLatexHelper(
     Tree t, StringBuilder c, StringBuilder h, int n, int nextN, int indent) {
   StringBuilder sb = new StringBuilder();
   for (int i = 0; i < indent; i++) sb.append("  ");
   h.append('\n').append(sb);
   h.append("{\\")
       .append(t.isLeaf() ? "" : "n")
       .append("tnode{z")
       .append(n)
       .append("}{")
       .append(t.label())
       .append('}');
   if (!t.isLeaf()) {
     for (int k = 0; k < t.children().length; k++) {
       h.append(", ");
       c.append("\\nodeconnect{z").append(n).append("}{z").append(nextN).append("}\n");
       nextN = treeToLatexHelper(t.children()[k], c, h, nextN, nextN + 1, indent + 1);
     }
   }
   h.append('}');
   return nextN;
 }
  /**
   * Do max language model markov segmentation. Note that this algorithm inherently tags words as it
   * goes, but that we throw away the tags in the final result so that the segmented words are
   * untagged. (Note: for a couple of years till Aug 2007, a tagged result was returned, but this
   * messed up the parser, because it could use no tagging but the given tagging, which often wasn't
   * very good. Or in particular it was a subcategorized tagging which never worked with the current
   * forceTags option which assumes that gold taggings are inherently basic taggings.)
   *
   * @param s A String to segment
   * @return The list of segmented words.
   */
  private ArrayList<HasWord> segmentWordsWithMarkov(String s) {
    int length = s.length();
    //    Set<String> POSes = (Set<String>) POSDistribution.keySet();  // 1.5
    int numTags = POSes.size();
    // score of span with initial word of this tag
    double[][][] scores = new double[length][length + 1][numTags];
    // best (length of) first word for this span with this tag
    int[][][] splitBacktrace = new int[length][length + 1][numTags];
    // best tag for second word over this span, if first is this tag
    int[][][] POSbacktrace = new int[length][length + 1][numTags];
    for (int i = 0; i < length; i++) {
      for (int j = 0; j < length + 1; j++) {
        Arrays.fill(scores[i][j], Double.NEGATIVE_INFINITY);
      }
    }
    // first fill in word probabilities
    for (int diff = 1; diff <= 10; diff++) {
      for (int start = 0; start + diff <= length; start++) {
        int end = start + diff;
        StringBuilder wordBuf = new StringBuilder();
        for (int pos = start; pos < end; pos++) {
          wordBuf.append(s.charAt(pos));
        }
        String word = wordBuf.toString();
        for (String tag : POSes) {
          IntTaggedWord itw = new IntTaggedWord(word, tag, wordIndex, tagIndex);
          double score = lex.score(itw, 0, word, null);
          if (start == 0) {
            score += Math.log(initialPOSDist.probabilityOf(tag));
          }
          scores[start][end][itw.tag()] = score;
          splitBacktrace[start][end][itw.tag()] = end;
        }
      }
    }
    // now fill in word combination probabilities
    for (int diff = 2; diff <= length; diff++) {
      for (int start = 0; start + diff <= length; start++) {
        int end = start + diff;
        for (int split = start + 1; split < end && split - start <= 10; split++) {
          for (String tag : POSes) {
            int tagNum = tagIndex.indexOf(tag, true);
            if (splitBacktrace[start][split][tagNum] != split) {
              continue;
            }
            Distribution<String> rTagDist = markovPOSDists.get(tag);
            if (rTagDist == null) {
              continue; // this happens with "*" POS
            }
            for (String rTag : POSes) {
              int rTagNum = tagIndex.indexOf(rTag, true);
              double newScore =
                  scores[start][split][tagNum]
                      + scores[split][end][rTagNum]
                      + Math.log(rTagDist.probabilityOf(rTag));
              if (newScore > scores[start][end][tagNum]) {
                scores[start][end][tagNum] = newScore;
                splitBacktrace[start][end][tagNum] = split;
                POSbacktrace[start][end][tagNum] = rTagNum;
              }
            }
          }
        }
      }
    }
    int nextPOS = ArrayMath.argmax(scores[0][length]);
    ArrayList<HasWord> words = new ArrayList<HasWord>();

    int start = 0;
    while (start < length) {
      int split = splitBacktrace[start][length][nextPOS];
      StringBuilder wordBuf = new StringBuilder();
      for (int i = start; i < split; i++) {
        wordBuf.append(s.charAt(i));
      }
      String word = wordBuf.toString();
      // String tag = tagIndex.get(nextPOS);
      // words.add(new TaggedWord(word, tag));
      words.add(new Word(word));
      if (split < length) {
        nextPOS = POSbacktrace[start][length][nextPOS];
      }
      start = split;
    }

    return words;
  }
  // CDM 2007: I wonder what this does differently from segmentWordsWithMarkov???
  private ArrayList<TaggedWord> basicSegmentWords(String s) {
    int length = s.length();
    //    Set<String> POSes = (Set<String>) POSDistribution.keySet();  // 1.5
    // best score of span
    double[][] scores = new double[length][length + 1];
    // best (last index of) first word for this span
    int[][] splitBacktrace = new int[length][length + 1];
    // best tag for word over this span
    int[][] POSbacktrace = new int[length][length + 1];
    for (int i = 0; i < length; i++) {
      Arrays.fill(scores[i], Double.NEGATIVE_INFINITY);
    }
    // first fill in word probabilities
    for (int diff = 1; diff <= 10; diff++) {
      for (int start = 0; start + diff <= length; start++) {
        int end = start + diff;
        StringBuilder wordBuf = new StringBuilder();
        for (int pos = start; pos < end; pos++) {
          wordBuf.append(s.charAt(pos));
        }
        String word = wordBuf.toString();
        //        for (String tag : POSes) {  // 1.5
        for (Iterator<String> iter = POSes.iterator(); iter.hasNext(); ) {
          String tag = iter.next();
          IntTaggedWord itw = new IntTaggedWord(word, tag, wordIndex, tagIndex);
          double newScore =
              lex.score(itw, 0, word, null) + Math.log(lex.getPOSDistribution().probabilityOf(tag));
          if (newScore > scores[start][end]) {
            scores[start][end] = newScore;
            splitBacktrace[start][end] = end;
            POSbacktrace[start][end] = itw.tag();
          }
        }
      }
    }
    // now fill in word combination probabilities
    for (int diff = 2; diff <= length; diff++) {
      for (int start = 0; start + diff <= length; start++) {
        int end = start + diff;
        for (int split = start + 1; split < end && split - start <= 10; split++) {
          if (splitBacktrace[start][split] != split) {
            continue; // only consider words on left
          }
          double newScore = scores[start][split] + scores[split][end];
          if (newScore > scores[start][end]) {
            scores[start][end] = newScore;
            splitBacktrace[start][end] = split;
          }
        }
      }
    }

    List<TaggedWord> words = new ArrayList<TaggedWord>();
    int start = 0;
    while (start < length) {
      int end = splitBacktrace[start][length];
      StringBuilder wordBuf = new StringBuilder();
      for (int pos = start; pos < end; pos++) {
        wordBuf.append(s.charAt(pos));
      }
      String word = wordBuf.toString();
      String tag = tagIndex.get(POSbacktrace[start][end]);

      words.add(new TaggedWord(word, tag));
      start = end;
    }

    return new ArrayList<TaggedWord>(words);
  }