/**
 * Prunes each tree of {@code treeList} in order, passing every tree the word offset at which its
 * yield begins.
 *
 * @param treeList the trees to prune, in left-to-right order
 * @param start the offset handed to {@code prune} for the first tree; each later tree receives
 *     this offset advanced by the yield sizes of the trees before it
 * @return a new list holding the result of {@code prune} for each input tree, in the same order
 */
private List<Tree> helper(List<Tree> treeList, int start) {
   List<Tree> prunedTrees = new ArrayList<Tree>(treeList.size());
   int offset = start;
   for (Tree subtree : treeList) {
     // Measure the yield before pruning, exactly as the offset bookkeeping requires.
     int span = subtree.yield().size();
     prunedTrees.add(prune(subtree, offset));
     offset += span;
   }
   return prunedTrees;
 }
 /**
  * Reports whether any element of {@code tags} has a string form beginning with {@code "V"}.
  *
  * <p>Null elements are skipped instead of being dereferenced, so a null entry cannot mask a
  * matching element that appears later in the list.
  *
  * @param tags the elements to inspect; each non-null element is examined via {@code toString()}
  * @return {@code true} if at least one non-null element's string form starts with {@code "V"},
  *     {@code false} otherwise (including for an empty list)
  */
 // TODO: Rewrite this as general matching predicate
 private static boolean hasV(List<?> tags) {
   for (Object tag : tags) {
     if (tag != null && tag.toString().startsWith("V")) {
       return true;
     }
   }
   return false;
 }
 /**
  * Builds a copy of a tagged-word list in which every tag has been passed through
  * {@code tlp.basicCategory}.
  *
  * @param twList the source list; every element is cast to {@link TaggedWord}
  * @param tlp the language pack whose {@code basicCategory} is applied to each tag
  * @return a new list of new {@link TaggedWord}s with the original words and the converted tags,
  *     in the same order as {@code twList}
  */
 private static List<TaggedWord> cleanTags(List twList, TreebankLanguagePack tlp) {
   List<TaggedWord> cleaned = new ArrayList<TaggedWord>(twList.size());
   for (Object element : twList) {
     TaggedWord source = (TaggedWord) element;
     cleaned.add(new TaggedWord(source.word(), tlp.basicCategory(source.tag())));
   }
   return cleaned;
 }
 /**
  * Flattens "extra" children out of a list of sibling trees by comparing them against the parse
  * that {@code parser.extractBestParse} returns for the same label and span.
  *
  * <p>A child is replaced by its own children when all of the following hold: the parent label
  * starts with {@code "NP^NP"}, the reference parse and the candidate tree have yields of equal
  * size, the child is neither a leaf nor a preterminal, and the child's labeled span is neither
  * contained in nor crossing the reference parse's constituents. Every other child is kept as is.
  * Each removal is reported on {@code System.err}.
  *
  * @param treeList the sibling trees to examine; returned unchanged when it has fewer than two
  *     elements
  * @param label the label of the node that would dominate {@code treeList}
  * @param start start offset passed to {@code parser.extractBestParse}; also added to the child
  *     offsets shown in the diagnostic output
  * @param end end offset passed to {@code parser.extractBestParse}
  * @return {@code treeList} itself when it has at most one element, otherwise a new list of the
  *     retained children with pruned children replaced by their own children
  */
 List<Tree> prune(List<Tree> treeList, Label label, int start, int end) {
   // Nothing can be flattened with zero or one child; this also avoids get(0) on an empty list.
   if (treeList.size() <= 1) {
     return treeList;
   }
   // get reference tree
   Tree testTree = treeList.get(0).treeFactory().newTreeNode(label, treeList);
   int goal = Numberer.getGlobalNumberer("states").number(label.value());
   Tree tempTree = parser.extractBestParse(goal, start, end);
   // parser.restoreUnaries(tempTree);
   Tree pcfgTree = debinarizer.transformTree(tempTree);
   Set<Constituent> pcfgConstituents =
       pcfgTree.constituents(new LabeledScoredConstituentFactory());

   // These two conditions do not depend on the child, so evaluate them once up front.
   boolean pruningAllowed =
       pcfgTree.yield().size() == testTree.yield().size()
           && label.value().startsWith("NP^NP");

   // delete child labels that are not in reference but do not cross reference
   List<Tree> prunedChildren = new ArrayList<Tree>();
   int childStart = 0;
   for (int c = 0, numCh = testTree.numChildren(); c < numCh; c++) {
     Tree child = testTree.getChild(c);
     int childEnd = childStart + child.yield().size();

     boolean isExtra = false;
     if (pruningAllowed && !child.isLeaf() && !child.isPreTerminal()) {
       // Child offsets are relative to the first word of testTree.
       Constituent childConstituent =
           new LabeledScoredConstituent(childStart, childEnd, child.label(), 0);
       isExtra =
           !pcfgConstituents.contains(childConstituent)
               && !childConstituent.crosses(pcfgConstituents);
     }

     if (isExtra) {
       System.err.println(
           "Pruning: "
               + child.label()
               + " from "
               + (childStart + start)
               + " to "
               + (childEnd + start));
       System.err.println("Was: " + testTree + " vs " + pcfgTree);
       // Splice the grandchildren in place of the pruned child.
       prunedChildren.addAll(child.getChildrenAsList());
     } else {
       prunedChildren.add(child);
     }
     childStart = childEnd;
   }
   return prunedChildren;
 }
 /**
  * Renders a history list as a string in which every element is preceded by {@code '^'}, caching
  * the result in the {@code historyToString} map.
  *
  * <p>The cache key is a private copy of {@code history}. Callers in this file pass
  * {@code subList} views; storing such a view as a map key would tie the cache to the backing
  * list and keep that list reachable for as long as the cache lives.
  *
  * @param history the elements to render; an empty list yields the empty string
  * @return the concatenation of {@code '^'} followed by each element, in order
  */
 protected String historyToString(List history) {
   String str = (String) historyToString.get(history);
   if (str == null) {
     StringBuilder sb = new StringBuilder();
     for (Object ancestor : history) {
       sb.append('^');
       sb.append(ancestor);
     }
     str = sb.toString();
     // Key on an independent copy, never on the caller's (possibly view) list.
     historyToString.put(new ArrayList(history), str);
   }
   return str;
 }
  /**
   * Expands every tallied (rule, history) pair into history-specific rules and packs them into a
   * unary and a binary grammar.
   *
   * <p>For each pair, history prefixes of length 1 up to the smaller of {@code HISTORY_DEPTH()}
   * and the history length are visited. At each depth the term
   * {@code (1.0 / HISTORY_DEPTH()) * c_rule / c_label} is added to a running total, and each rule
   * produced by {@code specifyRule} at that depth gets the natural log of the running total as
   * its score. Progress is written to {@code System.err} every 100 pairs, and the total state
   * count is written to {@code System.out}.
   *
   * @return a {@code Pair} whose first element is the {@code UnaryGrammar} and whose second
   *     element is the {@code BinaryGrammar}
   */
  public Object formResult() {
    Set binaryRules = new HashSet();
    Set unaryRules = new HashSet();

    // Walk over every tallied rule / history pair.
    int processed = 0;
    Iterator pairIter = rulePairs.keySet().iterator();
    while (pairIter.hasNext()) {
      if (processed % 100 == 0) {
        System.err.println("Rules multiplied: " + processed);
      }
      processed++;

      Pair entry = (Pair) pairIter.next();
      Rule baseRule = (Rule) entry.first;
      String baseLabel = (String) ruleToLabel.get(baseRule);
      List history = (List) entry.second;

      double cumulativeProb = 0;
      for (int depth = 1; depth <= HISTORY_DEPTH() && depth <= history.size(); depth++) {
        List prefix = history.subList(0, depth);
        double labelCount = labelPairs.getCount(new Pair(baseLabel, prefix));
        double ruleCountAtDepth = rulePairs.getCount(new Pair(baseRule, prefix));

        // Each depth contributes an equally weighted relative-frequency term.
        cumulativeProb += (1.0 / HISTORY_DEPTH()) * (ruleCountAtDepth) / (labelCount);

        int childDepth = 0;
        while (childDepth <= Math.min(HISTORY_DEPTH() - 1, depth)) {
          Rule specified = specifyRule(baseRule, prefix, childDepth);
          specified.score = (float) Math.log(cumulativeProb);
          if (specified instanceof UnaryRule) {
            unaryRules.add(specified);
          } else {
            binaryRules.add(specified);
          }
          childDepth++;
        }
      }
    }

    System.out.println("Total states: " + stateNumberer.total());
    BinaryGrammar bg = new BinaryGrammar(stateNumberer.total());
    UnaryGrammar ug = new UnaryGrammar(stateNumberer.total());
    for (Object candidate : binaryRules) {
      bg.addRule((BinaryRule) candidate);
    }
    for (Object candidate : unaryRules) {
      ug.addRule((UnaryRule) candidate);
    }
    return new Pair(ug, bg);
  }
 /**
  * Records the rule rooted at {@code lt} once for every history prefix of {@code parents}.
  *
  * <p>The base rule is mapped to the node's label in {@code ruleToLabel}. Then, for each prefix
  * length from 0 up to the smaller of {@code HISTORY_DEPTH()} and {@code parents.size()}
  * (inclusive), the counts in {@code rulePairs} and {@code labelPairs} are incremented by one.
  *
  * @param lt the tree node whose label and rule are tallied
  * @param parents the history list; only its leading elements are used, copied per prefix
  */
 protected void tallyInternalNode(Tree lt, List parents) {
   // Derive the base rule and remember which label it rewrites.
   String nodeLabel = lt.label().value();
   Rule baseRule = ltToRule(lt);
   ruleToLabel.put(baseRule, nodeLabel);

   // Count the rule and the label under every history prefix, shortest first.
   final int deepest = Math.min(HISTORY_DEPTH(), parents.size());
   int depth = 0;
   while (depth <= deepest) {
     // Copy the prefix so the stored key is independent of the caller's list.
     List prefix = new ArrayList(parents.subList(0, depth));
     rulePairs.incrementCount(new Pair(baseRule, prefix), 1);
     labelPairs.incrementCount(new Pair(nodeLabel, prefix), 1);
     depth++;
   }
 }
  /**
   * Splits a parse tree into phrases of at most {@code phraseSizeLimit} leaves.
   *
   * <p>A subtree whose leaf count is within the limit contributes the result of
   * {@code Preprocess} applied to its tagged yield, unless that result is empty. A larger subtree
   * contributes nothing itself and is searched recursively through its children.
   *
   * @param parse the tree to split
   * @param phraseSizeLimit the largest number of leaves a subtree may have to be taken as a phrase
   * @return the collected phrases in left-to-right order; never {@code null}
   */
  public static ArrayList<ArrayList<TaggedWord>> getPhrases(Tree parse, int phraseSizeLimit) {
    ArrayList<ArrayList<TaggedWord>> phrases = new ArrayList<ArrayList<TaggedWord>>();

    if (parse.getLeaves().size() > phraseSizeLimit) {
      // Too large to be a phrase: descend into each child in turn.
      for (Tree child : parse.children()) {
        phrases.addAll(getPhrases(child, phraseSizeLimit));
      }
      return phrases;
    }

    ArrayList<TaggedWord> words = Preprocess(parse.taggedYield());
    if (!words.isEmpty()) {
      phrases.add(words);
    }
    return phrases;
  }
Exemple #9
0
  /**
   * Reads tsurgeon operations (no tregex pattern) line by line from {@code reader} and combines
   * them into a single operation.
   *
   * <p>Reading stops at end of input or at the first line matching {@code emptyLinePattern}.
   * Comments are stripped from every other line with {@code removeComments}; a line that matches
   * {@code emptyLinePattern} after stripping is skipped, and each remaining line is parsed as one
   * operation.
   *
   * @param reader the source of operation lines
   * @return the operations read, combined by {@code collectOperations}
   * @throws IOException if reading a line from {@code reader} fails
   * @throws TsurgeonParseException if no operation was read
   */
  public static TsurgeonPattern getTsurgeonOperationsFromReader(BufferedReader reader)
      throws IOException {
    List<TsurgeonPattern> parsedOps = new ArrayList<TsurgeonPattern>();

    String rawLine = reader.readLine();
    while (rawLine != null && !emptyLinePattern.matcher(rawLine).matches()) {
      String stripped = removeComments(rawLine);
      // A comment-only line carries no operation; move on to the next one.
      if (!emptyLinePattern.matcher(stripped).matches()) {
        parsedOps.add(parseOperation(stripped));
      }
      rawLine = reader.readLine();
    }

    if (parsedOps.isEmpty()) {
      throw new TsurgeonParseException("No Tsurgeon operation provided.");
    }
    return collectOperations(parsedOps);
  }
Exemple #10
0
 /**
  * Creates a history-annotated copy of {@code rule}.
  *
  * <p>Two suffixes are derived from {@code history}: one from all elements after the first, and
  * one from the first {@code childDepth} elements. The parent state and any child for which
  * {@code isSynthetic} holds are renumbered with the first suffix appended to their state
  * object. A child for which {@code isTag} holds keeps its original number. Every other child is
  * renumbered with the second suffix appended.
  *
  * @param rule the rule to specialise; treated as a {@code BinaryRule} unless it is a
  *     {@code UnaryRule}
  * @param history the history list; must be non-empty because its tail from index 1 is taken
  * @param childDepth how many leading history elements annotate ordinary children
  * @return a new {@code UnaryRule} or {@code BinaryRule} carrying the renumbered states
  */
 protected Rule specifyRule(Rule rule, List history, int childDepth) {
   final String parentSuffix = historyToString(history.subList(1, history.size()));
   final String childSuffix = historyToString(history.subList(0, childDepth));

   if (!(rule instanceof UnaryRule)) {
     BinaryRule specified = new BinaryRule();
     BinaryRule source = (BinaryRule) rule;
     specified.parent = stateNumberer.number(stateNumberer.object(source.parent) + parentSuffix);

     // Left child: synthetic states follow the parent, tags stay as they are.
     if (isSynthetic(source.leftChild)) {
       specified.leftChild =
           stateNumberer.number(stateNumberer.object(source.leftChild) + parentSuffix);
     } else if (isTag(source.leftChild)) {
       specified.leftChild = source.leftChild;
     } else {
       specified.leftChild =
           stateNumberer.number(stateNumberer.object(source.leftChild) + childSuffix);
     }

     // Right child: same three-way treatment.
     if (isSynthetic(source.rightChild)) {
       specified.rightChild =
           stateNumberer.number(stateNumberer.object(source.rightChild) + parentSuffix);
     } else if (isTag(source.rightChild)) {
       specified.rightChild = source.rightChild;
     } else {
       specified.rightChild =
           stateNumberer.number(stateNumberer.object(source.rightChild) + childSuffix);
     }
     return specified;
   }

   UnaryRule specified = new UnaryRule();
   UnaryRule source = (UnaryRule) rule;
   specified.parent = stateNumberer.number(stateNumberer.object(source.parent) + parentSuffix);
   if (isSynthetic(source.child)) {
     specified.child = stateNumberer.number(stateNumberer.object(source.child) + parentSuffix);
   } else if (isTag(source.child)) {
     specified.child = source.child;
   } else {
     specified.child = stateNumberer.number(stateNumberer.object(source.child) + childSuffix);
   }
   return specified;
 }
Exemple #11
0
 /**
  * Bundles several operation patterns into one compound operation that applies them in sequence.
  * Bundling is required so that properties shared across the sequence are tracked: for example,
  * inserting a named node and then coindexing it with another node only works when both
  * operations live in the same TsurgeonPattern, because only then is the new node's name known
  * to the coindexation step.
  *
  * @param patterns the {@link TsurgeonPattern} operations to combine, in the order they should
  *     be applied
  * @return a new {@link TsurgeonPattern} that carries out every operation of
  *     <code>patterns</code> in order
  */
 public static TsurgeonPattern collectOperations(List<TsurgeonPattern> patterns) {
   TsurgeonPattern[] sequence = patterns.toArray(new TsurgeonPattern[patterns.size()]);
   return new TsurgeonPatternRoot(sequence);
 }