/**
  * Create a searcher manually, suppling a dependency tree, an optional classifier for when to
  * split clauses, and a featurizer for that classifier. You almost certainly want to use {@link
  * ClauseSplitter#load(String)} instead of this constructor.
  *
  * @param tree The dependency tree to search over.
  * @param assumedTruth The assumed truth of the tree (relevant for natural logic inference). If in
  *     doubt, pass in true.
  * @param isClauseClassifier The classifier for whether a given dependency arc should be a new
  *     clause. If this is not given, all arcs are treated as clause separators.
  * @param featurizer The featurizer for the classifier. If no featurizer is given, one should be
  *     given in {@link ClauseSplitterSearchProblem#search(java.util.function.Predicate,
  *     Classifier, Map, java.util.function.Function, int)}, or else the classifier will be
  *     useless.
  * @see ClauseSplitter#load(String)
  */
 protected ClauseSplitterSearchProblem(
     SemanticGraph tree,
     boolean assumedTruth,
     Optional<Classifier<ClauseSplitter.ClauseClassifierLabel, String>> isClauseClassifier,
     Optional<
             Function<
                 Triple<
                     ClauseSplitterSearchProblem.State,
                     ClauseSplitterSearchProblem.Action,
                     ClauseSplitterSearchProblem.State>,
                 Counter<String>>>
         featurizer) {
   this.tree = new SemanticGraph(tree);
   this.assumedTruth = assumedTruth;
   this.isClauseClassifier = isClauseClassifier;
   this.featurizer = featurizer;
   // Index edges
   this.tree.edgeIterable().forEach(edgeToIndex::addToIndex);
   // Get length
   List<IndexedWord> sortedVertices = tree.vertexListSorted();
   sentenceLength = sortedVertices.get(sortedVertices.size() - 1).index();
   // Register extra edges
   for (IndexedWord vertex : sortedVertices) {
     extraEdgesByGovernor.put(vertex, new ArrayList<>());
     extraEdgesByDependent.put(vertex, new ArrayList<>());
   }
   List<SemanticGraphEdge> extraEdges = Util.cleanTree(this.tree);
   assert Util.isTree(this.tree);
   for (SemanticGraphEdge edge : extraEdges) {
     extraEdgesByGovernor.get(edge.getGovernor()).add(edge);
     extraEdgesByDependent.get(edge.getDependent()).add(edge);
   }
 }
Esempio n. 2
0
  /**
   * A little utility function to make sure a SemanticGraph is a tree.
   *
   * @param tree The tree to check.
   * @return True if this {@link edu.stanford.nlp.semgraph.SemanticGraph} is a tree (versus a DAG,
   *     or Graph).
   */
  public static boolean isTree(SemanticGraph tree) {
    for (IndexedWord vertex : tree.vertexSet()) {
      // Check one and only one incoming edge
      if (tree.getRoots().contains(vertex)) {
        if (tree.incomingEdgeIterator(vertex).hasNext()) {
          return false;
        }
      } else {
        Iterator<SemanticGraphEdge> iter = tree.incomingEdgeIterator(vertex);
        if (!iter.hasNext()) {
          return false;
        }
        iter.next();
        if (iter.hasNext()) {
          return false;
        }
      }
      // Check incoming and outgoing edges match
      for (SemanticGraphEdge edge : tree.outgoingEdgeIterable(vertex)) {
        boolean foundReverse = false;
        for (SemanticGraphEdge reverse : tree.incomingEdgeIterable(edge.getDependent())) {
          if (reverse == edge) {
            foundReverse = true;
          }
        }
        if (!foundReverse) {
          return false;
        }
      }
      for (SemanticGraphEdge edge : tree.incomingEdgeIterable(vertex)) {
        boolean foundReverse = false;
        for (SemanticGraphEdge reverse : tree.outgoingEdgeIterable(edge.getGovernor())) {
          if (reverse == edge) {
            foundReverse = true;
          }
        }
        if (!foundReverse) {
          return false;
        }
      }
    }

    // Check for cycles
    if (isCyclic(tree)) {
      return false;
    }

    // Check topological sort -- sometimes fails?
    //    try {
    //      tree.topologicalSort();
    //    } catch (Exception e) {
    //      e.printStackTrace();
    //      return false;
    //    }
    return true;
  }
 /**
  * Stips aux and mark edges when we are splitting into a clause.
  *
  * @param toModify The tree we are stripping the edges from.
  */
 private void stripAuxMark(SemanticGraph toModify) {
   List<SemanticGraphEdge> toClean = new ArrayList<>();
   for (SemanticGraphEdge edge : toModify.outgoingEdgeIterable(toModify.getFirstRoot())) {
     String rel = edge.getRelation().toString();
     if (("aux".equals(rel) || "mark".equals(rel))
         && !toModify.outgoingEdgeIterator(edge.getDependent()).hasNext()) {
       toClean.add(edge);
     }
   }
   for (SemanticGraphEdge edge : toClean) {
     toModify.removeEdge(edge);
     toModify.removeVertex(edge.getDependent());
   }
 }
Esempio n. 4
0
  /**
   * Strip away case edges, if the incoming edge is a preposition. This replicates the behavior of
   * the old Stanford dependencies on universal dependencies.
   *
   * @param tree The tree to modify in place.
   */
  public static void stripPrepCases(SemanticGraph tree) {
    // Find incoming case edges that have an 'nmod' incoming edge
    List<SemanticGraphEdge> toClean = new ArrayList<>();
    for (SemanticGraphEdge edge : tree.edgeIterable()) {
      if ("case".equals(edge.getRelation().toString())) {
        boolean isPrepTarget = false;
        for (SemanticGraphEdge incoming : tree.incomingEdgeIterable(edge.getGovernor())) {
          if ("nmod".equals(incoming.getRelation().getShortName())) {
            isPrepTarget = true;
            break;
          }
        }
        if (isPrepTarget && !tree.outgoingEdgeIterator(edge.getDependent()).hasNext()) {
          toClean.add(edge);
        }
      }
    }

    // Delete these edges
    for (SemanticGraphEdge edge : toClean) {
      tree.removeEdge(edge);
      tree.removeVertex(edge.getDependent());
      assert isTree(tree);
    }
  }
  /**
   * The basic method for splitting off a clause of a tree. This modifies the tree in place. This
   * method addtionally follows ref edges.
   *
   * @param tree The tree to split a clause from.
   * @param toKeep The edge representing the clause to keep.
   */
  @SuppressWarnings("unchecked")
  private void simpleClause(SemanticGraph tree, SemanticGraphEdge toKeep) {
    splitToChildOfEdge(tree, toKeep);

    // Follow 'ref' edges
    Map<IndexedWord, IndexedWord> refReplaceMap = new HashMap<>();
    // (find replacements)
    for (IndexedWord vertex : tree.vertexSet()) {
      for (SemanticGraphEdge edge : extraEdgesByDependent.get(vertex)) {
        if ("ref".equals(edge.getRelation().toString())
            && // it's a ref edge...
            !tree.containsVertex(
                edge.getGovernor())) { // ...that doesn't already exist in the tree.
          refReplaceMap.put(vertex, edge.getGovernor());
        }
      }
    }
    // (do replacements)
    for (Map.Entry<IndexedWord, IndexedWord> entry : refReplaceMap.entrySet()) {
      Iterator<SemanticGraphEdge> iter = tree.incomingEdgeIterator(entry.getKey());
      if (!iter.hasNext()) {
        continue;
      }
      SemanticGraphEdge incomingEdge = iter.next();
      IndexedWord governor = incomingEdge.getGovernor();
      tree.removeVertex(entry.getKey());
      addSubtree(
          tree,
          governor,
          incomingEdge.getRelation().toString(),
          this.tree,
          entry.getValue(),
          this.tree.incomingEdgeList(tree.getFirstRoot()));
    }
  }
 /**
  * The basic method for splitting off a clause of a tree. This modifies the tree in place.
  *
  * @param tree The tree to split a clause from.
  * @param toKeep The edge representing the clause to keep.
  */
 static void splitToChildOfEdge(SemanticGraph tree, SemanticGraphEdge toKeep) {
   Queue<IndexedWord> fringe = new LinkedList<>();
   List<IndexedWord> nodesToRemove = new ArrayList<>();
   // Find nodes to remove
   // (from the root)
   for (IndexedWord root : tree.getRoots()) {
     nodesToRemove.add(root);
     for (SemanticGraphEdge out : tree.outgoingEdgeIterable(root)) {
       if (!out.equals(toKeep)) {
         fringe.add(out.getDependent());
       }
     }
   }
   // (recursively)
   while (!fringe.isEmpty()) {
     IndexedWord node = fringe.poll();
     nodesToRemove.add(node);
     for (SemanticGraphEdge out : tree.outgoingEdgeIterable(node)) {
       if (!out.equals(toKeep)) {
         fringe.add(out.getDependent());
       }
     }
   }
   // Remove nodes
   nodesToRemove.forEach(tree::removeVertex);
   // Set new root
   tree.setRoot(toKeep.getDependent());
 }
  @Override
  public void process(JCas jCas) throws AnalysisEngineProcessException {
    Annotation document = this.processor.process(jCas.getDocumentText());

    String lastNETag = "O";
    int lastNEBegin = -1;
    int lastNEEnd = -1;
    for (CoreMap tokenAnn : document.get(TokensAnnotation.class)) {

      // create the token annotation
      int begin = tokenAnn.get(CharacterOffsetBeginAnnotation.class);
      int end = tokenAnn.get(CharacterOffsetEndAnnotation.class);
      String pos = tokenAnn.get(PartOfSpeechAnnotation.class);
      String lemma = tokenAnn.get(LemmaAnnotation.class);
      Token token = new Token(jCas, begin, end);
      token.setPos(pos);
      token.setLemma(lemma);
      token.addToIndexes();

      // hackery to convert token-level named entity tag into phrase-level tag
      String neTag = tokenAnn.get(NamedEntityTagAnnotation.class);
      if (neTag.equals("O") && !lastNETag.equals("O")) {
        NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
        ne.setMentionType(lastNETag);
        ne.addToIndexes();
      } else {
        if (lastNETag.equals("O")) {
          lastNEBegin = begin;
        } else if (lastNETag.equals(neTag)) {
          // do nothing - begin was already set
        } else {
          NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
          ne.setMentionType(lastNETag);
          ne.addToIndexes();
          lastNEBegin = begin;
        }
        lastNEEnd = end;
      }
      lastNETag = neTag;
    }
    if (!lastNETag.equals("O")) {
      NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
      ne.setMentionType(lastNETag);
      ne.addToIndexes();
    }

    // add sentences and trees
    for (CoreMap sentenceAnn : document.get(SentencesAnnotation.class)) {

      // add the sentence annotation
      int sentBegin = sentenceAnn.get(CharacterOffsetBeginAnnotation.class);
      int sentEnd = sentenceAnn.get(CharacterOffsetEndAnnotation.class);
      Sentence sentence = new Sentence(jCas, sentBegin, sentEnd);
      sentence.addToIndexes();

      // add the syntactic tree annotation
      List<CoreLabel> tokenAnns = sentenceAnn.get(TokensAnnotation.class);
      Tree tree = sentenceAnn.get(TreeAnnotation.class);
      if (tree.children().length != 1) {
        throw new RuntimeException("Expected single root node, found " + tree);
      }
      tree = tree.firstChild();
      tree.indexSpans(0);
      TopTreebankNode root = new TopTreebankNode(jCas);
      root.setTreebankParse(tree.toString());
      // TODO: root.setTerminals(v)
      this.addTreebankNodeToIndexes(root, jCas, tree, tokenAnns);

      // get the dependencies
      SemanticGraph dependencies =
          sentenceAnn.get(CollapsedCCProcessedDependenciesAnnotation.class);

      // convert Stanford nodes to UIMA annotations
      List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence);
      Map<IndexedWord, DependencyNode> stanfordToUima = new HashMap<IndexedWord, DependencyNode>();
      for (IndexedWord stanfordNode : dependencies.vertexSet()) {
        int indexBegin = stanfordNode.get(BeginIndexAnnotation.class);
        int indexEnd = stanfordNode.get(EndIndexAnnotation.class);
        int tokenBegin = tokens.get(indexBegin).getBegin();
        int tokenEnd = tokens.get(indexEnd - 1).getEnd();
        DependencyNode node;
        if (dependencies.getRoots().contains(stanfordNode)) {
          node = new TopDependencyNode(jCas, tokenBegin, tokenEnd);
        } else {
          node = new DependencyNode(jCas, tokenBegin, tokenEnd);
        }
        stanfordToUima.put(stanfordNode, node);
      }

      // create relation annotations for each Stanford dependency
      ArrayListMultimap<DependencyNode, DependencyRelation> headRelations =
          ArrayListMultimap.create();
      ArrayListMultimap<DependencyNode, DependencyRelation> childRelations =
          ArrayListMultimap.create();
      for (SemanticGraphEdge stanfordEdge : dependencies.edgeIterable()) {
        DependencyRelation relation = new DependencyRelation(jCas);
        DependencyNode head = stanfordToUima.get(stanfordEdge.getGovernor());
        DependencyNode child = stanfordToUima.get(stanfordEdge.getDependent());
        String relationType = stanfordEdge.getRelation().toString();
        if (head == null || child == null || relationType == null) {
          throw new RuntimeException(
              String.format(
                  "null elements not allowed in relation:\nrelation=%s\nchild=%s\nhead=%s\n",
                  relation, child, head));
        }
        relation.setHead(head);
        relation.setChild(child);
        relation.setRelation(relationType);
        relation.addToIndexes();
        headRelations.put(child, relation);
        childRelations.put(head, relation);
      }

      // set the relations for each node annotation
      for (DependencyNode node : stanfordToUima.values()) {
        List<DependencyRelation> heads = headRelations.get(node);
        node.setHeadRelations(new FSArray(jCas, heads == null ? 0 : heads.size()));
        if (heads != null) {
          FSCollectionFactory.fillArrayFS(node.getHeadRelations(), heads);
        }
        List<DependencyRelation> children = childRelations.get(node);
        node.setChildRelations(new FSArray(jCas, children == null ? 0 : children.size()));
        if (children != null) {
          FSCollectionFactory.fillArrayFS(node.getChildRelations(), children);
        }
        node.addToIndexes();
      }
    }

    // map from spans to named entity mentions
    Map<Span, NamedEntityMention> spanMentionMap = new HashMap<Span, NamedEntityMention>();
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
      spanMentionMap.put(new Span(mention.getBegin(), mention.getEnd()), mention);
    }

    // add mentions for all entities identified by the coreference system
    List<NamedEntity> entities = new ArrayList<NamedEntity>();
    List<List<Token>> sentenceTokens = new ArrayList<List<Token>>();
    for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
      sentenceTokens.add(JCasUtil.selectCovered(jCas, Token.class, sentence));
    }
    Map<Integer, CorefChain> corefChains = document.get(CorefChainAnnotation.class);
    for (CorefChain chain : corefChains.values()) {
      List<NamedEntityMention> mentions = new ArrayList<NamedEntityMention>();
      for (CorefMention corefMention : chain.getMentionsInTextualOrder()) {

        // figure out the character span of the token
        List<Token> tokens = sentenceTokens.get(corefMention.sentNum - 1);
        int begin = tokens.get(corefMention.startIndex - 1).getBegin();
        int end = tokens.get(corefMention.endIndex - 2).getEnd();

        // use an existing named entity mention when possible; otherwise create a new one
        NamedEntityMention mention = spanMentionMap.get(new Span(begin, end));
        if (mention == null) {
          mention = new NamedEntityMention(jCas, begin, end);
          mention.addToIndexes();
        }
        mentions.add(mention);
      }

      // create an entity for the mentions
      Collections.sort(
          mentions,
          new Comparator<NamedEntityMention>() {
            @Override
            public int compare(NamedEntityMention m1, NamedEntityMention m2) {
              return m1.getBegin() - m2.getBegin();
            }
          });

      // create mentions and add them to entity
      NamedEntity entity = new NamedEntity(jCas);
      entity.setMentions(new FSArray(jCas, mentions.size()));
      int index = 0;
      for (NamedEntityMention mention : mentions) {
        mention.setMentionedEntity(entity);
        entity.setMentions(index, mention);
        index += 1;
      }
      entities.add(entity);
    }

    // add singleton entities for any named entities not picked up by coreference system
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
      if (mention.getMentionedEntity() == null) {
        NamedEntity entity = new NamedEntity(jCas);
        entity.setMentions(new FSArray(jCas, 1));
        entity.setMentions(0, mention);
        mention.setMentionedEntity(entity);
        entity.getMentions();
        entities.add(entity);
      }
    }

    // sort entities by document order
    Collections.sort(
        entities,
        new Comparator<NamedEntity>() {
          @Override
          public int compare(NamedEntity o1, NamedEntity o2) {
            return getFirstBegin(o1) - getFirstBegin(o2);
          }

          private int getFirstBegin(NamedEntity entity) {
            int min = Integer.MAX_VALUE;
            for (NamedEntityMention mention :
                JCasUtil.select(entity.getMentions(), NamedEntityMention.class)) {
              if (mention.getBegin() < min) {
                min = mention.getBegin();
              }
            }
            return min;
          }
        });

    // add entities to document
    for (NamedEntity entity : entities) {
      entity.addToIndexes();
    }
  }
  /**
   * The core implementation of the search.
   *
   * @param root The root word to search from. Traditionally, this is the root of the sentence.
   * @param candidateFragments The callback for the resulting sentence fragments. This is a
   *     predicate of a triple of values. The return value of the predicate determines whether we
   *     should continue searching. The triple is a triple of
   *     <ol>
   *       <li>The log probability of the sentence fragment, according to the featurizer and the
   *           weights
   *       <li>The features along the path to this fragment. The last element of this is the
   *           features from the most recent step.
   *       <li>The sentence fragment. Because it is relatively expensive to compute the resulting
   *           tree, this is returned as a lazy {@link Supplier}.
   *     </ol>
   *
   * @param classifier The classifier for whether an arc should be on the path to a clause split, a
   *     clause split itself, or neither.
   * @param featurizer The featurizer to use. Make sure this matches the weights!
   * @param actionSpace The action space we are allowed to take. Each action defines a means of
   *     splitting a clause on a dependency boundary.
   */
  protected void search(
      // The root to search from
      IndexedWord root,
      // The output specs
      final Predicate<Triple<Double, List<Counter<String>>, Supplier<SentenceFragment>>>
          candidateFragments,
      // The learning specs
      final Classifier<ClauseSplitter.ClauseClassifierLabel, String> classifier,
      Map<String, ? extends List<String>> hardCodedSplits,
      final Function<Triple<State, Action, State>, Counter<String>> featurizer,
      final Collection<Action> actionSpace,
      final int maxTicks) {
    // (the fringe)
    PriorityQueue<Pair<State, List<Counter<String>>>> fringe = new FixedPrioritiesPriorityQueue<>();
    // (avoid duplicate work)
    Set<IndexedWord> seenWords = new HashSet<>();

    State firstState =
        new State(null, null, -9000, null, x -> {}, true); // First state is implicitly "done"
    fringe.add(Pair.makePair(firstState, new ArrayList<>(0)), -0.0);
    int ticks = 0;

    while (!fringe.isEmpty()) {
      if (++ticks > maxTicks) {
        //        System.err.println("WARNING! Timed out on search with " + ticks + " ticks");
        return;
      }
      // Useful variables
      double logProbSoFar = fringe.getPriority();
      assert logProbSoFar <= 0.0;
      Pair<State, List<Counter<String>>> lastStatePair = fringe.removeFirst();
      State lastState = lastStatePair.first;
      List<Counter<String>> featuresSoFar = lastStatePair.second;
      IndexedWord rootWord = lastState.edge == null ? root : lastState.edge.getDependent();

      // Register thunk
      if (lastState.isDone) {
        if (!candidateFragments.test(
            Triple.makeTriple(
                logProbSoFar,
                featuresSoFar,
                () -> {
                  SemanticGraph copy = new SemanticGraph(tree);
                  lastState
                      .thunk
                      .andThen(
                          x -> {
                            // Add the extra edges back in, if they don't break the tree-ness of the
                            // extraction
                            for (IndexedWord newTreeRoot : x.getRoots()) {
                              if (newTreeRoot != null) { // what a strange thing to have happen...
                                for (SemanticGraphEdge extraEdge :
                                    extraEdgesByGovernor.get(newTreeRoot)) {
                                  assert Util.isTree(x);
                                  //noinspection unchecked
                                  addSubtree(
                                      x,
                                      newTreeRoot,
                                      extraEdge.getRelation().toString(),
                                      tree,
                                      extraEdge.getDependent(),
                                      tree.getIncomingEdgesSorted(newTreeRoot));
                                  assert Util.isTree(x);
                                }
                              }
                            }
                          })
                      .accept(copy);
                  return new SentenceFragment(copy, assumedTruth, false);
                }))) {
          break;
        }
      }

      // Find relevant auxilliary terms
      SemanticGraphEdge subjOrNull = null;
      SemanticGraphEdge objOrNull = null;
      for (SemanticGraphEdge auxEdge : tree.outgoingEdgeIterable(rootWord)) {
        String relString = auxEdge.getRelation().toString();
        if (relString.contains("obj")) {
          objOrNull = auxEdge;
        } else if (relString.contains("subj")) {
          subjOrNull = auxEdge;
        }
      }

      // Iterate over children
      // For each outgoing edge...
      for (SemanticGraphEdge outgoingEdge : tree.outgoingEdgeIterable(rootWord)) {
        // Prohibit indirect speech verbs from splitting off clauses
        // (e.g., 'said', 'think')
        // This fires if the governor is an indirect speech verb, and the outgoing edge is a ccomp
        if (outgoingEdge.getRelation().toString().equals("ccomp")
            && ((outgoingEdge.getGovernor().lemma() != null
                    && INDIRECT_SPEECH_LEMMAS.contains(outgoingEdge.getGovernor().lemma()))
                || INDIRECT_SPEECH_LEMMAS.contains(outgoingEdge.getGovernor().word()))) {
          continue;
        }
        // Get some variables
        String outgoingEdgeRelation = outgoingEdge.getRelation().toString();
        List<String> forcedArcOrder = hardCodedSplits.get(outgoingEdgeRelation);
        if (forcedArcOrder == null && outgoingEdgeRelation.contains(":")) {
          forcedArcOrder =
              hardCodedSplits.get(
                  outgoingEdgeRelation.substring(0, outgoingEdgeRelation.indexOf(":")) + ":*");
        }
        boolean doneForcedArc = false;
        // For each action...
        for (Action action :
            (forcedArcOrder == null ? actionSpace : orderActions(actionSpace, forcedArcOrder))) {
          // Check the prerequisite
          if (!action.prerequisitesMet(tree, outgoingEdge)) {
            continue;
          }
          if (forcedArcOrder != null && doneForcedArc) {
            break;
          }
          // 1. Compute the child state
          Optional<State> candidate =
              action.applyTo(tree, lastState, outgoingEdge, subjOrNull, objOrNull);
          if (candidate.isPresent()) {
            double logProbability;
            ClauseClassifierLabel bestLabel;
            Counter<String> features =
                featurizer.apply(Triple.makeTriple(lastState, action, candidate.get()));
            if (forcedArcOrder != null && !doneForcedArc) {
              logProbability = 0.0;
              bestLabel = ClauseClassifierLabel.CLAUSE_SPLIT;
              doneForcedArc = true;
            } else if (features.containsKey("__undocumented_junit_no_classifier")) {
              logProbability = Double.NEGATIVE_INFINITY;
              bestLabel = ClauseClassifierLabel.CLAUSE_INTERM;
            } else {
              Counter<ClauseClassifierLabel> scores = classifier.scoresOf(new RVFDatum<>(features));
              if (scores.size() > 0) {
                Counters.logNormalizeInPlace(scores);
              }
              String rel = outgoingEdge.getRelation().toString();
              if ("nsubj".equals(rel) || "dobj".equals(rel)) {
                scores.remove(
                    ClauseClassifierLabel.NOT_A_CLAUSE); // Always at least yield on nsubj and dobj
              }
              logProbability = Counters.max(scores, Double.NEGATIVE_INFINITY);
              bestLabel = Counters.argmax(scores, (x, y) -> 0, ClauseClassifierLabel.CLAUSE_SPLIT);
            }

            if (bestLabel != ClauseClassifierLabel.NOT_A_CLAUSE) {
              Pair<State, List<Counter<String>>> childState =
                  Pair.makePair(
                      candidate.get().withIsDone(bestLabel),
                      new ArrayList<Counter<String>>(featuresSoFar) {
                        {
                          add(features);
                        }
                      });
              // 2. Register the child state
              if (!seenWords.contains(childState.first.edge.getDependent())) {
                //            System.err.println("  pushing " + action.signature() + " with " +
                // argmax.first.edge);
                fringe.add(childState, logProbability);
              }
            }
          }
        }
      }

      seenWords.add(rootWord);
    }
    //    System.err.println("Search finished in " + ticks + " ticks and " + classifierEvals + "
    // classifier evaluations.");
  }
  /**
   * A helper to add an entire subtree to a given dependency tree.
   *
   * @param toModify The tree to add the subtree to.
   * @param root The root of the tree where we should be adding the subtree.
   * @param rel The relation to add the subtree with.
   * @param originalTree The orignal tree (i.e., {@link ClauseSplitterSearchProblem#tree}).
   * @param subject The root of the clause to add.
   * @param ignoredEdges The edges to ignore adding when adding this subtree.
   */
  private static void addSubtree(
      SemanticGraph toModify,
      IndexedWord root,
      String rel,
      SemanticGraph originalTree,
      IndexedWord subject,
      Collection<SemanticGraphEdge> ignoredEdges) {
    if (toModify.containsVertex(subject)) {
      return; // This subtree already exists.
    }
    Queue<IndexedWord> fringe = new LinkedList<>();
    Collection<IndexedWord> wordsToAdd = new ArrayList<>();
    Collection<SemanticGraphEdge> edgesToAdd = new ArrayList<>();
    // Search for subtree to add
    for (SemanticGraphEdge edge : originalTree.outgoingEdgeIterable(subject)) {
      if (!ignoredEdges.contains(edge)) {
        if (toModify.containsVertex(edge.getDependent())) {
          // Case: we're adding a subtree that's not disjoint from toModify. This is bad news.
          return;
        }
        edgesToAdd.add(edge);
        fringe.add(edge.getDependent());
      }
    }
    while (!fringe.isEmpty()) {
      IndexedWord node = fringe.poll();
      wordsToAdd.add(node);
      for (SemanticGraphEdge edge : originalTree.outgoingEdgeIterable(node)) {
        if (!ignoredEdges.contains(edge)) {
          if (toModify.containsVertex(edge.getDependent())) {
            // Case: we're adding a subtree that's not disjoint from toModify. This is bad news.
            return;
          }
          edgesToAdd.add(edge);
          fringe.add(edge.getDependent());
        }
      }
    }
    // Add subtree
    // (add subject)
    toModify.addVertex(subject);
    toModify.addEdge(
        root,
        subject,
        GrammaticalRelation.valueOf(Language.English, rel),
        Double.NEGATIVE_INFINITY,
        false);

    // (add nodes)
    wordsToAdd.forEach(toModify::addVertex);
    // (add edges)
    for (SemanticGraphEdge edge : edgesToAdd) {
      assert !toModify.incomingEdgeIterator(edge.getDependent()).hasNext();
      toModify.addEdge(
          edge.getGovernor(),
          edge.getDependent(),
          edge.getRelation(),
          edge.getWeight(),
          edge.isExtra());
    }
  }
        @Override
        public Counter<String> apply(Triple<State, Action, State> triple) {
          // Variables
          State from = triple.first;
          Action action = triple.second;
          State to = triple.third;
          String signature = action.signature();
          String edgeRelTaken = to.edge == null ? "root" : to.edge.getRelation().toString();
          String edgeRelShort = to.edge == null ? "root" : to.edge.getRelation().getShortName();
          if (edgeRelShort.contains("_")) {
            edgeRelShort = edgeRelShort.substring(0, edgeRelShort.indexOf("_"));
          }

          // -- Featurize --
          // Variables to aggregate
          boolean parentHasSubj = false;
          boolean parentHasObj = false;
          boolean childHasSubj = false;
          boolean childHasObj = false;
          Counter<String> feats = new ClassicCounter<>();

          // 1. edge taken
          feats.incrementCount(signature + "&edge:" + edgeRelTaken);
          feats.incrementCount(signature + "&edge_type:" + edgeRelShort);

          // 2. last edge taken
          if (from.edge == null) {
            assert to.edge == null || to.originalTree().getRoots().contains(to.edge.getGovernor());
            feats.incrementCount(signature + "&at_root");
            feats.incrementCount(
                signature + "&at_root&root_pos:" + to.originalTree().getFirstRoot().tag());
          } else {
            feats.incrementCount(signature + "&not_root");
            String lastRelShort = from.edge.getRelation().getShortName();
            if (lastRelShort.contains("_")) {
              lastRelShort = lastRelShort.substring(0, lastRelShort.indexOf("_"));
            }
            feats.incrementCount(signature + "&last_edge:" + lastRelShort);
          }

          if (to.edge != null) {
            // 3. other edges at parent
            for (SemanticGraphEdge parentNeighbor :
                from.originalTree().outgoingEdgeIterable(to.edge.getGovernor())) {
              if (parentNeighbor != to.edge) {
                String parentNeighborRel = parentNeighbor.getRelation().toString();
                if (parentNeighborRel.contains("subj")) {
                  parentHasSubj = true;
                }
                if (parentNeighborRel.contains("obj")) {
                  parentHasObj = true;
                }
                // (add feature)
                feats.incrementCount(signature + "&parent_neighbor:" + parentNeighborRel);
                feats.incrementCount(
                    signature
                        + "&edge_type:"
                        + edgeRelShort
                        + "&parent_neighbor:"
                        + parentNeighborRel);
              }
            }

            // 4. Other edges at child
            int childNeighborCount = 0;
            for (SemanticGraphEdge childNeighbor :
                from.originalTree().outgoingEdgeIterable(to.edge.getDependent())) {
              String childNeighborRel = childNeighbor.getRelation().toString();
              if (childNeighborRel.contains("subj")) {
                childHasSubj = true;
              }
              if (childNeighborRel.contains("obj")) {
                childHasObj = true;
              }
              childNeighborCount += 1;
              // (add feature)
              feats.incrementCount(signature + "&child_neighbor:" + childNeighborRel);
              feats.incrementCount(
                  signature + "&edge_type:" + edgeRelShort + "&child_neighbor:" + childNeighborRel);
            }
            // 4.1 Number of other edges at child
            feats.incrementCount(
                signature
                    + "&child_neighbor_count:"
                    + (childNeighborCount < 3 ? childNeighborCount : ">2"));
            feats.incrementCount(
                signature
                    + "&edge_type:"
                    + edgeRelShort
                    + "&child_neighbor_count:"
                    + (childNeighborCount < 3 ? childNeighborCount : ">2"));

            // 5. Subject/Object stats
            feats.incrementCount(signature + "&parent_neighbor_subj:" + parentHasSubj);
            feats.incrementCount(signature + "&parent_neighbor_obj:" + parentHasObj);
            feats.incrementCount(signature + "&child_neighbor_subj:" + childHasSubj);
            feats.incrementCount(signature + "&child_neighbor_obj:" + childHasObj);

            // 6. POS tag info
            feats.incrementCount(signature + "&parent_pos:" + to.edge.getGovernor().tag());
            feats.incrementCount(signature + "&child_pos:" + to.edge.getDependent().tag());
            feats.incrementCount(
                signature
                    + "&pos_signature:"
                    + to.edge.getGovernor().tag()
                    + "_"
                    + to.edge.getDependent().tag());
            feats.incrementCount(
                signature
                    + "&edge_type:"
                    + edgeRelShort
                    + "&pos_signature:"
                    + to.edge.getGovernor().tag()
                    + "_"
                    + to.edge.getDependent().tag());
          }
          return feats;
        }
Esempio n. 11
0
  public static DependencyParse parse(String text) {

    if (pipeline == null) {
      loadModels();
    }

    DependencyParse parse = new DependencyParse();

    Annotation document = new Annotation(text);

    pipeline.annotate(document);

    List<CoreMap> sentences = document.get(SentencesAnnotation.class);

    for (CoreMap sentence : sentences) {

      SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);

      IndexedWord root = dependencies.getFirstRoot();

      parse.setHeadNode(root.index());

      List<SemanticGraphEdge> edges = dependencies.edgeListSorted();

      // System.out.println(edges);
      for (SemanticGraphEdge t : edges) {

        String dep = t.getDependent().originalText();
        int depIndex = t.getDependent().index();
        String depPOS = t.getDependent().tag();
        int depStart = t.getDependent().beginPosition();
        int depEnd = t.getDependent().endPosition();

        String gov = t.getGovernor().originalText();
        int govIndex = t.getGovernor().index();
        String govPOS = t.getGovernor().tag();
        int govStart = t.getGovernor().beginPosition();
        int govEnd = t.getGovernor().endPosition();

        parse.addNode(govIndex, gov, govPOS, govStart, govEnd);
        parse.addNode(depIndex, dep, depPOS, depStart, depEnd);

        parse.addEdge(depIndex, govIndex, t.getRelation().getShortName());
      }
    }

    return parse;
  }
Esempio n. 12
0
  /**
   * Fix some bizarre peculiarities with certain trees. So far, these include:
   *
   * <ul>
   *   <li>Sometimes there's a node from a word to itself. This seems wrong.
   * </ul>
   *
   * @param tree The tree to clean (in place!).
   * @return A list of extra edges, which are valid but were removed.
   */
  public static List<SemanticGraphEdge> cleanTree(SemanticGraph tree) {
    //    assert !isCyclic(tree);

    // Clean nodes
    List<IndexedWord> toDelete = new ArrayList<>();
    for (IndexedWord vertex : tree.vertexSet()) {
      // Clean punctuation
      if (vertex.tag() == null) {
        continue;
      }
      char tag = vertex.backingLabel().tag().charAt(0);
      if (tag == '.' || tag == ',' || tag == '(' || tag == ')' || tag == ':') {
        if (!tree.outgoingEdgeIterator(vertex)
            .hasNext()) { // This should really never happen, but it does.
          toDelete.add(vertex);
        }
      }
    }
    toDelete.forEach(tree::removeVertex);

    // Clean edges
    Iterator<SemanticGraphEdge> iter = tree.edgeIterable().iterator();
    while (iter.hasNext()) {
      SemanticGraphEdge edge = iter.next();
      if (edge.getDependent().index() == edge.getGovernor().index()) {
        // Clean self-edges
        iter.remove();
      } else if (edge.getRelation().toString().equals("punct")) {
        // Clean punctuation (again)
        if (!tree.outgoingEdgeIterator(edge.getDependent())
            .hasNext()) { // This should really never happen, but it does.
          iter.remove();
        }
      }
    }

    // Remove extra edges
    List<SemanticGraphEdge> extraEdges = new ArrayList<>();
    for (SemanticGraphEdge edge : tree.edgeIterable()) {
      if (edge.isExtra()) {
        if (tree.incomingEdgeList(edge.getDependent()).size() > 1) {
          extraEdges.add(edge);
        }
      }
    }
    extraEdges.forEach(tree::removeEdge);

    // Add apposition edges (simple coref)
    for (SemanticGraphEdge extraEdge :
        new ArrayList<>(extraEdges)) { // note[gabor] prevent concurrent modification exception
      for (SemanticGraphEdge candidateAppos : tree.incomingEdgeIterable(extraEdge.getDependent())) {
        if (candidateAppos.getRelation().toString().equals("appos")) {
          extraEdges.add(
              new SemanticGraphEdge(
                  extraEdge.getGovernor(),
                  candidateAppos.getGovernor(),
                  extraEdge.getRelation(),
                  extraEdge.getWeight(),
                  extraEdge.isExtra()));
        }
      }
      for (SemanticGraphEdge candidateAppos : tree.outgoingEdgeIterable(extraEdge.getDependent())) {
        if (candidateAppos.getRelation().toString().equals("appos")) {
          extraEdges.add(
              new SemanticGraphEdge(
                  extraEdge.getGovernor(),
                  candidateAppos.getDependent(),
                  extraEdge.getRelation(),
                  extraEdge.getWeight(),
                  extraEdge.isExtra()));
        }
      }
    }

    // Brute force ensure tree
    // Remove incoming edges from roots
    List<SemanticGraphEdge> rootIncomingEdges = new ArrayList<>();
    for (IndexedWord root : tree.getRoots()) {
      for (SemanticGraphEdge incomingEdge : tree.incomingEdgeIterable(root)) {
        rootIncomingEdges.add(incomingEdge);
      }
    }
    rootIncomingEdges.forEach(tree::removeEdge);
    // Loop until it becomes a tree.
    boolean changed = true;
    while (changed) { // I just want trees to be trees; is that so much to ask!?
      changed = false;
      List<IndexedWord> danglingNodes = new ArrayList<>();
      List<SemanticGraphEdge> invalidEdges = new ArrayList<>();

      for (IndexedWord vertex : tree.vertexSet()) {
        // Collect statistics
        Iterator<SemanticGraphEdge> incomingIter = tree.incomingEdgeIterator(vertex);
        boolean hasIncoming = incomingIter.hasNext();
        boolean hasMultipleIncoming = false;
        if (hasIncoming) {
          incomingIter.next();
          hasMultipleIncoming = incomingIter.hasNext();
        }

        // Register actions
        if (!hasIncoming && !tree.getRoots().contains(vertex)) {
          danglingNodes.add(vertex);
        } else {
          if (hasMultipleIncoming) {
            for (SemanticGraphEdge edge : new IterableIterator<>(incomingIter)) {
              invalidEdges.add(edge);
            }
          }
        }
      }

      // Perform actions
      for (IndexedWord vertex : danglingNodes) {
        tree.removeVertex(vertex);
        changed = true;
      }
      for (SemanticGraphEdge edge : invalidEdges) {
        tree.removeEdge(edge);
        changed = true;
      }
    }

    // Return
    assert isTree(tree);
    return extraEdges;
  }