Ejemplo n.º 1
0
  /**
   * Builds an annotator whose configuration is read from {@code properties}.
   *
   * <p>Each property key is looked up as {@code name + "." + key} when {@code name} is non-empty,
   * and as the bare key otherwise. The keys read here are {@code backgroundSymbol}, {@code
   * mapping}, {@code validpospattern}, {@code posmatchtype}, {@code noDefaultOverwriteLabels},
   * {@code ignorecase} and {@code verbose}.
   *
   * @param name The annotator name used as the property prefix; may be null or empty, in which
   *     case no prefix is used.
   * @param properties The properties to read the configuration from.
   */
  public TokensRegexNERAnnotator(String name, Properties properties) {
    final String prefix = (name == null || name.isEmpty()) ? "" : name + '.';

    // Comma-separated list of labels that count as "background".
    final String[] backgroundSymbols =
        properties
            .getProperty(prefix + "backgroundSymbol", DEFAULT_BACKGROUND_SYMBOL)
            .split("\\s*,\\s*");
    // Mapping files may be separated by either commas or semicolons.
    final String[] mappings =
        properties
            .getProperty(prefix + "mapping", DefaultPaths.DEFAULT_REGEXNER_RULES)
            .split("\\s*[,;]\\s*");
    final String validPosRegex = properties.getProperty(prefix + "validpospattern");

    final String posMatchTypeName =
        properties.getProperty(prefix + "posmatchtype", DEFAULT_POS_MATCH_TYPE.name());
    this.posMatchType = PosMatchType.valueOf(posMatchTypeName);

    final String noDefaultOverwriteLabelsProp =
        properties.getProperty(prefix + "noDefaultOverwriteLabels");
    if (noDefaultOverwriteLabelsProp == null) {
      this.noDefaultOverwriteLabels = Collections.unmodifiableSet(new HashSet<>());
    } else {
      this.noDefaultOverwriteLabels =
          Collections.unmodifiableSet(
              CollectionUtils.asSet(noDefaultOverwriteLabelsProp.split("\\s*,\\s*")));
    }
    this.ignoreCase = PropertiesUtils.getBool(properties, prefix + "ignorecase", false);
    this.verbose = PropertiesUtils.getBool(properties, prefix + "verbose", false);

    // A missing or empty pattern leaves validPosPattern null.
    final boolean hasPosRegex = validPosRegex != null && !validPosRegex.isEmpty();
    this.validPosPattern = hasPosRegex ? Pattern.compile(validPosRegex) : null;

    this.entries =
        Collections.unmodifiableList(
            readEntries(name, noDefaultOverwriteLabels, ignoreCase, verbose, mappings));

    // createPatternMatcher receives this map before it is wrapped as unmodifiable.
    final IdentityHashMap<SequencePattern<CoreMap>, Entry> patternToEntry =
        new IdentityHashMap<>();
    this.multiPatternMatcher = createPatternMatcher(patternToEntry);
    this.patternToEntry = Collections.unmodifiableMap(patternToEntry);

    // Labels this annotator may always overwrite: background symbols, "no label" (null),
    // and every type produced by its own entries.
    final Set<String> overwritable = Generics.newHashSet();
    Collections.addAll(overwritable, backgroundSymbols);
    overwritable.add(null);
    for (Entry entry : this.entries) {
      overwritable.add(entry.type);
    }
    this.myLabels = Collections.unmodifiableSet(overwritable);
  }
/**
 * A search problem for finding clauses in a sentence.
 *
 * <p>For usage at test time, load a model from {@link ClauseSplitter#load(String)}, and then take
 * the top clauses of a given tree with {@link ClauseSplitterSearchProblem#topClauses(double)},
 * yielding a list of {@link edu.stanford.nlp.naturalli.SentenceFragment}s.
 *
 * <pre>{@code
 * ClauseSearcher searcher = ClauseSearcher.factory("/model/path/");
 * List<SentenceFragment> sentences = searcher.topClauses(threshold);
 *
 * }</pre>
 *
 * <p>For training, see {@link ClauseSplitter#train(Stream, File, File)}.
 *
 * @author Gabor Angeli
 */
public class ClauseSplitterSearchProblem {

  /**
   * A specification for clause splits we _always_ want to do. The format is a map from the edge
   * label we are splitting, to the preference for the type of split we should do. The most
   * preferred is at the front of the list, and then it backs off to the less and less preferred
   * split types.
   *
   * <p>Both the map and every preference list in it are unmodifiable.
   */
  protected static final Map<String, List<String>> HARD_SPLITS = buildHardSplits();

  /**
   * Builds the contents of {@link #HARD_SPLITS}. A plain map is populated here rather than using
   * double-brace initialization, which would create one anonymous subclass per collection.
   *
   * @return An unmodifiable map from edge label to its ordered, unmodifiable split preferences.
   */
  private static Map<String, List<String>> buildHardSplits() {
    Map<String, List<String>> splits = new HashMap<>();
    splits.put("comp", splitPreference("simple"));
    splits.put("ccomp", splitPreference("simple"));
    splits.put("xcomp", splitPreference("clone_dobj", "clone_nsubj", "simple"));
    splits.put("vmod", splitPreference("clone_nsubj", "simple"));
    splits.put("csubj", splitPreference("clone_dobj", "simple"));
    splits.put("advcl", splitPreference("clone_nsubj", "simple"));
    splits.put("conj:*", splitPreference("clone_nsubj", "clone_dobj", "simple"));
    // no doubt (-> that cats have tails <-)
    splits.put("acl:relcl", splitPreference("simple"));
    return Collections.unmodifiableMap(splits);
  }

  /**
   * Wraps an ordered sequence of split type names as an unmodifiable list.
   *
   * @param splitTypes The split type names, most preferred first.
   * @return An unmodifiable list holding the names in the given order.
   */
  private static List<String> splitPreference(String... splitTypes) {
    List<String> preference = new ArrayList<>(splitTypes.length);
    Collections.addAll(preference, splitTypes);
    return Collections.unmodifiableList(preference);
  }

  /**
   * A set of words which indicate that the complement clause is not factual, or at least not
   * necessarily factual.
   *
   * <p>The set is unmodifiable. It contains the lemma "tell" as well as the inflected form "told":
   * "told" is not itself a lemma, so a lookup by lemma would otherwise never match that verb.
   */
  protected static final Set<String> INDIRECT_SPEECH_LEMMAS = buildIndirectSpeechLemmas();

  /**
   * Builds the contents of {@link #INDIRECT_SPEECH_LEMMAS}. A plain set is populated here rather
   * than using double-brace initialization, which would create an anonymous subclass.
   *
   * @return An unmodifiable set of the indirect speech verbs.
   */
  private static Set<String> buildIndirectSpeechLemmas() {
    Set<String> lemmas = new HashSet<>();
    Collections.addAll(
        lemmas,
        "report",
        "say",
        "tell",
        "told", // kept for backwards compatibility; the actual lemma is "tell"
        "claim",
        "assert",
        "think",
        "believe",
        "suppose");
    return Collections.unmodifiableSet(lemmas);
  }

  /**
   * The tree to search over. The constructor stores a copy of the graph it is given, and cleans
   * that copy with {@code Util.cleanTree}.
   */
  public final SemanticGraph tree;
  /** The assumed truth of the original clause. */
  public final boolean assumedTruth;
  /**
   * The length of the sentence, as determined from the tree: the index of the last vertex in the
   * tree's sorted vertex list.
   */
  public final int sentenceLength;
  /**
   * A mapping from a word to the extra edges that come out of it. "Extra" edges are the ones
   * returned by {@code Util.cleanTree} in the constructor.
   */
  private final Map<IndexedWord, Collection<SemanticGraphEdge>> extraEdgesByGovernor =
      new HashMap<>();
  /**
   * A mapping from a word to the extra edges that go into it. "Extra" edges are the ones returned
   * by {@code Util.cleanTree} in the constructor.
   */
  private final Map<IndexedWord, Collection<SemanticGraphEdge>> extraEdgesByDependent =
      new HashMap<>();
  /**
   * The classifier for whether a particular dependency edge defines a clause boundary. Empty when
   * no classifier was supplied at construction time.
   */
  private final Optional<Classifier<ClauseSplitter.ClauseClassifierLabel, String>>
      isClauseClassifier;
  /**
   * An optional featurizer to use with the clause classifier ({@link
   * ClauseSplitterSearchProblem#isClauseClassifier}). If that classifier is defined, this should be
   * as well.
   */
  private final Optional<
          Function<
              Triple<
                  ClauseSplitterSearchProblem.State,
                  ClauseSplitterSearchProblem.Action,
                  ClauseSplitterSearchProblem.State>,
              Counter<String>>>
      featurizer;

  /**
   * A mapping from edges in the tree, to an index. The index is backed by an {@link
   * IdentityHashMap}, so edges are presumably looked up by object identity rather than by {@code
   * equals} -- TODO confirm against {@code HashIndex}.
   */
  @SuppressWarnings("Convert2Diamond") // It's lying -- type inference times out with a diamond
  private final Index<SemanticGraphEdge> edgeToIndex =
      new HashIndex<SemanticGraphEdge>(ArrayList::new, IdentityHashMap::new);

  /**
   * A search state: the edge the search is currently positioned on, together with the subject and
   * object edges carried along the path and the deferred tree edits that produce this state's
   * clause.
   */
  public class State {
    /** The edge this state is positioned on. */
    public final SemanticGraphEdge edge;
    /** The position of {@link #edge} in the enclosing problem's edge index. */
    public final int edgeIndex;
    /** The subject edge carried with this state, or null if there is none. */
    public final SemanticGraphEdge subjectOrNull;
    /** The distance-from-subject counter carried with this state. */
    public final int distanceFromSubj;
    /** The object edge carried with this state, or null if there is none. */
    public final SemanticGraphEdge objectOrNull;
    /** The accumulated edits to apply to a tree in order to realize this state. */
    public final Consumer<SemanticGraph> thunk;
    /** Whether this state is marked as done; mutable via {@link #withIsDone}. */
    public boolean isDone;

    /**
     * Creates a state from its individual components. The edge index is looked up from the
     * enclosing problem's edge index.
     */
    public State(
        SemanticGraphEdge edge,
        SemanticGraphEdge subjectOrNull,
        int distanceFromSubj,
        SemanticGraphEdge objectOrNull,
        Consumer<SemanticGraph> thunk,
        boolean isDone) {
      this.edge = edge;
      this.subjectOrNull = subjectOrNull;
      this.objectOrNull = objectOrNull;
      this.distanceFromSubj = distanceFromSubj;
      this.thunk = thunk;
      this.isDone = isDone;
      this.edgeIndex = edgeToIndex.indexOf(edge);
    }

    /**
     * Creates a copy of {@code source} with a different done flag.
     *
     * @param source The state to copy every other component from.
     * @param isDone The done flag of the new state.
     */
    public State(State source, boolean isDone) {
      this(
          source.edge,
          source.subjectOrNull,
          source.distanceFromSubj,
          source.objectOrNull,
          source.thunk,
          isDone);
    }

    /** Returns the tree of the enclosing search problem. */
    public SemanticGraph originalTree() {
      return ClauseSplitterSearchProblem.this.tree;
    }

    /**
     * Sets the done flag of this state in place from a classifier decision, and returns this
     * state.
     *
     * @param argmax The classifier label: {@code CLAUSE_SPLIT} marks the state as done, {@code
     *     CLAUSE_INTERM} marks it as not done.
     * @return This same state instance.
     * @throws IllegalStateException If the label is neither of the two above.
     */
    public State withIsDone(ClauseClassifierLabel argmax) {
      final boolean split = argmax == ClauseClassifierLabel.CLAUSE_SPLIT;
      if (!split && argmax != ClauseClassifierLabel.CLAUSE_INTERM) {
        throw new IllegalStateException("Invalid classifier label for isDone: " + argmax);
      }
      isDone = split;
      return this;
    }
  }

  /**
   * An action being taken; that is, the type of clause splitting going on. Implementations are
   * identified by their {@link #signature()}, may veto themselves via {@link
   * #prerequisitesMet(SemanticGraph, SemanticGraphEdge)}, and produce a successor {@link State} via
   * {@link #applyTo}.
   */
  public interface Action {
    /** The name of this action. */
    String signature();

    /**
     * A check to make sure this is actually a valid action to take, in the context of the given
     * tree. The default implementation accepts every edge.
     *
     * @param originalTree The _original_ tree we are searching over. This is before any clauses are
     *     split off.
     * @param edge The edge that we are traversing with this clause.
     * @return True if this is a valid action.
     */
    @SuppressWarnings("UnusedParameters")
    default boolean prerequisitesMet(SemanticGraph originalTree, SemanticGraphEdge edge) {
      return true;
    }

    /**
     * Apply this action to the given state.
     *
     * @param tree The original tree we are applying the action to.
     * @param source The source state we are mutating from.
     * @param outgoingEdge The edge we are splitting off as a clause.
     * @param subjectOrNull The subject of the parent tree, if there is one.
     * @param ppOrNull The preposition attachment of the parent tree, if there is one.
     *     NOTE(review): the implementations visible in this file name this parameter {@code
     *     objectOrNull} and treat it as the object edge -- confirm which meaning is intended.
     * @return A new state, or {@link Optional#empty()} if this action was not successful.
     */
    Optional<State> applyTo(
        SemanticGraph tree,
        State source,
        SemanticGraphEdge outgoingEdge,
        SemanticGraphEdge subjectOrNull,
        SemanticGraphEdge ppOrNull);
  }

  /**
   * The options used for training the clause searcher. Each field is annotated with {@code
   * ArgumentParser.Option}, giving its option name and a short description; the field initializer
   * is the default value.
   */
  public static class TrainingOptions {
    /** Subsampling ratio for negative datums; defaults to 1.00. */
    @ArgumentParser.Option(
        name = "negativeSubsampleRatio",
        gloss = "The percent of negative datums to take")
    public double negativeSubsampleRatio = 1.00;

    /** Weight given to every positive datum; defaults to 100. */
    @ArgumentParser.Option(
        name = "positiveDatumWeight",
        gloss = "The weight to assign every positive datum.")
    public float positiveDatumWeight = 100.0f;

    /** Weight given to every unknown datum; defaults to 1. */
    @ArgumentParser.Option(
        name = "unknownDatumWeight",
        gloss =
            "The weight to assign every unknown datum (everything extracted with an unconfirmed relation).")
    public float unknownDatumWeight = 1.0f;

    /** Weight given to clause-split datums; defaults to 1. */
    @ArgumentParser.Option(
        name = "clauseSplitWeight",
        gloss =
            "The weight to assign for clause splitting datums. Higher values push towards higher recall.")
    public float clauseSplitWeight = 1.0f;

    /** Weight given to intermediate-split datums; defaults to 2. */
    @ArgumentParser.Option(
        name = "clauseIntermWeight",
        gloss =
            "The weight to assign for intermediate splits. Higher values push towards higher recall.")
    public float clauseIntermWeight = 2.0f;

    /** The random seed; defaults to 42. */
    @ArgumentParser.Option(name = "seed", gloss = "The random seed to use")
    public int seed = 42;

    /**
     * The classifier factory class used for training; defaults to {@code LinearClassifierFactory}.
     * The class literal is cast through {@code Object} because it cannot be cast directly to the
     * parameterized {@code Class} type, hence the unchecked-warning suppression.
     */
    @SuppressWarnings("unchecked")
    @ArgumentParser.Option(
        name = "classifierFactory",
        gloss = "The class of the classifier factory to use for training the various classifiers")
    public Class<
            ? extends
                ClassifierFactory<
                    ClauseSplitter.ClauseClassifierLabel,
                    String,
                    Classifier<ClauseSplitter.ClauseClassifierLabel, String>>>
        classifierFactory =
            (Class<
                    ? extends
                        ClassifierFactory<
                            ClauseSplitter.ClauseClassifierLabel,
                            String,
                            Classifier<ClauseSplitter.ClauseClassifierLabel, String>>>)
                ((Object) LinearClassifierFactory.class);
  }

  /**
   * Mostly just an alias, but make sure our featurizer is serializable! A featurizer maps a
   * (source state, action, resulting state) triple to a counter of named feature values.
   */
  public interface Featurizer
      extends Function<
              Triple<
                  ClauseSplitterSearchProblem.State,
                  ClauseSplitterSearchProblem.Action,
                  ClauseSplitterSearchProblem.State>,
              Counter<String>>,
          Serializable {
    /**
     * Inspects a feature vector produced by this featurizer.
     *
     * @param feats The features to inspect.
     * @return Presumably true if the features describe a "simple" split action -- TODO confirm
     *     against the implementations.
     */
    boolean isSimpleSplit(Counter<String> feats);
  }

  /**
   * Create a searcher manually, supplying a dependency tree, an optional classifier for when to
   * split clauses, and a featurizer for that classifier. You almost certainly want to use {@link
   * ClauseSplitter#load(String)} instead of this constructor.
   *
   * <p>The given tree is copied; the copy is indexed by edge, cleaned into a proper tree, and the
   * edges removed by that cleaning are remembered per governor and per dependent.
   *
   * @param tree The dependency tree to search over.
   * @param assumedTruth The assumed truth of the tree (relevant for natural logic inference). If in
   *     doubt, pass in true.
   * @param isClauseClassifier The classifier for whether a given dependency arc should be a new
   *     clause. If this is not given, all arcs are treated as clause separators.
   * @param featurizer The featurizer for the classifier. If no featurizer is given, one should be
   *     given in {@link ClauseSplitterSearchProblem#search(java.util.function.Predicate,
   *     Classifier, Map, java.util.function.Function, int)}, or else the classifier will be
   *     useless.
   * @see ClauseSplitter#load(String)
   */
  protected ClauseSplitterSearchProblem(
      SemanticGraph tree,
      boolean assumedTruth,
      Optional<Classifier<ClauseSplitter.ClauseClassifierLabel, String>> isClauseClassifier,
      Optional<
              Function<
                  Triple<
                      ClauseSplitterSearchProblem.State,
                      ClauseSplitterSearchProblem.Action,
                      ClauseSplitterSearchProblem.State>,
                  Counter<String>>>
          featurizer) {
    this.tree = new SemanticGraph(tree);
    this.assumedTruth = assumedTruth;
    this.isClauseClassifier = isClauseClassifier;
    this.featurizer = featurizer;

    // Give every edge of the copied tree a position in the edge index.
    for (SemanticGraphEdge indexed : this.tree.edgeIterable()) {
      edgeToIndex.addToIndex(indexed);
    }

    // The sentence length is the index of the last word in sorted order.
    final List<IndexedWord> wordsInOrder = tree.vertexListSorted();
    final IndexedWord lastWord = wordsInOrder.get(wordsInOrder.size() - 1);
    this.sentenceLength = lastWord.index();

    // Every word starts out with empty extra-edge buckets.
    for (IndexedWord word : wordsInOrder) {
      extraEdgesByGovernor.put(word, new ArrayList<>());
      extraEdgesByDependent.put(word, new ArrayList<>());
    }

    // Clean the copy into a tree and file the removed edges under both of their endpoints.
    final List<SemanticGraphEdge> removedEdges = Util.cleanTree(this.tree);
    assert Util.isTree(this.tree);
    for (SemanticGraphEdge removed : removedEdges) {
      extraEdgesByGovernor.get(removed.getGovernor()).add(removed);
      extraEdgesByDependent.get(removed.getDependent()).add(removed);
    }
  }

  /**
   * Create a clause searcher which searches naively through every possible subtree as a clause. For
   * an end-user, this is almost certainly not what you want. However, it is very useful for
   * training time.
   *
   * <p>This delegates to the full constructor with neither a classifier nor a featurizer.
   *
   * @param tree The dependency tree to search over.
   * @param assumedTruth The truth of the premise. Almost always True.
   */
  public ClauseSplitterSearchProblem(SemanticGraph tree, boolean assumedTruth) {
    this(tree, assumedTruth, Optional.empty(), Optional.empty());
  }

  /**
   * The basic method for splitting off a clause of a tree. This modifies the tree in place: every
   * node reachable from the current roots without crossing {@code toKeep} is removed, and the
   * dependent of {@code toKeep} becomes the new root.
   *
   * @param tree The tree to split a clause from.
   * @param toKeep The edge representing the clause to keep.
   */
  static void splitToChildOfEdge(SemanticGraph tree, SemanticGraphEdge toKeep) {
    // Breadth-first walk starting at the roots, never descending through the kept edge.
    Queue<IndexedWord> pending = new LinkedList<>(tree.getRoots());
    List<IndexedWord> discarded = new ArrayList<>();
    while (!pending.isEmpty()) {
      IndexedWord current = pending.poll();
      discarded.add(current);
      for (SemanticGraphEdge outgoing : tree.outgoingEdgeIterable(current)) {
        if (outgoing.equals(toKeep)) {
          continue;
        }
        pending.add(outgoing.getDependent());
      }
    }
    // Removal is deferred until the walk is finished.
    for (IndexedWord word : discarded) {
      tree.removeVertex(word);
    }
    tree.setRoot(toKeep.getDependent());
  }

  /**
   * The basic method for splitting off a clause of a tree. This modifies the tree in place. This
   * method additionally follows ref edges: a word in the split-off clause that is the dependent of
   * an extra "ref" edge whose governor is not in the clause is replaced by the subtree rooted at
   * that governor in the original tree.
   *
   * @param tree The tree to split a clause from.
   * @param toKeep The edge representing the clause to keep.
   */
  @SuppressWarnings("unchecked")
  private void simpleClause(SemanticGraph tree, SemanticGraphEdge toKeep) {
    splitToChildOfEdge(tree, toKeep);

    // Pass 1: find the words to swap out, and what to swap them for.
    Map<IndexedWord, IndexedWord> referentByWord = new HashMap<>();
    for (IndexedWord word : tree.vertexSet()) {
      for (SemanticGraphEdge extra : extraEdgesByDependent.get(word)) {
        if (!"ref".equals(extra.getRelation().toString())) {
          continue; // not a ref edge
        }
        IndexedWord referent = extra.getGovernor();
        if (!tree.containsVertex(referent)) { // only if the referent was split away
          referentByWord.put(word, referent);
        }
      }
    }

    // Pass 2: perform the swaps.
    for (Map.Entry<IndexedWord, IndexedWord> swap : referentByWord.entrySet()) {
      IndexedWord replaced = swap.getKey();
      Iterator<SemanticGraphEdge> incoming = tree.incomingEdgeIterator(replaced);
      if (incoming.hasNext()) {
        SemanticGraphEdge attachment = incoming.next();
        IndexedWord governor = attachment.getGovernor();
        tree.removeVertex(replaced);
        addSubtree(
            tree,
            governor,
            attachment.getRelation().toString(),
            this.tree,
            swap.getValue(),
            this.tree.incomingEdgeList(tree.getFirstRoot()));
      }
    }
  }

  /**
   * A helper to add a single word to a given dependency tree. The word is attached below {@code
   * root} with an English relation named {@code rel}, a weight of negative infinity, and is not
   * marked as an extra edge.
   *
   * @param toModify The tree to add the word to.
   * @param root The root of the tree where we should be adding the word.
   * @param rel The relation to add the word with.
   * @param coreLabel The word to add.
   */
  @SuppressWarnings("UnusedDeclaration")
  private static void addWord(
      SemanticGraph toModify, IndexedWord root, String rel, CoreLabel coreLabel) {
    final IndexedWord added = new IndexedWord(coreLabel);
    toModify.addVertex(added);
    final GrammaticalRelation relation = GrammaticalRelation.valueOf(Language.English, rel);
    toModify.addEdge(root, added, relation, Double.NEGATIVE_INFINITY, false);
  }

  /**
   * A helper to add an entire subtree to a given dependency tree. Nothing is added if the subject
   * is already in the target tree, or if any node of the subtree is.
   *
   * @param toModify The tree to add the subtree to.
   * @param root The root of the tree where we should be adding the subtree.
   * @param rel The relation to add the subtree with.
   * @param originalTree The original tree (i.e., {@link ClauseSplitterSearchProblem#tree}).
   * @param subject The root of the clause to add.
   * @param ignoredEdges The edges to ignore adding when adding this subtree.
   */
  private static void addSubtree(
      SemanticGraph toModify,
      IndexedWord root,
      String rel,
      SemanticGraph originalTree,
      IndexedWord subject,
      Collection<SemanticGraphEdge> ignoredEdges) {
    if (toModify.containsVertex(subject)) {
      return; // This subtree already exists.
    }

    // Phase 1: gather the subtree breadth-first, without touching toModify yet.
    Queue<IndexedWord> frontier = new LinkedList<>();
    Collection<IndexedWord> descendants = new ArrayList<>();
    Collection<SemanticGraphEdge> subtreeEdges = new ArrayList<>();
    if (!collectSubtreeChildren(
        toModify, originalTree, subject, ignoredEdges, subtreeEdges, frontier)) {
      return;
    }
    while (!frontier.isEmpty()) {
      IndexedWord current = frontier.poll();
      descendants.add(current);
      if (!collectSubtreeChildren(
          toModify, originalTree, current, ignoredEdges, subtreeEdges, frontier)) {
        return;
      }
    }

    // Phase 2: attach the subject, then its descendants, then the edges between them.
    toModify.addVertex(subject);
    GrammaticalRelation attachment = GrammaticalRelation.valueOf(Language.English, rel);
    toModify.addEdge(root, subject, attachment, Double.NEGATIVE_INFINITY, false);
    for (IndexedWord word : descendants) {
      toModify.addVertex(word);
    }
    for (SemanticGraphEdge copied : subtreeEdges) {
      assert !toModify.incomingEdgeIterator(copied.getDependent()).hasNext();
      toModify.addEdge(
          copied.getGovernor(),
          copied.getDependent(),
          copied.getRelation(),
          copied.getWeight(),
          copied.isExtra());
    }
  }

  /**
   * Queues the non-ignored outgoing edges of one node of the original tree, along with their
   * dependents, for addition to the target tree.
   *
   * @param toModify The target tree; only consulted, never changed here.
   * @param originalTree The tree the subtree is being copied from.
   * @param node The node whose outgoing edges should be collected.
   * @param ignoredEdges The edges to skip.
   * @param edgesToAdd Receives every collected edge.
   * @param fringe Receives the dependent of every collected edge.
   * @return False as soon as a dependent is found that is already in the target tree (the subtree
   *     is not disjoint from it); true otherwise.
   */
  private static boolean collectSubtreeChildren(
      SemanticGraph toModify,
      SemanticGraph originalTree,
      IndexedWord node,
      Collection<SemanticGraphEdge> ignoredEdges,
      Collection<SemanticGraphEdge> edgesToAdd,
      Queue<IndexedWord> fringe) {
    for (SemanticGraphEdge outgoing : originalTree.outgoingEdgeIterable(node)) {
      if (ignoredEdges.contains(outgoing)) {
        continue;
      }
      if (toModify.containsVertex(outgoing.getDependent())) {
        // Case: we're adding a subtree that's not disjoint from toModify. This is bad news.
        return false;
      }
      edgesToAdd.add(outgoing);
      fringe.add(outgoing.getDependent());
    }
    return true;
  }

  /**
   * Strips aux and mark edges when we are splitting into a clause. Only edges directly below the
   * first root are considered, and only when their dependent has no outgoing edges of its own; the
   * dependent is removed together with the edge.
   *
   * @param toModify The tree we are stripping the edges from.
   */
  private void stripAuxMark(SemanticGraph toModify) {
    // Collect first, remove afterwards, so the edge iteration is not disturbed.
    List<SemanticGraphEdge> removable = new ArrayList<>();
    for (SemanticGraphEdge candidate : toModify.outgoingEdgeIterable(toModify.getFirstRoot())) {
      String relation = candidate.getRelation().toString();
      if (!"aux".equals(relation) && !"mark".equals(relation)) {
        continue;
      }
      boolean dependentIsLeaf = !toModify.outgoingEdgeIterator(candidate.getDependent()).hasNext();
      if (dependentIsLeaf) {
        removable.add(candidate);
      }
    }
    for (SemanticGraphEdge stripped : removable) {
      toModify.removeEdge(stripped);
      toModify.removeVertex(stripped.getDependent());
    }
  }

  /**
   * Create a mock node, to be added to the dependency tree but which is not part of the original
   * sentence. The mock's word, lemma and value are all set to {@code word}, its NER tag to "O",
   * and its index to five past the sentence length.
   *
   * @param toCopy The CoreLabel to copy from initially.
   * @param word The new word to add.
   * @param POS The new part of speech to add.
   * @return A CoreLabel copying most fields from toCopy, but with a new word and POS tag (as well
   *     as a new index).
   */
  @SuppressWarnings("UnusedDeclaration")
  private CoreLabel mockNode(CoreLabel toCopy, String word, String POS) {
    final CoreLabel synthetic = new CoreLabel(toCopy);
    // The surface form, lemma and value all carry the same text.
    synthetic.setWord(word);
    synthetic.setLemma(word);
    synthetic.setValue(word);
    synthetic.setNER("O");
    synthetic.setTag(POS);
    // An index beyond the end of the real sentence.
    final int mockIndex = sentenceLength + 5;
    synthetic.setIndex(mockIndex);
    return synthetic;
  }

  /**
   * Get the top few clauses from this searcher, cutting off at the given minimum probability. The
   * search is stopped at the first candidate whose probability falls below the threshold.
   *
   * @param thresholdProbability The threshold under which to stop returning clauses. This should be
   *     between 0 and 1.
   * @return The resulting {@link edu.stanford.nlp.naturalli.SentenceFragment} objects, representing
   *     the top clauses of the sentence. Each fragment's score is set to its probability.
   */
  public List<SentenceFragment> topClauses(double thresholdProbability) {
    final List<SentenceFragment> clauses = new ArrayList<>();
    search(
        candidate -> {
          // The first element of the triple is a log probability.
          final double logProb = candidate.first;
          assert logProb <= 0.0;
          final double prob = Math.exp(logProb);
          assert prob <= 1.0;
          assert prob >= 0.0;
          assert !Double.isNaN(prob);
          // The return value tells the search whether to keep going.
          final boolean aboveThreshold = prob >= thresholdProbability;
          if (aboveThreshold) {
            SentenceFragment fragment = candidate.third.get();
            fragment.score = prob;
            clauses.add(fragment);
          }
          return aboveThreshold;
        });
    return clauses;
  }

  /**
   * Search, using the default weights / featurizer. This is the most common entry method for the
   * raw search, though {@link ClauseSplitterSearchProblem#topClauses(double)} may be a more
   * convenient method for an end user.
   *
   * <p>With a classifier configured, the search uses it together with the configured featurizer.
   * Without one, an empty linear classifier is used, together with the configured featurizer or
   * the default featurizer if none was configured. Both cases use the hard-coded splits and a tick
   * limit of 1000.
   *
   * @param candidateFragments The callback function for results. The return value defines whether
   *     to continue searching.
   * @throws IllegalArgumentException If a classifier is configured but is not a linear classifier.
   */
  public void search(
      final Predicate<Triple<Double, List<Counter<String>>, Supplier<SentenceFragment>>>
          candidateFragments) {
    if (isClauseClassifier.isPresent()) {
      final Classifier<ClauseSplitter.ClauseClassifierLabel, String> configured =
          isClauseClassifier.get();
      if (!(configured instanceof LinearClassifier)) {
        throw new IllegalArgumentException("For now, only linear classifiers are supported");
      }
      // NOTE(review): this assumes a featurizer was supplied along with the classifier; get()
      // throws if it was not.
      search(candidateFragments, configured, HARD_SPLITS, this.featurizer.get(), 1000);
      return;
    }
    // No classifier configured: fall back to an empty linear classifier.
    search(
        candidateFragments,
        new LinearClassifier<>(new ClassicCounter<>()),
        HARD_SPLITS,
        this.featurizer.orElse(DEFAULT_FEATURIZER),
        1000);
  }

  /**
   * Search from the root of the tree. This function also defines the default action space to use
   * during search. This is NOT recommended to be used at test time.
   *
   * @see edu.stanford.nlp.naturalli.ClauseSplitterSearchProblem#search(Predicate)
   * @param candidateFragments The callback function.
   * @param classifier The classifier for whether an arc should be on the path to a clause split, a
   *     clause split itself, or neither.
   * @param featurizer The featurizer to use during search, to be dot producted with the weights.
   */
  public void search(
      // The output specs
      final Predicate<Triple<Double, List<Counter<String>>, Supplier<SentenceFragment>>>
          candidateFragments,
      // The learning specs
      final Classifier<ClauseSplitter.ClauseClassifierLabel, String> classifier,
      final Map<String, List<String>> hardCodedSplits,
      final Function<Triple<State, Action, State>, Counter<String>> featurizer,
      final int maxTicks) {
    Collection<Action> actionSpace = new ArrayList<>();

    // SIMPLE SPLIT
    actionSpace.add(
        new Action() {
          @Override
          public String signature() {
            return "simple";
          }

          @Override
          public boolean prerequisitesMet(SemanticGraph originalTree, SemanticGraphEdge edge) {
            char tag = edge.getDependent().tag().charAt(0);
            return !(tag != 'V' && tag != 'N' && tag != 'J' && tag != 'P' && tag != 'D');
          }

          @Override
          public Optional<State> applyTo(
              SemanticGraph tree,
              State source,
              SemanticGraphEdge outgoingEdge,
              SemanticGraphEdge subjectOrNull,
              SemanticGraphEdge objectOrNull) {
            return Optional.of(
                new State(
                    outgoingEdge,
                    subjectOrNull == null ? source.subjectOrNull : subjectOrNull,
                    subjectOrNull == null ? (source.distanceFromSubj + 1) : 0,
                    objectOrNull == null ? source.objectOrNull : objectOrNull,
                    source.thunk.andThen(
                        toModify -> {
                          assert Util.isTree(toModify);
                          simpleClause(toModify, outgoingEdge);
                          if (outgoingEdge.getRelation().toString().endsWith("comp")) {
                            stripAuxMark(toModify);
                          }
                          assert Util.isTree(toModify);
                        }),
                    false));
          }
        });

    // CLONE ROOT
    actionSpace.add(
        new Action() {
          @Override
          public String signature() {
            return "clone_root_as_nsubjpass";
          }

          @Override
          public boolean prerequisitesMet(SemanticGraph originalTree, SemanticGraphEdge edge) {
            // Only valid if there's a single nontrivial outgoing edge from a node. Otherwise it's a
            // whole can of worms.
            Iterator<SemanticGraphEdge> iter =
                originalTree.outgoingEdgeIterable(edge.getGovernor()).iterator();
            if (!iter.hasNext()) {
              return false; // what?
            }
            boolean nontrivialEdge = false;
            while (iter.hasNext()) {
              SemanticGraphEdge outEdge = iter.next();
              switch (outEdge.getRelation().toString()) {
                case "nn":
                case "amod":
                  break;
                default:
                  if (nontrivialEdge) {
                    return false;
                  }
                  nontrivialEdge = true;
              }
            }
            return true;
          }

          @Override
          public Optional<State> applyTo(
              SemanticGraph tree,
              State source,
              SemanticGraphEdge outgoingEdge,
              SemanticGraphEdge subjectOrNull,
              SemanticGraphEdge objectOrNull) {
            return Optional.of(
                new State(
                    outgoingEdge,
                    subjectOrNull == null ? source.subjectOrNull : subjectOrNull,
                    subjectOrNull == null ? (source.distanceFromSubj + 1) : 0,
                    objectOrNull == null ? source.objectOrNull : objectOrNull,
                    source.thunk.andThen(
                        toModify -> {
                          assert Util.isTree(toModify);
                          simpleClause(toModify, outgoingEdge);
                          addSubtree(
                              toModify,
                              outgoingEdge.getDependent(),
                              "nsubjpass",
                              tree,
                              outgoingEdge.getGovernor(),
                              Collections.singleton(outgoingEdge));
                          //              addWord(toModify, outgoingEdge.getDependent(), "auxpass",
                          // mockNode(outgoingEdge.getDependent().backingLabel(), "is", "VBZ"));
                          assert Util.isTree(toModify);
                        }),
                    true));
          }
        });

    // COPY SUBJECT
    actionSpace.add(
        new Action() {
          @Override
          public String signature() {
            return "clone_nsubj";
          }

          @Override
          public boolean prerequisitesMet(SemanticGraph originalTree, SemanticGraphEdge edge) {
            // Don't split into anything but verbs or nouns
            char tag = edge.getDependent().tag().charAt(0);
            if (tag != 'V' && tag != 'N') {
              return false;
            }
            for (SemanticGraphEdge grandchild :
                originalTree.outgoingEdgeIterable(edge.getDependent())) {
              if (grandchild.getRelation().toString().contains("subj")) {
                return false;
              }
            }
            return true;
          }

          @Override
          public Optional<State> applyTo(
              SemanticGraph tree,
              State source,
              SemanticGraphEdge outgoingEdge,
              SemanticGraphEdge subjectOrNull,
              SemanticGraphEdge objectOrNull) {
            if (subjectOrNull != null && !outgoingEdge.equals(subjectOrNull)) {
              return Optional.of(
                  new State(
                      outgoingEdge,
                      subjectOrNull,
                      0,
                      objectOrNull == null ? source.objectOrNull : objectOrNull,
                      source.thunk.andThen(
                          toModify -> {
                            assert Util.isTree(toModify);
                            simpleClause(toModify, outgoingEdge);
                            addSubtree(
                                toModify,
                                outgoingEdge.getDependent(),
                                "nsubj",
                                tree,
                                subjectOrNull.getDependent(),
                                Collections.singleton(outgoingEdge));
                            assert Util.isTree(toModify);
                            stripAuxMark(toModify);
                            assert Util.isTree(toModify);
                          }),
                      false));
            } else {
              return Optional.empty();
            }
          }
        });

    // COPY OBJECT
    actionSpace.add(
        new Action() {
          @Override
          public String signature() {
            return "clone_dobj";
          }

          @Override
          public boolean prerequisitesMet(SemanticGraph originalTree, SemanticGraphEdge edge) {
            // Don't split into anything but verbs or nouns
            char tag = edge.getDependent().tag().charAt(0);
            if (tag != 'V' && tag != 'N') {
              return false;
            }
            for (SemanticGraphEdge grandchild :
                originalTree.outgoingEdgeIterable(edge.getDependent())) {
              if (grandchild.getRelation().toString().contains("subj")) {
                return false;
              }
            }
            return true;
          }

          @Override
          public Optional<State> applyTo(
              SemanticGraph tree,
              State source,
              SemanticGraphEdge outgoingEdge,
              SemanticGraphEdge subjectOrNull,
              SemanticGraphEdge objectOrNull) {
            if (objectOrNull != null && !outgoingEdge.equals(objectOrNull)) {
              return Optional.of(
                  new State(
                      outgoingEdge,
                      subjectOrNull == null ? source.subjectOrNull : subjectOrNull,
                      subjectOrNull == null ? (source.distanceFromSubj + 1) : 0,
                      objectOrNull,
                      source.thunk.andThen(
                          toModify -> {
                            assert Util.isTree(toModify);
                            // Split the clause
                            simpleClause(toModify, outgoingEdge);
                            // Attach the new subject
                            addSubtree(
                                toModify,
                                outgoingEdge.getDependent(),
                                "nsubj",
                                tree,
                                objectOrNull.getDependent(),
                                Collections.singleton(outgoingEdge));
                            // Strip bits we don't want
                            assert Util.isTree(toModify);
                            stripAuxMark(toModify);
                            assert Util.isTree(toModify);
                          }),
                      false));
            } else {
              return Optional.empty();
            }
          }
        });

    for (IndexedWord root : tree.getRoots()) {
      search(
          root, candidateFragments, classifier, hardCodedSplits, featurizer, actionSpace, maxTicks);
    }
  }

  /**
   * Produces a re-ordered copy of the action space. Actions whose signature equals an entry of
   * {@code order} come first, grouped in the sequence the signatures appear in {@code order};
   * every action that matches no entry follows, in its original relative order. The input
   * collection is not modified.
   *
   * @param actionSpace The actions to re-order.
   * @param order The action signatures that should be moved to the front, most preferred first.
   * @return A new collection containing exactly the actions of {@code actionSpace}.
   */
  private Collection<Action> orderActions(Collection<Action> actionSpace, List<String> order) {
    // Work on a private copy so that matched actions can be pulled out as we go
    List<Action> remaining = new ArrayList<>(actionSpace);
    List<Action> prioritized = new ArrayList<>();
    for (String wantedSignature : order) {
      for (Iterator<Action> cursor = remaining.iterator(); cursor.hasNext(); ) {
        Action candidate = cursor.next();
        if (candidate.signature().equals(wantedSignature)) {
          prioritized.add(candidate);
          cursor.remove();
        }
      }
    }
    // Whatever was never asked for keeps its original order at the tail
    prioritized.addAll(remaining);
    return prioritized;
  }

  /**
   * The core implementation of the search.
   *
   * <p>States are kept on a priority queue keyed by the score they were pushed with. Each popped
   * state that is marked as done is reported to {@code candidateFragments}; afterwards every
   * outgoing edge of the state's word (in the original tree) is expanded with every applicable
   * action, and the resulting child states are pushed back onto the queue. Words that have already
   * been expanded are never pushed again.
   *
   * @param root The root word to search from. Traditionally, this is the root of the sentence.
   * @param candidateFragments The callback for the resulting sentence fragments. This is a
   *     predicate of a triple of values. The return value of the predicate determines whether we
   *     should continue searching. The triple is a triple of
   *     <ol>
   *       <li>The log probability the fragment's state was queued with. As pushed by this method,
   *           this is the score of the step that produced the state (0.0 for forced splits and for
   *           the initial state).
   *       <li>The features along the path to this fragment. The last element of this is the
   *           features from the most recent step.
   *       <li>The sentence fragment. Because it is relatively expensive to compute the resulting
   *           tree, this is returned as a lazy {@link Supplier}.
   *     </ol>
   *
   * @param classifier The classifier for whether an arc should be on the path to a clause split, a
   *     clause split itself, or neither.
   * @param hardCodedSplits Forced action orders, keyed by the string form of a dependency relation.
   *     If the exact relation has no entry and the relation contains a colon, the key formed by the
   *     part before the first colon followed by {@code ":*"} is tried instead. When an entry
   *     exists, the actions are tried in the listed signature order (unlisted actions last), the
   *     first one that applies is taken as a clause split with log probability 0.0 without
   *     consulting the classifier, and no further action is pushed for that edge.
   * @param featurizer The featurizer to use. Make sure this matches the weights!
   * @param actionSpace The action space we are allowed to take. Each action defines a means of
   *     splitting a clause on a dependency boundary.
   * @param maxTicks The maximum number of states to pop from the fringe. Once this budget is
   *     exceeded the search returns silently; fragments already reported are unaffected.
   */
  protected void search(
      // The root to search from
      IndexedWord root,
      // The output specs
      final Predicate<Triple<Double, List<Counter<String>>, Supplier<SentenceFragment>>>
          candidateFragments,
      // The learning specs
      final Classifier<ClauseSplitter.ClauseClassifierLabel, String> classifier,
      Map<String, ? extends List<String>> hardCodedSplits,
      final Function<Triple<State, Action, State>, Counter<String>> featurizer,
      final Collection<Action> actionSpace,
      final int maxTicks) {
    // (the fringe)
    PriorityQueue<Pair<State, List<Counter<String>>>> fringe = new FixedPrioritiesPriorityQueue<>();
    // (avoid duplicate work)
    Set<IndexedWord> seenWords = new HashSet<>();

    State firstState =
        new State(null, null, -9000, null, x -> {}, true); // First state is implicitly "done"
    fringe.add(Pair.makePair(firstState, new ArrayList<>(0)), -0.0);
    int ticks = 0;

    while (!fringe.isEmpty()) {
      if (++ticks > maxTicks) {
        // Tick budget exhausted: stop quietly, keeping whatever has already been reported
        return;
      }
      // Useful variables
      double logProbSoFar = fringe.getPriority();
      assert logProbSoFar <= 0.0;
      Pair<State, List<Counter<String>>> lastStatePair = fringe.removeFirst();
      State lastState = lastStatePair.first;
      List<Counter<String>> featuresSoFar = lastStatePair.second;
      // The initial state has no edge; it stands for the search root itself
      IndexedWord rootWord = lastState.edge == null ? root : lastState.edge.getDependent();

      // Register thunk
      if (lastState.isDone) {
        if (!candidateFragments.test(
            Triple.makeTriple(
                logProbSoFar,
                featuresSoFar,
                () -> {
                  // Lazily materialize the fragment: replay the state's edits on a copy of the tree
                  SemanticGraph copy = new SemanticGraph(tree);
                  lastState
                      .thunk
                      .andThen(
                          x -> {
                            // Add the extra edges back in, if they don't break the tree-ness of the
                            // extraction
                            for (IndexedWord newTreeRoot : x.getRoots()) {
                              if (newTreeRoot != null) { // what a strange thing to have happen...
                                for (SemanticGraphEdge extraEdge :
                                    extraEdgesByGovernor.get(newTreeRoot)) {
                                  assert Util.isTree(x);
                                  //noinspection unchecked
                                  addSubtree(
                                      x,
                                      newTreeRoot,
                                      extraEdge.getRelation().toString(),
                                      tree,
                                      extraEdge.getDependent(),
                                      tree.getIncomingEdgesSorted(newTreeRoot));
                                  assert Util.isTree(x);
                                }
                              }
                            }
                          })
                      .accept(copy);
                  return new SentenceFragment(copy, assumedTruth, false);
                }))) {
          // The callback asked us to stop searching
          break;
        }
      }

      // Find relevant auxiliary terms: a subject and an object edge of the current word, if any.
      // If several edges qualify, the last one iterated wins.
      SemanticGraphEdge subjOrNull = null;
      SemanticGraphEdge objOrNull = null;
      for (SemanticGraphEdge auxEdge : tree.outgoingEdgeIterable(rootWord)) {
        String relString = auxEdge.getRelation().toString();
        if (relString.contains("obj")) {
          objOrNull = auxEdge;
        } else if (relString.contains("subj")) {
          subjOrNull = auxEdge;
        }
      }

      // Iterate over children
      // For each outgoing edge...
      for (SemanticGraphEdge outgoingEdge : tree.outgoingEdgeIterable(rootWord)) {
        // Prohibit indirect speech verbs from splitting off clauses
        // (e.g., 'said', 'think')
        // This fires if the governor is an indirect speech verb, and the outgoing edge is a ccomp
        if (outgoingEdge.getRelation().toString().equals("ccomp")
            && ((outgoingEdge.getGovernor().lemma() != null
                    && INDIRECT_SPEECH_LEMMAS.contains(outgoingEdge.getGovernor().lemma()))
                || INDIRECT_SPEECH_LEMMAS.contains(outgoingEdge.getGovernor().word()))) {
          continue;
        }
        // Get some variables
        String outgoingEdgeRelation = outgoingEdge.getRelation().toString();
        List<String> forcedArcOrder = hardCodedSplits.get(outgoingEdgeRelation);
        if (forcedArcOrder == null && outgoingEdgeRelation.contains(":")) {
          // Fall back to the wildcard entry for the relation's prefix, e.g. "prefix:*"
          forcedArcOrder =
              hardCodedSplits.get(
                  outgoingEdgeRelation.substring(0, outgoingEdgeRelation.indexOf(":")) + ":*");
        }
        boolean doneForcedArc = false;
        // For each action...
        for (Action action :
            (forcedArcOrder == null ? actionSpace : orderActions(actionSpace, forcedArcOrder))) {
          // Check the prerequisite
          if (!action.prerequisitesMet(tree, outgoingEdge)) {
            continue;
          }
          // A forced arc takes exactly one action; stop once it has been pushed
          if (forcedArcOrder != null && doneForcedArc) {
            break;
          }
          // 1. Compute the child state
          Optional<State> candidate =
              action.applyTo(tree, lastState, outgoingEdge, subjOrNull, objOrNull);
          if (candidate.isPresent()) {
            double logProbability;
            ClauseClassifierLabel bestLabel;
            Counter<String> features =
                featurizer.apply(Triple.makeTriple(lastState, action, candidate.get()));
            if (forcedArcOrder != null && !doneForcedArc) {
              // Hard-coded split: bypass the classifier and always split
              logProbability = 0.0;
              bestLabel = ClauseClassifierLabel.CLAUSE_SPLIT;
              doneForcedArc = true;
            } else if (features.containsKey("__undocumented_junit_no_classifier")) {
              logProbability = Double.NEGATIVE_INFINITY;
              bestLabel = ClauseClassifierLabel.CLAUSE_INTERM;
            } else {
              Counter<ClauseClassifierLabel> scores = classifier.scoresOf(new RVFDatum<>(features));
              if (scores.size() > 0) {
                Counters.logNormalizeInPlace(scores);
              }
              String rel = outgoingEdge.getRelation().toString();
              if ("nsubj".equals(rel) || "dobj".equals(rel)) {
                scores.remove(
                    ClauseClassifierLabel.NOT_A_CLAUSE); // Always at least yield on nsubj and dobj
              }
              logProbability = Counters.max(scores, Double.NEGATIVE_INFINITY);
              bestLabel = Counters.argmax(scores, (x, y) -> 0, ClauseClassifierLabel.CLAUSE_SPLIT);
            }

            if (bestLabel != ClauseClassifierLabel.NOT_A_CLAUSE) {
              // Extend the feature path with this step. A plain ArrayList is used rather than
              // double-brace initialization, which would create an anonymous subclass instance
              // holding a reference to this search problem for every pushed state.
              List<Counter<String>> childFeatures = new ArrayList<>(featuresSoFar.size() + 1);
              childFeatures.addAll(featuresSoFar);
              childFeatures.add(features);
              Pair<State, List<Counter<String>>> childState =
                  Pair.makePair(candidate.get().withIsDone(bestLabel), childFeatures);
              // 2. Register the child state
              // (skip children whose word has already been expanded)
              if (!seenWords.contains(childState.first.edge.getDependent())) {
                fringe.add(childState, logProbability);
              }
            }
          }
        }
      }

      seenWords.add(rootWord);
    }
  }

  /**
   * The featurizer used by default during training. Every feature it emits is an indicator whose
   * name starts with the signature of the action taken, followed by {@code &}-separated
   * descriptions of the edge taken, the previous edge, the neighboring edges at both ends of the
   * edge taken, and the part-of-speech tags involved.
   */
  public static final Featurizer DEFAULT_FEATURIZER =
      new Featurizer() {
        private static final long serialVersionUID = 4145523451314579506L;

        /** A feature vector denotes a simple split iff some feature name starts with "simple&". */
        @Override
        public boolean isSimpleSplit(Counter<String> feats) {
          return feats.keySet().stream().anyMatch(name -> name.startsWith("simple&"));
        }

        /**
         * Featurizes a single search step.
         *
         * @param triple The state we came from, the action taken, and the state we arrived at.
         * @return The indicator features describing this step.
         */
        @Override
        public Counter<String> apply(Triple<State, Action, State> triple) {
          State from = triple.first;
          State to = triple.third;
          String sig = triple.second.signature();

          // Relation of the edge taken, in full and in short form; "root" when there is no edge
          String edgeRelTaken = "root";
          String edgeRelShort = "root";
          if (to.edge != null) {
            edgeRelTaken = to.edge.getRelation().toString();
            edgeRelShort = to.edge.getRelation().getShortName();
          }
          // Only the part of the short name before the first underscore is kept
          int cut = edgeRelShort.indexOf("_");
          if (cut >= 0) {
            edgeRelShort = edgeRelShort.substring(0, cut);
          }
          String edgeTypePrefix = sig + "&edge_type:" + edgeRelShort;

          Counter<String> feats = new ClassicCounter<>();

          // 1. The edge taken
          feats.incrementCount(sig + "&edge:" + edgeRelTaken);
          feats.incrementCount(edgeTypePrefix);

          // 2. The edge taken on the previous step
          if (from.edge != null) {
            feats.incrementCount(sig + "&not_root");
            String lastRelShort = from.edge.getRelation().getShortName();
            int lastCut = lastRelShort.indexOf("_");
            if (lastCut >= 0) {
              lastRelShort = lastRelShort.substring(0, lastCut);
            }
            feats.incrementCount(sig + "&last_edge:" + lastRelShort);
          } else {
            assert to.edge == null || to.originalTree().getRoots().contains(to.edge.getGovernor());
            feats.incrementCount(sig + "&at_root");
            feats.incrementCount(
                sig + "&at_root&root_pos:" + to.originalTree().getFirstRoot().tag());
          }

          // Everything below describes the edge taken, so there is nothing more to add without one
          if (to.edge == null) {
            return feats;
          }

          // 3. The other edges leaving the parent (the edge taken itself is excluded by identity)
          boolean parentHasSubj = false;
          boolean parentHasObj = false;
          for (SemanticGraphEdge sibling :
              from.originalTree().outgoingEdgeIterable(to.edge.getGovernor())) {
            if (sibling == to.edge) {
              continue;
            }
            String siblingRel = sibling.getRelation().toString();
            parentHasSubj |= siblingRel.contains("subj");
            parentHasObj |= siblingRel.contains("obj");
            feats.incrementCount(sig + "&parent_neighbor:" + siblingRel);
            feats.incrementCount(edgeTypePrefix + "&parent_neighbor:" + siblingRel);
          }

          // 4. The edges leaving the child
          boolean childHasSubj = false;
          boolean childHasObj = false;
          int childNeighborCount = 0;
          for (SemanticGraphEdge grandchild :
              from.originalTree().outgoingEdgeIterable(to.edge.getDependent())) {
            String grandchildRel = grandchild.getRelation().toString();
            childHasSubj |= grandchildRel.contains("subj");
            childHasObj |= grandchildRel.contains("obj");
            childNeighborCount++;
            feats.incrementCount(sig + "&child_neighbor:" + grandchildRel);
            feats.incrementCount(edgeTypePrefix + "&child_neighbor:" + grandchildRel);
          }
          // 4.1 How many edges leave the child, bucketed as 0, 1, 2 or ">2"
          String countBucket =
              childNeighborCount < 3 ? Integer.toString(childNeighborCount) : ">2";
          feats.incrementCount(sig + "&child_neighbor_count:" + countBucket);
          feats.incrementCount(edgeTypePrefix + "&child_neighbor_count:" + countBucket);

          // 5. Whether a subject / object was seen at either end
          feats.incrementCount(sig + "&parent_neighbor_subj:" + parentHasSubj);
          feats.incrementCount(sig + "&parent_neighbor_obj:" + parentHasObj);
          feats.incrementCount(sig + "&child_neighbor_subj:" + childHasSubj);
          feats.incrementCount(sig + "&child_neighbor_obj:" + childHasObj);

          // 6. Part-of-speech tags of both ends of the edge taken
          String parentTag = to.edge.getGovernor().tag();
          String childTag = to.edge.getDependent().tag();
          String posSignature = parentTag + "_" + childTag;
          feats.incrementCount(sig + "&parent_pos:" + parentTag);
          feats.incrementCount(sig + "&child_pos:" + childTag);
          feats.incrementCount(sig + "&pos_signature:" + posSignature);
          feats.incrementCount(edgeTypePrefix + "&pos_signature:" + posSignature);
          return feats;
        }
      };
}