/**
 * Create a searcher manually, supplying a dependency tree, an optional classifier for when to
 * split clauses, and a featurizer for that classifier. You almost certainly want to use {@link
 * ClauseSplitter#load(String)} instead of this constructor.
 *
 * @param tree The dependency tree to search over.
 * @param assumedTruth The assumed truth of the tree (relevant for natural logic inference). If in
 *     doubt, pass in true.
 * @param isClauseClassifier The classifier for whether a given dependency arc should be a new
 *     clause. If this is not given, all arcs are treated as clause separators.
 * @param featurizer The featurizer for the classifier. If no featurizer is given, one should be
 *     given in {@link ClauseSplitterSearchProblem#search(java.util.function.Predicate, Classifier,
 *     Map, java.util.function.Function, int)}, or else the classifier will be useless.
 * @see ClauseSplitter#load(String)
 */
protected ClauseSplitterSearchProblem(
    SemanticGraph tree,
    boolean assumedTruth,
    Optional<Classifier<ClauseSplitter.ClauseClassifierLabel, String>> isClauseClassifier,
    Optional<
            Function<
                Triple<
                    ClauseSplitterSearchProblem.State,
                    ClauseSplitterSearchProblem.Action,
                    ClauseSplitterSearchProblem.State>,
                Counter<String>>>
        featurizer) {
  this.tree = new SemanticGraph(tree);
  this.assumedTruth = assumedTruth;
  this.isClauseClassifier = isClauseClassifier;
  this.featurizer = featurizer;
  // Index edges
  this.tree.edgeIterable().forEach(edgeToIndex::addToIndex);
  // Get length
  List<IndexedWord> sortedVertices = tree.vertexListSorted();
  sentenceLength = sortedVertices.get(sortedVertices.size() - 1).index();
  // Register extra edges
  for (IndexedWord vertex : sortedVertices) {
    extraEdgesByGovernor.put(vertex, new ArrayList<>());
    extraEdgesByDependent.put(vertex, new ArrayList<>());
  }
  List<SemanticGraphEdge> extraEdges = Util.cleanTree(this.tree);
  assert Util.isTree(this.tree);
  for (SemanticGraphEdge edge : extraEdges) {
    extraEdgesByGovernor.get(edge.getGovernor()).add(edge);
    extraEdgesByDependent.get(edge.getDependent()).add(edge);
  }
}
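// Construction sketch (illustrative, not part of the original source): `parse` is an
// assumed SemanticGraph from a dependency parser, and this call must come from a
// subclass or the same package, since the constructor is protected. With both
// Optionals empty, every arc is treated as a clause separator, per the Javadoc above:
//
//   ClauseSplitterSearchProblem problem =
//       new ClauseSplitterSearchProblem(parse, true, Optional.empty(), Optional.empty());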
/**
 * A little utility function to make sure a SemanticGraph is a tree.
 *
 * @param tree The tree to check.
 * @return True if this {@link edu.stanford.nlp.semgraph.SemanticGraph} is a tree (versus a DAG,
 *     or a general graph).
 */
public static boolean isTree(SemanticGraph tree) {
  for (IndexedWord vertex : tree.vertexSet()) {
    // Check that every non-root vertex has one and only one incoming edge
    if (tree.getRoots().contains(vertex)) {
      if (tree.incomingEdgeIterator(vertex).hasNext()) {
        return false;
      }
    } else {
      Iterator<SemanticGraphEdge> iter = tree.incomingEdgeIterator(vertex);
      if (!iter.hasNext()) {
        return false;
      }
      iter.next();
      if (iter.hasNext()) {
        return false;
      }
    }
    // Check that incoming and outgoing edges match
    for (SemanticGraphEdge edge : tree.outgoingEdgeIterable(vertex)) {
      boolean foundReverse = false;
      for (SemanticGraphEdge reverse : tree.incomingEdgeIterable(edge.getDependent())) {
        if (reverse == edge) {
          foundReverse = true;
        }
      }
      if (!foundReverse) {
        return false;
      }
    }
    for (SemanticGraphEdge edge : tree.incomingEdgeIterable(vertex)) {
      boolean foundReverse = false;
      for (SemanticGraphEdge reverse : tree.outgoingEdgeIterable(edge.getGovernor())) {
        if (reverse == edge) {
          foundReverse = true;
        }
      }
      if (!foundReverse) {
        return false;
      }
    }
  }
  // Check for cycles
  if (isCyclic(tree)) {
    return false;
  }
  // Check topological sort -- sometimes fails?
  // try {
  //   tree.topologicalSort();
  // } catch (Exception e) {
  //   e.printStackTrace();
  //   return false;
  // }
  return true;
}
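// Usage sketch (a hypothetical helper, not part of the original source): isTree is
// the invariant checked before and after the destructive operations in this file,
// such as cleanTree and stripPrepCases. A caller might wrap it like this:
private static void requireTree(SemanticGraph graph) {
  if (!isTree(graph)) {
    throw new IllegalArgumentException("Expected a tree-shaped SemanticGraph, got a DAG or graph");
  }
}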
/**
 * Strips aux and mark edges when we are splitting into a clause.
 *
 * @param toModify The tree we are stripping the edges from.
 */
private void stripAuxMark(SemanticGraph toModify) {
  List<SemanticGraphEdge> toClean = new ArrayList<>();
  for (SemanticGraphEdge edge : toModify.outgoingEdgeIterable(toModify.getFirstRoot())) {
    String rel = edge.getRelation().toString();
    if (("aux".equals(rel) || "mark".equals(rel))
        && !toModify.outgoingEdgeIterator(edge.getDependent()).hasNext()) {
      toClean.add(edge);
    }
  }
  for (SemanticGraphEdge edge : toClean) {
    toModify.removeEdge(edge);
    toModify.removeVertex(edge.getDependent());
  }
}
/**
 * Strip away case edges, if the incoming edge is a preposition. This replicates the behavior of
 * the old Stanford dependencies on universal dependencies.
 *
 * @param tree The tree to modify in place.
 */
public static void stripPrepCases(SemanticGraph tree) {
  // Find case edges whose governor has an incoming 'nmod' edge
  List<SemanticGraphEdge> toClean = new ArrayList<>();
  for (SemanticGraphEdge edge : tree.edgeIterable()) {
    if ("case".equals(edge.getRelation().toString())) {
      boolean isPrepTarget = false;
      for (SemanticGraphEdge incoming : tree.incomingEdgeIterable(edge.getGovernor())) {
        if ("nmod".equals(incoming.getRelation().getShortName())) {
          isPrepTarget = true;
          break;
        }
      }
      if (isPrepTarget && !tree.outgoingEdgeIterator(edge.getDependent()).hasNext()) {
        toClean.add(edge);
      }
    }
  }
  // Delete these edges
  for (SemanticGraphEdge edge : toClean) {
    tree.removeEdge(edge);
    tree.removeVertex(edge.getDependent());
    assert isTree(tree);
  }
}
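// Worked example (hedged; the sentence is an assumption chosen for illustration): in
// the UD parse of "She studied in Berlin", the token "in" hangs off "Berlin" via a
// `case` arc while "Berlin" attaches to "studied" via `nmod`. stripPrepCases deletes
// the leaf "in", mimicking the old Stanford dependencies:
//
//   stripPrepCases(tree);  // modifies `tree` in place
//   assert isTree(tree);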
/**
 * The basic method for splitting off a clause of a tree. This modifies the tree in place. This
 * method additionally follows ref edges.
 *
 * @param tree The tree to split a clause from.
 * @param toKeep The edge representing the clause to keep.
 */
@SuppressWarnings("unchecked")
private void simpleClause(SemanticGraph tree, SemanticGraphEdge toKeep) {
  splitToChildOfEdge(tree, toKeep);
  // Follow 'ref' edges
  Map<IndexedWord, IndexedWord> refReplaceMap = new HashMap<>();
  // (find replacements)
  for (IndexedWord vertex : tree.vertexSet()) {
    for (SemanticGraphEdge edge : extraEdgesByDependent.get(vertex)) {
      if ("ref".equals(edge.getRelation().toString()) // it's a ref edge...
          && !tree.containsVertex(edge.getGovernor())) { // ...whose governor isn't already in the tree
        refReplaceMap.put(vertex, edge.getGovernor());
      }
    }
  }
  // (do replacements)
  for (Map.Entry<IndexedWord, IndexedWord> entry : refReplaceMap.entrySet()) {
    Iterator<SemanticGraphEdge> iter = tree.incomingEdgeIterator(entry.getKey());
    if (!iter.hasNext()) {
      continue;
    }
    SemanticGraphEdge incomingEdge = iter.next();
    IndexedWord governor = incomingEdge.getGovernor();
    tree.removeVertex(entry.getKey());
    addSubtree(
        tree,
        governor,
        incomingEdge.getRelation().toString(),
        this.tree,
        entry.getValue(),
        this.tree.incomingEdgeList(tree.getFirstRoot()));
  }
}
/**
 * The basic method for splitting off a clause of a tree. This modifies the tree in place.
 *
 * @param tree The tree to split a clause from.
 * @param toKeep The edge representing the clause to keep.
 */
static void splitToChildOfEdge(SemanticGraph tree, SemanticGraphEdge toKeep) {
  Queue<IndexedWord> fringe = new LinkedList<>();
  List<IndexedWord> nodesToRemove = new ArrayList<>();
  // Find nodes to remove
  // (from the root)
  for (IndexedWord root : tree.getRoots()) {
    nodesToRemove.add(root);
    for (SemanticGraphEdge out : tree.outgoingEdgeIterable(root)) {
      if (!out.equals(toKeep)) {
        fringe.add(out.getDependent());
      }
    }
  }
  // (recursively)
  while (!fringe.isEmpty()) {
    IndexedWord node = fringe.poll();
    nodesToRemove.add(node);
    for (SemanticGraphEdge out : tree.outgoingEdgeIterable(node)) {
      if (!out.equals(toKeep)) {
        fringe.add(out.getDependent());
      }
    }
  }
  // Remove nodes
  nodesToRemove.forEach(tree::removeVertex);
  // Set new root
  tree.setRoot(toKeep.getDependent());
}
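// Behavior sketch (illustrative): splitToChildOfEdge deletes everything outside the
// kept edge's subtree and re-roots the graph at that edge's dependent. For an
// assumed edge `kept` in an assumed graph `tree`:
//
//   splitToChildOfEdge(tree, kept);
//   assert tree.getRoots().contains(kept.getDependent());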
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
  Annotation document = this.processor.process(jCas.getDocumentText());

  String lastNETag = "O";
  int lastNEBegin = -1;
  int lastNEEnd = -1;
  for (CoreMap tokenAnn : document.get(TokensAnnotation.class)) {

    // create the token annotation
    int begin = tokenAnn.get(CharacterOffsetBeginAnnotation.class);
    int end = tokenAnn.get(CharacterOffsetEndAnnotation.class);
    String pos = tokenAnn.get(PartOfSpeechAnnotation.class);
    String lemma = tokenAnn.get(LemmaAnnotation.class);
    Token token = new Token(jCas, begin, end);
    token.setPos(pos);
    token.setLemma(lemma);
    token.addToIndexes();

    // hackery to convert token-level named entity tag into phrase-level tag
    String neTag = tokenAnn.get(NamedEntityTagAnnotation.class);
    if (neTag.equals("O") && !lastNETag.equals("O")) {
      NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
      ne.setMentionType(lastNETag);
      ne.addToIndexes();
    } else {
      if (lastNETag.equals("O")) {
        lastNEBegin = begin;
      } else if (lastNETag.equals(neTag)) {
        // do nothing - begin was already set
      } else {
        NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
        ne.setMentionType(lastNETag);
        ne.addToIndexes();
        lastNEBegin = begin;
      }
      lastNEEnd = end;
    }
    lastNETag = neTag;
  }
  if (!lastNETag.equals("O")) {
    NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
    ne.setMentionType(lastNETag);
    ne.addToIndexes();
  }

  // add sentences and trees
  for (CoreMap sentenceAnn : document.get(SentencesAnnotation.class)) {

    // add the sentence annotation
    int sentBegin = sentenceAnn.get(CharacterOffsetBeginAnnotation.class);
    int sentEnd = sentenceAnn.get(CharacterOffsetEndAnnotation.class);
    Sentence sentence = new Sentence(jCas, sentBegin, sentEnd);
    sentence.addToIndexes();

    // add the syntactic tree annotation
    List<CoreLabel> tokenAnns = sentenceAnn.get(TokensAnnotation.class);
    Tree tree = sentenceAnn.get(TreeAnnotation.class);
    if (tree.children().length != 1) {
      throw new RuntimeException("Expected single root node, found " + tree);
    }
    tree = tree.firstChild();
    tree.indexSpans(0);
    TopTreebankNode root = new TopTreebankNode(jCas);
    root.setTreebankParse(tree.toString());
    // TODO: root.setTerminals(v)
    this.addTreebankNodeToIndexes(root, jCas, tree, tokenAnns);

    // get the dependencies
    SemanticGraph dependencies =
        sentenceAnn.get(CollapsedCCProcessedDependenciesAnnotation.class);

    // convert Stanford nodes to UIMA annotations
    List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence);
    Map<IndexedWord, DependencyNode> stanfordToUima = new HashMap<IndexedWord, DependencyNode>();
    for (IndexedWord stanfordNode : dependencies.vertexSet()) {
      int indexBegin = stanfordNode.get(BeginIndexAnnotation.class);
      int indexEnd = stanfordNode.get(EndIndexAnnotation.class);
      int tokenBegin = tokens.get(indexBegin).getBegin();
      int tokenEnd = tokens.get(indexEnd - 1).getEnd();
      DependencyNode node;
      if (dependencies.getRoots().contains(stanfordNode)) {
        node = new TopDependencyNode(jCas, tokenBegin, tokenEnd);
      } else {
        node = new DependencyNode(jCas, tokenBegin, tokenEnd);
      }
      stanfordToUima.put(stanfordNode, node);
    }

    // create relation annotations for each Stanford dependency
    ArrayListMultimap<DependencyNode, DependencyRelation> headRelations =
        ArrayListMultimap.create();
    ArrayListMultimap<DependencyNode, DependencyRelation> childRelations =
        ArrayListMultimap.create();
    for (SemanticGraphEdge stanfordEdge : dependencies.edgeIterable()) {
      DependencyRelation relation = new DependencyRelation(jCas);
      DependencyNode head = stanfordToUima.get(stanfordEdge.getGovernor());
      DependencyNode child = stanfordToUima.get(stanfordEdge.getDependent());
      String relationType = stanfordEdge.getRelation().toString();
      if (head == null || child == null || relationType == null) {
        throw new RuntimeException(
            String.format(
                "null elements not allowed in relation:\nrelation=%s\nchild=%s\nhead=%s\n",
                relation, child, head));
      }
      relation.setHead(head);
      relation.setChild(child);
      relation.setRelation(relationType);
      relation.addToIndexes();
      headRelations.put(child, relation);
      childRelations.put(head, relation);
    }

    // set the relations for each node annotation
    for (DependencyNode node : stanfordToUima.values()) {
      List<DependencyRelation> heads = headRelations.get(node);
      node.setHeadRelations(new FSArray(jCas, heads == null ? 0 : heads.size()));
      if (heads != null) {
        FSCollectionFactory.fillArrayFS(node.getHeadRelations(), heads);
      }
      List<DependencyRelation> children = childRelations.get(node);
      node.setChildRelations(new FSArray(jCas, children == null ? 0 : children.size()));
      if (children != null) {
        FSCollectionFactory.fillArrayFS(node.getChildRelations(), children);
      }
      node.addToIndexes();
    }
  }

  // map from spans to named entity mentions
  Map<Span, NamedEntityMention> spanMentionMap = new HashMap<Span, NamedEntityMention>();
  for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
    spanMentionMap.put(new Span(mention.getBegin(), mention.getEnd()), mention);
  }

  // add mentions for all entities identified by the coreference system
  List<NamedEntity> entities = new ArrayList<NamedEntity>();
  List<List<Token>> sentenceTokens = new ArrayList<List<Token>>();
  for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
    sentenceTokens.add(JCasUtil.selectCovered(jCas, Token.class, sentence));
  }
  Map<Integer, CorefChain> corefChains = document.get(CorefChainAnnotation.class);
  for (CorefChain chain : corefChains.values()) {
    List<NamedEntityMention> mentions = new ArrayList<NamedEntityMention>();
    for (CorefMention corefMention : chain.getMentionsInTextualOrder()) {

      // figure out the character span of the mention
      List<Token> tokens = sentenceTokens.get(corefMention.sentNum - 1);
      int begin = tokens.get(corefMention.startIndex - 1).getBegin();
      int end = tokens.get(corefMention.endIndex - 2).getEnd();

      // use an existing named entity mention when possible; otherwise create a new one
      NamedEntityMention mention = spanMentionMap.get(new Span(begin, end));
      if (mention == null) {
        mention = new NamedEntityMention(jCas, begin, end);
        mention.addToIndexes();
      }
      mentions.add(mention);
    }

    // sort the mentions by document order
    Collections.sort(
        mentions,
        new Comparator<NamedEntityMention>() {
          @Override
          public int compare(NamedEntityMention m1, NamedEntityMention m2) {
            return m1.getBegin() - m2.getBegin();
          }
        });

    // create an entity and add the mentions to it
    NamedEntity entity = new NamedEntity(jCas);
    entity.setMentions(new FSArray(jCas, mentions.size()));
    int index = 0;
    for (NamedEntityMention mention : mentions) {
      mention.setMentionedEntity(entity);
      entity.setMentions(index, mention);
      index += 1;
    }
    entities.add(entity);
  }

  // add singleton entities for any named entities not picked up by the coreference system
  for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
    if (mention.getMentionedEntity() == null) {
      NamedEntity entity = new NamedEntity(jCas);
      entity.setMentions(new FSArray(jCas, 1));
      entity.setMentions(0, mention);
      mention.setMentionedEntity(entity);
      entities.add(entity);
    }
  }

  // sort entities by document order
  Collections.sort(
      entities,
      new Comparator<NamedEntity>() {
        @Override
        public int compare(NamedEntity o1, NamedEntity o2) {
          return getFirstBegin(o1) - getFirstBegin(o2);
        }

        private int getFirstBegin(NamedEntity entity) {
          int min = Integer.MAX_VALUE;
          for (NamedEntityMention mention :
              JCasUtil.select(entity.getMentions(), NamedEntityMention.class)) {
            if (mention.getBegin() < min) {
              min = mention.getBegin();
            }
          }
          return min;
        }
      });

  // add entities to document
  for (NamedEntity entity : entities) {
    entity.addToIndexes();
  }
}
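// Pipeline sketch (hedged: assumes a uimaFIT-style setup; the class name
// StanfordCoreNlpAnnotator and the surrounding project wiring are assumptions, not
// confirmed by this file):
//
//   AnalysisEngine engine = AnalysisEngineFactory.createEngine(StanfordCoreNlpAnnotator.class);
//   JCas jCas = engine.newJCas();
//   jCas.setDocumentText("Stanford University is located in California.");
//   engine.process(jCas);
//   for (NamedEntity entity : JCasUtil.select(jCas, NamedEntity.class)) {
//     // one entity per coreference chain, plus singletons
//   }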
/**
 * The core implementation of the search.
 *
 * @param root The root word to search from. Traditionally, this is the root of the sentence.
 * @param candidateFragments The callback for the resulting sentence fragments. This is a
 *     predicate of a triple of values. The return value of the predicate determines whether we
 *     should continue searching. The triple is composed of:
 *     <ol>
 *       <li>The log probability of the sentence fragment, according to the featurizer and the
 *           weights.
 *       <li>The features along the path to this fragment. The last element of this is the
 *           features from the most recent step.
 *       <li>The sentence fragment. Because it is relatively expensive to compute the resulting
 *           tree, this is returned as a lazy {@link Supplier}.
 *     </ol>
 *
 * @param classifier The classifier for whether an arc should be on the path to a clause split, a
 *     clause split itself, or neither.
 * @param hardCodedSplits A map from dependency relations to forced arc orders; edges matching
 *     these relations are always split, bypassing the classifier.
 * @param featurizer The featurizer to use. Make sure this matches the weights!
 * @param actionSpace The action space we are allowed to take. Each action defines a means of
 *     splitting a clause on a dependency boundary.
 * @param maxTicks The maximum number of search ticks before the search gives up.
 */
protected void search(
    // The root to search from
    IndexedWord root,
    // The output specs
    final Predicate<Triple<Double, List<Counter<String>>, Supplier<SentenceFragment>>>
        candidateFragments,
    // The learning specs
    final Classifier<ClauseSplitter.ClauseClassifierLabel, String> classifier,
    Map<String, ? extends List<String>> hardCodedSplits,
    final Function<Triple<State, Action, State>, Counter<String>> featurizer,
    final Collection<Action> actionSpace,
    final int maxTicks) {
  // (the fringe)
  PriorityQueue<Pair<State, List<Counter<String>>>> fringe = new FixedPrioritiesPriorityQueue<>();
  // (avoid duplicate work)
  Set<IndexedWord> seenWords = new HashSet<>();

  State firstState = new State(null, null, -9000, null, x -> {}, true); // First state is implicitly "done"
  fringe.add(Pair.makePair(firstState, new ArrayList<>(0)), -0.0);
  int ticks = 0;

  while (!fringe.isEmpty()) {
    if (++ticks > maxTicks) {
      // System.err.println("WARNING! Timed out on search with " + ticks + " ticks");
      return;
    }

    // Useful variables
    double logProbSoFar = fringe.getPriority();
    assert logProbSoFar <= 0.0;
    Pair<State, List<Counter<String>>> lastStatePair = fringe.removeFirst();
    State lastState = lastStatePair.first;
    List<Counter<String>> featuresSoFar = lastStatePair.second;
    IndexedWord rootWord = lastState.edge == null ? root : lastState.edge.getDependent();

    // Register thunk
    if (lastState.isDone) {
      if (!candidateFragments.test(
          Triple.makeTriple(
              logProbSoFar,
              featuresSoFar,
              () -> {
                SemanticGraph copy = new SemanticGraph(tree);
                lastState
                    .thunk
                    .andThen(
                        x -> {
                          // Add the extra edges back in, if they don't break the tree-ness of
                          // the extraction
                          for (IndexedWord newTreeRoot : x.getRoots()) {
                            if (newTreeRoot != null) { // what a strange thing to have happen...
                              for (SemanticGraphEdge extraEdge :
                                  extraEdgesByGovernor.get(newTreeRoot)) {
                                assert Util.isTree(x);
                                //noinspection unchecked
                                addSubtree(
                                    x,
                                    newTreeRoot,
                                    extraEdge.getRelation().toString(),
                                    tree,
                                    extraEdge.getDependent(),
                                    tree.getIncomingEdgesSorted(newTreeRoot));
                                assert Util.isTree(x);
                              }
                            }
                          }
                        })
                    .accept(copy);
                return new SentenceFragment(copy, assumedTruth, false);
              }))) {
        break;
      }
    }

    // Find relevant auxiliary terms
    SemanticGraphEdge subjOrNull = null;
    SemanticGraphEdge objOrNull = null;
    for (SemanticGraphEdge auxEdge : tree.outgoingEdgeIterable(rootWord)) {
      String relString = auxEdge.getRelation().toString();
      if (relString.contains("obj")) {
        objOrNull = auxEdge;
      } else if (relString.contains("subj")) {
        subjOrNull = auxEdge;
      }
    }

    // Iterate over children
    // For each outgoing edge...
    for (SemanticGraphEdge outgoingEdge : tree.outgoingEdgeIterable(rootWord)) {
      // Prohibit indirect speech verbs from splitting off clauses
      // (e.g., 'said', 'think')
      // This fires if the governor is an indirect speech verb, and the outgoing edge is a ccomp
      if (outgoingEdge.getRelation().toString().equals("ccomp")
          && ((outgoingEdge.getGovernor().lemma() != null
                  && INDIRECT_SPEECH_LEMMAS.contains(outgoingEdge.getGovernor().lemma()))
              || INDIRECT_SPEECH_LEMMAS.contains(outgoingEdge.getGovernor().word()))) {
        continue;
      }
      // Get some variables
      String outgoingEdgeRelation = outgoingEdge.getRelation().toString();
      List<String> forcedArcOrder = hardCodedSplits.get(outgoingEdgeRelation);
      if (forcedArcOrder == null && outgoingEdgeRelation.contains(":")) {
        forcedArcOrder =
            hardCodedSplits.get(
                outgoingEdgeRelation.substring(0, outgoingEdgeRelation.indexOf(":")) + ":*");
      }
      boolean doneForcedArc = false;
      // For each action...
      for (Action action :
          (forcedArcOrder == null ? actionSpace : orderActions(actionSpace, forcedArcOrder))) {
        // Check the prerequisite
        if (!action.prerequisitesMet(tree, outgoingEdge)) {
          continue;
        }
        if (forcedArcOrder != null && doneForcedArc) {
          break;
        }
        // 1. Compute the child state
        Optional<State> candidate =
            action.applyTo(tree, lastState, outgoingEdge, subjOrNull, objOrNull);
        if (candidate.isPresent()) {
          double logProbability;
          ClauseClassifierLabel bestLabel;
          Counter<String> features =
              featurizer.apply(Triple.makeTriple(lastState, action, candidate.get()));
          if (forcedArcOrder != null && !doneForcedArc) {
            logProbability = 0.0;
            bestLabel = ClauseClassifierLabel.CLAUSE_SPLIT;
            doneForcedArc = true;
          } else if (features.containsKey("__undocumented_junit_no_classifier")) {
            logProbability = Double.NEGATIVE_INFINITY;
            bestLabel = ClauseClassifierLabel.CLAUSE_INTERM;
          } else {
            Counter<ClauseClassifierLabel> scores = classifier.scoresOf(new RVFDatum<>(features));
            if (scores.size() > 0) {
              Counters.logNormalizeInPlace(scores);
            }
            String rel = outgoingEdge.getRelation().toString();
            if ("nsubj".equals(rel) || "dobj".equals(rel)) {
              // Always at least yield on nsubj and dobj
              scores.remove(ClauseClassifierLabel.NOT_A_CLAUSE);
            }
            logProbability = Counters.max(scores, Double.NEGATIVE_INFINITY);
            bestLabel = Counters.argmax(scores, (x, y) -> 0, ClauseClassifierLabel.CLAUSE_SPLIT);
          }

          if (bestLabel != ClauseClassifierLabel.NOT_A_CLAUSE) {
            Pair<State, List<Counter<String>>> childState =
                Pair.makePair(
                    candidate.get().withIsDone(bestLabel),
                    new ArrayList<Counter<String>>(featuresSoFar) {
                      {
                        add(features);
                      }
                    });
            // 2. Register the child state
            if (!seenWords.contains(childState.first.edge.getDependent())) {
              // System.err.println("  pushing " + action.signature() + " with "
              //     + argmax.first.edge);
              fringe.add(childState, logProbability);
            }
          }
        }
      }
    }

    seenWords.add(rootWord);
  }
  // System.err.println("Search finished in " + ticks + " ticks and " + classifierEvals
  //     + " classifier evaluations.");
}
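// Callback sketch (illustrative): a candidateFragments predicate that keeps every
// fragment whose log probability clears an assumed threshold and always continues
// the search. The threshold and the collecting list are assumptions for illustration:
//
//   List<SentenceFragment> fragments = new ArrayList<>();
//   Predicate<Triple<Double, List<Counter<String>>, Supplier<SentenceFragment>>> callback =
//       triple -> {
//         if (triple.first > Math.log(0.5)) {
//           fragments.add(triple.third.get());  // materialize the lazy fragment
//         }
//         return true;  // returning false would stop the search
//       };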
/**
 * A helper to add an entire subtree to a given dependency tree.
 *
 * @param toModify The tree to add the subtree to.
 * @param root The root of the tree where we should be adding the subtree.
 * @param rel The relation to add the subtree with.
 * @param originalTree The original tree (i.e., {@link ClauseSplitterSearchProblem#tree}).
 * @param subject The root of the clause to add.
 * @param ignoredEdges The edges to ignore adding when adding this subtree.
 */
private static void addSubtree(
    SemanticGraph toModify,
    IndexedWord root,
    String rel,
    SemanticGraph originalTree,
    IndexedWord subject,
    Collection<SemanticGraphEdge> ignoredEdges) {
  if (toModify.containsVertex(subject)) {
    return; // This subtree already exists.
  }
  Queue<IndexedWord> fringe = new LinkedList<>();
  Collection<IndexedWord> wordsToAdd = new ArrayList<>();
  Collection<SemanticGraphEdge> edgesToAdd = new ArrayList<>();
  // Search for subtree to add
  for (SemanticGraphEdge edge : originalTree.outgoingEdgeIterable(subject)) {
    if (!ignoredEdges.contains(edge)) {
      if (toModify.containsVertex(edge.getDependent())) {
        // Case: we're adding a subtree that's not disjoint from toModify. This is bad news.
        return;
      }
      edgesToAdd.add(edge);
      fringe.add(edge.getDependent());
    }
  }
  while (!fringe.isEmpty()) {
    IndexedWord node = fringe.poll();
    wordsToAdd.add(node);
    for (SemanticGraphEdge edge : originalTree.outgoingEdgeIterable(node)) {
      if (!ignoredEdges.contains(edge)) {
        if (toModify.containsVertex(edge.getDependent())) {
          // Case: we're adding a subtree that's not disjoint from toModify. This is bad news.
          return;
        }
        edgesToAdd.add(edge);
        fringe.add(edge.getDependent());
      }
    }
  }
  // Add subtree
  // (add subject)
  toModify.addVertex(subject);
  toModify.addEdge(
      root,
      subject,
      GrammaticalRelation.valueOf(Language.English, rel),
      Double.NEGATIVE_INFINITY,
      false);
  // (add nodes)
  wordsToAdd.forEach(toModify::addVertex);
  // (add edges)
  for (SemanticGraphEdge edge : edgesToAdd) {
    assert !toModify.incomingEdgeIterator(edge.getDependent()).hasNext();
    toModify.addEdge(
        edge.getGovernor(),
        edge.getDependent(),
        edge.getRelation(),
        edge.getWeight(),
        edge.isExtra());
  }
}
@Override
public Counter<String> apply(Triple<State, Action, State> triple) {
  // Variables
  State from = triple.first;
  Action action = triple.second;
  State to = triple.third;
  String signature = action.signature();
  String edgeRelTaken = to.edge == null ? "root" : to.edge.getRelation().toString();
  String edgeRelShort = to.edge == null ? "root" : to.edge.getRelation().getShortName();
  if (edgeRelShort.contains("_")) {
    edgeRelShort = edgeRelShort.substring(0, edgeRelShort.indexOf("_"));
  }

  // -- Featurize --
  // Variables to aggregate
  boolean parentHasSubj = false;
  boolean parentHasObj = false;
  boolean childHasSubj = false;
  boolean childHasObj = false;
  Counter<String> feats = new ClassicCounter<>();

  // 1. Edge taken
  feats.incrementCount(signature + "&edge:" + edgeRelTaken);
  feats.incrementCount(signature + "&edge_type:" + edgeRelShort);

  // 2. Last edge taken
  if (from.edge == null) {
    assert to.edge == null || to.originalTree().getRoots().contains(to.edge.getGovernor());
    feats.incrementCount(signature + "&at_root");
    feats.incrementCount(signature + "&at_root&root_pos:" + to.originalTree().getFirstRoot().tag());
  } else {
    feats.incrementCount(signature + "&not_root");
    String lastRelShort = from.edge.getRelation().getShortName();
    if (lastRelShort.contains("_")) {
      lastRelShort = lastRelShort.substring(0, lastRelShort.indexOf("_"));
    }
    feats.incrementCount(signature + "&last_edge:" + lastRelShort);
  }

  if (to.edge != null) {
    // 3. Other edges at parent
    for (SemanticGraphEdge parentNeighbor :
        from.originalTree().outgoingEdgeIterable(to.edge.getGovernor())) {
      if (parentNeighbor != to.edge) {
        String parentNeighborRel = parentNeighbor.getRelation().toString();
        if (parentNeighborRel.contains("subj")) {
          parentHasSubj = true;
        }
        if (parentNeighborRel.contains("obj")) {
          parentHasObj = true;
        }
        // (add feature)
        feats.incrementCount(signature + "&parent_neighbor:" + parentNeighborRel);
        feats.incrementCount(
            signature + "&edge_type:" + edgeRelShort + "&parent_neighbor:" + parentNeighborRel);
      }
    }

    // 4. Other edges at child
    int childNeighborCount = 0;
    for (SemanticGraphEdge childNeighbor :
        from.originalTree().outgoingEdgeIterable(to.edge.getDependent())) {
      String childNeighborRel = childNeighbor.getRelation().toString();
      if (childNeighborRel.contains("subj")) {
        childHasSubj = true;
      }
      if (childNeighborRel.contains("obj")) {
        childHasObj = true;
      }
      childNeighborCount += 1;
      // (add feature)
      feats.incrementCount(signature + "&child_neighbor:" + childNeighborRel);
      feats.incrementCount(
          signature + "&edge_type:" + edgeRelShort + "&child_neighbor:" + childNeighborRel);
    }
    // 4.1 Number of other edges at child
    feats.incrementCount(
        signature + "&child_neighbor_count:" + (childNeighborCount < 3 ? childNeighborCount : ">2"));
    feats.incrementCount(
        signature
            + "&edge_type:"
            + edgeRelShort
            + "&child_neighbor_count:"
            + (childNeighborCount < 3 ? childNeighborCount : ">2"));

    // 5. Subject/object stats
    feats.incrementCount(signature + "&parent_neighbor_subj:" + parentHasSubj);
    feats.incrementCount(signature + "&parent_neighbor_obj:" + parentHasObj);
    feats.incrementCount(signature + "&child_neighbor_subj:" + childHasSubj);
    feats.incrementCount(signature + "&child_neighbor_obj:" + childHasObj);

    // 6. POS tag info
    feats.incrementCount(signature + "&parent_pos:" + to.edge.getGovernor().tag());
    feats.incrementCount(signature + "&child_pos:" + to.edge.getDependent().tag());
    feats.incrementCount(
        signature
            + "&pos_signature:"
            + to.edge.getGovernor().tag()
            + "_"
            + to.edge.getDependent().tag());
    feats.incrementCount(
        signature
            + "&edge_type:"
            + edgeRelShort
            + "&pos_signature:"
            + to.edge.getGovernor().tag()
            + "_"
            + to.edge.getDependent().tag());
  }

  return feats;
}
public static DependencyParse parse(String text) {
  if (pipeline == null) {
    loadModels();
  }
  DependencyParse parse = new DependencyParse();
  Annotation document = new Annotation(text);
  pipeline.annotate(document);
  List<CoreMap> sentences = document.get(SentencesAnnotation.class);
  for (CoreMap sentence : sentences) {
    SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
    IndexedWord root = dependencies.getFirstRoot();
    parse.setHeadNode(root.index());
    List<SemanticGraphEdge> edges = dependencies.edgeListSorted();
    // System.out.println(edges);
    for (SemanticGraphEdge t : edges) {
      String dep = t.getDependent().originalText();
      int depIndex = t.getDependent().index();
      String depPOS = t.getDependent().tag();
      int depStart = t.getDependent().beginPosition();
      int depEnd = t.getDependent().endPosition();

      String gov = t.getGovernor().originalText();
      int govIndex = t.getGovernor().index();
      String govPOS = t.getGovernor().tag();
      int govStart = t.getGovernor().beginPosition();
      int govEnd = t.getGovernor().endPosition();

      parse.addNode(govIndex, gov, govPOS, govStart, govEnd);
      parse.addNode(depIndex, dep, depPOS, depStart, depEnd);
      parse.addEdge(depIndex, govIndex, t.getRelation().getShortName());
    }
  }
  return parse;
}
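// Usage sketch (the input sentence is an assumption for illustration):
//
//   DependencyParse parse = parse("The cat sat on the mat.");
//   // `parse` now holds one node per token (word, POS, character offsets) and one
//   // edge per dependency, labeled with the short relation name (e.g., nsubj).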
/**
 * Fix some bizarre peculiarities with certain trees. So far, these include:
 *
 * <ul>
 *   <li>Sometimes there's an edge from a word to itself. This seems wrong.
 * </ul>
 *
 * @param tree The tree to clean (in place!).
 * @return A list of extra edges, which are valid but were removed.
 */
public static List<SemanticGraphEdge> cleanTree(SemanticGraph tree) {
  // assert !isCyclic(tree);

  // Clean nodes
  List<IndexedWord> toDelete = new ArrayList<>();
  for (IndexedWord vertex : tree.vertexSet()) {
    // Clean punctuation
    if (vertex.tag() == null) {
      continue;
    }
    char tag = vertex.backingLabel().tag().charAt(0);
    if (tag == '.' || tag == ',' || tag == '(' || tag == ')' || tag == ':') {
      if (!tree.outgoingEdgeIterator(vertex).hasNext()) {
        // This should really never happen, but it does.
        toDelete.add(vertex);
      }
    }
  }
  toDelete.forEach(tree::removeVertex);

  // Clean edges
  Iterator<SemanticGraphEdge> iter = tree.edgeIterable().iterator();
  while (iter.hasNext()) {
    SemanticGraphEdge edge = iter.next();
    if (edge.getDependent().index() == edge.getGovernor().index()) {
      // Clean self-edges
      iter.remove();
    } else if (edge.getRelation().toString().equals("punct")) {
      // Clean punctuation (again)
      if (!tree.outgoingEdgeIterator(edge.getDependent()).hasNext()) {
        // This should really never happen, but it does.
        iter.remove();
      }
    }
  }

  // Remove extra edges
  List<SemanticGraphEdge> extraEdges = new ArrayList<>();
  for (SemanticGraphEdge edge : tree.edgeIterable()) {
    if (edge.isExtra()) {
      if (tree.incomingEdgeList(edge.getDependent()).size() > 1) {
        extraEdges.add(edge);
      }
    }
  }
  extraEdges.forEach(tree::removeEdge);

  // Add apposition edges (simple coref)
  // note[gabor]: iterate over a copy to prevent a ConcurrentModificationException
  for (SemanticGraphEdge extraEdge : new ArrayList<>(extraEdges)) {
    for (SemanticGraphEdge candidateAppos : tree.incomingEdgeIterable(extraEdge.getDependent())) {
      if (candidateAppos.getRelation().toString().equals("appos")) {
        extraEdges.add(
            new SemanticGraphEdge(
                extraEdge.getGovernor(),
                candidateAppos.getGovernor(),
                extraEdge.getRelation(),
                extraEdge.getWeight(),
                extraEdge.isExtra()));
      }
    }
    for (SemanticGraphEdge candidateAppos : tree.outgoingEdgeIterable(extraEdge.getDependent())) {
      if (candidateAppos.getRelation().toString().equals("appos")) {
        extraEdges.add(
            new SemanticGraphEdge(
                extraEdge.getGovernor(),
                candidateAppos.getDependent(),
                extraEdge.getRelation(),
                extraEdge.getWeight(),
                extraEdge.isExtra()));
      }
    }
  }

  // Brute-force ensure tree
  // Remove incoming edges from roots
  List<SemanticGraphEdge> rootIncomingEdges = new ArrayList<>();
  for (IndexedWord root : tree.getRoots()) {
    for (SemanticGraphEdge incomingEdge : tree.incomingEdgeIterable(root)) {
      rootIncomingEdges.add(incomingEdge);
    }
  }
  rootIncomingEdges.forEach(tree::removeEdge);
  // Loop until it becomes a tree.
  boolean changed = true;
  while (changed) { // I just want trees to be trees; is that so much to ask!?
    changed = false;
    List<IndexedWord> danglingNodes = new ArrayList<>();
    List<SemanticGraphEdge> invalidEdges = new ArrayList<>();

    for (IndexedWord vertex : tree.vertexSet()) {
      // Collect statistics
      Iterator<SemanticGraphEdge> incomingIter = tree.incomingEdgeIterator(vertex);
      boolean hasIncoming = incomingIter.hasNext();
      boolean hasMultipleIncoming = false;
      if (hasIncoming) {
        incomingIter.next();
        hasMultipleIncoming = incomingIter.hasNext();
      }

      // Register actions
      if (!hasIncoming && !tree.getRoots().contains(vertex)) {
        danglingNodes.add(vertex);
      } else if (hasMultipleIncoming) {
        for (SemanticGraphEdge edge : new IterableIterator<>(incomingIter)) {
          invalidEdges.add(edge);
        }
      }
    }

    // Perform actions
    for (IndexedWord vertex : danglingNodes) {
      tree.removeVertex(vertex);
      changed = true;
    }
    for (SemanticGraphEdge edge : invalidEdges) {
      tree.removeEdge(edge);
      changed = true;
    }
  }

  // Return
  assert isTree(tree);
  return extraEdges;
}
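// Usage sketch (illustrative): cleanTree is called on a defensive copy of the parse,
// and the returned extra edges are indexed for later re-attachment, mirroring the
// ClauseSplitterSearchProblem constructor above:
//
//   List<SemanticGraphEdge> extraEdges = cleanTree(treeCopy);
//   assert isTree(treeCopy);
//   // extraEdges are valid but were removed to make `treeCopy` a tree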