List<Tree> prune(List<Tree> treeList, Label label, int start, int end) {
  // get reference tree
  if (treeList.size() == 1) {
    return treeList;
  }
  Tree testTree = treeList.get(0).treeFactory().newTreeNode(label, treeList);
  int goal = Numberer.getGlobalNumberer("states").number(label.value());
  Tree tempTree = parser.extractBestParse(goal, start, end);
  // parser.restoreUnaries(tempTree);
  Tree pcfgTree = debinarizer.transformTree(tempTree);
  Set<Constituent> pcfgConstituents =
      pcfgTree.constituents(new LabeledScoredConstituentFactory());
  // delete child labels that are not in reference but do not cross reference
  List<Tree> prunedChildren = new ArrayList<Tree>();
  int childStart = 0;
  for (int c = 0, numCh = testTree.numChildren(); c < numCh; c++) {
    Tree child = testTree.getChild(c);
    boolean isExtra = true;
    int childEnd = childStart + child.yield().size();
    Constituent childConstituent =
        new LabeledScoredConstituent(childStart, childEnd, child.label(), 0);
    if (pcfgConstituents.contains(childConstituent)) {
      isExtra = false;
    }
    if (childConstituent.crosses(pcfgConstituents)) {
      isExtra = false;
    }
    if (child.isLeaf() || child.isPreTerminal()) {
      isExtra = false;
    }
    if (pcfgTree.yield().size() != testTree.yield().size()) {
      isExtra = false;
    }
    if (!label.value().startsWith("NP^NP")) {
      isExtra = false;
    }
    if (isExtra) {
      System.err.println("Pruning: " + child.label()
          + " from " + (childStart + start) + " to " + (childEnd + start));
      System.err.println("Was: " + testTree + " vs " + pcfgTree);
      prunedChildren.addAll(child.getChildrenAsList());
    } else {
      prunedChildren.add(child);
    }
    childStart = childEnd;
  }
  return prunedChildren;
}
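/**
 * A minimal sketch (hypothetical helper, not part of the original source) of the
 * "extra child" test applied above: a non-terminal child of an NP^NP node is
 * dissolved (replaced by its own children) only when the PCFG parse covers the
 * same yield and the child is neither one of the PCFG parse's constituents nor
 * crosses any of them.
 */
private static boolean isExtraChild(Set<Constituent> pcfgConstituents,
                                    Constituent childConstituent,
                                    Tree child, Label parentLabel,
                                    boolean yieldsMatch) {
  return !pcfgConstituents.contains(childConstituent)  // not in the reference parse
      && !childConstituent.crosses(pcfgConstituents)   // but also not crossing it
      && !child.isLeaf()
      && !child.isPreTerminal()
      && yieldsMatch                                   // both trees span the same yield
      && parentLabel.value().startsWith("NP^NP");
}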
/**
 * Reads a list of Entries from a mapping file and updates the given entries.
 * Line numbers start from 1.
 *
 * @return the updated list of Entries
 */
private static List<Entry> readEntries(
    String annotatorName,
    List<Entry> entries,
    TrieMap<String, Entry> seenRegexes,
    String mappingFilename,
    BufferedReader mapping,
    Set<String> noDefaultOverwriteLabels,
    boolean ignoreCase,
    boolean verbose)
    throws IOException {
  int origEntriesSize = entries.size();
  int isTokensRegex = 0;
  int lineCount = 0;
  for (String line; (line = mapping.readLine()) != null; ) {
    lineCount++;
    String[] split = line.split("\t");
    if (split.length < 2 || split.length > 5) {
      throw new IllegalArgumentException(
          "Provided mapping file is in wrong format. This line is bad: " + line);
    }
    String regex = split[0].trim();
    String tokensRegex = null;
    String[] regexes = null;
    if (regex.startsWith("( ") && regex.endsWith(" )")) {
      // TokensRegex pattern (remove start and end parentheses)
      tokensRegex = regex.substring(1, regex.length() - 1).trim();
    } else {
      regexes = regex.split("\\s+");
    }
    String[] key = (regexes != null) ? regexes : new String[] {tokensRegex};
    if (ignoreCase) {
      String[] norm = new String[key.length];
      for (int i = 0; i < key.length; i++) {
        norm[i] = key[i].toLowerCase();
      }
      key = norm;
    }
    String type = split[1].trim();

    Set<String> overwritableTypes = Generics.newHashSet();
    double priority = 0.0;
    if (split.length >= 3) {
      overwritableTypes.addAll(Arrays.asList(split[2].trim().split("\\s*,\\s*")));
    }
    if (split.length >= 4) {
      try {
        priority = Double.parseDouble(split[3].trim());
      } catch (NumberFormatException e) {
        throw new IllegalArgumentException(
            "ERROR: Invalid priority in line " + lineCount
                + " in regexner file " + mappingFilename + ": \"" + line + "\"!",
            e);
      }
    }

    int annotateGroup = 0;
    // Get annotate group from input....
    if (split.length >= 5) {
      // Which group to take (allow for context)
      String context = split[4].trim();
      try {
        annotateGroup = Integer.parseInt(context);
      } catch (NumberFormatException e) {
        throw new IllegalArgumentException(
            "ERROR: Invalid group in line " + lineCount
                + " in regexner file " + mappingFilename + ": \"" + line + "\"!",
            e);
      }
    }

    // Warn if the type field has multiple comma-separated types
    int commaPos = type.indexOf(',');
    if (commaPos > 0) {
      // Strip the "," and just take the first type
      String newType = type.substring(0, commaPos).trim();
      logger.warn("TokensRegexNERAnnotator " + annotatorName
          + ": Entry has multiple types: " + line
          + ". Taking type to be " + newType);
      type = newType;
    }

    Entry entry =
        new Entry(tokensRegex, regexes, type, overwritableTypes, priority, annotateGroup);

    if (seenRegexes.containsKey(key)) {
      Entry oldEntry = seenRegexes.get(key);
      if (priority > oldEntry.priority) {
        logger.warn("TokensRegexNERAnnotator " + annotatorName
            + ": Replace duplicate entry (higher priority): old=" + oldEntry
            + ", new=" + entry);
      } else {
        if (!oldEntry.type.equals(type)) {
          if (verbose) {
            logger.warn("TokensRegexNERAnnotator " + annotatorName
                + ": Ignoring duplicate entry: " + split[0]
                + ", old type = " + oldEntry.type + ", new type = " + type);
          }
        }
        continue;
      }
    }

    // Warn if the label belongs to noDefaultOverwriteLabels but there are no
    // overwritable types
    if (entry.overwritableTypes.isEmpty() && noDefaultOverwriteLabels.contains(entry.type)) {
      logger.warn("TokensRegexNERAnnotator " + annotatorName
          + ": Entry doesn't have overwriteable types " + entry
          + ", but entry type is in noDefaultOverwriteLabels");
    }

    entries.add(entry);
    seenRegexes.put(key, entry);
    if (entry.tokensRegex != null) {
      isTokensRegex++;
    }
  }

  logger.log("TokensRegexNERAnnotator " + annotatorName
      + ": Read " + (entries.size() - origEntriesSize)
      + " unique entries out of " + lineCount + " from " + mappingFilename
      + ", " + isTokensRegex + " TokensRegex patterns.");
  return entries;
}
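/**
 * A minimal usage sketch (hypothetical helper, not part of the original class; assumes
 * java.nio.file.{Files, Paths} and java.util.Collections are imported). Each mapping
 * line is tab-separated:
 *
 *   regex TAB type [TAB overwritableTypes [TAB priority [TAB group]]]
 *
 * e.g. "Albert Einstein\tPERSON\tMISC\t2.0", or a TokensRegex pattern such as
 * "( /University/ /of/ /.*／ )\tORGANIZATION" (written with parentheses and spaces).
 */
private static List<Entry> readEntriesFromFile(String mappingFilename) throws IOException {
  try (BufferedReader reader = Files.newBufferedReader(Paths.get(mappingFilename))) {
    return readEntries(
        "example",                      // annotatorName: used only in log messages
        new ArrayList<Entry>(),         // entries: the list to update
        new TrieMap<String, Entry>(),   // seenRegexes: dedups repeated patterns
        mappingFilename,
        reader,
        Collections.<String>emptySet(), // noDefaultOverwriteLabels
        false,                          // ignoreCase
        false);                         // verbose
  }
}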
/**
 * The core implementation of the search.
 *
 * @param root The root word to search from. Traditionally, this is the root of the sentence.
 * @param candidateFragments The callback for the resulting sentence fragments. This is a
 *     predicate of a triple of values. The return value of the predicate determines whether
 *     we should continue searching. The triple consists of
 *     <ol>
 *       <li>The log probability of the sentence fragment, according to the featurizer and
 *           the weights
 *       <li>The features along the path to this fragment. The last element of this is the
 *           features from the most recent step.
 *       <li>The sentence fragment. Because it is relatively expensive to compute the
 *           resulting tree, this is returned as a lazy {@link Supplier}.
 *     </ol>
 *
 * @param classifier The classifier for whether an arc should be on the path to a clause
 *     split, a clause split itself, or neither.
 * @param featurizer The featurizer to use. Make sure this matches the weights!
 * @param actionSpace The action space we are allowed to take. Each action defines a means
 *     of splitting a clause on a dependency boundary.
 */
protected void search(
    // The root to search from
    IndexedWord root,
    // The output specs
    final Predicate<Triple<Double, List<Counter<String>>, Supplier<SentenceFragment>>>
        candidateFragments,
    // The learning specs
    final Classifier<ClauseSplitter.ClauseClassifierLabel, String> classifier,
    Map<String, ? extends List<String>> hardCodedSplits,
    final Function<Triple<State, Action, State>, Counter<String>> featurizer,
    final Collection<Action> actionSpace,
    final int maxTicks) {
  // (the fringe)
  PriorityQueue<Pair<State, List<Counter<String>>>> fringe =
      new FixedPrioritiesPriorityQueue<>();
  // (avoid duplicate work)
  Set<IndexedWord> seenWords = new HashSet<>();

  State firstState =
      new State(null, null, -9000, null, x -> {}, true); // First state is implicitly "done"
  fringe.add(Pair.makePair(firstState, new ArrayList<>(0)), -0.0);
  int ticks = 0;

  while (!fringe.isEmpty()) {
    if (++ticks > maxTicks) {
      // System.err.println("WARNING! Timed out on search with " + ticks + " ticks");
      return;
    }
    // Useful variables
    double logProbSoFar = fringe.getPriority();
    assert logProbSoFar <= 0.0;
    Pair<State, List<Counter<String>>> lastStatePair = fringe.removeFirst();
    State lastState = lastStatePair.first;
    List<Counter<String>> featuresSoFar = lastStatePair.second;
    IndexedWord rootWord = lastState.edge == null ? root : lastState.edge.getDependent();

    // Register thunk
    if (lastState.isDone) {
      if (!candidateFragments.test(
          Triple.makeTriple(
              logProbSoFar,
              featuresSoFar,
              () -> {
                SemanticGraph copy = new SemanticGraph(tree);
                lastState
                    .thunk
                    .andThen(
                        x -> {
                          // Add the extra edges back in, if they don't break the tree-ness
                          // of the extraction
                          for (IndexedWord newTreeRoot : x.getRoots()) {
                            if (newTreeRoot != null) { // what a strange thing to have happen...
                              for (SemanticGraphEdge extraEdge :
                                  extraEdgesByGovernor.get(newTreeRoot)) {
                                assert Util.isTree(x);
                                //noinspection unchecked
                                addSubtree(
                                    x,
                                    newTreeRoot,
                                    extraEdge.getRelation().toString(),
                                    tree,
                                    extraEdge.getDependent(),
                                    tree.getIncomingEdgesSorted(newTreeRoot));
                                assert Util.isTree(x);
                              }
                            }
                          }
                        })
                    .accept(copy);
                return new SentenceFragment(copy, assumedTruth, false);
              }))) {
        break;
      }
    }

    // Find relevant auxiliary terms
    SemanticGraphEdge subjOrNull = null;
    SemanticGraphEdge objOrNull = null;
    for (SemanticGraphEdge auxEdge : tree.outgoingEdgeIterable(rootWord)) {
      String relString = auxEdge.getRelation().toString();
      if (relString.contains("obj")) {
        objOrNull = auxEdge;
      } else if (relString.contains("subj")) {
        subjOrNull = auxEdge;
      }
    }

    // Iterate over children
    // For each outgoing edge...
    for (SemanticGraphEdge outgoingEdge : tree.outgoingEdgeIterable(rootWord)) {
      // Prohibit indirect speech verbs from splitting off clauses
      // (e.g., 'said', 'think')
      // This fires if the governor is an indirect speech verb, and the outgoing edge is
      // a ccomp
      if (outgoingEdge.getRelation().toString().equals("ccomp")
          && ((outgoingEdge.getGovernor().lemma() != null
                  && INDIRECT_SPEECH_LEMMAS.contains(outgoingEdge.getGovernor().lemma()))
              || INDIRECT_SPEECH_LEMMAS.contains(outgoingEdge.getGovernor().word()))) {
        continue;
      }
      // Get some variables
      String outgoingEdgeRelation = outgoingEdge.getRelation().toString();
      List<String> forcedArcOrder = hardCodedSplits.get(outgoingEdgeRelation);
      if (forcedArcOrder == null && outgoingEdgeRelation.contains(":")) {
        forcedArcOrder =
            hardCodedSplits.get(
                outgoingEdgeRelation.substring(0, outgoingEdgeRelation.indexOf(":")) + ":*");
      }
      boolean doneForcedArc = false;
      // For each action...
      for (Action action :
          (forcedArcOrder == null ? actionSpace : orderActions(actionSpace, forcedArcOrder))) {
        // Check the prerequisite
        if (!action.prerequisitesMet(tree, outgoingEdge)) {
          continue;
        }
        if (forcedArcOrder != null && doneForcedArc) {
          break;
        }
        // 1. Compute the child state
        Optional<State> candidate =
            action.applyTo(tree, lastState, outgoingEdge, subjOrNull, objOrNull);
        if (candidate.isPresent()) {
          double logProbability;
          ClauseClassifierLabel bestLabel;
          Counter<String> features =
              featurizer.apply(Triple.makeTriple(lastState, action, candidate.get()));
          if (forcedArcOrder != null && !doneForcedArc) {
            logProbability = 0.0;
            bestLabel = ClauseClassifierLabel.CLAUSE_SPLIT;
            doneForcedArc = true;
          } else if (features.containsKey("__undocumented_junit_no_classifier")) {
            logProbability = Double.NEGATIVE_INFINITY;
            bestLabel = ClauseClassifierLabel.CLAUSE_INTERM;
          } else {
            Counter<ClauseClassifierLabel> scores =
                classifier.scoresOf(new RVFDatum<>(features));
            if (scores.size() > 0) {
              Counters.logNormalizeInPlace(scores);
            }
            String rel = outgoingEdge.getRelation().toString();
            if ("nsubj".equals(rel) || "dobj".equals(rel)) {
              // Always at least yield on nsubj and dobj
              scores.remove(ClauseClassifierLabel.NOT_A_CLAUSE);
            }
            logProbability = Counters.max(scores, Double.NEGATIVE_INFINITY);
            bestLabel =
                Counters.argmax(scores, (x, y) -> 0, ClauseClassifierLabel.CLAUSE_SPLIT);
          }
          if (bestLabel != ClauseClassifierLabel.NOT_A_CLAUSE) {
            Pair<State, List<Counter<String>>> childState =
                Pair.makePair(
                    candidate.get().withIsDone(bestLabel),
                    new ArrayList<Counter<String>>(featuresSoFar) {
                      {
                        add(features);
                      }
                    });
            // 2. Register the child state
            if (!seenWords.contains(childState.first.edge.getDependent())) {
              // System.err.println("  pushing " + action.signature() + " with "
              //     + argmax.first.edge);
              fringe.add(childState, logProbability);
            }
          }
        }
      }
    }
    seenWords.add(rootWord);
  }
  // System.err.println("Search finished in " + ticks + " ticks and " + classifierEvals
  //     + " classifier evaluations.");
}
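/**
 * A minimal sketch (hypothetical helper, not part of the original class) showing how
 * the candidateFragments callback is typically driven: collect every finished fragment
 * whose probability clears a threshold, materializing the lazy supplier only for
 * fragments we keep. The tick budget of 1000 is an illustrative value.
 */
protected List<SentenceFragment> searchAboveThreshold(
    IndexedWord root,
    Classifier<ClauseSplitter.ClauseClassifierLabel, String> classifier,
    Map<String, ? extends List<String>> hardCodedSplits,
    Function<Triple<State, Action, State>, Counter<String>> featurizer,
    Collection<Action> actionSpace,
    double threshold) {
  List<SentenceFragment> fragments = new ArrayList<>();
  search(
      root,
      triple -> {
        if (Math.exp(triple.first) >= threshold) {
          fragments.add(triple.third.get()); // materialize the lazy fragment
        }
        return true; // keep searching until the fringe (or tick budget) is exhausted
      },
      classifier,
      hardCodedSplits,
      featurizer,
      actionSpace,
      1000);
  return fragments;
}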
private boolean checkOrigNerTags(Entry entry, List<CoreLabel> tokens, int start, int end) {
  int prevNerEndIndex = start - 1;
  int nextNerStartIndex = end;

  // Check if we found a pattern that overlaps with existing ner labels
  //   tag1 tag1 x   x   tag2 tag2
  //        tag  tag tag tag
  // Don't overwrite the old ner label if we overlap like this
  String startNer = tokens.get(start).ner();
  String endNer = tokens.get(end - 1).ner();
  if (startNer != null && !myLabels.contains(startNer)) {
    while (prevNerEndIndex >= 0) {
      // go backwards to find a different entity type
      String ner = tokens.get(prevNerEndIndex).ner();
      if (ner == null || !ner.equals(startNer)) {
        break;
      }
      prevNerEndIndex--;
    }
  }
  if (endNer != null && !myLabels.contains(endNer)) {
    while (nextNerStartIndex < tokens.size()) {
      // go forwards to find a different entity type
      String ner = tokens.get(nextNerStartIndex).ner();
      if (ner == null || !ner.equals(endNer)) {
        break;
      }
      nextNerStartIndex++;
    }
  }

  boolean overwriteOriginalNer = false;
  //noinspection StatementWithEmptyBody
  if (prevNerEndIndex != (start - 1) || nextNerStartIndex != end) {
    // Cutting across already recognized NEs: don't disturb them
  } else if (startNer == null) {
    // No old ner, okay to replace
    overwriteOriginalNer = true;
  } else {
    // Check if we have one consistent NER tag:
    // if not, overwrite;
    // if consistent, overwrite only if in our set of ner tags that we overwrite
    for (int i = start + 1; i < end; i++) {
      if (!startNer.equals(tokens.get(i).ner())) {
        overwriteOriginalNer = true;
        break;
      }
    }
    if (!overwriteOriginalNer) {
      // check if the old ner type was one that was specified as explicitly
      // overwritable by this entry
      if (entry.overwritableTypes.contains(startNer)) {
        overwriteOriginalNer = true;
      } else {
        // if this ner type doesn't belong to the labels for which we don't overwrite
        // the default labels (noDefaultOverwriteLabels),
        // we check myLabels to see if we can overwrite this entry
        if (/*entry.overwritableTypes.isEmpty() || */
            !noDefaultOverwriteLabels.contains(entry.type)) {
          overwriteOriginalNer = myLabels.contains(startNer);
        }
      }
    }
  }
  return overwriteOriginalNer;
}
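/*
 * Worked example (illustrative; assumes "PERSON" is not in myLabels): with token
 * NER tags [PERSON, PERSON, O, LOCATION] and a candidate match over start = 1,
 * end = 3, the backwards walk over the leading PERSON run moves prevNerEndIndex
 * from 0 down to -1, so prevNerEndIndex != start - 1 and the method returns false:
 * the match would clip the existing PERSON entity, so the original tags are kept.
 */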