/** * Create a searcher manually, supplying a dependency tree, an optional classifier for when to * split clauses, and a featurizer for that classifier. You almost certainly want to use {@link * ClauseSplitter#load(String)} instead of this constructor. * * @param tree The dependency tree to search over. * @param assumedTruth The assumed truth of the tree (relevant for natural logic inference). If in * doubt, pass in true. * @param isClauseClassifier The classifier for whether a given dependency arc should be a new * clause. If this is not given, all arcs are treated as clause separators. * @param featurizer The featurizer for the classifier. If no featurizer is given, one should be * given in {@link ClauseSplitterSearchProblem#search(java.util.function.Predicate, * Classifier, Map, java.util.function.Function, int)}, or else the classifier will be * useless. * @see ClauseSplitter#load(String) */ protected ClauseSplitterSearchProblem( SemanticGraph tree, boolean assumedTruth, Optional<Classifier<ClauseSplitter.ClauseClassifierLabel, String>> isClauseClassifier, Optional< Function< Triple< ClauseSplitterSearchProblem.State, ClauseSplitterSearchProblem.Action, ClauseSplitterSearchProblem.State>, Counter<String>>> featurizer) { this.tree = new SemanticGraph(tree); this.assumedTruth = assumedTruth; this.isClauseClassifier = isClauseClassifier; this.featurizer = featurizer; // Index edges this.tree.edgeIterable().forEach(edgeToIndex::addToIndex); // Get length List<IndexedWord> sortedVertices = tree.vertexListSorted(); sentenceLength = sortedVertices.get(sortedVertices.size() - 1).index(); // Register extra edges for (IndexedWord vertex : sortedVertices) { extraEdgesByGovernor.put(vertex, new ArrayList<>()); extraEdgesByDependent.put(vertex, new ArrayList<>()); } List<SemanticGraphEdge> extraEdges = Util.cleanTree(this.tree); assert Util.isTree(this.tree); for (SemanticGraphEdge edge : extraEdges) { extraEdgesByGovernor.get(edge.getGovernor()).add(edge); extraEdgesByDependent.get(edge.getDependent()).add(edge); } }
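// Usage note (hedged sketch): as the Javadoc above says, most callers should obtain a searcher
// through ClauseSplitter.load(String) rather than this constructor. The model path below and the
// BiFunction-style apply(...) call reflect the usual CoreNLP setup, but treat both as assumptions
// to verify against your CoreNLP version.
import edu.stanford.nlp.naturalli.ClauseSplitter;
import edu.stanford.nlp.naturalli.ClauseSplitterSearchProblem;
import edu.stanford.nlp.naturalli.SentenceFragment;
import edu.stanford.nlp.semgraph.SemanticGraph;
import java.io.IOException;
import java.util.List;

class ClauseSplitterSketch {
  static List<SentenceFragment> clausesOf(SemanticGraph dependencyTree) throws IOException {
    // Assumed model path; substitute the serialized model you actually ship.
    ClauseSplitter splitter =
        ClauseSplitter.load("edu/stanford/nlp/models/naturalli/clauseSearcherModel.ser.gz");
    // ClauseSplitter applies as (tree, assumedTruth) -> ClauseSplitterSearchProblem.
    ClauseSplitterSearchProblem problem = splitter.apply(dependencyTree, true);
    return problem.topClauses(0.5); // keep clauses whose probability is at least 0.5
  }
}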
@Override public Set<Requirement> requires() { if (annotators.isEmpty()) { return Collections.emptySet(); } return annotators.get(0).requires(); }
/** * The basic method for splitting off a clause of a tree. This modifies the tree in place. * * @param tree The tree to split a clause from. * @param toKeep The edge representing the clause to keep. */ static void splitToChildOfEdge(SemanticGraph tree, SemanticGraphEdge toKeep) { Queue<IndexedWord> fringe = new LinkedList<>(); List<IndexedWord> nodesToRemove = new ArrayList<>(); // Find nodes to remove // (from the root) for (IndexedWord root : tree.getRoots()) { nodesToRemove.add(root); for (SemanticGraphEdge out : tree.outgoingEdgeIterable(root)) { if (!out.equals(toKeep)) { fringe.add(out.getDependent()); } } } // (recursively) while (!fringe.isEmpty()) { IndexedWord node = fringe.poll(); nodesToRemove.add(node); for (SemanticGraphEdge out : tree.outgoingEdgeIterable(node)) { if (!out.equals(toKeep)) { fringe.add(out.getDependent()); } } } // Remove nodes nodesToRemove.forEach(tree::removeVertex); // Set new root tree.setRoot(toKeep.getDependent()); }
// TODO: roll check into tokens regex pattern? // That allows for better matching because unmatched sequences will be eliminated at match time private boolean checkPosTags(List<CoreLabel> tokens, int start, int end) { if (validPosPattern != null) { // Need to check POS tag too... switch (posMatchType) { case MATCH_ONE_TOKEN_PHRASE_ONLY: if (tokens.size() > 1) return true; // fall through case MATCH_AT_LEAST_ONE_TOKEN: for (int i = start; i < end; i++) { CoreLabel token = tokens.get(i); String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class); if (pos != null && validPosPattern.matcher(pos).matches()) { return true; } } return false; case MATCH_ALL_TOKENS: // Checked elsewhere return true; default: // Don't know this match type... return true; } } return true; }
private List<Tree> helper(List<Tree> treeList, int start) { List<Tree> newTreeList = new ArrayList<Tree>(treeList.size()); for (Tree tree : treeList) { int end = start + tree.yield().size(); newTreeList.add(prune(tree, start)); start = end; } return newTreeList; }
private static List<TaggedWord> cleanTags(List twList, TreebankLanguagePack tlp) { int sz = twList.size(); List<TaggedWord> l = new ArrayList<TaggedWord>(sz); for (int i = 0; i < sz; i++) { TaggedWord tw = (TaggedWord) twList.get(i); TaggedWord tw2 = new TaggedWord(tw.word(), tlp.basicCategory(tw.tag())); l.add(tw2); } return l; }
public AnnotationPipeline(List<Annotator> annotators) { this.annotators = annotators; if (TIME) { int num = annotators.size(); accumulatedTime = new ArrayList<MutableLong>(num); for (int i = 0; i < num; i++) { accumulatedTime.add(new MutableLong()); } } }
List<Tree> prune(List<Tree> treeList, Label label, int start, int end) { // get reference tree if (treeList.size() == 1) { return treeList; } Tree testTree = treeList.get(0).treeFactory().newTreeNode(label, treeList); int goal = Numberer.getGlobalNumberer("states").number(label.value()); Tree tempTree = parser.extractBestParse(goal, start, end); // parser.restoreUnaries(tempTree); Tree pcfgTree = debinarizer.transformTree(tempTree); Set<Constituent> pcfgConstituents = pcfgTree.constituents(new LabeledScoredConstituentFactory()); // delete child labels that are not in reference but do not cross reference List<Tree> prunedChildren = new ArrayList<Tree>(); int childStart = 0; for (int c = 0, numCh = testTree.numChildren(); c < numCh; c++) { Tree child = testTree.getChild(c); boolean isExtra = true; int childEnd = childStart + child.yield().size(); Constituent childConstituent = new LabeledScoredConstituent(childStart, childEnd, child.label(), 0); if (pcfgConstituents.contains(childConstituent)) { isExtra = false; } if (childConstituent.crosses(pcfgConstituents)) { isExtra = false; } if (child.isLeaf() || child.isPreTerminal()) { isExtra = false; } if (pcfgTree.yield().size() != testTree.yield().size()) { isExtra = false; } if (!label.value().startsWith("NP^NP")) { isExtra = false; } if (isExtra) { System.err.println( "Pruning: " + child.label() + " from " + (childStart + start) + " to " + (childEnd + start)); System.err.println("Was: " + testTree + " vs " + pcfgTree); prunedChildren.addAll(child.getChildrenAsList()); } else { prunedChildren.add(child); } childStart = childEnd; } return prunedChildren; }
protected String historyToString(List history) { String str = (String) historyToString.get(history); if (str == null) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < history.size(); i++) { sb.append('^'); sb.append(history.get(i)); } str = sb.toString(); historyToString.put(history, str); } return str; }
private List<CoreMap> toCoreMaps( CoreMap annotation, List<TimeExpression> timeExpressions, SUTime.TimeIndex timeIndex) { if (timeExpressions == null) return null; List<CoreMap> coreMaps = new ArrayList<CoreMap>(timeExpressions.size()); for (TimeExpression te : timeExpressions) { CoreMap cm = te.getAnnotation(); SUTime.Temporal temporal = te.getTemporal(); if (temporal != null) { String origText = annotation.get(CoreAnnotations.TextAnnotation.class); String text = cm.get(CoreAnnotations.TextAnnotation.class); if (origText != null) { // Make sure the text is from original (and not from concatenated tokens) ChunkAnnotationUtils.annotateChunkText(cm, annotation); text = cm.get(CoreAnnotations.TextAnnotation.class); } Map<String, String> timexAttributes; try { timexAttributes = temporal.getTimexAttributes(timeIndex); if (options.includeRange) { SUTime.Temporal rangeTemporal = temporal.getRange(); if (rangeTemporal != null) { timexAttributes.put("range", rangeTemporal.toString()); } } } catch (Exception e) { logger.log( Level.WARNING, "Failed to get attributes from " + text + ", timeIndex " + timeIndex, e); continue; } Timex timex; try { timex = Timex.fromMap(text, timexAttributes); } catch (Exception e) { logger.log( Level.WARNING, "Failed to process " + text + " with attributes " + timexAttributes, e); continue; } cm.set(TimexAnnotation.class, timex); if (timex != null) { coreMaps.add(cm); } else { logger.warning("No timex expression for: " + text); } } } return coreMaps; }
/** * Strips aux and mark edges when we are splitting into a clause. * * @param toModify The tree we are stripping the edges from. */ private void stripAuxMark(SemanticGraph toModify) { List<SemanticGraphEdge> toClean = new ArrayList<>(); for (SemanticGraphEdge edge : toModify.outgoingEdgeIterable(toModify.getFirstRoot())) { String rel = edge.getRelation().toString(); if (("aux".equals(rel) || "mark".equals(rel)) && !toModify.outgoingEdgeIterator(edge.getDependent()).hasNext()) { toClean.add(edge); } } for (SemanticGraphEdge edge : toClean) { toModify.removeEdge(edge); toModify.removeVertex(edge.getDependent()); } }
public Object formResult() { Set brs = new HashSet(); Set urs = new HashSet(); // scan each rule / history pair int ruleCount = 0; for (Iterator pairI = rulePairs.keySet().iterator(); pairI.hasNext(); ) { if (ruleCount % 100 == 0) { System.err.println("Rules multiplied: " + ruleCount); } ruleCount++; Pair rulePair = (Pair) pairI.next(); Rule baseRule = (Rule) rulePair.first; String baseLabel = (String) ruleToLabel.get(baseRule); List history = (List) rulePair.second; double totalProb = 0; for (int depth = 1; depth <= HISTORY_DEPTH() && depth <= history.size(); depth++) { List subHistory = history.subList(0, depth); double c_label = labelPairs.getCount(new Pair(baseLabel, subHistory)); double c_rule = rulePairs.getCount(new Pair(baseRule, subHistory)); // System.out.println("Multiplying out "+baseRule+" with history "+subHistory); // System.out.println("Count of "+baseLabel+" with "+subHistory+" is "+c_label); // System.out.println("Count of "+baseRule+" with "+subHistory+" is "+c_rule ); double prob = (1.0 / HISTORY_DEPTH()) * (c_rule) / (c_label); totalProb += prob; for (int childDepth = 0; childDepth <= Math.min(HISTORY_DEPTH() - 1, depth); childDepth++) { Rule rule = specifyRule(baseRule, subHistory, childDepth); rule.score = (float) Math.log(totalProb); // System.out.println("Created "+rule+" with score "+rule.score); if (rule instanceof UnaryRule) { urs.add(rule); } else { brs.add(rule); } } } } System.out.println("Total states: " + stateNumberer.total()); BinaryGrammar bg = new BinaryGrammar(stateNumberer.total()); UnaryGrammar ug = new UnaryGrammar(stateNumberer.total()); for (Iterator brI = brs.iterator(); brI.hasNext(); ) { BinaryRule br = (BinaryRule) brI.next(); bg.addRule(br); } for (Iterator urI = urs.iterator(); urI.hasNext(); ) { UnaryRule ur = (UnaryRule) urI.next(); ug.addRule(ur); } return new Pair(ug, bg); }
public static final String doCorefResolution(Annotation annotation) { Map<Integer, CorefChain> corefs = annotation.get(CorefChainAnnotation.class); List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class); List<String> resolved = new ArrayList<String>(); for (CoreMap sentence : sentences) { List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class); for (CoreLabel token : tokens) { Integer corefClustId = token.get(CorefCoreAnnotations.CorefClusterIdAnnotation.class); CorefChain chain = corefs.get(corefClustId); if (chain == null) resolved.add(token.word()); else { int sentINdx = chain.getRepresentativeMention().sentNum - 1; CoreMap corefSentence = sentences.get(sentINdx); List<CoreLabel> corefSentenceTokens = corefSentence.get(TokensAnnotation.class); CorefMention reprMent = chain.getRepresentativeMention(); if (token.index() < reprMent.startIndex || token.index() > reprMent.endIndex) { for (int i = reprMent.startIndex; i < reprMent.endIndex; i++) { CoreLabel matchedLabel = corefSentenceTokens.get(i - 1); resolved.add(matchedLabel.word()); } } else resolved.add(token.word()); } } } String resolvedStr = ""; System.out.println(); for (String str : resolved) { resolvedStr += str + " "; } System.out.println(resolvedStr); return resolvedStr; }
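// Usage note (hedged sketch): doCorefResolution above assumes the Annotation already carries
// coreference chains, so it must first be run through a pipeline that includes the coref stack.
// The annotator list below is the standard dcoref configuration; the demo class name and the
// sample text are made up for illustration.
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import java.util.Properties;

class CorefPipelineDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation annotation = new Annotation("John drove to Judy's house. He made her dinner.");
    pipeline.annotate(annotation);
    // The annotation now carries CorefChainAnnotation and per-token CorefClusterIdAnnotation,
    // which is exactly the state doCorefResolution(annotation) above expects.
  }
}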
protected void tallyInternalNode(Tree lt, List parents) { // form base rule String label = lt.label().value(); Rule baseR = ltToRule(lt); ruleToLabel.put(baseR, label); // act on each history depth for (int depth = 0, maxDepth = Math.min(HISTORY_DEPTH(), parents.size()); depth <= maxDepth; depth++) { List history = new ArrayList(parents.subList(0, depth)); // tally each history level / rewrite pair rulePairs.incrementCount(new Pair(baseR, history), 1); labelPairs.incrementCount(new Pair(label, history), 1); } }
private List<TimeExpression> filterInvalidTimeExpressions(List<TimeExpression> timeExprs) { int nfiltered = 0; List<TimeExpression> filtered = new ArrayList<TimeExpression>(timeExprs.size()); // Approximate size for (TimeExpression timeExpr : timeExprs) { if (timexPatterns.checkTimeExpression(timeExpr)) { filtered.add(timeExpr); } else { nfiltered++; } } if (nfiltered > 0) { logger.finest("Filtered " + nfiltered); } return filtered; }
private void annotateMatched(List<CoreLabel> tokens) { List<SequenceMatchResult<CoreMap>> matched = multiPatternMatcher.findNonOverlapping(tokens); for (SequenceMatchResult<CoreMap> m : matched) { Entry entry = patternToEntry.get(m.pattern()); // Check if we will overwrite the existing annotation with this annotation int g = entry.annotateGroup; int start = m.start(g); int end = m.end(g); boolean overwriteOriginalNer = checkPosTags(tokens, start, end); if (overwriteOriginalNer) { overwriteOriginalNer = checkOrigNerTags(entry, tokens, start, end); } if (overwriteOriginalNer) { for (int i = start; i < end; i++) { tokens.get(i).set(CoreAnnotations.NamedEntityTagAnnotation.class, entry.type); } } else { if (verbose) { System.err.println( "Not annotating '" + m.group(g) + "': " + StringUtils.joinFields( m.groupNodes(g), CoreAnnotations.NamedEntityTagAnnotation.class) + " with " + entry.type + ", sentence is '" + StringUtils.joinWords(tokens, " ") + '\''); } } } }
public SUTime.Temporal apply(MatchResult in) { if (in instanceof SequenceMatchResult) { SequenceMatchResult<CoreMap> mr = (SequenceMatchResult<CoreMap>) (in); if (group >= 0) { List<? extends CoreMap> matched = mr.groupNodes(group); if (matched != null) { int i = (nodeIndex >= 0) ? 0 : (matched.size() + nodeIndex); TimeExpression te = getTimeExpression(matched, i); if (te != null) { return te.getTemporal(); } } } } return null; }
/** Re-order the action space based on the specified order of names. */ private Collection<Action> orderActions(Collection<Action> actionSpace, List<String> order) { List<Action> tmp = new ArrayList<>(actionSpace); List<Action> out = new ArrayList<>(); for (String key : order) { Iterator<Action> iter = tmp.iterator(); while (iter.hasNext()) { Action a = iter.next(); if (a.signature().equals(key)) { out.add(a); iter.remove(); } } } out.addAll(tmp); return out; }
/** * Creates a combined list of Entries using the provided mapping files. * * @param mappings List of mapping files * @return list of Entries */ private static List<Entry> readEntries( String annotatorName, Set<String> noDefaultOverwriteLabels, boolean ignoreCase, boolean verbose, String... mappings) { // Unlike RegexNERClassifier, we don't bother sorting the entries // We leave it to TokensRegex NER to sort out the priorities and matches // (typically after all the matches have been made, since for some TokensRegex expressions // we don't know how many tokens are matched until after the matching is done) List<Entry> entries = new ArrayList<>(); TrieMap<String, Entry> seenRegexes = new TrieMap<>(); Arrays.sort(mappings); for (String mapping : mappings) { BufferedReader rd = null; try { rd = IOUtils.readerFromString(mapping); readEntries( annotatorName, entries, seenRegexes, mapping, rd, noDefaultOverwriteLabels, ignoreCase, verbose); } catch (IOException e) { throw new RuntimeIOException("Couldn't read TokensRegexNER from " + mapping, e); } finally { IOUtils.closeIgnoringExceptions(rd); } } if (mappings.length != 1) { logger.log( "TokensRegexNERAnnotator " + annotatorName + ": Read " + entries.size() + " unique entries from " + mappings.length + " files"); } return entries; }
/** * Get the top few clauses from this searcher, cutting off at the given minimum probability. * * @param thresholdProbability The threshold under which to stop returning clauses. This should be * between 0 and 1. * @return The resulting {@link edu.stanford.nlp.naturalli.SentenceFragment} objects, representing * the top clauses of the sentence. */ public List<SentenceFragment> topClauses(double thresholdProbability) { List<SentenceFragment> results = new ArrayList<>(); search( triple -> { assert triple.first <= 0.0; double prob = Math.exp(triple.first); assert prob <= 1.0; assert prob >= 0.0; assert !Double.isNaN(prob); if (prob >= thresholdProbability) { SentenceFragment fragment = triple.third.get(); fragment.score = prob; results.add(fragment); return true; } else { return false; } }); return results; }
protected Rule specifyRule(Rule rule, List history, int childDepth) { Rule r; String topHistoryStr = historyToString(history.subList(1, history.size())); String bottomHistoryStr = historyToString(history.subList(0, childDepth)); if (rule instanceof UnaryRule) { UnaryRule ur = new UnaryRule(); UnaryRule urule = (UnaryRule) rule; ur.parent = stateNumberer.number(stateNumberer.object(urule.parent) + topHistoryStr); if (isSynthetic(urule.child)) { ur.child = stateNumberer.number(stateNumberer.object(urule.child) + topHistoryStr); } else if (isTag(urule.child)) { ur.child = urule.child; } else { ur.child = stateNumberer.number(stateNumberer.object(urule.child) + bottomHistoryStr); } r = ur; } else { BinaryRule br = new BinaryRule(); BinaryRule brule = (BinaryRule) rule; br.parent = stateNumberer.number(stateNumberer.object(brule.parent) + topHistoryStr); if (isSynthetic(brule.leftChild)) { br.leftChild = stateNumberer.number(stateNumberer.object(brule.leftChild) + topHistoryStr); } else if (isTag(brule.leftChild)) { br.leftChild = brule.leftChild; } else { br.leftChild = stateNumberer.number(stateNumberer.object(brule.leftChild) + bottomHistoryStr); } if (isSynthetic(brule.rightChild)) { br.rightChild = stateNumberer.number(stateNumberer.object(brule.rightChild) + topHistoryStr); } else if (isTag(brule.rightChild)) { br.rightChild = brule.rightChild; } else { br.rightChild = stateNumberer.number(stateNumberer.object(brule.rightChild) + bottomHistoryStr); } r = br; } return r; }
public List<TimeExpression> extractTimeExpressions(CoreMap annotation, String docDateStr) { List<CoreMap> mergedNumbers = NumberNormalizer.findAndMergeNumbers(annotation); annotation.set(CoreAnnotations.NumerizedTokensAnnotation.class, mergedNumbers); // TODO: docDate may not have century.... SUTime.Time docDate = timexPatterns.parseDateTime(docDateStr); List<? extends MatchedExpression> matchedExpressions = expressionExtractor.extractExpressions(annotation); List<TimeExpression> timeExpressions = new ArrayList<TimeExpression>(matchedExpressions.size()); for (MatchedExpression expr : matchedExpressions) { if (expr instanceof TimeExpression) { timeExpressions.add((TimeExpression) expr); } else { timeExpressions.add(new TimeExpression(expr)); } } // Add back nested time expressions for ranges.... // For now only one level of nesting... if (options.includeNested) { List<TimeExpression> nestedTimeExpressions = new ArrayList<TimeExpression>(); for (TimeExpression te : timeExpressions) { if (te.isIncludeNested()) { List<? extends CoreMap> children = te.getAnnotation().get(TimeExpression.ChildrenAnnotation.class); if (children != null) { for (CoreMap child : children) { TimeExpression childTe = child.get(TimeExpression.Annotation.class); if (childTe != null) { nestedTimeExpressions.add(childTe); } } } } } timeExpressions.addAll(nestedTimeExpressions); } Collections.sort(timeExpressions, MatchedExpression.EXPR_TOKEN_OFFSETS_NESTED_FIRST_COMPARATOR); timeExpressions = filterInvalidTimeExpressions(timeExpressions); // Some resolving is done even if docDate null... if ( /*docDate != null && */ timeExpressions != null) { resolveTimeExpressions(annotation, timeExpressions, docDate); } // Annotate timex return timeExpressions; }
private MultiPatternMatcher<CoreMap> createPatternMatcher( Map<SequencePattern<CoreMap>, Entry> patternToEntry) { // Convert to tokensregex pattern int patternFlags = ignoreCase ? Pattern.CASE_INSENSITIVE : 0; int stringMatchFlags = ignoreCase ? NodePattern.CASE_INSENSITIVE : 0; Env env = TokenSequencePattern.getNewEnv(); env.setDefaultStringPatternFlags(patternFlags); env.setDefaultStringMatchFlags(stringMatchFlags); NodePattern<String> posTagPattern = (validPosPattern != null && PosMatchType.MATCH_ALL_TOKENS.equals(posMatchType)) ? new CoreMapNodePattern.StringAnnotationRegexPattern(validPosPattern) : null; List<TokenSequencePattern> patterns = new ArrayList<>(entries.size()); for (Entry entry : entries) { TokenSequencePattern pattern; if (entry.tokensRegex != null) { // TODO: posTagPatterns... pattern = TokenSequencePattern.compile(env, entry.tokensRegex); } else { List<SequencePattern.PatternExpr> nodePatterns = new ArrayList<>(); for (String p : entry.regex) { CoreMapNodePattern c = CoreMapNodePattern.valueOf(p, patternFlags); if (posTagPattern != null) { c.add(CoreAnnotations.PartOfSpeechAnnotation.class, posTagPattern); } nodePatterns.add(new SequencePattern.NodePatternExpr(c)); } pattern = TokenSequencePattern.compile(new SequencePattern.SequencePatternExpr(nodePatterns)); } if (entry.annotateGroup < 0 || entry.annotateGroup > pattern.getTotalGroups()) { throw new RuntimeException("Invalid match group for entry " + entry); } pattern.setPriority(entry.priority); patterns.add(pattern); patternToEntry.put(pattern, entry); } return TokenSequencePattern.getMultiPatternMatcher(patterns); }
/** * Return a String that gives detailed human-readable information about how much time was spent by * each annotator and by the entire annotation pipeline. This String includes newline characters * but does not end with one, and so it is suitable to be printed out with a {@code println()}. * * @return Human readable information on time spent in processing. */ public String timingInformation() { StringBuilder sb = new StringBuilder(); if (TIME) { sb.append("Annotation pipeline timing information:\n"); Iterator<MutableLong> it = accumulatedTime.iterator(); long total = 0; for (Annotator annotator : annotators) { MutableLong m = it.next(); sb.append(StringUtils.getShortClassName(annotator)).append(": "); sb.append(Timing.toSecondsString(m.longValue())).append(" sec.\n"); total += m.longValue(); } sb.append("TOTAL: ").append(Timing.toSecondsString(total)).append(" sec."); } return sb.toString(); }
/** * Run the pipeline on an input annotation. The annotation is modified in place. * * @param annotation The input annotation, usually a raw document */ @Override public void annotate(Annotation annotation) { Iterator<MutableLong> it = accumulatedTime.iterator(); Timing t = new Timing(); for (Annotator annotator : annotators) { if (TIME) { t.start(); } annotator.annotate(annotation); if (TIME) { long elapsed = t.stop(); MutableLong m = it.next(); m.incValue(elapsed); } } }
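// Usage note (hedged sketch): the simplest way to exercise annotate() and the TIME bookkeeping is
// through StanfordCoreNLP, which extends AnnotationPipeline; the annotator names below are the
// standard ones and the sample text is arbitrary.
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import java.util.Properties;

class PipelineTimingDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation document = new Annotation("The quick brown fox jumps over the lazy dog.");
    pipeline.annotate(document);                       // runs each annotator in order, in place
    System.out.println(pipeline.timingInformation());  // per-annotator timing from the pipeline
  }
}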
public void addAnnotator(Annotator annotator) { annotators.add(annotator); if (TIME) { accumulatedTime.add(new MutableLong()); } }
/** * Reads a list of Entries from a mapping file and updates the given entries. Line numbers start * from 1. * * @return the updated list of Entries */ private static List<Entry> readEntries( String annotatorName, List<Entry> entries, TrieMap<String, Entry> seenRegexes, String mappingFilename, BufferedReader mapping, Set<String> noDefaultOverwriteLabels, boolean ignoreCase, boolean verbose) throws IOException { int origEntriesSize = entries.size(); int isTokensRegex = 0; int lineCount = 0; for (String line; (line = mapping.readLine()) != null; ) { lineCount++; String[] split = line.split("\t"); if (split.length < 2 || split.length > 5) { throw new IllegalArgumentException( "Provided mapping file is in wrong format. This line is bad: " + line); } String regex = split[0].trim(); String tokensRegex = null; String[] regexes = null; if (regex.startsWith("( ") && regex.endsWith(" )")) { // Tokens regex (remove start and end parenthesis) tokensRegex = regex.substring(1, regex.length() - 1).trim(); } else { regexes = regex.split("\\s+"); } String[] key = (regexes != null) ? regexes : new String[] {tokensRegex}; if (ignoreCase) { String[] norm = new String[key.length]; for (int i = 0; i < key.length; i++) { norm[i] = key[i].toLowerCase(); } key = norm; } String type = split[1].trim(); Set<String> overwritableTypes = Generics.newHashSet(); double priority = 0.0; if (split.length >= 3) { overwritableTypes.addAll(Arrays.asList(split[2].trim().split("\\s*,\\s*"))); } if (split.length >= 4) { try { priority = Double.parseDouble(split[3].trim()); } catch (NumberFormatException e) { throw new IllegalArgumentException( "ERROR: Invalid priority in line " + lineCount + " in regexner file " + mappingFilename + ": \"" + line + "\"!", e); } } int annotateGroup = 0; // Get annotate group from input.... if (split.length >= 5) { // Which group to take (allow for context) String context = split[4].trim(); try { annotateGroup = Integer.parseInt(context); } catch (NumberFormatException e) { throw new IllegalArgumentException( "ERROR: Invalid group in line " + lineCount + " in regexner file " + mappingFilename + ": \"" + line + "\"!", e); } } // Print some warning about the type int commaPos = type.indexOf(','); if (commaPos > 0) { // Strip the "," and just take first type String newType = type.substring(0, commaPos).trim(); logger.warn( "TokensRegexNERAnnotator " + annotatorName + ": Entry has multiple types: " + line + ". Taking type to be " + newType); type = newType; } Entry entry = new Entry(tokensRegex, regexes, type, overwritableTypes, priority, annotateGroup); if (seenRegexes.containsKey(key)) { Entry oldEntry = seenRegexes.get(key); if (priority > oldEntry.priority) { logger.warn( "TokensRegexNERAnnotator " + annotatorName + ": Replace duplicate entry (higher priority): old=" + oldEntry + ", new=" + entry); } else { if (!oldEntry.type.equals(type)) { if (verbose) { logger.warn( "TokensRegexNERAnnotator " + annotatorName + ": Ignoring duplicate entry: " + split[0] + ", old type = " + oldEntry.type + ", new type = " + type); } // } else { // if (verbose) { // logger.warn("TokensRegexNERAnnotator " + annotatorName + // ": Duplicate entry [ignored]: " + split[0] + ", old type = " + // oldEntry.type + ", new type = " + type); // } } continue; } } // Print some warning if label belongs to noDefaultOverwriteLabels but there is no // overwritable types if (entry.overwritableTypes.isEmpty() && noDefaultOverwriteLabels.contains(entry.type)) { logger.warn( "TokensRegexNERAnnotator " + annotatorName + ": Entry doesn't have overwriteable types " + entry + ", but entry type is in noDefaultOverwriteLabels"); } entries.add(entry); seenRegexes.put(key, entry); if (entry.tokensRegex != null) isTokensRegex++; } logger.log( "TokensRegexNERAnnotator " + annotatorName + ": Read " + (entries.size() - origEntriesSize) + " unique entries out of " + lineCount + " from " + mappingFilename + ", " + isTokensRegex + " TokensRegex patterns."); return entries; }
private static TimeExpression getTimeExpression(List<? extends CoreMap> list, int index) { return list.get(index).get(TimeExpression.Annotation.class); }
public static void main(String[] args) { Options op = new Options(new EnglishTreebankParserParams()); // op.tlpParams may be changed to something else later, so don't use it till // after options are parsed. System.out.println("Currently " + new Date()); System.out.print("Invoked with arguments:"); for (String arg : args) { System.out.print(" " + arg); } System.out.println(); String path = "/u/nlp/stuff/corpora/Treebank3/parsed/mrg/wsj"; int trainLow = 200, trainHigh = 2199, testLow = 2200, testHigh = 2219; String serializeFile = null; int i = 0; while (i < args.length && args[i].startsWith("-")) { if (args[i].equalsIgnoreCase("-path") && (i + 1 < args.length)) { path = args[i + 1]; i += 2; } else if (args[i].equalsIgnoreCase("-train") && (i + 2 < args.length)) { trainLow = Integer.parseInt(args[i + 1]); trainHigh = Integer.parseInt(args[i + 2]); i += 3; } else if (args[i].equalsIgnoreCase("-test") && (i + 2 < args.length)) { testLow = Integer.parseInt(args[i + 1]); testHigh = Integer.parseInt(args[i + 2]); i += 3; } else if (args[i].equalsIgnoreCase("-serialize") && (i + 1 < args.length)) { serializeFile = args[i + 1]; i += 2; } else if (args[i].equalsIgnoreCase("-tLPP") && (i + 1 < args.length)) { try { op.tlpParams = (TreebankLangParserParams) Class.forName(args[i + 1]).newInstance(); } catch (ClassNotFoundException e) { System.err.println("Class not found: " + args[i + 1]); } catch (InstantiationException e) { System.err.println("Couldn't instantiate: " + args[i + 1] + ": " + e.toString()); } catch (IllegalAccessException e) { System.err.println("illegal access" + e); } i += 2; } else if (args[i].equals("-encoding")) { // sets encoding for TreebankLangParserParams op.tlpParams.setInputEncoding(args[i + 1]); op.tlpParams.setOutputEncoding(args[i + 1]); i += 2; } else { i = op.setOptionOrWarn(args, i); } } // System.out.println(tlpParams.getClass()); TreebankLanguagePack tlp = op.tlpParams.treebankLanguagePack(); Train.sisterSplitters = new HashSet(Arrays.asList(op.tlpParams.sisterSplitters())); // BinarizerFactory.TreeAnnotator.setTreebankLang(tlpParams); PrintWriter pw = op.tlpParams.pw(); Test.display(); Train.display(); op.display(); op.tlpParams.display(); // setup tree transforms Treebank trainTreebank = op.tlpParams.memoryTreebank(); MemoryTreebank testTreebank = op.tlpParams.testMemoryTreebank(); // Treebank blippTreebank = ((EnglishTreebankParserParams) tlpParams).diskTreebank(); // String blippPath = "/afs/ir.stanford.edu/data/linguistic-data/BLLIP-WSJ/"; // blippTreebank.loadPath(blippPath, "", true); Timing.startTime(); System.err.print("Reading trees..."); testTreebank.loadPath(path, new NumberRangeFileFilter(testLow, testHigh, true)); if (Test.increasingLength) { Collections.sort(testTreebank, new TreeLengthComparator()); } trainTreebank.loadPath(path, new NumberRangeFileFilter(trainLow, trainHigh, true)); Timing.tick("done."); System.err.print("Binarizing trees..."); TreeAnnotatorAndBinarizer binarizer = null; if (!Train.leftToRight) { binarizer = new TreeAnnotatorAndBinarizer(op.tlpParams, op.forceCNF, !Train.outsideFactor(), true); } else { binarizer = new TreeAnnotatorAndBinarizer( op.tlpParams.headFinder(), new LeftHeadFinder(), op.tlpParams, op.forceCNF, !Train.outsideFactor(), true); } CollinsPuncTransformer collinsPuncTransformer = null; if (Train.collinsPunc) { collinsPuncTransformer = new CollinsPuncTransformer(tlp); } TreeTransformer debinarizer = new Debinarizer(op.forceCNF); List<Tree> binaryTrainTrees = new ArrayList<Tree>(); if (Train.selectiveSplit) { 
Train.splitters = ParentAnnotationStats.getSplitCategories( trainTreebank, Train.tagSelectiveSplit, 0, Train.selectiveSplitCutOff, Train.tagSelectiveSplitCutOff, op.tlpParams.treebankLanguagePack()); if (Train.deleteSplitters != null) { List<String> deleted = new ArrayList<String>(); for (String del : Train.deleteSplitters) { String baseDel = tlp.basicCategory(del); boolean checkBasic = del.equals(baseDel); for (Iterator<String> it = Train.splitters.iterator(); it.hasNext(); ) { String elem = it.next(); String baseElem = tlp.basicCategory(elem); boolean delStr = checkBasic && baseElem.equals(baseDel) || elem.equals(del); if (delStr) { it.remove(); deleted.add(elem); } } } System.err.println("Removed from vertical splitters: " + deleted); } } if (Train.selectivePostSplit) { TreeTransformer myTransformer = new TreeAnnotator(op.tlpParams.headFinder(), op.tlpParams); Treebank annotatedTB = trainTreebank.transform(myTransformer); Train.postSplitters = ParentAnnotationStats.getSplitCategories( annotatedTB, true, 0, Train.selectivePostSplitCutOff, Train.tagSelectivePostSplitCutOff, op.tlpParams.treebankLanguagePack()); } if (Train.hSelSplit) { binarizer.setDoSelectiveSplit(false); for (Tree tree : trainTreebank) { if (Train.collinsPunc) { tree = collinsPuncTransformer.transformTree(tree); } // tree.pennPrint(tlpParams.pw()); tree = binarizer.transformTree(tree); // binaryTrainTrees.add(tree); } binarizer.setDoSelectiveSplit(true); } for (Tree tree : trainTreebank) { if (Train.collinsPunc) { tree = collinsPuncTransformer.transformTree(tree); } tree = binarizer.transformTree(tree); binaryTrainTrees.add(tree); } if (Test.verbose) { binarizer.dumpStats(); } List<Tree> binaryTestTrees = new ArrayList<Tree>(); for (Tree tree : testTreebank) { if (Train.collinsPunc) { tree = collinsPuncTransformer.transformTree(tree); } tree = binarizer.transformTree(tree); binaryTestTrees.add(tree); } Timing.tick("done."); // binarization BinaryGrammar bg = null; UnaryGrammar ug = null; DependencyGrammar dg = null; // DependencyGrammar dgBLIPP = null; Lexicon lex = null; // extract grammars Extractor bgExtractor = new BinaryGrammarExtractor(); // Extractor bgExtractor = new SmoothedBinaryGrammarExtractor();//new BinaryGrammarExtractor(); // Extractor lexExtractor = new LexiconExtractor(); // Extractor dgExtractor = new DependencyMemGrammarExtractor(); Extractor dgExtractor = new MLEDependencyGrammarExtractor(op); if (op.doPCFG) { System.err.print("Extracting PCFG..."); Pair bgug = null; if (Train.cheatPCFG) { List allTrees = new ArrayList(binaryTrainTrees); allTrees.addAll(binaryTestTrees); bgug = (Pair) bgExtractor.extract(allTrees); } else { bgug = (Pair) bgExtractor.extract(binaryTrainTrees); } bg = (BinaryGrammar) bgug.second; bg.splitRules(); ug = (UnaryGrammar) bgug.first; ug.purgeRules(); Timing.tick("done."); } System.err.print("Extracting Lexicon..."); lex = op.tlpParams.lex(op.lexOptions); lex.train(binaryTrainTrees); Timing.tick("done."); if (op.doDep) { System.err.print("Extracting Dependencies..."); binaryTrainTrees.clear(); // dgBLIPP = (DependencyGrammar) dgExtractor.extract(new // ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new // TransformTreeDependency(tlpParams,true)); DependencyGrammar dg1 = (DependencyGrammar) dgExtractor.extract( trainTreebank.iterator(), new TransformTreeDependency(op.tlpParams, true)); // dgBLIPP=(DependencyGrammar)dgExtractor.extract(blippTreebank.iterator(),new // TransformTreeDependency(tlpParams)); // dg = (DependencyGrammar) 
dgExtractor.extract(new // ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new // TransformTreeDependency(tlpParams)); // dg=new DependencyGrammarCombination(dg1,dgBLIPP,2); // dg = (DependencyGrammar) dgExtractor.extract(binaryTrainTrees); //uses information whether // the words are known or not, discards unknown words Timing.tick("done."); // System.out.print("Extracting Unknown Word Model..."); // UnknownWordModel uwm = (UnknownWordModel)uwmExtractor.extract(binaryTrainTrees); // Timing.tick("done."); System.out.print("Tuning Dependency Model..."); dg.tune(binaryTestTrees); // System.out.println("TUNE DEPS: "+tuneDeps); Timing.tick("done."); } BinaryGrammar boundBG = bg; UnaryGrammar boundUG = ug; GrammarProjection gp = new NullGrammarProjection(bg, ug); // serialization if (serializeFile != null) { System.err.print("Serializing parser..."); LexicalizedParser.saveParserDataToSerialized( new ParserData(lex, bg, ug, dg, Numberer.getNumberers(), op), serializeFile); Timing.tick("done."); } // test: pcfg-parse and output ExhaustivePCFGParser parser = null; if (op.doPCFG) { parser = new ExhaustivePCFGParser(boundBG, boundUG, lex, op); } ExhaustiveDependencyParser dparser = ((op.doDep && !Test.useFastFactored) ? new ExhaustiveDependencyParser(dg, lex, op) : null); Scorer scorer = (op.doPCFG ? new TwinScorer(new ProjectionScorer(parser, gp), dparser) : null); // Scorer scorer = parser; BiLexPCFGParser bparser = null; if (op.doPCFG && op.doDep) { bparser = (Test.useN5) ? new BiLexPCFGParser.N5BiLexPCFGParser( scorer, parser, dparser, bg, ug, dg, lex, op, gp) : new BiLexPCFGParser(scorer, parser, dparser, bg, ug, dg, lex, op, gp); } LabeledConstituentEval pcfgPE = new LabeledConstituentEval("pcfg PE", true, tlp); LabeledConstituentEval comboPE = new LabeledConstituentEval("combo PE", true, tlp); AbstractEval pcfgCB = new LabeledConstituentEval.CBEval("pcfg CB", true, tlp); AbstractEval pcfgTE = new AbstractEval.TaggingEval("pcfg TE"); AbstractEval comboTE = new AbstractEval.TaggingEval("combo TE"); AbstractEval pcfgTEnoPunct = new AbstractEval.TaggingEval("pcfg nopunct TE"); AbstractEval comboTEnoPunct = new AbstractEval.TaggingEval("combo nopunct TE"); AbstractEval depTE = new AbstractEval.TaggingEval("depnd TE"); AbstractEval depDE = new AbstractEval.DependencyEval("depnd DE", true, tlp.punctuationWordAcceptFilter()); AbstractEval comboDE = new AbstractEval.DependencyEval("combo DE", true, tlp.punctuationWordAcceptFilter()); if (Test.evalb) { EvalB.initEVALBfiles(op.tlpParams); } // int[] countByLength = new int[Test.maxLength+1]; // use a reflection ruse, so one can run this without needing the tagger // edu.stanford.nlp.process.SentenceTagger tagger = (Test.preTag ? 
new // edu.stanford.nlp.process.SentenceTagger("/u/nlp/data/tagger.params/wsj0-21.holder") : null); SentenceProcessor tagger = null; if (Test.preTag) { try { Class[] argsClass = new Class[] {String.class}; Object[] arguments = new Object[] {"/u/nlp/data/pos-tagger/wsj3t0-18-bidirectional/train-wsj-0-18.holder"}; tagger = (SentenceProcessor) Class.forName("edu.stanford.nlp.tagger.maxent.MaxentTagger") .getConstructor(argsClass) .newInstance(arguments); } catch (Exception e) { System.err.println(e); System.err.println("Warning: No pretagging of sentences will be done."); } } for (int tNum = 0, ttSize = testTreebank.size(); tNum < ttSize; tNum++) { Tree tree = testTreebank.get(tNum); int testTreeLen = tree.yield().size(); if (testTreeLen > Test.maxLength) { continue; } Tree binaryTree = binaryTestTrees.get(tNum); // countByLength[testTreeLen]++; System.out.println("-------------------------------------"); System.out.println("Number: " + (tNum + 1)); System.out.println("Length: " + testTreeLen); // tree.pennPrint(pw); // System.out.println("XXXX The binary tree is"); // binaryTree.pennPrint(pw); // System.out.println("Here are the tags in the lexicon:"); // System.out.println(lex.showTags()); // System.out.println("Here's the tagnumberer:"); // System.out.println(Numberer.getGlobalNumberer("tags").toString()); long timeMil1 = System.currentTimeMillis(); Timing.tick("Starting parse."); if (op.doPCFG) { // System.err.println(Test.forceTags); if (Test.forceTags) { if (tagger != null) { // System.out.println("Using a tagger to set tags"); // System.out.println("Tagged sentence as: " + // tagger.processSentence(cutLast(wordify(binaryTree.yield()))).toString(false)); parser.parse(addLast(tagger.processSentence(cutLast(wordify(binaryTree.yield()))))); } else { // System.out.println("Forcing tags to match input."); parser.parse(cleanTags(binaryTree.taggedYield(), tlp)); } } else { // System.out.println("XXXX Parsing " + binaryTree.yield()); parser.parse(binaryTree.yield()); } // Timing.tick("Done with pcfg phase."); } if (op.doDep) { dparser.parse(binaryTree.yield()); // Timing.tick("Done with dependency phase."); } boolean bothPassed = false; if (op.doPCFG && op.doDep) { bothPassed = bparser.parse(binaryTree.yield()); // Timing.tick("Done with combination phase."); } long timeMil2 = System.currentTimeMillis(); long elapsed = timeMil2 - timeMil1; System.err.println("Time: " + ((int) (elapsed / 100)) / 10.00 + " sec."); // System.out.println("PCFG Best Parse:"); Tree tree2b = null; Tree tree2 = null; // System.out.println("Got full best parse..."); if (op.doPCFG) { tree2b = parser.getBestParse(); tree2 = debinarizer.transformTree(tree2b); } // System.out.println("Debinarized parse..."); // tree2.pennPrint(); // System.out.println("DepG Best Parse:"); Tree tree3 = null; Tree tree3db = null; if (op.doDep) { tree3 = dparser.getBestParse(); // was: but wrong Tree tree3db = debinarizer.transformTree(tree2); tree3db = debinarizer.transformTree(tree3); tree3.pennPrint(pw); } // tree.pennPrint(); // ((Tree)binaryTrainTrees.get(tNum)).pennPrint(); // System.out.println("Combo Best Parse:"); Tree tree4 = null; if (op.doPCFG && op.doDep) { try { tree4 = bparser.getBestParse(); if (tree4 == null) { tree4 = tree2b; } } catch (NullPointerException e) { System.err.println("Blocked, using PCFG parse!"); tree4 = tree2b; } } if (op.doPCFG && !bothPassed) { tree4 = tree2b; } // tree4.pennPrint(); if (op.doDep) { depDE.evaluate(tree3, binaryTree, pw); depTE.evaluate(tree3db, tree, pw); } TreeTransformer tc = 
op.tlpParams.collinizer(); TreeTransformer tcEvalb = op.tlpParams.collinizerEvalb(); Tree tree4b = null; if (op.doPCFG) { // System.out.println("XXXX Best PCFG was: "); // tree2.pennPrint(); // System.out.println("XXXX Transformed best PCFG is: "); // tc.transformTree(tree2).pennPrint(); // System.out.println("True Best Parse:"); // tree.pennPrint(); // tc.transformTree(tree).pennPrint(); pcfgPE.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw); pcfgCB.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw); if (op.doDep) { comboDE.evaluate((bothPassed ? tree4 : tree3), binaryTree, pw); tree4b = tree4; tree4 = debinarizer.transformTree(tree4); if (op.nodePrune) { NodePruner np = new NodePruner(parser, debinarizer); tree4 = np.prune(tree4); } // tree4.pennPrint(); comboPE.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw); } // pcfgTE.evaluate(tree2, tree); pcfgTE.evaluate(tcEvalb.transformTree(tree2), tcEvalb.transformTree(tree), pw); pcfgTEnoPunct.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw); if (op.doDep) { comboTE.evaluate(tcEvalb.transformTree(tree4), tcEvalb.transformTree(tree), pw); comboTEnoPunct.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw); } System.out.println("PCFG only: " + parser.scoreBinarizedTree(tree2b, 0)); // tc.transformTree(tree2).pennPrint(); tree2.pennPrint(pw); if (op.doDep) { System.out.println("Combo: " + parser.scoreBinarizedTree(tree4b, 0)); // tc.transformTree(tree4).pennPrint(pw); tree4.pennPrint(pw); } System.out.println("Correct:" + parser.scoreBinarizedTree(binaryTree, 0)); /* if (parser.scoreBinarizedTree(tree2b,true) < parser.scoreBinarizedTree(binaryTree,true)) { System.out.println("SCORE INVERSION"); parser.validateBinarizedTree(binaryTree,0); } */ tree.pennPrint(pw); } // end if doPCFG if (Test.evalb) { if (op.doPCFG && op.doDep) { EvalB.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree4)); } else if (op.doPCFG) { EvalB.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree2)); } else if (op.doDep) { EvalB.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree3db)); } } } // end for each tree in test treebank if (Test.evalb) { EvalB.closeEVALBfiles(); } // Test.display(); if (op.doPCFG) { pcfgPE.display(false, pw); System.out.println("Grammar size: " + Numberer.getGlobalNumberer("states").total()); pcfgCB.display(false, pw); if (op.doDep) { comboPE.display(false, pw); } pcfgTE.display(false, pw); pcfgTEnoPunct.display(false, pw); if (op.doDep) { comboTE.display(false, pw); comboTEnoPunct.display(false, pw); } } if (op.doDep) { depTE.display(false, pw); depDE.display(false, pw); } if (op.doPCFG && op.doDep) { comboDE.display(false, pw); } // pcfgPE.printGoodBad(); }
@SuppressWarnings("unused") private static String getText(List<? extends CoreMap> list, int index) { return list.get(index).get(CoreAnnotations.TextAnnotation.class); }