private void commitVariableGroups(Matcher m) { committedVariables = true; // commit all my variable groups. for (Pair<Integer, String> varGroup : myNode.variableGroups) { String thisVarString = m.group(varGroup.first()); variableStrings.setVar(varGroup.second(), thisVarString); } }
private String findNextParagraphSpeaker( List<CoreMap> paragraph, int paragraphOffset, Dictionaries dict) { CoreMap lastSent = paragraph.get(paragraph.size() - 1); String speaker = ""; for (CoreLabel w : lastSent.get(CoreAnnotations.TokensAnnotation.class)) { if (w.get(CoreAnnotations.LemmaAnnotation.class).equals("report") || w.get(CoreAnnotations.LemmaAnnotation.class).equals("say")) { String word = w.get(CoreAnnotations.TextAnnotation.class); SemanticGraph dependency = lastSent.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class); IndexedWord t = dependency.getNodeByWordPattern(word); for (Pair<GrammaticalRelation, IndexedWord> child : dependency.childPairs(t)) { if (child.first().getShortName().equals("nsubj")) { int subjectIndex = child.second().index(); // start from 1 IntTuple headPosition = new IntTuple(2); headPosition.set(0, paragraph.size() - 1 + paragraphOffset); headPosition.set(1, subjectIndex - 1); if (mentionheadPositions.containsKey(headPosition) && mentionheadPositions.get(headPosition).nerString.startsWith("PER")) { speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID); } } } } } return speaker; }
private void incrementMonth(ISODateInstance referenceDate, Pair<DateField, Integer> relation) { String origDateString = referenceDate.getStartDate(); String monthString = origDateString.substring(4, 6); if (monthString.contains("*")) { isoDate = origDateString; return; } // Month is not a variable Integer monthNum = Integer.parseInt(monthString); // Check if we're an edge case if (((monthNum + relation.second()) > 12) || ((monthNum + relation.second) < 1)) { boolean decreasing = ((monthNum + relation.second) < 1); int newMonthNum = (monthNum + relation.second()) % 12; if (newMonthNum < 0) { newMonthNum *= -1; } // Set the month appropriately isoDate = makeStringMonthChange(origDateString, newMonthNum); // Increment the year if possible String yearString = origDateString.substring(0, 4); if (!yearString.contains("*")) { // How much we increment depends on above mod int numYearsToIncrement = (int) Math.ceil(relation.second() / 12.0); if (decreasing) { isoDate = makeStringYearChange(isoDate, Integer.parseInt(yearString) - numYearsToIncrement); } else { isoDate = makeStringYearChange(isoDate, Integer.parseInt(yearString) + numYearsToIncrement); } } } else { isoDate = makeStringMonthChange(origDateString, (monthNum + relation.second())); } }
/**
 * Check one mention is the speaker of the other mention: the antecedent must be a singular
 * first-person pronoun in the same sentence, exactly one quotation mark must lie strictly
 * between the two heads, and the mention head must be the nsubj of a reporting verb.
 */
public static boolean isSpeaker(Mention m, Mention ant, Dictionaries dict) {
  boolean firstPerson = dict.firstPersonPronouns.contains(ant.spanToString().toLowerCase());
  if (!firstPerson || ant.number == Number.PLURAL || ant.sentNum != m.sentNum) {
    return false;
  }
  // Count quotation marks strictly between the two head words.
  int lo = Math.min(m.headIndex, ant.headIndex);
  int hi = Math.max(m.headIndex, ant.headIndex);
  int quoteCount = 0;
  for (int i = lo + 1; i < hi; i++) {
    String token = m.sentenceWords.get(i).get(CoreAnnotations.TextAnnotation.class);
    if (token.equals("``") || token.equals("''")) {
      quoteCount++;
    }
  }
  if (quoteCount != 1) {
    return false;
  }
  String headText = m.sentenceWords.get(m.headIndex).get(CoreAnnotations.TextAnnotation.class);
  IndexedWord head = m.dependency.getNodeByWordPattern(headText);
  if (head == null) {
    return false;
  }
  // The mention head must be the subject of a reporting verb.
  for (Pair<GrammaticalRelation, IndexedWord> parent : m.dependency.parentPairs(head)) {
    if (parent.first().getShortName().equals("nsubj")
        && dict.reportVerb.contains(parent.second().get(CoreAnnotations.LemmaAnnotation.class))) {
      return true;
    }
  }
  return false;
}
/** Undoes commitVariableGroups(): clears every variable this node's groups had set. */
private void decommitVariableGroups() {
  if (committedVariables) {
    for (Pair<Integer, String> varGroup : myNode.variableGroups) {
      String varName = varGroup.second();
      variableStrings.unsetVar(varName);
    }
  }
  committedVariables = false;
}
/**
 * Compares this {@code Pair} to another object lexicographically: first the {@code first()}
 * elements are compared, and only if they are equal are the {@code second()} elements
 * compared. Both elements must themselves be {@link Comparable}.
 *
 * @param o the object to be compared; must be a {@code Pair}
 * @return a negative, zero, or positive value as this pair is less than, equal to, or greater
 *     than the argument
 * @throws ClassCastException if the argument is not a {@code Pair} or its elements are not
 *     mutually comparable with this pair's elements
 * @see java.lang.Comparable
 */
public int compareTo(Object o) {
  Pair other = (Pair) o;
  int cmp = ((Comparable) first()).compareTo(other.first());
  return (cmp != 0) ? cmp : ((Comparable) second()).compareTo(other.second());
}
public String expandStringRegex(String regex) { // Replace all variables in regex String expanded = regex; for (String v : stringRegexVariables.keySet()) { Pair<Pattern, String> p = stringRegexVariables.get(v); expanded = p.first().matcher(expanded).replaceAll(p.second()); } return expanded; }
/**
 * Read a string representation of a Pair from a DataStream. This might not work correctly
 * unless the pair of objects are of type {@code String}.
 *
 * <p>On an I/O error the stack trace is printed and a partially-filled pair is returned
 * (best-effort semantics preserved from the original); the catch is narrowed from
 * {@code Exception} to {@code IOException}, which is all {@code readUTF} can throw here.
 */
public static Pair<String, String> readStringPair(DataInputStream in) {
  Pair<String, String> p = new Pair<>();
  try {
    p.first = in.readUTF();
    p.second = in.readUTF();
  } catch (java.io.IOException e) {
    e.printStackTrace();
  }
  return p;
}
protected void updateKeepBids(Set<Integer> bids) { // TODO: Is there a point when we don't need to keep these bids anymore? for (int i = 0; i < reachableChildBids.length; i++) { Set<Pair<Integer, Integer>> v = reachableChildBids[i]; if (v != null) { for (Pair<Integer, Integer> p : v) { bids.add(p.first()); } } } }
/**
 * Construct a new ISODate based on its relation to a referenceDate. relativeDate should be
 * something like "today" or "tomorrow" or "last year" and the resulting ISODate will be the
 * same as the referenceDate, a day later, or a year earlier, respectively. Unrecognized
 * phrases leave the date untouched.
 */
public ISODateInstance(ISODateInstance referenceDate, String relativeDate) {
  Pair<DateField, Integer> relation = relativeDateMap.get(relativeDate.toLowerCase());
  if (relation == null) {
    return; // Unknown relative expression; nothing to do.
  }
  switch (relation.first()) {
    case DAY:
      incrementDay(referenceDate, relation);
      break;
    case MONTH:
      incrementMonth(referenceDate, relation);
      break;
    case YEAR:
      incrementYear(referenceDate, relation);
      break;
  }
}
/**
 * Uses regexp matching to match month, day, and year fields. TODO: Find a way to mark what's
 * already been handled in the string.
 *
 * <p>Tries, in order: a hyphenated date range, whole-string YYYYMMDD / MM-DD-YY formats, and
 * finally individual year/month/day field extraction.
 *
 * @param inputDate the raw date string to parse
 * @return true if anything was successfully extracted; if false, the raw input is kept as the
 *     (unparseable) isoDate unless it at least names a weekday
 */
public boolean extractFields(String inputDate) {
  if (tokens.size() < 2) {
    tokenizeDate(inputDate);
  }
  if (DEBUG) {
    System.err.println("Extracting date: " + inputDate);
  }
  // first we see if it's a hyphen and two parseable dates - if not, we treat it as one date
  Pair<String, String> dateEndpoints = getRangeDates(inputDate);
  if (dateEndpoints != null) {
    ISODateInstance date1 = new ISODateInstance(dateEndpoints.first());
    if (dateEndpoints.first().contains(" ") && !dateEndpoints.second().contains(" ")) {
      // consider whether it's a leading modifier; e.g., "June 8-10" will be split into June 8,
      // and 10 when really we'd like June 8 and June 10
      String date =
          dateEndpoints.first().substring(0, dateEndpoints.first().indexOf(' '))
              + ' '
              + dateEndpoints.second();
      ISODateInstance date2 = new ISODateInstance(date);
      if (!date1.isUnparseable() && !date2.isUnparseable()) {
        isoDate = (new ISODateInstance(date1, date2)).getDateString();
        return true;
      }
    }
    // Fall back to treating the raw second endpoint as the range end.
    ISODateInstance date2 = new ISODateInstance(dateEndpoints.second());
    if (!date1.isUnparseable() && !date2.isUnparseable()) {
      isoDate = (new ISODateInstance(date1, date2)).getDateString();
      return true;
    }
  }
  // Whole-string formats next.
  if (extractYYYYMMDD(inputDate)) {
    return true;
  }
  if (extractMMDDYY(inputDate)) {
    return true;
  }
  // Otherwise extract whichever individual fields can be found.
  boolean passed = false;
  passed = extractYear(inputDate) || passed;
  passed = extractMonth(inputDate) || passed;
  passed = extractDay(inputDate) || passed;
  // slightly hacky, but check for some common modifiers that get grouped into the date
  passed = addExtraRanges(inputDate) || passed;
  if (!passed) { // couldn't parse
    // try one more trick
    unparseable = true;
    boolean weekday = extractWeekday(inputDate);
    if (!weekday) {
      isoDate = inputDate;
    }
  }
  return passed;
}
/**
 * Parses each code string into a (tree index, node number) pair via codePattern, loads the
 * treebank from treeFile, and pretty-prints the addressed subtree for every code.
 */
private static void extractSubtrees(List<String> codeStrings, String treeFile) {
  List<Pair<Integer, Integer>> codes = new ArrayList<Pair<Integer, Integer>>();
  for (String s : codeStrings) {
    Matcher m = codePattern.matcher(s);
    if (!m.matches()) {
      throw new RuntimeException("Error: illegal node code " + s);
    }
    int treeNumber = Integer.parseInt(m.group(1));
    int nodeNumber = Integer.parseInt(m.group(2));
    codes.add(new Pair<Integer, Integer>(treeNumber, nodeNumber));
  }
  TreeReaderFactory trf = new TRegexTreeReaderFactory();
  MemoryTreebank treebank = new MemoryTreebank(trf);
  treebank.loadPath(treeFile, null, true);
  for (Pair<Integer, Integer> code : codes) {
    // Tree numbers in the codes are 1-indexed; the treebank is 0-indexed.
    Tree tree = treebank.get(code.first() - 1);
    tree.getNodeNumber(code.second()).pennPrint();
  }
}
/**
 * Applies each (tregex, tsurgeon) operation pair to the tree, re-running each pattern until it
 * no longer matches. Sets matchedOnTree if any pattern matched. Returns the surgered tree, or
 * null if a surgery deleted the whole tree.
 */
public static Tree processPatternsOnTree(List<Pair<TregexPattern, TsurgeonPattern>> ops, Tree t) {
  matchedOnTree = false;
  for (Pair<TregexPattern, TsurgeonPattern> op : ops) {
    TregexPattern pattern = op.first();
    TsurgeonPattern surgery = op.second();
    try {
      if (DEBUG) {
        System.err.println("Running pattern " + pattern);
      }
      TregexMatcher m = pattern.matcher(t);
      while (m.find()) {
        matchedOnTree = true;
        t = surgery.evaluate(t, m);
        if (t == null) {
          return null;
        }
        // The surgery changed the tree; restart matching on the new tree.
        m = pattern.matcher(t);
      }
    } catch (NullPointerException npe) {
      throw new RuntimeException(
          "Tsurgeon.processPatternsOnTree failed to match label for pattern: "
              + pattern
              + ", "
              + surgery,
          npe);
    }
  }
  return t;
}
/**
 * Scans tokens [startIndex, endIndex) of the given sentence for a reporting verb and, if one
 * is found, records its nsubj as the speaker of utterance utterNum.
 *
 * @param utterNum utterance id to assign a speaker to
 * @param sentNum index of the sentence to scan within {@code sentences}
 * @param sentences all sentences of the document
 * @param startIndex first token index to consider (inclusive)
 * @param endIndex last token index to consider (exclusive)
 * @param dict dictionaries providing the set of reporting verbs
 * @return true if a speaker was found and recorded in {@code speakers}
 */
private boolean findSpeaker(
    int utterNum,
    int sentNum,
    List<CoreMap> sentences,
    int startIndex,
    int endIndex,
    Dictionaries dict) {
  List<CoreLabel> sent = sentences.get(sentNum).get(CoreAnnotations.TokensAnnotation.class);
  for (int i = startIndex; i < endIndex; i++) {
    // Skip tokens that are themselves inside a quotation (nonzero utterance id).
    if (sent.get(i).get(CoreAnnotations.UtteranceAnnotation.class) != 0) continue;
    String lemma = sent.get(i).get(CoreAnnotations.LemmaAnnotation.class);
    String word = sent.get(i).get(CoreAnnotations.TextAnnotation.class);
    if (dict.reportVerb.contains(lemma)) {
      // find subject
      SemanticGraph dependency =
          sentences
              .get(sentNum)
              .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
      IndexedWord w = dependency.getNodeByWordPattern(word);
      if (w != null) {
        for (Pair<GrammaticalRelation, IndexedWord> child : dependency.childPairs(w)) {
          if (child.first().getShortName().equals("nsubj")) {
            String subjectString = child.second().word();
            int subjectIndex = child.second().index(); // start from 1
            IntTuple headPosition = new IntTuple(2);
            headPosition.set(0, sentNum);
            headPosition.set(1, subjectIndex - 1);
            String speaker;
            // Prefer the mention id when the subject heads a known mention; otherwise fall
            // back to the raw subject word.
            if (mentionheadPositions.containsKey(headPosition)) {
              speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID);
            } else {
              speaker = subjectString;
            }
            speakers.put(utterNum, speaker);
            return true;
          }
        }
      } else {
        SieveCoreferenceSystem.logger.warning("Cannot find node in dependency for word " + word);
      }
    }
  }
  return false;
}
/** Shifts the year of referenceDate by relation.second() years, unless it is a wildcard. */
private void incrementYear(ISODateInstance referenceDate, Pair<DateField, Integer> relation) {
  String origDateString = referenceDate.getStartDate();
  String yearString = origDateString.substring(0, 4);
  if (yearString.contains("*")) {
    // Wildcard year: keep the original date unchanged.
    isoDate = origDateString;
  } else {
    int newYear = Integer.parseInt(yearString) + relation.second();
    isoDate = makeStringYearChange(origDateString, newYear);
  }
}
/**
 * Returns true if there is a feasible combination of child branch ids that causes all child
 * expressions to be satisfied with respect to the specified child expression (assuming
 * satisfaction with the specified branch and node index). For every other child expression to
 * have a compatible satisfiable branch, that branch must also terminate with the same node
 * index as this one.
 *
 * @param index Index of the child expression
 * @param bid Branch id that causes the indexed child to be satisfied
 * @param pos Node index that causes the indexed child to be satisfied
 * @return whether there is a feasible combination that causes all children to be satisfied
 *     with respect to the specified child
 */
private boolean isAllChildMatched(int index, int bid, int pos) {
  for (int i = 0; i < reachableChildBids.length; i++) {
    Set<Pair<Integer, Integer>> branches = reachableChildBids[i];
    if (branches == null || branches.isEmpty()) {
      return false;
    }
    if (i == index) {
      continue; // The specified child is satisfied by assumption.
    }
    boolean endsAtPos = false;
    for (Pair<Integer, Integer> p : branches) {
      if (p.second() == pos) {
        endsAtPos = true;
        break;
      }
    }
    if (!endsAtPos) {
      return false;
    }
  }
  return true;
}
@Override public Pair<DeepTree, DeepTree> process(Tree tree) { // For each tree, move in the direction of the gold tree, and // move away from the direction of the best scoring hypothesis IdentityHashMap<Tree, SimpleMatrix> goldVectors = new IdentityHashMap<>(); double scoreGold = score(tree, goldVectors); DeepTree bestTree = getHighestScoringTree(tree, TRAIN_LAMBDA); DeepTree goldTree = new DeepTree(tree, goldVectors, scoreGold); return Pair.makePair(goldTree, bestTree); }
// Update incompatibles for two clusters that are about to be merged public void mergeIncompatibles(CorefCluster to, CorefCluster from) { List<Pair<Pair<Integer, Integer>, Pair<Integer, Integer>>> replacements = new ArrayList<Pair<Pair<Integer, Integer>, Pair<Integer, Integer>>>(); for (Pair<Integer, Integer> p : incompatibleClusters) { Integer other = null; if (p.first == from.clusterID) { other = p.second; } else if (p.second == from.clusterID) { other = p.first; } if (other != null && other != to.clusterID) { int cid1 = Math.min(other, to.clusterID); int cid2 = Math.max(other, to.clusterID); replacements.add(Pair.makePair(p, Pair.makePair(cid1, cid2))); } } for (Pair<Pair<Integer, Integer>, Pair<Integer, Integer>> r : replacements) { incompatibleClusters.remove(r.first.first(), r.first.second()); incompatibleClusters.add(r.second.first(), r.second.second()); } }
/**
 * Walks every token of the document; a run of nonzero utterance ids marks a quotation. For
 * each quotation found, delegates to findQuotationSpeaker to resolve its speaker.
 */
private void findSpeakersInArticle(Dictionaries dict) {
  List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
  Pair<Integer, Integer> beginQuotation = new Pair<Integer, Integer>();
  Pair<Integer, Integer> endQuotation = new Pair<Integer, Integer>();
  boolean insideQuotation = false;
  int utterNum = -1;
  for (int sentIdx = 0; sentIdx < sentences.size(); sentIdx++) {
    List<CoreLabel> sent = sentences.get(sentIdx).get(CoreAnnotations.TokensAnnotation.class);
    for (int tokIdx = 0; tokIdx < sent.size(); tokIdx++) {
      int utterIndex = sent.get(tokIdx).get(CoreAnnotations.UtteranceAnnotation.class);
      if (!insideQuotation && utterIndex != 0) {
        // A quotation opens at this token.
        utterNum = utterIndex;
        insideQuotation = true;
        beginQuotation.setFirst(sentIdx);
        beginQuotation.setSecond(tokIdx);
      } else if (insideQuotation && utterIndex == 0) {
        // The quotation closed just before this token; resolve its speaker.
        insideQuotation = false;
        endQuotation.setFirst(sentIdx);
        endQuotation.setSecond(tokIdx);
        findQuotationSpeaker(utterNum, sentences, beginQuotation, endQuotation, dict);
      }
    }
  }
}
/**
 * Returns an array of child branch ids that causes all child expressions to be satisfied with
 * respect to the specified child expression (assuming satisfaction with the specified branch
 * and node index). For every other child expression to have a compatible satisfiable branch,
 * that branch must also terminate with the same node index as this one.
 *
 * @param index Index of the child expression
 * @param bid Branch id that causes the indexed child to be satisfied
 * @param pos Node index that causes the indexed child to be satisfied
 * @return array of child branch ids if there is a valid combination, null otherwise
 */
private int[] getAllChildMatchedBids(int index, int bid, int pos) {
  int[] matchedBids = new int[reachableChildBids.length];
  for (int i = 0; i < reachableChildBids.length; i++) {
    Set<Pair<Integer, Integer>> branches = reachableChildBids[i];
    if (branches == null || branches.isEmpty()) {
      return null;
    }
    if (i == index) {
      matchedBids[i] = bid; // The specified child uses the given branch by assumption.
      continue;
    }
    boolean found = false;
    for (Pair<Integer, Integer> p : branches) {
      if (p.second() == pos) {
        matchedBids[i] = p.first();
        found = true;
        break;
      }
    }
    if (!found) {
      return null;
    }
  }
  return matchedBids;
}
/**
 * Matches this backreference node against the sequence, consuming elements if {@code consume}
 * is true. On the first visit for a branch it looks up the captured group and starts matching
 * from its first element; on later visits it resumes from the per-branch (group, nodes
 * matched) state stored in the branch states.
 *
 * @param bid branch id being matched
 * @param matchedStates matcher state for all active branches
 * @param consume whether this call may consume sequence elements
 * @return true if the (partial) backreference match succeeded
 */
protected <T> boolean match(
    int bid, SequenceMatcher.MatchedStates<T> matchedStates, boolean consume) {
  // Try to match previous node/nodes exactly
  if (consume) {
    // First element is group that is matched, second is number of nodes matched so far
    Pair<SequenceMatcher.MatchedGroup, Integer> backRefState =
        (Pair<SequenceMatcher.MatchedGroup, Integer>)
            matchedStates.getBranchStates().getMatchStateInfo(bid, this);
    if (backRefState == null) {
      // Haven't tried to match this node before, try now
      // Get element and return if it matched or not
      SequenceMatcher.MatchedGroup matchedGroup =
          matchedStates.getBranchStates().getMatchedGroup(bid, captureGroupId);
      if (matchedGroup != null) {
        // See if the first node matches
        if (matchedGroup.matchEnd > matchedGroup.matchBegin) {
          boolean matched = match(bid, matchedStates, matchedGroup, 0);
          return matched;
        } else {
          // Captured group is empty.
          // TODO: Check handling of previous nodes that are zero elements?
          return super.match(bid, matchedStates, consume);
        }
      }
      // No such group captured on this branch: the backreference cannot match.
      return false;
    } else {
      // Resume a partially-matched backreference from where it left off.
      SequenceMatcher.MatchedGroup matchedGroup = backRefState.first();
      int matchedNodes = backRefState.second();
      boolean matched = match(bid, matchedStates, matchedGroup, matchedNodes);
      return matched;
    }
  } else {
    // Not consuming, just add this state back to list of states to be processed
    matchedStates.addState(bid, this);
    return false;
  }
}
/**
 * Parse a CoNLL formatted tree into a SemanticGraph object (along with a list of tokens).
 * Builds all vertices in a first pass (a head field of "0" marks a root) and adds dependency
 * edges in a second pass once every vertex exists.
 *
 * @param conll The CoNLL formatted tree.
 * @return A pair of a SemanticGraph and a token list, corresponding to the parse of the
 *     sentence and to tokens in the sentence.
 */
protected Pair<SemanticGraph, List<CoreLabel>> mkTree(String conll) {
  List<CoreLabel> sentence = new ArrayList<>();
  SemanticGraph tree = new SemanticGraph();
  // Pass 1: one token and one graph vertex per non-blank line.
  for (String line : conll.split("\n")) {
    if (line.trim().equals("")) {
      continue;
    }
    String[] fields = line.trim().split("\\s+");
    int index = Integer.parseInt(fields[0]);
    CoreLabel label = IETestUtils.mkWord(fields[1], index);
    sentence.add(label);
    if (fields[2].equals("0")) {
      tree.addRoot(new IndexedWord(label));
    } else {
      tree.addVertex(new IndexedWord(label));
    }
    // Optional trailing columns: POS tag, NER tag, lemma.
    if (fields.length > 4) {
      label.setTag(fields[4]);
    }
    if (fields.length > 5) {
      label.setNER(fields[5]);
    }
    if (fields.length > 6) {
      label.setLemma(fields[6]);
    }
  }
  // Pass 2: add the dependency edges now that every vertex exists.
  int tokenIndex = 0;
  for (String line : conll.split("\n")) {
    if (line.trim().equals("")) {
      continue;
    }
    String[] fields = line.trim().split("\\s+");
    int governor = Integer.parseInt(fields[2]);
    if (governor > 0) {
      tree.addEdge(
          new IndexedWord(sentence.get(governor - 1)),
          new IndexedWord(sentence.get(tokenIndex)),
          new GrammaticalRelation(Language.UniversalEnglish, fields[3], null, null),
          1.0,
          false);
    }
    tokenIndex += 1;
  }
  return Pair.makePair(tree, sentence);
}
/**
 * Tries to resolve the speaker of a quotation, in order: (1) before the quote in its first
 * sentence, (2) after the quote in its last sentence, (3) the whole previous sentence when the
 * quote starts a sentence, (4) the whole next sentence when the quote ends a sentence.
 *
 * <p>Fix: step (4) previously compared the end-token index against {@code CoreMap.size()} —
 * the number of annotation keys on the sentence — instead of the sentence's token count, so
 * the branch essentially never fired. It now uses the TokensAnnotation size, matching every
 * other sentence-length computation in this method.
 */
private void findQuotationSpeaker(
    int utterNum,
    List<CoreMap> sentences,
    Pair<Integer, Integer> beginQuotation,
    Pair<Integer, Integer> endQuotation,
    Dictionaries dict) {
  // 1. Before the quotation, within its opening sentence.
  if (findSpeaker(utterNum, beginQuotation.first(), sentences, 0, beginQuotation.second(), dict))
    return;
  // 2. After the quotation, within its closing sentence.
  if (findSpeaker(
      utterNum,
      endQuotation.first(),
      sentences,
      endQuotation.second(),
      sentences.get(endQuotation.first()).get(CoreAnnotations.TokensAnnotation.class).size(),
      dict))
    return;
  // 3. If the quotation starts its sentence, scan the entire previous sentence.
  if (beginQuotation.second() <= 1 && beginQuotation.first() > 0) {
    if (findSpeaker(
        utterNum,
        beginQuotation.first() - 1,
        sentences,
        0,
        sentences
            .get(beginQuotation.first() - 1)
            .get(CoreAnnotations.TokensAnnotation.class)
            .size(),
        dict))
      return;
  }
  // 4. If the quotation ends its sentence, scan the entire next sentence.
  int endSentenceTokenCount =
      sentences.get(endQuotation.first()).get(CoreAnnotations.TokensAnnotation.class).size();
  if (endQuotation.second() == endSentenceTokenCount - 1
      && sentences.size() > endQuotation.first() + 1) {
    if (findSpeaker(
        utterNum,
        endQuotation.first() + 1,
        sentences,
        0,
        sentences
            .get(endQuotation.first() + 1)
            .get(CoreAnnotations.TokensAnnotation.class)
            .size(),
        dict))
      return;
  }
}
/**
 * The core implementation of the search.
 *
 * @param root The root word to search from. Traditionally, this is the root of the sentence.
 * @param candidateFragments The callback for the resulting sentence fragments. This is a
 *     predicate of a triple of values. The return value of the predicate determines whether we
 *     should continue searching. The triple is a triple of
 *     <ol>
 *       <li>The log probability of the sentence fragment, according to the featurizer and the
 *           weights
 *       <li>The features along the path to this fragment. The last element of this is the
 *           features from the most recent step.
 *       <li>The sentence fragment. Because it is relatively expensive to compute the resulting
 *           tree, this is returned as a lazy {@link Supplier}.
 *     </ol>
 *
 * @param classifier The classifier for whether an arc should be on the path to a clause split, a
 *     clause split itself, or neither.
 * @param hardCodedSplits Relations (or "prefix:*" keys) whose splits are forced in the given
 *     action order, bypassing the classifier — presumably keyed by dependency relation name;
 *     confirm against callers.
 * @param featurizer The featurizer to use. Make sure this matches the weights!
 * @param actionSpace The action space we are allowed to take. Each action defines a means of
 *     splitting a clause on a dependency boundary.
 * @param maxTicks Maximum number of fringe pops before the search gives up and returns.
 */
protected void search(
    // The root to search from
    IndexedWord root,
    // The output specs
    final Predicate<Triple<Double, List<Counter<String>>, Supplier<SentenceFragment>>>
        candidateFragments,
    // The learning specs
    final Classifier<ClauseSplitter.ClauseClassifierLabel, String> classifier,
    Map<String, ? extends List<String>> hardCodedSplits,
    final Function<Triple<State, Action, State>, Counter<String>> featurizer,
    final Collection<Action> actionSpace,
    final int maxTicks) {
  // (the fringe) — best-first queue of partial states with their feature paths
  PriorityQueue<Pair<State, List<Counter<String>>>> fringe = new FixedPrioritiesPriorityQueue<>();
  // (avoid duplicate work) — words whose outgoing edges have already been expanded
  Set<IndexedWord> seenWords = new HashSet<>();

  State firstState =
      new State(null, null, -9000, null, x -> {}, true); // First state is implicitly "done"
  fringe.add(Pair.makePair(firstState, new ArrayList<>(0)), -0.0);
  int ticks = 0;

  while (!fringe.isEmpty()) {
    if (++ticks > maxTicks) {
      // System.err.println("WARNING! Timed out on search with " + ticks + " ticks");
      return;
    }
    // Useful variables
    double logProbSoFar = fringe.getPriority();
    assert logProbSoFar <= 0.0;
    Pair<State, List<Counter<String>>> lastStatePair = fringe.removeFirst();
    State lastState = lastStatePair.first;
    List<Counter<String>> featuresSoFar = lastStatePair.second;
    IndexedWord rootWord = lastState.edge == null ? root : lastState.edge.getDependent();

    // Register thunk: emit a lazily-computed fragment for every "done" state. If the callback
    // returns false, the whole search stops.
    if (lastState.isDone) {
      if (!candidateFragments.test(
          Triple.makeTriple(
              logProbSoFar,
              featuresSoFar,
              () -> {
                SemanticGraph copy = new SemanticGraph(tree);
                lastState
                    .thunk
                    .andThen(
                        x -> {
                          // Add the extra edges back in, if they don't break the tree-ness of
                          // the extraction
                          for (IndexedWord newTreeRoot : x.getRoots()) {
                            if (newTreeRoot != null) { // what a strange thing to have happen...
                              for (SemanticGraphEdge extraEdge :
                                  extraEdgesByGovernor.get(newTreeRoot)) {
                                assert Util.isTree(x);
                                //noinspection unchecked
                                addSubtree(
                                    x,
                                    newTreeRoot,
                                    extraEdge.getRelation().toString(),
                                    tree,
                                    extraEdge.getDependent(),
                                    tree.getIncomingEdgesSorted(newTreeRoot));
                                assert Util.isTree(x);
                              }
                            }
                          }
                        })
                    .accept(copy);
                return new SentenceFragment(copy, assumedTruth, false);
              }))) {
        break;
      }
    }

    // Find relevant auxilliary terms (last matching edge wins for each of subj/obj)
    SemanticGraphEdge subjOrNull = null;
    SemanticGraphEdge objOrNull = null;
    for (SemanticGraphEdge auxEdge : tree.outgoingEdgeIterable(rootWord)) {
      String relString = auxEdge.getRelation().toString();
      if (relString.contains("obj")) {
        objOrNull = auxEdge;
      } else if (relString.contains("subj")) {
        subjOrNull = auxEdge;
      }
    }

    // Iterate over children
    // For each outgoing edge...
    for (SemanticGraphEdge outgoingEdge : tree.outgoingEdgeIterable(rootWord)) {
      // Prohibit indirect speech verbs from splitting off clauses
      // (e.g., 'said', 'think')
      // This fires if the governor is an indirect speech verb, and the outgoing edge is a ccomp
      if (outgoingEdge.getRelation().toString().equals("ccomp")
          && ((outgoingEdge.getGovernor().lemma() != null
                  && INDIRECT_SPEECH_LEMMAS.contains(outgoingEdge.getGovernor().lemma()))
              || INDIRECT_SPEECH_LEMMAS.contains(outgoingEdge.getGovernor().word()))) {
        continue;
      }
      // Get some variables: an exact relation match takes precedence over a "prefix:*" match
      String outgoingEdgeRelation = outgoingEdge.getRelation().toString();
      List<String> forcedArcOrder = hardCodedSplits.get(outgoingEdgeRelation);
      if (forcedArcOrder == null && outgoingEdgeRelation.contains(":")) {
        forcedArcOrder =
            hardCodedSplits.get(
                outgoingEdgeRelation.substring(0, outgoingEdgeRelation.indexOf(":")) + ":*");
      }
      boolean doneForcedArc = false;
      // For each action...
      for (Action action :
          (forcedArcOrder == null ? actionSpace : orderActions(actionSpace, forcedArcOrder))) {
        // Check the prerequisite
        if (!action.prerequisitesMet(tree, outgoingEdge)) {
          continue;
        }
        if (forcedArcOrder != null && doneForcedArc) {
          break;
        }
        // 1. Compute the child state
        Optional<State> candidate =
            action.applyTo(tree, lastState, outgoingEdge, subjOrNull, objOrNull);
        if (candidate.isPresent()) {
          double logProbability;
          ClauseClassifierLabel bestLabel;
          Counter<String> features =
              featurizer.apply(Triple.makeTriple(lastState, action, candidate.get()));
          if (forcedArcOrder != null && !doneForcedArc) {
            // Forced split: take it with probability 1 and skip the classifier.
            logProbability = 0.0;
            bestLabel = ClauseClassifierLabel.CLAUSE_SPLIT;
            doneForcedArc = true;
          } else if (features.containsKey("__undocumented_junit_no_classifier")) {
            // Test hook: behave as if no classifier were available.
            logProbability = Double.NEGATIVE_INFINITY;
            bestLabel = ClauseClassifierLabel.CLAUSE_INTERM;
          } else {
            Counter<ClauseClassifierLabel> scores = classifier.scoresOf(new RVFDatum<>(features));
            if (scores.size() > 0) {
              Counters.logNormalizeInPlace(scores);
            }
            String rel = outgoingEdge.getRelation().toString();
            if ("nsubj".equals(rel) || "dobj".equals(rel)) {
              scores.remove(
                  ClauseClassifierLabel.NOT_A_CLAUSE); // Always at least yield on nsubj and dobj
            }
            logProbability = Counters.max(scores, Double.NEGATIVE_INFINITY);
            bestLabel = Counters.argmax(scores, (x, y) -> 0, ClauseClassifierLabel.CLAUSE_SPLIT);
          }

          if (bestLabel != ClauseClassifierLabel.NOT_A_CLAUSE) {
            Pair<State, List<Counter<String>>> childState =
                Pair.makePair(
                    candidate.get().withIsDone(bestLabel),
                    new ArrayList<Counter<String>>(featuresSoFar) {
                      {
                        add(features);
                      }
                    });
            // 2. Register the child state
            if (!seenWords.contains(childState.first.edge.getDependent())) {
              // System.err.println("  pushing " + action.signature() + " with " +
              // argmax.first.edge);
              fringe.add(childState, logProbability);
            }
          }
        }
      }
    }

    seenWords.add(rootWord);
  }
  // System.err.println("Search finished in " + ticks + " ticks and " + classifierEvals + "
  // classifier evaluations.");
}
/** * Usage: java edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon [-s] -treeFile file-with-trees [-po * matching-pattern operation] operation-file-1 operation-file-2 ... operation-file-n * * <h4>Arguments:</h4> * * Each argument should be the name of a transformation file that contains a list of pattern and * transformation operation list pairs. That is, it is a sequence of pairs of a {@link * TregexPattern} pattern on one or more lines, then a blank line (empty or whitespace), then a * list of transformation operations one per line (as specified by <b>Legal operation syntax</b> * below) to apply when the pattern is matched, and then another blank line (empty or whitespace). * Note the need for blank lines: The code crashes if they are not present as separators (although * the blank line at the end of the file can be omitted). The script file can include comment * lines, either whole comment lines or trailing comments introduced by %, which extend to the end * of line. A needed percent mark can be escaped by a preceding backslash. * * <p>For example, if you want to excise an SBARQ node whenever it is the parent of an SQ node, * and relabel the SQ node to S, your transformation file would look like this: * * <blockquote> * * <code> * SBARQ=n1 < SQ=n2<br> * <br> * excise n1 n1<br> * relabel n2 S * </code> * * </blockquote> * * <p> * * <h4>Options:</h4> * * <ul> * <li><code>-treeFile <filename></code> specify the name of the file that has the trees * you want to transform. * <li><code>-po <matchPattern> <operation></code> Apply a single operation to * every tree using the specified match pattern and the specified operation. Use this option * when you want to quickly try the effect of one pattern/surgery combination, and are too * lazy to write a transformation file. * <li><code>-s</code> Print each output tree on one line (default is pretty-printing). 
* <li><code>-m</code> For every tree that had a matching pattern, print "before" (prepended as * "Operated on:") and "after" (prepended as "Result:"). Unoperated trees just pass through * the transducer as usual. * <li><code>-encoding X</code> Uses character set X for input and output of trees. * <li><code>-macros <filename></code> A file of macros to use on the tregex pattern. * Macros should be one per line, with original and replacement separated by tabs. * <li><code>-hf <headfinder-class-name></code> use the specified {@link HeadFinder} class * to determine headship relations. * <li><code>-hfArg <string></code> pass a string argument in to the {@link HeadFinder} * class's constructor. <code>-hfArg</code> can be used multiple times to pass in multiple * arguments. * <li><code>-trf <TreeReaderFactory-class-name></code> use the specified {@link * TreeReaderFactory} class to read trees from files. * </ul> * * <h4>Legal operation syntax:</h4> * * <ul> * <li><code>delete <name></code> deletes the node and everything below it. * <li><code>prune <name></code> Like delete, but if, after the pruning, the parent has * no children anymore, the parent is pruned too. Pruning continues to affect all ancestors * until one is found with remaining children. This may result in a null tree. * <li><code>excise <name1> <name2></code> The name1 node should either dominate * or be the same as the name2 node. This excises out everything from name1 to name2. All * the children of name2 go into the parent of name1, where name1 was. * <li><code>relabel <name> <new-label></code> Relabels the node to have the new * label. <br> * There are three possible forms: <br> * <code>relabel nodeX VP</code> - for changing a node label to an alphanumeric string <br> * <code>relabel nodeX /''/</code> - for relabeling a node to something that isn't a valid * identifier without quoting <br> * <code>relabel nodeX /^VB(.*)$/verb\\/$1/</code> - for regular expression based * relabeling. 
In this case, all matches of the regular expression against the node label * are replaced with the replacement String. This has the semantics of Java/Perl's * replaceAll: you may use capturing groups and put them in replacements with $n. For * example, if the pattern is /foo/bar/ and the node matched is "foo", the replaceAll * semantics result in "barbar". If the pattern is /^foo(.*)$/bar$1/ and node matched is * "foofoo", relabel will result in "barfoo". <br> * When using the regex replacement method, you can also use the sequences ={node} and * %{var} in the replacement string to use captured nodes or variable strings in the * replacement string. For example, if the Tregex pattern was "duck=bar" and the relabel is * /foo/={bar}/, "foofoo" will be replaced with "duckduck". <br> * To concatenate two nodes named in the tregex pattern, for example, you can use the * pattern /^.*$/={foo}={bar}/. Note that the ^.*$ is necessary to make sure the regex * pattern only matches and replaces once on the entire node name. <br> * To get an "=" or a "%" in the replacement, using \ escaping. Also, as in the example you * can escape a slash in the middle of the second and third forms with \\/ and \\\\. <br> * <li><code>insert <name> <position></code> or <code> * insert <tree> <position></code> inserts the named node or tree into the * position specified. * <li><code>move <name> <position></code> moves the named node into the * specified position. * <p>Right now the only ways to specify position are: * <p><code>$+ <name></code> the left sister of the named node<br> * <code>$- <name></code> the right sister of the named node<br> * <code>>i <name></code> the i_th daughter of the named node<br> * <code>>-i <name></code> the i_th daughter, counting from the right, of the * named node. * <li><code>replace <name1> <name2></code> deletes name1 and inserts a copy of * name2 in its place. 
* <li><code>replace <name> <tree> <tree2>...</code> deletes name and * inserts the new tree(s) in its place. If more than one replacement tree is given, each of * the new subtrees will be added in order where the old tree was. Multiple subtrees at the * root is an illegal operation and will throw an exception. * <li>{@code createSubtree <new-label> <name1> [<name2>]} Create a subtree out of all the nodes * from {@code <name1>} through {@code <name2>} and puts the new subtree where that span * used to be. To limit the operation to just one node, elide {@code <name2>}. * <li><code>adjoin <auxiliary_tree> <name></code> Adjoins the specified auxiliary * tree into the named node. The daughters of the target node will become the daughters of * the foot of the auxiliary tree. * <li><code>adjoinH <auxiliary_tree> <name></code> Similar to adjoin, but * preserves the target node and makes it the root of <tree>. (It is still accessible * as <code>name</code>. The root of the auxiliary tree is ignored.) * <li><code>adjoinF <auxiliary_tree> <name></code> Similar to adjoin, but * preserves the target node and makes it the foot of <tree>. (It is still accessible * as <code>name</code>, and retains its status as parent of its children. The root of the * auxiliary tree is ignored.) * <li> * <dt><code>coindex <name1> <name2> ... <nameM> </code> Puts a (Penn * Treebank style) coindexation suffix of the form "-N" on each of nodes name_1 through * name_m. The value of N will be automatically generated in reference to the existing * coindexations in the tree, so that there is never an accidental clash of indices across * things that are not meant to be coindexed. * </ul> * * <p>In the context of <code>adjoin</code>, <code>adjoinH</code>, and <code>adjoinF</code>, an * auxiliary tree is a tree in Penn Treebank format with <code>@</code> on exactly one of the * leaves denoting the foot of the tree. The operations which use the foot use the labeled node. 
* For example: <br>
* Tsurgeon: <code>adjoin (FOO (BAR@)) foo</code> <br>
* Tregex: <code>B=foo</code> <br>
* Input: <code>(A (B 1 2))</code> Output: <code>(A (FOO (BAR 1 2)))</code>
*
* <p>Tsurgeon applies the same operation to the same tree for as long as the given tregex
* operation matches. This means that infinite loops are very easy to cause. One common situation
* where this comes up is an insert operation that repeats infinitely many times unless you
* add an expression to the tregex that matches against the inserted pattern. For example, this
* pattern will loop infinitely:
*
* <blockquote>
*
* <code>
* TregexPattern tregex = TregexPattern.compile("S=node << NP"); <br>
* TsurgeonPattern tsurgeon = Tsurgeon.parseOperation("insert (NP foo) >-1 node");
* </code>
*
* </blockquote>
*
* This pattern, though, will terminate:
*
* <blockquote>
*
* <code>
* TregexPattern tregex = TregexPattern.compile("S=node << NP !<< foo"); <br>
* TsurgeonPattern tsurgeon = Tsurgeon.parseOperation("insert (NP foo) >-1 node");
* </code>
*
* </blockquote>
*
* <p>Tsurgeon has (very) limited support for conditional statements. If a pattern is prefaced
* with <code>if exists <name></code>, the rest of the pattern will only execute if the
* named node was found in the corresponding TregexMatcher.
*
* @param args a list of names of files each of which contains a single tregex matching pattern
*     plus a list, one per line, of transformation operations to apply to the matched pattern.
* @throws Exception If an I/O error or pattern syntax error occurs
*/
public static void main(String[] args) throws Exception {
  // Command-line driver: parse flags, compile the tregex/tsurgeon operations,
  // then stream every tree through the operations and print the results.
  String headFinderClassName = null;
  String headFinderOption = "-hf";
  String[] headFinderArgs = null;
  String headFinderArgOption = "-hfArg";
  String encoding = "UTF-8";
  String encodingOption = "-encoding";
  if (args.length == 0) {
    System.err.println(
        "Usage: java edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon [-s] -treeFile <file-with-trees> [-po <matching-pattern> <operation>] <operation-file-1> <operation-file-2> ... <operation-file-n>");
    System.exit(0);
  }
  String treePrintFormats;
  String singleLineOption = "-s";
  String verboseOption = "-v";
  String matchedOption =
      "-m"; // if set, then print original form of trees that are matched & thus operated on
  String patternOperationOption = "-po";
  String treeFileOption = "-treeFile";
  String trfOption = "-trf";
  String macroOption = "-macros";
  String macroFilename = "";

  // Map each flag to the number of arguments it consumes.
  // NOTE(review): -hfArg, -v and -m are never registered here; behavior then depends on
  // StringUtils.argsToMap's default arity for unknown flags — confirm that -hfArg's
  // argument is actually captured and not treated as a positional operation file.
  Map<String, Integer> flagMap = Generics.newHashMap();
  flagMap.put(patternOperationOption, 2);
  flagMap.put(treeFileOption, 1);
  flagMap.put(trfOption, 1);
  flagMap.put(singleLineOption, 0);
  flagMap.put(encodingOption, 1);
  flagMap.put(headFinderOption, 1);
  flagMap.put(macroOption, 1);
  Map<String, String[]> argsMap = StringUtils.argsToMap(args, flagMap);
  args = argsMap.get(null); // remaining positional args = operation files

  if (argsMap.containsKey(headFinderOption)) headFinderClassName = argsMap.get(headFinderOption)[0];
  if (argsMap.containsKey(headFinderArgOption)) headFinderArgs = argsMap.get(headFinderArgOption);
  if (argsMap.containsKey(verboseOption)) verbose = true;
  // -s selects one-line tree output; default is pretty-printed Penn format.
  if (argsMap.containsKey(singleLineOption)) treePrintFormats = "oneline,";
  else treePrintFormats = "penn,";
  if (argsMap.containsKey(encodingOption)) encoding = argsMap.get(encodingOption)[0];
  if (argsMap.containsKey(macroOption)) macroFilename = argsMap.get(macroOption)[0];

  TreePrint tp = new TreePrint(treePrintFormats, new PennTreebankLanguagePack());
  PrintWriter pwOut = new PrintWriter(new OutputStreamWriter(System.out, encoding), true);

  // Choose the TreeReaderFactory: user-specified class via reflection, or the default.
  TreeReaderFactory trf;
  if (argsMap.containsKey(trfOption)) {
    String trfClass = argsMap.get(trfOption)[0];
    trf = ReflectionLoading.loadByReflection(trfClass);
  } else {
    trf = new TregexPattern.TRegexTreeReaderFactory();
  }
  Treebank trees = new DiskTreebank(trf, encoding);
  if (argsMap.containsKey(treeFileOption)) {
    trees.loadPath(argsMap.get(treeFileOption)[0]);
  }

  List<Pair<TregexPattern, TsurgeonPattern>> ops =
      new ArrayList<Pair<TregexPattern, TsurgeonPattern>>();

  // Build the pattern compiler, optionally backed by a reflectively-loaded HeadFinder.
  TregexPatternCompiler compiler;
  if (headFinderClassName == null) {
    compiler = new TregexPatternCompiler();
  } else {
    HeadFinder hf;
    if (headFinderArgs == null) {
      hf = ReflectionLoading.loadByReflection(headFinderClassName);
    } else {
      hf = ReflectionLoading.loadByReflection(headFinderClassName, (Object[]) headFinderArgs);
    }
    compiler = new TregexPatternCompiler(hf);
  }
  Macros.addAllMacros(compiler, macroFilename, encoding);

  // Either a single pattern/operation pair from -po, or one or more operation files.
  if (argsMap.containsKey(patternOperationOption)) {
    TregexPattern matchPattern = compiler.compile(argsMap.get(patternOperationOption)[0]);
    TsurgeonPattern p = parseOperation(argsMap.get(patternOperationOption)[1]);
    ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p));
  } else {
    for (String arg : args) {
      List<Pair<TregexPattern, TsurgeonPattern>> pairs =
          getOperationsFromFile(arg, encoding, compiler);
      for (Pair<TregexPattern, TsurgeonPattern> pair : pairs) {
        if (verbose) {
          System.err.println(pair.second());
        }
        ops.add(pair);
      }
    }
  }

  // Apply every operation to every tree; with -m, also echo the pre-surgery tree
  // whenever at least one pattern matched (matchedOnTree is set by processPatternsOnTree).
  for (Tree t : trees) {
    Tree original = t.deepCopy();
    Tree result = processPatternsOnTree(ops, t);
    if (argsMap.containsKey(matchedOption) && matchedOnTree) {
      pwOut.println("Operated on: ");
      displayTree(original, tp, pwOut);
      pwOut.println("Result: ");
    }
    displayTree(result, tp, pwOut);
  }
}
/**
 * Draws up to {@code numSamples} keys by weighted sampling without replacement, where each
 * key's weight is its uncertainty under {@code criterion}. Zero-uncertainty keys are held in
 * a separate pool and only drawn (weight 0.0) once the weighted pool is exhausted.
 *
 * @param criterion the active-learning criterion used to compute per-key uncertainty
 * @param numSamples maximum number of keys to draw
 * @param seed seed for the sampling RNG, so runs are reproducible
 * @return pairs of (key, weight at the time it was drawn), at most {@code numSamples} of them
 */
public List<Pair<String, Double>> selectWeightedKeysWithSampling(
    ActiveLearningSelectionCriterion criterion, int numSamples, int seed) {
  List<Pair<String, Double>> result = new ArrayList<>();
  forceTrack("Sampling Keys");
  log("" + numSamples + " to collect");

  // Get uncertainty
  forceTrack("Computing Uncertainties");
  Counter<String> weightCounter = uncertainty(criterion);
  // NOTE(review): this determinism check recomputes uncertainty() a second time whenever
  // assertions are enabled (-ea) — potentially doubling the cost of this phase.
  assert weightCounter.equals(uncertainty(criterion));
  endTrack("Computing Uncertainties");

  // Compute some statistics
  startTrack("Uncertainty Histogram");
  // log(new Histogram(weightCounter, 50).toString()); // removed to make the release easier
  // (Histogram isn't in CoreNLP)
  endTrack("Uncertainty Histogram");
  double totalCount = weightCounter.totalCount();
  Random random = new Random(seed);

  // Flatten counter into parallel key/weight lists, in a deterministic order
  // (sorted by weight, ties broken by key) so the seed fully determines the draw.
  List<String> keys = new LinkedList<>();
  List<Double> weights = new LinkedList<>();
  List<String> zeroUncertaintyKeys = new LinkedList<>();
  for (Pair<String, Double> elem :
      Counters.toSortedListWithCounts(
          weightCounter,
          (o1, o2) -> {
            int value = o1.compareTo(o2);
            if (value == 0) {
              return o1.first.compareTo(o2.first);
            } else {
              return value;
            }
          })) {
    // Zero-weight keys are set aside — unless everything is zero-weight, or there are
    // few enough keys that we will need all of them anyway.
    if (elem.second != 0.0
        || weightCounter.totalCount() == 0.0
        || weightCounter.size() <= numSamples) { // ignore 0 probability weights
      keys.add(elem.first);
      weights.add(elem.second);
    } else {
      zeroUncertaintyKeys.add(elem.first);
    }
  }

  // Error check: weights must be finite and non-negative for roulette-wheel sampling.
  if (Utils.assertionsEnabled()) {
    for (Double elem : weights) {
      if (!(elem >= 0 && !Double.isInfinite(elem) && !Double.isNaN(elem))) {
        throw new IllegalArgumentException("Invalid weight: " + elem);
      }
    }
  }

  // Sample: classic roulette-wheel selection, removing each winner from the wheel.
  SAMPLE_ITER:
  for (int i = 1; i <= numSamples; ++i) { // For each sample
    if (i % 1000 == 0) {
      // Debug log
      log("sampled " + (i / 1000) + "k keys");
      // Recompute total count to mitigate floating point errors
      totalCount = 0.0;
      for (double val : weights) {
        totalCount += val;
      }
    }
    // NOTE(review): when the weighted pool starts empty this `continue` burns the remaining
    // iterations without consulting zeroUncertaintyKeys — confirm that is intended.
    if (weights.size() == 0) {
      continue;
    }
    assert totalCount >= 0.0;
    assert weights.size() == keys.size();
    double target = random.nextDouble() * totalCount;

    // Walk both lists in lock-step; the two iterators MUST advance together so that
    // remove() deletes the matching key/weight pair.
    Iterator<String> keyIter = keys.iterator();
    Iterator<Double> weightIter = weights.iterator();
    double runningTotal = 0.0;
    while (keyIter.hasNext()) { // For each candidate
      String key = keyIter.next();
      double weight = weightIter.next();
      runningTotal += weight;
      if (target <= runningTotal) { // Select that sample
        result.add(Pair.makePair(key, weight));
        keyIter.remove();
        weightIter.remove();
        totalCount -= weight;
        continue SAMPLE_ITER; // continue sampling
      }
    }

    // We should get here only if the keys list is empty
    // (or all remaining mass was lost to floating-point drift).
    warn(
        "No more uncertain samples left to draw from! (target="
            + target
            + " totalCount="
            + totalCount
            + " size="
            + keys.size());
    assert keys.size() == 0;
    // Fall back to the zero-uncertainty pool, one key per remaining sample slot.
    if (zeroUncertaintyKeys.size() > 0) {
      result.add(Pair.makePair(zeroUncertaintyKeys.remove(0), 0.0));
    } else {
      break;
    }
  }
  endTrack("Sampling Keys");
  return result;
}
private void incrementDay(ISODateInstance referenceDate, Pair<DateField, Integer> relation) { String origDateString = referenceDate.getStartDate(); String dayString = origDateString.substring(origDateString.length() - 2, origDateString.length()); if (dayString.contains("*")) { isoDate = origDateString; return; } // Date is not a variable Integer dayNum = Integer.parseInt(dayString); String monthString = origDateString.substring(origDateString.length() - 4, origDateString.length() - 2); int numDaysInMonth = 30; // default - assume this if month is a variable int monthNum = -1; // ie, we don't know the month yet - this remains -1 if the month is a variable if (!monthString.contains("*")) { // Set appropriate numDaysInMonth and monthNum monthNum = Integer.parseInt(monthString); numDaysInMonth = daysPerMonth.get(monthNum); } // Now, find out if we're an edge case (potential to increment month) if (dayNum + relation.second() <= numDaysInMonth && dayNum + relation.second() >= 1) { // Not an edge case - just increment the day, create a new string, and return dayNum += relation.second(); isoDate = makeStringDayChange(origDateString, dayNum); return; } // Since we're an edge case, the month can't be a variable - if it is a variable, just set this // to the reference string if (monthNum == -1) { isoDate = origDateString; return; } // At this point, neither our day nor our month is a variable isoDate = origDateString; boolean decreasing = (dayNum + relation.second() < 1); // Need to increment the month, set the date appropriately - we need the new month num to set // the day appropriately, so do month first int newMonthNum; // Now, check if we're an edge case for month if ((monthNum + 1 > 12 && !decreasing) || (monthNum - 1 < 1 && decreasing)) { // First, change the month if (decreasing) { newMonthNum = 12; } else { newMonthNum = 1; } // If we can, increment the year // TODO: fix this to work more nicely with variables and thus handle more cases String yearString = 
origDateString.substring(0, 4); if (!yearString.contains("*")) { if (decreasing) { isoDate = makeStringYearChange(isoDate, Integer.parseInt(yearString) - 1); } else { isoDate = makeStringYearChange(isoDate, Integer.parseInt(yearString) + 1); } } } else { // We're not an edge case for month - just increment if (decreasing) { newMonthNum = monthNum - 1; } else { newMonthNum = monthNum + 1; } } // do the increment isoDate = makeStringMonthChange(isoDate, newMonthNum); int newDateNum; if (decreasing) { newDateNum = -relation.second() + daysPerMonth.get(newMonthNum) - dayNum; } else { newDateNum = relation.second() - dayNum + daysPerMonth.get(monthNum); } // Now, change the day in our original string to be appropriate isoDate = makeStringDayChange(isoDate, newDateNum); }
/** @param args */ public static void main(String[] args) { if (args.length != 3) { System.err.printf( "Usage: java %s language filename features%n", TreebankFactoredLexiconStats.class.getName()); System.exit(-1); } Language language = Language.valueOf(args[0]); TreebankLangParserParams tlpp = language.params; if (language.equals(Language.Arabic)) { String[] options = {"-arabicFactored"}; tlpp.setOptionFlag(options, 0); } else { String[] options = {"-frenchFactored"}; tlpp.setOptionFlag(options, 0); } Treebank tb = tlpp.diskTreebank(); tb.loadPath(args[1]); MorphoFeatureSpecification morphoSpec = language.equals(Language.Arabic) ? new ArabicMorphoFeatureSpecification() : new FrenchMorphoFeatureSpecification(); String[] features = args[2].trim().split(","); for (String feature : features) { morphoSpec.activate(MorphoFeatureType.valueOf(feature)); } // Counters Counter<String> wordTagCounter = new ClassicCounter<>(30000); Counter<String> morphTagCounter = new ClassicCounter<>(500); // Counter<String> signatureTagCounter = new ClassicCounter<String>(); Counter<String> morphCounter = new ClassicCounter<>(500); Counter<String> wordCounter = new ClassicCounter<>(30000); Counter<String> tagCounter = new ClassicCounter<>(300); Counter<String> lemmaCounter = new ClassicCounter<>(25000); Counter<String> lemmaTagCounter = new ClassicCounter<>(25000); Counter<String> richTagCounter = new ClassicCounter<>(1000); Counter<String> reducedTagCounter = new ClassicCounter<>(500); Counter<String> reducedTagLemmaCounter = new ClassicCounter<>(500); Map<String, Set<String>> wordLemmaMap = Generics.newHashMap(); TwoDimensionalIntCounter<String, String> lemmaReducedTagCounter = new TwoDimensionalIntCounter<>(30000); TwoDimensionalIntCounter<String, String> reducedTagTagCounter = new TwoDimensionalIntCounter<>(500); TwoDimensionalIntCounter<String, String> tagReducedTagCounter = new TwoDimensionalIntCounter<>(300); int numTrees = 0; for (Tree tree : tb) { for (Tree subTree : tree) { if 
(!subTree.isLeaf()) { tlpp.transformTree(subTree, tree); } } List<Label> pretermList = tree.preTerminalYield(); List<Label> yield = tree.yield(); assert yield.size() == pretermList.size(); int yieldLen = yield.size(); for (int i = 0; i < yieldLen; ++i) { String tag = pretermList.get(i).value(); String word = yield.get(i).value(); String morph = ((CoreLabel) yield.get(i)).originalText(); // Note: if there is no lemma, then we use the surface form. Pair<String, String> lemmaTag = MorphoFeatureSpecification.splitMorphString(word, morph); String lemma = lemmaTag.first(); String richTag = lemmaTag.second(); // WSGDEBUG if (tag.contains("MW")) lemma += "-MWE"; lemmaCounter.incrementCount(lemma); lemmaTagCounter.incrementCount(lemma + tag); richTagCounter.incrementCount(richTag); String reducedTag = morphoSpec.strToFeatures(richTag).toString(); reducedTagCounter.incrementCount(reducedTag); reducedTagLemmaCounter.incrementCount(reducedTag + lemma); wordTagCounter.incrementCount(word + tag); morphTagCounter.incrementCount(morph + tag); morphCounter.incrementCount(morph); wordCounter.incrementCount(word); tagCounter.incrementCount(tag); reducedTag = reducedTag.equals("") ? "NONE" : reducedTag; if (wordLemmaMap.containsKey(word)) { wordLemmaMap.get(word).add(lemma); } else { Set<String> lemmas = Generics.newHashSet(1); wordLemmaMap.put(word, lemmas); } lemmaReducedTagCounter.incrementCount(lemma, reducedTag); reducedTagTagCounter.incrementCount(lemma + reducedTag, tag); tagReducedTagCounter.incrementCount(tag, reducedTag); } ++numTrees; } // Barf... 
System.out.println("Language: " + language.toString()); System.out.printf("#trees:\t%d%n", numTrees); System.out.printf("#tokens:\t%d%n", (int) wordCounter.totalCount()); System.out.printf("#words:\t%d%n", wordCounter.keySet().size()); System.out.printf("#tags:\t%d%n", tagCounter.keySet().size()); System.out.printf("#wordTagPairs:\t%d%n", wordTagCounter.keySet().size()); System.out.printf("#lemmas:\t%d%n", lemmaCounter.keySet().size()); System.out.printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.keySet().size()); System.out.printf("#feattags:\t%d%n", reducedTagCounter.keySet().size()); System.out.printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.keySet().size()); System.out.printf("#richtags:\t%d%n", richTagCounter.keySet().size()); System.out.printf("#richtag+lemma:\t%d%n", morphCounter.keySet().size()); System.out.printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.keySet().size()); // Extra System.out.println("=================="); StringBuilder sbNoLemma = new StringBuilder(); StringBuilder sbMultLemmas = new StringBuilder(); for (Map.Entry<String, Set<String>> wordLemmas : wordLemmaMap.entrySet()) { String word = wordLemmas.getKey(); Set<String> lemmas = wordLemmas.getValue(); if (lemmas.size() == 0) { sbNoLemma.append("NO LEMMAS FOR WORD: " + word + "\n"); continue; } if (lemmas.size() > 1) { sbMultLemmas.append("MULTIPLE LEMMAS: " + word + " " + setToString(lemmas) + "\n"); continue; } String lemma = lemmas.iterator().next(); Set<String> reducedTags = lemmaReducedTagCounter.getCounter(lemma).keySet(); if (reducedTags.size() > 1) { System.out.printf("%s --> %s%n", word, lemma); for (String reducedTag : reducedTags) { int count = lemmaReducedTagCounter.getCount(lemma, reducedTag); String posTags = setToString(reducedTagTagCounter.getCounter(lemma + reducedTag).keySet()); System.out.printf("\t%s\t%d\t%s%n", reducedTag, count, posTags); } System.out.println(); } } System.out.println("=================="); System.out.println(sbNoLemma.toString()); 
System.out.println(sbMultLemmas.toString()); System.out.println("=================="); List<String> tags = new ArrayList<>(tagReducedTagCounter.firstKeySet()); Collections.sort(tags); for (String tag : tags) { System.out.println(tag); Set<String> reducedTags = tagReducedTagCounter.getCounter(tag).keySet(); for (String reducedTag : reducedTags) { int count = tagReducedTagCounter.getCount(tag, reducedTag); // reducedTag = reducedTag.equals("") ? "NONE" : reducedTag; System.out.printf("\t%s\t%d%n", reducedTag, count); } System.out.println(); } System.out.println("=================="); }
// when finished = false; break; is called, it means I successfully matched.
//
// Advances nodeMatchCandidateIterator until a candidate node satisfies this pattern node's
// description (and, if named/linked, its backreference constraints). On success, sets
// finished = false, records the matched node/relation under their names, and commits
// captured variable groups. If the iterator is exhausted without a match, finished stays true.
//
// NOTE(review): `m` is declared null and never assigned in this method, yet it is
// dereferenced via m.group(...) in both variable-group loops and passed to
// commitVariableGroups(m) (which also calls m.group, see that method). Any path where
// myNode.variableGroups is non-empty will throw NullPointerException — the
// @SuppressWarnings("null") is masking this. Presumably `m` should come from the matcher
// used inside nodeAttrMatch; confirm against the upstream semgrex implementation.
@SuppressWarnings("null")
private void goToNextNodeMatch() {
  decommitVariableGroups(); // make sure variable groups are free.
  decommitNamedNodes();
  decommitNamedRelations();
  finished = true;
  Matcher m = null;
  while (nodeMatchCandidateIterator.hasNext()) {
    // If this node's relation is named, the candidate's relation must agree with any
    // previously bound relation of the same name.
    if (myNode.reln.getName() != null) {
      String foundReln = namesToRelations.get(myNode.reln.getName());
      nextMatchReln = ((GraphRelation.SearchNodeIterator) nodeMatchCandidateIterator).getReln();
      if ((foundReln != null) && (!nextMatchReln.equals(foundReln))) {
        nextMatch = nodeMatchCandidateIterator.next();
        continue;
      }
    }
    nextMatch = nodeMatchCandidateIterator.next();
    // System.err.println("going to next match: " + nextMatch.word() + " " +
    // myNode.descString + " " + myNode.isLink);
    if (myNode.descString.equals("{}") && myNode.isLink) {
      // Bare "{}" link node: match is determined purely by identity with the node already
      // bound to this name (if any), modulo negation.
      IndexedWord otherNode = namesToNodes.get(myNode.name);
      if (otherNode != null) {
        if (otherNode.equals(nextMatch)) {
          if (!myNode.negDesc) {
            finished = false;
            break;
          }
        } else {
          if (myNode.negDesc) {
            finished = false;
            break;
          }
        }
      } else {
        // Name not bound yet: fall back to attribute matching.
        boolean found = myNode.nodeAttrMatch(nextMatch, hyp ? sg : sg_aligned, ignoreCase);
        if (found) {
          for (Pair<Integer, String> varGroup : myNode.variableGroups) {
            // if variables have been captured from a regex, they
            // must match any previous matchings
            String thisVariable = varGroup.second();
            String thisVarString = variableStrings.getString(thisVariable);
            if (thisVarString != null && !thisVarString.equals(m.group(varGroup.first()))) {
              // failed to match a variable
              // NOTE(review): this `found = false` only exits the for-loop; the
              // `finished = false; break;` below still runs, so a variable mismatch does
              // not actually reject the candidate here. Confirm intended behavior.
              found = false;
              break;
            }
          }
          // nodeAttrMatch already checks negDesc, so no need to
          // check for that here
          finished = false;
          break;
        }
      }
    } else {
      // try to match the description pattern.
      boolean found = myNode.nodeAttrMatch(nextMatch, hyp ? sg : sg_aligned, ignoreCase);
      if (found) {
        for (Pair<Integer, String> varGroup : myNode.variableGroups) {
          // if variables have been captured from a regex, they
          // must match any previous matchings
          String thisVariable = varGroup.second();
          String thisVarString = variableStrings.getString(thisVariable);
          if (thisVarString != null && !thisVarString.equals(m.group(varGroup.first()))) {
            // failed to match a variable
            // NOTE(review): same issue as above — `found = false` is never consulted after
            // this loop.
            found = false;
            break;
          }
        }
        // nodeAttrMatch already checks negDesc, so no need to
        // check for that here
        finished = false;
        break;
      }
    }
  } // end while
  if (!finished) { // I successfully matched.
    resetChild();
    if (myNode.name != null) {
      // note: have to fill in the map as we go for backreferencing
      if (!namesToNodes.containsKey(myNode.name)) {
        // System.err.println("making namedFirst");
        namedFirst = true;
      }
      // System.err.println("adding named node: " + myNode.name + "=" +
      // nextMatch.word());
      namesToNodes.put(myNode.name, nextMatch);
    }
    if (myNode.reln.getName() != null) {
      if (!namesToRelations.containsKey(myNode.reln.getName())) relnNamedFirst = true;
      namesToRelations.put(myNode.reln.getName(), nextMatchReln);
    }
    commitVariableGroups(m); // commit my variable groups.
  }
  // finished is false exiting this if and only if nextChild exists
  // and has a label or backreference that matches
  // (also it will just have been reset)
}