/**
 * Replaces every mention in the annotation with the representative mention of its
 * coreference chain and returns the resulting text.
 */
public static String doCorefResolution(Annotation annotation) {
  Map<Integer, CorefChain> corefs = annotation.get(CorefChainAnnotation.class);
  List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
  List<String> resolved = new ArrayList<String>();
  for (CoreMap sentence : sentences) {
    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    for (CoreLabel token : tokens) {
      Integer corefClustId = token.get(CorefCoreAnnotations.CorefClusterIdAnnotation.class);
      CorefChain chain = corefs.get(corefClustId);
      if (chain == null) {
        // Token is not part of any coreference chain; keep it as-is.
        resolved.add(token.word());
      } else {
        CorefMention reprMent = chain.getRepresentativeMention();
        int sentIdx = reprMent.sentNum - 1;
        CoreMap corefSentence = sentences.get(sentIdx);
        List<CoreLabel> corefSentenceTokens =
            corefSentence.get(CoreAnnotations.TokensAnnotation.class);
        // Note: this compares token indices only; it does not check that the token is
        // in the representative mention's sentence.
        if (token.index() < reprMent.startIndex || token.index() > reprMent.endIndex) {
          // Replace the token with the words of the representative mention.
          for (int i = reprMent.startIndex; i < reprMent.endIndex; i++) {
            CoreLabel matchedLabel = corefSentenceTokens.get(i - 1); // token indices are 1-based
            resolved.add(matchedLabel.word());
          }
        } else {
          // The token lies inside the representative mention itself; keep it.
          resolved.add(token.word());
        }
      }
    }
  }
  StringBuilder resolvedStr = new StringBuilder();
  for (String str : resolved) {
    resolvedStr.append(str).append(' ');
  }
  return resolvedStr.toString();
}
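// Usage sketch (added; not part of the original source). It assumes the standard
// StanfordCoreNLP pipeline API; the example text and property values are illustrative.
public static void corefResolutionExample() {
  Properties props = new Properties();
  // dcoref populates CorefChainAnnotation, which doCorefResolution reads.
  props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse,dcoref");
  StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
  Annotation annotation =
      new Annotation("John drove to Judy's house. He made her dinner.");
  pipeline.annotate(annotation);
  // Pronouns are replaced with their representative mentions, e.g. "He" -> "John".
  System.out.println(doCorefResolution(annotation));
}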
/**
 * The basic method for splitting off a clause of a tree. This modifies the tree in place.
 * This method additionally follows ref edges.
 *
 * @param tree The tree to split a clause from.
 * @param toKeep The edge representing the clause to keep.
 */
@SuppressWarnings("unchecked")
private void simpleClause(SemanticGraph tree, SemanticGraphEdge toKeep) {
  splitToChildOfEdge(tree, toKeep);

  // Follow 'ref' edges
  Map<IndexedWord, IndexedWord> refReplaceMap = new HashMap<>();
  // (find replacements)
  for (IndexedWord vertex : tree.vertexSet()) {
    for (SemanticGraphEdge edge : extraEdgesByDependent.get(vertex)) {
      if ("ref".equals(edge.getRelation().toString())  // it's a ref edge...
          && !tree.containsVertex(edge.getGovernor())) {  // ...whose governor is not already in the tree
        refReplaceMap.put(vertex, edge.getGovernor());
      }
    }
  }
  // (do replacements)
  for (Map.Entry<IndexedWord, IndexedWord> entry : refReplaceMap.entrySet()) {
    Iterator<SemanticGraphEdge> iter = tree.incomingEdgeIterator(entry.getKey());
    if (!iter.hasNext()) {
      continue;
    }
    SemanticGraphEdge incomingEdge = iter.next();
    IndexedWord governor = incomingEdge.getGovernor();
    tree.removeVertex(entry.getKey());
    addSubtree(tree, governor, incomingEdge.getRelation().toString(),
        this.tree, entry.getValue(), this.tree.incomingEdgeList(tree.getFirstRoot()));
  }
}
/** Returns the canonical instance of {@code o}, registering it on first sight. */
public Object intern(Object o) {
  Object i = oToO.get(o);
  if (i == null) {
    i = o;
    oToO.put(o, o);
  }
  return i;
}
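// Usage sketch (added): the surrounding class is assumed to wrap a Map<Object, Object>
// oToO field; "interner" is a hypothetical instance of it. Interning guarantees one
// canonical object per equal value, so equal keys can then be compared with ==.
//
//   Object a = interner.intern(new String("NP"));
//   Object b = interner.intern(new String("NP"));
//   assert a == b;  // same canonical instance, though the inputs were distinct objects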
/** Caches and returns a compact string encoding of a history list. */
protected String historyToString(List history) {
  String str = (String) historyToString.get(history);
  if (str == null) {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < history.size(); i++) {
      sb.append('^');
      sb.append(history.get(i));
    }
    str = sb.toString();
    historyToString.put(history, str);
  }
  return str;
}
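// Worked example (added): with history = ["NP", "S"], the encoding is "^NP^S"; a
// repeated call with an equal list returns the cached string rather than rebuilding it.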
/**
 * Restores the empty-leaf information lost by a plain copy: traceTo() targets that are
 * contained in the copied tree are remapped to their new counterparts, while targets
 * outside the tree keep the old object.
 */
public static void fixEmptyTreeLeafs(Tree t, Map<Tree, Tree> newToOld, Map<Tree, Tree> oldToNew) {
  Tree[] kids = t.children();
  for (int i = 0, n = kids.length; i < n; i++) {
    fixEmptyTreeLeafs(kids[i], newToOld, oldToNew);
  }
  if (t instanceof EmptyTreeLeaf) {
    EmptyTreeLeaf oldT = (EmptyTreeLeaf) newToOld.get(t);
    ((EmptyTreeLeaf) t).setEmptyType(oldT.emptyType());
    Tree oldTraceTo = oldT.traceTo();
    Tree newTraceTo = oldToNew.get(oldTraceTo);
    if (newTraceTo != null) {
      ((EmptyTreeLeaf) t).setTraceTo(newTraceTo);
    } else {
      ((EmptyTreeLeaf) t).setTraceTo(oldTraceTo);
    }
  }
}
public XBarGrammarProjection(BinaryGrammar bg, UnaryGrammar ug) {
  Map<BinaryRule, BinaryRule> binaryRules = new HashMap<BinaryRule, BinaryRule>();
  Map<UnaryRule, UnaryRule> unaryRules = new HashMap<UnaryRule, UnaryRule>();
  sourceUG = ug;
  sourceBG = bg;
  sourceNumberer = Numberer.getGlobalNumberer(bg.stateSpace());
  targetNumberer = Numberer.getGlobalNumberer(bg.stateSpace() + "-xbar");
  projection = new int[sourceNumberer.total()];
  scanStates(sourceNumberer, targetNumberer);
  targetBG = new BinaryGrammar(targetNumberer.total(), bg.stateSpace() + "-xbar");
  targetUG = new UnaryGrammar(targetNumberer.total());
  // Project each binary rule, keeping the best score among source rules that collapse
  // to the same target rule.
  for (Iterator<BinaryRule> brI = bg.iterator(); brI.hasNext(); ) {
    BinaryRule rule = projectBinaryRule(brI.next());
    Rule old = binaryRules.get(rule);
    if (old == null || rule.score > old.score) {
      binaryRules.put(rule, rule);
    }
  }
  for (BinaryRule br : binaryRules.keySet()) {
    targetBG.addRule(br);
  }
  targetBG.splitRules();
  // Likewise for unary rules, iterating by parent state.
  for (int parent = 0; parent < sourceNumberer.total(); parent++) {
    for (Iterator<UnaryRule> urI = ug.ruleIteratorByParent(parent); urI.hasNext(); ) {
      UnaryRule sourceRule = urI.next();
      UnaryRule rule = projectUnaryRule(sourceRule);
      Rule old = unaryRules.get(rule);
      if (old == null || rule.score > old.score) {
        unaryRules.put(rule, rule);
      }
    }
  }
  for (UnaryRule ur : unaryRules.keySet()) {
    targetUG.addRule(ur);
  }
  targetUG.purgeRules();
  System.out.println("Projected " + sourceNumberer.total()
      + " states to " + targetNumberer.total() + " states.");
}
public static Tree copyHelper(Tree t, Map<Tree, Tree> newToOld, Map<Tree, Tree> oldToNew) {
  Tree[] kids = t.children();
  Tree[] newKids = new Tree[kids.length];
  for (int i = 0, n = kids.length; i < n; i++) {
    newKids[i] = copyHelper(kids[i], newToOld, oldToNew);
  }
  TreeFactory tf = t.treeFactory();
  if (kids.length == 0) {
    Tree newLeaf = tf.newLeaf(t.label());
    newToOld.put(newLeaf, t);
    oldToNew.put(t, newLeaf); // was oldToNew.put(newLeaf, t), which inverted the mapping
    return newLeaf;
  }
  Tree newNode = tf.newTreeNode(t.label(), Arrays.asList(newKids));
  newToOld.put(newNode, t);
  oldToNew.put(t, newNode);
  return newNode;
}
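// Usage sketch (added): a deep copy followed by the trace fix-up above. The two maps
// produced by copyHelper let fixEmptyTreeLeafs remap traceTo() pointers into the copy;
// copyWithTraces is a hypothetical convenience wrapper.
public static Tree copyWithTraces(Tree original) {
  Map<Tree, Tree> newToOld = new HashMap<Tree, Tree>();
  Map<Tree, Tree> oldToNew = new HashMap<Tree, Tree>();
  Tree copy = copyHelper(original, newToOld, oldToNew);
  fixEmptyTreeLeafs(copy, newToOld, oldToNew); // only matters if the tree has empty leaves
  return copy;
}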
public Object formResult() {
  Set brs = new HashSet();
  Set urs = new HashSet();
  // Scan each (rule, history) pair.
  int ruleCount = 0;
  for (Iterator pairI = rulePairs.keySet().iterator(); pairI.hasNext(); ) {
    if (ruleCount % 100 == 0) {
      System.err.println("Rules multiplied: " + ruleCount);
    }
    ruleCount++;
    Pair rulePair = (Pair) pairI.next();
    Rule baseRule = (Rule) rulePair.first;
    String baseLabel = (String) ruleToLabel.get(baseRule);
    List history = (List) rulePair.second;
    double totalProb = 0;
    for (int depth = 1; depth <= HISTORY_DEPTH() && depth <= history.size(); depth++) {
      List subHistory = history.subList(0, depth);
      double c_label = labelPairs.getCount(new Pair(baseLabel, subHistory));
      double c_rule = rulePairs.getCount(new Pair(baseRule, subHistory));
      // Uniform mixture over history depths of the relative-frequency estimates
      // P(rule | label, history): each depth contributes (c_rule / c_label) / HISTORY_DEPTH().
      double prob = (1.0 / HISTORY_DEPTH()) * (c_rule) / (c_label);
      totalProb += prob;
      for (int childDepth = 0; childDepth <= Math.min(HISTORY_DEPTH() - 1, depth); childDepth++) {
        Rule rule = specifyRule(baseRule, subHistory, childDepth);
        rule.score = (float) Math.log(totalProb);
        if (rule instanceof UnaryRule) {
          urs.add(rule);
        } else {
          brs.add(rule);
        }
      }
    }
  }
  System.out.println("Total states: " + stateNumberer.total());
  BinaryGrammar bg = new BinaryGrammar(stateNumberer.total());
  UnaryGrammar ug = new UnaryGrammar(stateNumberer.total());
  for (Iterator brI = brs.iterator(); brI.hasNext(); ) {
    bg.addRule((BinaryRule) brI.next());
  }
  for (Iterator urI = urs.iterator(); urI.hasNext(); ) {
    ug.addRule((UnaryRule) urI.next());
  }
  return new Pair(ug, bg);
}
protected void tallyInternalNode(Tree lt, List parents) {
  // form base rule
  String label = lt.label().value();
  Rule baseR = ltToRule(lt);
  ruleToLabel.put(baseR, label);
  // act on each history depth
  for (int depth = 0, maxDepth = Math.min(HISTORY_DEPTH(), parents.size());
       depth <= maxDepth; depth++) {
    List history = new ArrayList(parents.subList(0, depth));
    // tally each history level / rewrite pair
    rulePairs.incrementCount(new Pair(baseR, history), 1);
    labelPairs.incrementCount(new Pair(label, history), 1);
  }
}
@Override
public void finishTraining() {
  lex.finishTraining();
  int numTags = tagIndex.size();
  POSes = new HashSet<String>(tagIndex.objectsList());
  initialPOSDist = Distribution.laplaceSmoothedDistribution(initial, numTags, 0.5);
  markovPOSDists = new HashMap<String, Distribution>();
  Set entries = ruleCounter.lowestLevelCounterEntrySet();
  for (Iterator iter = entries.iterator(); iter.hasNext(); ) {
    // Entries are really Map.Entry<List<String>, Counter>.
    Map.Entry entry = (Map.Entry) iter.next();
    Distribution d = Distribution.laplaceSmoothedDistribution(
        (ClassicCounter) entry.getValue(), numTags, 0.5);
    markovPOSDists.put(((List<String>) entry.getKey()).get(0), d);
  }
}
/**
 * Prints out all matches of a semgrex pattern on a file of dependencies. <br>
 * Usage:<br>
 * java edu.stanford.nlp.semgraph.semgrex.SemgrexPattern [args] <br>
 * See the help() function for a list of possible arguments to provide.
 */
public static void main(String[] args) throws IOException {
  Map<String, Integer> flagMap = Generics.newHashMap();
  flagMap.put(PATTERN, 1);
  flagMap.put(TREE_FILE, 1);
  flagMap.put(MODE, 1);
  flagMap.put(EXTRAS, 1);
  flagMap.put(CONLLU_FILE, 1);
  flagMap.put(OUTPUT_FORMAT_OPTION, 1);

  Map<String, String[]> argsMap = StringUtils.argsToMap(args, flagMap);
  args = argsMap.get(null);

  // TODO: allow patterns to be extracted from a file
  if (!(argsMap.containsKey(PATTERN)) || argsMap.get(PATTERN).length == 0) {
    help();
    System.exit(2);
  }
  SemgrexPattern semgrex = SemgrexPattern.compile(argsMap.get(PATTERN)[0]);

  String modeString = DEFAULT_MODE;
  if (argsMap.containsKey(MODE) && argsMap.get(MODE).length > 0) {
    modeString = argsMap.get(MODE)[0].toUpperCase();
  }
  SemanticGraphFactory.Mode mode = SemanticGraphFactory.Mode.valueOf(modeString);

  String outputFormatString = DEFAULT_OUTPUT_FORMAT;
  if (argsMap.containsKey(OUTPUT_FORMAT_OPTION) && argsMap.get(OUTPUT_FORMAT_OPTION).length > 0) {
    outputFormatString = argsMap.get(OUTPUT_FORMAT_OPTION)[0].toUpperCase();
  }
  OutputFormat outputFormat = OutputFormat.valueOf(outputFormatString);

  boolean useExtras = true;
  if (argsMap.containsKey(EXTRAS) && argsMap.get(EXTRAS).length > 0) {
    useExtras = Boolean.valueOf(argsMap.get(EXTRAS)[0]);
  }

  List<SemanticGraph> graphs = Generics.newArrayList();
  // TODO: allow other sources of graphs, such as dependency files
  if (argsMap.containsKey(TREE_FILE) && argsMap.get(TREE_FILE).length > 0) {
    for (String treeFile : argsMap.get(TREE_FILE)) {
      System.err.println("Loading file " + treeFile);
      MemoryTreebank treebank = new MemoryTreebank(new TreeNormalizer());
      treebank.loadPath(treeFile);
      for (Tree tree : treebank) {
        // TODO: allow other languages... this defaults to English
        SemanticGraph graph = SemanticGraphFactory.makeFromTree(
            tree, mode,
            useExtras ? GrammaticalStructure.Extras.MAXIMAL : GrammaticalStructure.Extras.NONE,
            true);
        graphs.add(graph);
      }
    }
  }

  if (argsMap.containsKey(CONLLU_FILE) && argsMap.get(CONLLU_FILE).length > 0) {
    CoNLLUDocumentReader reader = new CoNLLUDocumentReader();
    for (String conlluFile : argsMap.get(CONLLU_FILE)) {
      System.err.println("Loading file " + conlluFile);
      Iterator<SemanticGraph> it = reader.getIterator(IOUtils.readerFromString(conlluFile));
      while (it.hasNext()) {
        graphs.add(it.next());
      }
    }
  }

  for (SemanticGraph graph : graphs) {
    SemgrexMatcher matcher = semgrex.matcher(graph);
    if (!matcher.find()) {
      continue;
    }

    if (outputFormat == OutputFormat.LIST) {
      System.err.println("Matched graph:");
      System.err.println(graph.toString(SemanticGraph.OutputFormat.LIST));
      boolean found = true;
      while (found) {
        System.err.println("Matches at: " + matcher.getMatch().value() + "-"
            + matcher.getMatch().index());
        List<String> nodeNames = Generics.newArrayList();
        nodeNames.addAll(matcher.getNodeNames());
        Collections.sort(nodeNames);
        for (String name : nodeNames) {
          System.err.println("  " + name + ": " + matcher.getNode(name).value() + "-"
              + matcher.getNode(name).index());
        }
        System.err.println();
        found = matcher.find();
      }
    } else if (outputFormat == OutputFormat.OFFSET) {
      if (graph.vertexListSorted().isEmpty()) {
        continue;
      }
      System.out.printf("+%d %s%n",
          graph.vertexListSorted().get(0).get(CoreAnnotations.LineNumberAnnotation.class),
          argsMap.get(CONLLU_FILE)[0]);
    }
  }
}
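// Invocation sketch (added): a typical command line, assuming conventional flag names
// behind the PATTERN/TREE_FILE/OUTPUT_FORMAT_OPTION constants ("-pattern", "-treeFile",
// "-outputFormat"); see help() for the authoritative list.
//
//   java edu.stanford.nlp.semgraph.semgrex.SemgrexPattern \
//     -pattern "{pos:/VB.*/} >nsubj {}" -treeFile trees.mrg -outputFormat LIST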
/**
 * Do max language model Markov segmentation. Note that this algorithm inherently tags
 * words as it goes, but we throw away the tags in the final result so that the segmented
 * words are untagged. (For a couple of years, until Aug 2007, a tagged result was
 * returned, but this messed up the parser, because it could then use no tagging but the
 * given tagging, which often wasn't very good. In particular, it was a subcategorized
 * tagging, which never worked with the current forceTags option, which assumes that gold
 * taggings are basic taggings.)
 *
 * @param s A String to segment
 * @return The list of segmented words.
 */
private ArrayList<HasWord> segmentWordsWithMarkov(String s) {
  int length = s.length();
  int numTags = POSes.size();
  // scores[start][end][tag]: best score of span [start, end) whose first word has this tag
  double[][][] scores = new double[length][length + 1][numTags];
  // splitBacktrace[start][end][tag]: best end position of the first word of this span
  int[][][] splitBacktrace = new int[length][length + 1][numTags];
  // POSbacktrace[start][end][tag]: best tag for the second word, given the first has this tag
  int[][][] POSbacktrace = new int[length][length + 1][numTags];
  for (int i = 0; i < length; i++) {
    for (int j = 0; j < length + 1; j++) {
      Arrays.fill(scores[i][j], Double.NEGATIVE_INFINITY);
    }
  }
  // First fill in single-word probabilities (words up to 10 characters long).
  for (int diff = 1; diff <= 10; diff++) {
    for (int start = 0; start + diff <= length; start++) {
      int end = start + diff;
      String word = s.substring(start, end);
      for (String tag : POSes) {
        IntTaggedWord itw = new IntTaggedWord(word, tag, wordIndex, tagIndex);
        double score = lex.score(itw, 0, word, null);
        if (start == 0) {
          score += Math.log(initialPOSDist.probabilityOf(tag));
        }
        scores[start][end][itw.tag()] = score;
        splitBacktrace[start][end][itw.tag()] = end;
      }
    }
  }
  // Now fill in word-combination probabilities.
  for (int diff = 2; diff <= length; diff++) {
    for (int start = 0; start + diff <= length; start++) {
      int end = start + diff;
      for (int split = start + 1; split < end && split - start <= 10; split++) {
        for (String tag : POSes) {
          int tagNum = tagIndex.indexOf(tag, true);
          if (splitBacktrace[start][split][tagNum] != split) {
            continue; // [start, split) must itself be a single word
          }
          Distribution<String> rTagDist = markovPOSDists.get(tag);
          if (rTagDist == null) {
            continue; // this happens with "*" POS
          }
          for (String rTag : POSes) {
            int rTagNum = tagIndex.indexOf(rTag, true);
            double newScore = scores[start][split][tagNum]
                + scores[split][end][rTagNum]
                + Math.log(rTagDist.probabilityOf(rTag));
            if (newScore > scores[start][end][tagNum]) {
              scores[start][end][tagNum] = newScore;
              splitBacktrace[start][end][tagNum] = split;
              POSbacktrace[start][end][tagNum] = rTagNum;
            }
          }
        }
      }
    }
  }
  // Read out the best segmentation from the backtraces.
  int nextPOS = ArrayMath.argmax(scores[0][length]);
  ArrayList<HasWord> words = new ArrayList<HasWord>();
  int start = 0;
  while (start < length) {
    int split = splitBacktrace[start][length][nextPOS];
    words.add(new Word(s.substring(start, split)));
    if (split < length) {
      nextPOS = POSbacktrace[start][length][nextPOS];
    }
    start = split;
  }
  return words;
}
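// The recurrence implemented above (added note): writing score(i, j, t) for the best
// log-probability of segmenting s[i..j) when its first word is tagged t,
//
//   score(i, j, t) = max over split, t' of
//       score(i, split, t) + score(split, j, t') + log P(t' | t)
//
// where [i, split) must be a single word (of at most 10 characters) and P(t' | t) is
// the markovPOSDists bigram tag model; single-word spans are scored by the lexicon,
// plus the initial tag prior at position 0.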