/**
 * Removes every {@code Relation} from the {@code RELATIONVIEW} in the list of text annotations.
 *
 * <p>For each {@link PredicateArgumentView}, all relations are dropped and every original
 * constituent is replaced by an unlabeled {@code RELATIONVIEW} constituent over the same span.
 *
 * @param uncleansedAnnotations annotations that may still carry relations
 * @return the annotations with relations removed and constituents relabeled
 */
private static List<TextAnnotation> removeRelationsFromPredicateArgumentView(
        List<TextAnnotation> uncleansedAnnotations) {
    List<String> relationExtractionViews = new ArrayList<>();
    relationExtractionViews.add(ViewNames.SENTENCE);
    relationExtractionViews.add(ViewNames.TOKENS);
    relationExtractionViews.add("RELATIONVIEW");
    List<TextAnnotation> textAnnotations =
            removeViews(uncleansedAnnotations, relationExtractionViews);
    for (TextAnnotation textAnnotation : textAnnotations) {
        for (String viewName : textAnnotation.getAvailableViews()) {
            View view = textAnnotation.getView(viewName);
            if (!(view instanceof PredicateArgumentView)) {
                continue;
            }
            PredicateArgumentView predicateArgumentView = (PredicateArgumentView) view;
            predicateArgumentView.removeAllRelations();
            // Bug fix: snapshot the constituent list before removal. Removing from the
            // view while iterating its own constituent list risks a
            // ConcurrentModificationException if getConstituents() returns a live list.
            for (Constituent c : new ArrayList<>(predicateArgumentView.getConstituents())) {
                predicateArgumentView.removeConstituent(c);
                view.addConstituent(
                        new Constituent(
                                "", "RELATIONVIEW", textAnnotation,
                                c.getStartSpan(), c.getEndSpan()));
            }
        }
    }
    return textAnnotations;
}
/**
 * Clones the input constituent into its own view and ties the original to the clone with an
 * unlabeled, zero-scored relation (the {@code Relation} constructor registers itself with
 * both endpoints as a side effect).
 */
@Override
public List<Constituent> transform(Constituent c) {
    Constituent copy = c.cloneForNewView(c.getViewName());
    new Relation("", c, copy, 0.0);
    return Collections.singletonList(copy);
}
/**
 * Finds the head token of {@code c} under the given dependency tree: among the dependency
 * nodes covered by {@code c}, picks a root immediately, otherwise the right-most node whose
 * parent lies outside {@code c}'s span; falls back to the first covering node.
 */
private static Constituent getHead(Constituent c, TreeView dependency) {
    Constituent end = null;
    List<Constituent> constituentsCovering = dependency.getConstituentsCovering(c);
    for (Constituent d : constituentsCovering) {
        List<Relation> in = d.getIncomingRelations();
        if (in.size() == 0) {
            // A dependency root: take it as the head immediately.
            end = d;
            break;
        } else {
            Constituent parent = in.get(0).getSource();
            int parentToken = parent.getStartSpan();
            // Skip nodes whose parent is inside the span: they are internal, not heads.
            if (c.getStartSpan() <= parentToken && parentToken < c.getEndSpan())
                continue;
            if (end == null) {
                end = d;
            } else if (end.getStartSpan() < d.getStartSpan()) {
                // Prefer the right-most externally-governed candidate.
                end = d;
            }
        }
    }
    Constituent c1;
    if (end == null)
        c1 = constituentsCovering.get(0).cloneForNewView("");
    else
        c1 = end.cloneForNewView("");
    return addPointerToSource(c, c1);
}
/**
 * Climbs the dependency tree from the token at the input's start span until a verb (by POS
 * tag) is found; returns that verb node as a singleton, or an empty list if the root is
 * reached without finding one.
 */
@Override
public List<Constituent> transform(Constituent input) {
    TextAnnotation ta = input.getTextAnnotation();
    TreeView dependency = (TreeView) ta.getView(ViewNames.DEPENDENCY);
    Constituent node = dependency.getConstituentsCoveringToken(input.getStartSpan()).get(0);
    while (true) {
        String pos = WordHelpers.getPOS(ta, node.getStartSpan());
        if (POSUtils.isPOSVerb(pos)) {
            return Collections.singletonList(addPointerToSource(input, node));
        }
        List<Relation> incoming = node.getIncomingRelations();
        if (incoming == null || incoming.isEmpty()) {
            // Reached the root without seeing a verb.
            return new ArrayList<>();
        }
        node = incoming.get(0).getSource();
    }
}
/** Returns the single token immediately preceding the input, or an empty list at offset 0. */
@Override
public List<Constituent> transform(Constituent input) {
    int tokenId = input.getStartSpan();
    if (tokenId <= 0) {
        return new ArrayList<>();
    }
    Constituent previous =
            new Constituent("", "", input.getTextAnnotation(), tokenId - 1, tokenId);
    return Collections.singletonList(addPointerToSource(input, previous));
}
/**
 * Returns the dependency-tree constituent covering {@code tokenId}, retrying at the next
 * two token positions when no node covers the token exactly; {@code null} when nothing in
 * that window is covered.
 */
public Constituent getDependencyConstituentCoveringTokenId(Problem prob, int tokenId) {
    for (int offset = 0; offset <= 2; offset++) {
        int target = tokenId + offset;
        for (Constituent cons : prob.dependency) {
            if (cons.getStartSpan() <= target && target < cons.getEndSpan()) {
                return cons;
            }
        }
    }
    return null;
}
/**
 * Returns the single token immediately after the input span, or an empty list when the
 * bound check fails.
 */
@Override
public List<Constituent> transform(Constituent input) {
    int tokenId = input.getEndSpan();
    TextAnnotation ta = input.getTextAnnotation();
    Sentence sentence = ta.getSentence(input.getSentenceId());
    // NOTE(review): tokenId is a TextAnnotation-level token offset, while sentence.size()
    // looks like the sentence's own token count — for sentences after the first these
    // appear incomparable. Confirm whether sentence.getEndSpan() was intended.
    if (tokenId < sentence.size()) {
        Constituent c = new Constituent("", "", ta, tokenId, tokenId + 1);
        return Collections.singletonList(addPointerToSource(input, c));
    } else
        return new ArrayList<>();
}
/** Projects the input down to an unlabeled constituent over its first token only. */
@Override
public List<Constituent> transform(Constituent input) {
    int start = input.getStartSpan();
    Constituent firstToken =
            new Constituent("", "", input.getTextAnnotation(), start, start + 1);
    return Collections.singletonList(firstToken);
}
/**
 * Add an array of objects reporting View's Constituents' surface form and character offsets.
 * May make deserialization to TextAnnotation problematic, as the relevant methods deduce
 * token character offsets directly from list of token strings and raw text.
 *
 * @param fieldName name to give to this field
 * @param view view whose character offsets will be serialized
 * @param json Json object to which resulting array will be added
 */
private static void writeTokenOffsets(String fieldName, View view, JsonObject json) {
    JsonArray offsets = new JsonArray();
    for (Constituent constituent : view.getConstituents()) {
        JsonObject entry = new JsonObject();
        writeString(FORM, constituent.getSurfaceForm(), entry);
        writeInt(STARTCHAROFFSET, constituent.getStartCharOffset(), entry);
        writeInt(ENDCHAROFFSET, constituent.getEndCharOffset(), entry);
        offsets.add(entry);
    }
    json.add(fieldName, offsets);
}
@Override
/**
 * This feature extractor assumes that the TOKEN View has been generated in the Constituents
 * TextAnnotation. It generate a feature for a window [-2, +2] of Forms (original text) for each
 * constituent. Emits every contiguous 1-, 2-, and 3-gram that starts inside the window,
 * encoded as "classifier:offset_ngramSize(word[_word[_word]])".
 */
public Set<Feature> getFeatures(Constituent c) throws EdisonException {
    TextAnnotation ta = c.getTextAnnotation();
    TOKENS = ta.getView(ViewNames.TOKENS);
    // We can assume that the constituent in this case is a Word(Token)
    int startspan = c.getStartSpan();
    int endspan = c.getEndSpan();
    // k is 3 since we need up to 3-grams
    int k = 3;
    int window = 2;
    // All our constituents are words(tokens)
    String[] forms = getWindowK(TOKENS, startspan, endspan, window);
    String id, value;
    String classifier = "WordConjunctionOneTwoThreeGramWindowTwo";
    Set<Feature> result = new LinkedHashSet<>();
    for (int j = 0; j < k; j++) {
        // k = 3, j goes from 0 to 2
        for (int i = 0; i < forms.length; i++) {
            // forms.length = 5, So i goes from 0 to 4, for each String in the forms array.
            StringBuilder f = new StringBuilder();
            // Starts with context = 0 and then increments context as long as it is below
            // the current value of j and is not out of index of the forms array.
            // This is basically creating a discrete feature for each combination of one, two
            // and three word combinations within [-2,2] window or words.
            for (int context = 0; context <= j && i + context < forms.length; context++) {
                // add a '_' between words to conjoin them together
                if (context != 0) {
                    f.append("_");
                }
                f.append(forms[i + context]);
            }
            // 2 is the center object in the array so i should go from -2 to +2 (with 0 being
            // the center)
            // j is the size of the n-gram so it goes 1 to 3
            id = classifier + ":" + ((i - window) + "_" + (j + 1));
            value = "(" + (f.toString()) + ")";
            result.add(new DiscreteFeature(id + value));
        }
    }
    return result;
}
/** Splits the input span into one unlabeled constituent per token it covers. */
@Override
public List<Constituent> transform(Constituent input) {
    TextAnnotation ta = input.getTextAnnotation();
    List<Constituent> tokens = new ArrayList<>();
    for (int tokenId = input.getStartSpan(); tokenId < input.getEndSpan(); tokenId++) {
        tokens.add(new Constituent("", "", ta, tokenId, tokenId + 1));
    }
    return tokens;
}
/**
 * Returns the index of the chunk covering {@code tokenId}, retrying at the next two token
 * positions before giving up; -1 when no chunk covers anything in that window.
 */
public static int getChunkIndex(Problem prob, int tokenId) {
    for (int offset = 0; offset <= 2; offset++) {
        int target = tokenId + offset;
        for (int j = 0; j < prob.chunks.size(); j++) {
            Constituent cons = prob.chunks.get(j);
            if (cons.getStartSpan() <= target && target < cons.getEndSpan()) {
                return j;
            }
        }
    }
    return -1;
}
/**
 * Returns the dependency governor (source of the first incoming relation) of the first
 * dependency node covering {@code input}; empty when the input is uncovered or is a root.
 */
private static List<Constituent> getGovernor(Constituent input, TreeView dependency) {
    List<Constituent> covering = dependency.getConstituentsCovering(input);
    if (covering.isEmpty()) {
        return new ArrayList<>();
    }
    List<Relation> incoming = covering.get(0).getIncomingRelations();
    if (incoming == null || incoming.isEmpty()) {
        return new ArrayList<>();
    }
    Constituent governor = incoming.get(0).getSource();
    return Collections.singletonList(addPointerToSource(input, governor));
}
/** Returns the governor of the input under the Berkeley dependency view. */
@Override
public List<Constituent> transform(Constituent input) {
    TreeView dependency = getDependencyView(input.getTextAnnotation(), BERKELEY_DEPENDENCIES);
    return getGovernor(input, dependency);
}
/** Returns the "obj" dependent of the input under the default dependency view. */
@Override
public List<Constituent> transform(Constituent input) {
    TextAnnotation ta = input.getTextAnnotation();
    TreeView dependency = (TreeView) ta.getView(ViewNames.DEPENDENCY);
    return getObject(input, dependency, "obj");
}
/** Returns the governor of the input under the Stanford dependency view. */
@Override
public List<Constituent> transform(Constituent input) {
    TextAnnotation ta = input.getTextAnnotation();
    TreeView stanfordDeps = (TreeView) ta.getView(ViewNames.DEPENDENCY_STANFORD);
    return getGovernor(input, stanfordDeps);
}
/** Returns the governor of the input under the Stanford dependency view (via helper lookup). */
@Override
public List<Constituent> transform(Constituent input) {
    return getGovernor(
            input, getDependencyView(input.getTextAnnotation(), STANFORD_DEPENDENCIES));
}
/** Returns the targets of all outgoing relations of the input, each linked back to it. */
@Override
public List<Constituent> transform(Constituent input) {
    List<Constituent> targets = new ArrayList<>();
    for (Relation relation : input.getOutgoingRelations()) {
        targets.add(addPointerToSource(input, relation.getTarget()));
    }
    return targets;
}
/**
 * Renders the annotation's raw text with each NER constituent bracketed in place as
 * {@code "[LABEL surface ] "}.
 *
 * @return the full text with labeled brackets spliced in around each constituent
 */
protected String getNERString() {
    List<Constituent> constituents = new ArrayList<>(view.getConstituents());
    // Emit entities in left-to-right order regardless of the view's iteration order.
    Collections.sort(constituents, TextAnnotationUtilities.constituentStartComparator);
    StringBuilder sb = new StringBuilder();
    String text = textAnnotation.getText();
    int where = 0;
    for (Constituent c : constituents) {
        int start = c.getStartCharOffset();
        sb.append(text.substring(where, start))
                .append("[")
                .append(c.getLabel())
                .append(" ")
                .append(c.getTokenizedSurfaceForm())
                .append(" ] ");
        where = c.getEndCharOffset();
    }
    // Bug fix: the text following the final constituent was previously dropped.
    sb.append(text.substring(where));
    return sb.toString();
}
/**
 * Removes the label from the NER_GOLD_EXTENT_SPAN: every constituent in that view is
 * replaced by an unlabeled constituent over the same token span.
 *
 * @param cleansedAnnotations annotations whose NER view still carries labels
 * @return the annotations with NER labels stripped
 */
private static List<TextAnnotation> removeLabelsForNER(List<TextAnnotation> cleansedAnnotations) {
    List<String> nerViews = new ArrayList<>();
    nerViews.add(ViewNames.SENTENCE);
    nerViews.add(ViewNames.TOKENS);
    nerViews.add("NER_GOLD_EXTENT_SPAN");
    List<TextAnnotation> textAnnotations = removeViews(cleansedAnnotations, nerViews);
    for (TextAnnotation textAnnotation : textAnnotations) {
        View view = textAnnotation.getView("NER_GOLD_EXTENT_SPAN");
        // Bug fix: snapshot the constituent list before removal. Removing from the view
        // while iterating its own list risks a ConcurrentModificationException if
        // getConstituents() returns a live list.
        for (Constituent c : new ArrayList<>(view.getConstituents())) {
            view.removeConstituent(c);
            view.addConstituent(
                    new Constituent(
                            "", "NER_GOLD_EXTENT_SPAN", textAnnotation,
                            c.getStartSpan(), c.getEndSpan()));
        }
        textAnnotation.addView(view.getViewName(), view);
    }
    return textAnnotations;
}
/**
 * Scores unlabeled span agreement: records the number of gold spans, predicted spans, and
 * exact span matches between the two views.
 */
public void evaluate(ClassificationTester tester, View gold, View prediction) {
    super.cleanAttributes(gold, prediction);
    Set<IntPair> goldSpans = new HashSet<>();
    Set<IntPair> predictedSpans = new HashSet<>();
    for (Constituent cons : gold.getConstituents()) {
        goldSpans.add(cons.getSpan());
    }
    for (Constituent cons : prediction.getConstituents()) {
        predictedSpans.add(cons.getSpan());
    }
    Set<IntPair> overlap = new HashSet<>(goldSpans);
    overlap.retainAll(predictedSpans);
    // Unlabeled evaluation, so the label argument is irrelevant.
    tester.recordCount("", goldSpans.size(), predictedSpans.size(), overlap.size());
}
/**
 * Returns one modifier of {@code input}: the target of its first outgoing dependency
 * relation whose name contains "mod", else the first outgoing target; empty when the input
 * is uncovered or has no outgoing relations.
 */
private static List<Constituent> getModifiers(Constituent input, TreeView dependency) {
    List<Constituent> covering = dependency.getConstituentsCovering(input);
    if (covering.isEmpty()) {
        return new ArrayList<>();
    }
    List<Relation> outgoing = covering.get(0).getOutgoingRelations();
    if (outgoing == null || outgoing.isEmpty()) {
        return new ArrayList<>();
    }
    for (Relation r : outgoing) {
        if (r.getRelationName().contains("mod")) {
            return Collections.singletonList(addPointerToSource(input, r.getTarget()));
        }
    }
    // No "mod" relation found; fall back to the first dependent.
    return Collections.singletonList(addPointerToSource(input, outgoing.get(0).getTarget()));
}
/**
 * Climbs the dependency tree from the token of {@code qs} until a verb (POS label starting
 * with "VB") is reached; returns that verb node, the tree root when no verb is found, or
 * {@code null} when no dependency node covers the quantity token.
 */
public Constituent getDependentVerb(Problem prob, QuantSpan qs) {
    Constituent result = getDependencyConstituentCoveringTokenId(
            prob, prob.ta.getTokenIdFromCharacterOffset(qs.start));
    if (result == null) {
        // Diagnostic dump: the quantity token is not covered by any dependency node.
        System.out.println(
                "Text : " + prob.question + " Token : "
                        + prob.ta.getTokenIdFromCharacterOffset(qs.start));
        Tools.printCons(prob.dependency);
    }
    while (result != null) {
        // Stop at the root: no governor left to climb to.
        if (result.getIncomingRelations().size() == 0) break;
        // System.out.println(result.getIncomingRelations().get(0).getSource()+" --> "+result);
        result = result.getIncomingRelations().get(0).getSource();
        if (prob.posTags.get(result.getStartSpan()).getLabel().startsWith("VB")) {
            return result;
        }
    }
    return result;
}
/**
 * Returns the Collins head token of the parse phrase covering {@code c}, linked back to the
 * source constituent. Parse or head-finder failures are rethrown as unchecked exceptions.
 */
@Override
public List<Constituent> transform(Constituent c) {
    TextAnnotation ta = c.getTextAnnotation();
    TreeView tree = (TreeView) ta.getView(ViewNames.PARSE_CHARNIAK);
    try {
        Constituent phrase = tree.getParsePhrase(c);
        int headPosition = CollinsHeadFinder.getInstance().getHeadWordPosition(phrase);
        Constituent head = new Constituent("", "", ta, headPosition, headPosition + 1);
        return Collections.singletonList(addPointerToSource(c, head));
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
/**
 * Serializes one constituent into {@code cJ}: label, score (only when nonzero), token span,
 * attributes, and the label-to-score map when present.
 */
private static void writeConstituent(Constituent c, JsonObject cJ) {
    writeString("label", c.getLabel(), cJ);
    double score = c.getConstituentScore();
    if (score != 0) {
        writeDouble("score", score, cJ);
    }
    writeInt("start", c.getStartSpan(), cJ);
    writeInt("end", c.getEndSpan(), cJ);
    writeAttributes(c, cJ);
    Map<String, Double> labelsToScores = c.getLabelsToScores();
    if (labelsToScores != null) {
        writeLabelsToScores(labelsToScores, cJ);
    }
}
/**
 * Collects sibling "prep" attachments of the input token: for the dependency parent of the
 * input, every other outgoing relation whose name contains "prep" contributes its target PP
 * node and that node's first child.
 */
@Override
public List<Constituent> transform(Constituent c) {
    TextAnnotation ta = c.getTextAnnotation();
    int tokenPosition = c.getStartSpan();
    TreeView dependency = (TreeView) ta.getView(ViewNames.DEPENDENCY);
    Constituent prepositionDepConstituent =
            dependency.getConstituentsCoveringToken(tokenPosition).get(0);
    List<Relation> incomingRelations = prepositionDepConstituent.getIncomingRelations();
    List<Constituent> list = new ArrayList<>();
    if (incomingRelations != null && incomingRelations.size() > 0) {
        Constituent parent = incomingRelations.get(0).getSource();
        for (Relation out : parent.getOutgoingRelations()) {
            // Skip the relation that leads back to the input token itself
            // (identity comparison is intentional here).
            if (out == incomingRelations.get(0))
                continue;
            String label = out.getRelationName();
            if (label.contains("prep")) {
                Constituent ppNode = out.getTarget();
                list.add(addPointerToSource(c, ppNode));
                // get the first child of the pp and add this
                List<Relation> ppOut = ppNode.getOutgoingRelations();
                if (ppOut != null && ppOut.size() != 0) {
                    Constituent child = ppOut.get(0).getTarget();
                    list.add(addPointerToSource(c, child));
                }
            }
        }
    }
    return list;
}
/**
 * Heuristically finds the unit NP of a rate expression: a connected NP containing "each" or
 * "every", the quantity phrase itself when it contains those words, or the NP two chunks
 * after the quantity phrase when joined by "per"; {@code null} when nothing matches.
 */
public Constituent getRateUnit(Problem prob) {
    for (Constituent cons : connectedNPs) {
        if (cons.getSurfaceForm().toLowerCase().contains("each")) {
            return cons;
        }
    }
    for (Constituent cons : connectedNPs) {
        if (cons.getSurfaceForm().toLowerCase().contains("every")) {
            return cons;
        }
    }
    String quantSurface = quantPhrase.getSurfaceForm();
    if (quantSurface.contains("each") || quantSurface.contains("every")) {
        return quantPhrase;
    }
    // "N per X" pattern: the unit is the NP right after the "per" chunk.
    int chunkId = getChunkIndex(prob, quantPhrase.getStartSpan());
    boolean perFollows =
            chunkId + 2 < prob.chunks.size()
                    && prob.chunks.get(chunkId + 1).getSurfaceForm().equals("per");
    return perFollows ? prob.chunks.get(chunkId + 2) : null;
}
/**
 * Finds the NPs "connected" to the quantity phrase: the verb's nsubj NP, NPs linked through
 * PP attachments on either side, the immediately adjacent NPs (when not conjoined), and —
 * when this is the only quantity in its sentence — every other NP in that sentence. NPs
 * that themselves contain a quantity token are filtered out of the final result.
 */
public List<Constituent> getConnectedNPs(Problem prob) {
    List<Constituent> npList = new ArrayList<>();
    List<Constituent> npListQuantRemoved = new ArrayList<>();
    boolean onlyQuantityInSentence = true;
    int sentId = prob.ta.getSentenceFromToken(quantPhrase.getStartSpan()).getSentenceId();
    // Check whether any other quantity lies in the same sentence outside this phrase.
    for (QuantSpan qs : prob.quantities) {
        int tokenId = prob.ta.getTokenIdFromCharacterOffset(qs.start);
        if (prob.ta.getSentenceFromToken(tokenId).getSentenceId() == sentId
                && !(quantPhrase.getStartSpan() <= tokenId
                        && quantPhrase.getEndSpan() > tokenId)) {
            onlyQuantityInSentence = false;
            break;
        }
    }
    // Find NPs from children of verb
    if (verbPhrase != null) {
        List<Relation> relations = verbPhrase.getOutgoingRelations();
        for (Relation relation : relations) {
            if (!relation.getRelationName().equals("nsubj")) continue;
            Constituent dst = relation.getTarget();
            // Map the dependency subject node onto the NP chunk covering it.
            for (Constituent cons : prob.chunks) {
                if (cons.getStartSpan() <= dst.getStartSpan()
                        && cons.getEndSpan() > dst.getStartSpan()
                        && cons.getLabel().equals("NP")
                        && !npList.contains(cons)) {
                    npList.add(cons);
                    subject = cons;
                    break;
                }
            }
        }
    }
    // Find NPs from PP NP connection
    int quantPhraseId = getChunkIndex(prob, quantPhrase.getStartSpan());
    if (quantPhraseId + 2 < prob.chunks.size()
            && !prob.chunks.get(quantPhraseId + 1).getSurfaceForm().trim().equals("of")
            && prob.chunks.get(quantPhraseId + 1).getLabel().equals("PP")
            && prob.chunks.get(quantPhraseId + 2).getLabel().equals("NP")
            && !npList.contains(prob.chunks.get(quantPhraseId + 2))) {
        npList.add(prob.chunks.get(quantPhraseId + 2));
    }
    if (quantPhraseId - 2 >= 0
            && prob.chunks.get(quantPhraseId - 1).getLabel().equals("PP")
            && prob.chunks.get(quantPhraseId - 2).getLabel().equals("NP")
            && !npList.contains(prob.chunks.get(quantPhraseId - 2))) {
        npList.add(prob.chunks.get(quantPhraseId - 2));
    }
    // Get preceding NP (skipped when followed by a coordinating conjunction)
    if (quantPhraseId - 1 >= 0
            && prob.chunks.get(quantPhraseId - 1).getLabel().equals("NP")
            && !prob.posTags
                    .get(prob.chunks.get(quantPhraseId - 1).getEndSpan())
                    .getLabel()
                    .equals("CC")
            && !npList.contains(prob.chunks.get(quantPhraseId - 1))) {
        npList.add(prob.chunks.get(quantPhraseId - 1));
    }
    // Get succeeding NP
    if (quantPhraseId + 1 < prob.chunks.size()
            && prob.chunks.get(quantPhraseId + 1).getLabel().equals("NP")
            && !prob.posTags.get(prob.chunks.get(quantPhraseId).getEndSpan()).getLabel().equals("CC")
            && !npList.contains(prob.chunks.get(quantPhraseId + 1))) {
        npList.add(prob.chunks.get(quantPhraseId + 1));
    }
    // If only quantity in sentence, all NPs are connected
    if (onlyQuantityInSentence) {
        for (int i = 0; i < prob.chunks.size(); ++i) {
            Constituent cons = prob.chunks.get(i);
            if (cons.getSentenceId() == sentId
                    && (i > quantPhraseId + 2 || i < quantPhraseId - 2)
                    && !npList.contains(cons)
                    && cons.getLabel().equals("NP")) {
                npList.add(cons);
            }
        }
    }
    // Remove quantity phrases from npList
    for (Constituent cons : npList) {
        boolean allow = true;
        for (QuantSpan qs : prob.quantities) {
            int index = prob.ta.getTokenIdFromCharacterOffset(qs.start);
            if (index >= cons.getStartSpan() && index < cons.getEndSpan()) {
                allow = false;
                break;
            }
        }
        if (allow) {
            npListQuantRemoved.add(cons);
        }
    }
    return npListQuantRemoved;
}
/**
 * Extracts dependency-path n-gram features between the token of {@code c} and its governor:
 * relation-labeled up-steps, a "*" at the common ancestor, and down-steps, plus a parallel
 * POS-annotated path. Empty when the token has no incoming dependency relation.
 */
@Override
public Set<Feature> getFeatures(Constituent c) throws EdisonException {
    TextAnnotation ta = c.getTextAnnotation();
    Set<Feature> features = new LinkedHashSet<>();
    TreeView parse = (TreeView) ta.getView(dependencyViewName);
    // get equivalent of c in the parse view
    Constituent c2 = parse.getConstituentsCoveringToken(c.getStartSpan()).get(0);
    List<Relation> incomingRelations = c2.getIncomingRelations();
    if (incomingRelations.size() > 0) {
        // The governor's node in the same parse view.
        Constituent c1 =
                parse
                        .getConstituentsCoveringToken(
                                incomingRelations.get(0).getSource().getStartSpan())
                        .get(0);
        Pair<List<Constituent>, List<Constituent>> paths =
                PathFeatureHelper.getPathsToCommonAncestor(c1, c2, 400);
        List<String> path = new ArrayList<>();
        List<String> pos = new ArrayList<>();
        // Up-steps from c1 to (but excluding) the common ancestor.
        for (int i = 0; i < paths.getFirst().size() - 1; i++) {
            Constituent cc = paths.getFirst().get(i);
            path.add(
                    cc.getIncomingRelations().get(0).getRelationName()
                            + PathFeatureHelper.PATH_UP_STRING);
            pos.add(
                    WordHelpers.getPOS(ta, cc.getStartSpan())
                            + ":"
                            + cc.getIncomingRelations().get(0).getRelationName()
                            + PathFeatureHelper.PATH_UP_STRING);
        }
        // The common ancestor itself is marked "*".
        Constituent top = paths.getFirst().get(paths.getFirst().size() - 1);
        pos.add(WordHelpers.getPOS(ta, top.getStartSpan()) + ":*");
        path.add("*");
        // Down-steps from the ancestor toward c2, in reverse order.
        if (paths.getSecond().size() > 1) {
            for (int i = paths.getSecond().size() - 2; i >= 0; i--) {
                Constituent cc = paths.getSecond().get(i);
                pos.add(
                        WordHelpers.getPOS(ta, cc.getStartSpan())
                                + ":"
                                + PathFeatureHelper.PATH_DOWN_STRING);
                path.add(PathFeatureHelper.PATH_DOWN_STRING);
            }
        }
        features.addAll(getNgrams(path, ""));
        features.addAll(getNgrams(pos, "pos"));
    }
    return features;
}
/**
 * Extracts the unit string for the quantity at {@code quantIndex}: every non-quantity lemma
 * in the quantity's chunk (with "$" mapped to "dollar"), handling conjoined quantities like
 * "4 red and 6 blue balls" and absorbing a following "of NP" phrase into the unit.
 *
 * @return the space-separated unit string paired with the quantity's chunk
 */
public Pair<String, Constituent> getUnit(Problem prob, int quantIndex) {
    String unit = "";
    int tokenId = prob.ta.getTokenIdFromCharacterOffset(prob.quantities.get(quantIndex).start);
    int quantPhraseId = getChunkIndex(prob, tokenId);
    Constituent quantPhrase = prob.chunks.get(quantPhraseId);
    // Detect cases like 4 red and 6 blue balls
    int numQuantInChunk = 0;
    for (QuantSpan qs : prob.quantities) {
        int index = prob.ta.getTokenIdFromCharacterOffset(qs.start);
        if (index >= quantPhrase.getStartSpan() && index < quantPhrase.getEndSpan()) {
            numQuantInChunk++;
        }
    }
    int start = quantPhrase.getStartSpan();
    int end = quantPhrase.getEndSpan();
    boolean addEndNoun = false;
    if (numQuantInChunk > 1) {
        // Split the chunk at the coordinating conjunction; keep only the half that
        // contains this quantity.
        for (int i = quantPhrase.getStartSpan(); i < quantPhrase.getEndSpan(); ++i) {
            if (prob.posTags.get(i).getLabel().equals("CC")) {
                if (tokenId < i) {
                    end = i;
                    addEndNoun = true;
                } else {
                    start = i + 1;
                }
                break;
            }
        }
    }
    for (int i = start; i < end; ++i) {
        if (i != tokenId) {
            if (prob.ta.getToken(i).equals("$")) {
                unit += "dollar ";
            } else {
                unit += prob.lemmas.get(i) + " ";
            }
        }
    }
    // Connecting disconnected units, as in, 5 red and 6 green apples
    if (addEndNoun
            && quantPhrase.getEndSpan() <= prob.ta.size()
            && prob.posTags.get(quantPhrase.getEndSpan() - 1).getLabel().startsWith("N")) {
        unit += prob.lemmas.get(quantPhrase.getEndSpan() - 1) + " ";
    }
    // Unit from neighboring phrases
    if (quantPhraseId + 2 < prob.chunks.size()
            && prob.chunks.get(quantPhraseId + 1).getSurfaceForm().trim().equals("of")
            && prob.chunks.get(quantPhraseId + 2).getLabel().equals("NP")) {
        Constituent cons = prob.chunks.get(quantPhraseId + 2);
        for (int j = cons.getStartSpan(); j < cons.getEndSpan(); ++j) {
            unit += prob.lemmas.get(j) + " ";
        }
        quantPhraseId += 2;
    }
    return new Pair<String, Constituent>(unit, quantPhrase);
}