/**
 * Removes views from {@code TextAnnotation} objects. Each task variant has a number of views
 * necessary for the solver to solve it; every other view is removed from a clone of the
 * annotation.
 *
 * @param textAnnotations annotations to cleanse; each one is cloned and the views are removed
 *     from the clone
 * @param viewsToKeep names of the views to retain; {@code null} is treated as an empty list
 * @return one entry per input annotation, in input order; an entry is {@code null} when the
 *     corresponding annotation could not be cloned
 */
private static List<TextAnnotation> removeViews(
        List<TextAnnotation> textAnnotations, List<String> viewsToKeep) {
    // Defensive copy so the caller's list is never shared with this method.
    List<String> keep;
    if (viewsToKeep == null) {
        keep = new ArrayList<String>();
    } else {
        keep = new ArrayList<String>(viewsToKeep);
    }

    List<TextAnnotation> cleansed = new ArrayList<>();
    for (TextAnnotation textAnnotation : textAnnotations) {
        // Computed per annotation: a list shared across iterations would keep growing with
        // names (and duplicates) collected from earlier annotations.
        List<String> viewsToRemove = new ArrayList<>();
        for (String viewName : textAnnotation.getAvailableViews()) {
            if (!keep.contains(viewName)) {
                viewsToRemove.add(viewName);
            }
        }

        TextAnnotation cleansedAnnotation;
        try {
            cleansedAnnotation = (TextAnnotation) textAnnotation.clone();
        } catch (CloneNotSupportedException ce) {
            // Keep the output aligned with the input: a null marks an annotation that could
            // not be cloned.
            cleansed.add(null);
            continue;
        }

        for (String viewName : viewsToRemove) {
            cleansedAnnotation.removeView(viewName);
        }
        cleansed.add(cleansedAnnotation);
    }
    return cleansed;
}
/**
 * Removes every {@code Relation} from the {@code RELATIONVIEW} in the list of text annotations.
 *
 * <p>Only the sentence, token and {@code RELATIONVIEW} views are kept. In every remaining
 * {@code PredicateArgumentView} all relations are removed and each constituent is replaced by
 * an empty-labelled constituent covering the same token span.
 *
 * @param uncleansedAnnotations annotations to cleanse
 * @return the cleansed annotations as produced by {@code removeViews}; entries that
 *     {@code removeViews} reported as {@code null} are left as {@code null}
 */
private static List<TextAnnotation> removeRelationsFromPredicateArgumentView(
        List<TextAnnotation> uncleansedAnnotations) {
    List<String> relationExtractionViews = new ArrayList<>();
    relationExtractionViews.add(ViewNames.SENTENCE);
    relationExtractionViews.add(ViewNames.TOKENS);
    relationExtractionViews.add("RELATIONVIEW");

    List<TextAnnotation> textAnnotations =
            removeViews(uncleansedAnnotations, relationExtractionViews);
    for (TextAnnotation textAnnotation : textAnnotations) {
        if (textAnnotation == null) {
            // removeViews stores null for annotations it could not clone.
            continue;
        }
        Set<String> viewNames = textAnnotation.getAvailableViews();
        for (String viewName : viewNames) {
            View view = textAnnotation.getView(viewName);
            if (!(view instanceof PredicateArgumentView)) {
                continue;
            }
            PredicateArgumentView predicateArgumentView = (PredicateArgumentView) view;
            predicateArgumentView.removeAllRelations();

            // Iterate over a snapshot: the loop body removes from and adds to the view, which
            // must not happen on the collection that is being iterated.
            List<Constituent> snapshot = new ArrayList<>(predicateArgumentView.getConstituents());
            for (Constituent c : snapshot) {
                predicateArgumentView.removeConstituent(c);
                int start = c.getStartSpan();
                int end = c.getEndSpan();
                view.addConstituent(new Constituent("", "RELATIONVIEW", textAnnotation, start, end));
            }
        }
    }
    return textAnnotations;
}
/**
 * Walks up the {@code ViewNames.DEPENDENCY} tree from the node covering the input's first
 * token until a node whose POS tag is a verb is found.
 *
 * @param input the constituent to start from
 * @return a singleton list holding the verb node (linked back to {@code input} through
 *     {@code addPointerToSource}), or an empty list when a node without incoming relations
 *     is reached before any verb
 */
@Override
public List<Constituent> transform(Constituent input) {
    TextAnnotation ta = input.getTextAnnotation();
    int tokenPosition = input.getStartSpan();
    TreeView dependency = (TreeView) ta.getView(ViewNames.DEPENDENCY);

    Constituent current = dependency.getConstituentsCoveringToken(tokenPosition).get(0);
    while (!POSUtils.isPOSVerb(WordHelpers.getPOS(ta, current.getStartSpan()))) {
        List<Relation> incoming = current.getIncomingRelations();
        if (incoming == null || incoming.isEmpty()) {
            // No governor left to climb to and no verb was seen.
            return new ArrayList<>();
        }
        current = incoming.get(0).getSource();
    }
    return Collections.singletonList(addPointerToSource(input, current));
}
/**
 * Produces the single-token constituent that immediately follows the input, provided that
 * token still belongs to the input's sentence.
 *
 * @param input the constituent whose following token is requested
 * @return a singleton list with the following token (linked back to {@code input} through
 *     {@code addPointerToSource}), or an empty list when the input ends its sentence
 */
@Override
public List<Constituent> transform(Constituent input) {
    int tokenId = input.getEndSpan();
    TextAnnotation ta = input.getTextAnnotation();
    Sentence sentence = ta.getSentence(input.getSentenceId());

    // tokenId is used below as a token index into the whole TextAnnotation, so it has to be
    // compared against the sentence's end token index rather than against sentence.size().
    // NOTE(review): size() appears to be the sentence's token count, which equals the end
    // index only for the first sentence -- confirm against the Sentence API.
    if (tokenId >= sentence.getEndSpan()) {
        return new ArrayList<>();
    }

    Constituent next = new Constituent("", "", ta, tokenId, tokenId + 1);
    return Collections.singletonList(addPointerToSource(input, next));
}
@Override /** * This feature extractor assumes that the TOKEN View has been generated in the Constituents * TextAnnotation. It generate a feature for a window [-2, +2] of Forms (original text) for each * constituent. */ public Set<Feature> getFeatures(Constituent c) throws EdisonException { TextAnnotation ta = c.getTextAnnotation(); TOKENS = ta.getView(ViewNames.TOKENS); // We can assume that the constituent in this case is a Word(Token) int startspan = c.getStartSpan(); int endspan = c.getEndSpan(); // k is 3 since we need up to 3-grams int k = 3; int window = 2; // All our constituents are words(tokens) String[] forms = getWindowK(TOKENS, startspan, endspan, window); String id, value; String classifier = "WordConjunctionOneTwoThreeGramWindowTwo"; Set<Feature> result = new LinkedHashSet<>(); for (int j = 0; j < k; j++) { // k = 3, j goes from 0 to 2 for (int i = 0; i < forms.length; i++) { // forms.length = 5, So i goes from 0 to 4, for each String in the forms array. StringBuilder f = new StringBuilder(); // Starts with context = 0 and then increments context as long as it is below // the current value of j and is not out of index of the forms array. // This is basically creating a discrete feature for each combination of one, two // and three word combinations within [-2,2] window or words. for (int context = 0; context <= j && i + context < forms.length; context++) { // add a '_' between words to conjoin them together if (context != 0) { f.append("_"); } f.append(forms[i + context]); } // 2 is the center object in the array so i should go from -2 to +2 (with 0 being // the center) // j is the size of the n-gram so it goes 1 to 3 id = classifier + ":" + ((i - window) + "_" + (j + 1)); value = "(" + (f.toString()) + ")"; result.add(new DiscreteFeature(id + value)); } } return result; }
@Override public Set<Feature> getFeatures(Constituent c) throws EdisonException { TextAnnotation ta = c.getTextAnnotation(); Set<Feature> features = new LinkedHashSet<>(); TreeView parse = (TreeView) ta.getView(dependencyViewName); // get equivalent of c in the parse view Constituent c2 = parse.getConstituentsCoveringToken(c.getStartSpan()).get(0); List<Relation> incomingRelations = c2.getIncomingRelations(); if (incomingRelations.size() > 0) { Constituent c1 = parse .getConstituentsCoveringToken(incomingRelations.get(0).getSource().getStartSpan()) .get(0); Pair<List<Constituent>, List<Constituent>> paths = PathFeatureHelper.getPathsToCommonAncestor(c1, c2, 400); List<String> path = new ArrayList<>(); List<String> pos = new ArrayList<>(); for (int i = 0; i < paths.getFirst().size() - 1; i++) { Constituent cc = paths.getFirst().get(i); path.add( cc.getIncomingRelations().get(0).getRelationName() + PathFeatureHelper.PATH_UP_STRING); pos.add( WordHelpers.getPOS(ta, cc.getStartSpan()) + ":" + cc.getIncomingRelations().get(0).getRelationName() + PathFeatureHelper.PATH_UP_STRING); } Constituent top = paths.getFirst().get(paths.getFirst().size() - 1); pos.add(WordHelpers.getPOS(ta, top.getStartSpan()) + ":*"); path.add("*"); if (paths.getSecond().size() > 1) { for (int i = paths.getSecond().size() - 2; i >= 0; i--) { Constituent cc = paths.getSecond().get(i); pos.add( WordHelpers.getPOS(ta, cc.getStartSpan()) + ":" + PathFeatureHelper.PATH_DOWN_STRING); path.add(PathFeatureHelper.PATH_DOWN_STRING); } } features.addAll(getNgrams(path, "")); features.addAll(getNgrams(pos, "pos")); } return features; }
/**
 * Maps a constituent to the head word of its parse phrase in the
 * {@code ViewNames.PARSE_CHARNIAK} tree, as located by {@code CollinsHeadFinder}.
 *
 * @param c the constituent whose head word is requested
 * @return a singleton list with a one-token constituent at the head position, linked back to
 *     {@code c} through {@code addPointerToSource}
 * @throws RuntimeException wrapping any exception raised while locating the head word
 */
@Override
public List<Constituent> transform(Constituent c) {
    TextAnnotation ta = c.getTextAnnotation();
    TreeView tree = (TreeView) ta.getView(ViewNames.PARSE_CHARNIAK);
    try {
        int headPosition =
                CollinsHeadFinder.getInstance().getHeadWordPosition(tree.getParsePhrase(c));
        Constituent headWord = new Constituent("", "", ta, headPosition, headPosition + 1);
        return Collections.singletonList(addPointerToSource(c, headWord));
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
/**
 * Returns the view produced by {@code viewGenerator}, adding it to the annotation first when
 * it is missing.
 *
 * <p>The presence check is repeated inside a block synchronized on
 * {@code FeatureInputTransformer.class} before the view is added.
 *
 * @param ta the annotation to read the view from (and add it to, if absent)
 * @param viewGenerator the annotator that names and generates the view
 * @return the view registered under the generator's view name, cast to {@code TreeView}
 * @throws RuntimeException wrapping an {@code AnnotatorException} raised while adding the view
 */
private static TreeView getDependencyView(TextAnnotation ta, Annotator viewGenerator) {
    final String viewName = viewGenerator.getViewName();
    if (!ta.hasView(viewName)) {
        synchronized (FeatureInputTransformer.class) {
            // Check again under the lock before generating the view.
            if (!ta.hasView(viewName)) {
                try {
                    ta.addView(viewGenerator);
                } catch (AnnotatorException e) {
                    throw new RuntimeException(e);
                }
            }
        }
    }
    return (TreeView) ta.getView(viewName);
}
JsonObject writeTextAnnotation(TextAnnotation ta, boolean doWriteTokenOffsets) { // get rid of the views that are empty Set<String> viewNames = new HashSet<>(ta.getAvailableViews()); for (String vu : viewNames) { if (ta.getView(vu) == null) { logger.warn("View " + vu + " is null"); ta.removeView(vu); } } JsonObject json = new JsonObject(); writeString("corpusId", ta.getCorpusId(), json); writeString("id", ta.getId(), json); writeString("text", ta.getText(), json); writeStringArray("tokens", ta.getTokens(), json); if (doWriteTokenOffsets) writeTokenOffsets(TOKENOFFSETS, ta.getView(ViewNames.TOKENS), json); writeSentences(ta, json); JsonArray views = new JsonArray(); for (String viewName : Sorters.sortSet(ta.getAvailableViews())) { if (viewName.equals(ViewNames.SENTENCE)) continue; JsonObject view = new JsonObject(); writeString("viewName", viewName, view); views.add(view); JsonArray viewData = new JsonArray(); List<View> topKViews = ta.getTopKViews(viewName); for (View topKView : topKViews) { JsonObject kView = new JsonObject(); writeView(topKView, kView); viewData.add(kView); } view.add("viewData", viewData); } json.add("views", views); writeAttributes(ta, json); return json; }
/** * A test for {@link ParsePath} * * @author Daniel Khashabi */ public class TestParsePath { private static TextAnnotation tas = DummyTextAnnotationGenerator.generateAnnotatedTextAnnotation(false, 1); private List<Constituent> cons = tas.getView(ViewNames.PARSE_GOLD).getConstituents(); public static ParsePath parsePath = new ParsePath(ViewNames.PARSE_GOLD); private static Logger logger = LoggerFactory.getLogger(TestParsePath.class); // protected void setUp() throws Exception { // super.setUp(); // } Set<String> correctResponses = new HashSet<>( Arrays.asList( new String[] { "The construction of the John Smith library finished on time .->[]", "The construction of the John Smith library finished on time .->[S, , l=1.0]", "The construction of the John Smith library->[SvNP, , l=2.0]", "The construction->[NPvNP, , l=2.0]", "The->[NPvDT, , l=2.0]", "The->[DT, , l=1.0]", "construction->[NPvNN, , l=2.0]", "construction->[NN, , l=1.0]", "of the John Smith library->[NPvPP, , l=2.0]", "of->[PPvIN, , l=2.0]", "of->[IN, , l=1.0]", "the John Smith library->[PPvNP, , l=2.0]", "the->[NPvDT, , l=2.0]", "the->[DT, , l=1.0]", "John->[NPvNNP, , l=2.0]", "John->[NNP, , l=1.0]", "Smith->[NPvNNP, , l=2.0]", "Smith->[NNP, , l=1.0]", "library->[NPvNN, , l=2.0]", "library->[NN, , l=1.0]", "finished on time->[SvVP, , l=2.0]", "finished->[VPvVBD, , l=2.0]", "finished->[VBD, , l=1.0]", "on time->[VPvPP, , l=2.0]", "on->[PPvIN, , l=2.0]", "on->[IN, , l=1.0]", "time->[PPvNP, , l=2.0]", "time->[NP, , l=1.0]", "time->[NP, , l=1.0]", ".->[Sv., , l=2.0]", ".->[., , l=1.0]" })); @Test public final void testParsePath() throws Exception { logger.info(String.valueOf(cons.size())); logger.info(tas.getView(ViewNames.PARSE_GOLD).toString()); for (int i = 0; i < cons.size(); i++) { String prediction = cons.get(i).toString() + "->" + parsePath.getFeatures(cons.get(i)).toString(); assertTrue(correctResponses.contains(prediction)); } } }
/**
 * Writes the sentence view of {@code ta} under the {@code "sentences"} key of {@code json}:
 * the view's generator, its score, and the end span of every sentence.
 *
 * @param ta the annotation whose sentences are written
 * @param json the object that receives the {@code "sentences"} member
 */
private static void writeSentences(TextAnnotation ta, JsonObject json) {
    SpanLabelView sentenceView = (SpanLabelView) ta.getView(ViewNames.SENTENCE);

    JsonObject sentences = new JsonObject();
    writeString("generator", sentenceView.getViewGenerator(), sentences);
    writeDouble("score", sentenceView.getScore(), sentences);

    // The array is sized from the sentence view and filled from ta.sentences().
    int[] endPositions = new int[sentenceView.getNumberOfConstituents()];
    int next = 0;
    for (Sentence sentence : ta.sentences()) {
        endPositions[next] = sentence.getEndSpan();
        next++;
    }
    writeIntArray("sentenceEndPositions", endPositions, sentences);

    json.add("sentences", sentences);
}
/**
 * Logs the number of constituents and the gold parse view, then checks that the string
 * {@code "<constituent>->" + "<features>"} of every constituent is an accepted response.
 */
@Test
public final void testParsePath() throws Exception {
    logger.info(String.valueOf(cons.size()));
    logger.info(tas.getView(ViewNames.PARSE_GOLD).toString());

    for (Constituent constituent : cons) {
        String prediction =
                constituent.toString() + "->" + parsePath.getFeatures(constituent).toString();
        assertTrue(correctResponses.contains(prediction));
    }
}
/**
 * Removes all coreference relations from {@code COREF} View.
 *
 * <p>Only the sentence, token and coreference views are kept. Every remaining
 * {@code CoreferenceView} has its relations removed and is then registered again under its
 * view name.
 *
 * @param uncleansedAnnotations annotations to cleanse
 * @return the cleansed annotations as produced by {@code removeViews}; entries that
 *     {@code removeViews} reported as {@code null} are left as {@code null}
 */
private static List<TextAnnotation> removeCoreferenceRelations(
        List<TextAnnotation> uncleansedAnnotations) {
    List<String> coreferenceViews = new ArrayList<>();
    coreferenceViews.add(ViewNames.SENTENCE);
    coreferenceViews.add(ViewNames.TOKENS);
    coreferenceViews.add(ViewNames.COREF);

    List<TextAnnotation> textAnnotations = removeViews(uncleansedAnnotations, coreferenceViews);
    for (TextAnnotation textAnnotation : textAnnotations) {
        if (textAnnotation == null) {
            // removeViews stores null for annotations it could not clone.
            continue;
        }
        // Iterate over a copy of the names: addView is called inside the loop and must not
        // touch the collection that is being iterated.
        List<String> viewNames = new ArrayList<>(textAnnotation.getAvailableViews());
        for (String viewName : viewNames) {
            View view = textAnnotation.getView(viewName);
            if (view instanceof CoreferenceView) {
                CoreferenceView coreferenceView = (CoreferenceView) view;
                coreferenceView.removeAllRelations();
                textAnnotation.addView(viewName, coreferenceView);
            }
        }
    }
    return textAnnotations;
}
/**
 * Removes the label from the NER_GOLD_EXTENT_SPAN.
 *
 * <p>Only the sentence, token and {@code NER_GOLD_EXTENT_SPAN} views are kept. Each
 * constituent of the NER view is replaced by an empty-labelled constituent covering the same
 * token span, and the view is then registered again under its own name.
 *
 * @param cleansedAnnotations annotations to strip NER labels from
 * @return the cleansed annotations as produced by {@code removeViews}; entries that
 *     {@code removeViews} reported as {@code null} are left as {@code null}
 */
private static List<TextAnnotation> removeLabelsForNER(List<TextAnnotation> cleansedAnnotations) {
    List<String> nerViews = new ArrayList<>();
    nerViews.add(ViewNames.SENTENCE);
    nerViews.add(ViewNames.TOKENS);
    nerViews.add("NER_GOLD_EXTENT_SPAN");

    List<TextAnnotation> textAnnotations = removeViews(cleansedAnnotations, nerViews);
    for (TextAnnotation textAnnotation : textAnnotations) {
        if (textAnnotation == null) {
            // removeViews stores null for annotations it could not clone.
            continue;
        }
        View view = textAnnotation.getView("NER_GOLD_EXTENT_SPAN");

        // Iterate over a snapshot: the loop body removes from and adds to the view, which
        // must not happen on the collection that is being iterated.
        List<Constituent> constituents = new ArrayList<>(view.getConstituents());
        for (Constituent c : constituents) {
            view.removeConstituent(c);
            int start = c.getStartSpan();
            int end = c.getEndSpan();
            view.addConstituent(
                    new Constituent("", "NER_GOLD_EXTENT_SPAN", textAnnotation, start, end));
        }
        textAnnotation.addView(view.getViewName(), view);
    }
    return textAnnotations;
}
@Override public List<Constituent> transform(Constituent c) { TextAnnotation ta = c.getTextAnnotation(); int tokenPosition = c.getStartSpan(); TreeView dependency = (TreeView) ta.getView(ViewNames.DEPENDENCY); Constituent prepositionDepConstituent = dependency.getConstituentsCoveringToken(tokenPosition).get(0); List<Relation> incomingRelations = prepositionDepConstituent.getIncomingRelations(); List<Constituent> list = new ArrayList<>(); if (incomingRelations != null && incomingRelations.size() > 0) { Constituent parent = incomingRelations.get(0).getSource(); for (Relation out : parent.getOutgoingRelations()) { if (out == incomingRelations.get(0)) continue; String label = out.getRelationName(); if (label.contains("prep")) { Constituent ppNode = out.getTarget(); list.add(addPointerToSource(c, ppNode)); // get the first child of the pp and add this List<Relation> ppOut = ppNode.getOutgoingRelations(); if (ppOut != null && ppOut.size() != 0) { Constituent child = ppOut.get(0).getTarget(); list.add(addPointerToSource(c, child)); } } } } return list; }
/**
 * Rebuilds a {@code TextAnnotation} from its JSON string form.
 *
 * <p>Corpus id, id, text, tokens and sentence end positions are read from the top-level
 * object; token offsets are recomputed from the text and the tokens. Each element of the
 * {@code "views"} array is then added as a top-k view, followed by the attributes.
 *
 * @param string the JSON document
 * @return the reconstructed annotation
 * @throws Exception if reading any part of the document fails
 */
TextAnnotation readTextAnnotation(String string) throws Exception {
    JsonObject json = (JsonObject) new JsonParser().parse(string);

    String corpusId = readString("corpusId", json);
    String id = readString("id", json);
    String text = readString("text", json);
    String[] tokens = readStringArray("tokens", json);
    // Only the sentence end positions of the sentence record are needed here.
    int[] sentenceEndPositions = readSentences(json).getSecond();
    IntPair[] offsets = TokenUtils.getTokenOffsets(text, tokens);

    TextAnnotation ta =
            new TextAnnotation(corpusId, id, text, offsets, tokens, sentenceEndPositions);

    JsonArray views = json.getAsJsonArray("views");
    for (int viewIndex = 0; viewIndex < views.size(); viewIndex++) {
        JsonObject viewEntry = (JsonObject) views.get(viewIndex);
        String viewName = readString("viewName", viewEntry);
        JsonArray viewData = viewEntry.getAsJsonArray("viewData");

        List<View> alternatives = new ArrayList<>();
        for (int rank = 0; rank < viewData.size(); rank++) {
            alternatives.add(readView((JsonObject) viewData.get(rank), ta));
        }
        ta.addTopKView(viewName, alternatives);
    }

    readAttributes(ta, json);
    return ta;
}
/**
 * Renders the annotated text with every constituent of {@code view} bracketed in place.
 *
 * <p>Constituents are visited in the order given by
 * {@code TextAnnotationUtilities.constituentStartComparator}. The text between constituents is
 * copied unchanged, each constituent is written as {@code "[LABEL surface ] "}, and the text
 * that follows the last constituent is appended at the end.
 *
 * @return the bracketed string; the plain text when the view has no constituents
 */
protected String getNERString() {
    List<Constituent> constituents = new ArrayList<>(view.getConstituents());
    Collections.sort(constituents, TextAnnotationUtilities.constituentStartComparator);

    String text = textAnnotation.getText();
    StringBuilder sb = new StringBuilder();
    int where = 0;
    for (Constituent c : constituents) {
        int start = c.getStartCharOffset();
        // Copy the gap only when there is one: a constituent that starts before the current
        // position (overlapping or nested span) would otherwise make substring throw.
        if (start > where) {
            sb.append(text, where, start);
        }
        sb.append("[")
                .append(c.getLabel())
                .append(" ")
                .append(c.getTokenizedSurfaceForm())
                .append(" ] ");
        // Never move backwards, so text already emitted is not repeated.
        where = Math.max(where, c.getEndCharOffset());
    }

    // Text after the last constituent was previously dropped.
    if (where < text.length()) {
        sb.append(text, where, text.length());
    }
    return sb.toString();
}
/**
 * Builds a {@code TextAnnotation} from {@code data}, runs the NER annotator on it, and stores
 * both the annotation and the annotator's view in the corresponding fields.
 */
@Override
public void labelData() {
    textAnnotation = tab.createTextAnnotation(data);
    nerAnnotator.addView(textAnnotation);

    final String nerViewName = nerAnnotator.getViewName();
    view = textAnnotation.getView(nerViewName);
}
/**
 * Maps a constituent to the result of {@code getHead} on the
 * {@code ViewNames.DEPENDENCY_STANFORD} tree.
 *
 * @param c the constituent whose head is requested
 * @return a singleton list holding the value returned by {@code getHead}
 */
@Override
public List<Constituent> transform(Constituent c) {
    TreeView stanfordDependencies =
            (TreeView) c.getTextAnnotation().getView(ViewNames.DEPENDENCY_STANFORD);
    Constituent head = getHead(c, stanfordDependencies);
    return Collections.singletonList(head);
}
/**
 * Maps a constituent to the result of {@code getModifiers} on the
 * {@code ViewNames.DEPENDENCY_STANFORD} tree.
 *
 * @param c the constituent whose modifiers are requested
 * @return the list returned by {@code getModifiers}
 */
@Override
public List<Constituent> transform(Constituent c) {
    TreeView stanfordDependencies =
            (TreeView) c.getTextAnnotation().getView(ViewNames.DEPENDENCY_STANFORD);
    return getModifiers(c, stanfordDependencies);
}