public static void fillInParseAnnotations(
    boolean verbose, boolean buildGraphs, CoreMap sentence, Tree tree) {
  // make sure all tree nodes are CoreLabels
  // TODO: why isn't this always true? something fishy is going on
  ParserAnnotatorUtils.convertToCoreLabels(tree);

  // index nodes, i.e., add start and end token positions to all nodes
  // this is needed by other annotators downstream, e.g., the NFLAnnotator
  tree.indexSpans(0);

  sentence.set(TreeAnnotation.class, tree);
  if (verbose) {
    System.err.println("Tree is:");
    tree.pennPrint(System.err);
  }

  if (buildGraphs) {
    // generate the dependency graphs
    SemanticGraph deps = generateCollapsedDependencies(tree);
    SemanticGraph uncollapsedDeps = generateUncollapsedDependencies(tree);
    SemanticGraph ccDeps = generateCCProcessedDependencies(tree);
    if (verbose) {
      System.err.println("SDs:");
      System.err.println(deps.toString("plain"));
    }
    sentence.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, deps);
    sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, uncollapsedDeps);
    sentence.set(
        SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class, ccDeps);
  }

  setMissingTags(sentence, tree);
}
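// Usage sketch (illustrative, not from the original source): run a parsing
// pipeline, then re-attach the tree and dependency graphs per sentence.
// Assumptions: the method above is visible at this call site (its enclosing
// class is not shown), and the text is a placeholder.
import java.util.Properties;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.util.CoreMap;

public class FillInParseAnnotationsDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, parse");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation("Stanford is in California.");
    pipeline.annotate(doc);
    for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
      Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
      // verbose=false, buildGraphs=true: attach the tree plus the three
      // dependency graphs (hypothetical call site for the method above)
      fillInParseAnnotations(false, true, sentence, tree);
    }
  }
}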
public static void addFigerAnnotationToDocument(Annotation d) throws SQLException {
  List<CoreMap> sentences = d.get(CoreAnnotations.SentencesAnnotation.class);
  Set<String> entityIds = new HashSet<String>();
  for (CoreMap sen : sentences) {
    List<Triple<Pair<Integer, Integer>, String, Float>> nelAnnotation =
        sen.get(NamedEntityLinkingAnnotation.class);
    for (Triple<Pair<Integer, Integer>, String, Float> t : nelAnnotation) {
      String id = t.second;
      if (!id.equals("null")) {
        entityIds.add(id);
      }
    }
  }
  Map<String, Set<String>> idTypeMap = bigQuery(entityIds);
  // add type onto sentences
  for (CoreMap sen : sentences) {
    List<Triple<Pair<Integer, Integer>, String, Float>> nelAnnotation =
        sen.get(NamedEntityLinkingAnnotation.class);
    List<Triple<Set<String>, Integer, Integer>> figerData = new ArrayList<>();
    for (Triple<Pair<Integer, Integer>, String, Float> t : nelAnnotation) {
      Integer start = t.first.first;
      Integer end = t.first.second;
      Set<String> types = null;
      if (!t.second.equals("null")) {
        types = idTypeMap.get(GuidMidConversion.convertBackward(t.second));
      }
      Triple<Set<String>, Integer, Integer> figerTrip = new Triple<>(types, start, end);
      figerData.add(figerTrip);
    }
    sen.set(FigerAnnotation.class, figerData);
  }
}
private static void addLemma(
    Morphology morpha,
    Class<? extends CoreAnnotation<String>> ann,
    CoreMap map,
    String word,
    String tag) {
  if (tag.length() > 0) {
    String phrasalVerb = phrasalVerb(morpha, word, tag);
    if (phrasalVerb == null) {
      map.set(ann, morpha.lemma(word, tag));
    } else {
      map.set(ann, phrasalVerb);
    }
  } else {
    map.set(ann, morpha.stem(word));
  }
}
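// Context sketch (illustrative): the two Morphology calls this helper wraps.
// lemma(word, tag) applies tag-sensitive lemmatization; stem(word) is the
// tag-free fallback used when no POS tag is available.
import edu.stanford.nlp.process.Morphology;

public class LemmaDemo {
  public static void main(String[] args) {
    Morphology morpha = new Morphology();
    System.out.println(morpha.lemma("ran", "VBD")); // tag-sensitive path: "run"
    System.out.println(morpha.stem("running"));     // tag-free fallback path
  }
}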
private List<CoreMap> toCoreMaps(
    CoreMap annotation, List<TimeExpression> timeExpressions, SUTime.TimeIndex timeIndex) {
  if (timeExpressions == null) return null;
  List<CoreMap> coreMaps = new ArrayList<CoreMap>(timeExpressions.size());
  for (TimeExpression te : timeExpressions) {
    CoreMap cm = te.getAnnotation();
    SUTime.Temporal temporal = te.getTemporal();
    if (temporal != null) {
      String origText = annotation.get(CoreAnnotations.TextAnnotation.class);
      String text = cm.get(CoreAnnotations.TextAnnotation.class);
      if (origText != null) {
        // Make sure the text is from original (and not from concatenated tokens)
        ChunkAnnotationUtils.annotateChunkText(cm, annotation);
        text = cm.get(CoreAnnotations.TextAnnotation.class);
      }
      Map<String, String> timexAttributes;
      try {
        timexAttributes = temporal.getTimexAttributes(timeIndex);
        if (options.includeRange) {
          SUTime.Temporal rangeTemporal = temporal.getRange();
          if (rangeTemporal != null) {
            timexAttributes.put("range", rangeTemporal.toString());
          }
        }
      } catch (Exception e) {
        logger.log(
            Level.WARNING, "Failed to get attributes from " + text + ", timeIndex " + timeIndex, e);
        continue;
      }
      Timex timex;
      try {
        timex = Timex.fromMap(text, timexAttributes);
      } catch (Exception e) {
        logger.log(
            Level.WARNING, "Failed to process " + text + " with attributes " + timexAttributes, e);
        continue;
      }
      cm.set(TimexAnnotation.class, timex);
      if (timex != null) {
        coreMaps.add(cm);
      } else {
        logger.warning("No timex expression for: " + text);
      }
    }
  }
  return coreMaps;
}
public List<TimeExpression> extractTimeExpressions(CoreMap annotation, String docDateStr) {
  List<CoreMap> mergedNumbers = NumberNormalizer.findAndMergeNumbers(annotation);
  annotation.set(CoreAnnotations.NumerizedTokensAnnotation.class, mergedNumbers);

  // TODO: docDate may not have century....
  SUTime.Time docDate = timexPatterns.parseDateTime(docDateStr);

  List<? extends MatchedExpression> matchedExpressions =
      expressionExtractor.extractExpressions(annotation);
  List<TimeExpression> timeExpressions =
      new ArrayList<TimeExpression>(matchedExpressions.size());
  for (MatchedExpression expr : matchedExpressions) {
    if (expr instanceof TimeExpression) {
      timeExpressions.add((TimeExpression) expr);
    } else {
      timeExpressions.add(new TimeExpression(expr));
    }
  }

  // Add back nested time expressions for ranges....
  // For now only one level of nesting...
  if (options.includeNested) {
    List<TimeExpression> nestedTimeExpressions = new ArrayList<TimeExpression>();
    for (TimeExpression te : timeExpressions) {
      if (te.isIncludeNested()) {
        List<? extends CoreMap> children =
            te.getAnnotation().get(TimeExpression.ChildrenAnnotation.class);
        if (children != null) {
          for (CoreMap child : children) {
            TimeExpression childTe = child.get(TimeExpression.Annotation.class);
            if (childTe != null) {
              nestedTimeExpressions.add(childTe);
            }
          }
        }
      }
    }
    timeExpressions.addAll(nestedTimeExpressions);
  }
  Collections.sort(timeExpressions, MatchedExpression.EXPR_TOKEN_OFFSETS_NESTED_FIRST_COMPARATOR);
  timeExpressions = filterInvalidTimeExpressions(timeExpressions);

  // Some resolving is done even if docDate null...
  if ( /*docDate != null && */ timeExpressions != null) {
    resolveTimeExpressions(annotation, timeExpressions, docDate);
  }
  // Annotate timex
  return timeExpressions;
}
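// Usage sketch (illustrative): this extractor is normally driven through a
// TimeAnnotator in a CoreNLP pipeline rather than called directly. The
// DocDateAnnotation below plays the role of docDateStr: it anchors relative
// expressions such as "last Friday". Text and date are placeholders.
import java.util.Properties;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.time.TimeAnnotations;
import edu.stanford.nlp.time.TimeAnnotator;
import edu.stanford.nlp.util.CoreMap;

public class SUTimeDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    pipeline.addAnnotator(new TimeAnnotator("sutime", props));
    Annotation doc = new Annotation("We met last Friday and fly home in two weeks.");
    doc.set(CoreAnnotations.DocDateAnnotation.class, "2013-07-14");
    pipeline.annotate(doc);
    for (CoreMap timex : doc.get(TimeAnnotations.TimexAnnotations.class)) {
      System.out.println(timex + " -> " + timex.get(TimeAnnotations.TimexAnnotation.class));
    }
  }
}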
/** Reads an annotation from the given filename using the requested input. */
public static List<Annotation> getAnnotations(
    StanfordCoreNLP tokenizer, Input inputFormat, String filename, boolean filterUnknown) {
  switch (inputFormat) {
    case TEXT:
      {
        String text = IOUtils.slurpFileNoExceptions(filename);
        Annotation annotation = new Annotation(text);
        tokenizer.annotate(annotation);
        List<Annotation> annotations = Generics.newArrayList();
        for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
          Annotation nextAnnotation =
              new Annotation(sentence.get(CoreAnnotations.TextAnnotation.class));
          nextAnnotation.set(
              CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence));
          annotations.add(nextAnnotation);
        }
        return annotations;
      }
    case TREES:
      {
        List<Tree> trees;
        if (filterUnknown) {
          trees = SentimentUtils.readTreesWithGoldLabels(filename);
          trees = SentimentUtils.filterUnknownRoots(trees);
        } else {
          trees = Generics.newArrayList();
          MemoryTreebank treebank = new MemoryTreebank("utf-8");
          treebank.loadPath(filename, null);
          for (Tree tree : treebank) {
            trees.add(tree);
          }
        }
        List<Annotation> annotations = Generics.newArrayList();
        for (Tree tree : trees) {
          CoreMap sentence = new Annotation(Sentence.listToString(tree.yield()));
          sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
          List<CoreMap> sentences = Collections.singletonList(sentence);
          Annotation annotation = new Annotation("");
          annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
          annotations.add(annotation);
        }
        return annotations;
      }
    default:
      throw new IllegalArgumentException("Unknown format " + inputFormat);
  }
}
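// Usage sketch (illustrative): read raw text as one single-sentence
// Annotation per sentence. Assumes getAnnotations and its Input enum are
// statically imported from the enclosing class (this reader lives in the
// CoreNLP sentiment tooling); "reviews.txt" is a placeholder path.
import java.util.List;
import java.util.Properties;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class GetAnnotationsDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    // a tokenize/ssplit-only pipeline suffices: the TEXT branch above only
    // needs sentence splitting before it re-wraps each sentence
    props.setProperty("annotators", "tokenize, ssplit");
    StanfordCoreNLP tokenizer = new StanfordCoreNLP(props);
    List<Annotation> docs = getAnnotations(tokenizer, Input.TEXT, "reviews.txt", false);
    System.out.println("Read " + docs.size() + " single-sentence annotations");
  }
}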
/**
 * Converts NamedEntityTagAnnotation tags into {@link EntityMention}s. This finds the longest
 * sequence of NamedEntityTagAnnotation tags of the same type.
 *
 * @param sentence A sentence annotated with NamedEntityTagAnnotation
 */
public void makeAnnotationFromAllNERTags(CoreMap sentence) {
  List<CoreLabel> words = sentence.get(CoreAnnotations.TokensAnnotation.class);
  List<EntityMention> mentions =
      sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
  assert words != null;
  if (mentions == null) {
    this.logger.info("mentions are null");
    mentions = new ArrayList<>();
  }

  for (int start = 0; start < words.size(); start++) {
    int end;
    // find the end of the run of tokens that share the same (non-background) NE tag
    String lastneTag = null;
    String ne = null;
    for (end = start; end < words.size(); end++) {
      ne = words.get(end).get(NamedEntityTagAnnotation.class);
      if (ne.equals(SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL)
          || (lastneTag != null && !ne.equals(lastneTag))) {
        break;
      }
      lastneTag = ne;
    }

    if (end > start) {
      // found a match!
      String entityType = this.getEntityTypeForTag(lastneTag);
      EntityMention m =
          EntityMentionFactory.constructEntityMention(
              EntityMention.makeUniqueId(),
              sentence,
              new Span(start, end),
              new Span(start, end),
              entityType,
              null,
              null);
      // TODO: changed entityType in the above sentence to nerTag - Sonal
      logger.info("Created " + entityType + " entity mention: " + m);
      start = end - 1;
      mentions.add(m);
    }
  }

  sentence.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, mentions);
}
/**
 * Converts NamedEntityTagAnnotation tags into {@link EntityMention}s. This finds the longest
 * sequence of NamedEntityTagAnnotation tags of the matching type.
 *
 * @param sentence A sentence, ideally annotated with NamedEntityTagAnnotation
 * @param nerTag The name of the NER tag to copy, e.g. "DATE".
 * @param entityType The type of the {@link EntityMention} objects created
 */
public static void makeAnnotationFromGivenNERTag(
    CoreMap sentence, String nerTag, String entityType) {
  List<CoreLabel> words = sentence.get(CoreAnnotations.TokensAnnotation.class);
  List<EntityMention> mentions =
      sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
  assert words != null;
  assert mentions != null;

  for (int start = 0; start < words.size(); start++) {
    int end;
    // find the first token after start that isn't of nerType
    for (end = start; end < words.size(); end++) {
      String ne = words.get(end).get(NamedEntityTagAnnotation.class);
      if (!ne.equals(nerTag)) {
        break;
      }
    }

    if (end > start) {
      // found a match!
      EntityMention m =
          EntityMentionFactory.constructEntityMention(
              EntityMention.makeUniqueId(),
              sentence,
              new Span(start, end),
              new Span(start, end),
              entityType,
              null,
              null);
      logger.info("Created " + entityType + " entity mention: " + m);
      start = end - 1;
      mentions.add(m);
    }
  }

  sentence.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, mentions);
}
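// Usage sketch (illustrative): run NER, seed an empty mention list (the
// method above asserts it is non-null), then lift all DATE tags into
// EntityMentions. Assumes makeAnnotationFromGivenNERTag is statically
// imported from its enclosing class; the text is a placeholder.
import java.util.ArrayList;
import java.util.Properties;
import edu.stanford.nlp.ie.machinereading.structure.EntityMention;
import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class NerToMentionsDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation("Shakespeare was born on April 23, 1564.");
    pipeline.annotate(doc);
    for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
      sentence.set(
          MachineReadingAnnotations.EntityMentionsAnnotation.class,
          new ArrayList<EntityMention>());
      makeAnnotationFromGivenNERTag(sentence, "DATE", "DATE");
    }
  }
}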
@Override
public Document nextDoc() throws Exception {
  List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>();
  List<Tree> allTrees = new ArrayList<Tree>();

  CoNLL2011DocumentReader.Document conllDoc = reader.getNextDocument();
  if (conllDoc == null) {
    return null;
  }

  Annotation anno = conllDoc.getAnnotation();
  List<CoreMap> sentences = anno.get(CoreAnnotations.SentencesAnnotation.class);
  for (CoreMap sentence : sentences) {
    if (!Constants.USE_GOLD_PARSES && !replicateCoNLL) {
      // Remove tree from annotation and replace with parse using stanford parser
      sentence.remove(TreeCoreAnnotations.TreeAnnotation.class);
    } else {
      Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
      // generate the dependency graph
      try {
        SemanticGraph deps =
            SemanticGraphFactory.makeFromTree(
                tree, SemanticGraphFactory.Mode.COLLAPSED, includeExtras, lemmatize, threadSafe);
        SemanticGraph basicDeps =
            SemanticGraphFactory.makeFromTree(
                tree, SemanticGraphFactory.Mode.BASIC, includeExtras, lemmatize, threadSafe);
        sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, basicDeps);
        sentence.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, deps);
      } catch (Exception e) {
        logger.log(
            Level.WARNING,
            "Exception caught during extraction of Stanford dependencies. Will ignore and continue...",
            e);
      }
    }
  }

  String preSpeaker = null;
  int utterance = -1;
  for (CoreLabel token : anno.get(CoreAnnotations.TokensAnnotation.class)) {
    if (!token.containsKey(CoreAnnotations.SpeakerAnnotation.class)) {
      token.set(CoreAnnotations.SpeakerAnnotation.class, "");
    }
    String curSpeaker = token.get(CoreAnnotations.SpeakerAnnotation.class);
    if (!curSpeaker.equals(preSpeaker)) {
      utterance++;
      preSpeaker = curSpeaker;
    }
    token.set(CoreAnnotations.UtteranceAnnotation.class, utterance);
  }

  // Run pipeline
  stanfordProcessor.annotate(anno);

  for (CoreMap sentence : anno.get(CoreAnnotations.SentencesAnnotation.class)) {
    allWords.add(sentence.get(CoreAnnotations.TokensAnnotation.class));
    allTrees.add(sentence.get(TreeCoreAnnotations.TreeAnnotation.class));
  }

  // Initialize gold mentions
  List<List<Mention>> allGoldMentions = extractGoldMentions(conllDoc);

  List<List<Mention>> allPredictedMentions;
  if (Constants.USE_GOLD_MENTIONS) {
    // allPredictedMentions = allGoldMentions;
    // Make copy of gold mentions since mentions may be later merged, mentionID's changed and stuff
    allPredictedMentions = makeCopy(allGoldMentions);
  } else if (Constants.USE_GOLD_MENTION_BOUNDARIES) {
    allPredictedMentions =
        ((RuleBasedCorefMentionFinder) mentionFinder)
            .filterPredictedMentions(allGoldMentions, anno, dictionaries);
  } else {
    allPredictedMentions = mentionFinder.extractPredictedMentions(anno, maxID, dictionaries);
  }

  try {
    recallErrors(allGoldMentions, allPredictedMentions, anno);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }

  Document doc = arrange(anno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
  doc.conllDoc = conllDoc;
  return doc;
}
@Override
public Document nextDoc() throws Exception {
  List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>();
  List<Tree> allTrees = new ArrayList<Tree>();
  List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>();
  List<List<Mention>> allPredictedMentions;
  List<CoreMap> allSentences = new ArrayList<CoreMap>();
  Annotation docAnno = new Annotation("");

  Pattern docPattern =
      Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
  Pattern sentencePattern =
      Pattern.compile(
          "(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)",
          Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
  Matcher docMatcher = docPattern.matcher(fileContents);
  if (!docMatcher.find(currentOffset)) return null;
  currentOffset = docMatcher.end();
  String doc = docMatcher.group(1);
  Matcher sentenceMatcher = sentencePattern.matcher(doc);
  String ner = null;

  // Maintain current document ID.
  Pattern docIDPattern =
      Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
  Matcher docIDMatcher = docIDPattern.matcher(doc);
  if (docIDMatcher.find()) currentDocumentID = docIDMatcher.group(1);
  else currentDocumentID = "documentAfter " + currentDocumentID;

  while (sentenceMatcher.find()) {
    String sentenceString = sentenceMatcher.group(2);
    List<CoreLabel> words =
        tokenizerFactory.getTokenizer(new StringReader(sentenceString)).tokenize();

    // FIXING TOKENIZATION PROBLEMS
    for (int i = 0; i < words.size(); i++) {
      CoreLabel w = words.get(i);
      if (i > 0 && w.word().equals("$")) {
        if (!words.get(i - 1).word().endsWith("PRP") && !words.get(i - 1).word().endsWith("WP"))
          continue;
        words.get(i - 1).set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "$");
        words.remove(i);
        i--;
      } else if (w.word().equals("\\/")) {
        if (words.get(i - 1).word().equals("</COREF>")) continue;
        w.set(
            CoreAnnotations.TextAnnotation.class,
            words.get(i - 1).word() + "\\/" + words.get(i + 1).word());
        words.remove(i + 1);
        words.remove(i - 1);
      }
    }
    // END FIXING TOKENIZATION PROBLEMS

    List<CoreLabel> sentence = new ArrayList<CoreLabel>();
    // MUC accepts embedded coref mentions, so we need to keep a stack for the
    // mentions currently open
    Stack<Mention> stack = new Stack<Mention>();
    List<Mention> mentions = new ArrayList<Mention>();

    allWords.add(sentence);
    allGoldMentions.add(mentions);

    for (CoreLabel word : words) {
      String w = word.get(CoreAnnotations.TextAnnotation.class);
      // found regular token: WORD/POS
      if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length() - 2) {
        int i = w.lastIndexOf("\\/");
        String w1 = w.substring(0, i);
        // we do NOT set POS info here. We take the POS tags from the parser!
        word.set(CoreAnnotations.TextAnnotation.class, w1);
        word.remove(CoreAnnotations.OriginalTextAnnotation.class);
        if (Constants.USE_GOLD_NE) {
          if (ner != null) {
            word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
          } else {
            word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
          }
        }
        sentence.add(word);
      }
      // found the start SGML tag for a NE, e.g., "<ORGANIZATION>"
      else if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) {
        Pattern nerPattern = Pattern.compile("<(.*?)>");
        Matcher m = nerPattern.matcher(w);
        m.find();
        ner = m.group(1);
      }
      // found the end SGML tag for a NE, e.g., "</ORGANIZATION>"
      else if (w.startsWith("</") && !w.startsWith("</COREF")) {
        Pattern nerPattern = Pattern.compile("</(.*?)>");
        Matcher m = nerPattern.matcher(w);
        m.find();
        String ner1 = m.group(1);
        if (ner != null && !ner.equals(ner1))
          throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1);
        ner = null;
      }
      // found the start SGML tag for a coref mention
      else if (w.startsWith("<COREF")) {
        Mention mention = new Mention();
        // position of this mention in the sentence
        mention.startIndex = sentence.size();

        // extract GOLD info about this coref chain. needed for eval
        Pattern idPattern = Pattern.compile("ID=\"(.*?)\"");
        Pattern refPattern = Pattern.compile("REF=\"(.*?)\"");

        Matcher m = idPattern.matcher(w);
        m.find();
        mention.mentionID = Integer.parseInt(m.group(1));

        m = refPattern.matcher(w);
        if (m.find()) {
          mention.originalRef = Integer.parseInt(m.group(1));
        }

        // open mention. keep track of all open mentions using the stack
        stack.push(mention);
      }
      // found the end SGML tag for a coref mention
      else if (w.equals("</COREF>")) {
        Mention mention = stack.pop();
        mention.endIndex = sentence.size();

        // this is a closed mention. add it to the final list of mentions
        // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID,
        //     mention.originalRef);
        mentions.add(mention);
      } else {
        word.remove(CoreAnnotations.OriginalTextAnnotation.class);
        if (Constants.USE_GOLD_NE) {
          if (ner != null) {
            word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
          } else {
            word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
          }
        }
        sentence.add(word);
      }
    }

    StringBuilder textContent = new StringBuilder();
    for (int i = 0; i < sentence.size(); i++) {
      CoreLabel w = sentence.get(i);
      w.set(CoreAnnotations.IndexAnnotation.class, i + 1);
      w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
      if (i > 0) textContent.append(" ");
      textContent.append(w.getString(CoreAnnotations.TextAnnotation.class));
    }
    CoreMap sentCoreMap = new Annotation(textContent.toString());
    allSentences.add(sentCoreMap);
    sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence);
  }

  // assign goldCorefClusterID
  Map<Integer, Mention> idMention = Generics.newHashMap(); // temporary use
  for (List<Mention> goldMentions : allGoldMentions) {
    for (Mention m : goldMentions) {
      idMention.put(m.mentionID, m);
    }
  }
  for (List<Mention> goldMentions : allGoldMentions) {
    for (Mention m : goldMentions) {
      if (m.goldCorefClusterID == -1) {
        if (m.originalRef == -1) m.goldCorefClusterID = m.mentionID;
        else {
          int ref = m.originalRef;
          while (true) {
            Mention m2 = idMention.get(ref);
            if (m2.goldCorefClusterID != -1) {
              m.goldCorefClusterID = m2.goldCorefClusterID;
              break;
            } else if (m2.originalRef == -1) {
              m2.goldCorefClusterID = m2.mentionID;
              m.goldCorefClusterID = m2.goldCorefClusterID;
              break;
            } else {
              ref = m2.originalRef;
            }
          }
        }
      }
    }
  }

  docAnno.set(CoreAnnotations.SentencesAnnotation.class, allSentences);
  stanfordProcessor.annotate(docAnno);

  if (allSentences.size() != allWords.size())
    throw new IllegalStateException("allSentences != allWords");
  for (int i = 0; i < allSentences.size(); i++) {
    List<CoreLabel> annotatedSent =
        allSentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
    List<CoreLabel> unannotatedSent = allWords.get(i);
    List<Mention> mentionInSent = allGoldMentions.get(i);
    for (Mention m : mentionInSent) {
      m.dependency =
          allSentences
              .get(i)
              .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
    }
    if (annotatedSent.size() != unannotatedSent.size()) {
      throw new IllegalStateException("annotatedSent != unannotatedSent");
    }
    for (int j = 0, sz = annotatedSent.size(); j < sz; j++) {
      CoreLabel annotatedWord = annotatedSent.get(j);
      CoreLabel unannotatedWord = unannotatedSent.get(j);
      if (!annotatedWord
          .get(CoreAnnotations.TextAnnotation.class)
          .equals(unannotatedWord.get(CoreAnnotations.TextAnnotation.class))) {
        throw new IllegalStateException("annotatedWord != unannotatedWord");
      }
    }
    allWords.set(i, annotatedSent);
    allTrees.add(allSentences.get(i).get(TreeCoreAnnotations.TreeAnnotation.class));
  }

  // extract predicted mentions
  if (Constants.USE_GOLD_MENTIONS) allPredictedMentions = allGoldMentions;
  else allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);

  // add the relevant fields to mentions and order them for coref
  return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
}
public void annotate(CoreMap document) throws IOException {
  // write input file in GUTime format
  Element inputXML = toInputXML(document);
  File inputFile = File.createTempFile("gutime", ".input");

  // Document doc = new Document(inputXML);

  PrintWriter inputWriter = new PrintWriter(inputFile);
  inputWriter.println(inputXML.toXML());
  // new XMLOutputter().output(inputXML, inputWriter);
  inputWriter.close();

  boolean useFirstDate =
      (!document.has(CoreAnnotations.CalendarAnnotation.class)
          && !document.has(CoreAnnotations.DocDateAnnotation.class));
  ArrayList<String> args = new ArrayList<String>();
  args.add("perl");
  args.add("-I" + this.gutimePath.getPath());
  args.add(new File(this.gutimePath, "TimeTag.pl").getPath());
  if (useFirstDate) args.add("-FDNW");
  args.add(inputFile.getPath());

  // run GUTime on the input file
  ProcessBuilder process = new ProcessBuilder(args);
  StringWriter outputWriter = new StringWriter();
  SystemUtils.run(process, outputWriter, null);
  String output = outputWriter.getBuffer().toString();
  Pattern docClose = Pattern.compile("</DOC>.*", Pattern.DOTALL);
  output = docClose.matcher(output).replaceAll("</DOC>");

  // parse the GUTime output
  Element outputXML;
  try {
    Document newNodeDocument = new Builder().build(output, "");
    outputXML = newNodeDocument.getRootElement();
  } catch (ParsingException ex) {
    throw new RuntimeException(
        String.format(
            "error:\n%s\ninput:\n%s\noutput:\n%s", ex, IOUtils.slurpFile(inputFile), output));
  }
  /*
  try {
    outputXML = new SAXBuilder().build(new StringReader(output)).getRootElement();
  } catch (JDOMException e) {
    throw new RuntimeException(String.format("error:\n%s\ninput:\n%s\noutput:\n%s", e,
        IOUtils.slurpFile(inputFile), output));
  }
  */
  inputFile.delete();

  // get Timex annotations
  List<CoreMap> timexAnns = toTimexCoreMaps(outputXML, document);
  document.set(TimexAnnotations.class, timexAnns);
  if (outputResults) {
    System.out.println(timexAnns);
  }

  // align Timex annotations to sentences
  int timexIndex = 0;
  for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
    int sentBegin = beginOffset(sentence);
    int sentEnd = endOffset(sentence);

    // skip times before the sentence
    while (timexIndex < timexAnns.size() && beginOffset(timexAnns.get(timexIndex)) < sentBegin) {
      ++timexIndex;
    }

    // determine times within the sentence
    int sublistBegin = timexIndex;
    int sublistEnd = timexIndex;
    while (timexIndex < timexAnns.size()
        && sentBegin <= beginOffset(timexAnns.get(timexIndex))
        && endOffset(timexAnns.get(timexIndex)) <= sentEnd) {
      ++sublistEnd;
      ++timexIndex;
    }

    // set the sentence timexes
    sentence.set(TimexAnnotations.class, timexAnns.subList(sublistBegin, sublistEnd));
  }
}
private static List<CoreMap> toTimexCoreMaps(Element docElem, CoreMap originalDocument) {
  // --Collect Token Offsets
  HashMap<Integer, Integer> beginMap = new HashMap<Integer, Integer>();
  HashMap<Integer, Integer> endMap = new HashMap<Integer, Integer>();
  boolean haveTokenOffsets = true;
  for (CoreMap sent : originalDocument.get(CoreAnnotations.SentencesAnnotation.class)) {
    for (CoreLabel token : sent.get(CoreAnnotations.TokensAnnotation.class)) {
      Integer tokBegin = token.get(CoreAnnotations.TokenBeginAnnotation.class);
      Integer tokEnd = token.get(CoreAnnotations.TokenEndAnnotation.class);
      if (tokBegin == null || tokEnd == null) {
        haveTokenOffsets = false;
      }
      int charBegin = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
      int charEnd = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
      beginMap.put(charBegin, tokBegin);
      endMap.put(charEnd, tokEnd);
    }
  }

  // --Set Timexes
  List<CoreMap> timexMaps = new ArrayList<CoreMap>();
  int offset = 0;
  Element textElem = docElem.getFirstChildElement("text");
  for (int i = 0; i < textElem.getChildCount(); i++) {
    Node content = textElem.getChild(i);
    if (content instanceof Text) {
      Text text = (Text) content;
      offset += text.getValue().length();
    } else if (content instanceof Element) {
      Element child = (Element) content;
      if (child.getLocalName().equals("TIMEX3")) {
        Timex timex = new Timex(child);
        if (child.getChildCount() != 1) {
          throw new RuntimeException("TIMEX3 should only contain text " + child);
        }
        String timexText = child.getValue();
        CoreMap timexMap = new ArrayCoreMap();
        // (timex)
        timexMap.set(TimexAnnotation.class, timex);
        // (text)
        timexMap.set(CoreAnnotations.TextAnnotation.class, timexText);
        // (characters)
        int charBegin = offset;
        timexMap.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, charBegin);
        offset += timexText.length();
        int charEnd = offset;
        timexMap.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, charEnd);
        // (tokens)
        if (haveTokenOffsets) {
          Integer tokBegin = beginMap.get(charBegin);
          int searchStep = 1;
          // if no exact match, search around the character offset
          while (tokBegin == null) {
            tokBegin = beginMap.get(charBegin - searchStep);
            if (tokBegin == null) {
              tokBegin = beginMap.get(charBegin + searchStep);
            }
            searchStep += 1;
          }
          searchStep = 1;
          Integer tokEnd = endMap.get(charEnd);
          while (tokEnd == null) {
            tokEnd = endMap.get(charEnd - searchStep);
            if (tokEnd == null) {
              tokEnd = endMap.get(charEnd + searchStep);
            }
            searchStep += 1;
          }
          timexMap.set(CoreAnnotations.TokenBeginAnnotation.class, tokBegin);
          timexMap.set(CoreAnnotations.TokenEndAnnotation.class, tokEnd);
        }
        // (add)
        timexMaps.add(timexMap);
      } else {
        throw new RuntimeException("unexpected element " + child);
      }
    } else {
      throw new RuntimeException("unexpected content " + content);
    }
  }
  return timexMaps;
}
/**
 * Label entities in an ExtractionSentence. Assumes the classifier has already been trained.
 *
 * @param sentence ExtractionSentence that we want to extract entities from
 * @return an ExtractionSentence with text content, tree and entities set. Relations will not be
 *     set.
 */
private CoreMap extractEntities(CoreMap sentence, int sentCount) {
  // don't add answer annotations
  List<CoreLabel> testSentence =
      AnnotationUtils.sentenceEntityMentionsToCoreLabels(
          sentence, false, annotationsToSkip, null, useSubTypes, useBIO);

  // now label the sentence
  List<CoreLabel> annotatedSentence = this.classifier.classify(testSentence);
  if (logger.isLoggable(Level.FINEST)) {
    logger.finest("CLASSIFIER OUTPUT: " + annotatedSentence);
  }

  List<EntityMention> extractedEntities = new ArrayList<>();
  int i = 0;

  // variables which keep track of partially seen entities (i.e. we've seen
  // some but not all the words in them so far)
  String lastType = null;
  int startIndex = -1;

  //
  // note that labels may be in the BIO or just the IO format. we must handle both transparently
  //
  for (CoreLabel label : annotatedSentence) {
    String type = label.get(AnswerAnnotation.class);
    if (type.equals(SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL)) {
      type = null;
    }

    // this is an entity end boundary followed by O
    if (type == null && lastType != null) {
      makeEntityMention(sentence, startIndex, i, lastType, extractedEntities, sentCount);
      logger.info("Found entity: " + extractedEntities.get(extractedEntities.size() - 1));
      startIndex = -1;
    }

    // entity start preceded by an O
    else if (lastType == null && type != null) {
      startIndex = i;
    }

    // entity end followed by another entity of different type
    else if (lastType != null
        && type != null
        && (type.startsWith("B-")
            || (lastType.startsWith("I-") && type.startsWith("I-") && !lastType.equals(type))
            || (notBIO(lastType) && notBIO(type) && !lastType.equals(type)))) {
      makeEntityMention(sentence, startIndex, i, lastType, extractedEntities, sentCount);
      logger.info("Found entity: " + extractedEntities.get(extractedEntities.size() - 1));
      startIndex = i;
    }

    lastType = type;
    i++;
  }

  // replace the original annotation with the predicted entities
  sentence.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, extractedEntities);

  logger.finest("EXTRACTED ENTITIES: ");
  for (EntityMention e : extractedEntities) {
    if (logger.isLoggable(Level.FINEST)) {
      logger.finest("\t" + e);
    }
  }

  postprocessSentence(sentence, sentCount);

  return sentence;
}
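// Standalone sketch (illustrative, not the classifier path): the boundary
// cases above are easiest to see on a toy label sequence. This is a
// simplified re-implementation of just the BIO/IO span decoding; the labels
// and the base()-comparison rule are hypothetical simplifications.
import java.util.ArrayList;
import java.util.List;

public class BioDecodeDemo {
  // strip a leading "B-" or "I-" so BIO and plain IO labels compare alike
  private static String base(String t) {
    return t.replaceFirst("^[BI]-", "");
  }

  public static void main(String[] args) {
    // mixed BIO ("B-PER", "I-PER") and plain IO ("ORG", "LOC") labels
    String[] labels = {"O", "B-PER", "I-PER", "O", "ORG", "ORG", "LOC"};
    List<String> spans = new ArrayList<>();
    String lastType = null;
    int start = -1;
    for (int i = 0; i <= labels.length; i++) { // one extra step flushes the tail
      String type = (i < labels.length && !labels[i].equals("O")) ? labels[i] : null;
      boolean boundary =
          type == null
              || lastType == null
              || type.startsWith("B-")
              || !base(type).equals(base(lastType));
      if (lastType != null && boundary) {
        spans.add(base(lastType) + "[" + start + "," + i + ")");
      }
      if (type != null && boundary) {
        start = i;
      }
      lastType = type;
    }
    System.out.println(spans); // [PER[1,3), ORG[4,6), LOC[6,7)]
  }
}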