/**
 * Writes the sentences of {@code dataset} to CoNLL-style files, one file per document id.
 *
 * <p>Each token becomes one line of the form {@code word POS label} (single-space separated) and
 * every sentence is followed by an empty line. Runs of spaces, tabs and newlines inside a word
 * are replaced by a single underscore so that the word stays one column.
 *
 * <p>A new file {@code dir/<docid>.conll} is created whenever the document id changes from one
 * sentence to the next. The file is opened in truncating mode, so the sentences of one document
 * must be contiguous in {@code dataset}; a document id that shows up again later overwrites the
 * file written earlier.
 *
 * <p>When {@code alreadyBIO} is false, every label other than {@code "O"} is rewritten within
 * each sentence: {@code "I-" + label} if the previous token of the sentence carried the same
 * label, {@code "B-" + label} otherwise.
 *
 * @param dir directory the {@code .conll} files are written to
 * @param dataset annotation whose {@code SentencesAnnotation} provides the sentences to save
 * @param useSubTypes forwarded to {@code AnnotationUtils.sentenceEntityMentionsToCoreLabels}
 * @param alreadyBIO forwarded to {@code AnnotationUtils.sentenceEntityMentionsToCoreLabels};
 *     when true the labels are written unchanged
 * @throws IOException if an output file cannot be opened
 * @throws RuntimeException if a generated line does not consist of exactly three
 *     whitespace-separated fields
 */
public static void saveCoNLLFiles(
    String dir, Annotation dataset, boolean useSubTypes, boolean alreadyBIO) throws IOException {
  List<CoreMap> sentences = dataset.get(CoreAnnotations.SentencesAnnotation.class);

  String docid = null;
  PrintStream os = null;
  try {
    for (CoreMap sentence : sentences) {
      String myDocid = sentence.get(CoreAnnotations.DocIDAnnotation.class);

      // Null-safe comparison: "no stream yet" marks the first sentence, so a null document id
      // neither throws nor forces the same file to be reopened (and truncated) every sentence.
      boolean sameDocument =
          os != null && (docid == null ? myDocid == null : docid.equals(myDocid));
      if (!sameDocument) {
        if (os != null) {
          os.close();
          // Cleared so the finally block does not touch a closed stream if opening fails below.
          os = null;
        }
        docid = myDocid;
        os = new PrintStream(new FileOutputStream(dir + File.separator + docid + ".conll"));
      }

      List<CoreLabel> labeledSentence =
          AnnotationUtils.sentenceEntityMentionsToCoreLabels(
              sentence, true, null, null, useSubTypes, alreadyBIO);
      assert (labeledSentence != null);

      // Label of the previous token in this sentence; decides between the B- and I- prefix.
      String prev = null;
      for (CoreLabel word : labeledSentence) {
        String w = word.word().replaceAll("[ \t\n]+", "_");
        String t = word.get(CoreAnnotations.PartOfSpeechAnnotation.class);
        String l = word.get(CoreAnnotations.AnswerAnnotation.class);

        String nl = l;
        if (!alreadyBIO && !l.equals("O")) {
          if (prev != null && l.equals(prev)) {
            nl = "I-" + l;
          } else {
            nl = "B-" + l;
          }
        }

        // Sanity check: the line must split back into exactly word, tag and label.
        String line = w + ' ' + t + ' ' + nl;
        String[] toks = line.split("[ \t\n]+");
        if (toks.length != 3) {
          throw new RuntimeException("INVALID LINE: \"" + line + '"');
        }

        os.printf("%s %s %s\n", w, t, nl);
        prev = l;
      }
      os.println();
    }
  } finally {
    // Always release the file handle, including when a malformed line aborts the export.
    if (os != null) {
      os.close();
    }
  }
}
/** * Label entities in an ExtractionSentence. Assumes the classifier has already been trained. * * @param sentence ExtractionSentence that we want to extract entities from * @return an ExtractionSentence with text content, tree and entities set. Relations will not be * set. */ private CoreMap extractEntities(CoreMap sentence, int sentCount) { // don't add answer annotations List<CoreLabel> testSentence = AnnotationUtils.sentenceEntityMentionsToCoreLabels( sentence, false, annotationsToSkip, null, useSubTypes, useBIO); // now label the sentence List<CoreLabel> annotatedSentence = this.classifier.classify(testSentence); if (logger.isLoggable(Level.FINEST)) { logger.finest("CLASSFIER OUTPUT: " + annotatedSentence); } List<EntityMention> extractedEntities = new ArrayList<>(); int i = 0; // variables which keep track of partially seen entities (i.e. we've seen // some but not all the words in them so far) String lastType = null; int startIndex = -1; // // note that labels may be in the BIO or just the IO format. 
we must handle both transparently // for (CoreLabel label : annotatedSentence) { String type = label.get(AnswerAnnotation.class); if (type.equals(SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL)) { type = null; } // this is an entity end boundary followed by O if (type == null && lastType != null) { makeEntityMention(sentence, startIndex, i, lastType, extractedEntities, sentCount); logger.info("Found entity: " + extractedEntities.get(extractedEntities.size() - 1)); startIndex = -1; } // entity start preceded by an O else if (lastType == null && type != null) { startIndex = i; } // entity end followed by another entity of different type else if (lastType != null && type != null && (type.startsWith("B-") || (lastType.startsWith("I-") && type.startsWith("I-") && !lastType.equals(type)) || (notBIO(lastType) && notBIO(type) && !lastType.equals(type)))) { makeEntityMention(sentence, startIndex, i, lastType, extractedEntities, sentCount); logger.info("Found entity: " + extractedEntities.get(extractedEntities.size() - 1)); startIndex = i; } lastType = type; i++; } // replace the original annotation with the predicted entities sentence.set(MachineReadingAnnotations.EntityMentionsAnnotation.class, extractedEntities); logger.finest("EXTRACTED ENTITIES: "); for (EntityMention e : extractedEntities) { if (logger.isLoggable(Level.FINEST)) { logger.finest("\t" + e); } } postprocessSentence(sentence, sentCount); return sentence; }