/**
 * Set the UtteranceAnnotation for quotations; tokens outside quotations keep the
 * default UtteranceAnnotation of 0.
 */
private void markQuotations(List<CoreMap> results, boolean normalQuotationType) {
  boolean insideQuotation = false;
  for (CoreMap m : results) {
    for (CoreLabel l : m.get(CoreAnnotations.TokensAnnotation.class)) {
      String w = l.get(CoreAnnotations.TextAnnotation.class);

      boolean noSpeakerInfo = !l.containsKey(CoreAnnotations.SpeakerAnnotation.class)
          || l.get(CoreAnnotations.SpeakerAnnotation.class).equals("")
          || l.get(CoreAnnotations.SpeakerAnnotation.class).startsWith("PER");

      if (w.equals("``")
          || (!insideQuotation && normalQuotationType && w.equals("\""))) {
        // opening quote: start a new utterance
        insideQuotation = true;
        maxUtter++;
        continue;
      } else if (w.equals("''")
          || (insideQuotation && normalQuotationType && w.equals("\""))) {
        // closing quote: leave the current utterance
        insideQuotation = false;
      }

      if (insideQuotation) {
        l.set(CoreAnnotations.UtteranceAnnotation.class, maxUtter);
      }
      if (noSpeakerInfo) {
        l.set(CoreAnnotations.SpeakerAnnotation.class,
            "PER" + l.get(CoreAnnotations.UtteranceAnnotation.class));
      }
    }
  }
  // If no directed quotes (`` '') were found, retry treating straight quotes (") as quote marks
  if (maxUtter == 0 && !normalQuotationType) markQuotations(results, true);
}
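// Illustrative note (not part of the original method): with normalQuotationType == false,
// only the directed quote tokens `` and '' open and close an utterance; the straight-quote
// fallback handles text tokenized with plain " characters. For example, the token sequence
//   He said , `` wait here . '' Then he left .
// assigns UtteranceAnnotation = 1 to the tokens between `` and '' and leaves the rest at 0.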
@Override
public List<CoreLabel> apply(String doc) {
  if (num > 0 && num % 1000 == 0) {
    System.err.print("[" + num + "]");
  }
  num++;

  List<CoreLabel> words = new ArrayList<>();
  String[] lines = doc.split("\n");

  for (String line : lines) {
    ++lineCount;
    if (line.trim().length() == 0) {
      continue;
    }
    String[] info = whitePattern.split(line);
    // todo: We could speed things up here by converting map just once into an array of
    // CoreLabel keys (Class<? extends CoreAnnotation<?>>) and then instantiating them.
    // Needs a new constructor.
    CoreLabel wi;
    try {
      wi = new CoreLabel(map, info);
      // Since the map normally only specifies the answer, we copy it to GoldAnswer unless
      // something else has already been put there!
      if (!wi.containsKey(CoreAnnotations.GoldAnswerAnnotation.class)
          && wi.containsKey(CoreAnnotations.AnswerAnnotation.class)) {
        wi.set(CoreAnnotations.GoldAnswerAnnotation.class,
            wi.get(CoreAnnotations.AnswerAnnotation.class));
      }
    } catch (RuntimeException e) {
      System.err.println("Error on line " + lineCount + ": " + line);
      throw e;
    }
    words.add(wi);
  }
  return words;
}
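// Illustrative note (not part of the original method): each non-blank input line is split on
// whitespace and the resulting columns are mapped to CoreLabel annotations according to `map`.
// For a hypothetical two-column "word answer" mapping, the line
//   Germany LOCATION
// yields a CoreLabel whose TextAnnotation is "Germany" and whose AnswerAnnotation (and, via
// the copy above, GoldAnswerAnnotation) is "LOCATION".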
/** Set paragraph index */
private void setParagraphAnnotation() {
  int paragraphIndex = 0;
  int previousOffset = -10;
  for (CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
    for (CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) {
      if (w.containsKey(CoreAnnotations.CharacterOffsetBeginAnnotation.class)) {
        if (w.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) > previousOffset + 2) {
          paragraphIndex++;
        }
        w.set(CoreAnnotations.ParagraphAnnotation.class, paragraphIndex);
        previousOffset = w.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
      } else {
        w.set(CoreAnnotations.ParagraphAnnotation.class, -1);
      }
    }
  }
  for (List<Mention> l : predictedOrderedMentionsBySentence) {
    for (Mention m : l) {
      m.paragraph = m.headWord.get(CoreAnnotations.ParagraphAnnotation.class);
    }
  }
  numParagraph = paragraphIndex;
}
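// Illustrative note (not part of the original method): a new paragraph is started whenever the
// current token begins more than 2 characters after the end of the previous one, i.e. when
// something longer than a single space or newline separates them. With hypothetical offsets,
// previousOffset = 120 and a token starting at 124 triggers paragraphIndex++ (gap of 4),
// while a token starting at 121 or 122 does not.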
@Override
public Document nextDoc() throws Exception {
  List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>();
  List<Tree> allTrees = new ArrayList<Tree>();

  CoNLL2011DocumentReader.Document conllDoc = reader.getNextDocument();
  if (conllDoc == null) {
    return null;
  }

  Annotation anno = conllDoc.getAnnotation();
  List<CoreMap> sentences = anno.get(CoreAnnotations.SentencesAnnotation.class);
  for (CoreMap sentence : sentences) {
    if (!Constants.USE_GOLD_PARSES && !replicateCoNLL) {
      // Remove the tree from the annotation and replace it with a parse from the Stanford parser
      sentence.remove(TreeCoreAnnotations.TreeAnnotation.class);
    } else {
      Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
      // generate the dependency graph
      try {
        SemanticGraph deps = SemanticGraphFactory.makeFromTree(
            tree, SemanticGraphFactory.Mode.COLLAPSED, includeExtras, lemmatize, threadSafe);
        SemanticGraph basicDeps = SemanticGraphFactory.makeFromTree(
            tree, SemanticGraphFactory.Mode.BASIC, includeExtras, lemmatize, threadSafe);
        sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, basicDeps);
        sentence.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, deps);
      } catch (Exception e) {
        logger.log(Level.WARNING,
            "Exception caught during extraction of Stanford dependencies. Will ignore and continue...",
            e);
      }
    }
  }

  String preSpeaker = null;
  int utterance = -1;
  for (CoreLabel token : anno.get(CoreAnnotations.TokensAnnotation.class)) {
    if (!token.containsKey(CoreAnnotations.SpeakerAnnotation.class)) {
      token.set(CoreAnnotations.SpeakerAnnotation.class, "");
    }
    String curSpeaker = token.get(CoreAnnotations.SpeakerAnnotation.class);
    if (!curSpeaker.equals(preSpeaker)) {
      utterance++;
      preSpeaker = curSpeaker;
    }
    token.set(CoreAnnotations.UtteranceAnnotation.class, utterance);
  }

  // Run pipeline
  stanfordProcessor.annotate(anno);

  for (CoreMap sentence : anno.get(CoreAnnotations.SentencesAnnotation.class)) {
    allWords.add(sentence.get(CoreAnnotations.TokensAnnotation.class));
    allTrees.add(sentence.get(TreeCoreAnnotations.TreeAnnotation.class));
  }

  // Initialize gold mentions
  List<List<Mention>> allGoldMentions = extractGoldMentions(conllDoc);

  List<List<Mention>> allPredictedMentions;
  if (Constants.USE_GOLD_MENTIONS) {
    // allPredictedMentions = allGoldMentions;
    // Make a copy of the gold mentions, since mentions may later be merged and mention IDs changed
    allPredictedMentions = makeCopy(allGoldMentions);
  } else if (Constants.USE_GOLD_MENTION_BOUNDARIES) {
    allPredictedMentions = ((RuleBasedCorefMentionFinder) mentionFinder)
        .filterPredictedMentions(allGoldMentions, anno, dictionaries);
  } else {
    allPredictedMentions = mentionFinder.extractPredictedMentions(anno, maxID, dictionaries);
  }

  try {
    recallErrors(allGoldMentions, allPredictedMentions, anno);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }

  Document doc = arrange(anno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
  doc.conllDoc = conllDoc;
  return doc;
}
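// Illustrative note (not part of the original method): utterance IDs are assigned by scanning
// tokens in order and incrementing the counter whenever the SpeakerAnnotation changes, so a
// hypothetical speaker sequence "", "", "mary", "mary", "", "peter" yields utterance IDs
// 0, 0, 1, 1, 2, 3.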