/** * Make Document for coref (for method coref(Document doc, StringBuilder[] outputs)). Mention * detection and document preprocessing is done here. * * @throws Exception */ public Document makeDocument(InputDoc input) throws Exception { if (input == null) return null; Annotation anno = input.annotation; // add missing annotation if (needMissingAnnotations) { addMissingAnnotation(anno); } if (Boolean.parseBoolean(props.getProperty("hcoref.useMarkedDiscourse", "false"))) { anno.set(CoreAnnotations.UseMarkedDiscourseAnnotation.class, true); } // remove nested NP with same headword except newswire document for chinese if (input.conllDoc != null && CorefProperties.getLanguage(props) == Locale.CHINESE) { CorefProperties.setRemoveNested(props, !input.conllDoc.documentID.contains("nw")); } // mention detection: MD gives following information about mentions: mention start/end index, // span, headword // rest information will be set in preprocess step List<List<Mention>> mentions = md.findMentions(anno, dict, props); Document doc = new Document(input, mentions); // find headword for gold mentions if (input.goldMentions != null) findGoldMentionHeads(doc); // document preprocessing: initialization (assign ID), mention processing (gender, number, type, // etc), speaker extraction, etc Preprocessor.preprocess(doc, dict, singletonPredictor, headFinder); return doc; }
/**
 * Creates a document maker configured from the given properties.
 *
 * <p>Sets up, in order: the document reader, the head finder, the mention finder (which
 * receives the head finder), the CoreNLP pipeline, the tree lemmatizer, and the optional
 * singleton predictor.
 *
 * @param props configuration properties; stored and passed to every component factory
 * @param dictionaries dictionaries stored for later use and passed to the mention finder factory
 * @throws ClassNotFoundException declared by the signature; propagated from component loading
 * @throws IOException declared by the signature; propagated from component loading
 */
public CorefDocMaker(Properties props, Dictionaries dictionaries)
    throws ClassNotFoundException, IOException {
  this.props = props;
  this.dict = dictionaries;

  reader = getDocumentReader(props);
  headFinder = getHeadFinder(props);
  // The mention finder depends on the head finder, so it must be created after it.
  md = getMentionFinder(props, dictionaries, headFinder);

  // NOTE(review): loadStanfordProcessor is protected and non-static, so if this class is
  // subclassed, an override would run before the subclass constructor body has executed.
  corenlp = loadStanfordProcessor(props);
  treeLemmatizer = new TreeLemmatizer();

  // The singleton predictor is only deserialized when enabled in the properties.
  if (CorefProperties.useSingletonPredictor(props)) {
    singletonPredictor =
        getSingletonPredictorFromSerializedFile(CorefProperties.getPathSingletonPredictor(props));
  } else {
    singletonPredictor = null;
  }
}
/**
 * Selects the head finder matching the language configured in the properties.
 *
 * @param props configuration properties from which the language is read
 * @return a {@code SemanticHeadFinder} for English or a {@code ChineseSemanticHeadFinder}
 *     for Chinese
 * @throws RuntimeException if the configured language is neither English nor Chinese
 */
private static HeadFinder getHeadFinder(Properties props) {
  Locale lang = CorefProperties.getLanguage(props);

  // Compare by value rather than identity: equal Locale objects are not guaranteed to be
  // the same instance as the Locale.ENGLISH / Locale.CHINESE constants.
  if (Locale.ENGLISH.equals(lang)) {
    return new SemanticHeadFinder();
  }
  if (Locale.CHINESE.equals(lang)) {
    return new ChineseSemanticHeadFinder();
  }
  throw new RuntimeException("Invalid language setting: cannot load HeadFinder");
}
/**
 * Loads the Stanford processor, skipping unnecessary annotators.
 *
 * <p>The {@code annotators} value is rebuilt from the gold-annotation settings: POS, NER and
 * parsing annotators are only included when the corresponding gold annotation is not used
 * (NER is always included for Chinese). Any {@code annotators} value in {@code props} is
 * therefore overridden for this pipeline.
 *
 * @param props configuration properties; used as defaults of the pipeline properties and not
 *     modified by this method
 * @return a pipeline constructed with the computed annotator list
 */
protected StanfordCoreNLP loadStanfordProcessor(Properties props) {
  // Layer a fresh Properties over props (as defaults) so the override below stays local.
  Properties pipelineProps = new Properties(props);
  StringBuilder annoSb = new StringBuilder();

  if (!CorefProperties.useGoldPOS(props)) {
    annoSb.append("pos, lemma");
  } else {
    annoSb.append("lemma");
  }

  if (CorefProperties.USE_TRUECASE) {
    annoSb.append(", truecase");
  }

  // Compare Locale by value: equal Locale objects are not guaranteed to be the same instance.
  if (!CorefProperties.useGoldNE(props)
      || Locale.CHINESE.equals(CorefProperties.getLanguage(props))) {
    annoSb.append(", ner");
  }

  if (!CorefProperties.useGoldParse(props)) {
    if (CorefProperties.useConstituencyTree(props)) {
      annoSb.append(", parse");
    } else {
      annoSb.append(", depparse");
    }
  }

  String annoStr = annoSb.toString();
  Redwood.log("MentionExtractor ignores specified annotators, using annotators=" + annoStr);
  pipelineProps.put("annotators", annoStr);

  return new StanfordCoreNLP(pipelineProps, false);
}
private static DocReader getDocumentReader(Properties props) { switch (CorefProperties.getInputType(props)) { case CONLL: String corpusPath = CorefProperties.getPathInput(props); CoNLLDocumentReader.Options options = new CoNLLDocumentReader.Options(); options.annotateTokenCoref = false; if (CorefProperties.useCoNLLAuto(props)) options.setFilter(".*_auto_conll$"); options.lang = CorefProperties.getLanguage(props); return new CoNLLDocumentReader(corpusPath, options); case ACE: // TODO return null; case MUC: // TODO return null; case RAW: default: // default is raw text // TODO return null; } }
private static CorefMentionFinder getMentionFinder( Properties props, Dictionaries dictionaries, HeadFinder headFinder) throws ClassNotFoundException, IOException { switch (CorefProperties.getMDType(props)) { case RULE: return new RuleBasedCorefMentionFinder(headFinder, props); case HYBRID: return new HybridCorefMentionFinder(headFinder, props); case DEPENDENCY: default: // default is dependency return new DependencyCorefMentionFinder(props); } }
private void addMissingAnnotation(Annotation anno) { boolean useConstituency = CorefProperties.useConstituencyTree(props); final boolean LEMMATIZE = true; List<CoreMap> sentences = anno.get(CoreAnnotations.SentencesAnnotation.class); for (CoreMap sentence : sentences) { boolean hasTree = sentence.containsKey(TreeCoreAnnotations.TreeAnnotation.class); Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class); if (!useConstituency) { // TODO: temp for dev: make sure we don't use constituency tree sentence.remove(TreeCoreAnnotations.TreeAnnotation.class); } if (LEMMATIZE && hasTree && useConstituency) treeLemmatizer.transformTree(tree); // TODO don't need? } corenlp.annotate(anno); }