/** * Make Document for coref (for method coref(Document doc, StringBuilder[] outputs)). Mention * detection and document preprocessing is done here. * * @throws Exception */ public Document makeDocument(InputDoc input) throws Exception { if (input == null) return null; Annotation anno = input.annotation; // add missing annotation if (needMissingAnnotations) { addMissingAnnotation(anno); } if (Boolean.parseBoolean(props.getProperty("hcoref.useMarkedDiscourse", "false"))) { anno.set(CoreAnnotations.UseMarkedDiscourseAnnotation.class, true); } // remove nested NP with same headword except newswire document for chinese if (input.conllDoc != null && CorefProperties.getLanguage(props) == Locale.CHINESE) { CorefProperties.setRemoveNested(props, !input.conllDoc.documentID.contains("nw")); } // mention detection: MD gives following information about mentions: mention start/end index, // span, headword // rest information will be set in preprocess step List<List<Mention>> mentions = md.findMentions(anno, dict, props); Document doc = new Document(input, mentions); // find headword for gold mentions if (input.goldMentions != null) findGoldMentionHeads(doc); // document preprocessing: initialization (assign ID), mention processing (gender, number, type, // etc), speaker extraction, etc Preprocessor.preprocess(doc, dict, singletonPredictor, headFinder); return doc; }
/**
 * Returns the {@code HeadFinder} matching the language configured in {@code props}.
 *
 * @param props coref properties carrying the language setting
 * @return an English or Chinese semantic head finder
 * @throws RuntimeException if the configured language is neither English nor Chinese
 */
private static HeadFinder getHeadFinder(Properties props) {
  Locale language = CorefProperties.getLanguage(props);
  if (language == Locale.ENGLISH) {
    return new SemanticHeadFinder();
  }
  if (language == Locale.CHINESE) {
    return new ChineseSemanticHeadFinder();
  }
  throw new RuntimeException("Invalid language setting: cannot load HeadFinder");
}
/**
 * Loads the Stanford pipeline, skipping annotators made unnecessary by gold inputs.
 * The annotator list configured in {@code props} is ignored and rebuilt here.
 *
 * @param props pipeline/coref properties (used as defaults for the new pipeline)
 * @return a pipeline configured with exactly the required annotators
 */
protected StanfordCoreNLP loadStanfordProcessor(Properties props) {
  Properties pipelineProps = new Properties(props);
  StringBuilder annotators = new StringBuilder();

  // POS tagging is skipped when gold POS tags are supplied; lemma is always needed.
  annotators.append(CorefProperties.useGoldPOS(props) ? "lemma" : "pos, lemma");

  if (CorefProperties.USE_TRUECASE) {
    annotators.append(", truecase");
  }

  // NER runs unless gold named entities are used — but Chinese always re-runs NER.
  if (!CorefProperties.useGoldNE(props) || CorefProperties.getLanguage(props) == Locale.CHINESE) {
    annotators.append(", ner");
  }

  // Without a gold parse, add either a constituency or a dependency parser.
  if (!CorefProperties.useGoldParse(props)) {
    annotators.append(CorefProperties.useConstituencyTree(props) ? ", parse" : ", depparse");
  }

  String annotatorSpec = annotators.toString();
  Redwood.log("MentionExtractor ignores specified annotators, using annotators=" + annotatorSpec);
  pipelineProps.put("annotators", annotatorSpec);
  return new StanfordCoreNLP(pipelineProps, false);
}
private static DocReader getDocumentReader(Properties props) { switch (CorefProperties.getInputType(props)) { case CONLL: String corpusPath = CorefProperties.getPathInput(props); CoNLLDocumentReader.Options options = new CoNLLDocumentReader.Options(); options.annotateTokenCoref = false; if (CorefProperties.useCoNLLAuto(props)) options.setFilter(".*_auto_conll$"); options.lang = CorefProperties.getLanguage(props); return new CoNLLDocumentReader(corpusPath, options); case ACE: // TODO return null; case MUC: // TODO return null; case RAW: default: // default is raw text // TODO return null; } }