Exemplo n.º 1
0
  /**
   * Builds the {@link Document} consumed by coref (see {@code coref(Document doc, StringBuilder[]
   * outputs)}). Mention detection and document preprocessing are both done here.
   *
   * <p>Steps, in order: optionally add missing annotations, optionally flag the annotation as
   * using marked discourse (property {@code hcoref.useMarkedDiscourse}), adjust the remove-nested
   * setting for Chinese CoNLL input, run mention detection, find heads of gold mentions when they
   * are present, and finally preprocess the document.
   *
   * @param input the input document; may be {@code null}
   * @return the preprocessed document, or {@code null} if {@code input} is {@code null}
   * @throws Exception propagated unchanged from any of the steps above
   */
  public Document makeDocument(InputDoc input) throws Exception {
    if (input == null) {
      return null;
    }
    Annotation anno = input.annotation;

    // add missing annotation
    if (needMissingAnnotations) {
      addMissingAnnotation(anno);
    }

    if (Boolean.parseBoolean(props.getProperty("hcoref.useMarkedDiscourse", "false"))) {
      anno.set(CoreAnnotations.UseMarkedDiscourseAnnotation.class, true);
    }

    // Remove nested NPs with the same headword, except in newswire documents, for Chinese.
    // The Locale is compared by value (equals) rather than by identity so that an equal but
    // distinct Locale instance is still recognised as Chinese.
    if (input.conllDoc != null && Locale.CHINESE.equals(CorefProperties.getLanguage(props))) {
      boolean isNewswire = input.conllDoc.documentID.contains("nw");
      CorefProperties.setRemoveNested(props, !isNewswire);
    }

    // Mention detection: MD provides mention start/end index, span and headword.
    // The remaining information is set in the preprocess step.
    List<List<Mention>> mentions = md.findMentions(anno, dict, props);
    Document doc = new Document(input, mentions);

    // find headword for gold mentions
    if (input.goldMentions != null) {
      findGoldMentionHeads(doc);
    }

    // Document preprocessing: initialization (assign ID), mention processing (gender, number,
    // type, etc.), speaker extraction, etc.
    Preprocessor.preprocess(doc, dict, singletonPredictor, headFinder);

    return doc;
  }
Exemplo n.º 2
0
 /**
  * Selects the head finder that matches the language configured in {@code props}.
  *
  * @param props properties passed to {@code CorefProperties.getLanguage} to determine the language
  * @return a {@code SemanticHeadFinder} for English, or a {@code ChineseSemanticHeadFinder} for
  *     Chinese
  * @throws RuntimeException if the configured language is neither English nor Chinese
  */
 private static HeadFinder getHeadFinder(Properties props) {
   Locale lang = CorefProperties.getLanguage(props);
   // Compare Locales by value, not identity: an equal Locale that is not the very same
   // instance as the JDK constant must still select the right head finder.
   if (Locale.ENGLISH.equals(lang)) {
     return new SemanticHeadFinder();
   }
   if (Locale.CHINESE.equals(lang)) {
     return new ChineseSemanticHeadFinder();
   }
   throw new RuntimeException("Invalid language setting: cannot load HeadFinder");
 }
Exemplo n.º 3
0
  /**
   * Loads a Stanford pipeline that runs only the annotators coref still needs, skipping those
   * whose output is taken from gold annotations.
   *
   * <p>The pipeline properties use {@code props} as defaults; the {@code annotators} entry is
   * always overridden with the list computed here (this is also logged).
   *
   * @param props coref properties that decide which gold annotations are used
   * @return a new pipeline configured with the computed annotator list
   */
  protected StanfordCoreNLP loadStanfordProcessor(Properties props) {

    Properties pipelineProps = new Properties(props);
    StringBuilder annoSb = new StringBuilder();

    // POS tagging is skipped when gold POS tags are used; lemmas are always computed.
    if (!CorefProperties.useGoldPOS(props)) {
      annoSb.append("pos, lemma");
    } else {
      annoSb.append("lemma");
    }
    if (CorefProperties.USE_TRUECASE) {
      annoSb.append(", truecase");
    }
    // NER runs when gold NE is not used, and always for Chinese. The Locale is compared by
    // value (equals) rather than identity so an equal but distinct instance still matches.
    if (!CorefProperties.useGoldNE(props)
        || Locale.CHINESE.equals(CorefProperties.getLanguage(props))) {
      annoSb.append(", ner");
    }
    // Without a gold parse, use either the constituency parser or the dependency parser.
    if (!CorefProperties.useGoldParse(props)) {
      if (CorefProperties.useConstituencyTree(props)) {
        annoSb.append(", parse");
      } else {
        annoSb.append(", depparse");
      }
    }
    String annoStr = annoSb.toString();
    Redwood.log("MentionExtractor ignores specified annotators, using annotators=" + annoStr);
    pipelineProps.put("annotators", annoStr);
    // NOTE(review): the `false` flag presumably disables annotator-requirement enforcement,
    // since the list above omits tokenize/ssplit - confirm against the StanfordCoreNLP constructor.
    return new StanfordCoreNLP(pipelineProps, false);
  }
Exemplo n.º 4
0
  /**
   * Creates the document reader for the input type configured in {@code props}.
   *
   * <p>Only CoNLL input is implemented. Every other input type (ACE, MUC, raw text, which is
   * also the default) currently yields {@code null}.
   *
   * @param props coref properties providing the input type, input path, CoNLL filter flag and
   *     language
   * @return a {@code CoNLLDocumentReader} for CoNLL input, otherwise {@code null}
   */
  private static DocReader getDocumentReader(Properties props) {
    switch (CorefProperties.getInputType(props)) {
      case CONLL:
        // Handled below, after the switch.
        break;

      case ACE:
      case MUC:
      case RAW:
      default:
        // TODO: readers for ACE, MUC and raw text (the default) are not implemented yet
        return null;
    }

    String inputPath = CorefProperties.getPathInput(props);
    CoNLLDocumentReader.Options readerOptions = new CoNLLDocumentReader.Options();
    readerOptions.annotateTokenCoref = false;
    if (CorefProperties.useCoNLLAuto(props)) {
      // Restrict the reader to files whose names end in "_auto_conll".
      readerOptions.setFilter(".*_auto_conll$");
    }
    readerOptions.lang = CorefProperties.getLanguage(props);
    return new CoNLLDocumentReader(inputPath, readerOptions);
  }