Exemplo n.º 1
0
  /**
   * Builds the {@link Document} used for coreference resolution (for method
   * {@code coref(Document doc, StringBuilder[] outputs)}). Mention detection and document
   * preprocessing are both done here.
   *
   * <p>Steps, in order:
   * <ol>
   *   <li>optionally add missing annotations to the input annotation;</li>
   *   <li>flag the annotation with {@code UseMarkedDiscourseAnnotation} when the
   *       {@code hcoref.useMarkedDiscourse} property is {@code true};</li>
   *   <li>for Chinese input backed by a CoNLL document, update the remove-nested setting on
   *       {@code props} based on the document ID;</li>
   *   <li>run mention detection and wrap the result in a {@link Document};</li>
   *   <li>find head words for gold mentions, if the input carries any;</li>
   *   <li>run {@code Preprocessor.preprocess} on the document.</li>
   * </ol>
   *
   * @param input the input document; may be {@code null}
   * @return the preprocessed document, or {@code null} if {@code input} is {@code null}
   * @throws Exception if one of the steps above fails (the concrete failure sources live in
   *     the called helpers, which are not visible from this method)
   */
  public Document makeDocument(InputDoc input) throws Exception {
    if (input == null) {
      return null;
    }
    Annotation anno = input.annotation;

    // add missing annotation
    if (needMissingAnnotations) {
      addMissingAnnotation(anno);
    }

    if (Boolean.parseBoolean(props.getProperty("hcoref.useMarkedDiscourse", "false"))) {
      anno.set(CoreAnnotations.UseMarkedDiscourseAnnotation.class, true);
    }

    // remove nested NP with same headword except newswire document for chinese
    // Compare locales with equals(): reference equality would only hold if getLanguage()
    // handed back the very same Locale.CHINESE instance, and would silently skip this step
    // for an equal-but-distinct Locale object.
    if (input.conllDoc != null && Locale.CHINESE.equals(CorefProperties.getLanguage(props))) {
      // Document IDs containing "nw" are treated as newswire, for which nested NPs are kept.
      CorefProperties.setRemoveNested(props, !input.conllDoc.documentID.contains("nw"));
    }

    // mention detection: MD gives following information about mentions: mention start/end index,
    // span, headword
    // rest information will be set in preprocess step
    List<List<Mention>> mentions = md.findMentions(anno, dict, props);
    Document doc = new Document(input, mentions);

    // find headword for gold mentions
    if (input.goldMentions != null) {
      findGoldMentionHeads(doc);
    }

    // document preprocessing: initialization (assign ID), mention processing (gender, number, type,
    // etc), speaker extraction, etc
    Preprocessor.preprocess(doc, dict, singletonPredictor, headFinder);

    return doc;
  }