Exemplo n.º 1
0
  /**
   * Read parse trees from a Reader.
   *
   * @param filename
   * @param in The <code>Reader</code>
   * @param simplifiedTagset If `true`, convert part-of-speech labels to a simplified version of the
   *     EAGLES tagset, where the tags do not include extensive morphological analysis
   * @param aggressiveNormalization Perform aggressive "normalization" on the trees read from the
   *     provided corpus documents: split multi-word tokens into their constituent words (and infer
   *     parts of speech of the constituent words).
   * @param retainNER Retain NER information in preterminals (for later use in
   *     `MultiWordPreprocessor) and add NER-specific parents to single-word NE tokens
   * @param detailedAnnotations Retain detailed tree node annotations. These annotations on parse
   *     tree constituents may be useful for e.g. training a parser.
   */
  public SpanishXMLTreeReader(
      String filename,
      Reader in,
      boolean simplifiedTagset,
      boolean aggressiveNormalization,
      boolean retainNER,
      boolean detailedAnnotations) {
    TreebankLanguagePack tlp = new SpanishTreebankLanguagePack();

    this.simplifiedTagset = simplifiedTagset;
    this.detailedAnnotations = detailedAnnotations;

    stream = new ReaderInputStream(in, tlp.getEncoding());
    treeFactory = new LabeledScoredTreeFactory();
    treeNormalizer =
        new SpanishTreeNormalizer(simplifiedTagset, aggressiveNormalization, retainNER);

    DocumentBuilder parser = XMLUtils.getXmlParser();
    try {
      final Document xml = parser.parse(stream);
      final Element root = xml.getDocumentElement();
      sentences = root.getElementsByTagName(NODE_SENT);
      sentIdx = 0;

    } catch (SAXException e) {
      System.err.println("Parse exception while reading " + filename);
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
 private static <IN extends CoreMap> void printAnswersTokenizedXML(List<IN> doc, PrintWriter out) {
   int num = 0;
   for (IN wi : doc) {
     out.print("<wi num=\"");
     // tag.append(wi.get("position"));
     out.print(num++);
     out.print("\" entity=\"");
     out.print(StringUtils.getNotNullString(wi.get(AnswerAnnotation.class)));
     out.print("\">");
     out.print(XMLUtils.escapeXML(StringUtils.getNotNullString(wi.get(TextAnnotation.class))));
     out.println("</wi>");
   }
 }