/** * Read parse trees from a Reader. * * @param filename * @param in The <code>Reader</code> * @param simplifiedTagset If `true`, convert part-of-speech labels to a simplified version of the * EAGLES tagset, where the tags do not include extensive morphological analysis * @param aggressiveNormalization Perform aggressive "normalization" on the trees read from the * provided corpus documents: split multi-word tokens into their constituent words (and infer * parts of speech of the constituent words). * @param retainNER Retain NER information in preterminals (for later use in * `MultiWordPreprocessor) and add NER-specific parents to single-word NE tokens * @param detailedAnnotations Retain detailed tree node annotations. These annotations on parse * tree constituents may be useful for e.g. training a parser. */ public SpanishXMLTreeReader( String filename, Reader in, boolean simplifiedTagset, boolean aggressiveNormalization, boolean retainNER, boolean detailedAnnotations) { TreebankLanguagePack tlp = new SpanishTreebankLanguagePack(); this.simplifiedTagset = simplifiedTagset; this.detailedAnnotations = detailedAnnotations; stream = new ReaderInputStream(in, tlp.getEncoding()); treeFactory = new LabeledScoredTreeFactory(); treeNormalizer = new SpanishTreeNormalizer(simplifiedTagset, aggressiveNormalization, retainNER); DocumentBuilder parser = XMLUtils.getXmlParser(); try { final Document xml = parser.parse(stream); final Element root = xml.getDocumentElement(); sentences = root.getElementsByTagName(NODE_SENT); sentIdx = 0; } catch (SAXException e) { System.err.println("Parse exception while reading " + filename); e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } }