public void append(MaryData md) { if (md == null) throw new NullPointerException("Received null marydata"); if (!md.getType().equals(this.getType())) throw new IllegalArgumentException( "Cannot append mary data of type `" + md.getType().name() + "' to mary data of type `" + this.getType().name() + "'"); if (getType().isXMLType()) { NodeList kids = md.getDocument().getDocumentElement().getChildNodes(); logger.debug("Appending " + kids.getLength() + " nodes to MaryXML structure"); Element docEl = this.getDocument().getDocumentElement(); for (int i = 0; i < kids.getLength(); i++) { docEl.appendChild(this.getDocument().importNode(kids.item(i), true)); } } else if (getType().isTextType()) { // Attention: XML type is a text type! if (this.plainText == null) { this.plainText = md.getPlainText(); } else { this.plainText = this.plainText + "\n\n" + md.getPlainText(); } } else if (getType().equals(MaryDataType.get("AUDIO"))) { appendAudio(md.getAudio()); } else { throw new UnsupportedOperationException( "Cannot append two mary data items of type `" + getType() + "'"); } }
public MaryData process(MaryData d) throws Exception { Document doc = d.getDocument(); NodeIterator it = MaryDomUtils.createNodeIterator(doc, doc, MaryXML.TOKEN); Element t = null; while ((t = (Element) it.nextNode()) != null) { String text; // Do not touch tokens for which a transcription is already // given (exception: transcription contains a '*' character: if (t.hasAttribute("ph") && !t.getAttribute("ph").contains("*")) { continue; } if (t.hasAttribute("sounds_like")) text = t.getAttribute("sounds_like"); else text = MaryDomUtils.tokenText(t); String pos = null; // use part-of-speech if available if (t.hasAttribute("pos")) { pos = t.getAttribute("pos"); } if (text != null && !text.equals("") && (pos == null || !pos.startsWith("$") /*punctuation*/)) { // If text consists of several parts (e.g., because that was // inserted into the sounds_like attribute), each part // is transcribed separately. StringBuilder ph = new StringBuilder(); String g2pMethod = null; StringTokenizer st = new StringTokenizer(text, " -"); while (st.hasMoreTokens()) { String graph = st.nextToken(); StringBuilder helper = new StringBuilder(); String phon = phonemise(graph, pos, helper); if (ph.length() == 0) { // first part // The g2pMethod of the combined beast is // the g2pMethod of the first constituant. g2pMethod = helper.toString(); ph.append(phon); } else { // following parts ph.append(" - "); // Reduce primary to secondary stress: ph.append(phon.replace('\'', ',')); } } if (ph != null && ph.length() > 0) { setPh(t, ph.toString()); t.setAttribute("g2p_method", g2pMethod); } } } MaryData result = new MaryData(outputType(), d.getLocale()); result.setDocument(doc); return result; }
public MaryData process(MaryData d) throws Exception { // prevUnitIndex; // numberOfConsecutiveUnits; // basenameDuration; // phoneTier; // PraatIntervalTier unitTier; // PraatIntervalTier sourceTier; // sourceInterval; Document doc = d.getDocument(); // initialize various variables: Double duration = 0.0; String phone = null; // initialize some class variables: PraatIntervalTier phoneTier = new PraatIntervalTier("phones"); Double basenameDuration = 0.0; int prevUnitIndex = Integer.MIN_VALUE; int numberOfConsecutiveUnits = 0; // counter to track consecutive units PraatInterval sourceInterval = new PraatInterval(basenameDuration); // until we have a robust way of checking the voice type, just initialize unit and source tiers // anyway: PraatIntervalTier unitTier = new PraatIntervalTier("units"); PraatIntervalTier sourceTier = new PraatIntervalTier("sources"); // prepare to iterate only over the PHONE, SENTENCE, and BOUNDARY nodes in the MaryXML: NodeIterator ni = DomUtils.createNodeIterator(doc, PHONE, BOUNDARY); Element element; // now iterate over these nodes: while ((element = (Element) ni.nextNode()) != null) { switch (element.getTagName()) { // <s>, <ph>, or <boundary> as specified above case PHONE: phone = element.getAttribute("p"); duration = Integer.parseInt(element.getAttribute("d")) / 1000.0; // duration is always in ms break; case BOUNDARY: phone = "_"; // TODO: perhaps we should access TargetFeatureComputer.getPauseSymbol() instead if (element.hasAttribute("duration")) { duration = Double.parseDouble(element.getAttribute("duration")) / 1000.0; // duration is always in ms } else { duration = 0.0; // HMM voices can have duration-less <boundary/> tags } break; default: logger.error( "NodeIterator should not find an element of type " + element.getTagName() + " here!"); break; } PraatInterval phoneInterval = new PraatInterval(duration, phone); // TODO: crude way of checking for unit selection voice; also, name of attribute could change! if (element.hasAttribute("units")) { // unitselectionProcessing(element, unitTier, prevUnitIndex, numberOfConsecutiveUnits, // basenameDuration, // sourceInterval, sourceTier); String units = element.getAttribute("units"); String[] unitStrings = units.split("; "); // boundaries have only one unit string boolean differentSource = false; String basename = null; String unitRange = null; for (String unitString : unitStrings) { // TODO verify that unit string matches "UNITNAME BASENAME UNITINDEX UNITDURATION" String[] unitFields = unitString.split(" "); String unitName = unitFields[0]; basename = unitFields[1]; int unitIndex = Integer.parseInt(unitFields[2]); Double unitDuration = Double.parseDouble(unitFields[3]); // units are straightforward, just like phones: unitTier.appendInterval(new PraatInterval(unitDuration, unitString)); // unit source processing is a little more elaborate: /* * Note: the following assumes that consecutive selected units are ALWAYS from the same basename! That could * change if basename boundaries are no longer marked by null units in the timeline. */ differentSource = unitIndex != prevUnitIndex + 1; // is source unit from a different part of the timeline?; if (differentSource) { // reset primary variables: numberOfConsecutiveUnits = 0; basenameDuration = 0.0; } // increment/increase primary variables: numberOfConsecutiveUnits++; basenameDuration += unitDuration; // construct unit index range string: unitRange = Integer.toString(unitIndex - numberOfConsecutiveUnits + 1); if (numberOfConsecutiveUnits > 1) { unitRange = unitRange + "-" + unitIndex; } // append source intervals to source tier: if (differentSource) { sourceInterval = new PraatInterval(basenameDuration, basename + ": " + unitRange); sourceTier.appendInterval(sourceInterval); } else { sourceInterval.setDuration(basenameDuration); sourceInterval.setText(basename + ": " + unitRange); } prevUnitIndex = unitIndex; } // HACK: arbitrary threshold to detect end points in ms (in the case of diphone voice or // boundary segment) } else if (duration > 10) { // TODO: there is still a bug somewhere regarding boundary durations with mbrola... phoneInterval.setDuration(duration / 1000.0); } phoneTier.appendInterval(phoneInterval); } PraatTextGrid textGrid = new PraatTextGrid(); phoneTier.updateBoundaries(); // force full specification of timings textGrid.appendTier(phoneTier); // fragile way of checking whether this is a unit selection voice: if (unitTier.getNumberOfIntervals() > 0) { // complete and append unit and source tiers: unitTier.updateBoundaries(); textGrid.appendTier(unitTier); sourceTier.updateBoundaries(); textGrid.appendTier(sourceTier); } // return raw TextGrid as result: MaryData result = new MaryData(getOutputType(), d.getLocale()); result.setPlainText(textGrid.toString()); return result; }
public void computeFeaturesFor(String basename) throws IOException, Exception { String text; Locale localVoice; localVoice = MaryUtils.string2locale(locale); // First, test if there is a corresponding .rawmaryxml file in textdir: File rawmaryxmlFile = new File(db.getProp(db.MARYXMLDIR) + basename + db.getProp(db.MARYXMLEXT)); if (rawmaryxmlFile.exists()) { text = FileUtils.getFileAsString(rawmaryxmlFile, "UTF-8"); } else { text = getMaryXMLHeaderWithInitialBoundary(locale) + FileUtils.getFileAsString( new File(db.getProp(db.TEXTDIR) + basename + db.getProp(db.TEXTEXT)), "UTF-8") + "</maryxml>"; } File pfeatFile = new File(unitfeatureDir, basename + featsExt); OutputStream os = new BufferedOutputStream(new FileOutputStream(pfeatFile)); MaryClient maryClient = getMaryClient(); /*Vector voices = maryClient.getVoices(localVoice); MaryClient.Voice defaultVoice = (MaryClient.Voice) voices.firstElement(); String voiceName = defaultVoice.name();*/ // maryClient.process(text, maryInputType, maryOutputType, null, null, os); maryClient.process(text, maryInputType, maryOutputType, locale, null, "slt-arctic", os); // maryClient.process(text, maryInputType, maryOutputType, null, "slt-arctic", os, timeout); // maryClient.getOutputDataTypes().size() // MaryData result = new MaryData(os); os.flush(); os.close(); // System.out.println(" TO STRING: "+new FileReader(pfeatFile).toString()); // BufferedReader bfr = new BufferedReader(new FileReader(pfeatFile)); String line; MaryData d = new MaryData(MaryDataType.get("PHONEMISED_EN"), Locale.US); // d.readFrom(new ByteArrayInputStream(os.toByteArray())); d.readFrom(new FileReader(pfeatFile)); // MaryData d = new MaryData(pfeatFile); Document doc = d.getDocument(); // Document acoustparams = d.getDocument(); // NodeIterator it = ((DocumentTraversal)acoustparams).createNodeIterator(acoustparams, // NodeFilter.SHOW_ELEMENT,new NameNodeFilter(new String[]{MaryXML.TOKEN, // MaryXML.BOUNDARY}),false); NodeIterator it = ((DocumentTraversal) doc) .createNodeIterator( doc, NodeFilter.SHOW_ELEMENT, new NameNodeFilter(MaryXML.TOKEN), false); Element t = null; while ((t = (Element) it.nextNode()) != null) { if (t.hasAttribute("g2p_method")) { String g2p = t.getAttribute("g2p_method"); String nodeText = t.getTextContent().trim(); if (g2p.equals("rules")) { // && nodeText.equals("!")){ System.out.print(basename + " ----> " + nodeText); if (bnl.contains(basename)) bnl.remove(basename); System.out.println(" SO removing basename: " + basename); } // System.out.println("G2P:"+t.getAttribute("g2p_method")); // System.out.println("Text:"+t.getTextContent()); } } /*while((line =bfr.readLine()) != null){ //boolean b = m.matches(); if(Pattern.matches("rules", line)) System.out.println(basename + " LINE ---> " + line); }*/ // System.out.println(" TO STRING: "+line); }