/** * HTS maximum likelihood parameter generation * * @param um : utterance model sequence after processing Mary context features * @param htsData : HMM pdfs model set. * @throws Exception Exception */ public void htsMaximumLikelihoodParameterGeneration(HTSUttModel um, final HMMData htsData) throws Exception { CartTreeSet ms = htsData.getCartTreeSet(); /* Initialisation of PStream objects */ /* Initialise Parameter generation using UttModel um and Modelset ms */ /* initialise PStream objects for all the parameters that are going to be generated: */ /* mceppst, strpst, magpst, lf0pst */ /* Here i should pass the window files to initialise the dynamic windows dw */ /* for the moment the dw are all the same and hard-coded */ if (htsData.getPdfMgcStream() != null) mcepPst = new HTSPStream( ms.getMcepVsize(), um.getTotalFrame(), HMMData.FeatureType.MGC, htsData.getMaxMgcGvIter()); /* for lf0 count just the number of lf0frames that are voiced or non-zero */ if (htsData.getPdfLf0Stream() != null) lf0Pst = new HTSPStream( ms.getLf0Stream(), um.getLf0Frame(), HMMData.FeatureType.LF0, htsData.getMaxLf0GvIter()); /* The following are optional in case of generating mixed excitation */ if (htsData.getPdfStrStream() != null) strPst = new HTSPStream( ms.getStrVsize(), um.getTotalFrame(), HMMData.FeatureType.STR, htsData.getMaxStrGvIter()); if (htsData.getPdfMagStream() != null) magPst = new HTSPStream( ms.getMagVsize(), um.getTotalFrame(), HMMData.FeatureType.MAG, htsData.getMaxMagGvIter()); int lf0Frame = 0; // counts voiced frames int uttFrame = 0; // counts all frames voiced = new boolean[um.getTotalFrame()]; // local variables for faster access int msNumStates = ms.getNumStates(); int totalFrames = um.getTotalFrame(); for (int i = 0; i < um.getNumUttModel(); i++) { HTSModel m = um.getUttModel(i); int numVoicedInModel = 0; for (int state = 0; state < msNumStates; state++) { int dur = m.getDur(state); Arrays.fill(voiced, uttFrame, uttFrame += dur, m.getVoiced(state)); if (m.getVoiced(state)) lf0Frame += dur; } } /* mcepframe and lf0frame are used in the original code to initialise the T field */ /* in each pst, but here the pst are already initialised .... */ logger.debug("utteranceFrame=" + uttFrame + " lf0frame=" + lf0Frame); // Step 1: initialize fields in the parameter streams uttFrame = 0; lf0Frame = 0; /* copy pdfs */ for (int i = 0; i < um.getNumUttModel(); i++) { HTSModel m = um.getUttModel(i); boolean gvSwitch = m.getGvSwitch(); for (int state = 0; state < msNumStates; state++) { for (int frame = 0; frame < m.getDur(state); frame++) { /* copy pdfs for mcep */ if (mcepPst != null) { mcepPst.setMseq(uttFrame, m.getMean(FeatureType.MGC, state)); mcepPst.setVseq(uttFrame, m.getVariance(FeatureType.MGC, state)); if (!gvSwitch) mcepPst.setGvSwitch(uttFrame, false); } /* copy pdf for str */ if (strPst != null) { strPst.setMseq(uttFrame, m.getMean(FeatureType.STR, state)); strPst.setVseq(uttFrame, m.getVariance(FeatureType.STR, state)); if (!gvSwitch) strPst.setGvSwitch(uttFrame, false); } /* copy pdf for mag */ if (magPst != null) { magPst.setMseq(uttFrame, m.getMean(FeatureType.MAG, state)); magPst.setVseq(uttFrame, m.getVariance(FeatureType.MAG, state)); if (!gvSwitch) magPst.setGvSwitch(uttFrame, false); } /* copy pdfs for lf0 */ if (lf0Pst != null && !htsData.getUseAcousticModels()) { for (int k = 0; k < ms.getLf0Stream(); k++) { boolean nobound = true; /* check if current frame is voiced/unvoiced boundary or not */ for (int n = lf0Pst.getDWLeftBoundary(k); n <= lf0Pst.getDWRightBoundary(k); n++) if ((uttFrame + n) <= 0 || totalFrames <= (uttFrame + n)) nobound = false; else nobound = (nobound && voiced[uttFrame + n]); /* copy pdfs */ if (voiced[uttFrame]) { lf0Pst.setMseq(lf0Frame, k, m.getLf0Mean(state, k)); if (nobound || k == 0) lf0Pst.setIvseq(lf0Frame, k, finv(m.getLf0Variance(state, k))); else /* the variances for dynamic features are set to inf on v/uv boundary */ lf0Pst.setIvseq(lf0Frame, k, 0.0); } } } if (voiced[uttFrame]) { if (!gvSwitch) lf0Pst.setGvSwitch(lf0Frame, false); lf0Frame++; } uttFrame++; } /* for each frame in this state */ } /* for each state in this model */ } /* for each model in this utterance */ GVModelSet gvms = htsData.getGVModelSet(); // Step 2: set dynamic features to infinity on the borders for MGC/STR/MAG if (mcepPst != null) mcepPst.fixDynFeatOnBoundaries(); if (strPst != null) strPst.fixDynFeatOnBoundaries(); if (magPst != null) magPst.fixDynFeatOnBoundaries(); // Step 3: optimize individual parameter streams /* parameter generation for mcep */ if (mcepPst != null) { logger.info("Parameter generation for MGC: "); if (htsData.getUseGV() && (htsData.getPdfMgcGVStream() != null)) mcepPst.setGvMeanVar(gvms.getGVmeanMgc(), gvms.getGVcovInvMgc()); mcepPst.mlpg(htsData, htsData.getUseGV()); } // parameter generation for lf0 */ if (htsData.getUseAcousticModels()) loadMaryXmlF0(um, htsData); else if (lf0Pst != null) { logger.info("Parameter generation for LF0: "); if (htsData.getUseGV() && (htsData.getPdfLf0GVStream() != null)) lf0Pst.setGvMeanVar(gvms.getGVmeanLf0(), gvms.getGVcovInvLf0()); lf0Pst.mlpg(htsData, htsData.getUseGV()); // here we need set realisedF0 setRealisedF0(lf0Pst, um, msNumStates); } /* parameter generation for str */ boolean useGV = false; if (strPst != null) { logger.debug("Parameter generation for STR "); if (htsData.getUseGV() && (htsData.getPdfStrGVStream() != null)) { useGV = true; strPst.setGvMeanVar(gvms.getGVmeanStr(), gvms.getGVcovInvStr()); } strPst.mlpg(htsData, useGV); } /* parameter generation for mag */ useGV = false; if (magPst != null) { logger.info("Parameter generation for MAG "); if (htsData.getUseGV() && (htsData.getPdfMagGVStream() != null)) { useGV = true; magPst.setGvMeanVar(gvms.getGVmeanMag(), gvms.getGVcovInvMag()); } magPst.mlpg(htsData, useGV); } } /* method htsMaximumLikelihoodParameterGeneration */