protected byte[] performEspeak(CommunicateAction communicateAction, Locale lang) throws IOException { byte[] wavBytes; try (final ByteArrayInputStream objectIn = new ByteArrayInputStream( communicateAction.getObject().getBytes(StandardCharsets.UTF_8)); final ByteArrayOutputStream wavStream = new ByteArrayOutputStream(); final ByteArrayOutputStream err = new ByteArrayOutputStream()) { final CommandLine cmdLine = new CommandLine("espeak"); cmdLine.addArgument("-b"); cmdLine.addArgument("1"); // UTF-8 cmdLine.addArgument("-m"); // SSML markup cmdLine.addArgument("-s"); cmdLine.addArgument("130"); if (INDONESIAN.getLanguage().equals(lang.getLanguage())) { cmdLine.addArgument("-v"); cmdLine.addArgument(SpeechProsody.MBROLA_ID1_VOICE); cmdLine.addArgument("-a"); cmdLine.addArgument(String.valueOf(INDONESIAN_AMPLITUDE)); } else if ("ar".equals(lang.getLanguage())) { cmdLine.addArgument("-v"); cmdLine.addArgument(SpeechProsody.MBROLA_AR1_VOICE); } // cmdLine.addArgument("-w"); // cmdLine.addArgument(wavFile.toString()); cmdLine.addArgument("--stdin"); cmdLine.addArgument("--stdout"); // cmdLine.addArgument(communicateAction.getObject()); executor.setStreamHandler(new PumpStreamHandler(wavStream, err, objectIn)); final int executed; try { executed = executor.execute(cmdLine); wavBytes = wavStream.toByteArray(); } finally { log.info("{}: {}", cmdLine, err.toString()); } } return wavBytes; }
protected Status processCommunicateAction( final Exchange exchange, final CommunicateAction communicateAction) throws IOException { final EmotionKind emotionKind = Optional.ofNullable(communicateAction.getEmotionKind()).orElse(EmotionKind.NEUTRAL); final Locale lang = Optional.ofNullable(communicateAction.getInLanguage()).orElse(Locale.US); log.info("Got speech lang-legacy={}: {}", lang.getLanguage(), communicateAction); final String avatarId = Optional.ofNullable(communicateAction.getAvatarId()).orElse("nao1"); // final File wavFile = File.createTempFile("lumen-speech-synthesis_", ".wav"); // final File oggFile = File.createTempFile("lumen-speech-synthesis_", // ".ogg"); try { byte[] wavBytes = null; if (INDONESIAN.getLanguage().equals(lang.getLanguage())) { // Expressive speech (for now, Indonesian only) try { PhonemeDoc phonemeDoc; if (EmotionKind.NEUTRAL == emotionKind) { phonemeDoc = speechProsody.performNeutral(communicateAction.getObject()); } else { try { final EmotionProsody emotionProsody = emotionProsodies .getEmotion(emotionKind) .orElseThrow( () -> new SpeechSynthesisException( "Emotion " + emotionKind + " not supported")); phonemeDoc = speechProsody.perform(communicateAction.getObject(), emotionProsody); } catch (Exception e) { log.error( "Cannot speak with emotion " + emotionKind + ", falling back to NEUTRAL: " + communicateAction.getObject(), e); phonemeDoc = speechProsody.performNeutral(communicateAction.getObject()); } } try (final ByteArrayInputStream objectIn = new ByteArrayInputStream(phonemeDoc.toString().getBytes(StandardCharsets.UTF_8)); final ByteArrayOutputStream wavStream = new ByteArrayOutputStream(); final ByteArrayOutputStream err = new ByteArrayOutputStream()) { final CommandLine cmdLine = new CommandLine("mbrola"); cmdLine.addArgument("-v"); cmdLine.addArgument(String.valueOf(INDONESIAN_AMPLITUDE / 100f)); cmdLine.addArgument(new File(mbrolaShareFolder, "id1/id1").toString()); cmdLine.addArgument("-"); cmdLine.addArgument("-.wav"); executor.setStreamHandler(new PumpStreamHandler(wavStream, err, objectIn)); final int executed; try { executed = executor.execute(cmdLine); wavBytes = wavStream.toByteArray(); } finally { log.info("{}: {}", cmdLine, err.toString()); } } } catch (Exception e) { log.error( "Cannot speak Indonesian using prosody engine, falling back to direct espeak: " + communicateAction.getObject(), e); } } if (wavBytes == null) { // Neutral speech using direct espeak try { wavBytes = performEspeak(communicateAction, lang); } catch (Exception e) { if (!Locale.US.getLanguage().equals(lang.getLanguage())) { // Indonesian sometimes fails especially "k-k", e.g. "baik koq". // retry using English as last resort, as long as it says something! log.error( "Cannot speak using " + lang.toLanguageTag() + ", falling back to English (US): " + communicateAction.getObject(), e); wavBytes = performEspeak(communicateAction, Locale.US); } else { throw e; } } } log.info("espeak/mbrola generated {} bytes WAV", wavBytes.length); try (final ByteArrayInputStream wavIn = new ByteArrayInputStream(wavBytes); final ByteArrayOutputStream bos = new ByteArrayOutputStream(); final ByteArrayOutputStream err = new ByteArrayOutputStream()) { // flac.exe doesn't support mp3, and that's a problem for now (note: mp3 patent is expiring) final CommandLine cmdLine = new CommandLine(ffmpegExecutable); cmdLine.addArgument("-i"); cmdLine.addArgument("-"); // cmdLine.addArgument(wavFile.toString()); cmdLine.addArgument("-ar"); cmdLine.addArgument(String.valueOf(SAMPLE_RATE)); cmdLine.addArgument("-ac"); cmdLine.addArgument("1"); cmdLine.addArgument("-f"); cmdLine.addArgument("ogg"); // without this you'll get FLAC instead, which browsers do not support cmdLine.addArgument("-acodec"); cmdLine.addArgument("libvorbis"); // cmdLine.addArgument("-y"); // happens, weird! // cmdLine.addArgument(oggFile.toString()); cmdLine.addArgument("-"); executor.setStreamHandler(new PumpStreamHandler(bos, err, wavIn)); final int executed; try { executed = executor.execute(cmdLine); } finally { log.info("{}: {}", cmdLine, err.toString()); } // Preconditions.checkState(oggFile.exists(), "Cannot convert // %s bytes WAV to OGG", // wavBytes.length); // Send // final byte[] audioContent = // FileUtils.readFileToByteArray(oggFile); final byte[] audioContent = bos.toByteArray(); final String audioContentType = "audio/ogg"; final AudioObject audioObject = new AudioObject(); audioObject.setTranscript(communicateAction.getObject()); audioObject.setInLanguage(lang); audioObject.setMediaLayer(MediaLayer.SPEECH); audioObject.setContentType(audioContentType + "; rate=" + SAMPLE_RATE); audioObject.setContentUrl( "data:" + audioContentType + ";base64," + Base64.encodeBase64String(audioContent)); audioObject.setContentSize((long) audioContent.length); // // audioObject.setName(FilenameUtils.getName(oggFile.getName())); audioObject.setName("lumen-speech-" + new DateTime() + ".ogg"); audioObject.setDateCreated(new DateTime()); audioObject.setDatePublished(audioObject.getDateCreated()); audioObject.setDateModified(audioObject.getDateCreated()); audioObject.setUploadDate(audioObject.getDateCreated()); final String audioOutUri = "rabbitmq://dummy/amq.topic?connectionFactory=#amqpConnFactory&exchangeType=topic&autoDelete=false&skipQueueDeclare=true&routingKey=" + AvatarChannel.AUDIO_OUT.key(avatarId); log.info("Sending {} to {} ...", audioObject, audioOutUri); producer.sendBodyAndHeader( audioOutUri, toJson.getMapper().writeValueAsBytes(audioObject), RabbitMQConstants.EXPIRATION, String.valueOf(MESSAGE_EXPIRATION.getMillis())); } } finally { // oggFile.delete(); // wavFile.delete(); } // reply log.trace("Exchange {} is {}", exchange.getIn().getMessageId(), exchange.getPattern()); final Status status = new Status(); exchange.getOut().setBody(status); return status; // final String replyTo = exchange.getIn().getHeader("rabbitmq.REPLY_TO", // String.class); // if (replyTo != null) { // log.debug("Sending reply to {} ...", replyTo); // exchange.getOut().setHeader("rabbitmq.ROUTING_KEY", replyTo); // exchange.getOut().setHeader("rabbitmq.EXCHANGE_NAME", ""); // exchange.getOut().setHeader("recipients", // // "rabbitmq://dummy/dummy?connectionFactory=#amqpConnFactory&autoDelete=false,log:OUT." + // LumenChannel.SPEECH_SYNTHESIS); // } else { // exchange.getOut().setHeader("recipients"); // } }