plume-asr/plume/utils/tts.py

from logging import getLogger
from plume.utils import lazy_module


from pathlib import Path

import typer

# The Google client module is bound through lazy_module instead of a direct
# `from google.cloud import texttospeech` import.
# NOTE(review): presumably this defers the real import until first use --
# confirm against plume.utils.lazy_module.
texttospeech = lazy_module('google.cloud.texttospeech')

# Module logger, registered under the fixed name "googletts".
LOGGER = getLogger("googletts")

# Typer application that the CLI command defined below is registered on.
app = typer.Typer()


class GoogleTTS:
    """Small wrapper around the Google Cloud Text-to-Speech client.

    The wrapper talks to the client library through its ``types`` and
    ``enums`` namespaces and always requests LINEAR16-encoded audio.

    Synthesis methods take a ``params`` dict with these keys:

    * ``"language"`` (required): passed as the voice ``language_code``.
    * ``"name"`` (required): passed as the voice ``name``.
    * ``"sample_rate"`` (required): passed as ``sample_rate_hertz``.
    * ``"speaking_rate"`` (optional): set on the audio config when present.
    * ``"pitch"`` (optional): set on the audio config when present.

    Extra keys (such as the ``"gender"`` / ``"engine"`` entries produced by
    :meth:`voice_list`) are ignored.
    """

    # Optional ``params`` keys copied verbatim onto the AudioConfig.
    _OPTIONAL_AUDIO_PARAMS = ("speaking_rate", "pitch")

    def __init__(self):
        """Create the underlying ``TextToSpeechClient``."""
        self.client = texttospeech.TextToSpeechClient()

    def _synthesize(self, tts_input, params: dict) -> bytes:
        """Send one synthesis request and return the raw audio bytes.

        Shared by :meth:`text_to_speech` and :meth:`ssml_to_speech` so that
        both build the voice and audio configuration identically.

        Args:
            tts_input: An already-built ``SynthesisInput`` (text or SSML).
            params: Voice/audio settings; see the class docstring.

        Returns:
            The ``audio_content`` field of the synthesis response.

        Raises:
            KeyError: If a required key is missing from ``params``.
        """
        voice = texttospeech.types.VoiceSelectionParams(
            language_code=params["language"], name=params["name"]
        )
        audio_config = texttospeech.types.AudioConfig(
            audio_encoding=texttospeech.enums.AudioEncoding.LINEAR16,
            sample_rate_hertz=params["sample_rate"],
        )
        # Only override the optional settings when the caller supplied them.
        for option in self._OPTIONAL_AUDIO_PARAMS:
            if option in params:
                setattr(audio_config, option, params[option])
        response = self.client.synthesize_speech(tts_input, voice, audio_config)
        return response.audio_content

    def text_to_speech(self, text: str, params: dict) -> bytes:
        """Synthesize plain ``text`` and return the audio bytes.

        Args:
            text: Plain text to speak.
            params: Voice/audio settings; see the class docstring.

        Returns:
            The synthesized audio as bytes.
        """
        tts_input = texttospeech.types.SynthesisInput(text=text)
        return self._synthesize(tts_input, params)

    def ssml_to_speech(self, text: str, params: dict) -> bytes:
        """Synthesize SSML markup and return the audio bytes.

        Optional ``"speaking_rate"`` and ``"pitch"`` entries in ``params``
        are honoured here exactly as in :meth:`text_to_speech`.

        Args:
            text: SSML document to speak.
            params: Voice/audio settings; see the class docstring.

        Returns:
            The synthesized audio as bytes.
        """
        tts_input = texttospeech.types.SynthesisInput(ssml=text)
        return self._synthesize(tts_input, params)

    @classmethod
    def voice_list(cls):
        """Return descriptions of every voice that supports English.

        A fresh instance is created to obtain a client, the available voices
        are requested, and voices without any language code starting with
        ``"en"`` are skipped.

        Returns:
            A list of dicts with the keys ``"name"``, ``"language"``,
            ``"gender"``, ``"engine"`` and ``"sample_rate"``.
        """
        client = cls().client

        listing = client.list_voices()
        results = []
        for voice in listing.voices:
            english_codes = [
                code for code in voice.language_codes if code.startswith("en")
            ]
            if not english_codes:
                continue

            gender = texttospeech.enums.SsmlVoiceGender(voice.ssml_gender)
            results.append(
                {
                    "name": voice.name,
                    # NOTE(review): several English codes are comma-joined
                    # here; if this dict is later used as ``params`` the
                    # joined string becomes the ``language_code`` -- confirm
                    # that is acceptable for multi-locale voices.
                    "language": ",".join(english_codes),
                    "gender": gender.name,
                    # Voice names containing "Wav" are labelled as wavenet.
                    "engine": "wavenet" if "Wav" in voice.name else "standard",
                    "sample_rate": voice.natural_sample_rate_hertz,
                }
            )
        return results

    @classmethod
    def voice_by_name(cls, name):
        """Return the voice description whose ``"name"`` equals ``name``.

        Args:
            name: Exact voice name to look up among :meth:`voice_list`.

        Returns:
            The matching voice dict, as produced by :meth:`voice_list`.

        Raises:
            ValueError: If no listed voice has that name.
        """
        for voice in cls.voice_list():
            if voice['name'] == name:
                return voice
        raise ValueError(f'{name} not a valid voice')


@app.command()
def generate_audio_file(
    text: str,
    dest_path: Path = Path("./tts_audio.wav"),
    voice: str = "en-US-Wavenet-D",
):
    """Synthesize TEXT with a Google TTS voice and write the audio to a file.

    The voice is looked up by exact name among the voices reported by
    GoogleTTS; its description is then used as the synthesis parameters.

    Args:
        text: Plain text to speak.
        dest_path: File the audio bytes are written to (overwritten if present).
        voice: Exact name of the voice to use.

    Raises:
        ValueError: If ``voice`` is not one of the listed voices.
    """
    tts = GoogleTTS()
    # voice_by_name raises a descriptive ValueError for an unknown voice
    # instead of the bare IndexError an empty list lookup would produce.
    selected_voice = tts.voice_by_name(voice)
    wav_data = tts.text_to_speech(text, selected_voice)
    # Coerce so that direct Python callers may pass a plain string path.
    Path(dest_path).write_bytes(wav_data)


def main() -> None:
    """Run the Typer command-line application defined in this module."""
    app()


# Start the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()