# Listing metadata: 110 lines, 3.4 KiB, Python
from logging import getLogger
|
|
from plume.utils import lazy_module
|
|
|
|
|
|
from pathlib import Path
|
|
|
|
import typer
|
|
|
|
# Bound through ``lazy_module`` rather than a direct
# ``from google.cloud import texttospeech`` import.
# NOTE(review): presumably this defers loading the Google package until it is
# first used -- confirm against plume.utils.lazy_module.
texttospeech = lazy_module('google.cloud.texttospeech')

LOGGER = getLogger("googletts")

# Typer application object; CLI commands in this module register on it.
app = typer.Typer()
|
|
|
|
|
|
class GoogleTTS(object):
    """Small wrapper around the Google Cloud Text-to-Speech client.

    The synthesis methods take a ``params`` dict with the required keys
    ``"language"``, ``"name"`` and ``"sample_rate"`` and the optional keys
    ``"speaking_rate"`` and ``"pitch"``.  They request LINEAR16 audio and
    return the ``audio_content`` of the API response.
    """

    # Optional ``params`` keys that are copied onto the AudioConfig when given.
    _OPTIONAL_AUDIO_FIELDS = ("speaking_rate", "pitch")

    def __init__(self):
        """Create the underlying ``TextToSpeechClient``."""
        self.client = texttospeech.TextToSpeechClient()

    def _synthesize(self, tts_input, params: dict) -> bytes:
        """Send one synthesis request for an already-built ``SynthesisInput``.

        Shared by ``text_to_speech`` and ``ssml_to_speech`` so that both apply
        the same voice selection and audio configuration.

        Raises:
            KeyError: if ``params`` lacks "language", "name" or "sample_rate".
        """
        # NOTE(review): entries from ``voice_list`` comma-join several language
        # codes into "language"; presumably the API expects a single code here
        # -- confirm for voices that list more than one.
        voice = texttospeech.types.VoiceSelectionParams(
            language_code=params["language"], name=params["name"]
        )
        audio_config = texttospeech.types.AudioConfig(
            audio_encoding=texttospeech.enums.AudioEncoding.LINEAR16,
            sample_rate_hertz=params["sample_rate"],
        )
        # Only override the optional settings the caller actually supplied.
        for field in self._OPTIONAL_AUDIO_FIELDS:
            if field in params:
                setattr(audio_config, field, params[field])
        response = self.client.synthesize_speech(tts_input, voice, audio_config)
        return response.audio_content

    def text_to_speech(self, text: str, params: dict) -> bytes:
        """Synthesize plain ``text`` and return the resulting audio content."""
        tts_input = texttospeech.types.SynthesisInput(text=text)
        return self._synthesize(tts_input, params)

    def ssml_to_speech(self, text: str, params: dict) -> bytes:
        """Synthesize SSML markup ``text`` and return the resulting audio content.

        Optional "speaking_rate" and "pitch" entries in ``params`` are applied
        here exactly as in ``text_to_speech``.
        """
        tts_input = texttospeech.types.SynthesisInput(ssml=text)
        return self._synthesize(tts_input, params)

    @classmethod
    def voice_list(cls):
        """Return the available voices that support an English language code.

        A new instance (and therefore a new client) is created for the request.

        Returns:
            A list of dicts with the keys "name", "language" (the voice's
            ``en*`` language codes joined with ","), "gender", "engine"
            ("wavenet" when the voice name contains "Wav", else "standard")
            and "sample_rate".
        """
        client = cls().client

        # Performs the list voices request
        voices = client.list_voices()
        results = []
        for voice in voices.voices:
            english_codes = [
                code for code in voice.language_codes if code.startswith("en")
            ]
            if not english_codes:
                # Voices without any English language code are skipped.
                continue

            ssml_gender = texttospeech.enums.SsmlVoiceGender(voice.ssml_gender)
            results.append(
                {
                    "name": voice.name,
                    "language": ",".join(english_codes),
                    "gender": ssml_gender.name,
                    "engine": "wavenet" if "Wav" in voice.name else "standard",
                    "sample_rate": voice.natural_sample_rate_hertz,
                }
            )
        return results

    @classmethod
    def voice_by_name(cls, name):
        """Return the ``voice_list`` entry whose "name" equals ``name``.

        Raises:
            ValueError: if no listed voice has that name.
        """
        for voice in cls.voice_list():
            if voice['name'] == name:
                return voice
        raise ValueError(f'{name} not a valid voice')
|
|
|
|
|
|
@app.command()
def generate_audio_file(text, dest_path: Path = "./tts_audio.wav", voice="en-US-Wavenet-D"):
    """Synthesize TEXT with the given voice and write the audio to DEST_PATH.

    Args:
        text: Plain text to synthesize.
        dest_path: File the audio bytes are written to (opened in "wb" mode).
        voice: Name of the voice to use, as reported by ``GoogleTTS.voice_list``.

    Raises:
        ValueError: if ``voice`` is not among the listed voices.
    """
    tts = GoogleTTS()
    # Look the voice up through the shared helper so an unknown name fails
    # with a descriptive ValueError rather than a bare IndexError.
    selected_voice = tts.voice_by_name(voice)
    wav_data = tts.text_to_speech(text, selected_voice)
    # The default (and any direct Python caller) may supply a plain string,
    # which has no ``.open``; normalise to a Path before writing.
    with Path(dest_path).open("wb") as wf:
        wf.write(wav_data)
|
|
|
|
|
|
def main():
    """Entry point: invoke the Typer application defined in this module."""
    app()


# Start the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|