import io
import shutil
from pathlib import Path

import requests
import typer

from plume.utils import lazy_module

from .tts import GoogleTTS

# IPython.display and pydub are loaded lazily via plume's lazy_module helper.
# from IPython import display
display = lazy_module('IPython.display')
pydub = lazy_module('pydub')

app = typer.Typer()

# Start the Gentle alignment service with the following command:
# docker run --rm -d --name gentle_service -p 8765:8765/tcp lowerquality/gentle
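#
# A quick way to sanity-check the service from the shell (a sketch; assumes the
# container above is running and the sample files exist locally):
#   curl -F "audio=@audio.mp3" -F "transcript=@words.txt" \
#        "http://localhost:8765/transcriptions?async=false"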


def gentle_aligner(service_uri, wav_data, utter_text):
    """Send audio plus its transcript to a running Gentle service and return
    the audio segment together with the parsed alignment response."""
    # service_uri= "http://52.41.161.36:8765/transcriptions"
    wav_f = io.BytesIO(wav_data)
    wav_seg = pydub.AudioSegment.from_file(wav_f)

    # Export the in-memory wav as mp3 before uploading to keep the request small.
    mp3_f = io.BytesIO()
    wav_seg.export(mp3_f, format="mp3")
    mp3_f.seek(0)
    params = (("async", "false"),)
    files = {
        "audio": ("audio.mp3", mp3_f),
        "transcript": ("words.txt", io.BytesIO(utter_text.encode("utf-8"))),
    }

    response = requests.post(service_uri, params=params, files=files)
    print(f"Audio duration: {wav_seg.duration_seconds}s")
    print(f"Time taken to align: {response.elapsed}s")
    return wav_seg, response.json()
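
# The JSON Gentle returns for a successful alignment looks roughly like this
# (a sketch listing only the fields used below; the real payload carries more
# keys, such as "alignedWord" and character offsets):
#   {
#     "transcript": "...",
#     "words": [
#       {"word": "align", "start": 1.02, "end": 1.38, "case": "success"},
#       ...
#     ]
#   }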


def gentle_align_iter(service_uri, wav_data, utter_text):
    """Yield (word, audio_segment) pairs for every word Gentle aligned."""
    wav_seg, response = gentle_aligner(service_uri, wav_data, utter_text)
    # Gentle reports per-word alignments under the "words" key; entries it
    # could not locate in the audio carry no "start"/"end" timestamps.
    for span in response["words"]:
        if "start" not in span or "end" not in span:
            continue
        word_seg = wav_seg[int(span["start"] * 1000) : int(span["end"] * 1000)]
        word = span["word"]
        yield (word, word_seg)
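
# Example usage (a sketch; the file name and local service URI are illustrative):
#   for word, seg in gentle_align_iter(
#       "http://localhost:8765/transcriptions",
#       Path("utterance.wav").read_bytes(),
#       "text of the utterance",
#   ):
#       seg.export(f"{word}.wav", format="wav")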


def tts_jupyter(service_uri="http://localhost:8765/transcriptions"):
    """Synthesize a short utterance with GoogleTTS, align it with Gentle and
    preview each word segment in a Jupyter notebook."""
    # The default service_uri assumes the local Gentle container started with
    # the docker command near the top of this module.
    google_voices = GoogleTTS.voice_list()
    gtts = GoogleTTS()
    # google_voices[4]
    us_voice = [v for v in google_voices if v["language"] == "en-US"][0]
    utter_text = (
        "I would like to align the audio segments based on word level timestamps"
    )
    wav_data = gtts.text_to_speech(text=utter_text, params=us_voice)
    for word, seg in gentle_align_iter(service_uri, wav_data, utter_text):
        print(word)
        display.display(seg)


@app.command()
def gentle_preview(
    audio_path: Path,
    transcript_path: Path,
    service_uri="http://101.53.142.218:8765/transcriptions",
    gent_preview_dir="./gentle_preview",
):
    """Copy the bundled gentle_preview assets into gent_preview_dir."""
    from . import ExtendedPath

    # The static preview assets ship with the package, next to this module.
    pkg_gentle_dir = Path(__file__).parent / 'gentle_preview'

    shutil.copytree(str(pkg_gentle_dir), str(gent_preview_dir))
    # ab = audio_path.read_bytes()
    # tt = transcript_path.read_text()
    # audio, alignment = gentle_aligner(service_uri, ab, tt)
    # audio.export(gent_preview_dir / Path("a.wav"), format="wav")
    # alignment["status"] = "OK"
    # ExtendedPath(gent_preview_dir / Path("status.json")).write_json(alignment)


def main():
    app()


if __name__ == "__main__":
    main()