{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tacotron 2 + WaveGlow text-to-speech (CPU inference)\n",
    "\n",
    "Loads a Tacotron 2 checkpoint and a WaveGlow vocoder on the CPU, synthesises speech from text, changes its tempo with ffmpeg's `atempo` filter and plays the result."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import subprocess\n",
    "import sys\n",
    "import tempfile\n",
    "import time\n",
    "\n",
    "import matplotlib\n",
    "#%matplotlib inline\n",
    "import matplotlib.pylab as plt\n",
    "import numpy as np\n",
    "import soundfile as sf\n",
    "import torch\n",
    "\n",
    "import IPython.display as ipd\n",
    "\n",
    "# The WaveGlow sources (e.g. `denoiser`) are imported from this sub-directory.\n",
    "sys.path.append('waveglow/')\n",
    "\n",
    "from hparams import create_hparams\n",
    "from model import Tacotron2\n",
    "from layers import TacotronSTFT, STFT\n",
    "from audio_processing import griffin_lim\n",
    "from train import load_model\n",
    "from text import text_to_sequence\n",
    "from denoiser import Denoiser"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration and model loading"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sampling rate (Hz) shared by the hyper-parameters, the intermediate WAV files\n",
    "# and the inline audio player.\n",
    "SAMPLING_RATE = 22050\n",
    "\n",
    "hparams = create_hparams()\n",
    "hparams.sampling_rate = SAMPLING_RATE\n",
    "\n",
    "# NOTE: torch.load unpickles arbitrary Python objects -- only load checkpoints\n",
    "# from a trusted source. map_location='cpu' was added so the checkpoints can be\n",
    "# loaded without a GPU.\n",
    "checkpoint_path = 'checkpoint_15000'\n",
    "model = load_model(hparams)\n",
    "model.load_state_dict(torch.load(checkpoint_path, map_location='cpu')['state_dict'])\n",
    "_ = model.eval()  # it was originally model.cuda().eval().half()\n",
    "\n",
    "waveglow_path = 'waveglow_256channels.pt'\n",
    "waveglow = torch.load(waveglow_path, map_location='cpu')['model']\n",
    "waveglow.eval()  # originally waveglow.cuda().eval().half()\n",
    "for k in waveglow.convinv:\n",
    "    k.float()\n",
    "#denoiser = Denoiser(waveglow)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Synthesis helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def convert(array, rate=SAMPLING_RATE, tempo=0.80):\n",
    "    \"\"\"Change the tempo of a waveform with ffmpeg's ``atempo`` filter.\n",
    "\n",
    "    The samples are written to a temporary WAV file, passed through\n",
    "    ``ffmpeg -filter:a atempo=<tempo>`` and read back.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    array : array-like\n",
    "        Audio samples, in a layout accepted by ``soundfile.write``.\n",
    "    rate : int, optional\n",
    "        Sampling rate in Hz used for the intermediate WAV file.\n",
    "    tempo : float, optional\n",
    "        Factor handed to ``atempo``; the default of 0.80 slows the audio down.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    numpy.ndarray\n",
    "        The processed samples as returned by ``soundfile.read``.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    FileNotFoundError\n",
    "        If the ``ffmpeg`` executable cannot be found.\n",
    "    subprocess.CalledProcessError\n",
    "        If ffmpeg exits with a non-zero status.\n",
    "    \"\"\"\n",
    "    # A private temporary directory avoids clobbering (or tripping over stale)\n",
    "    # files in the working directory and is removed even when ffmpeg fails.\n",
    "    with tempfile.TemporaryDirectory() as workdir:\n",
    "        src = os.path.join(workdir, 'sample.wav')\n",
    "        dst = os.path.join(workdir, 'sample0.wav')\n",
    "        sf.write(src, array, rate)\n",
    "        # Argument list (no shell) + check=True: a failed ffmpeg run raises here\n",
    "        # instead of surfacing later as a confusing read error.\n",
    "        subprocess.run(\n",
    "            ['ffmpeg', '-y', '-loglevel', 'error', '-i', src,\n",
    "             '-filter:a', 'atempo={}'.format(tempo), dst],\n",
    "            check=True,\n",
    "        )\n",
    "        data, _ = sf.read(dst)\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def speech(t, sigma=0.666):\n",
    "    \"\"\"Synthesise speech for the text ``t`` and return an inline audio player.\n",
    "\n",
    "    The text is encoded with ``text_to_sequence`` (``english_cleaners``), turned\n",
    "    into a mel spectrogram by ``model.inference``, vocoded by ``waveglow.infer``\n",
    "    and finally passed through ``convert``. The elapsed wall-clock time in\n",
    "    seconds is printed.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    t : str\n",
    "        Text to synthesise.\n",
    "    sigma : float, optional\n",
    "        ``sigma`` argument forwarded to ``waveglow.infer``.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    IPython.display.Audio\n",
    "        Player for the synthesised audio at ``SAMPLING_RATE`` Hz.\n",
    "    \"\"\"\n",
    "    start = time.perf_counter()\n",
    "    # [None, :] adds the leading batch axis.\n",
    "    sequence = np.array(text_to_sequence(t, ['english_cleaners']))[None, :]\n",
    "    sequence = torch.from_numpy(sequence).long()  # originally .cuda().long()\n",
    "    # Inference only: run both networks under no_grad so that no autograd graph\n",
    "    # is recorded.\n",
    "    with torch.no_grad():\n",
    "        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)\n",
    "        audio = waveglow.infer(mel_outputs_postnet, sigma=sigma)\n",
    "    #audio_denoised = denoiser(audio, strength=0.01)[:, 0]\n",
    "    data = convert(audio[0].detach().cpu().numpy(), rate=SAMPLING_RATE)\n",
    "    aud = ipd.Audio(data, rate=SAMPLING_RATE)\n",
    "    print(time.perf_counter() - start)\n",
    "    return aud"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "speech('I understand your frustration and disappointment. I am sorry that its happening and I would like to help prevent it in the future. What style of diapers did you buy? For instance, was it the snugglers, pull ups or baby dry.')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Playback"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: this rebinds `speech` (together with `display` and `play_device`) to the\n",
    "# objects exported by the project-local `final` module; from here on `speech`\n",
    "# no longer refers to the function defined earlier in this notebook.\n",
    "from final import display,speech,play_device\n",
    "import pyaudio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "15.046638011932373\n"
     ]
    }
   ],
   "source": [
    "data = speech('Thank you for calling Huggies. How may I help you today .')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       " \n",
       " "
      ],
      "text/plain": [
       ""
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "display(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def play_device(data, rate=16000):\n",
    "    \"\"\"Play mono 16-bit PCM audio on the default output device via PyAudio.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data : numpy.ndarray\n",
    "        Samples to play. Floating-point input is assumed to lie in [-1, 1] and\n",
    "        is converted to int16; any other dtype is written to the stream as is.\n",
    "    rate : int, optional\n",
    "        Playback sampling rate in Hz.\n",
    "        NOTE(review): the synthesis cells above work at 22050 Hz -- confirm that\n",
    "        `final.speech` really returns 16 kHz audio, otherwise pass the rate.\n",
    "    \"\"\"\n",
    "    samples = np.asarray(data)\n",
    "    if np.issubdtype(samples.dtype, np.floating):\n",
    "        # The stream is opened as paInt16, so float samples have to be rescaled\n",
    "        # to 16-bit integers; their raw bytes would be played back as noise.\n",
    "        samples = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)\n",
    "\n",
    "    audio_interface = pyaudio.PyAudio()\n",
    "    try:\n",
    "        stream = audio_interface.open(format=pyaudio.paInt16, channels=1, rate=rate, output=True)\n",
    "        try:\n",
    "            # tobytes() replaces ndarray.tostring(), which is deprecated and was\n",
    "            # removed in NumPy 2.0.\n",
    "            stream.write(samples.tobytes())\n",
    "            # Stop before closing so that buffered audio is played out instead\n",
    "            # of being discarded by close().\n",
    "            stream.stop_stream()\n",
    "        finally:\n",
    "            stream.close()\n",
    "    finally:\n",
    "        # Always release the PortAudio resources, even if playback failed.\n",
    "        audio_interface.terminate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "play_device(data)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}