{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib\n",
"#%matplotlib inline\n",
"import matplotlib.pylab as plt\n",
"\n",
"import IPython.display as ipd\n",
"\n",
"import sys\n",
"sys.path.append('waveglow/')\n",
"import numpy as np\n",
"import torch\n",
"\n",
"from hparams import create_hparams\n",
"from model import Tacotron2\n",
"from layers import TacotronSTFT, STFT\n",
"from audio_processing import griffin_lim\n",
"from train import load_model\n",
"from text import text_to_sequence\n",
"from denoiser import Denoiser"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"hparams = create_hparams()\n",
"hparams.sampling_rate = 22050\n",
"checkpoint_path = \"checkpoint_15000\"\n",
"model = load_model(hparams)\n",
"model.load_state_dict(torch.load(checkpoint_path, map_location = 'cpu')['state_dict']) #added map_location = 'cpu'\n",
"_ = model.eval() #it was originally model.cuda().eval().half()\n",
"waveglow_path = 'waveglow_256channels.pt'\n",
"waveglow = torch.load(waveglow_path, map_location = 'cpu')['model'] #added map_location = 'cpu'\n",
"waveglow.eval() #originally waveglow.cuda().eval().half()\n",
"for k in waveglow.convinv:\n",
" k.float()\n",
"#denoiser = Denoiser(waveglow)\n"
]
},
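  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: a Griffin-Lim fallback vocoder, which is why griffin_lim and\n",
    "# TacotronSTFT are imported above. A rough sketch following the NVIDIA\n",
    "# Tacotron 2 repo; quality is well below WaveGlow, but it needs no\n",
    "# pretrained vocoder.\n",
    "def griffin_lim_vocode(mel_outputs_postnet, n_iters=60):\n",
    "    taco_stft = TacotronSTFT(\n",
    "        hparams.filter_length, hparams.hop_length, hparams.win_length,\n",
    "        sampling_rate=hparams.sampling_rate)\n",
    "    # Undo the log compression, then project mels back to a linear spectrogram.\n",
    "    mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)\n",
    "    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()\n",
    "    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)\n",
    "    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0) * 1000\n",
    "    return griffin_lim(spec_from_mel[:, :, :-1], taco_stft.stft_fn, n_iters)"
   ]
  },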
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def convert(array):\n",
" sf.write('sample.wav', array, 22050)\n",
" os.system('ffmpeg -i {0} -filter:a \"atempo=0.80\" {1}'.format('sample.wav', 'sample0.wav'))\n",
" #os.system('ffmpeg -i {0} -ar 8000 {1}'.format('sample0.wav', 'sample1.wav'))\n",
" data, rate = sf.read('sample0.wav')\n",
" os.remove('sample.wav')\n",
" os.remove('sample0.wav')\n",
" #os.remove('sample1.wav')\n",
" return data"
]
},
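  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional, more robust variant of convert() (a sketch): subprocess.run\n",
    "# raises on ffmpeg failure instead of silently continuing, and a temporary\n",
    "# directory avoids filename collisions between concurrent calls.\n",
    "import subprocess\n",
    "import tempfile\n",
    "\n",
    "def convert_checked(array, rate=22050, tempo=0.80):\n",
    "    with tempfile.TemporaryDirectory() as tmp:\n",
    "        src = os.path.join(tmp, 'in.wav')\n",
    "        dst = os.path.join(tmp, 'out.wav')\n",
    "        sf.write(src, array, rate)\n",
    "        subprocess.run(\n",
    "            ['ffmpeg', '-y', '-i', src, '-filter:a', 'atempo={}'.format(tempo), dst],\n",
    "            check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)\n",
    "        data, _ = sf.read(dst)\n",
    "    return data"
   ]
  },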
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def speech(t):\n",
" start = time.time()\n",
" text = t\n",
" sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]\n",
" print(sequence)\n",
" sequence = torch.autograd.Variable(\n",
" torch.from_numpy(sequence)).long() #originally torch.from_numpy(sequence)).cuda().long()\n",
" mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)\n",
" with torch.no_grad():\n",
" audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)\n",
" #audio_denoised = denoiser(audio, strength=0.01)[:, 0]\n",
" data = convert(audio[0].data.cpu().numpy())\n",
" #os.system('ffmpeg -i {0} -filter:a \"atempo=0.85\" {1}'.format('harvard_inference/audio/'+str(i)+'.wav', 'harvard_inference/audio_0.85/'+str(i)+'.wav'))\n",
" aud = ipd.Audio(data, rate=22050)\n",
" end = time.time()\n",
" print(end-start)\n",
" return aud"
]
},
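  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sanity check (a sketch, not part of the original notebook): run\n",
    "# inference on a short utterance and plot the mel spectrogram and the\n",
    "# encoder-decoder alignment, which should form a roughly diagonal band.\n",
    "sequence = torch.from_numpy(\n",
    "    np.array(text_to_sequence('Hello world.', ['english_cleaners']))[None, :]).long()\n",
    "with torch.no_grad():\n",
    "    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)\n",
    "fig, axes = plt.subplots(1, 2, figsize=(16, 4))\n",
    "axes[0].imshow(mel_outputs_postnet[0].cpu().numpy(), aspect='auto', origin='lower')\n",
    "axes[0].set_title('mel spectrogram (postnet)')\n",
    "axes[1].imshow(alignments[0].cpu().numpy().T, aspect='auto', origin='lower')\n",
    "axes[1].set_title('alignment')\n",
    "plt.show()"
   ]
  },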
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"speech('I understand your frustration and disappointment. I am sorry that its happening and I would like to help prevent it in the future. What style of diapers did you buy? For instance, was it the snugglers, pull ups or baby dry.')"
]
},
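  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Usage sketch (hypothetical prompts): synthesize a few utterances and\n",
    "# display the audio widgets together for comparison.\n",
    "prompts = [\n",
    "    'Thank you for calling.',\n",
    "    'How may I help you today?',\n",
    "]\n",
    "for p in prompts:\n",
    "    ipd.display(speech(p))"
   ]
  },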
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from final import display,speech,play_device\n",
"import pyaudio"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"15.046638011932373\n"
]
}
],
"source": [
"data = speech('Thank you for calling Huggies. How may I help you today .')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"display(data)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"def play_device(data):\n",
" audio_interface = pyaudio.PyAudio()\n",
" _audio_stream = audio_interface.open(format=pyaudio.paInt16,channels=1, rate=16000,output=True)\n",
" _audio_stream.write(data.tostring())\n",
"# _audio_stream.close()\n",
"play_device(data)"
]
},
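  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alternatively, write the samples to disk instead of streaming them to a\n",
    "# device. The 16 kHz rate mirrors the play_device() configuration above;\n",
    "# adjust it to match what final.speech actually returns.\n",
    "sf.write('reply.wav', data, 16000)"
   ]
  }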
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}