mirror of https://github.com/malarinv/tacotron2
149 lines
4.1 KiB
Python
149 lines
4.1 KiB
Python
# -*- coding: utf-8 -*-
|
|
import torch
|
|
import numpy as np
|
|
from scipy.signal import get_window
|
|
import librosa.util as librosa_util
|
|
from librosa import resample
|
|
from librosa.effects import time_stretch
|
|
|
|
|
|
def window_sumsquare(
    window,
    n_frames,
    hop_length=200,
    win_length=800,
    n_fft=800,
    dtype=np.float32,
    norm=None,
):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.

    This is used to estimate modulation effects induced by windowing
    observations in short-time fourier transforms.

    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`

    n_frames : int > 0
        The number of analysis frames

    hop_length : int > 0
        The number of samples to advance between frames

    win_length : int > 0 or None [optional]
        The length of the window function (default 800). Passing `None`
        makes it match `n_fft`.

    n_fft : int > 0
        The length of each analysis frame.

    dtype : np.dtype
        The data type of the output

    norm : [optional]
        Forwarded unchanged as the `norm` argument of
        `librosa.util.normalize`, which is applied to the window before
        it is squared.

    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
    # `size` is passed by keyword on purpose: newer librosa releases make it
    # keyword-only, while older releases accept the keyword form as well.
    win_sq = librosa_util.pad_center(win_sq, size=n_fft)

    # Fill the envelope: overlap-add the squared window once per frame,
    # truncating the slice so it never runs past the end of `x`.
    for i in range(n_frames):
        sample = i * hop_length
        x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
    return x
|
|
|
|
|
|
def griffin_lim(magnitudes, stft_fn, n_iters=30):
    """
    Estimate a signal from spectrogram magnitudes by iterative phase refinement.

    Starts from a random phase, inverts to a signal, and then repeatedly
    re-analyses that signal to obtain a new phase estimate while keeping the
    given magnitudes fixed.

    PARAMS
    ------
    magnitudes: spectrogram magnitudes (torch tensor)
    stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods;
        `transform(signal)` must return a `(magnitudes, angles)` pair and
        `inverse(magnitudes, angles)` a tensor whose dimension 1 is squeezed
        away here
    n_iters: number of transform/inverse refinement rounds run after the
        initial inversion

    RETURNS
    -------
    The signal produced by the final `stft_fn.inverse` call, with
    dimension 1 squeezed.
    """
    # Random initial phase, wrapped by np.angle. NumPy's generator is kept as
    # the random source so callers seeding np.random get reproducible output.
    angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
    angles = angles.astype(np.float32)
    # Place the phase on the same device as the magnitudes; otherwise a
    # non-CPU `magnitudes` tensor cannot be combined with it in `inverse`.
    angles = torch.from_numpy(angles).to(magnitudes.device)
    signal = stft_fn.inverse(magnitudes, angles).squeeze(1)

    for _ in range(n_iters):
        # Keep only the re-estimated phase; the target magnitudes stay fixed.
        _, angles = stft_fn.transform(signal)
        signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
    return signal
|
|
|
|
|
|
def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """
    Log-compress `x` after flooring it at `clip_val`.

    PARAMS
    ------
    x: input tensor
    C: compression factor, multiplied in before the logarithm
    clip_val: lower bound applied to `x` so the logarithm never sees values
        below it
    """
    floored = torch.clamp(x, min=clip_val)
    scaled = floored * C
    return torch.log(scaled)
|
|
|
|
|
|
def dynamic_range_decompression(x, C=1):
    """
    Undo the log compression: exponentiate `x`, then divide by `C`.

    PARAMS
    ------
    x: log-compressed tensor
    C: compression factor used to compress
    """
    expanded = torch.exp(x)
    return expanded / C
|
|
|
|
|
|
# adapted from
# https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
def float2pcm(sig, dtype="int16"):
    """Convert floating point signal with a range from -1 to 1 to PCM.

    Any signal values outside the interval [-1.0, 1.0) are clipped.
    No dithering is used.
    Note that there are different possibilities for scaling floating
    point numbers to PCM numbers, this function implements just one of
    them.  For an overview of alternatives see
    http://blog.bjornroche.com/2009/12/int-float-int-its-jungle-out-there.html

    Parameters
    ----------
    sig : array_like
        Input array, must have floating point type.
    dtype : data type, optional
        Desired (integer) data type.

    Returns
    -------
    numpy.ndarray
        Integer data, scaled and clipped to the range of the given
        *dtype*.

    Raises
    ------
    TypeError
        If *sig* is not a floating point array or *dtype* is not an
        integer type.
    """
    sig = np.asarray(sig)
    if sig.dtype.kind != "f":
        raise TypeError("'sig' must be a float array")
    dtype = np.dtype(dtype)
    if dtype.kind not in "iu":
        raise TypeError("'dtype' must be an integer type")

    info = np.iinfo(dtype)
    abs_max = 2 ** (info.bits - 1)
    # Unsigned targets are shifted so that 0.0 maps to mid-scale.
    offset = info.min + abs_max
    scaled = np.asarray(sig * abs_max + offset)

    # The largest integer may not be representable in the float dtype (for
    # example int32 max in float32); it then rounds UP to 2**(bits - 1), and
    # clipping to that float followed by an integer cast would overflow.
    # Samples at or above the float image of the maximum are therefore
    # saturated explicitly instead of being cast.
    upper = scaled.dtype.type(info.max)
    saturated = scaled >= upper
    # Everything left is strictly below `upper`; after flooring at the
    # minimum (always exactly representable) it is safe to cast.
    in_range = np.where(saturated, 0, np.maximum(scaled, info.min))
    return np.where(saturated, dtype.type(info.max), in_range.astype(dtype))
|
|
|
|
|
|
def postprocess_audio(audio, tempo=0.8, src_rate=22050, dst_rate=16000):
    """Time-stretch, resample and PCM-encode a floating point signal.

    Parameters
    ----------
    audio : array_like
        Floating point signal handed to `librosa.effects.time_stretch`.
    tempo : float
        Stretch rate passed to `time_stretch`.
    src_rate : int
        Sampling rate of `audio`.
    dst_rate : int
        Sampling rate of the returned audio.

    Returns
    -------
    bytes
        The raw bytes of the array produced by `float2pcm` with its
        default dtype.
    """
    # Keyword arguments are used because newer librosa releases accept
    # `rate`, `orig_sr` and `target_sr` by keyword only.
    slow_data = time_stretch(audio, rate=tempo)
    # Honour `src_rate` rather than a hard-coded 22050 so callers with a
    # different input sampling rate are resampled correctly.
    float_data = resample(slow_data, orig_sr=src_rate, target_sr=dst_rate)
    data = float2pcm(float_data)
    return data.tobytes()
|