tacotron2/taco2/audio_processing.py

149 lines
4.1 KiB
Python

# -*- coding: utf-8 -*-
import torch
import numpy as np
from scipy.signal import get_window
import librosa.util as librosa_util
from librosa import resample
from librosa.effects import time_stretch
def window_sumsquare(
    window,
    n_frames,
    hop_length=200,
    win_length=800,
    n_fft=800,
    dtype=np.float32,
    norm=None,
):
    """Compute the sum-square envelope of a window function at a given hop length.

    Adapted from librosa 0.6. The envelope is used to estimate modulation
    effects induced by windowing observations in short-time Fourier
    transforms.

    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as accepted by `scipy.signal.get_window`.
    n_frames : int > 0
        The number of analysis frames.
    hop_length : int > 0
        The number of samples to advance between frames.
    win_length : int > 0 or None
        The length of the window function. Pass ``None`` to use `n_fft`.
        Must not exceed `n_fft`.
    n_fft : int > 0
        The length of each analysis frame.
    dtype : np.dtype
        The data type of the output.
    norm : [optional]
        Forwarded to `librosa.util.normalize` to normalize the window before
        it is squared. ``None`` (the default) leaves the window as is.

    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function.

    Raises
    ------
    ValueError
        If `win_length` is larger than `n_fft`.
    """
    if win_length is None:
        win_length = n_fft
    if win_length > n_fft:
        raise ValueError(
            "win_length ({}) must not exceed n_fft ({})".format(win_length, n_fft)
        )

    total_len = n_fft + hop_length * (n_frames - 1)
    envelope = np.zeros(total_len, dtype=dtype)

    # Build the window at its own length first.
    win = get_window(window, win_length, fftbins=True)
    # norm=None means "no normalization", so librosa is only needed when a
    # norm was actually requested.
    if norm is not None:
        win = librosa_util.normalize(win, norm=norm)
    win_sq = win ** 2

    # Centre the squared window inside an n_fft-long frame. This is done with
    # NumPy directly because librosa's pad_center takes `size` as a
    # keyword-only argument in current releases, so the positional call fails.
    pad_total = n_fft - win_length
    pad_left = pad_total // 2
    win_sq = np.pad(win_sq, (pad_left, pad_total - pad_left), mode="constant")

    # Overlap-add the squared window at every frame position.
    for frame in range(n_frames):
        start = frame * hop_length
        stop = min(total_len, start + n_fft)
        envelope[start:stop] += win_sq[: max(0, stop - start)]
    return envelope
def griffin_lim(magnitudes, stft_fn, n_iters=30):
    """Estimate a signal from spectrogram magnitudes by iterative phase refinement.

    Starts from a random phase, then repeatedly inverts the spectrogram and
    re-analyses the result, keeping `magnitudes` fixed and replacing only the
    phase with the one returned by the analysis step.

    PARAMS
    ------
    magnitudes: spectrogram magnitudes (a torch tensor; ``size()`` and
        ``device`` are read from it)
    stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods.
        ``transform(signal)`` must return a pair whose second item is the
        phase; ``inverse(magnitudes, angles)`` must return a tensor with a
        singleton dimension 1, which is squeezed away.
    n_iters: number of refinement iterations; 0 returns the signal obtained
        from the random initial phase.

    RETURNS
    -------
    The signal produced by the last ``stft_fn.inverse`` call, with
    dimension 1 squeezed.
    """
    # Random initial phase; np.angle folds it into (-pi, pi].
    phase = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
    # Keep the phase on the same device as the magnitudes so the inverse
    # transform does not mix CPU and GPU tensors.
    angles = torch.from_numpy(phase.astype(np.float32)).to(magnitudes.device)

    signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
    for _ in range(n_iters):
        # Only the phase of the re-analysed signal is kept.
        _, angles = stft_fn.transform(signal)
        signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
    return signal
def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """Log-compress a tensor after clamping it away from zero.

    PARAMS
    ------
    x: tensor to compress
    C: compression factor; the clamped input is multiplied by it before
        the logarithm is taken
    clip_val: lower bound applied to `x` before the logarithm, so that
        zero or negative entries do not produce -inf/NaN

    RETURNS
    -------
    ``log(clamp(x, min=clip_val) * C)`` as a tensor.
    """
    clamped = torch.clamp(x, min=clip_val)
    return torch.log(clamped * C)
def dynamic_range_decompression(x, C=1):
    """Undo log compression: exponentiate and divide by the compression factor.

    PARAMS
    ------
    x: log-compressed tensor
    C: compression factor used to compress

    RETURNS
    -------
    ``exp(x) / C`` as a tensor.
    """
    expanded = torch.exp(x)
    return expanded / C
# adapted from
# https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
def float2pcm(sig, dtype="int16"):
    """Convert floating point signal with a range from -1 to 1 to PCM.

    Any signal values outside the interval [-1.0, 1.0) are clipped.
    No dithering is used.

    Note that there are different possibilities for scaling floating
    point numbers to PCM numbers, this function implements just one of
    them. For an overview of alternatives see
    http://blog.bjornroche.com/2009/12/int-float-int-its-jungle-out-there.html

    Parameters
    ----------
    sig : array_like
        Input array, must have floating point type.
    dtype : data type, optional
        Desired (integer) data type.

    Returns
    -------
    numpy.ndarray
        Integer data, scaled and clipped to the range of the given
        *dtype*.

    Raises
    ------
    TypeError
        If *sig* is not a floating point array or *dtype* is not an
        integer type.

    See Also
    --------
    pcm2float, dtype
    """
    sig = np.asarray(sig)
    if sig.dtype.kind != "f":
        raise TypeError("'sig' must be a float array")
    dtype = np.dtype(dtype)
    if dtype.kind not in ("i", "u"):
        raise TypeError("'dtype' must be an integer type")

    info = np.iinfo(dtype)
    half_range = 2 ** (info.bits - 1)
    # Signed targets are centred on 0, unsigned ones on the middle of the range.
    offset = float(info.min + half_range)
    lower = float(info.min)
    upper = float(info.max)
    # For 64-bit targets the integer maximum is not representable as a float
    # and rounds up; step down one float so the final cast cannot overflow.
    if upper > info.max:
        upper = float(np.nextafter(upper, -np.inf))

    # Scale in float64: in float32 the int32 upper bound rounds up to 2**31,
    # so a full-scale sample would overflow in the cast instead of saturating.
    scaled = sig.astype(np.float64) * float(half_range) + offset
    return scaled.clip(lower, upper).astype(dtype)
def postprocess_audio(audio, tempo=0.8, src_rate=22050, dst_rate=16000):
    """Time-stretch, resample and serialize audio as raw PCM bytes.

    Parameters
    ----------
    audio : np.ndarray
        Floating point audio signal, passed to `librosa.effects.time_stretch`.
    tempo : float
        Stretch rate handed to `time_stretch`.
    src_rate : int
        Sampling rate of `audio`.
    dst_rate : int
        Sampling rate of the returned data.

    Returns
    -------
    bytes
        The resampled signal converted by `float2pcm` with its default
        dtype, as returned by ``ndarray.tobytes()``.
    """
    # librosa takes these arguments as keyword-only in current releases;
    # the keyword names are also accepted by older versions.
    stretched = time_stretch(audio, rate=tempo)
    # Honour the caller's source rate instead of a hard-coded 22050.
    resampled = resample(stretched, orig_sr=src_rate, target_sr=dst_rate)
    return float2pcm(resampled).tobytes()