# -*- coding: utf-8 -*- import torch import numpy as np from scipy.signal import get_window import librosa.util as librosa_util from librosa import resample from librosa.effects import time_stretch def window_sumsquare( window, n_frames, hop_length=200, win_length=800, n_fft=800, dtype=np.float32, norm=None, ): """ # from librosa 0.6 Compute the sum-square envelope of a window function at a given hop length. This is used to estimate modulation effects induced by windowing observations in short-time fourier transforms. Parameters ---------- window : string, tuple, number, callable, or list-like Window specification, as in `get_window` n_frames : int > 0 The number of analysis frames hop_length : int > 0 The number of samples to advance between frames win_length : [optional] The length of the window function. By default, this matches `n_fft`. n_fft : int > 0 The length of each analysis frame. dtype : np.dtype The data type of the output Returns ------- wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` The sum-squared envelope of the window function """ if win_length is None: win_length = n_fft n = n_fft + hop_length * (n_frames - 1) x = np.zeros(n, dtype=dtype) # Compute the squared window at the desired length win_sq = get_window(window, win_length, fftbins=True) win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2 win_sq = librosa_util.pad_center(win_sq, n_fft) # Fill the envelope for i in range(n_frames): sample = i * hop_length x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))] return x def griffin_lim(magnitudes, stft_fn, n_iters=30): """ PARAMS ------ magnitudes: spectrogram magnitudes stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods """ angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) angles = angles.astype(np.float32) angles = torch.autograd.Variable(torch.from_numpy(angles)) signal = stft_fn.inverse(magnitudes, angles).squeeze(1) for i in range(n_iters): _, angles = stft_fn.transform(signal) signal = stft_fn.inverse(magnitudes, angles).squeeze(1) return signal def dynamic_range_compression(x, C=1, clip_val=1e-5): """ PARAMS ------ C: compression factor """ return torch.log(torch.clamp(x, min=clip_val) * C) def dynamic_range_decompression(x, C=1): """ PARAMS ------ C: compression factor used to compress """ return torch.exp(x) / C # adapted from # https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py def float2pcm(sig, dtype="int16"): """Convert floating point signal with a range from -1 to 1 to PCM. Any signal values outside the interval [-1.0, 1.0) are clipped. No dithering is used. Note that there are different possibilities for scaling floating point numbers to PCM numbers, this function implements just one of them. For an overview of alternatives see http://blog.bjornroche.com/2009/12/int-float-int-its-jungle-out-there.html Parameters ---------- sig : array_like Input array, must have floating point type. dtype : data type, optional Desired (integer) data type. Returns ------- numpy.ndarray Integer data, scaled and clipped to the range of the given *dtype*. See Also -------- pcm2float, dtype """ sig = np.asarray(sig) if sig.dtype.kind != "f": raise TypeError("'sig' must be a float array") dtype = np.dtype(dtype) if dtype.kind not in "iu": raise TypeError("'dtype' must be an integer type") i = np.iinfo(dtype) abs_max = 2 ** (i.bits - 1) offset = i.min + abs_max return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype) def postprocess_audio(audio, tempo=0.8, src_rate=22050, dst_rate=16000): slow_data = time_stretch(audio, tempo) float_data = resample(slow_data, 22050, dst_rate) data = float2pcm(float_data) return data.tobytes()