From e7fc607578be727ada09c508a5697be1b22679ad Mon Sep 17 00:00:00 2001 From: Malar Kannan Date: Wed, 22 Nov 2017 14:45:08 +0530 Subject: [PATCH] trying mfcc instead of spectrogram --- speech_data.py | 4 +++- speech_pitch.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++ speech_spectrum.py | 36 ++++++++++++++++++++---------------- 3 files changed, 69 insertions(+), 17 deletions(-) create mode 100644 speech_pitch.py diff --git a/speech_data.py b/speech_data.py index 48d2ad8..7bb9d7b 100644 --- a/speech_data.py +++ b/speech_data.py @@ -6,6 +6,7 @@ import tensorflow as tf from tensorflow.python.ops import data_flow_ops import numpy as np from speech_spectrum import generate_aiff_spectrogram +from speech_pitch import compute_mfcc from sklearn.model_selection import train_test_split import itertools import os @@ -63,7 +64,8 @@ def create_spectrogram_tfrecords(audio_group='audio',sample_count=0,train_test_r for (w, word_group) in word_group_prog: word_group_prog.set_postfix(word=w,sample_name=sample_name) g = word_group.reset_index() - g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram) + # g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram) + g['spectrogram'] = apply_by_multiprocessing(g['file_path'],compute_mfcc) sample_right = g.loc[g['variant'] == 'low'] sample_wrong = g.loc[g['variant'] == 'medium'] same, diff = siamese_pairs(sample_right, sample_wrong) diff --git a/speech_pitch.py b/speech_pitch.py new file mode 100644 index 0000000..ef76ee9 --- /dev/null +++ b/speech_pitch.py @@ -0,0 +1,46 @@ +import parselmouth as pm +from pysndfile import sndio as snd + +def pitch_array(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'): + samples, samplerate, _ = snd.read(sample_file) + sample_sound = pm.Sound(values=samples,sampling_frequency=samplerate) + sample_pitch = sample_sound.to_pitch() + return sample_pitch.to_matrix().as_array() + +def intensity_array(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'): + sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff' + samples, samplerate, _ = snd.read(sample_file) + sample_sound = pm.Sound(values=samples,sampling_frequency=samplerate) + sample_intensity = sample_sound.to_mfcc() + sample_intensity.as_array().shape + return sample_pitch.to_matrix().as_array() + +def compute_mfcc(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'): + # sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff' + samples, samplerate, _ = snd.read(sample_file) + sample_sound = pm.Sound(values=samples,sampling_frequency=samplerate) + sample_mfcc = sample_sound.to_mfcc() + # sample_mfcc.to_array().shape + return sample_mfcc.to_array() + +# sunflowers_vic_180_norm = pitch_array('outputs/audio/sunflowers-Victoria-180-normal-870.aiff') +# sunflowers_fred_180_norm = pitch_array('outputs/audio/sunflowers-Fred-180-normal-6515.aiff') +# sunflowers_vic_180_norm_mfcc = compute_mfcc('outputs/audio/sunflowers-Victoria-180-normal-870.aiff') +fred_180_norm_mfcc = compute_mfcc('outputs/audio/sunflowers-Fred-180-normal-6515.aiff') +alex_mfcc = compute_mfcc('outputs/audio/sunflowers-Alex-180-normal-4763.aiff') +# # sunflowers_vic_180_norm.shape +# # sunflowers_fred_180_norm.shape +# alex_mfcc.shape +# sunflowers_vic_180_norm_mfcc.shape +# sunflowers_fred_180_norm_mfcc.shape +from speech_spectrum import generate_aiff_spectrogram +vic_spec = generate_aiff_spectrogram('outputs/audio/sunflowers-Victoria-180-normal-870.aiff') +alex_spec = generate_aiff_spectrogram('outputs/audio/sunflowers-Alex-180-normal-4763.aiff') +alex150spec = generate_aiff_spectrogram('outputs/audio/sunflowers-Alex-150-normal-589.aiff') +vic_spec.shape +alex_spec.shape +alex150spec.shape +alex_mfcc.shape +fred_180_norm_mfcc.shape +# pm.SoundFileFormat +# pm.Pitch.get_number_of_frames() diff --git a/speech_spectrum.py b/speech_spectrum.py index 794586f..42c2154 100644 --- a/speech_spectrum.py +++ b/speech_spectrum.py @@ -1,5 +1,4 @@ #!/usr/bin/env python - """ This work is licensed under a Creative Commons Attribution 3.0 Unported License. Frank Zalkow, 2012-2013 @@ -16,15 +15,14 @@ from numpy.lib import stride_tricks STFT_WINDOWS_MSEC = 20 STFT_WINDOW_OVERLAP = 1.0 / 3 + def stft(sig, frameSize, overlapFac=0.5, window=np.hanning): win = window(frameSize) hopSize = int(frameSize - np.floor(overlapFac * frameSize)) # zeros at beginning (thus center of 1st window should be for sample nr. 0) # sig = (sig*255).astype(np.uint8) - # import pdb;pdb.set_trace() count = int(np.floor(frameSize / 2.0)) - # import pdb;pdb.set_trace() samples = np.append(np.zeros(count), sig) # cols for windowing cols = int(np.ceil((len(samples) - frameSize) / float(hopSize)) + 1) @@ -36,7 +34,6 @@ def stft(sig, frameSize, overlapFac=0.5, window=np.hanning): shape=(cols, frameSize), strides=(samples.strides[0] * hopSize, samples.strides[0])).copy() frames *= win - return np.fft.rfft(frames) @@ -49,7 +46,6 @@ def logscale_spec(spec, sr=44100, factor=20.): scale = np.linspace(0, 1, freqbins)**factor scale *= (freqbins - 1) / max(scale) scale = np.unique(np.round(scale)).astype(np.uint32) - # import pdb;pdb.set_trace() # create spectrogram with new freq bins newspec = np.complex128(np.zeros([timebins, len(scale)])) for i in range(0, len(scale)): @@ -57,7 +53,6 @@ def logscale_spec(spec, sr=44100, factor=20.): newspec[:, i] = np.sum(spec[:, scale[i]:], axis=1) else: newspec[:, i] = np.sum(spec[:, scale[i]:scale[i + 1]], axis=1) - # list center freq of bins allfreqs = np.abs(np.fft.fftfreq(freqbins * 2, 1. / sr)[:freqbins + 1]) freqs = [] @@ -66,7 +61,6 @@ def logscale_spec(spec, sr=44100, factor=20.): freqs += [np.mean(allfreqs[scale[i]:])] else: freqs += [np.mean(allfreqs[scale[i]:scale[i + 1]])] - return newspec, freqs @@ -76,11 +70,13 @@ def logscale_spec(spec, sr=44100, factor=20.): def generate_spec_frec(samples, samplerate): # samplerate, samples = wav.read(audiopath) # s = stft(samples, binsize) - s = stft(samples, samplerate * STFT_WINDOWS_MSEC // 1000, STFT_WINDOW_OVERLAP) - + s = stft(samples, samplerate * STFT_WINDOWS_MSEC // 1000, + STFT_WINDOW_OVERLAP) sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate) - ims = 20. * np.log10(np.abs(sshow) / 10e-6) - ims[ims<0] = 0 + # add epison so that log10 doesn't break + sshow_abs = np.abs(sshow + np.finfo(sshow.dtype).eps) + ims = 20. * np.log10(sshow_abs / 10e-6) + ims[ims < 0] = 0 #np.finfo(sshow.dtype).eps return ims, freq @@ -90,7 +86,11 @@ def generate_aiff_spectrogram(audiopath): return ims -def plot_stft(samples, samplerate, binsize=2**10, plotpath=None, colormap="jet"): +def plot_stft(samples, + samplerate, + binsize=2**10, + plotpath=None, + colormap="jet"): (ims, freq) = generate_spec_frec(samples, samplerate) timebins, freqbins = np.shape(ims) plt.figure(figsize=(15, 7.5)) @@ -128,8 +128,10 @@ def plot_aiff_stft(audiopath, binsize=2**10, plotpath=None, colormap="jet"): def play_sunflower(): - sample_r = snd.get_info('./outputs/audio/sunflowers-Alex-150-normal-589.aiff')[0] - snd_data_f64 = snd.read('./outputs/audio/sunflowers-Alex-150-normal-589.aiff')[0] + sample_r = snd.get_info( + './outputs/audio/sunflowers-Alex-150-normal-589.aiff')[0] + snd_data_f64 = snd.read( + './outputs/audio/sunflowers-Alex-150-normal-589.aiff')[0] snd_data_f32 = snd_data_f64.astype(np.float32) print(snd_data_f32.shape) snd_data = snd_data_f32.tobytes() @@ -144,8 +146,10 @@ def play_sunflower(): if __name__ == '__main__': # play_sunflower() - plot_aiff_stft('./outputs/story_words/Agnes/150/chicken-Agnes-150-low-1077.aiff') - plot_aiff_stft('./outputs/story_words/Agnes/150/chicken-Agnes-150-medium-1762.aiff') + plot_aiff_stft( + './outputs/story_words/Agnes/150/chicken-Agnes-150-low-1077.aiff') + plot_aiff_stft( + './outputs/story_words/Agnes/150/chicken-Agnes-150-medium-1762.aiff') # spec = generate_aiff_spectrogram('./outputs/story_words/Agnes/150/chicken-Agnes-150-low-1077.aiff') # print(spec.shape) # plot_aiff_stft('./outputs/sunflowers-Alex-180-normal-4763.aiff')