From 6355db4af7ea22a0a32ac1a7f9e68a61cd046533 Mon Sep 17 00:00:00 2001
From: Malar Kannan
Date: Wed, 22 Nov 2017 15:04:02 +0530
Subject: [PATCH 1/4] Add missing model_dir when copying training constants

---
 speech_data.py  | 2 +-
 speech_model.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/speech_data.py b/speech_data.py
index cc91b7e..cc171ce 100644
--- a/speech_data.py
+++ b/speech_data.py
@@ -254,7 +254,7 @@ if __name__ == '__main__':
     # create_spectrogram_tfrecords('story_all',sample_count=25)
     # fix_csv('story_words_test')
     #fix_csv('story_phrases')
-    create_spectrogram_tfrecords('story_phrases',sample_count=100,train_test_ratio=0.1)
+    create_spectrogram_tfrecords('story_phrases',sample_count=500,train_test_ratio=0.1)
     # create_spectrogram_tfrecords('audio',sample_count=50)
     # read_siamese_tfrecords_generator('audio')
     # padd_zeros_siamese_tfrecords('audio')
diff --git a/speech_model.py b/speech_model.py
index 42a6fb3..ce2a7c0 100644
--- a/speech_model.py
+++ b/speech_model.py
@@ -75,7 +75,7 @@ def train_siamese(audio_group = 'audio'):
     log_dir = './logs/'+audio_group
     create_dir(log_dir)
     tr_gen_fn,te_pairs,te_y,copy_read_consts = read_siamese_tfrecords_generator(audio_group,batch_size=batch_size,test_size=batch_size)
-    n_step,n_features,n_records = copy_read_consts()
+    n_step,n_features,n_records = copy_read_consts(model_dir)
     tr_gen = tr_gen_fn()
 
     input_dim = (n_step, n_features)

From 54f38ca7753e739f74b43776aaa7f597eae98932 Mon Sep 17 00:00:00 2001
From: Malar Kannan
Date: Wed, 22 Nov 2017 15:46:42 +0530
Subject: [PATCH 2/4] Remove an LSTM layer from the base network

---
 requirements-linux.txt | 1 +
 speech_model.py        | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/requirements-linux.txt b/requirements-linux.txt
index 899b69f..6d3af11 100644
--- a/requirements-linux.txt
+++ b/requirements-linux.txt
@@ -41,6 +41,7 @@ partd==0.3.8
 pexpect==4.2.1
 pickleshare==0.7.4
 pkg-resources==0.0.0
+praat-parselmouth==0.2.0
 progressbar2==3.34.3
 prompt-toolkit==1.0.15
 protobuf==3.4.0
diff --git a/speech_model.py b/speech_model.py
index ce2a7c0..6dc8bb2 100644
--- a/speech_model.py
+++ b/speech_model.py
@@ -18,9 +18,9 @@ def create_base_rnn_network(input_dim):
     '''
     inp = Input(shape=input_dim)
     # ls0 = LSTM(512, return_sequences=True)(inp)
     ls1 = Bidirectional(LSTM(128, return_sequences=True))(inp)
-    ls2 = LSTM(128, return_sequences=True)(ls1)
+    #ls2 = LSTM(128, return_sequences=True)(ls1)
     # ls3 = LSTM(32, return_sequences=True)(ls2)
-    ls4 = LSTM(64)(ls2)
+    ls4 = LSTM(64)(ls1)
     # d1 = Dense(128, activation='relu')(ls4)
     #d2 = Dense(64, activation='relu')(ls2)
     return Model(inp, ls4)
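After the layer removal in patch 2, the base encoder is a two-layer recurrent stack: a bidirectional LSTM over the input frames feeding a plain LSTM that collapses the sequence into a fixed-size embedding shared by both siamese branches. A minimal standalone sketch of that shape, assuming Keras 2 on the TensorFlow 1.3 backend pinned in requirements-linux.txt; the (100, 128) input shape is a placeholder for (n_step, n_features):

    from keras.layers import Bidirectional, Input, LSTM
    from keras.models import Model

    def base_rnn_sketch(input_dim):
        # input_dim = (n_step, n_features): one padded spectrogram per branch
        inp = Input(shape=input_dim)
        # first layer reads the frames in both directions, keeps the sequence
        seq = Bidirectional(LSTM(128, return_sequences=True))(inp)
        # final LSTM returns only its last state: a 64-dim utterance embedding
        emb = LSTM(64)(seq)
        return Model(inp, emb)

    base_rnn_sketch((100, 128)).summary()  # placeholder step/feature counts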
From 235300691e7de9ec4c2526e7f59adea99523d4d2 Mon Sep 17 00:00:00 2001
From: Malar Kannan
Date: Fri, 24 Nov 2017 14:26:36 +0530
Subject: [PATCH 3/4] Find spec_n from the TFRecords

---
 requirements-linux.txt |  6 ++++--
 speech_data.py         | 24 +++++++++++++++---------
 speech_model.py        | 14 ++++++++------
 speech_test.py         |  2 +-
 4 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/requirements-linux.txt b/requirements-linux.txt
index 6d3af11..025282d 100644
--- a/requirements-linux.txt
+++ b/requirements-linux.txt
@@ -8,6 +8,7 @@ distributed==1.19.3
 entrypoints==0.2.3
 enum34==1.1.6
 futures==3.1.1
+graphviz==0.8.1
 h5py==2.7.1
 HeapDict==1.0.0
 html5lib==0.9999999
@@ -44,10 +45,11 @@ pkg-resources==0.0.0
 praat-parselmouth==0.2.0
 progressbar2==3.34.3
 prompt-toolkit==1.0.15
-protobuf==3.4.0
+protobuf==3.5.0
 psutil==5.4.0
 ptyprocess==0.5.2
 PyAudio==0.2.11
+pydot==1.2.3
 Pygments==2.2.0
 pyparsing==2.2.0
 pysndfile==1.0.0
@@ -66,7 +68,7 @@ sortedcontainers==1.5.7
 tables==3.4.2
 tblib==1.3.2
 tensorflow==1.3.0
-tensorflow-tensorboard==0.4.0rc1
+tensorflow-tensorboard==0.4.0rc3
 terminado==0.6
 testpath==0.3.1
 toolz==0.8.2
diff --git a/speech_data.py b/speech_data.py
index cc171ce..4df9aa2 100644
--- a/speech_data.py
+++ b/speech_data.py
@@ -20,7 +20,7 @@ from tqdm import tqdm
 def siamese_pairs(rightGroup, wrongGroup):
     group1 = [r for (i, r) in rightGroup.iterrows()]
     group2 = [r for (i, r) in wrongGroup.iterrows()]
-    rightWrongPairs = [(g1, g2) for g2 in group2 for g1 in group1]+[(g2, g1) for g2 in group2 for g1 in group1]
+    rightWrongPairs = [(g1, g2) for g2 in group2 for g1 in group1]#+[(g2, g1) for g2 in group2 for g1 in group1]
     rightRightPairs = [i for i in itertools.permutations(group1, 2)]#+[i for i in itertools.combinations(group2, 2)]
     def filter_criteria(s1,s2):
         same = s1['variant'] == s2['variant']
@@ -28,8 +28,8 @@ def siamese_pairs(rightGroup, wrongGroup):
         voice_diff = s1['voice'] != s2['voice']
         if not same and phon_same:
             return False
-        if same and not voice_diff:
-            return False
+        # if same and not voice_diff:
+        #     return False
         return True
     validRWPairs = [i for i in rightWrongPairs if filter_criteria(*i)]
     validRRPairs = [i for i in rightRightPairs if filter_criteria(*i)]
@@ -64,8 +64,8 @@ def create_spectrogram_tfrecords(audio_group='audio',sample_count=0,train_test_r
     for (w, word_group) in word_group_prog:
         word_group_prog.set_postfix(word=w,sample_name=sample_name)
         g = word_group.reset_index()
-        # g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram)
-        g['spectrogram'] = apply_by_multiprocessing(g['file_path'],compute_mfcc)
+        g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram)
+        # g['spectrogram'] = apply_by_multiprocessing(g['file_path'],compute_mfcc)
         sample_right = g.loc[g['variant'] == 'low']
         sample_wrong = g.loc[g['variant'] == 'medium']
         same, diff = siamese_pairs(sample_right, sample_wrong)
@@ -208,11 +208,16 @@ def audio_samples_word_count(audio_group='audio'):
 
 def record_generator_count(records_file):
     record_iterator = tf.python_io.tf_record_iterator(path=records_file)
-    count = 0
+    count,spec_n = 0,0
     for i in record_iterator:
+        example = tf.train.Example()
+        example.ParseFromString(i)
+        spec_n1 = example.features.feature['spec_n1'].int64_list.value[0]
+        spec_n2 = example.features.feature['spec_n2'].int64_list.value[0]
+        spec_n = max([spec_n,spec_n1,spec_n2])
         count+=1
     record_iterator = tf.python_io.tf_record_iterator(path=records_file)
-    return record_iterator,count
+    return record_iterator,count,spec_n
 
 def fix_csv(audio_group='audio'):
     audio_csv_lines = open('./outputs/' + audio_group + '.csv','r').readlines()
@@ -253,8 +258,9 @@ if __name__ == '__main__':
     # create_spectrogram_tfrecords('audio',sample_count=100)
     # create_spectrogram_tfrecords('story_all',sample_count=25)
     # fix_csv('story_words_test')
-    #fix_csv('story_phrases')
-    create_spectrogram_tfrecords('story_phrases',sample_count=500,train_test_ratio=0.1)
+    #fix_csv('audio')
+    # create_spectrogram_tfrecords('story_words_test',sample_count=100,train_test_ratio=0.1)
+    record_generator_count('./outputs/story_phrases.train.tfrecords')
     # create_spectrogram_tfrecords('audio',sample_count=50)
     # read_siamese_tfrecords_generator('audio')
     # padd_zeros_siamese_tfrecords('audio')
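With this change, record_generator_count recovers spec_n (the padded spectrogram length) straight from the records instead of relying on copied constants. A minimal sketch of that scan on one file, using TensorFlow 1.3's tf.python_io iterator and the spec_n1/spec_n2 int64 features the pipeline writes; the story_phrases path is an assumed example:

    import tensorflow as tf

    def max_spec_n(records_file='./outputs/story_phrases.train.tfrecords'):
        # walk every serialized tf.train.Example and keep the largest length
        spec_n = 0
        for serialized in tf.python_io.tf_record_iterator(path=records_file):
            example = tf.train.Example()
            example.ParseFromString(serialized)
            feats = example.features.feature
            spec_n = max(spec_n,
                         feats['spec_n1'].int64_list.value[0],
                         feats['spec_n2'].int64_list.value[0])
        return spec_n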
diff --git a/speech_model.py b/speech_model.py
index 6dc8bb2..f083360 100644
--- a/speech_model.py
+++ b/speech_model.py
@@ -9,6 +9,7 @@ from keras.utils import to_categorical
 from keras.optimizers import RMSprop
 from keras.callbacks import TensorBoard, ModelCheckpoint
 from keras import backend as K
+from keras.utils import plot_model
 
 from speech_tools import create_dir,step_count
 
@@ -17,10 +18,10 @@ def create_base_rnn_network(input_dim):
     '''
     inp = Input(shape=input_dim)
     # ls0 = LSTM(512, return_sequences=True)(inp)
-    ls1 = Bidirectional(LSTM(128, return_sequences=True))(inp)
-    #ls2 = LSTM(128, return_sequences=True)(ls1)
+    ls1 = LSTM(128, return_sequences=True)(inp)
+    ls2 = LSTM(64, return_sequences=True)(ls1)
     # ls3 = LSTM(32, return_sequences=True)(ls2)
-    ls4 = LSTM(64)(ls1)
+    ls4 = LSTM(32)(ls2)
     # d1 = Dense(128, activation='relu')(ls4)
     #d2 = Dense(64, activation='relu')(ls2)
     return Model(inp, ls4)
@@ -55,7 +56,7 @@ def siamese_model(input_dim):
     processed_b = base_network(input_b)
     final_output = dense_classifier([processed_a,processed_b])
     model = Model([input_a, input_b], final_output)
-    return model
+    return model,base_network
 
 def write_model_arch(mod,mod_file):
     model_f = open(mod_file,'w')
@@ -79,8 +80,9 @@ def train_siamese(audio_group = 'audio'):
     tr_gen = tr_gen_fn()
 
     input_dim = (n_step, n_features)
-    model = siamese_model(input_dim)
-
+    model,base_model = siamese_model(input_dim)
+    plot_model(model,show_shapes=True, to_file=model_dir+'/model.png')
+    plot_model(base_model,show_shapes=True, to_file=model_dir+'/base_model.png')
     tb_cb = TensorBoard(
         log_dir=log_dir,
         histogram_freq=1,
diff --git a/speech_test.py b/speech_test.py
index d2ab72b..ddf2d7b 100644
--- a/speech_test.py
+++ b/speech_test.py
@@ -178,7 +178,7 @@ def visualize_results(audio_group='audio'):
 if __name__ == '__main__':
     # evaluate_siamese('./outputs/story_words_test.train.tfrecords',audio_group='story_words.gpu',weights ='siamese_speech_model-58-epoch-0.00-acc.h5')
     # evaluate_siamese('./outputs/story_words.test.tfrecords',audio_group='story_words',weights ='siamese_speech_model-675-epoch-0.00-acc.h5')
-    evaluate_siamese('./outputs/story_words_test.train.tfrecords',audio_group='story_phrases',weights ='siamese_speech_model-231-epoch-0.00-acc.h5')
+    evaluate_siamese('./outputs/story_words_test.train.tfrecords',audio_group='story_words.gpu',weights ='siamese_speech_model-58-epoch-0.00-acc.h5')
     # play_results('story_words')
     #inspect_tfrecord('./outputs/story_phrases.test.tfrecords',audio_group='story_phrases')
     # visualize_results('story_words.gpu')
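Patch 3 also starts writing architecture diagrams next to the model checkpoints; the plot_model import is why pydot and graphviz enter requirements-linux.txt above. A minimal sketch of the same call on a toy model, with an assumed output file name:

    from keras.layers import Dense, Input
    from keras.models import Model
    from keras.utils import plot_model  # requires pydot and graphviz

    inp = Input(shape=(16,))
    toy = Model(inp, Dense(4, activation='relu')(inp))
    # show_shapes annotates each layer box with its input/output shapes
    plot_model(toy, show_shapes=True, to_file='toy_model.png')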
From 2268ad8bb09e497421af0a35f283c02c79cdbee8 Mon Sep 17 00:00:00 2001
From: Malar Kannan
Date: Fri, 24 Nov 2017 14:32:13 +0530
Subject: [PATCH 4/4] Implement pitch plotting

---
 speech_pitch.py | 130 +++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 109 insertions(+), 21 deletions(-)

diff --git a/speech_pitch.py b/speech_pitch.py
index ef76ee9..12cd6af 100644
--- a/speech_pitch.py
+++ b/speech_pitch.py
@@ -1,5 +1,9 @@
 import parselmouth as pm
 from pysndfile import sndio as snd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+sns.set() # Use seaborn's default style to make graphs more pretty
 
 def pitch_array(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
     samples, samplerate, _ = snd.read(sample_file)
@@ -23,24 +27,108 @@ def compute_mfcc(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.a
     # sample_mfcc.to_array().shape
     return sample_mfcc.to_array()
 
-# sunflowers_vic_180_norm = pitch_array('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
-# sunflowers_fred_180_norm = pitch_array('outputs/audio/sunflowers-Fred-180-normal-6515.aiff')
-# sunflowers_vic_180_norm_mfcc = compute_mfcc('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
-fred_180_norm_mfcc = compute_mfcc('outputs/audio/sunflowers-Fred-180-normal-6515.aiff')
-alex_mfcc = compute_mfcc('outputs/audio/sunflowers-Alex-180-normal-4763.aiff')
-# # sunflowers_vic_180_norm.shape
-# # sunflowers_fred_180_norm.shape
-# alex_mfcc.shape
-# sunflowers_vic_180_norm_mfcc.shape
-# sunflowers_fred_180_norm_mfcc.shape
-from speech_spectrum import generate_aiff_spectrogram
-vic_spec = generate_aiff_spectrogram('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
-alex_spec = generate_aiff_spectrogram('outputs/audio/sunflowers-Alex-180-normal-4763.aiff')
-alex150spec = generate_aiff_spectrogram('outputs/audio/sunflowers-Alex-150-normal-589.aiff')
-vic_spec.shape
-alex_spec.shape
-alex150spec.shape
-alex_mfcc.shape
-fred_180_norm_mfcc.shape
-# pm.SoundFileFormat
-# pm.Pitch.get_number_of_frames()
+def compute_formants(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
+    # estimate formant tracks with Praat's Burg method
+    samples, samplerate, _ = snd.read(sample_file)
+    sample_sound = pm.Sound(values=samples,sampling_frequency=samplerate)
+    sample_formant = sample_sound.to_formant_burg()
+    return sample_formant
+
+def draw_spectrogram(spectrogram, dynamic_range=70):
+    X, Y = spectrogram.x_grid(), spectrogram.y_grid()
+    sg_db = 10 * np.log10(spectrogram.values.T)
+    plt.pcolormesh(X, Y, sg_db, vmin=sg_db.max() - dynamic_range, cmap='afmhot')
+    plt.ylim([spectrogram.ymin, spectrogram.ymax])
+    plt.xlabel("time [s]")
+    plt.ylabel("frequency [Hz]")
+
+def draw_intensity(intensity):
+    plt.plot(intensity.xs(), intensity.values.T, linewidth=3, color='w')
+    plt.plot(intensity.xs(), intensity.values.T, linewidth=1)
+    plt.grid(False)
+    plt.ylim(0)
+    plt.ylabel("intensity [dB]")
+
+def draw_pitch(pitch):
+    # Extract the selected pitch contour, and
+    # replace unvoiced samples by NaN so they are not plotted
+    pitch_values = pitch.to_matrix().values[0, :]
+    pitch_values[pitch_values==0] = np.nan
+    plt.plot(pitch.xs(), pitch_values, linewidth=3, color='w')
+    plt.plot(pitch.xs(), pitch_values, linewidth=1)
+    plt.grid(False)
+    plt.ylim(0, pitch.ceiling)
+    plt.ylabel("pitch [Hz]")
+
+def pm_snd(sample_file):
+    # sample_file = 'inputs/self-apple/apple-low1.aiff'
+    samples, samplerate, _ = snd.read(sample_file)
+    return pm.Sound(values=samples,sampling_frequency=samplerate)
+def plot_sample_raw(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
+    # %matplotlib inline
+    # sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'
+    snd_d = pm_snd(sample_file)
+    plt.figure()
+    plt.plot(snd_d.xs(), snd_d.values.T)
+    plt.xlim([snd_d.xmin, snd_d.xmax])
+    plt.xlabel("time [s]")
+    plt.ylabel("amplitude")
+    plt.show()
+
+def plot_sample_intensity(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
+    snd_d = pm_snd(sample_file)
+    intensity = snd_d.to_intensity()
+    spectrogram = snd_d.to_spectrogram()
+    plt.figure()
+    draw_spectrogram(spectrogram)
+    plt.twinx()
+    draw_intensity(intensity)
+    plt.xlim([snd_d.xmin, snd_d.xmax])
+    plt.show()
+
+def plot_sample_pitch(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
+    snd_d = pm_snd(sample_file)
+    pitch = snd_d.to_pitch()
+    spectrogram = snd_d.to_spectrogram(window_length=0.03, maximum_frequency=8000)
+    plt.figure()
+    draw_spectrogram(spectrogram)
+    plt.twinx()
+    draw_pitch(pitch)
+    plt.xlim([snd_d.xmin, snd_d.xmax])
+    plt.show()
+
+    # snd_part = snd_d.extract_part(from_time=0.9, preserve_times=True)
+    # plt.figure()
+    # plt.plot(snd_part.xs(), snd_part.values, linewidth=0.5)
+    # plt.xlim([snd_part.xmin, snd_part.xmax])
+    # plt.xlabel("time [s]")
+    # plt.ylabel("amplitude")
+    # plt.show()
+
+
+if __name__ == '__main__':
+    # sunflowers_vic_180_norm = pitch_array('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
+    # sunflowers_fred_180_norm = pitch_array('outputs/audio/sunflowers-Fred-180-normal-6515.aiff')
+    # sunflowers_vic_180_norm_mfcc = compute_mfcc('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
+    # fred_180_norm_mfcc = compute_mfcc('outputs/audio/sunflowers-Fred-180-normal-6515.aiff')
+    # alex_mfcc = compute_mfcc('outputs/audio/sunflowers-Alex-180-normal-4763.aiff')
+    # # # sunflowers_vic_180_norm.shape
+    # # # sunflowers_fred_180_norm.shape
+    # # alex_mfcc.shape
+    # # sunflowers_vic_180_norm_mfcc.shape
+    # # sunflowers_fred_180_norm_mfcc.shape
+    # from speech_spectrum import generate_aiff_spectrogram
+    # vic_spec = generate_aiff_spectrogram('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
+    # alex_spec = generate_aiff_spectrogram('outputs/audio/sunflowers-Alex-180-normal-4763.aiff')
+    # alex150spec = generate_aiff_spectrogram('outputs/audio/sunflowers-Alex-150-normal-589.aiff')
+    # vic_spec.shape
+    # alex_spec.shape
+    # alex150spec.shape
+    # alex_mfcc.shape
+    # fred_180_norm_mfcc.shape
+    plot_sample_pitch('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
+    plot_sample_pitch('inputs/self-apple/apple-low1.aiff')
+    plot_sample_pitch('inputs/self-apple/apple-low2.aiff')
+    plot_sample_pitch('inputs/self-apple/apple-medium1.aiff')
+    # pm.SoundFileFormat
+    # pm.Pitch.get_number_of_frames()
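The helpers in patch 4 follow the standard parselmouth plotting recipe: render the spectrogram in dB on the main axis, then twinx() a second y-axis for the pitch (or intensity) contour so both share the time axis, with unvoiced frames blanked out as NaN. A minimal end-to-end sketch of that recipe, assuming praat-parselmouth 0.2.0 as pinned above and an example input path:

    import numpy as np
    import matplotlib.pyplot as plt
    import parselmouth as pm
    from pysndfile import sndio as snd

    samples, samplerate, _ = snd.read('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
    sound = pm.Sound(values=samples, sampling_frequency=samplerate)

    # spectrogram in dB on the primary axis
    spectrogram = sound.to_spectrogram()
    sg_db = 10 * np.log10(spectrogram.values.T)
    plt.pcolormesh(spectrogram.x_grid(), spectrogram.y_grid(), sg_db,
                   vmin=sg_db.max() - 70, cmap='afmhot')
    plt.ylabel("frequency [Hz]")

    # pitch contour on a second y-axis sharing the time axis
    plt.twinx()
    pitch = sound.to_pitch()
    pitch_values = pitch.to_matrix().values[0, :]
    pitch_values[pitch_values == 0] = np.nan  # hide unvoiced frames
    plt.plot(pitch.xs(), pitch_values)
    plt.ylim(0, pitch.ceiling)
    plt.ylabel("pitch [Hz]")
    plt.show()

Drawing each contour twice, a thick white line under a thin colored one as draw_pitch and draw_intensity do, keeps the curve readable against the dark spectrogram background.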