From 15f29895d4680fc04cdc836dbf5ac173a89d2aa8 Mon Sep 17 00:00:00 2001 From: Malar Kannan Date: Tue, 7 Nov 2017 00:10:23 +0530 Subject: [PATCH 01/15] implemented tfrecord reader and model refactor wip --- speech_data.py | 65 ++++++++++++++++++++++++++++++----------------- speech_siamese.py | 12 +++++---- 2 files changed, 48 insertions(+), 29 deletions(-) diff --git a/speech_data.py b/speech_data.py index 8876480..b9b822f 100644 --- a/speech_data.py +++ b/speech_data.py @@ -31,7 +31,8 @@ def siamese_pairs(rightGroup, wrongGroup): random.shuffle(rightWrongPairs) random.shuffle(rightRightPairs) # return (random.sample(same,10), random.sample(diff,10)) - return rightRightPairs[:10],rightWrongPairs[:10] + # return rightRightPairs[:10],rightWrongPairs[:10] + return rightRightPairs,rightWrongPairs def append_zeros(spgr, max_samples): return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)], @@ -42,7 +43,6 @@ def padd_zeros(spgr, max_samples): 'constant') def to_onehot(a,class_count=2): - # >>> a = np.array([1, 0, 3]) a_row_n = a.shape[0] b = np.zeros((a_row_n, class_count)) b[np.arange(a_row_n), a] = 1 @@ -101,6 +101,10 @@ def create_spectrogram_data(audio_group='audio'): audio_samples.to_pickle('outputs/{}-spectrogram.pkl'.format(audio_group)) def create_spectrogram_tfrecords(audio_group='audio'): + ''' + http://warmspringwinds.github.io/tensorflow/tf-slim/2016/12/21/tfrecords-guide/ + http://www.machinelearninguru.com/deep_learning/tensorflow/basics/tfrecord/tfrecord.html + ''' audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv' , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file'] , quoting=csv.QUOTE_NONE) @@ -120,7 +124,6 @@ def create_spectrogram_tfrecords(audio_group='audio'): return tf.train.Feature(bytes_list=tf.train.BytesList(value=value)) writer = tf.python_io.TFRecordWriter('./outputs/' + audio_group + '.tfrecords') - # audio_samples = audio_samples[:100] for (w, word_group) in audio_samples.groupby(audio_samples['word']): g = word_group.reset_index() g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram) @@ -160,22 +163,28 @@ def create_spectrogram_tfrecords(audio_group='audio'): writer.write(example.SerializeToString()) writer.close() -def create_tagged_data(audio_samples): - same_data, diff_data = [], [] - for (w, g) in audio_samples.groupby(audio_samples['word']): - # sample_norm = g.loc[audio_samples['variant'] == 'low'] - # sample_phon = g.loc[audio_samples['variant'] == 'medium'] - sample_norm = g.loc[audio_samples['variant'] == 'normal'] - sample_phon = g.loc[audio_samples['variant'] == 'phoneme'] - same, diff = get_siamese_pairs(sample_norm, sample_phon) - same_data.extend([create_X(s) for s in same]) - diff_data.extend([create_X(d) for d in diff]) - print('creating all speech pairs') - Y_f = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))]) - Y = to_onehot(Y_f.astype(np.int8)) - print('casting as array speech pairs') - X = np.asarray(same_data + diff_data) - return X,Y +def read_siamese_tfrecords(audio_group='audio'): + records_file = os.path.join('./outputs',audio_group+'.tfrecords') + record_iterator = tf.python_io.tf_record_iterator(path=records_file) + input_pairs = [] + output_class = [] + input_words = [] + for string_record in record_iterator: + example = tf.train.Example() + example.ParseFromString(string_record) + word = example.features.feature['word'].bytes_list.value[0] + input_words.append(word) + example.features.feature['spec2'].float_list.value[0] + spec_n1 = 
example.features.feature['spec_n1'].int64_list.value[0] + spec_n2 = example.features.feature['spec_n2'].int64_list.value[0] + spec_w1 = example.features.feature['spec_w1'].int64_list.value[0] + spec_w2 = example.features.feature['spec_w2'].int64_list.value[0] + spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1) + spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2) + input_pairs.append([spec1,spec2]) + output = example.features.feature['output'].int64_list.value + output_class.append(output) + return input_pairs,output_class def create_speech_pairs_data(audio_group='audio'): audio_samples = pd.read_pickle('outputs/{}-spectrogram.pkl'.format(audio_group)) @@ -195,10 +204,17 @@ def create_speech_pairs_data(audio_group='audio'): save_samples_for('train',tr_audio_samples) save_samples_for('test',te_audio_samples) -def speech_data(audio_group='audio'): - X = np.load('outputs/{}-X.npy'.format(audio_group)) / 255.0 - Y = np.load('outputs/{}-Y.npy'.format(audio_group)) - return (X,Y) +def audio_samples_word_count(audio_group='audio'): + audio_group = 'story_all' + audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv' + , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file'] + , quoting=csv.QUOTE_NONE) + # audio_samples = audio_samples.loc[audio_samples['word'] == + # 'sunflowers'].reset_index(drop=True) + audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x) + audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists) + audio_samples = audio_samples[audio_samples['file_exists'] == True].reset_index() + return len(audio_samples.groupby(audio_samples['word'])) def speech_model_data(): tr_pairs = np.load('outputs/tr_pairs.npy') / 255.0 @@ -214,7 +230,8 @@ if __name__ == '__main__': # sunflower_pairs_data() # create_spectrogram_data() # create_spectrogram_data('story_words') - create_spectrogram_tfrecords('story_words') + # create_spectrogram_tfrecords('story_words') + create_spectrogram_tfrecords('story_all') # create_padded_spectrogram() # create_speech_pairs_data() # print(speech_model_data()) diff --git a/speech_siamese.py b/speech_siamese.py index e07c6cf..64a28fb 100644 --- a/speech_siamese.py +++ b/speech_siamese.py @@ -1,7 +1,8 @@ from __future__ import absolute_import from __future__ import print_function import numpy as np -from speech_data import speech_model_data +# from speech_data import speech_model_data +from speech_data import read_siamese_tfrecords from keras.models import Model,load_model from keras.layers import Input, Dense, Dropout, LSTM, Lambda, Concatenate from keras.losses import categorical_crossentropy @@ -80,10 +81,11 @@ def siamese_model(input_dim): def train_siamese(): # the data, shuffled and split between train and test sets - tr_pairs, te_pairs, tr_y_e, te_y_e = speech_model_data() - tr_y = to_categorical(tr_y_e, num_classes=2) - te_y = to_categorical(te_y_e, num_classes=2) - input_dim = (tr_pairs.shape[2], tr_pairs.shape[3]) + # tr_pairs, te_pairs, tr_y_e, te_y_e = speech_model_data() + pairs,y = read_siamese_tfrecords('story_words') + # tr_y = to_categorical(tr_y_e, num_classes=2) + # te_y = to_categorical(te_y_e, num_classes=2) + input_dim = (None, 1654) model = siamese_model(input_dim) From 55e2de2f04d4f47a79cfdc9cf0e03c410516364e Mon Sep 17 00:00:00 2001 From: Malar Kannan Date: Tue, 7 Nov 2017 11:56:09 +0530 Subject: [PATCH 02/15] using csv writer instead 
as comma in phrases are mis-aligning columns --- speech_data.py | 134 ++++++++++------------------------------------- tts_samplegen.py | 21 +++----- 2 files changed, 36 insertions(+), 119 deletions(-) diff --git a/speech_data.py b/speech_data.py index b9b822f..9dadb67 100644 --- a/speech_data.py +++ b/speech_data.py @@ -11,17 +11,11 @@ import os import random import csv import gc +import progressbar -def get_siamese_pairs(groupF1, groupF2): - group1 = [r for (i, r) in groupF1.iterrows()] - group2 = [r for (i, r) in groupF2.iterrows()] - diff = [(g1, g2) for g2 in group2 for g1 in group1] - same = [i for i in itertools.combinations(group1, 2) - ] + [i for i in itertools.combinations(group2, 2)] - random.shuffle(same) - random.shuffle(diff) - # return (random.sample(same,10), random.sample(diff,10)) - return same[:10],diff[:10] +def prog_bar(title): + widgets = [title, progressbar.Counter(), ' [', progressbar.Bar(), '] - ', progressbar.ETA()] + return progressbar.ProgressBar(widgets=widgets) def siamese_pairs(rightGroup, wrongGroup): group1 = [r for (i, r) in rightGroup.iterrows()] @@ -32,73 +26,8 @@ def siamese_pairs(rightGroup, wrongGroup): random.shuffle(rightRightPairs) # return (random.sample(same,10), random.sample(diff,10)) # return rightRightPairs[:10],rightWrongPairs[:10] - return rightRightPairs,rightWrongPairs - -def append_zeros(spgr, max_samples): - return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)], - 'median') - -def padd_zeros(spgr, max_samples): - return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)], - 'constant') - -def to_onehot(a,class_count=2): - a_row_n = a.shape[0] - b = np.zeros((a_row_n, class_count)) - b[np.arange(a_row_n), a] = 1 - return b - -def create_pair(l, r, max_samples): - l_sample = padd_zeros(l, max_samples) - r_sample = padd_zeros(r, max_samples) - return np.asarray([l_sample, r_sample]) - - -def create_test_pair(l, r, max_samples): - l_sample = append_zeros(l, max_samples) - r_sample = append_zeros(r, max_samples) - return np.asarray([[l_sample, r_sample]]) - - -def create_X(sp, max_samples): - return create_pair(sp[0]['spectrogram'], sp[1]['spectrogram'], max_samples) - - -# def get_word_pairs_data(word, max_samples): -# audio_samples = pd.read_csv( -# './outputs/audio.csv', -# names=['word', 'voice', 'rate', 'variant', 'file']) -# audio_samples = audio_samples.loc[audio_samples['word'] == -# word].reset_index(drop=True) -# audio_samples.loc[:, 'spectrogram'] = audio_samples.loc[:, 'file'].apply( -# lambda x: 'outputs/audio/' + x).apply(generate_aiff_spectrogram) -# max_samples = audio_samples['spectrogram'].apply( -# lambda x: x.shape[0]).max() -# same_data, diff_data = [], [] -# for (w, g) in audio_samples.groupby(audio_samples['word']): -# sample_norm = g.loc[audio_samples['variant'] == 'normal'] -# sample_phon = g.loc[audio_samples['variant'] == 'phoneme'] -# same, diff = get_siamese_pairs(sample_norm, sample_phon) -# same_data.extend([create_X(s, max_samples) for s in same]) -# diff_data.extend([create_X(d, max_samples) for d in diff]) -# Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))]) -# X = np.asarray(same_data + diff_data) -# # tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1) -# return (X, Y) - - -def create_spectrogram_data(audio_group='audio'): - audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv' - , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file'] - , quoting=csv.QUOTE_NONE) - # audio_samples = audio_samples.loc[audio_samples['word'] 
== - # 'sunflowers'].reset_index(drop=True) - audio_samples['file_paths'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x) - audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_paths'], os.path.exists) - audio_samples = audio_samples[audio_samples['file_exists'] == True].reset_index() - audio_samples['spectrogram'] = apply_by_multiprocessing(audio_samples['file_paths'],generate_aiff_spectrogram)#.apply( - audio_samples['window_count'] = audio_samples.loc[:,'spectrogram'].apply(lambda x: x.shape[0]) - audio_samples.to_pickle('outputs/{}-spectrogram.pkl'.format(audio_group)) + return rightRightPairs[:32],rightWrongPairs[:32] + # return rightRightPairs,rightWrongPairs def create_spectrogram_tfrecords(audio_group='audio'): ''' @@ -113,7 +42,9 @@ def create_spectrogram_tfrecords(audio_group='audio'): audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x) audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists) audio_samples = audio_samples[audio_samples['file_exists'] == True].reset_index() - + audio_samples['rate_int'] = apply_by_multiprocessing(audio_samples['rate'], str.isdigit) + audio_samples = audio_samples[audio_samples['rate_int'] == True].reset_index().drop(['level_0'],axis=1) + audio_samples['rate'] = audio_samples['rate'].astype(int) def _float_feature(value): return tf.train.Feature(float_list=tf.train.FloatList(value=value)) @@ -124,7 +55,8 @@ def create_spectrogram_tfrecords(audio_group='audio'): return tf.train.Feature(bytes_list=tf.train.BytesList(value=value)) writer = tf.python_io.TFRecordWriter('./outputs/' + audio_group + '.tfrecords') - for (w, word_group) in audio_samples.groupby(audio_samples['word']): + prog = prog_bar('Generating siamese pairs : ') + for (w, word_group) in prog(audio_samples.groupby(audio_samples['word'])): g = word_group.reset_index() g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram) sample_right = g.loc[audio_samples['variant'] == 'low'] @@ -186,24 +118,6 @@ def read_siamese_tfrecords(audio_group='audio'): output_class.append(output) return input_pairs,output_class -def create_speech_pairs_data(audio_group='audio'): - audio_samples = pd.read_pickle('outputs/{}-spectrogram.pkl'.format(audio_group)) - # sample_size = audio_samples['spectrogram'][0].shape[1] - tr_audio_samples,te_audio_samples = train_test_split(audio_samples, test_size=0.1) - def save_samples_for(sample_name,samples): - print('generating {} siamese speech pairs'.format(sample_name)) - X,Y = create_tagged_data(samples) - print('shuffling array speech pairs') - rng_state = np.random.get_state() - np.random.shuffle(X) - np.random.set_state(rng_state) - np.random.shuffle(Y) - print('pickling X/Y') - np.save('outputs/{}-train-X.npy'.format(audio_group), X) - np.save('outputs/{}-train-Y.npy'.format(audio_group), Y) - save_samples_for('train',tr_audio_samples) - save_samples_for('test',te_audio_samples) - def audio_samples_word_count(audio_group='audio'): audio_group = 'story_all' audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv' @@ -216,15 +130,23 @@ def audio_samples_word_count(audio_group='audio'): audio_samples = audio_samples[audio_samples['file_exists'] == True].reset_index() return len(audio_samples.groupby(audio_samples['word'])) -def speech_model_data(): - tr_pairs = np.load('outputs/tr_pairs.npy') / 255.0 - te_pairs = np.load('outputs/te_pairs.npy') / 255.0 - tr_pairs[tr_pairs < 
0] = 0 - te_pairs[te_pairs < 0] = 0 - tr_y = np.load('outputs/tr_y.npy') - te_y = np.load('outputs/te_y.npy') - return tr_pairs, te_pairs, tr_y, te_y - +def fix_csv(audio_group='audio'): + audio_group = 'story_all' + audio_samples = pd.read_csv( './outputs/story_words.csv' + , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file'] + , quoting=csv.QUOTE_NONE) + voice_set = set(audio_samples['voice'].unique().tolist()) + audio_csv_lines = open('./outputs/' + audio_group + '.csv','r').readlines() + audio_csv_data = [i.strip().split(',') for i in audio_csv_lines] + to_be_fixed = [i for i in audio_csv_data if len(i) > 7] + def unite_words(entries): + entries = to_be_fixed[0] + word_entries = next(((entries[:i],entries[i:]) for (i,e) in enumerate(entries) if e in voice_set),'') + word_entries[1] + return + to_be_fixed[0] + entries = [unite_words for e in to_be_fixed] + [i for i in entries if len(i) % 2 != 0] if __name__ == '__main__': # sunflower_pairs_data() diff --git a/tts_samplegen.py b/tts_samplegen.py index 9c72674..f9ef32e 100644 --- a/tts_samplegen.py +++ b/tts_samplegen.py @@ -3,6 +3,7 @@ from AppKit import NSSpeechSynthesizer, NSSpeechInputModeProperty from AppKit import NSSpeechModePhoneme from Foundation import NSURL import json +import csv import random import os import re @@ -81,6 +82,11 @@ class SynthFile(object): return ','.join([str(c) for c in cols])+'\n' + def get_values(self): + cols = [self.word, self.phoneme, self.voice, + self.voice_lang, self.rate, self.variant, + self.filename] + return [str(c) for c in cols] class SynthVariant(object): """docstring for SynthVariant.""" @@ -191,22 +197,11 @@ def synth_generator(): print("It took {} to synthsize all variants.".format(time_str)) return synth_for_words - -def write_synths(synth_list, fname, csv=False): - f = open(fname, 'w') - if csv: - for s in synth_list: - f.write(s.get_csv()) - else: - json.dump([s.get_json() for s in synth_list], f) - f.close() - - def synth_logger(fname, csv=False): f = open(fname, 'w') - + s_csv_w = csv.writer(f, quoting=csv.QUOTE_MINIMAL) def csv_writer(s): - f.write(s.get_csv()) + s_csv_w.writerow(s.get_values()) synth_list = [] def json_writer(s): From 41b3f1a9fea25f4ecd52dbc2a961d33aa4c42a88 Mon Sep 17 00:00:00 2001 From: Malar Kannan Date: Tue, 7 Nov 2017 12:43:17 +0530 Subject: [PATCH 03/15] dropping invalid csv entries --- speech_data.py | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/speech_data.py b/speech_data.py index 9dadb67..83b5d6d 100644 --- a/speech_data.py +++ b/speech_data.py @@ -26,7 +26,7 @@ def siamese_pairs(rightGroup, wrongGroup): random.shuffle(rightRightPairs) # return (random.sample(same,10), random.sample(diff,10)) # return rightRightPairs[:10],rightWrongPairs[:10] - return rightRightPairs[:32],rightWrongPairs[:32] + return rightRightPairs[:16],rightWrongPairs[:16] # return rightRightPairs,rightWrongPairs def create_spectrogram_tfrecords(audio_group='audio'): @@ -42,9 +42,9 @@ def create_spectrogram_tfrecords(audio_group='audio'): audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x) audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists) audio_samples = audio_samples[audio_samples['file_exists'] == True].reset_index() - audio_samples['rate_int'] = apply_by_multiprocessing(audio_samples['rate'], str.isdigit) - audio_samples = audio_samples[audio_samples['rate_int'] == 
True].reset_index().drop(['level_0'],axis=1) - audio_samples['rate'] = audio_samples['rate'].astype(int) + # audio_samples['rate_int'] = apply_by_multiprocessing(audio_samples['rate'], str.isdigit) + # audio_samples = audio_samples[audio_samples['rate_int'] == True].reset_index().drop(['level_0'],axis=1) + # audio_samples['rate'] = audio_samples['rate'].astype(int) def _float_feature(value): return tf.train.Feature(float_list=tf.train.FloatList(value=value)) @@ -131,22 +131,26 @@ def audio_samples_word_count(audio_group='audio'): return len(audio_samples.groupby(audio_samples['word'])) def fix_csv(audio_group='audio'): - audio_group = 'story_all' - audio_samples = pd.read_csv( './outputs/story_words.csv' - , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file'] - , quoting=csv.QUOTE_NONE) - voice_set = set(audio_samples['voice'].unique().tolist()) audio_csv_lines = open('./outputs/' + audio_group + '.csv','r').readlines() audio_csv_data = [i.strip().split(',') for i in audio_csv_lines] - to_be_fixed = [i for i in audio_csv_data if len(i) > 7] - def unite_words(entries): - entries = to_be_fixed[0] - word_entries = next(((entries[:i],entries[i:]) for (i,e) in enumerate(entries) if e in voice_set),'') - word_entries[1] - return - to_be_fixed[0] - entries = [unite_words for e in to_be_fixed] - [i for i in entries if len(i) % 2 != 0] + # audio_samples = pd.read_csv( './outputs/story_words.csv' + # , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file'] + # , quoting=csv.QUOTE_NONE) + # voice_set = set(audio_samples['voice'].unique().tolist()) + # to_be_fixed = [i for i in audio_csv_data if len(i) > 7] + # def unite_words(entries): + # entries = to_be_fixed[0] + # word_entries = next(((entries[:i],entries[i:]) for (i,e) in enumerate(entries) if e in voice_set),'') + # word_entries[1] + # return + # to_be_fixed[0] + # entries = [unite_words for e in to_be_fixed] + # [i for i in entries if len(i) % 2 != 0] + proper_rows = [i for i in audio_csv_data if len(i) == 7] + with open('./outputs/' + audio_group + '-new.csv','w') as fixed_csv: + fixed_csv_w = csv.writer(fixed_csv, quoting=csv.QUOTE_MINIMAL) + fixed_csv_w.writerows(proper_rows) + if __name__ == '__main__': # sunflower_pairs_data() From b8a9f87031fc84af8e534858639feb5adab31a08 Mon Sep 17 00:00:00 2001 From: Malar Kannan Date: Tue, 7 Nov 2017 15:18:04 +0530 Subject: [PATCH 04/15] implemented padding and pipeline is complete --- speech_data.py | 45 ++++++++++++++++++++++++++++++++++++++------- speech_siamese.py | 8 ++++---- 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/speech_data.py b/speech_data.py index 83b5d6d..a47627c 100644 --- a/speech_data.py +++ b/speech_data.py @@ -95,17 +95,34 @@ def create_spectrogram_tfrecords(audio_group='audio'): writer.write(example.SerializeToString()) writer.close() +def padd_zeros(spgr, max_samples): + return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)], + 'constant') + +def find_max_n(trf): + max_n = 0 + max_n_it = tf.python_io.tf_record_iterator(path=trf) + for string_record in max_n_it: + example = tf.train.Example() + example.ParseFromString(string_record) + spec_n1 = example.features.feature['spec_n1'].int64_list.value[0] + spec_n2 = example.features.feature['spec_n2'].int64_list.value[0] + max_n = max([max_n,spec_n1,spec_n2]) + return max_n + def read_siamese_tfrecords(audio_group='audio'): records_file = os.path.join('./outputs',audio_group+'.tfrecords') record_iterator = tf.python_io.tf_record_iterator(path=records_file) + # 
input1,input2 = [],[] input_pairs = [] output_class = [] - input_words = [] + max_n = find_max_n(records_file) + spec_w1 = 0 for string_record in record_iterator: example = tf.train.Example() example.ParseFromString(string_record) - word = example.features.feature['word'].bytes_list.value[0] - input_words.append(word) + # word = example.features.feature['word'].bytes_list.value[0] + # input_words.append(word) example.features.feature['spec2'].float_list.value[0] spec_n1 = example.features.feature['spec_n1'].int64_list.value[0] spec_n2 = example.features.feature['spec_n2'].int64_list.value[0] @@ -113,10 +130,23 @@ def read_siamese_tfrecords(audio_group='audio'): spec_w2 = example.features.feature['spec_w2'].int64_list.value[0] spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1) spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2) - input_pairs.append([spec1,spec2]) + p_spec1,p_spec2 = padd_zeros(spec1,max_n),padd_zeros(spec2,max_n) + # input1.append(spec1) + # input2.append(spec2) + input_pairs.append(np.asarray([p_spec1,p_spec2])) + # input_pairs.append([spec1,spec2]) output = example.features.feature['output'].int64_list.value - output_class.append(output) - return input_pairs,output_class + output_class.append(np.asarray(output)) + n_features = spec_w1 + # if len(input_pairs) > 50: + # break + input_data,output_data = np.asarray(input_pairs),np.asarray(output_class) + # import pdb; pdb.set_trace() + # tr_x1,te_x1,tr_x2,te_x2,tr_y,te_y = train_test_split(input1,input2,output_class) + tr_pairs,te_pairs,tr_y,te_y = train_test_split(input_data,output_data) + # return (tr_x1,te_x1,tr_x2,te_x2,tr_y,te_y) + n_step,n_features = int(max_n),int(spec_w1) + return (tr_pairs,te_pairs,tr_y,te_y,n_step,n_features) def audio_samples_word_count(audio_group='audio'): audio_group = 'story_all' @@ -157,7 +187,8 @@ if __name__ == '__main__': # create_spectrogram_data() # create_spectrogram_data('story_words') # create_spectrogram_tfrecords('story_words') - create_spectrogram_tfrecords('story_all') + # create_spectrogram_tfrecords('story_all') + read_siamese_tfrecords('story_all') # create_padded_spectrogram() # create_speech_pairs_data() # print(speech_model_data()) diff --git a/speech_siamese.py b/speech_siamese.py index 64a28fb..e9ad718 100644 --- a/speech_siamese.py +++ b/speech_siamese.py @@ -82,10 +82,10 @@ def siamese_model(input_dim): def train_siamese(): # the data, shuffled and split between train and test sets # tr_pairs, te_pairs, tr_y_e, te_y_e = speech_model_data() - pairs,y = read_siamese_tfrecords('story_words') + (tr_pairs,te_pairs,tr_y,te_y,n_step,n_features) = read_siamese_tfrecords('story_words') # tr_y = to_categorical(tr_y_e, num_classes=2) # te_y = to_categorical(te_y_e, num_classes=2) - input_dim = (None, 1654) + input_dim = (n_step, n_features) model = siamese_model(input_dim) @@ -114,11 +114,11 @@ def train_siamese(): rms = RMSprop(lr=0.001) model.compile(loss=categorical_crossentropy, optimizer=rms, metrics=[accuracy]) model.fit( - [tr_pairs[:, 0], tr_pairs[:, 1]], + [tr_x1, tr_x2], tr_y, batch_size=128, epochs=50, - validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y), + validation_data=([tr_pairs[:, 0], tr_pairs[:, 1]], te_y), callbacks=[tb_cb, cp_cb]) model.save('./models/siamese_speech_model-final.h5') From 7cbfebbf1a9e91583af17573baa5218ca6af69fc Mon Sep 17 00:00:00 2001 From: Malar Kannan Date: Tue, 7 Nov 2017 17:27:01 +0530 Subject: [PATCH 05/15] 1. 
fixed missing wrong pairs 2.using different progress bakend --- speech_data.py | 35 +++++++++++++++++++++-------------- speech_siamese.py | 6 +++--- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/speech_data.py b/speech_data.py index a47627c..2860961 100644 --- a/speech_data.py +++ b/speech_data.py @@ -11,11 +11,12 @@ import os import random import csv import gc -import progressbar +# import progressbar +from tqdm import tqdm -def prog_bar(title): - widgets = [title, progressbar.Counter(), ' [', progressbar.Bar(), '] - ', progressbar.ETA()] - return progressbar.ProgressBar(widgets=widgets) +# def prog_bar(title): +# widgets = [title, progressbar.Counter(), ' [', progressbar.Bar(), '] - ', progressbar.ETA()] +# return progressbar.ProgressBar(widgets=widgets) def siamese_pairs(rightGroup, wrongGroup): group1 = [r for (i, r) in rightGroup.iterrows()] @@ -26,7 +27,7 @@ def siamese_pairs(rightGroup, wrongGroup): random.shuffle(rightRightPairs) # return (random.sample(same,10), random.sample(diff,10)) # return rightRightPairs[:10],rightWrongPairs[:10] - return rightRightPairs[:16],rightWrongPairs[:16] + return rightRightPairs[:32],rightWrongPairs[:32] # return rightRightPairs,rightWrongPairs def create_spectrogram_tfrecords(audio_group='audio'): @@ -55,16 +56,21 @@ def create_spectrogram_tfrecords(audio_group='audio'): return tf.train.Feature(bytes_list=tf.train.BytesList(value=value)) writer = tf.python_io.TFRecordWriter('./outputs/' + audio_group + '.tfrecords') - prog = prog_bar('Generating siamese pairs : ') - for (w, word_group) in prog(audio_samples.groupby(audio_samples['word'])): + prog = tqdm(audio_samples.groupby(audio_samples['word']),desc='Computing spectrogram') + for (w, word_group) in prog: + prog.set_postfix(word=w) g = word_group.reset_index() g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram) - sample_right = g.loc[audio_samples['variant'] == 'low'] - sample_wrong = g.loc[audio_samples['variant'] == 'medium'] + sample_right = g.loc[g['variant'] == 'low'] + sample_wrong = g.loc[g['variant'] == 'medium'] same, diff = siamese_pairs(sample_right, sample_wrong) groups = [([0,1],same),([1,0],diff)] for (output,group) in groups: - for sample1,sample2 in group: + group_prog = tqdm(group,desc='Writing Spectrogram') + for sample1,sample2 in group_prog: + group_prog.set_postfix(output=output + ,var1=sample1['variant'] + ,var2=sample2['variant']) spectro1,spectro2 = sample1['spectrogram'],sample2['spectrogram'] spec_n1,spec_n2 = spectro1.shape[0],spectro2.shape[0] spec_w1,spec_w2 = spectro1.shape[1],spectro2.shape[1] @@ -93,6 +99,7 @@ def create_spectrogram_tfrecords(audio_group='audio'): } )) writer.write(example.SerializeToString()) + prog.close() writer.close() def padd_zeros(spgr, max_samples): @@ -141,7 +148,7 @@ def read_siamese_tfrecords(audio_group='audio'): # if len(input_pairs) > 50: # break input_data,output_data = np.asarray(input_pairs),np.asarray(output_class) - # import pdb; pdb.set_trace() + import pdb; pdb.set_trace() # tr_x1,te_x1,tr_x2,te_x2,tr_y,te_y = train_test_split(input1,input2,output_class) tr_pairs,te_pairs,tr_y,te_y = train_test_split(input_data,output_data) # return (tr_x1,te_x1,tr_x2,te_x2,tr_y,te_y) @@ -186,9 +193,9 @@ if __name__ == '__main__': # sunflower_pairs_data() # create_spectrogram_data() # create_spectrogram_data('story_words') - # create_spectrogram_tfrecords('story_words') - # create_spectrogram_tfrecords('story_all') - read_siamese_tfrecords('story_all') + 
create_spectrogram_tfrecords('story_words') + # create_spectrogram_tfrecords('story_words_test') + # read_siamese_tfrecords('story_all') # create_padded_spectrogram() # create_speech_pairs_data() # print(speech_model_data()) diff --git a/speech_siamese.py b/speech_siamese.py index e9ad718..353fa4b 100644 --- a/speech_siamese.py +++ b/speech_siamese.py @@ -114,11 +114,11 @@ def train_siamese(): rms = RMSprop(lr=0.001) model.compile(loss=categorical_crossentropy, optimizer=rms, metrics=[accuracy]) model.fit( - [tr_x1, tr_x2], + [tr_pairs[:, 0], tr_pairs[:, 1]], tr_y, batch_size=128, - epochs=50, - validation_data=([tr_pairs[:, 0], tr_pairs[:, 1]], te_y), + epochs=100, + validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y), callbacks=[tb_cb, cp_cb]) model.save('./models/siamese_speech_model-final.h5') From b3a6aa2f6a8b483a0580782c0e19ece2d56b01bc Mon Sep 17 00:00:00 2001 From: Malar Kannan Date: Wed, 8 Nov 2017 11:08:19 +0530 Subject: [PATCH 06/15] clean-up --- speech_data.py | 40 ++-------------------------------------- speech_siamese.py | 2 +- 2 files changed, 3 insertions(+), 39 deletions(-) diff --git a/speech_data.py b/speech_data.py index 2860961..a9a87b4 100644 --- a/speech_data.py +++ b/speech_data.py @@ -11,12 +11,8 @@ import os import random import csv import gc -# import progressbar from tqdm import tqdm -# def prog_bar(title): -# widgets = [title, progressbar.Counter(), ' [', progressbar.Bar(), '] - ', progressbar.ETA()] -# return progressbar.ProgressBar(widgets=widgets) def siamese_pairs(rightGroup, wrongGroup): group1 = [r for (i, r) in rightGroup.iterrows()] @@ -25,10 +21,7 @@ def siamese_pairs(rightGroup, wrongGroup): rightRightPairs = [i for i in itertools.combinations(group1, 2)] random.shuffle(rightWrongPairs) random.shuffle(rightRightPairs) - # return (random.sample(same,10), random.sample(diff,10)) - # return rightRightPairs[:10],rightWrongPairs[:10] return rightRightPairs[:32],rightWrongPairs[:32] - # return rightRightPairs,rightWrongPairs def create_spectrogram_tfrecords(audio_group='audio'): ''' @@ -38,14 +31,10 @@ def create_spectrogram_tfrecords(audio_group='audio'): audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv' , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file'] , quoting=csv.QUOTE_NONE) - # audio_samples = audio_samples.loc[audio_samples['word'] == - # 'sunflowers'].reset_index(drop=True) audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x) audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists) audio_samples = audio_samples[audio_samples['file_exists'] == True].reset_index() - # audio_samples['rate_int'] = apply_by_multiprocessing(audio_samples['rate'], str.isdigit) - # audio_samples = audio_samples[audio_samples['rate_int'] == True].reset_index().drop(['level_0'],axis=1) - # audio_samples['rate'] = audio_samples['rate'].astype(int) + def _float_feature(value): return tf.train.Feature(float_list=tf.train.FloatList(value=value)) @@ -99,6 +88,7 @@ def create_spectrogram_tfrecords(audio_group='audio'): } )) writer.write(example.SerializeToString()) + group_prog.close() prog.close() writer.close() @@ -120,7 +110,6 @@ def find_max_n(trf): def read_siamese_tfrecords(audio_group='audio'): records_file = os.path.join('./outputs',audio_group+'.tfrecords') record_iterator = tf.python_io.tf_record_iterator(path=records_file) - # input1,input2 = [],[] input_pairs = [] output_class = [] max_n = find_max_n(records_file) @@ -128,8 
+117,6 @@ def read_siamese_tfrecords(audio_group='audio'): for string_record in record_iterator: example = tf.train.Example() example.ParseFromString(string_record) - # word = example.features.feature['word'].bytes_list.value[0] - # input_words.append(word) example.features.feature['spec2'].float_list.value[0] spec_n1 = example.features.feature['spec_n1'].int64_list.value[0] spec_n2 = example.features.feature['spec_n2'].int64_list.value[0] @@ -138,20 +125,12 @@ def read_siamese_tfrecords(audio_group='audio'): spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1) spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2) p_spec1,p_spec2 = padd_zeros(spec1,max_n),padd_zeros(spec2,max_n) - # input1.append(spec1) - # input2.append(spec2) input_pairs.append(np.asarray([p_spec1,p_spec2])) - # input_pairs.append([spec1,spec2]) output = example.features.feature['output'].int64_list.value output_class.append(np.asarray(output)) n_features = spec_w1 - # if len(input_pairs) > 50: - # break input_data,output_data = np.asarray(input_pairs),np.asarray(output_class) - import pdb; pdb.set_trace() - # tr_x1,te_x1,tr_x2,te_x2,tr_y,te_y = train_test_split(input1,input2,output_class) tr_pairs,te_pairs,tr_y,te_y = train_test_split(input_data,output_data) - # return (tr_x1,te_x1,tr_x2,te_x2,tr_y,te_y) n_step,n_features = int(max_n),int(spec_w1) return (tr_pairs,te_pairs,tr_y,te_y,n_step,n_features) @@ -160,8 +139,6 @@ def audio_samples_word_count(audio_group='audio'): audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv' , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file'] , quoting=csv.QUOTE_NONE) - # audio_samples = audio_samples.loc[audio_samples['word'] == - # 'sunflowers'].reset_index(drop=True) audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x) audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists) audio_samples = audio_samples[audio_samples['file_exists'] == True].reset_index() @@ -170,19 +147,6 @@ def audio_samples_word_count(audio_group='audio'): def fix_csv(audio_group='audio'): audio_csv_lines = open('./outputs/' + audio_group + '.csv','r').readlines() audio_csv_data = [i.strip().split(',') for i in audio_csv_lines] - # audio_samples = pd.read_csv( './outputs/story_words.csv' - # , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file'] - # , quoting=csv.QUOTE_NONE) - # voice_set = set(audio_samples['voice'].unique().tolist()) - # to_be_fixed = [i for i in audio_csv_data if len(i) > 7] - # def unite_words(entries): - # entries = to_be_fixed[0] - # word_entries = next(((entries[:i],entries[i:]) for (i,e) in enumerate(entries) if e in voice_set),'') - # word_entries[1] - # return - # to_be_fixed[0] - # entries = [unite_words for e in to_be_fixed] - # [i for i in entries if len(i) % 2 != 0] proper_rows = [i for i in audio_csv_data if len(i) == 7] with open('./outputs/' + audio_group + '-new.csv','w') as fixed_csv: fixed_csv_w = csv.writer(fixed_csv, quoting=csv.QUOTE_MINIMAL) diff --git a/speech_siamese.py b/speech_siamese.py index 353fa4b..9ed9591 100644 --- a/speech_siamese.py +++ b/speech_siamese.py @@ -82,7 +82,7 @@ def siamese_model(input_dim): def train_siamese(): # the data, shuffled and split between train and test sets # tr_pairs, te_pairs, tr_y_e, te_y_e = speech_model_data() - (tr_pairs,te_pairs,tr_y,te_y,n_step,n_features) = read_siamese_tfrecords('story_words') + 
(tr_pairs,te_pairs,tr_y,te_y,n_step,n_features) = read_siamese_tfrecords('story_words_test') # tr_y = to_categorical(tr_y_e, num_classes=2) # te_y = to_categorical(te_y_e, num_classes=2) input_dim = (n_step, n_features) From 0a4d4fadebd82f754621c605974d0d59a611e1a1 Mon Sep 17 00:00:00 2001 From: Malar Kannan Date: Thu, 9 Nov 2017 15:00:17 +0530 Subject: [PATCH 07/15] implemented random sampling of data for oneshot loading --- speech_data.py | 217 ++++++++++++++++++++++++++++++++++++++++------ speech_siamese.py | 4 +- 2 files changed, 194 insertions(+), 27 deletions(-) diff --git a/speech_data.py b/speech_data.py index a9a87b4..26067c5 100644 --- a/speech_data.py +++ b/speech_data.py @@ -3,6 +3,7 @@ from pandas_parallel import apply_by_multiprocessing # import dask as dd # import dask.dataframe as ddf import tensorflow as tf +from tensorflow.python.ops import data_flow_ops import numpy as np from spectro_gen import generate_aiff_spectrogram from sklearn.model_selection import train_test_split @@ -11,6 +12,7 @@ import os import random import csv import gc +import pickle from tqdm import tqdm @@ -23,6 +25,16 @@ def siamese_pairs(rightGroup, wrongGroup): random.shuffle(rightRightPairs) return rightRightPairs[:32],rightWrongPairs[:32] + +def _float_feature(value): + return tf.train.Feature(float_list=tf.train.FloatList(value=value)) + +def _int64_feature(value): + return tf.train.Feature(int64_list=tf.train.Int64List(value=value)) + +def _bytes_feature(value): + return tf.train.Feature(bytes_list=tf.train.BytesList(value=value)) + def create_spectrogram_tfrecords(audio_group='audio'): ''' http://warmspringwinds.github.io/tensorflow/tf-slim/2016/12/21/tfrecords-guide/ @@ -35,14 +47,7 @@ def create_spectrogram_tfrecords(audio_group='audio'): audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists) audio_samples = audio_samples[audio_samples['file_exists'] == True].reset_index() - def _float_feature(value): - return tf.train.Feature(float_list=tf.train.FloatList(value=value)) - - def _int64_feature(value): - return tf.train.Feature(int64_list=tf.train.Int64List(value=value)) - - def _bytes_feature(value): - return tf.train.Feature(bytes_list=tf.train.BytesList(value=value)) + n_records = n_spec = n_features = 0 writer = tf.python_io.TFRecordWriter('./outputs/' + audio_group + '.tfrecords') prog = tqdm(audio_samples.groupby(audio_samples['word']),desc='Computing spectrogram') @@ -64,6 +69,11 @@ def create_spectrogram_tfrecords(audio_group='audio'): spec_n1,spec_n2 = spectro1.shape[0],spectro2.shape[0] spec_w1,spec_w2 = spectro1.shape[1],spectro2.shape[1] spec1,spec2 = spectro1.reshape(-1),spectro2.reshape(-1) + + n_spec = max([n_spec,spec_n1,spec_n2]) + n_features = spec_w1 + n_records+=1 + example = tf.train.Example(features=tf.train.Features( feature={ 'word': _bytes_feature([w.encode('utf-8')]), @@ -91,13 +101,15 @@ def create_spectrogram_tfrecords(audio_group='audio'): group_prog.close() prog.close() writer.close() + const_file = os.path.join('./outputs',audio_group+'.constants') + pickle.dump((n_spec,n_features,n_records),open(const_file,'wb')) def padd_zeros(spgr, max_samples): return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)], 'constant') def find_max_n(trf): - max_n = 0 + max_n,n_records = 0,0 max_n_it = tf.python_io.tf_record_iterator(path=trf) for string_record in max_n_it: example = tf.train.Example() @@ -105,19 +117,20 @@ def find_max_n(trf): spec_n1 = example.features.feature['spec_n1'].int64_list.value[0] spec_n2 = 
example.features.feature['spec_n2'].int64_list.value[0] max_n = max([max_n,spec_n1,spec_n2]) - return max_n + n_records+=1 + return (max_n,n_records) -def read_siamese_tfrecords(audio_group='audio'): +def padd_zeros_siamese_tfrecords(audio_group='audio'): records_file = os.path.join('./outputs',audio_group+'.tfrecords') record_iterator = tf.python_io.tf_record_iterator(path=records_file) - input_pairs = [] - output_class = [] - max_n = find_max_n(records_file) - spec_w1 = 0 - for string_record in record_iterator: + print('finding max_n...') + max_n,n_records = find_max_n(records_file) + p_spec1 = None + print('reading tfrecords...') + writer = tf.python_io.TFRecordWriter('./outputs/' + audio_group + '_padded.tfrecords') + for string_record in tqdm(record_iterator,desc='padding siamese record',total=n_records): example = tf.train.Example() example.ParseFromString(string_record) - example.features.feature['spec2'].float_list.value[0] spec_n1 = example.features.feature['spec_n1'].int64_list.value[0] spec_n2 = example.features.feature['spec_n2'].int64_list.value[0] spec_w1 = example.features.feature['spec_w1'].int64_list.value[0] @@ -125,14 +138,155 @@ def read_siamese_tfrecords(audio_group='audio'): spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1) spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2) p_spec1,p_spec2 = padd_zeros(spec1,max_n),padd_zeros(spec2,max_n) - input_pairs.append(np.asarray([p_spec1,p_spec2])) output = example.features.feature['output'].int64_list.value - output_class.append(np.asarray(output)) - n_features = spec_w1 - input_data,output_data = np.asarray(input_pairs),np.asarray(output_class) - tr_pairs,te_pairs,tr_y,te_y = train_test_split(input_data,output_data) - n_step,n_features = int(max_n),int(spec_w1) - return (tr_pairs,te_pairs,tr_y,te_y,n_step,n_features) + w_example = tf.train.Example(features=tf.train.Features( + feature={ + 'spec1':_float_feature(p_spec1.reshape(-1)), + 'spec2':_float_feature(p_spec2.reshape(-1)), + 'output':_int64_feature(output) + } + )) + writer.write(w_example.SerializeToString()) + const_file = os.path.join('./outputs',audio_group+'.constants') + pickle.dump((max_n,p_spec1.shape[1],n_records),open(const_file,'wb')) + writer.close() + +def pickle_constants(audio_group='audio'): + records_file = os.path.join('./outputs',audio_group+'_padded.tfrecords') + record_iterator = tf.python_io.tf_record_iterator(path=records_file) + print('finding max_n...') + max_n,n_records = find_max_n(records_file) + spec1 = 0 + print('finding spec_w1...') + record_iterator = tf.python_io.tf_record_iterator(path=records_file) + for string_record in record_iterator: + example = tf.train.Example() + example.ParseFromString(string_record) + spec1 = len(example.features.feature['spec1'].float_list.value)//max_n + print('found spec_w1...') + break + const_file = os.path.join('./outputs',audio_group+'.constants') + print(max_n,spec1,n_records) + pickle.dump((max_n,spec1,n_records),open(const_file,'wb')) + +def reservoir_sample(iterable, k): + it = iter(iterable) + if not (k > 0): + raise ValueError("sample size must be positive") + + sample = list(itertools.islice(it, k)) # fill the reservoir + random.shuffle(sample) # if number of items less then *k* then + # return all items in random order. 
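+    # Once the reservoir is full, item i of the stream (i > k) is admitted
+    # with probability k/i and evicts a uniformly random slot; an admitted
+    # item then survives each later step m with probability (m-1)/m, so every
+    # stream item ends up in the sample with probability exactly k/n
+    # (classic reservoir sampling).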
+ for i, item in enumerate(it, start=k+1): + j = random.randrange(i) # random [0..i) + if j < k: + sample[j] = item # replace item with gradually decreasing probability + return sample + +def read_siamese_tfrecords_oneshot(audio_group='audio'): + records_file = os.path.join('./outputs',audio_group+'_padded.tfrecords') + record_iterator = tf.python_io.tf_record_iterator(path=records_file) + input_pairs = [] + output_class = [] + const_file = os.path.join('./outputs',audio_group+'.constants') + (n_spec,n_features,n_records) = pickle.load(open(const_file,'rb')) + print('reading tfrecords...') + samples = min([30000,n_records]) + input_data = np.zeros((samples,2,n_spec,n_features)) + output_data = np.zeros((samples,2)) + random_samples = enumerate(reservoir_sample(record_iterator,samples)) + for (i,string_record) in tqdm(random_samples,total=samples): + # if i == samples: + # break + example = tf.train.Example() + example.ParseFromString(string_record) + spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(n_spec,n_features) + spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(n_spec,n_features) + input_data[i] = np.asarray([spec1,spec2]) + output = example.features.feature['output'].int64_list.value + output_data[i] = np.asarray(output) + print('converting to nparray...') + tr_pairs,te_pairs,tr_y,te_y = train_test_split(input_data,output_data,test_size=0.1) + result = (tr_pairs,te_pairs,tr_y,te_y,n_spec,n_features) + return result + +def read_siamese_tfrecords(audio_group='audio'): + audio_group='story_words_test' + + record_file = os.path.join('./outputs',audio_group+'_padded.tfrecords') + const_file = os.path.join('./outputs',audio_group+'.constants') + (n_spec,n_features) = pickle.load(open(const_file,'rb')) + + filename_queue = tf.train.string_input_producer([record_file]) + reader = tf.TFRecordReader() + _, serialized_example = reader.read(filename_queue) + features = tf.parse_single_example(serialized_example, + features={ + 'spec1': tf.FixedLenFeature([1,n_spec,n_features], tf.float32), + 'spec2': tf.FixedLenFeature([1,n_spec,n_features], tf.float32), + 'output':tf.FixedLenFeature([2], tf.int64) + }) + spec1 = features['spec1'] + spec1 = tf.cast(spec1, tf.float32) * (1. / 255) + spec2 = features['spec2'] + spec2 = tf.cast(spec2, tf.float32) * (1. 
/ 255) + output = tf.cast(features['output'], tf.int32) + return spec1,spec2, output,n_spec,n_features + +def read_siamese_tfrecords_batch(audio_group='audio', batch_size=32): + audio_group='story_words_test' + record_file = os.path.join('./outputs',audio_group+'_padded.tfrecords') + """ Return tensor to read from TFRecord """ + print('Creating graph for loading {} ...'.format(record_file)) + const_file = os.path.join('./outputs',audio_group+'.constants') + (n_spec,n_features) = pickle.load(open(const_file,'rb')) + records_file = os.path.join('./outputs',audio_group+'.tfrecords') + record_iterator = tf.python_io.tf_record_iterator(path=records_file) + n_records = len([i for i in record_iterator]) + batch_shape=[batch_size, n_spec, n_features] + with tf.variable_scope("SiameseTFRecords"): + record_input = data_flow_ops.RecordInput(record_file, batch_size=batch_size) + records_op = record_input.get_yield_op() + records_op = tf.split(records_op, batch_shape[0], 0) + records_op = [tf.reshape(record, []) for record in records_op] + specs1, specs2 = [],[] + outputs = [] + for i, serialized_example in tqdm(enumerate(records_op)): + with tf.variable_scope("parse_siamese_pairs", reuse=True): + features = tf.parse_single_example( + serialized_example, + features={ + 'spec1': tf.FixedLenFeature([n_spec,n_features], tf.float32), + 'spec2': tf.FixedLenFeature([n_spec,n_features], tf.float32), + 'output':tf.FixedLenFeature([2], tf.int64) + }) + spec1 = features['spec1'] + spec1 = tf.cast(spec1, tf.float32) * (1. / 255) + spec2 = features['spec2'] + output = tf.cast(spec2, tf.float32) * (1. / 255) + output = tf.cast(features['output'], tf.float32) + specs1.append(spec1) + specs2.append(spec2) + outputs.append(output) + + specs1 = tf.parallel_stack(specs1, 0) + specs2 = tf.parallel_stack(specs2, 0) + outputs = tf.parallel_stack(outputs, 0) + specs1 = tf.cast(specs1, tf.float32) + specs2 = tf.cast(specs2, tf.float32) + + specs1 = tf.reshape(specs1, shape=batch_shape) + specs2 = tf.reshape(specs1, shape=batch_shape) + specs1_shape = specs1.get_shape() + specs2_shape = specs2.get_shape() + outputs_shape = outputs.get_shape() + copy_stage = data_flow_ops.StagingArea( + [tf.float32, tf.float32, tf.float32], + shapes=[specs1_shape, specs2_shape, outputs_shape]) + copy_stage_op = copy_stage.put( + [specs1, specs2, outputs]) + staged_specs1, staged_specs2, staged_outputs = copy_stage.get() + return specs1, spec2, outputs,n_spec,n_features,n_records def audio_samples_word_count(audio_group='audio'): audio_group = 'story_all' @@ -152,14 +306,27 @@ def fix_csv(audio_group='audio'): fixed_csv_w = csv.writer(fixed_csv, quoting=csv.QUOTE_MINIMAL) fixed_csv_w.writerows(proper_rows) +def convert_old_audio(): + audio_samples = pd.read_csv( './outputs/audio.csv.old' + , names=['word', 'voice', 'rate', 'variant', 'file']) + audio_samples['phonemes'] = 'unknown' + audio_samples['language'] = 'en-US' + audio_samples.loc[audio_samples['variant'] == 'normal','variant'] = 'low' + audio_samples.loc[audio_samples['variant'] == 'phoneme','variant'] = 'medium' + audio_samples = audio_samples[['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file']] + audio_samples.to_csv('./outputs/audio_new.csv',index=False,header=False) if __name__ == '__main__': # sunflower_pairs_data() # create_spectrogram_data() # create_spectrogram_data('story_words') - create_spectrogram_tfrecords('story_words') + # create_spectrogram_tfrecords('story_words') # create_spectrogram_tfrecords('story_words_test') # read_siamese_tfrecords('story_all') + 
# read_siamese_tfrecords('story_words_test') + pickle_constants('story_words_test') + # create_spectrogram_tfrecords('audio') + # padd_zeros_siamese_tfrecords('audio') # create_padded_spectrogram() # create_speech_pairs_data() # print(speech_model_data()) diff --git a/speech_siamese.py b/speech_siamese.py index 9ed9591..a2cdba3 100644 --- a/speech_siamese.py +++ b/speech_siamese.py @@ -2,7 +2,7 @@ from __future__ import absolute_import from __future__ import print_function import numpy as np # from speech_data import speech_model_data -from speech_data import read_siamese_tfrecords +from speech_data import read_siamese_tfrecords_oneshot from keras.models import Model,load_model from keras.layers import Input, Dense, Dropout, LSTM, Lambda, Concatenate from keras.losses import categorical_crossentropy @@ -82,7 +82,7 @@ def siamese_model(input_dim): def train_siamese(): # the data, shuffled and split between train and test sets # tr_pairs, te_pairs, tr_y_e, te_y_e = speech_model_data() - (tr_pairs,te_pairs,tr_y,te_y,n_step,n_features) = read_siamese_tfrecords('story_words_test') + (tr_pairs,te_pairs,tr_y,te_y,n_step,n_features) = read_siamese_tfrecords_oneshot() # tr_y = to_categorical(tr_y_e, num_classes=2) # te_y = to_categorical(te_y_e, num_classes=2) input_dim = (n_step, n_features) From ab452494b3a5516ae4616711064cd1e4c2fad962 Mon Sep 17 00:00:00 2001 From: Malar Kannan Date: Thu, 9 Nov 2017 20:31:29 +0530 Subject: [PATCH 08/15] implemented streaming tfreccords --- speech_data.py | 45 +++++++++++++++++++++++++++++++++++++-------- speech_siamese.py | 33 ++++++++++++++++++++------------- 2 files changed, 57 insertions(+), 21 deletions(-) diff --git a/speech_data.py b/speech_data.py index 26067c5..2730bda 100644 --- a/speech_data.py +++ b/speech_data.py @@ -183,15 +183,15 @@ def reservoir_sample(iterable, k): sample[j] = item # replace item with gradually decreasing probability return sample -def read_siamese_tfrecords_oneshot(audio_group='audio'): +def read_siamese_tfrecords_oneshot(audio_group='audio',sample_size=3000): records_file = os.path.join('./outputs',audio_group+'_padded.tfrecords') record_iterator = tf.python_io.tf_record_iterator(path=records_file) input_pairs = [] output_class = [] const_file = os.path.join('./outputs',audio_group+'.constants') (n_spec,n_features,n_records) = pickle.load(open(const_file,'rb')) - print('reading tfrecords...') - samples = min([30000,n_records]) + print('reading tfrecords({})...'.format(audio_group)) + samples = min([sample_size,n_records]) input_data = np.zeros((samples,2,n_spec,n_features)) output_data = np.zeros((samples,2)) random_samples = enumerate(reservoir_sample(record_iterator,samples)) @@ -205,10 +205,38 @@ def read_siamese_tfrecords_oneshot(audio_group='audio'): input_data[i] = np.asarray([spec1,spec2]) output = example.features.feature['output'].int64_list.value output_data[i] = np.asarray(output) - print('converting to nparray...') - tr_pairs,te_pairs,tr_y,te_y = train_test_split(input_data,output_data,test_size=0.1) - result = (tr_pairs,te_pairs,tr_y,te_y,n_spec,n_features) - return result + # print('converting to nparray...') + # tr_pairs,te_pairs,tr_y,te_y = train_test_split(input_data,output_data,test_size=0.1) + # result = (tr_pairs,te_pairs,tr_y,te_y,n_spec,n_features) + return input_data,output_data + +def read_siamese_tfrecords_generator(audio_group='audio',batch_size=32): + records_file = os.path.join('./outputs',audio_group+'_padded.tfrecords') + input_pairs = [] + output_class = [] + const_file = 
os.path.join('./outputs',audio_group+'.constants') + (n_spec,n_features,n_records) = pickle.load(open(const_file,'rb')) + print('reading tfrecords({})...'.format(audio_group)) + def record_generator(): + input_data = [] + output_data = [] + while True: + record_iterator = tf.python_io.tf_record_iterator(path=records_file) + for (i,string_record) in tqdm(enumerate(record_iterator),total=n_records): + example = tf.train.Example() + example.ParseFromString(string_record) + spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(n_spec,n_features) + spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(n_spec,n_features) + input_data.append(np.asarray([spec1,spec2])) + output = example.features.feature['output'].int64_list.value + output_data.append(np.asarray(output)) + if len(input_data) == batch_size: + input_arr = np.asarray(input_data) + output_arr = np.asarray(output_data) + yield ([input_arr[:, 0], input_arr[:, 1]],output_arr) + input_data = [] + output_data = [] + return record_generator,n_spec,n_features,n_records def read_siamese_tfrecords(audio_group='audio'): audio_group='story_words_test' @@ -324,7 +352,8 @@ if __name__ == '__main__': # create_spectrogram_tfrecords('story_words_test') # read_siamese_tfrecords('story_all') # read_siamese_tfrecords('story_words_test') - pickle_constants('story_words_test') + padd_zeros_siamese_tfrecords('story_words') + # pickle_constants('story_words') # create_spectrogram_tfrecords('audio') # padd_zeros_siamese_tfrecords('audio') # create_padded_spectrogram() diff --git a/speech_siamese.py b/speech_siamese.py index a2cdba3..74d9566 100644 --- a/speech_siamese.py +++ b/speech_siamese.py @@ -2,7 +2,7 @@ from __future__ import absolute_import from __future__ import print_function import numpy as np # from speech_data import speech_model_data -from speech_data import read_siamese_tfrecords_oneshot +from speech_data import read_siamese_tfrecords_oneshot,read_siamese_tfrecords_generator from keras.models import Model,load_model from keras.layers import Input, Dense, Dropout, LSTM, Lambda, Concatenate from keras.losses import categorical_crossentropy @@ -82,7 +82,10 @@ def siamese_model(input_dim): def train_siamese(): # the data, shuffled and split between train and test sets # tr_pairs, te_pairs, tr_y_e, te_y_e = speech_model_data() - (tr_pairs,te_pairs,tr_y,te_y,n_step,n_features) = read_siamese_tfrecords_oneshot() + batch_size = 512 + tr_gen_fn,n_step,n_features,n_records = read_siamese_tfrecords_generator('audio',batch_size) + tr_gen = tr_gen_fn() + (te_pairs,te_y) = read_siamese_tfrecords_oneshot('audio',1000) # tr_y = to_categorical(tr_y_e, num_classes=2) # te_y = to_categorical(te_y_e, num_classes=2) input_dim = (n_step, n_features) @@ -113,22 +116,26 @@ def train_siamese(): # train rms = RMSprop(lr=0.001) model.compile(loss=categorical_crossentropy, optimizer=rms, metrics=[accuracy]) - model.fit( - [tr_pairs[:, 0], tr_pairs[:, 1]], - tr_y, - batch_size=128, - epochs=100, - validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y), - callbacks=[tb_cb, cp_cb]) + # model.fit( + # [tr_pairs[:, 0], tr_pairs[:, 1]], + # tr_y, + # batch_size=128, + # epochs=100, + # validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y), + # callbacks=[tb_cb, cp_cb]) + model.fit_generator(tr_gen + ,epochs=100 + ,steps_per_epoch=n_records//batch_size + ,use_multiprocessing=True) model.save('./models/siamese_speech_model-final.h5') # compute final accuracy on training and test sets - y_pred = model.predict([tr_pairs[:, 0], 
tr_pairs[:, 1]]) - tr_acc = compute_accuracy(tr_y, y_pred) + # y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]]) + # tr_acc = compute_accuracy(tr_y, y_pred) + # print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc)) + y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]]) te_acc = compute_accuracy(te_y, y_pred) - - print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc)) print('* Accuracy on test set: %0.2f%%' % (100 * te_acc)) From e9b18921eeaa06ce8a41ad5e52cf05f31de2cdf4 Mon Sep 17 00:00:00 2001 From: Malar Kannan Date: Fri, 10 Nov 2017 14:07:31 +0530 Subject: [PATCH 09/15] implemented train/test split at word-level and generator returns one-shot validation data --- speech_data.py | 206 +++++++++++++++++++++++++++++++--------------- speech_siamese.py | 15 ++-- 2 files changed, 149 insertions(+), 72 deletions(-) diff --git a/speech_data.py b/speech_data.py index 2730bda..2659eb7 100644 --- a/speech_data.py +++ b/speech_data.py @@ -35,72 +35,74 @@ def _int64_feature(value): def _bytes_feature(value): return tf.train.Feature(bytes_list=tf.train.BytesList(value=value)) -def create_spectrogram_tfrecords(audio_group='audio'): +def create_spectrogram_tfrecords(audio_group='audio',sample_count=0): ''' http://warmspringwinds.github.io/tensorflow/tf-slim/2016/12/21/tfrecords-guide/ http://www.machinelearninguru.com/deep_learning/tensorflow/basics/tfrecord/tfrecord.html ''' - audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv' - , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file'] - , quoting=csv.QUOTE_NONE) + audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv',index_col=0) audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x) - audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists) - audio_samples = audio_samples[audio_samples['file_exists'] == True].reset_index() + n_records,n_spec,n_features = 0,0,0 - n_records = n_spec = n_features = 0 + def write_samples(wg,sample_name): + wg_sampled = reservoir_sample(wg,sample_count) if sample_count > 0 else wg + word_group_prog = tqdm(wg_sampled,desc='Computing spectrogram') + record_file = './outputs/{}.{}.tfrecords'.format(audio_group,sample_name) + writer = tf.python_io.TFRecordWriter(record_file) + for (w, word_group) in word_group_prog: + word_group_prog.set_postfix(word=w,sample_name=sample_name) + g = word_group.reset_index() + g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram) + sample_right = g.loc[g['variant'] == 'low'] + sample_wrong = g.loc[g['variant'] == 'medium'] + same, diff = siamese_pairs(sample_right, sample_wrong) + groups = [([0,1],same),([1,0],diff)] + for (output,group) in groups: + group_prog = tqdm(group,desc='Writing Spectrogram') + for sample1,sample2 in group_prog: + group_prog.set_postfix(output=output + ,var1=sample1['variant'] + ,var2=sample2['variant']) + spectro1,spectro2 = sample1['spectrogram'],sample2['spectrogram'] + spec_n1,spec_n2 = spectro1.shape[0],spectro2.shape[0] + spec_w1,spec_w2 = spectro1.shape[1],spectro2.shape[1] + spec1,spec2 = spectro1.reshape(-1),spectro2.reshape(-1) + nonlocal n_spec,n_records,n_features + n_spec = max([n_spec,spec_n1,spec_n2]) + n_features = spec_w1 + n_records+=1 + example = tf.train.Example(features=tf.train.Features( + feature={ + 'word': _bytes_feature([w.encode('utf-8')]), + 'phoneme1': _bytes_feature([sample1['phonemes'].encode('utf-8')]), + 'phoneme2': 
_bytes_feature([sample2['phonemes'].encode('utf-8')]), + 'voice1': _bytes_feature([sample1['voice'].encode('utf-8')]), + 'voice2': _bytes_feature([sample2['voice'].encode('utf-8')]), + 'language': _bytes_feature([sample1['language'].encode('utf-8')]), + 'rate1':_int64_feature([sample1['rate']]), + 'rate2':_int64_feature([sample2['rate']]), + 'variant1': _bytes_feature([sample1['variant'].encode('utf-8')]), + 'variant2': _bytes_feature([sample2['variant'].encode('utf-8')]), + 'file1': _bytes_feature([sample1['file'].encode('utf-8')]), + 'file2': _bytes_feature([sample2['file'].encode('utf-8')]), + 'spec1':_float_feature(spec1), + 'spec2':_float_feature(spec2), + 'spec_n1':_int64_feature([spec_n1]), + 'spec_w1':_int64_feature([spec_w1]), + 'spec_n2':_int64_feature([spec_n2]), + 'spec_w2':_int64_feature([spec_w2]), + 'output':_int64_feature(output) + } + )) + writer.write(example.SerializeToString()) + group_prog.close() + word_group_prog.close() + writer.close() - writer = tf.python_io.TFRecordWriter('./outputs/' + audio_group + '.tfrecords') - prog = tqdm(audio_samples.groupby(audio_samples['word']),desc='Computing spectrogram') - for (w, word_group) in prog: - prog.set_postfix(word=w) - g = word_group.reset_index() - g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram) - sample_right = g.loc[g['variant'] == 'low'] - sample_wrong = g.loc[g['variant'] == 'medium'] - same, diff = siamese_pairs(sample_right, sample_wrong) - groups = [([0,1],same),([1,0],diff)] - for (output,group) in groups: - group_prog = tqdm(group,desc='Writing Spectrogram') - for sample1,sample2 in group_prog: - group_prog.set_postfix(output=output - ,var1=sample1['variant'] - ,var2=sample2['variant']) - spectro1,spectro2 = sample1['spectrogram'],sample2['spectrogram'] - spec_n1,spec_n2 = spectro1.shape[0],spectro2.shape[0] - spec_w1,spec_w2 = spectro1.shape[1],spectro2.shape[1] - spec1,spec2 = spectro1.reshape(-1),spectro2.reshape(-1) - - n_spec = max([n_spec,spec_n1,spec_n2]) - n_features = spec_w1 - n_records+=1 - - example = tf.train.Example(features=tf.train.Features( - feature={ - 'word': _bytes_feature([w.encode('utf-8')]), - 'phoneme1': _bytes_feature([sample1['phonemes'].encode('utf-8')]), - 'phoneme2': _bytes_feature([sample2['phonemes'].encode('utf-8')]), - 'voice1': _bytes_feature([sample1['voice'].encode('utf-8')]), - 'voice2': _bytes_feature([sample2['voice'].encode('utf-8')]), - 'language': _bytes_feature([sample1['language'].encode('utf-8')]), - 'rate1':_int64_feature([sample1['rate']]), - 'rate2':_int64_feature([sample2['rate']]), - 'variant1': _bytes_feature([sample1['variant'].encode('utf-8')]), - 'variant2': _bytes_feature([sample2['variant'].encode('utf-8')]), - 'file1': _bytes_feature([sample1['file'].encode('utf-8')]), - 'file2': _bytes_feature([sample2['file'].encode('utf-8')]), - 'spec1':_float_feature(spec1), - 'spec2':_float_feature(spec2), - 'spec_n1':_int64_feature([spec_n1]), - 'spec_w1':_int64_feature([spec_w1]), - 'spec_n2':_int64_feature([spec_n2]), - 'spec_w2':_int64_feature([spec_w2]), - 'output':_int64_feature(output) - } - )) - writer.write(example.SerializeToString()) - group_prog.close() - prog.close() - writer.close() + word_groups = [i for i in audio_samples.groupby('word')] + tr_audio_samples,te_audio_samples = train_test_split(word_groups,test_size=0.1) + write_samples(tr_audio_samples,'train') + write_samples(te_audio_samples,'test') const_file = os.path.join('./outputs',audio_group+'.constants') 
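+    # Cache (n_spec, n_features, n_records) next to the .tfrecords output so
+    # readers can size their padded arrays without rescanning every record.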
pickle.dump((n_spec,n_features,n_records),open(const_file,'wb'))
@@ -196,12 +198,15 @@ def read_siamese_tfrecords_oneshot(audio_group='audio',sample_size=3000):
     output_data = np.zeros((samples,2))
     random_samples = enumerate(reservoir_sample(record_iterator,samples))
     for (i,string_record) in tqdm(random_samples,total=samples):
-        # if i == samples:
-        #     break
         example = tf.train.Example()
         example.ParseFromString(string_record)
-        spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(n_spec,n_features)
-        spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(n_spec,n_features)
+        spec_n1 = example.features.feature['spec_n1'].int64_list.value[0]
+        spec_n2 = example.features.feature['spec_n2'].int64_list.value[0]
+        spec_w1 = example.features.feature['spec_w1'].int64_list.value[0]
+        spec_w2 = example.features.feature['spec_w2'].int64_list.value[0]
+        spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1)
+        spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2)
+        p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec)
         input_data[i] = np.asarray([spec1,spec2])
         output = example.features.feature['output'].int64_list.value
         output_data[i] = np.asarray(output)
@@ -210,7 +215,65 @@ def read_siamese_tfrecords_oneshot(audio_group='audio',sample_size=3000):
     # result = (tr_pairs,te_pairs,tr_y,te_y,n_spec,n_features)
     return input_data,output_data

-def read_siamese_tfrecords_generator(audio_group='audio',batch_size=32):
+def read_siamese_tfrecords_generator(audio_group='audio',batch_size=32,sample_size=100):
+    records_file = os.path.join('./outputs',audio_group+'.train.tfrecords')
+    input_pairs = []
+    output_class = []
+    const_file = os.path.join('./outputs',audio_group+'.constants')
+    (n_spec,n_features,n_records) = pickle.load(open(const_file,'rb'))
+    print('reading tfrecords({}-train)...'.format(audio_group))
+    def record_generator():
+        input_data = []
+        output_data = []
+        while True:
+            record_iterator = tf.python_io.tf_record_iterator(path=records_file)
+            #tqdm(enumerate(record_iterator),total=n_records)
+            for (i,string_record) in enumerate(record_iterator):
+                example = tf.train.Example()
+                example.ParseFromString(string_record)
+                spec_n1 = example.features.feature['spec_n1'].int64_list.value[0]
+                spec_n2 = example.features.feature['spec_n2'].int64_list.value[0]
+                spec_w1 = example.features.feature['spec_w1'].int64_list.value[0]
+                spec_w2 = example.features.feature['spec_w2'].int64_list.value[0]
+                spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1)
+                spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2)
+                p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec)
+                input_data.append(np.asarray([p_spec1,p_spec2]))
+                output = example.features.feature['output'].int64_list.value
+                output_data.append(np.asarray(output))
+                if len(input_data) == batch_size:
+                    input_arr = np.asarray(input_data)
+                    output_arr = np.asarray(output_data)
+                    yield ([input_arr[:, 0], input_arr[:, 1]],output_arr)
+                    input_data = []
+                    output_data = []
+
+    # Read test in one-shot
+    te_records_file = os.path.join('./outputs',audio_group+'.test.tfrecords')
+    te_re_iterator = tf.python_io.tf_record_iterator(path=te_records_file)
+    print('reading tfrecords({}-test)...'.format(audio_group))
+    samples = min([sample_size,n_records])
+    # samples = n_records
+    input_data = np.zeros((samples,2,n_spec,n_features))
+    output_data =
np.zeros((samples,2)) + random_samples = enumerate(reservoir_sample(te_re_iterator,samples)) + for (i,string_record) in tqdm(random_samples,total=samples): + example = tf.train.Example() + example.ParseFromString(string_record) + spec_n1 = example.features.feature['spec_n1'].int64_list.value[0] + spec_n2 = example.features.feature['spec_n2'].int64_list.value[0] + spec_w1 = example.features.feature['spec_w1'].int64_list.value[0] + spec_w2 = example.features.feature['spec_w2'].int64_list.value[0] + spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1) + spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2) + p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec) + input_data[i] = np.asarray([p_spec1,p_spec2]) + output = example.features.feature['output'].int64_list.value + output_data[i] = np.asarray(output) + + return record_generator,input_data,output_data,n_spec,n_features,n_records + +def read_siamese_tfrecords_generator_old(audio_group='audio',batch_size=32): records_file = os.path.join('./outputs',audio_group+'_padded.tfrecords') input_pairs = [] output_class = [] @@ -330,9 +393,16 @@ def fix_csv(audio_group='audio'): audio_csv_lines = open('./outputs/' + audio_group + '.csv','r').readlines() audio_csv_data = [i.strip().split(',') for i in audio_csv_lines] proper_rows = [i for i in audio_csv_data if len(i) == 7] - with open('./outputs/' + audio_group + '-new.csv','w') as fixed_csv: + with open('./outputs/' + audio_group + '.csv','w') as fixed_csv: fixed_csv_w = csv.writer(fixed_csv, quoting=csv.QUOTE_MINIMAL) fixed_csv_w.writerows(proper_rows) + audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv' + , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file']) + audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x) + audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists) + audio_samples = audio_samples[audio_samples['file_exists'] == True] + audio_samples = audio_samples.drop(['file_path','file_exists'],axis=1).reset_index(drop=True) + audio_samples.to_csv('./outputs/' + audio_group + '.csv') def convert_old_audio(): audio_samples = pd.read_csv( './outputs/audio.csv.old' @@ -352,9 +422,11 @@ if __name__ == '__main__': # create_spectrogram_tfrecords('story_words_test') # read_siamese_tfrecords('story_all') # read_siamese_tfrecords('story_words_test') - padd_zeros_siamese_tfrecords('story_words') + # padd_zeros_siamese_tfrecords('story_words') + # fix_csv() # pickle_constants('story_words') - # create_spectrogram_tfrecords('audio') + # create_spectrogram_tfrecords('audio',sample_count=100) + read_siamese_tfrecords_generator('audio') # padd_zeros_siamese_tfrecords('audio') # create_padded_spectrogram() # create_speech_pairs_data() diff --git a/speech_siamese.py b/speech_siamese.py index 74d9566..298def4 100644 --- a/speech_siamese.py +++ b/speech_siamese.py @@ -13,6 +13,9 @@ from keras.optimizers import RMSprop from keras.callbacks import TensorBoard, ModelCheckpoint from keras import backend as K +def create_dir(direc): + if not os.path.exists(direc): + os.makedirs(direc) def euclidean_distance(vects): x, y = vects @@ -79,13 +82,14 @@ def siamese_model(input_dim): return model -def train_siamese(): +def train_siamese(audio_group = 'audio'): # the data, shuffled and split between train and test sets # tr_pairs, te_pairs, tr_y_e, te_y_e = speech_model_data() batch_size = 
512 - tr_gen_fn,n_step,n_features,n_records = read_siamese_tfrecords_generator('audio',batch_size) + model_dir = './models/'+audio_group + create_dir(model_dir) + tr_gen_fn,te_pairs,te_y,n_step,n_features,n_records = read_siamese_tfrecords_generator(audio_group,batch_size,300) tr_gen = tr_gen_fn() - (te_pairs,te_y) = read_siamese_tfrecords_oneshot('audio',1000) # tr_y = to_categorical(tr_y_e, num_classes=2) # te_y = to_categorical(te_y_e, num_classes=2) input_dim = (n_step, n_features) @@ -102,7 +106,7 @@ def train_siamese(): embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None) - cp_file_fmt = './models/siamese_speech_model-{epoch:02d}-epoch-{val_loss:0.2f}\ + cp_file_fmt = model_dir+'/siamese_speech_model-{epoch:02d}-epoch-{val_loss:0.2f}\ -acc.h5' cp_cb = ModelCheckpoint( @@ -126,9 +130,10 @@ def train_siamese(): model.fit_generator(tr_gen ,epochs=100 ,steps_per_epoch=n_records//batch_size + ,validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y) ,use_multiprocessing=True) - model.save('./models/siamese_speech_model-final.h5') + model.save(model_dir+'/siamese_speech_model-final.h5') # compute final accuracy on training and test sets # y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]]) # tr_acc = compute_accuracy(tr_y, y_pred) From 1190312def711a181a42179796b85ed301c7902b Mon Sep 17 00:00:00 2001 From: Malar Kannan Date: Fri, 10 Nov 2017 14:15:12 +0530 Subject: [PATCH 10/15] removed tfrecord tensor code and remnants --- speech_data.py | 205 +------------------------------------------------ 1 file changed, 1 insertion(+), 204 deletions(-) diff --git a/speech_data.py b/speech_data.py index 2659eb7..c67b015 100644 --- a/speech_data.py +++ b/speech_data.py @@ -110,67 +110,6 @@ def padd_zeros(spgr, max_samples): return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)], 'constant') -def find_max_n(trf): - max_n,n_records = 0,0 - max_n_it = tf.python_io.tf_record_iterator(path=trf) - for string_record in max_n_it: - example = tf.train.Example() - example.ParseFromString(string_record) - spec_n1 = example.features.feature['spec_n1'].int64_list.value[0] - spec_n2 = example.features.feature['spec_n2'].int64_list.value[0] - max_n = max([max_n,spec_n1,spec_n2]) - n_records+=1 - return (max_n,n_records) - -def padd_zeros_siamese_tfrecords(audio_group='audio'): - records_file = os.path.join('./outputs',audio_group+'.tfrecords') - record_iterator = tf.python_io.tf_record_iterator(path=records_file) - print('finding max_n...') - max_n,n_records = find_max_n(records_file) - p_spec1 = None - print('reading tfrecords...') - writer = tf.python_io.TFRecordWriter('./outputs/' + audio_group + '_padded.tfrecords') - for string_record in tqdm(record_iterator,desc='padding siamese record',total=n_records): - example = tf.train.Example() - example.ParseFromString(string_record) - spec_n1 = example.features.feature['spec_n1'].int64_list.value[0] - spec_n2 = example.features.feature['spec_n2'].int64_list.value[0] - spec_w1 = example.features.feature['spec_w1'].int64_list.value[0] - spec_w2 = example.features.feature['spec_w2'].int64_list.value[0] - spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1) - spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2) - p_spec1,p_spec2 = padd_zeros(spec1,max_n),padd_zeros(spec2,max_n) - output = example.features.feature['output'].int64_list.value - w_example = tf.train.Example(features=tf.train.Features( - feature={ - 'spec1':_float_feature(p_spec1.reshape(-1)), 
- 'spec2':_float_feature(p_spec2.reshape(-1)), - 'output':_int64_feature(output) - } - )) - writer.write(w_example.SerializeToString()) - const_file = os.path.join('./outputs',audio_group+'.constants') - pickle.dump((max_n,p_spec1.shape[1],n_records),open(const_file,'wb')) - writer.close() - -def pickle_constants(audio_group='audio'): - records_file = os.path.join('./outputs',audio_group+'_padded.tfrecords') - record_iterator = tf.python_io.tf_record_iterator(path=records_file) - print('finding max_n...') - max_n,n_records = find_max_n(records_file) - spec1 = 0 - print('finding spec_w1...') - record_iterator = tf.python_io.tf_record_iterator(path=records_file) - for string_record in record_iterator: - example = tf.train.Example() - example.ParseFromString(string_record) - spec1 = len(example.features.feature['spec1'].float_list.value)//max_n - print('found spec_w1...') - break - const_file = os.path.join('./outputs',audio_group+'.constants') - print(max_n,spec1,n_records) - pickle.dump((max_n,spec1,n_records),open(const_file,'wb')) - def reservoir_sample(iterable, k): it = iter(iterable) if not (k > 0): @@ -185,36 +124,6 @@ def reservoir_sample(iterable, k): sample[j] = item # replace item with gradually decreasing probability return sample -def read_siamese_tfrecords_oneshot(audio_group='audio',sample_size=3000): - records_file = os.path.join('./outputs',audio_group+'_padded.tfrecords') - record_iterator = tf.python_io.tf_record_iterator(path=records_file) - input_pairs = [] - output_class = [] - const_file = os.path.join('./outputs',audio_group+'.constants') - (n_spec,n_features,n_records) = pickle.load(open(const_file,'rb')) - print('reading tfrecords({})...'.format(audio_group)) - samples = min([sample_size,n_records]) - input_data = np.zeros((samples,2,n_spec,n_features)) - output_data = np.zeros((samples,2)) - random_samples = enumerate(reservoir_sample(record_iterator,samples)) - for (i,string_record) in tqdm(random_samples,total=samples): - example = tf.train.Example() - example.ParseFromString(string_record) - spec_n1 = example.features.feature['spec_n1'].int64_list.value[0] - spec_n2 = example.features.feature['spec_n2'].int64_list.value[0] - spec_w1 = example.features.feature['spec_w1'].int64_list.value[0] - spec_w2 = example.features.feature['spec_w2'].int64_list.value[0] - spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1) - spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2) - p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec) - input_data[i] = np.asarray([spec1,spec2]) - output = example.features.feature['output'].int64_list.value - output_data[i] = np.asarray(output) - # print('converting to nparray...') - # tr_pairs,te_pairs,tr_y,te_y = train_test_split(input_data,output_data,test_size=0.1) - # result = (tr_pairs,te_pairs,tr_y,te_y,n_spec,n_features) - return input_data,output_data - def read_siamese_tfrecords_generator(audio_group='audio',batch_size=32,sample_size=100): records_file = os.path.join('./outputs',audio_group+'.train.tfrecords') input_pairs = [] @@ -273,120 +182,8 @@ def read_siamese_tfrecords_generator(audio_group='audio',batch_size=32,sample_si return record_generator,input_data,output_data,n_spec,n_features,n_records -def read_siamese_tfrecords_generator_old(audio_group='audio',batch_size=32): - records_file = os.path.join('./outputs',audio_group+'_padded.tfrecords') - input_pairs = [] - output_class = [] - const_file = 
os.path.join('./outputs',audio_group+'.constants') - (n_spec,n_features,n_records) = pickle.load(open(const_file,'rb')) - print('reading tfrecords({})...'.format(audio_group)) - def record_generator(): - input_data = [] - output_data = [] - while True: - record_iterator = tf.python_io.tf_record_iterator(path=records_file) - for (i,string_record) in tqdm(enumerate(record_iterator),total=n_records): - example = tf.train.Example() - example.ParseFromString(string_record) - spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(n_spec,n_features) - spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(n_spec,n_features) - input_data.append(np.asarray([spec1,spec2])) - output = example.features.feature['output'].int64_list.value - output_data.append(np.asarray(output)) - if len(input_data) == batch_size: - input_arr = np.asarray(input_data) - output_arr = np.asarray(output_data) - yield ([input_arr[:, 0], input_arr[:, 1]],output_arr) - input_data = [] - output_data = [] - return record_generator,n_spec,n_features,n_records - -def read_siamese_tfrecords(audio_group='audio'): - audio_group='story_words_test' - - record_file = os.path.join('./outputs',audio_group+'_padded.tfrecords') - const_file = os.path.join('./outputs',audio_group+'.constants') - (n_spec,n_features) = pickle.load(open(const_file,'rb')) - - filename_queue = tf.train.string_input_producer([record_file]) - reader = tf.TFRecordReader() - _, serialized_example = reader.read(filename_queue) - features = tf.parse_single_example(serialized_example, - features={ - 'spec1': tf.FixedLenFeature([1,n_spec,n_features], tf.float32), - 'spec2': tf.FixedLenFeature([1,n_spec,n_features], tf.float32), - 'output':tf.FixedLenFeature([2], tf.int64) - }) - spec1 = features['spec1'] - spec1 = tf.cast(spec1, tf.float32) * (1. / 255) - spec2 = features['spec2'] - spec2 = tf.cast(spec2, tf.float32) * (1. / 255) - output = tf.cast(features['output'], tf.int32) - return spec1,spec2, output,n_spec,n_features - -def read_siamese_tfrecords_batch(audio_group='audio', batch_size=32): - audio_group='story_words_test' - record_file = os.path.join('./outputs',audio_group+'_padded.tfrecords') - """ Return tensor to read from TFRecord """ - print('Creating graph for loading {} ...'.format(record_file)) - const_file = os.path.join('./outputs',audio_group+'.constants') - (n_spec,n_features) = pickle.load(open(const_file,'rb')) - records_file = os.path.join('./outputs',audio_group+'.tfrecords') - record_iterator = tf.python_io.tf_record_iterator(path=records_file) - n_records = len([i for i in record_iterator]) - batch_shape=[batch_size, n_spec, n_features] - with tf.variable_scope("SiameseTFRecords"): - record_input = data_flow_ops.RecordInput(record_file, batch_size=batch_size) - records_op = record_input.get_yield_op() - records_op = tf.split(records_op, batch_shape[0], 0) - records_op = [tf.reshape(record, []) for record in records_op] - specs1, specs2 = [],[] - outputs = [] - for i, serialized_example in tqdm(enumerate(records_op)): - with tf.variable_scope("parse_siamese_pairs", reuse=True): - features = tf.parse_single_example( - serialized_example, - features={ - 'spec1': tf.FixedLenFeature([n_spec,n_features], tf.float32), - 'spec2': tf.FixedLenFeature([n_spec,n_features], tf.float32), - 'output':tf.FixedLenFeature([2], tf.int64) - }) - spec1 = features['spec1'] - spec1 = tf.cast(spec1, tf.float32) * (1. / 255) - spec2 = features['spec2'] - output = tf.cast(spec2, tf.float32) * (1. 
/ 255) - output = tf.cast(features['output'], tf.float32) - specs1.append(spec1) - specs2.append(spec2) - outputs.append(output) - - specs1 = tf.parallel_stack(specs1, 0) - specs2 = tf.parallel_stack(specs2, 0) - outputs = tf.parallel_stack(outputs, 0) - specs1 = tf.cast(specs1, tf.float32) - specs2 = tf.cast(specs2, tf.float32) - - specs1 = tf.reshape(specs1, shape=batch_shape) - specs2 = tf.reshape(specs1, shape=batch_shape) - specs1_shape = specs1.get_shape() - specs2_shape = specs2.get_shape() - outputs_shape = outputs.get_shape() - copy_stage = data_flow_ops.StagingArea( - [tf.float32, tf.float32, tf.float32], - shapes=[specs1_shape, specs2_shape, outputs_shape]) - copy_stage_op = copy_stage.put( - [specs1, specs2, outputs]) - staged_specs1, staged_specs2, staged_outputs = copy_stage.get() - return specs1, spec2, outputs,n_spec,n_features,n_records - def audio_samples_word_count(audio_group='audio'): - audio_group = 'story_all' - audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv' - , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file'] - , quoting=csv.QUOTE_NONE) - audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x) - audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists) - audio_samples = audio_samples[audio_samples['file_exists'] == True].reset_index() + audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv') return len(audio_samples.groupby(audio_samples['word'])) def fix_csv(audio_group='audio'): From bb72c4045eefaa110bff4c78667568c28c94df61 Mon Sep 17 00:00:00 2001 From: Malar Kannan Date: Fri, 10 Nov 2017 17:52:21 +0530 Subject: [PATCH 11/15] trying to overfit the model to identify false-negative types --- requirements-linux.txt | 77 ++++++++++++++++++++++++++++++++++++++++++ speech_data.py | 37 +++++++++++--------- speech_siamese.py | 68 ++++++++++++++++++++----------------- 3 files changed, 135 insertions(+), 47 deletions(-) create mode 100644 requirements-linux.txt diff --git a/requirements-linux.txt b/requirements-linux.txt new file mode 100644 index 0000000..c533525 --- /dev/null +++ b/requirements-linux.txt @@ -0,0 +1,77 @@ +bleach==1.5.0 +click==6.7 +cloudpickle==0.4.1 +cycler==0.10.0 +dask==0.15.4 +decorator==4.1.2 +distributed==1.19.3 +entrypoints==0.2.3 +enum34==1.1.6 +futures==3.1.1 +h5py==2.7.1 +HeapDict==1.0.0 +html5lib==0.9999999 +ipykernel==4.6.1 +ipython==6.2.1 +ipython-genutils==0.2.0 +ipywidgets==7.0.3 +jedi==0.11.0 +Jinja2==2.9.6 +jsonschema==2.6.0 +jupyter==1.0.0 +jupyter-client==5.1.0 +jupyter-console==5.2.0 +jupyter-core==4.3.0 +Keras==2.0.8 +locket==0.2.0 +Markdown==2.6.9 +MarkupSafe==1.0 +matplotlib==2.1.0 +mistune==0.7.4 +msgpack-python==0.4.8 +nbconvert==5.3.1 +nbformat==4.4.0 +notebook==5.2.0 +numexpr==2.6.4 +numpy==1.13.3 +pandas==0.20.3 +pandocfilters==1.4.2 +parso==0.1.0 +partd==0.3.8 +pexpect==4.2.1 +pickleshare==0.7.4 +pkg-resources==0.0.0 +progressbar2==3.34.3 +prompt-toolkit==1.0.15 +protobuf==3.4.0 +psutil==5.4.0 +ptyprocess==0.5.2 +PyAudio==0.2.11 +Pygments==2.2.0 +pyparsing==2.2.0 +pysndfile==1.0.0 +python-dateutil==2.6.1 +python-utils==2.2.0 +pytz==2017.2 +PyYAML==3.12 +pyzmq==16.0.2 +qtconsole==4.3.1 +scikit-learn==0.19.0 +scipy==0.19.1 +simplegeneric==0.8.1 +six==1.11.0 +sortedcontainers==1.5.7 +tables==3.4.2 +tblib==1.3.2 +tensorflow==1.3.0 +tensorflow-tensorboard==0.4.0rc1 +terminado==0.6 +testpath==0.3.1 +toolz==0.8.2 +tornado==4.5.2 +tqdm==4.19.4 +traitlets==4.3.2 +wcwidth==0.1.7 
+Werkzeug==0.12.2 +widgetsnbextension==3.0.6 +zict==0.1.3 diff --git a/speech_data.py b/speech_data.py index c67b015..4a85b76 100644 --- a/speech_data.py +++ b/speech_data.py @@ -20,9 +20,10 @@ def siamese_pairs(rightGroup, wrongGroup): group1 = [r for (i, r) in rightGroup.iterrows()] group2 = [r for (i, r) in wrongGroup.iterrows()] rightWrongPairs = [(g1, g2) for g2 in group2 for g1 in group1] - rightRightPairs = [i for i in itertools.combinations(group1, 2)] - random.shuffle(rightWrongPairs) - random.shuffle(rightRightPairs) + rightRightPairs = [i for i in itertools.combinations(group1, 2)]#+[i for i in itertools.combinations(group2, 2)] + # random.shuffle(rightWrongPairs) + # random.shuffle(rightRightPairs) + # return rightRightPairs[:10],rightWrongPairs[:10] return rightRightPairs[:32],rightWrongPairs[:32] @@ -45,8 +46,7 @@ def create_spectrogram_tfrecords(audio_group='audio',sample_count=0): n_records,n_spec,n_features = 0,0,0 def write_samples(wg,sample_name): - wg_sampled = reservoir_sample(wg,sample_count) if sample_count > 0 else wg - word_group_prog = tqdm(wg_sampled,desc='Computing spectrogram') + word_group_prog = tqdm(wg,desc='Computing spectrogram') record_file = './outputs/{}.{}.tfrecords'.format(audio_group,sample_name) writer = tf.python_io.TFRecordWriter(record_file) for (w, word_group) in word_group_prog: @@ -100,7 +100,8 @@ def create_spectrogram_tfrecords(audio_group='audio',sample_count=0): writer.close() word_groups = [i for i in audio_samples.groupby('word')] - tr_audio_samples,te_audio_samples = train_test_split(word_groups,test_size=0.1) + wg_sampled = reservoir_sample(word_groups,sample_count) if sample_count > 0 else word_groups + tr_audio_samples,te_audio_samples = train_test_split(wg_sampled,test_size=0.1) write_samples(tr_audio_samples,'train') write_samples(te_audio_samples,'test') const_file = os.path.join('./outputs',audio_group+'.constants') @@ -124,7 +125,7 @@ def reservoir_sample(iterable, k): sample[j] = item # replace item with gradually decreasing probability return sample -def read_siamese_tfrecords_generator(audio_group='audio',batch_size=32,sample_size=100): +def read_siamese_tfrecords_generator(audio_group='audio',batch_size=32,test_size=100): records_file = os.path.join('./outputs',audio_group+'.train.tfrecords') input_pairs = [] output_class = [] @@ -160,13 +161,14 @@ def read_siamese_tfrecords_generator(audio_group='audio',batch_size=32,sample_si # Read test in one-shot te_records_file = os.path.join('./outputs',audio_group+'.test.tfrecords') te_re_iterator = tf.python_io.tf_record_iterator(path=records_file) + te_n_records = len([i for i in te_re_iterator]) + te_re_iterator = tf.python_io.tf_record_iterator(path=records_file) print('reading tfrecords({}-test)...'.format(audio_group)) - samples = min([sample_size,n_records]) - # samples = n_records - input_data = np.zeros((samples,2,n_spec,n_features)) - output_data = np.zeros((samples,2)) - random_samples = enumerate(reservoir_sample(te_re_iterator,samples)) - for (i,string_record) in tqdm(random_samples,total=samples): + test_size = min([test_size,te_n_records]) if test_size > 0 else te_n_records + input_data = np.zeros((test_size,2,n_spec,n_features)) + output_data = np.zeros((test_size,2)) + random_samples = enumerate(reservoir_sample(te_re_iterator,test_size)) + for (i,string_record) in tqdm(random_samples,total=test_size): example = tf.train.Example() example.ParseFromString(string_record) spec_n1 = example.features.feature['spec_n1'].int64_list.value[0] @@ -187,7 +189,7 @@ def 
audio_samples_word_count(audio_group='audio'): return len(audio_samples.groupby(audio_samples['word'])) def fix_csv(audio_group='audio'): - audio_csv_lines = open('./outputs/' + audio_group + '.csv','r').readlines() + audio_csv_lines = open('./outputs/' + audio_group + '.csv.orig','r').readlines() audio_csv_data = [i.strip().split(',') for i in audio_csv_lines] proper_rows = [i for i in audio_csv_data if len(i) == 7] with open('./outputs/' + audio_group + '.csv','w') as fixed_csv: @@ -220,10 +222,13 @@ if __name__ == '__main__': # read_siamese_tfrecords('story_all') # read_siamese_tfrecords('story_words_test') # padd_zeros_siamese_tfrecords('story_words') - # fix_csv() + # fix_csv('story_words') # pickle_constants('story_words') # create_spectrogram_tfrecords('audio',sample_count=100) - read_siamese_tfrecords_generator('audio') + # create_spectrogram_tfrecords('story_all',sample_count=25) + create_spectrogram_tfrecords('story_words',sample_count=10) + # create_spectrogram_tfrecords('audio',sample_count=50) + # read_siamese_tfrecords_generator('audio') # padd_zeros_siamese_tfrecords('audio') # create_padded_spectrogram() # create_speech_pairs_data() diff --git a/speech_siamese.py b/speech_siamese.py index 298def4..fb89e19 100644 --- a/speech_siamese.py +++ b/speech_siamese.py @@ -2,7 +2,7 @@ from __future__ import absolute_import from __future__ import print_function import numpy as np # from speech_data import speech_model_data -from speech_data import read_siamese_tfrecords_oneshot,read_siamese_tfrecords_generator +from speech_data import read_siamese_tfrecords_generator from keras.models import Model,load_model from keras.layers import Input, Dense, Dropout, LSTM, Lambda, Concatenate from keras.losses import categorical_crossentropy @@ -14,42 +14,46 @@ from keras.callbacks import TensorBoard, ModelCheckpoint from keras import backend as K def create_dir(direc): + import os if not os.path.exists(direc): os.makedirs(direc) -def euclidean_distance(vects): - x, y = vects - return K.sqrt( - K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon())) - - -def eucl_dist_output_shape(shapes): - shape1, shape2 = shapes - return (shape1[0], 1) - - -def contrastive_loss(y_true, y_pred): - '''Contrastive loss from Hadsell-et-al.'06 - http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf - ''' - return K.mean(y_true * K.square(y_pred) + - (1 - y_true) * K.square(K.maximum(1 - y_pred, 0))) +# def euclidean_distance(vects): +# x, y = vects +# return K.sqrt( +# K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon())) +# +# +# def eucl_dist_output_shape(shapes): +# shape1, shape2 = shapes +# return (shape1[0], 1) +# +# +# def contrastive_loss(y_true, y_pred): +# '''Contrastive loss from Hadsell-et-al.'06 +# http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf +# ''' +# return K.mean(y_true * K.square(y_pred) + +# (1 - y_true) * K.square(K.maximum(1 - y_pred, 0))) def create_base_rnn_network(input_dim): '''Base network to be shared (eq. to feature extraction). ''' inp = Input(shape=input_dim) - ls1 = LSTM(256, return_sequences=True)(inp) + ls0 = LSTM(512, return_sequences=True)(inp) + ls1 = LSTM(256, return_sequences=True)(ls0) ls2 = LSTM(128, return_sequences=True)(ls1) # ls3 = LSTM(32, return_sequences=True)(ls2) ls4 = LSTM(64)(ls2) + d1 = Dense(128, activation='relu')(ls4) + d2 = Dense(64, activation='relu')(d1) return Model(inp, ls4) def compute_accuracy(y_true, y_pred): '''Compute classification accuracy with a fixed threshold on distances. 
''' - pred = y_pred.ravel() < 0.5 + pred = y_pred.ravel() > 0.5 return np.mean(pred == y_true) @@ -60,11 +64,12 @@ def accuracy(y_true, y_pred): def dense_classifier(processed): conc_proc = Concatenate()(processed) - d1 = Dense(16, activation='relu')(conc_proc) + d1 = Dense(64, activation='relu')(conc_proc) # dr1 = Dropout(0.1)(d1) - d2 = Dense(8, activation='relu')(d1) + d2 = Dense(128, activation='relu')(d1) + d3 = Dense(8, activation='relu')(d2) # dr2 = Dropout(0.1)(d2) - return Dense(2, activation='softmax')(d2) + return Dense(2, activation='softmax')(d3) def siamese_model(input_dim): # input_dim = (15, 1654) @@ -85,10 +90,10 @@ def siamese_model(input_dim): def train_siamese(audio_group = 'audio'): # the data, shuffled and split between train and test sets # tr_pairs, te_pairs, tr_y_e, te_y_e = speech_model_data() - batch_size = 512 + batch_size = 128 model_dir = './models/'+audio_group create_dir(model_dir) - tr_gen_fn,te_pairs,te_y,n_step,n_features,n_records = read_siamese_tfrecords_generator(audio_group,batch_size,300) + tr_gen_fn,te_pairs,te_y,n_step,n_features,n_records = read_siamese_tfrecords_generator(audio_group,batch_size,256) tr_gen = tr_gen_fn() # tr_y = to_categorical(tr_y_e, num_classes=2) # te_y = to_categorical(te_y_e, num_classes=2) @@ -113,12 +118,12 @@ def train_siamese(audio_group = 'audio'): cp_file_fmt, monitor='val_loss', verbose=0, - save_best_only=False, - save_weights_only=False, + save_best_only=True, + save_weights_only=True, mode='auto', period=1) # train - rms = RMSprop(lr=0.001) + rms = RMSprop()#lr=0.001 model.compile(loss=categorical_crossentropy, optimizer=rms, metrics=[accuracy]) # model.fit( # [tr_pairs[:, 0], tr_pairs[:, 1]], @@ -128,11 +133,11 @@ def train_siamese(audio_group = 'audio'): # validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y), # callbacks=[tb_cb, cp_cb]) model.fit_generator(tr_gen - ,epochs=100 + ,epochs=1000 ,steps_per_epoch=n_records//batch_size ,validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y) ,use_multiprocessing=True) - + # ,callbacks=[tb_cb, cp_cb]) model.save(model_dir+'/siamese_speech_model-final.h5') # compute final accuracy on training and test sets # y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]]) @@ -146,4 +151,5 @@ def train_siamese(audio_group = 'audio'): if __name__ == '__main__': - train_siamese() + train_siamese('story_words') + # train_siamese('audio') From d978272bdb827d3d5da684278297c800614feef6 Mon Sep 17 00:00:00 2001 From: Malar Kannan Date: Fri, 10 Nov 2017 18:06:45 +0530 Subject: [PATCH 12/15] saving model and tensorboard checkpointing model --- speech_siamese.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/speech_siamese.py b/speech_siamese.py index fb89e19..307136a 100644 --- a/speech_siamese.py +++ b/speech_siamese.py @@ -93,6 +93,8 @@ def train_siamese(audio_group = 'audio'): batch_size = 128 model_dir = './models/'+audio_group create_dir(model_dir) + log_dir = './logs/'+audio_group + create_dir(log_dir) tr_gen_fn,te_pairs,te_y,n_step,n_features,n_records = read_siamese_tfrecords_generator(audio_group,batch_size,256) tr_gen = tr_gen_fn() # tr_y = to_categorical(tr_y_e, num_classes=2) @@ -102,7 +104,7 @@ def train_siamese(audio_group = 'audio'): model = siamese_model(input_dim) tb_cb = TensorBoard( - log_dir='./logs/siamese_logs', + log_dir=log_dir, histogram_freq=1, batch_size=32, write_graph=True, @@ -136,8 +138,8 @@ def train_siamese(audio_group = 'audio'): ,epochs=1000 ,steps_per_epoch=n_records//batch_size ,validation_data=([te_pairs[:, 0], 
te_pairs[:, 1]], te_y) - ,use_multiprocessing=True) - # ,callbacks=[tb_cb, cp_cb]) + ,use_multiprocessing=True + ,callbacks=[tb_cb, cp_cb]) model.save(model_dir+'/siamese_speech_model-final.h5') # compute final accuracy on training and test sets # y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]]) From 988f66c2c21e8cb465b3caeaf197a72c77e54e37 Mon Sep 17 00:00:00 2001 From: Malar Kannan Date: Mon, 13 Nov 2017 17:33:37 +0530 Subject: [PATCH 13/15] avoiding same voice similar variants --- pandas_parallel.py | 25 -------------- requirements-linux.txt | 1 - speech_data.py | 21 +++++++++--- speech_siamese.py | 10 ++---- speech_utils.py | 74 ++++++++++++++++++++++++++++++++++++++++++ test_siamese.py | 61 ++++++++++++++++++++++++++++++---- 6 files changed, 148 insertions(+), 44 deletions(-) delete mode 100644 pandas_parallel.py create mode 100644 speech_utils.py diff --git a/pandas_parallel.py b/pandas_parallel.py deleted file mode 100644 index 245da38..0000000 --- a/pandas_parallel.py +++ /dev/null @@ -1,25 +0,0 @@ -import multiprocessing -import pandas as pd -import numpy as np - - - -def _apply_df(args): - df, func, num, kwargs = args - return num, df.apply(func, **kwargs) - -def apply_by_multiprocessing(df,func,**kwargs): - cores = multiprocessing.cpu_count() - workers=kwargs.pop('workers') if 'workers' in kwargs else cores - pool = multiprocessing.Pool(processes=workers) - result = pool.map(_apply_df, [(d, func, i, kwargs) for i,d in enumerate(np.array_split(df, workers))]) - pool.close() - result=sorted(result,key=lambda x:x[0]) - return pd.concat([i[1] for i in result]) - -def square(x): - return x**x - -if __name__ == '__main__': - df = pd.DataFrame({'a':range(10), 'b':range(10)}) - apply_by_multiprocessing(df, square, axis=1, workers=4) diff --git a/requirements-linux.txt b/requirements-linux.txt index c533525..dba434a 100644 --- a/requirements-linux.txt +++ b/requirements-linux.txt @@ -40,7 +40,6 @@ parso==0.1.0 partd==0.3.8 pexpect==4.2.1 pickleshare==0.7.4 -pkg-resources==0.0.0 progressbar2==3.34.3 prompt-toolkit==1.0.15 protobuf==3.4.0 diff --git a/speech_data.py b/speech_data.py index 4a85b76..d11c4ee 100644 --- a/speech_data.py +++ b/speech_data.py @@ -1,5 +1,6 @@ import pandas as pd -from pandas_parallel import apply_by_multiprocessing +from speech_utils import apply_by_multiprocessing +from speech_utils import threadsafe_iter # import dask as dd # import dask.dataframe as ddf import tensorflow as tf @@ -36,7 +37,7 @@ def _int64_feature(value): def _bytes_feature(value): return tf.train.Feature(bytes_list=tf.train.BytesList(value=value)) -def create_spectrogram_tfrecords(audio_group='audio',sample_count=0): +def create_spectrogram_tfrecords(audio_group='audio',sample_count=0,train_test_ratio=0.1): ''' http://warmspringwinds.github.io/tensorflow/tf-slim/2016/12/21/tfrecords-guide/ http://www.machinelearninguru.com/deep_learning/tensorflow/basics/tfrecord/tfrecord.html @@ -60,6 +61,13 @@ def create_spectrogram_tfrecords(audio_group='audio',sample_count=0): for (output,group) in groups: group_prog = tqdm(group,desc='Writing Spectrogram') for sample1,sample2 in group_prog: + same = sample1['variant'] == sample2['variant'] + phon_same = sample1['phonemes'] == sample2['phonemes'] + voice_diff = sample1['voice'] != sample2['voice'] + if not same and phon_same: + continue + if same and not voice_diff: + continue group_prog.set_postfix(output=output ,var1=sample1['variant'] ,var2=sample2['variant']) @@ -101,7 +109,7 @@ def 
create_spectrogram_tfrecords(audio_group='audio',sample_count=0): word_groups = [i for i in audio_samples.groupby('word')] wg_sampled = reservoir_sample(word_groups,sample_count) if sample_count > 0 else word_groups - tr_audio_samples,te_audio_samples = train_test_split(wg_sampled,test_size=0.1) + tr_audio_samples,te_audio_samples = train_test_split(wg_sampled,test_size=train_test_ratio) write_samples(tr_audio_samples,'train') write_samples(te_audio_samples,'test') const_file = os.path.join('./outputs',audio_group+'.constants') @@ -125,13 +133,16 @@ def reservoir_sample(iterable, k): sample[j] = item # replace item with gradually decreasing probability return sample -def read_siamese_tfrecords_generator(audio_group='audio',batch_size=32,test_size=100): + +def read_siamese_tfrecords_generator(audio_group='audio',batch_size=32,test_size=0): records_file = os.path.join('./outputs',audio_group+'.train.tfrecords') input_pairs = [] output_class = [] const_file = os.path.join('./outputs',audio_group+'.constants') (n_spec,n_features,n_records) = pickle.load(open(const_file,'rb')) print('reading tfrecords({}-train)...'.format(audio_group)) + + # @threadsafe_iter def record_generator(): input_data = [] output_data = [] @@ -226,7 +237,7 @@ if __name__ == '__main__': # pickle_constants('story_words') # create_spectrogram_tfrecords('audio',sample_count=100) # create_spectrogram_tfrecords('story_all',sample_count=25) - create_spectrogram_tfrecords('story_words',sample_count=10) + create_spectrogram_tfrecords('story_words',sample_count=10,train_test_ratio=0.2) # create_spectrogram_tfrecords('audio',sample_count=50) # read_siamese_tfrecords_generator('audio') # padd_zeros_siamese_tfrecords('audio') diff --git a/speech_siamese.py b/speech_siamese.py index 307136a..c5f61b6 100644 --- a/speech_siamese.py +++ b/speech_siamese.py @@ -12,11 +12,7 @@ from keras.utils import to_categorical from keras.optimizers import RMSprop from keras.callbacks import TensorBoard, ModelCheckpoint from keras import backend as K - -def create_dir(direc): - import os - if not os.path.exists(direc): - os.makedirs(direc) +from speech_utils import create_dir # def euclidean_distance(vects): # x, y = vects @@ -95,7 +91,7 @@ def train_siamese(audio_group = 'audio'): create_dir(model_dir) log_dir = './logs/'+audio_group create_dir(log_dir) - tr_gen_fn,te_pairs,te_y,n_step,n_features,n_records = read_siamese_tfrecords_generator(audio_group,batch_size,256) + tr_gen_fn,te_pairs,te_y,n_step,n_features,n_records = read_siamese_tfrecords_generator(audio_group,batch_size=batch_size) tr_gen = tr_gen_fn() # tr_y = to_categorical(tr_y_e, num_classes=2) # te_y = to_categorical(te_y_e, num_classes=2) @@ -138,7 +134,7 @@ def train_siamese(audio_group = 'audio'): ,epochs=1000 ,steps_per_epoch=n_records//batch_size ,validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y) - ,use_multiprocessing=True + ,use_multiprocessing=True, workers=1 ,callbacks=[tb_cb, cp_cb]) model.save(model_dir+'/siamese_speech_model-final.h5') # compute final accuracy on training and test sets diff --git a/speech_utils.py b/speech_utils.py new file mode 100644 index 0000000..2581f84 --- /dev/null +++ b/speech_utils.py @@ -0,0 +1,74 @@ +import os +import threading + +import multiprocessing +import pandas as pd +import numpy as np + + + +def _apply_df(args): + df, func, num, kwargs = args + return num, df.apply(func, **kwargs) + +def apply_by_multiprocessing(df,func,**kwargs): + cores = multiprocessing.cpu_count() + workers=kwargs.pop('workers') if 'workers' in kwargs else cores 
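+    # Each chunk from np.array_split is tagged with its index so the partial
+    # results can be sorted back into the original row order before concat.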
+ pool = multiprocessing.Pool(processes=workers) + result = pool.map(_apply_df, [(d, func, i, kwargs) for i,d in enumerate(np.array_split(df, workers))]) + pool.close() + result=sorted(result,key=lambda x:x[0]) + return pd.concat([i[1] for i in result]) + +def square(x): + return x**x + +if __name__ == '__main__': + df = pd.DataFrame({'a':range(10), 'b':range(10)}) + apply_by_multiprocessing(df, square, axis=1, workers=4) + + +def rm_rf(d): + for path in (os.path.join(d,f) for f in os.listdir(d)): + if os.path.isdir(path): + rm_rf(path) + else: + os.unlink(path) + os.rmdir(d) + +def create_dir(direc): + if not os.path.exists(direc): + os.makedirs(direc) + else: + rm_rf(direc) + create_dir(direc) + + +#################### Now make the data generator threadsafe #################### + +class threadsafe_iter: + """Takes an iterator/generator and makes it thread-safe by + serializing call to the `next` method of given iterator/generator. + """ + def __init__(self, it): + self.it = it + self.lock = threading.Lock() + + def __iter__(self): + return self + + def __next__(self): # Py3 + with self.lock: + return next(self.it) + + def next(self): # Py2 + with self.lock: + return self.it.next() + + +def threadsafe_generator(f): + """A decorator that takes a generator function and makes it thread-safe. + """ + def g(*a, **kw): + return threadsafe_iter(f(*a, **kw)) + return g diff --git a/test_siamese.py b/test_siamese.py index 35980d6..c228cec 100644 --- a/test_siamese.py +++ b/test_siamese.py @@ -1,13 +1,14 @@ from speech_siamese import siamese_model from record_mic_speech import record_spectrogram -from importlib import reload +# from importlib import reload # import speech_data # reload(speech_data) -from speech_data import create_test_pair,get_word_pairs_data,speech_data import numpy as np - -model = siamese_model((15, 1654)) -model.load_weights('./models/siamese_speech_model-final.h5') +import os +import pickle +import tensorflow as tf +import csv +from speech_data import padd_zeros def predict_recording_with(m,sample_size=15): spec1 = record_spectrogram(n_sec=1.4) @@ -24,7 +25,55 @@ def test_with(audio_group): print(np.argmax(model.predict([X[:, 0], X[:, 1]]),axis=1)) print(Y.astype(np.int8)) -test_with('rand_edu') +def evaluate_siamese(audio_group='audio',model_file = 'siamese_speech_model-46-epoch-0.29-acc.h5'): + records_file = os.path.join('./outputs',audio_group+'.train.tfrecords') + const_file = os.path.join('./outputs',audio_group+'.constants') + model_weights_path =os.path.join('./models/story_words/',model_file) + (n_spec,n_features,n_records) = pickle.load(open(const_file,'rb')) + print('evaluating tfrecords({}-train)...'.format(audio_group)) + + model = siamese_model((n_spec, n_features)) + model.load_weights(model_weights_path) + record_iterator = tf.python_io.tf_record_iterator(path=records_file) + #tqdm(enumerate(record_iterator),total=n_records) + with open('./outputs/' + audio_group + '.results.csv','w') as result_csv: + result_csv_w = csv.writer(result_csv, quoting=csv.QUOTE_MINIMAL) + for (i,string_record) in enumerate(record_iterator): + # string_record = next(record_iterator) + example = tf.train.Example() + example.ParseFromString(string_record) + spec_n1 = example.features.feature['spec_n1'].int64_list.value[0] + spec_n2 = example.features.feature['spec_n2'].int64_list.value[0] + spec_w1 = example.features.feature['spec_w1'].int64_list.value[0] + spec_w2 = example.features.feature['spec_w2'].int64_list.value[0] + spec1 = 
np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1) + spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2) + p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec) + input_arr = np.asarray([[p_spec1,p_spec2]]) + output_arr = np.asarray([example.features.feature['output'].int64_list.value]) + y_pred = model.predict([input_arr[:, 0], input_arr[:, 1]]) + predicted = np.asarray(y_pred[0]>0.5).astype(output_arr.dtype) + expected = output_arr[0] + if np.all(predicted == expected): + continue + word = example.features.feature['word'].bytes_list.value[0].decode() + phoneme1 = example.features.feature['phoneme1'].bytes_list.value[0].decode() + phoneme2 = example.features.feature['phoneme2'].bytes_list.value[0].decode() + voice1 = example.features.feature['voice1'].bytes_list.value[0].decode() + voice2 = example.features.feature['voice2'].bytes_list.value[0].decode() + language = example.features.feature['language'].bytes_list.value[0].decode() + rate1 = example.features.feature['rate1'].int64_list.value[0] + rate2 = example.features.feature['rate2'].int64_list.value[0] + variant1 = example.features.feature['variant1'].bytes_list.value[0].decode() + variant2 = example.features.feature['variant2'].bytes_list.value[0].decode() + file1 = example.features.feature['file1'].bytes_list.value[0].decode() + file2 = example.features.feature['file2'].bytes_list.value[0].decode() + print(phoneme1,phoneme2,voice1,voice2,rate1,rate2,variant1,variant2,file1,file2) + result_csv_w.writerow([phoneme1,phoneme2,voice1,voice2,rate1,rate2,variant1,variant2,file1,file2]) + + +evaluate_siamese('story_words',model_file='siamese_speech_model-92-epoch-0.20-acc.h5') +# test_with('rand_edu') # sunflower_data,sunflower_result = get_word_pairs_data('sweater',15) # print(np.argmax(model.predict([sunflower_data[:, 0], sunflower_data[:, 1]]),axis=1)) # print(sunflower_result) From e4b8b4e0a744150fcecf655f9bd231710ab292f1 Mon Sep 17 00:00:00 2001 From: Malar Kannan Date: Mon, 13 Nov 2017 19:22:30 +0530 Subject: [PATCH 14/15] visualizing and playing sound files where prediction fails --- speech_data.py | 2 +- spectro_gen.py => speech_spectrum.py | 0 record_mic_speech.py => speech_tools.py | 29 ++++++- test_siamese.py | 107 +++++++++++++++--------- 4 files changed, 95 insertions(+), 43 deletions(-) rename spectro_gen.py => speech_spectrum.py (100%) rename record_mic_speech.py => speech_tools.py (61%) diff --git a/speech_data.py b/speech_data.py index d11c4ee..58430c1 100644 --- a/speech_data.py +++ b/speech_data.py @@ -6,7 +6,7 @@ from speech_utils import threadsafe_iter import tensorflow as tf from tensorflow.python.ops import data_flow_ops import numpy as np -from spectro_gen import generate_aiff_spectrogram +from speech_spectrum import generate_aiff_spectrogram from sklearn.model_selection import train_test_split import itertools import os diff --git a/spectro_gen.py b/speech_spectrum.py similarity index 100% rename from spectro_gen.py rename to speech_spectrum.py diff --git a/record_mic_speech.py b/speech_tools.py similarity index 61% rename from record_mic_speech.py rename to speech_tools.py index 4ed11e2..fd48f34 100644 --- a/record_mic_speech.py +++ b/speech_tools.py @@ -1,15 +1,36 @@ import pyaudio +from pysndfile import sndio as snd import numpy as np # from matplotlib import pyplot as plt -from spectro_gen import plot_stft, generate_spec_frec +from speech_spectrum import plot_stft, generate_spec_frec +SAMPLE_RATE = 22050 +N_CHANNELS = 2 + 
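+# Minimal usage sketch for the player below (the sample AIFF path is one
+# referenced elsewhere in this series):
+#
+#     play_file, close_player = file_player()
+#     play_file('./outputs/story_words/Agnes/150/chicken-Agnes-150-low-1077.aiff', plot=True)
+#     close_player()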
+def file_player(): + p_oup = pyaudio.PyAudio() + def play_file(audiopath,plot=False): + print('playing',audiopath) + samples, samplerate, form = snd.read(audiopath) + stream = p_oup.open( + format=pyaudio.paFloat32, + channels=2, + rate=samplerate, + output=True) + one_channel = np.asarray([samples, samples]).T.reshape(-1) + audio_data = one_channel.astype(np.float32).tobytes() + stream.write(audio_data) + stream.close() + if plot: + plot_stft(samples, SAMPLE_RATE) + def close_player(): + p_oup.terminate() + return play_file,close_player def record_spectrogram(n_sec, plot=False, playback=False): - SAMPLE_RATE = 22050 - N_CHANNELS = 2 + # show_record_prompt() N_SEC = n_sec CHUNKSIZE = int(SAMPLE_RATE * N_SEC / N_CHANNELS) # fixed chunk size - # show_record_prompt() input('Press [Enter] to start recording sample... ') p_inp = pyaudio.PyAudio() stream = p_inp.open( diff --git a/test_siamese.py b/test_siamese.py index c228cec..0d7098e 100644 --- a/test_siamese.py +++ b/test_siamese.py @@ -1,9 +1,10 @@ -from speech_siamese import siamese_model -from record_mic_speech import record_spectrogram +# from speech_siamese import siamese_model +from speech_tools import record_spectrogram, file_player # from importlib import reload # import speech_data # reload(speech_data) import numpy as np +import pandas as pd import os import pickle import tensorflow as tf @@ -25,7 +26,8 @@ def test_with(audio_group): print(np.argmax(model.predict([X[:, 0], X[:, 1]]),axis=1)) print(Y.astype(np.int8)) -def evaluate_siamese(audio_group='audio',model_file = 'siamese_speech_model-46-epoch-0.29-acc.h5'): +def evaluate_siamese(audio_group='audio',model_file = 'siamese_speech_model-305-epoch-0.20-acc.h5'): + # audio_group='audio';model_file = 'siamese_speech_model-305-epoch-0.20-acc.h5' records_file = os.path.join('./outputs',audio_group+'.train.tfrecords') const_file = os.path.join('./outputs',audio_group+'.constants') model_weights_path =os.path.join('./models/story_words/',model_file) @@ -36,43 +38,72 @@ def evaluate_siamese(audio_group='audio',model_file = 'siamese_speech_model-46-e model.load_weights(model_weights_path) record_iterator = tf.python_io.tf_record_iterator(path=records_file) #tqdm(enumerate(record_iterator),total=n_records) - with open('./outputs/' + audio_group + '.results.csv','w') as result_csv: - result_csv_w = csv.writer(result_csv, quoting=csv.QUOTE_MINIMAL) - for (i,string_record) in enumerate(record_iterator): - # string_record = next(record_iterator) - example = tf.train.Example() - example.ParseFromString(string_record) - spec_n1 = example.features.feature['spec_n1'].int64_list.value[0] - spec_n2 = example.features.feature['spec_n2'].int64_list.value[0] - spec_w1 = example.features.feature['spec_w1'].int64_list.value[0] - spec_w2 = example.features.feature['spec_w2'].int64_list.value[0] - spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1) - spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2) - p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec) - input_arr = np.asarray([[p_spec1,p_spec2]]) - output_arr = np.asarray([example.features.feature['output'].int64_list.value]) - y_pred = model.predict([input_arr[:, 0], input_arr[:, 1]]) - predicted = np.asarray(y_pred[0]>0.5).astype(output_arr.dtype) - expected = output_arr[0] - if np.all(predicted == expected): + result_csv = open('./outputs/' + audio_group + '.results.csv','w') + result_csv_w = csv.writer(result_csv, quoting=csv.QUOTE_MINIMAL) + 
result_csv_w.writerow(["phoneme1","phoneme2","voice1","voice2","rate1","rate2","variant1","variant2","file1","file2"]) + for (i,string_record) in enumerate(record_iterator): + # string_record = next(record_iterator) + example = tf.train.Example() + example.ParseFromString(string_record) + spec_n1 = example.features.feature['spec_n1'].int64_list.value[0] + spec_n2 = example.features.feature['spec_n2'].int64_list.value[0] + spec_w1 = example.features.feature['spec_w1'].int64_list.value[0] + spec_w2 = example.features.feature['spec_w2'].int64_list.value[0] + spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1) + spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2) + p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec) + input_arr = np.asarray([[p_spec1,p_spec2]]) + output_arr = np.asarray([example.features.feature['output'].int64_list.value]) + y_pred = model.predict([input_arr[:, 0], input_arr[:, 1]]) + predicted = np.asarray(y_pred[0]>0.5).astype(output_arr.dtype) + expected = output_arr[0] + if np.all(predicted == expected): + continue + word = example.features.feature['word'].bytes_list.value[0].decode() + phoneme1 = example.features.feature['phoneme1'].bytes_list.value[0].decode() + phoneme2 = example.features.feature['phoneme2'].bytes_list.value[0].decode() + voice1 = example.features.feature['voice1'].bytes_list.value[0].decode() + voice2 = example.features.feature['voice2'].bytes_list.value[0].decode() + language = example.features.feature['language'].bytes_list.value[0].decode() + rate1 = example.features.feature['rate1'].int64_list.value[0] + rate2 = example.features.feature['rate2'].int64_list.value[0] + variant1 = example.features.feature['variant1'].bytes_list.value[0].decode() + variant2 = example.features.feature['variant2'].bytes_list.value[0].decode() + file1 = example.features.feature['file1'].bytes_list.value[0].decode() + file2 = example.features.feature['file2'].bytes_list.value[0].decode() + print(phoneme1,phoneme2,voice1,voice2,rate1,rate2,variant1,variant2,file1,file2) + result_csv_w.writerow([phoneme1,phoneme2,voice1,voice2,rate1,rate2,variant1,variant2,file1,file2]) + result_csv.close() + + +def play_results(audio_group='audio'): + result_data = pd.read_csv('./outputs/' + audio_group + '.results.csv') + play_file,close_player = file_player() + quit = False + for (i,r) in result_data.iterrows(): + if quit: + break + keys = ["phoneme1","phoneme2","voice1","voice2","rate1","rate2","variant1","variant2"] + row_vals = [str(r[k]) for k in keys] + h_str = '\t'.join(keys) + row_str = '\t'.join(row_vals) + while True: + print(h_str) + print(row_str) + play_file('./outputs/'+audio_group+'/'+r['file1'],True) + play_file('./outputs/'+audio_group+'/'+r['file2'],True) + a = input("press 'r/q/[Enter]' to replay/quit/continue:\t") + if a == 'r': continue - word = example.features.feature['word'].bytes_list.value[0].decode() - phoneme1 = example.features.feature['phoneme1'].bytes_list.value[0].decode() - phoneme2 = example.features.feature['phoneme2'].bytes_list.value[0].decode() - voice1 = example.features.feature['voice1'].bytes_list.value[0].decode() - voice2 = example.features.feature['voice2'].bytes_list.value[0].decode() - language = example.features.feature['language'].bytes_list.value[0].decode() - rate1 = example.features.feature['rate1'].int64_list.value[0] - rate2 = example.features.feature['rate2'].int64_list.value[0] - variant1 = 
example.features.feature['variant1'].bytes_list.value[0].decode() - variant2 = example.features.feature['variant2'].bytes_list.value[0].decode() - file1 = example.features.feature['file1'].bytes_list.value[0].decode() - file2 = example.features.feature['file2'].bytes_list.value[0].decode() - print(phoneme1,phoneme2,voice1,voice2,rate1,rate2,variant1,variant2,file1,file2) - result_csv_w.writerow([phoneme1,phoneme2,voice1,voice2,rate1,rate2,variant1,variant2,file1,file2]) + if a == 'q': + quit = True + break + else: + break + close_player() - -evaluate_siamese('story_words',model_file='siamese_speech_model-92-epoch-0.20-acc.h5') +# evaluate_siamese('story_words',model_file='siamese_speech_model-305-epoch-0.20-acc.h5') +play_results('story_words') # test_with('rand_edu') # sunflower_data,sunflower_result = get_word_pairs_data('sweater',15) # print(np.argmax(model.predict([sunflower_data[:, 0], sunflower_data[:, 1]]),axis=1)) From 10b024866e5dd7d34d3b22d14c58e579c131e887 Mon Sep 17 00:00:00 2001 From: Malar Kannan Date: Tue, 14 Nov 2017 17:54:44 +0530 Subject: [PATCH 15/15] implemented evaluation of test data with model by overfitting on smaller dataset --- speech_data.py | 12 +++- speech_siamese.py => speech_model.py | 29 +++++--- tts_samplegen.py => speech_samplegen.py | 6 +- speech_spectrum.py | 11 ++- test_siamese.py => speech_test.py | 88 ++++++++++++++++-------- speech_tools.py | 91 ++++++++++++++++++++++++- speech_utils.py | 74 -------------------- 7 files changed, 190 insertions(+), 121 deletions(-) rename speech_siamese.py => speech_model.py (86%) rename tts_samplegen.py => speech_samplegen.py (97%) rename test_siamese.py => speech_test.py (59%) delete mode 100644 speech_utils.py diff --git a/speech_data.py b/speech_data.py index 58430c1..4242998 100644 --- a/speech_data.py +++ b/speech_data.py @@ -1,6 +1,5 @@ import pandas as pd -from speech_utils import apply_by_multiprocessing -from speech_utils import threadsafe_iter +from speech_tools import apply_by_multiprocessing,threadsafe_iter # import dask as dd # import dask.dataframe as ddf import tensorflow as tf @@ -199,6 +198,12 @@ def audio_samples_word_count(audio_group='audio'): audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv') return len(audio_samples.groupby(audio_samples['word'])) +def record_generator_count(records_file): + record_iterator = tf.python_io.tf_record_iterator(path=records_file) + count = len([i for i in record_iterator]) + record_iterator = tf.python_io.tf_record_iterator(path=records_file) + return record_iterator,count + def fix_csv(audio_group='audio'): audio_csv_lines = open('./outputs/' + audio_group + '.csv.orig','r').readlines() audio_csv_data = [i.strip().split(',') for i in audio_csv_lines] @@ -237,7 +242,8 @@ if __name__ == '__main__': # pickle_constants('story_words') # create_spectrogram_tfrecords('audio',sample_count=100) # create_spectrogram_tfrecords('story_all',sample_count=25) - create_spectrogram_tfrecords('story_words',sample_count=10,train_test_ratio=0.2) + # fix_csv('story_words_test') + create_spectrogram_tfrecords('story_words_test',sample_count=100,train_test_ratio=0.0) # create_spectrogram_tfrecords('audio',sample_count=50) # read_siamese_tfrecords_generator('audio') # padd_zeros_siamese_tfrecords('audio') diff --git a/speech_siamese.py b/speech_model.py similarity index 86% rename from speech_siamese.py rename to speech_model.py index c5f61b6..5136398 100644 --- a/speech_siamese.py +++ b/speech_model.py @@ -3,7 +3,7 @@ from __future__ import print_function import numpy as 
diff --git a/speech_siamese.py b/speech_model.py
similarity index 86%
rename from speech_siamese.py
rename to speech_model.py
index c5f61b6..5136398 100644
--- a/speech_siamese.py
+++ b/speech_model.py
@@ -3,7 +3,7 @@ from __future__ import print_function
 import numpy as np
 # from speech_data import speech_model_data
 from speech_data import read_siamese_tfrecords_generator
-from keras.models import Model,load_model
+from keras.models import Model,load_model,model_from_yaml
 from keras.layers import Input, Dense, Dropout, LSTM, Lambda, Concatenate
 from keras.losses import categorical_crossentropy
 # from keras.losses import binary_crossentropy
@@ -12,7 +12,7 @@ from keras.utils import to_categorical
 from keras.optimizers import RMSprop
 from keras.callbacks import TensorBoard, ModelCheckpoint
 from keras import backend as K
-from speech_utils import create_dir
+from speech_tools import create_dir
 
 # def euclidean_distance(vects):
 # x, y = vects
@@ -36,13 +36,13 @@ def create_base_rnn_network(input_dim):
 '''Base network to be shared (eq. to feature extraction).
 '''
 inp = Input(shape=input_dim)
- ls0 = LSTM(512, return_sequences=True)(inp)
- ls1 = LSTM(256, return_sequences=True)(ls0)
+ # ls0 = LSTM(512, return_sequences=True)(inp)
+ ls1 = LSTM(256, return_sequences=True)(inp)
 ls2 = LSTM(128, return_sequences=True)(ls1)
 # ls3 = LSTM(32, return_sequences=True)(ls2)
 ls4 = LSTM(64)(ls2)
- d1 = Dense(128, activation='relu')(ls4)
- d2 = Dense(64, activation='relu')(d1)
+ # d1 = Dense(128, activation='relu')(ls4)
+ d2 = Dense(64, activation='relu')(ls4)
 return Model(inp, ls4)
@@ -62,8 +62,8 @@ def dense_classifier(processed):
 conc_proc = Concatenate()(processed)
 d1 = Dense(64, activation='relu')(conc_proc)
 # dr1 = Dropout(0.1)(d1)
- d2 = Dense(128, activation='relu')(d1)
- d3 = Dense(8, activation='relu')(d2)
+ # d2 = Dense(128, activation='relu')(d1)
+ d3 = Dense(8, activation='relu')(d1)
 # dr2 = Dropout(0.1)(d2)
 return Dense(2, activation='softmax')(d3)
@@ -82,6 +82,16 @@ def siamese_model(input_dim):
 # model = Model([input_a, input_b], distance)
 return model
 
+def write_model_arch(mod,mod_file):
+ model_f = open(mod_file,'w')
+ model_f.write(mod.to_yaml())
+ model_f.close()
+
+def load_model_arch(mod_file):
+ model_f = open(mod_file,'r')
+ mod = model_from_yaml(model_f.read())
+ model_f.close()
+ return mod
 
 def train_siamese(audio_group = 'audio'):
 # the data, shuffled and split between train and test sets
@@ -91,7 +101,7 @@ def train_siamese(audio_group = 'audio'):
 create_dir(model_dir)
 log_dir = './logs/'+audio_group
 create_dir(log_dir)
- tr_gen_fn,te_pairs,te_y,n_step,n_features,n_records = read_siamese_tfrecords_generator(audio_group,batch_size=batch_size)
+ tr_gen_fn,te_pairs,te_y,n_step,n_features,n_records = read_siamese_tfrecords_generator(audio_group,batch_size=batch_size,test_size=batch_size)
 tr_gen = tr_gen_fn()
 # tr_y = to_categorical(tr_y_e, num_classes=2)
 # te_y = to_categorical(te_y_e, num_classes=2)
@@ -123,6 +133,7 @@
 # train
 rms = RMSprop()#lr=0.001
 model.compile(loss=categorical_crossentropy, optimizer=rms, metrics=[accuracy])
+ write_model_arch(model,model_dir+'/siamese_speech_model_arch.yaml')
 # model.fit(
 # [tr_pairs[:, 0], tr_pairs[:, 1]],
 # tr_y,
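With write_model_arch called at compile time, evaluation can rebuild the network from the YAML file and attach any checkpointed weights without re-running the topology code. A sketch of the intended round trip, assuming the file names train_siamese writes for the 'story_words' group and one of the checkpoint names referenced later in this patch:

import pickle
import numpy as np
from speech_model import load_model_arch

# Shape constants pickled by the data pipeline for this group.
n_spec, n_features, n_records = pickle.load(open('./outputs/story_words.constants', 'rb'))
model = load_model_arch('./models/story_words/siamese_speech_model_arch.yaml')
model.load_weights('./models/story_words/siamese_speech_model-712-epoch-0.00-acc.h5')
# Two zero-padded spectrogram batches of shape (batch, n_spec, n_features).
left = np.zeros((1, n_spec, n_features))
right = np.zeros((1, n_spec, n_features))
print(model.predict([left, right]))  # one 2-way softmax row per pair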
diff --git a/tts_samplegen.py b/speech_samplegen.py
similarity index 97%
rename from tts_samplegen.py
rename to speech_samplegen.py
index f9ef32e..a1e6047 100644
--- a/tts_samplegen.py
+++ b/speech_samplegen.py
@@ -12,6 +12,7 @@ import time
 import progressbar
 from generate_similar import similar_phoneme_phrase,similar_phrase
+from speech_tools import format_filename
 
 OUTPUT_NAME = 'story_all'
 dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
@@ -40,7 +41,10 @@ def create_dir(direc):
 
 def dest_filename(w, v, r, t):
- return '{}-{}-{}-{}-{}.aiff'.format(w, v, r, t, str(random.randint(0, 10000)))
+ rand_no = str(random.randint(0, 10000))
+ fname = '{}-{}-{}-{}-{}.aiff'.format(w, v, r, t, rand_no)
+ sanitized = format_filename(fname)
+ return sanitized
 
 def dest_path(v, r, n):
diff --git a/speech_spectrum.py b/speech_spectrum.py
index 2e397e8..794586f 100644
--- a/speech_spectrum.py
+++ b/speech_spectrum.py
@@ -13,6 +13,8 @@ from pysndfile import sndio as snd
 from numpy.lib import stride_tricks
 
 """ short time fourier transform of audio signal """
+STFT_WINDOWS_MSEC = 20
+STFT_WINDOW_OVERLAP = 1.0 / 3
 def stft(sig, frameSize, overlapFac=0.5, window=np.hanning):
 win = window(frameSize)
@@ -74,7 +76,7 @@ def logscale_spec(spec, sr=44100, factor=20.):
 def generate_spec_frec(samples, samplerate):
 # samplerate, samples = wav.read(audiopath)
 # s = stft(samples, binsize)
- s = stft(samples, samplerate * 150 // 1000, 1.0 / 3)
+ s = stft(samples, samplerate * STFT_WINDOWS_MSEC // 1000, STFT_WINDOW_OVERLAP)
 sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
 ims = 20. * np.log10(np.abs(sshow) / 10e-6)
@@ -141,8 +143,11 @@
 if __name__ == '__main__':
- play_sunflower()
- # plot_aiff_stft('./outputs/sunflowers-Alex-150-normal-589.aiff')
+ # play_sunflower()
+ plot_aiff_stft('./outputs/story_words/Agnes/150/chicken-Agnes-150-low-1077.aiff')
+ plot_aiff_stft('./outputs/story_words/Agnes/150/chicken-Agnes-150-medium-1762.aiff')
+ # spec = generate_aiff_spectrogram('./outputs/story_words/Agnes/150/chicken-Agnes-150-low-1077.aiff')
+ # print(spec.shape)
 # plot_aiff_stft('./outputs/sunflowers-Alex-180-normal-4763.aiff')
 # plot_aiff_stft('./outputs/sunflowers-Victoria-180-normal-870.aiff')
 # plot_aiff_stft('./outputs/sunflowers-Fred-180-phoneme-9733.aiff')
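Worked example of the new STFT constants: the frame length in samples is samplerate * STFT_WINDOWS_MSEC // 1000, so the 20 ms window gives 882-sample frames at 44.1 kHz where the old hard-coded 150 ms gave 6615; the overlap fraction is simply passed through to stft() unchanged.

SAMPLE_RATE = 44100
STFT_WINDOWS_MSEC = 20
frame_size = SAMPLE_RATE * STFT_WINDOWS_MSEC // 1000
assert frame_size == 882   # previously 44100 * 150 // 1000 == 6615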
diff --git a/test_siamese.py b/speech_test.py
similarity index 59%
rename from test_siamese.py
rename to speech_test.py
index 0d7098e..1ee7789 100644
--- a/test_siamese.py
+++ b/speech_test.py
@@ -1,5 +1,6 @@
-# from speech_siamese import siamese_model
+from speech_model import load_model_arch
 from speech_tools import record_spectrogram, file_player
+from speech_data import record_generator_count
 # from importlib import reload
 # import speech_data
 # reload(speech_data)
@@ -9,6 +10,7 @@ import os
 import pickle
 import tensorflow as tf
 import csv
+from tqdm import tqdm
 from speech_data import padd_zeros
@@ -17,48 +19,40 @@ def predict_recording_with(m,sample_size=15):
 inp = create_test_pair(spec1,spec2,sample_size)
 return m.predict([inp[:, 0], inp[:, 1]])
 
-# while(True):
-# print(predict_recording_with(model))
-
 def test_with(audio_group):
 X,Y = speech_data(audio_group)
 print(np.argmax(model.predict([X[:, 0], X[:, 1]]),axis=1))
 print(Y.astype(np.int8))
 
-def evaluate_siamese(audio_group='audio',model_file = 'siamese_speech_model-305-epoch-0.20-acc.h5'):
+def evaluate_siamese(records_file,audio_group='audio',weights = 'siamese_speech_model-final.h5'):
 # audio_group='audio';model_file = 'siamese_speech_model-305-epoch-0.20-acc.h5'
- records_file = os.path.join('./outputs',audio_group+'.train.tfrecords')
+ # records_file = os.path.join('./outputs',eval_group+'.train.tfrecords')
 const_file = os.path.join('./outputs',audio_group+'.constants')
- model_weights_path =os.path.join('./models/story_words/',model_file)
+ arch_file='./models/'+audio_group+'/siamese_speech_model_arch.yaml'
+ weight_file='./models/'+audio_group+'/'+weights
 (n_spec,n_features,n_records) = pickle.load(open(const_file,'rb'))
- print('evaluating tfrecords({}-train)...'.format(audio_group))
-
- model = siamese_model((n_spec, n_features))
- model.load_weights(model_weights_path)
- record_iterator = tf.python_io.tf_record_iterator(path=records_file)
- #tqdm(enumerate(record_iterator),total=n_records)
- result_csv = open('./outputs/' + audio_group + '.results.csv','w')
- result_csv_w = csv.writer(result_csv, quoting=csv.QUOTE_MINIMAL)
- result_csv_w.writerow(["phoneme1","phoneme2","voice1","voice2","rate1","rate2","variant1","variant2","file1","file2"])
- for (i,string_record) in enumerate(record_iterator):
+ print('evaluating {}...'.format(records_file))
+ model = load_model_arch(arch_file)
+ # model = siamese_model((n_spec, n_features))
+ model.load_weights(weight_file)
+ record_iterator,records_count = record_generator_count(records_file)
+ total,same_success,diff_success,skipped,same_failed,diff_failed = 0,0,0,0,0,0
+ all_results = []
+ for (i,string_record) in tqdm(enumerate(record_iterator),total=records_count):
 # string_record = next(record_iterator)
+ total+=1
 example = tf.train.Example()
 example.ParseFromString(string_record)
 spec_n1 = example.features.feature['spec_n1'].int64_list.value[0]
 spec_n2 = example.features.feature['spec_n2'].int64_list.value[0]
+ if n_spec < spec_n1 or n_spec < spec_n2:
+ skipped+=1
+ continue
 spec_w1 = example.features.feature['spec_w1'].int64_list.value[0]
 spec_w2 = example.features.feature['spec_w2'].int64_list.value[0]
 spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1)
 spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2)
- p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec)
- input_arr = np.asarray([[p_spec1,p_spec2]])
- output_arr = np.asarray([example.features.feature['output'].int64_list.value])
- y_pred = model.predict([input_arr[:, 0], input_arr[:, 1]])
- predicted = np.asarray(y_pred[0]>0.5).astype(output_arr.dtype)
- expected = output_arr[0]
- if np.all(predicted == expected):
- continue
 word = example.features.feature['word'].bytes_list.value[0].decode()
 phoneme1 = example.features.feature['phoneme1'].bytes_list.value[0].decode()
 phoneme2 = example.features.feature['phoneme2'].bytes_list.value[0].decode()
@@ -71,9 +65,41 @@
 variant2 = example.features.feature['variant2'].bytes_list.value[0].decode()
 file1 = example.features.feature['file1'].bytes_list.value[0].decode()
 file2 = example.features.feature['file2'].bytes_list.value[0].decode()
- print(phoneme1,phoneme2,voice1,voice2,rate1,rate2,variant1,variant2,file1,file2)
- result_csv_w.writerow([phoneme1,phoneme2,voice1,voice2,rate1,rate2,variant1,variant2,file1,file2])
- result_csv.close()
+
+ p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec)
+ input_arr = np.asarray([[p_spec1,p_spec2]])
+ output_arr = np.asarray([example.features.feature['output'].int64_list.value])
+ y_pred = model.predict([input_arr[:, 0], input_arr[:, 1]])
+ predicted = np.asarray(y_pred[0]>0.5).astype(output_arr.dtype)
+ expected = output_arr[0]
+ status = np.all(predicted == expected)
+ result = {"phoneme1":phoneme1,"phoneme2":phoneme2,"voice1":voice1
+ ,"voice2":voice2,"rate1":rate1,"rate2":rate2
+ ,"variant1":variant1,"variant2":variant2,"file1":file1
+ ,"file2":file2,"expected":expected[0],"predicted":y_pred[0][0]
+ ,"success":status}
+ all_results.append(result)
+ if status:
+ if variant1 == variant2:
+ same_success+=1
+ else:
+ diff_success+=1
+ continue
+ else:
+ if variant1 == variant2:
+ same_failed+=1
+ else:
+ diff_failed+=1
+ print('total-{},same_success-{},diff_success-{},skipped-{},same_failed-{},diff_failed-{}'.format(total,same_success,diff_success,skipped,same_failed,diff_failed))
+ success = same_success+diff_success
+ failure = same_failed+diff_failed
+ print('accuracy-{:.3f}'.format(success*100/(success+failure)))
+ print('same_accuracy-{:.3f}'.format(same_success*100/(same_success+same_failed)))
+ print('diff_accuracy-{:.3f}'.format(diff_success*100/(diff_success+diff_failed)))
+ result_data = pd.DataFrame(all_results,columns=["phoneme1","phoneme2"
+ ,"voice1","voice2","rate1","rate2","variant1","variant2","file1","file2",
+ "expected","predicted","success"])
+ result_data.to_csv('./outputs/' + audio_group + '.results.csv')
 
 
 def play_results(audio_group='audio'):
@@ -102,8 +128,10 @@
 break
 close_player()
 
-# evaluate_siamese('story_words',model_file='siamese_speech_model-305-epoch-0.20-acc.h5')
-play_results('story_words')
+if __name__ == '__main__':
+ evaluate_siamese('./outputs/story_words_test.train.tfrecords',audio_group='story_words',weights ='siamese_speech_model-712-epoch-0.00-acc.h5')
+ # evaluate_siamese('./outputs/story_words.test.tfrecords',audio_group='story_words',weights ='siamese_speech_model-675-epoch-0.00-acc.h5')
+ # play_results('story_words')
 # test_with('rand_edu')
 # sunflower_data,sunflower_result = get_word_pairs_data('sweater',15)
 # print(np.argmax(model.predict([sunflower_data[:, 0], sunflower_data[:, 1]]),axis=1))
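Since evaluate_siamese now persists every prediction rather than only failures, the three accuracy figures it prints can be recomputed offline from the results CSV. A hedged pandas sketch (column names are the ones written above; skipped records never reach the CSV, matching the in-loop bookkeeping):

import pandas as pd

r = pd.read_csv('./outputs/story_words.results.csv')
same = r['variant1'] == r['variant2']
print('accuracy-{:.3f}'.format(100 * r['success'].mean()))
print('same_accuracy-{:.3f}'.format(100 * r.loc[same, 'success'].mean()))
print('diff_accuracy-{:.3f}'.format(100 * r.loc[~same, 'success'].mean()))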
diff --git a/speech_tools.py b/speech_tools.py
index fd48f34..c252ac5 100644
--- a/speech_tools.py
+++ b/speech_tools.py
@@ -1,6 +1,11 @@
+import os
+import string
+import threading
+import multiprocessing
+import pandas as pd
+import numpy as np
 import pyaudio
 from pysndfile import sndio as snd
-import numpy as np
 # from matplotlib import pyplot as plt
 from speech_spectrum import plot_stft, generate_spec_frec
@@ -61,3 +66,88 @@ def record_spectrogram(n_sec, plot=False, playback=False):
 p_oup.terminate()
 ims, _ = generate_spec_frec(one_channel, SAMPLE_RATE)
 return ims
+
+
+def _apply_df(args):
+ df, func, num, kwargs = args
+ return num, df.apply(func, **kwargs)
+
+def apply_by_multiprocessing(df,func,**kwargs):
+ cores = multiprocessing.cpu_count()
+ workers=kwargs.pop('workers') if 'workers' in kwargs else cores
+ pool = multiprocessing.Pool(processes=workers)
+ result = pool.map(_apply_df, [(d, func, i, kwargs) for i,d in enumerate(np.array_split(df, workers))])
+ pool.close()
+ result=sorted(result,key=lambda x:x[0])
+ return pd.concat([i[1] for i in result])
+
+def square(x):
+ return x*x
+
+if __name__ == '__main__':
+ df = pd.DataFrame({'a':range(10), 'b':range(10)})
+ apply_by_multiprocessing(df, square, axis=1, workers=4)
+
+
+def rm_rf(d):
+ for path in (os.path.join(d,f) for f in os.listdir(d)):
+ if os.path.isdir(path):
+ rm_rf(path)
+ else:
+ os.unlink(path)
+ os.rmdir(d)
+
+def create_dir(direc):
+ if not os.path.exists(direc):
+ os.makedirs(direc)
+ else:
+ rm_rf(direc)
+ create_dir(direc)
+
+
+#################### Now make the data generator threadsafe ####################
+
+class threadsafe_iter:
+ """Takes an iterator/generator and makes it thread-safe by
+ serializing call to the `next` method of given iterator/generator.
+ """
+ def __init__(self, it):
+ self.it = it
+ self.lock = threading.Lock()
+
+ def __iter__(self):
+ return self
+
+ def __next__(self): # Py3
+ with self.lock:
+ return next(self.it)
+
+ def next(self): # Py2
+ with self.lock:
+ return self.it.next()
+
+
+def threadsafe_generator(f):
+ """A decorator that takes a generator function and makes it thread-safe.
+ """
+ def g(*a, **kw):
+ return threadsafe_iter(f(*a, **kw))
+ return g
+
+
+
+def format_filename(s):
+ """
+ Take a string and return a valid filename constructed from the string.
+ Uses a whitelist approach: any characters not present in valid_chars are
+ removed. Also spaces are replaced with underscores.
+
+ Note: this method may produce invalid filenames such as ``, `.` or `..`
+ When I use this method I prepend a date string like '2009_01_15_19_46_32_'
+ and append a file extension like '.txt', so I avoid the potential of using
+ an invalid filename.
+ """
+ valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
+ filename = ''.join(c for c in s if c in valid_chars)
+ filename = filename.replace(' ','_') # I don't like spaces in filenames.
+ return filename
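apply_by_multiprocessing splits the frame with np.array_split and runs df.apply in worker processes, so the applied function must be picklable (a module-level def, not a lambda) and the call should sit under a __main__ guard. Usage sketch with an assumed row function:

import pandas as pd
from speech_tools import apply_by_multiprocessing

def row_sum(row):   # module-level so the Pool can pickle it
    return row['a'] + row['b']

if __name__ == '__main__':
    df = pd.DataFrame({'a': range(10), 'b': range(10)})
    print(apply_by_multiprocessing(df, row_sum, axis=1, workers=4).tolist())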
+ """ + def __init__(self, it): + self.it = it + self.lock = threading.Lock() + + def __iter__(self): + return self + + def __next__(self): # Py3 + with self.lock: + return next(self.it) + + def next(self): # Py2 + with self.lock: + return self.it.next() + + +def threadsafe_generator(f): + """A decorator that takes a generator function and makes it thread-safe. + """ + def g(*a, **kw): + return threadsafe_iter(f(*a, **kw)) + return g + + + +def format_filename(s): + """ + Take a string and return a valid filename constructed from the string. + Uses a whitelist approach: any characters not present in valid_chars are + removed. Also spaces are replaced with underscores. + + Note: this method may produce invalid filenames such as ``, `.` or `..` + When I use this method I prepend a date string like '2009_01_15_19_46_32_' + and append a file extension like '.txt', so I avoid the potential of using + an invalid filename. + """ + valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits) + filename = ''.join(c for c in s if c in valid_chars) + filename = filename.replace(' ','_') # I don't like spaces in filenames. + return filename diff --git a/speech_utils.py b/speech_utils.py deleted file mode 100644 index 2581f84..0000000 --- a/speech_utils.py +++ /dev/null @@ -1,74 +0,0 @@ -import os -import threading - -import multiprocessing -import pandas as pd -import numpy as np - - - -def _apply_df(args): - df, func, num, kwargs = args - return num, df.apply(func, **kwargs) - -def apply_by_multiprocessing(df,func,**kwargs): - cores = multiprocessing.cpu_count() - workers=kwargs.pop('workers') if 'workers' in kwargs else cores - pool = multiprocessing.Pool(processes=workers) - result = pool.map(_apply_df, [(d, func, i, kwargs) for i,d in enumerate(np.array_split(df, workers))]) - pool.close() - result=sorted(result,key=lambda x:x[0]) - return pd.concat([i[1] for i in result]) - -def square(x): - return x**x - -if __name__ == '__main__': - df = pd.DataFrame({'a':range(10), 'b':range(10)}) - apply_by_multiprocessing(df, square, axis=1, workers=4) - - -def rm_rf(d): - for path in (os.path.join(d,f) for f in os.listdir(d)): - if os.path.isdir(path): - rm_rf(path) - else: - os.unlink(path) - os.rmdir(d) - -def create_dir(direc): - if not os.path.exists(direc): - os.makedirs(direc) - else: - rm_rf(direc) - create_dir(direc) - - -#################### Now make the data generator threadsafe #################### - -class threadsafe_iter: - """Takes an iterator/generator and makes it thread-safe by - serializing call to the `next` method of given iterator/generator. - """ - def __init__(self, it): - self.it = it - self.lock = threading.Lock() - - def __iter__(self): - return self - - def __next__(self): # Py3 - with self.lock: - return next(self.it) - - def next(self): # Py2 - with self.lock: - return self.it.next() - - -def threadsafe_generator(f): - """A decorator that takes a generator function and makes it thread-safe. - """ - def g(*a, **kw): - return threadsafe_iter(f(*a, **kw)) - return g