From b8a9f87031fc84af8e534858639feb5adab31a08 Mon Sep 17 00:00:00 2001 From: Malar Kannan Date: Tue, 7 Nov 2017 15:18:04 +0530 Subject: [PATCH] implemented padding and pipeline is complete --- speech_data.py | 45 ++++++++++++++++++++++++++++++++++++++------- speech_siamese.py | 8 ++++---- 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/speech_data.py b/speech_data.py index 83b5d6d..a47627c 100644 --- a/speech_data.py +++ b/speech_data.py @@ -95,17 +95,34 @@ def create_spectrogram_tfrecords(audio_group='audio'): writer.write(example.SerializeToString()) writer.close() +def padd_zeros(spgr, max_samples): + return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)], + 'constant') + +def find_max_n(trf): + max_n = 0 + max_n_it = tf.python_io.tf_record_iterator(path=trf) + for string_record in max_n_it: + example = tf.train.Example() + example.ParseFromString(string_record) + spec_n1 = example.features.feature['spec_n1'].int64_list.value[0] + spec_n2 = example.features.feature['spec_n2'].int64_list.value[0] + max_n = max([max_n,spec_n1,spec_n2]) + return max_n + def read_siamese_tfrecords(audio_group='audio'): records_file = os.path.join('./outputs',audio_group+'.tfrecords') record_iterator = tf.python_io.tf_record_iterator(path=records_file) + # input1,input2 = [],[] input_pairs = [] output_class = [] - input_words = [] + max_n = find_max_n(records_file) + spec_w1 = 0 for string_record in record_iterator: example = tf.train.Example() example.ParseFromString(string_record) - word = example.features.feature['word'].bytes_list.value[0] - input_words.append(word) + # word = example.features.feature['word'].bytes_list.value[0] + # input_words.append(word) example.features.feature['spec2'].float_list.value[0] spec_n1 = example.features.feature['spec_n1'].int64_list.value[0] spec_n2 = example.features.feature['spec_n2'].int64_list.value[0] @@ -113,10 +130,23 @@ def read_siamese_tfrecords(audio_group='audio'): spec_w2 = 
example.features.feature['spec_w2'].int64_list.value[0] spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1) spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2) - input_pairs.append([spec1,spec2]) + p_spec1,p_spec2 = padd_zeros(spec1,max_n),padd_zeros(spec2,max_n) + # input1.append(spec1) + # input2.append(spec2) + input_pairs.append(np.asarray([p_spec1,p_spec2])) + # input_pairs.append([spec1,spec2]) output = example.features.feature['output'].int64_list.value - output_class.append(output) - return input_pairs,output_class + output_class.append(np.asarray(output)) + n_features = spec_w1 + # if len(input_pairs) > 50: + # break + input_data,output_data = np.asarray(input_pairs),np.asarray(output_class) + # import pdb; pdb.set_trace() + # tr_x1,te_x1,tr_x2,te_x2,tr_y,te_y = train_test_split(input1,input2,output_class) + tr_pairs,te_pairs,tr_y,te_y = train_test_split(input_data,output_data) + # return (tr_x1,te_x1,tr_x2,te_x2,tr_y,te_y) + n_step,n_features = int(max_n),int(spec_w1) + return (tr_pairs,te_pairs,tr_y,te_y,n_step,n_features) def audio_samples_word_count(audio_group='audio'): audio_group = 'story_all' @@ -157,7 +187,8 @@ if __name__ == '__main__': # create_spectrogram_data() # create_spectrogram_data('story_words') # create_spectrogram_tfrecords('story_words') - create_spectrogram_tfrecords('story_all') + # create_spectrogram_tfrecords('story_all') + read_siamese_tfrecords('story_all') # create_padded_spectrogram() # create_speech_pairs_data() # print(speech_model_data()) diff --git a/speech_siamese.py b/speech_siamese.py index 64a28fb..e9ad718 100644 --- a/speech_siamese.py +++ b/speech_siamese.py @@ -82,10 +82,10 @@ def siamese_model(input_dim): def train_siamese(): # the data, shuffled and split between train and test sets # tr_pairs, te_pairs, tr_y_e, te_y_e = speech_model_data() - pairs,y = read_siamese_tfrecords('story_words') + 
(tr_pairs,te_pairs,tr_y,te_y,n_step,n_features) = read_siamese_tfrecords('story_words') # tr_y = to_categorical(tr_y_e, num_classes=2) # te_y = to_categorical(te_y_e, num_classes=2) - input_dim = (None, 1654) + input_dim = (n_step, n_features) model = siamese_model(input_dim) @@ -114,11 +114,11 @@ def train_siamese(): rms = RMSprop(lr=0.001) model.compile(loss=categorical_crossentropy, optimizer=rms, metrics=[accuracy]) model.fit( - [tr_pairs[:, 0], tr_pairs[:, 1]], + [tr_pairs[:, 0], tr_pairs[:, 1]], tr_y, batch_size=128, epochs=50, - validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y), + validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y), callbacks=[tb_cb, cp_cb]) model.save('./models/siamese_speech_model-final.h5')