def read_siamese_tfrecords_generator(audio_group='audio',batch_size=32):
    """Prepare an endless batch generator over a padded siamese tfrecords file.

    Loads the pickled ``(n_spec, n_features, n_records)`` tuple from
    ``./outputs/<audio_group>.constants`` and builds a generator factory that
    reads ``./outputs/<audio_group>_padded.tfrecords``.

    Args:
        audio_group: base name of the files under ``./outputs``.
        batch_size: number of record pairs per yielded batch; must be >= 1.

    Returns:
        A tuple ``(record_generator, n_spec, n_features, n_records)``.
        ``record_generator()`` returns a generator that re-reads the records
        file forever and yields ``([spec1_batch, spec2_batch], output_batch)``
        every time ``batch_size`` records have been collected.

    Raises:
        ValueError: if ``batch_size`` is not positive. The returned generator
            also raises ``ValueError`` when a full pass over the records file
            produces no records, instead of spinning forever.
    """
    # A non-positive batch size can never be matched by len(buffer), which
    # would make the generator accumulate records forever without yielding.
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer, got {!r}'.format(batch_size))

    records_file = os.path.join('./outputs',audio_group+'_padded.tfrecords')
    const_file = os.path.join('./outputs',audio_group+'.constants')
    # Use a context manager so the constants file handle is always closed.
    with open(const_file,'rb') as const_fh:
        (n_spec,n_features,n_records) = pickle.load(const_fh)
    print('reading tfrecords({})...'.format(audio_group))

    def record_generator():
        # The buffers live outside the pass loop on purpose: a partial batch
        # left over at the end of one pass is completed by the first records
        # of the next pass.
        input_data = []
        output_data = []
        while True:
            record_iterator = tf.python_io.tf_record_iterator(path=records_file)
            records_seen = 0
            for string_record in tqdm(record_iterator,total=n_records):
                records_seen += 1
                example = tf.train.Example()
                example.ParseFromString(string_record)
                features = example.features.feature
                spec1 = np.array(features['spec1'].float_list.value).reshape(n_spec,n_features)
                spec2 = np.array(features['spec2'].float_list.value).reshape(n_spec,n_features)
                input_data.append(np.asarray([spec1,spec2]))
                output_data.append(np.asarray(features['output'].int64_list.value))
                if len(input_data) == batch_size:
                    input_arr = np.asarray(input_data)
                    output_arr = np.asarray(output_data)
                    # Split the pair axis into the two siamese inputs.
                    yield ([input_arr[:, 0], input_arr[:, 1]],output_arr)
                    input_data = []
                    output_data = []
            if records_seen == 0:
                # Without this guard an empty file turns the outer loop into
                # a busy loop that never yields.
                raise ValueError('no records found in {}'.format(records_file))

    return record_generator,n_spec,n_features,n_records
import categorical_crossentropy @@ -82,7 +82,10 @@ def siamese_model(input_dim): def train_siamese(): # the data, shuffled and split between train and test sets # tr_pairs, te_pairs, tr_y_e, te_y_e = speech_model_data() - (tr_pairs,te_pairs,tr_y,te_y,n_step,n_features) = read_siamese_tfrecords_oneshot() + batch_size = 512 + tr_gen_fn,n_step,n_features,n_records = read_siamese_tfrecords_generator('audio',batch_size) + tr_gen = tr_gen_fn() + (te_pairs,te_y) = read_siamese_tfrecords_oneshot('audio',1000) # tr_y = to_categorical(tr_y_e, num_classes=2) # te_y = to_categorical(te_y_e, num_classes=2) input_dim = (n_step, n_features) @@ -113,22 +116,26 @@ def train_siamese(): # train rms = RMSprop(lr=0.001) model.compile(loss=categorical_crossentropy, optimizer=rms, metrics=[accuracy]) - model.fit( - [tr_pairs[:, 0], tr_pairs[:, 1]], - tr_y, - batch_size=128, - epochs=100, - validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y), - callbacks=[tb_cb, cp_cb]) + # model.fit( + # [tr_pairs[:, 0], tr_pairs[:, 1]], + # tr_y, + # batch_size=128, + # epochs=100, + # validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y), + # callbacks=[tb_cb, cp_cb]) + model.fit_generator(tr_gen + ,epochs=100 + ,steps_per_epoch=n_records//batch_size + ,use_multiprocessing=True) model.save('./models/siamese_speech_model-final.h5') # compute final accuracy on training and test sets - y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]]) - tr_acc = compute_accuracy(tr_y, y_pred) + # y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]]) + # tr_acc = compute_accuracy(tr_y, y_pred) + # print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc)) + y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]]) te_acc = compute_accuracy(te_y, y_pred) - - print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc)) print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))