diff --git a/spectro_gen.py b/spectro_gen.py index 59ee522..6ae6750 100644 --- a/spectro_gen.py +++ b/spectro_gen.py @@ -78,6 +78,7 @@ def generate_spectrogram(samples, samplerate): sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate) ims = 20. * np.log10(np.abs(sshow) / 10e-6) + ims[ims<0] = 0 return ims, freq diff --git a/speech_data.py b/speech_data.py index 95a353e..66b6d10 100644 --- a/speech_data.py +++ b/speech_data.py @@ -15,40 +15,47 @@ def get_siamese_pairs(groupF1, groupF2): return (t, f) -def create_X(sp, max_samples): - def append_zeros(spgr): - return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)], - 'median') +def append_zeros(spgr, max_samples): + return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)], + 'median') - l_sample = append_zeros(sp[0]['spectrogram']) - r_sample = append_zeros(sp[1]['spectrogram']) + +def create_pair(l, r, max_samples): + l_sample = append_zeros(l, max_samples) + r_sample = append_zeros(r, max_samples) return np.asarray([l_sample, r_sample]) -def sunflower_pairs_data(): +def create_test_pair(l, r, max_samples): + l_sample = append_zeros(l, max_samples) + r_sample = append_zeros(r, max_samples) + return np.asarray([[l_sample, r_sample]]) + +def create_X(sp, max_samples): + return create_pair(sp[0]['spectrogram'],sp[1]['spectrogram'],max_samples) + + +def get_word_pairs_data(word,max_samples): audio_samples = pd.read_csv( './outputs/audio.csv', names=['word', 'voice', 'rate', 'variant', 'file']) audio_samples = audio_samples.loc[audio_samples['word'] == - 'sunflowers'].reset_index(drop=True) + word].reset_index(drop=True) audio_samples.loc[:, 'spectrogram'] = audio_samples.loc[:, 'file'].apply( lambda x: 'outputs/audio/' + x).apply(generate_aiff_spectrogram) - max_samples = audio_samples['spectrogram'].apply( - lambda x: x.shape[0]).max() + # max_samples = audio_samples['spectrogram'].apply( + # lambda x: x.shape[0]).max() same_data, diff_data = [], [] for (w, g) in 
audio_samples.groupby(audio_samples['word']): sample_norm = g.loc[audio_samples['variant'] == 'normal'] sample_phon = g.loc[audio_samples['variant'] == 'phoneme'] same, diff = get_siamese_pairs(sample_norm, sample_phon) - same_data.extend(same) - diff_data.extend(diff) + same_data.extend([create_X(s, max_samples) for s in same[:10]]) + diff_data.extend([create_X(d, max_samples) for d in diff[:10]]) Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))]) - X_sample_pairs = same_data + diff_data - - X_list = (create_X(sp, max_samples) for sp in X_sample_pairs) - X = np.vstack(X_list) - tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1) - return train_test_split(X, Y, test_size=0.1) + X = np.asarray(same_data + diff_data) + # tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1) + return (X,Y) def create_spectrogram_data(audio_group='audio'): diff --git a/speech_siamese.py b/speech_siamese.py index d75f202..260f6e9 100644 --- a/speech_siamese.py +++ b/speech_siamese.py @@ -63,27 +63,27 @@ def accuracy(y_true, y_pred): return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype))) -def train_siamese(): - # the data, shuffled and split between train and test sets - tr_pairs, te_pairs, tr_y, te_y = speech_model_data() - input_dim = (tr_pairs.shape[2], tr_pairs.shape[3]) - - # network definition +def siamese_model(input_dim): + # input_dim = (15, 1654) base_network = create_base_rnn_network(input_dim) input_a = Input(shape=input_dim) input_b = Input(shape=input_dim) - - # because we re-use the same instance `base_network`, - # the weights of the network - # will be shared across the two branches processed_a = base_network(input_a) processed_b = base_network(input_b) - distance = Lambda( euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b]) model = Model([input_a, input_b], distance) + return model + + +def train_siamese(): + # the data, shuffled and split between train and test sets + 
tr_pairs, te_pairs, tr_y, te_y = speech_model_data() + input_dim = (tr_pairs.shape[2], tr_pairs.shape[3]) + + model = siamese_model(input_dim) tb_cb = TensorBoard( log_dir='./logs/siamese_logs', @@ -128,21 +128,6 @@ def train_siamese(): print('* Accuracy on test set: %0.2f%%' % (100 * te_acc)) -def trained_siamese_model(): - # input_dim = (15, 1654) - # base_network = create_base_rnn_network(input_dim) - # input_a = Input(shape=input_dim) - # input_b = Input(shape=input_dim) - # processed_a = base_network(input_a) - # processed_b = base_network(input_b) - # distance = Lambda( - # euclidean_distance, - # output_shape=eucl_dist_output_shape)([processed_a, processed_b]) - # - # model = Model([input_a, input_b], distance) - model = load_model('./models/siamese_speech_model-final.h5') - return model - if __name__ == '__main__': train_siamese() diff --git a/test_siamese.py b/test_siamese.py index 13735bb..a83f478 100644 --- a/test_siamese.py +++ b/test_siamese.py @@ -1,7 +1,17 @@ -# from speech_siamese import trained_siamese_model -from keras.models import load_model +from speech_siamese import siamese_model from record_mic_speech import record_spectrogram +from importlib import reload +import speech_data +reload(speech_data) +from speech_data import create_test_pair,get_word_pairs_data +import numpy as np -model = load_model('./models/siamese_speech_model-final.h5') -spec1 = record_spectrogram(n_sec=1.2) -spec2 = record_spectrogram(n_sec=1.2) +sunflower_data,sunflower_result = get_word_pairs_data('sunflowers',15) +sunflower_result +model = siamese_model((15, 1654)) +model.load_weights('./models/siamese_speech_model-final.h5') +spec1 = record_spectrogram(n_sec=1.4) +spec2 = record_spectrogram(n_sec=1.4) +inp = create_test_pair(spec1,spec2,15) +model.predict([inp[:, 0], inp[:, 1]]) +model.predict([sunflower_data[:, 0], sunflower_data[:, 1]]) < 0.5