From f1e82a2539f7b8b2d692208379533cb0a246a375 Mon Sep 17 00:00:00 2001
From: Malar Kannan
Date: Wed, 25 Oct 2017 15:38:03 +0530
Subject: [PATCH] Add code to record and generate spectrograms; WIP test model

---
 record_mic_speech.py |  68 ++++++++++++----------
 spectro_gen.py       |   2 +-
 speech_data.py       |  39 +++++--------
 speech_siamese.py    | 134 +++++++++++++++++++++++++------------------
 test_siamese.py      |   7 +++
 5 files changed, 135 insertions(+), 115 deletions(-)
 create mode 100644 test_siamese.py

diff --git a/record_mic_speech.py b/record_mic_speech.py
index 1420502..8a73374 100644
--- a/record_mic_speech.py
+++ b/record_mic_speech.py
@@ -1,36 +1,42 @@
 import pyaudio
 import numpy as np
 # from matplotlib import pyplot as plt
-from spectro_gen import plot_stft
+from spectro_gen import plot_stft, generate_spectrogram
 
-SAMPLE_RATE = 22050
-N_SEC = 1.5
-CHUNKSIZE = int(SAMPLE_RATE * N_SEC)  # fixed chunk size
-p_inp = pyaudio.PyAudio()
-stream = p_inp.open(
-    format=pyaudio.paFloat32,
-    channels=2,
-    rate=SAMPLE_RATE,
-    input=True,
-    frames_per_buffer=CHUNKSIZE)
-
-data = stream.read(CHUNKSIZE)
-numpydata = np.frombuffer(data, dtype=np.float32)
-multi_channel = np.abs(np.reshape(numpydata, (-1, 2))).mean(axis=1)
-one_channel = np.asarray([multi_channel, -1 * multi_channel]).T.reshape(-1)
-mean_channel_data = one_channel.tobytes()
-plot_stft(one_channel, SAMPLE_RATE)
-# plt.plot(one_channel)
-# plt.show()
-
-stream.stop_stream()
-stream.close()
-p_inp.terminate()
-
-p_oup = pyaudio.PyAudio()
-stream = p_oup.open(
-    format=pyaudio.paFloat32, channels=2, rate=SAMPLE_RATE, output=True)
-stream.write(mean_channel_data)
-stream.close()
-p_oup.terminate()
+
+def record_spectrogram(n_sec, plot=False, playback=False):
+    SAMPLE_RATE = 22050
+    N_CHANNELS = 2
+    N_SEC = n_sec
+    CHUNKSIZE = int(SAMPLE_RATE * N_SEC / N_CHANNELS)  # fixed chunk size
+    # show_record_prompt()
+    input('Press [Enter] to start recording sample... ')
+    p_inp = pyaudio.PyAudio()
+    stream = p_inp.open(
+        format=pyaudio.paFloat32,
+        channels=N_CHANNELS,
+        rate=SAMPLE_RATE,
+        input=True,
+        frames_per_buffer=CHUNKSIZE)
+    data = stream.read(CHUNKSIZE)
+    numpydata = np.frombuffer(data, dtype=np.float32)
+    multi_channel = np.abs(np.reshape(numpydata, (-1, 2))).mean(axis=1)
+    one_channel = np.asarray([multi_channel, -1 * multi_channel]).T.reshape(-1)
+    mean_channel_data = one_channel.tobytes()
+    stream.stop_stream()
+    stream.close()
+    p_inp.terminate()
+    if plot:
+        plot_stft(one_channel, SAMPLE_RATE)
+    if playback:
+        p_oup = pyaudio.PyAudio()
+        stream = p_oup.open(
+            format=pyaudio.paFloat32,
+            channels=2,
+            rate=SAMPLE_RATE,
+            output=True)
+        stream.write(mean_channel_data)
+        stream.close()
+        p_oup.terminate()
+    ims, _ = generate_spectrogram(one_channel, SAMPLE_RATE)
+    return ims
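
Note on the channel handling in record_spectrogram above: PyAudio returns an
interleaved stereo buffer, the (-1, 2) reshape pairs the two channels per
frame, and the [x, -x] transpose re-interleaves the mono mix for 2-channel
playback. A minimal NumPy sketch with made-up sample values (only numpydata
is invented; the operations are the ones in the patch):

    import numpy as np

    # Interleaved stereo buffer [L0, R0, L1, R1, ...] as read from the stream.
    numpydata = np.array([0.1, -0.3, 0.2, -0.4, 0.5, -0.1], dtype=np.float32)

    # Pair the channels per frame, downmix by averaging the magnitudes.
    multi_channel = np.abs(np.reshape(numpydata, (-1, 2))).mean(axis=1)
    print(multi_channel)  # [0.2 0.3 0.3]

    # Re-interleave as [x, -x, x, -x, ...] so the mono mix fills both
    # output channels (the second one phase-inverted).
    one_channel = np.asarray([multi_channel, -1 * multi_channel]).T.reshape(-1)
    print(one_channel)  # [ 0.2 -0.2  0.3 -0.3  0.3 -0.3]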
diff --git a/spectro_gen.py b/spectro_gen.py
index c0f871f..59ee522 100644
--- a/spectro_gen.py
+++ b/spectro_gen.py
@@ -128,7 +128,7 @@ def play_sunflower():
     sample_r = snd.get_info('./outputs/sunflowers-Alex-150-normal-589.aiff')[0]
     snd_data_f64 = snd.read('./outputs/sunflowers-Alex-150-normal-589.aiff')[0]
     snd_data_f32 = snd_data_f64.astype(np.float32)
-    snd_data_f32.shape
+    print(snd_data_f32.shape)
     snd_data = snd_data_f32.tobytes()
     p_oup = pyaudio.PyAudio()
     stream = p_oup.open(
diff --git a/speech_data.py b/speech_data.py
index 6ed90f6..95a353e 100644
--- a/speech_data.py
+++ b/speech_data.py
@@ -15,6 +15,16 @@ def get_siamese_pairs(groupF1, groupF2):
     return (t, f)
 
 
+def create_X(sp, max_samples):
+    def append_zeros(spgr):
+        return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
+                          'median')
+
+    l_sample = append_zeros(sp[0]['spectrogram'])
+    r_sample = append_zeros(sp[1]['spectrogram'])
+    return np.asarray([l_sample, r_sample])
+
+
 def sunflower_pairs_data():
     audio_samples = pd.read_csv(
         './outputs/audio.csv',
@@ -35,19 +45,7 @@ def sunflower_pairs_data():
     Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))])
     X_sample_pairs = same_data + diff_data
 
-    def append_zeros(spgr):
-        sample = np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
-                            'median')
-        return np.expand_dims(sample, axis=0)
-
-    def create_X(sp):
-        # sample_count = sp[0]['file'].shape[0]
-        l_sample = append_zeros(sp[0]['spectrogram'])
-        r_sample = append_zeros(
-            sp[1]['spectrogram'])
-        return np.expand_dims(np.vstack([l_sample, r_sample]), axis=0)
-
-    X_list = (create_X(sp) for sp in X_sample_pairs)
+    X_list = (create_X(sp, max_samples) for sp in X_sample_pairs)
     X = np.vstack(X_list)
     tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1)
     return train_test_split(X, Y, test_size=0.1)
@@ -69,27 +67,16 @@ def create_speech_pairs_data(audio_group='audio'):
     audio_samples = pd.read_pickle('outputs/spectrogram.pkl')
     max_samples = audio_samples['spectrogram'].apply(
         lambda x: x.shape[0]).max()
-    # sample_size = audio_samples['spectrogram'][0].shape[1]
 
-    def append_zeros(spgr):
-        sample = np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
-                            'median')
-        return sample
-
-    def create_X(sp):
-        l_sample = append_zeros(sp[0]['spectrogram'])
-        r_sample = append_zeros(sp[1]['spectrogram'])
-        return np.asarray([l_sample, r_sample])
-
     print('generating siamese speech pairs')
     same_data, diff_data = [], []
     for (w, g) in audio_samples.groupby(audio_samples['word']):
         sample_norm = g.loc[audio_samples['variant'] == 'normal']
         sample_phon = g.loc[audio_samples['variant'] == 'phoneme']
         same, diff = get_siamese_pairs(sample_norm, sample_phon)
-        same_data.extend([create_X(s) for s in same[:10]])
-        diff_data.extend([create_X(d) for d in diff[:10]])
+        same_data.extend([create_X(s, max_samples) for s in same[:10]])
+        diff_data.extend([create_X(d, max_samples) for d in diff[:10]])
     print('creating all speech pairs')
     Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))])
    print('casting as array speech pairs')
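
The refactor above hoists create_X to module level so both data builders
share it. Despite the inner helper's name, append_zeros pads with the column
medians ('median' mode of np.lib.pad), not zeros, extending every spectrogram
along the time axis to the longest sample before the pair is stacked. A usage
sketch under assumed shapes (40 and 55 time steps, 15 frequency bins, all
invented for illustration):

    import numpy as np
    from speech_data import create_X

    # Two spectrograms of the same word, different lengths in time.
    pair = ({'spectrogram': np.random.rand(40, 15)},
            {'spectrogram': np.random.rand(55, 15)})
    X = create_X(pair, 55)
    print(X.shape)  # (2, 55, 15): both sides padded to equal length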
diff --git a/speech_siamese.py b/speech_siamese.py
index c632fe4..d75f202 100644
--- a/speech_siamese.py
+++ b/speech_siamese.py
@@ -2,9 +2,9 @@ from __future__ import absolute_import
 from __future__ import print_function
 import numpy as np
 from speech_data import speech_model_data
-from keras.models import Model
+from keras.models import Model, load_model
 from keras.layers import Input, Dense, Dropout, LSTM, Lambda
-from keras.optimizers import RMSprop, SGD
+from keras.optimizers import RMSprop
 from keras.callbacks import TensorBoard, ModelCheckpoint
 from keras import backend as K
@@ -63,66 +63,86 @@ def accuracy(y_true, y_pred):
     return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))
 
 
-# the data, shuffled and split between train and test sets
-tr_pairs, te_pairs, tr_y, te_y = speech_model_data()
-input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])
+def train_siamese():
+    # the data, shuffled and split between train and test sets
+    tr_pairs, te_pairs, tr_y, te_y = speech_model_data()
+    input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])
 
-# network definition
-base_network = create_base_rnn_network(input_dim)
-input_a = Input(shape=input_dim)
-input_b = Input(shape=input_dim)
+    # network definition
+    base_network = create_base_rnn_network(input_dim)
+    input_a = Input(shape=input_dim)
+    input_b = Input(shape=input_dim)
 
-# because we re-use the same instance `base_network`,
-# the weights of the network
-# will be shared across the two branches
-processed_a = base_network(input_a)
-processed_b = base_network(input_b)
+    # because we re-use the same instance `base_network`,
+    # the weights of the network
+    # will be shared across the two branches
+    processed_a = base_network(input_a)
+    processed_b = base_network(input_b)
 
-distance = Lambda(
-    euclidean_distance,
-    output_shape=eucl_dist_output_shape)([processed_a, processed_b])
+    distance = Lambda(
+        euclidean_distance,
+        output_shape=eucl_dist_output_shape)([processed_a, processed_b])
 
-model = Model([input_a, input_b], distance)
+    model = Model([input_a, input_b], distance)
 
-tb_cb = TensorBoard(
-    log_dir='./logs/siamese_logs',
-    histogram_freq=1,
-    batch_size=32,
-    write_graph=True,
-    write_grads=True,
-    write_images=True,
-    embeddings_freq=0,
-    embeddings_layer_names=None,
-    embeddings_metadata=None)
-cp_file_fmt = './models/siamese_speech_model-{epoch:02d}-epoch-{val_acc:0.2f}\
--acc.h5'
+    tb_cb = TensorBoard(
+        log_dir='./logs/siamese_logs',
+        histogram_freq=1,
+        batch_size=32,
+        write_graph=True,
+        write_grads=True,
+        write_images=True,
+        embeddings_freq=0,
+        embeddings_layer_names=None,
+        embeddings_metadata=None)
+    cp_file_fmt = ('./models/siamese_speech_model-{epoch:02d}-epoch'
+                   '-{val_acc:0.2f}-acc.h5')
 
-cp_cb = ModelCheckpoint(
-    cp_file_fmt,
-    monitor='val_acc',
-    verbose=0,
-    save_best_only=False,
-    save_weights_only=False,
-    mode='auto',
-    period=1)
-# train
-rms = RMSprop(lr=0.001)
-sgd = SGD(lr=0.001)
-model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])
-model.fit(
-    [tr_pairs[:, 0], tr_pairs[:, 1]],
-    tr_y,
-    batch_size=128,
-    epochs=50,
-    validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),
-    callbacks=[tb_cb, cp_cb])
+    cp_cb = ModelCheckpoint(
+        cp_file_fmt,
+        monitor='val_acc',
+        verbose=0,
+        save_best_only=False,
+        save_weights_only=False,
+        mode='auto',
+        period=1)
+    # train
+    rms = RMSprop(lr=0.001)
+    model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])
+    model.fit(
+        [tr_pairs[:, 0], tr_pairs[:, 1]],
+        tr_y,
+        batch_size=128,
+        epochs=50,
+        validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),
+        callbacks=[tb_cb, cp_cb])
 
-model.save('./models/siamese_speech_model-final.h5')
-# compute final accuracy on training and test sets
-y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
-tr_acc = compute_accuracy(tr_y, y_pred)
-y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
-te_acc = compute_accuracy(te_y, y_pred)
+    model.save('./models/siamese_speech_model-final.h5')
+    # compute final accuracy on training and test sets
+    y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
+    tr_acc = compute_accuracy(tr_y, y_pred)
+    y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
+    te_acc = compute_accuracy(te_y, y_pred)
 
-print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
-print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
+    print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
+    print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
+
+
+def trained_siamese_model():
+    # input_dim = (15, 1654)
+    # base_network = create_base_rnn_network(input_dim)
+    # input_a = Input(shape=input_dim)
+    # input_b = Input(shape=input_dim)
+    # processed_a = base_network(input_a)
+    # processed_b = base_network(input_b)
+    # distance = Lambda(
+    #     euclidean_distance,
+    #     output_shape=eucl_dist_output_shape)([processed_a, processed_b])
+    #
+    # model = Model([input_a, input_b], distance)
+    # custom loss/metric must be passed back in when reloading the model
+    model = load_model(
+        './models/siamese_speech_model-final.h5',
+        custom_objects={
+            'contrastive_loss': contrastive_loss,
+            'accuracy': accuracy
+        })
+    return model
+
+
+if __name__ == '__main__':
+    train_siamese()
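
A reminder about the conventions used by accuracy and compute_accuracy in
this module: the siamese head outputs a Euclidean distance, not a
probability, so label 1 (same word) corresponds to a small output and the
inequality is inverted when thresholding at 0.5. The same computation in
plain NumPy, with invented distances:

    import numpy as np

    distances = np.array([0.1, 0.9, 0.4, 0.7])  # model outputs (made up)
    labels = np.array([1.0, 0.0, 1.0, 1.0])     # 1 = same pair, 0 = different
    pred_same = (distances < 0.5).astype(labels.dtype)
    print(np.mean(pred_same == labels))  # 0.75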
diff --git a/test_siamese.py b/test_siamese.py
new file mode 100644
index 0000000..13735bb
--- /dev/null
+++ b/test_siamese.py
@@ -0,0 +1,7 @@
+# from speech_siamese import trained_siamese_model
+from keras.models import load_model
+from record_mic_speech import record_spectrogram
+
+model = load_model('./models/siamese_speech_model-final.h5', compile=False)
+spec1 = record_spectrogram(n_sec=1.2)
+spec2 = record_spectrogram(n_sec=1.2)
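
test_siamese.py stops after recording the two samples; the subject line marks
the test model as WIP (loading with compile=False sidesteps deserializing the
custom contrastive loss, since the script only needs inference). One possible
completion, not part of this patch: pad both recordings to the time length
the network was trained on (mirroring create_X in speech_data.py) and
threshold the predicted distance at 0.5, matching the accuracy metric. This
assumes record_spectrogram returns a (time_steps, freq_bins) array oriented
like the training spectrograms:

    import numpy as np

    # Time steps expected by the LSTM, read from the loaded model's inputs.
    max_samples = model.input_shape[0][1]

    def pad(spgr):
        return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
                          'median')

    # Stack the pair as one batch of shape (1, 2, time_steps, freq_bins).
    pair = np.asarray([pad(spec1), pad(spec2)])[np.newaxis]
    distance = model.predict([pair[:, 0], pair[:, 1]])[0, 0]
    print('same word' if distance < 0.5 else 'different words')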