diff --git a/mnist_siamese.py b/mnist_siamese.py
index a5258ca..a6aec8b 100644
--- a/mnist_siamese.py
+++ b/mnist_siamese.py
@@ -17,7 +17,7 @@ import numpy as np
 import random
 from keras.datasets import mnist
 from keras.models import Model
-from keras.layers import Dense, Dropout, Input, Lambda, Recurrent
+from keras.layers import Dense, Dropout, Input, Lambda
 from keras.optimizers import RMSprop
 from keras import backend as K
 
diff --git a/spectro_gen.py b/spectro_gen.py
index 7df17e4..6a08fbf 100644
--- a/spectro_gen.py
+++ b/spectro_gen.py
@@ -63,7 +63,7 @@ def generate_aiff_spectrogram(audiopath):
     samples,samplerate,_ = snd.read(audiopath)
     # samplerate, samples = wav.read(audiopath)
     # s = stft(samples, binsize)
-    s = stft(samples, samplerate*150/1000,1.0/3)
+    s = stft(samples, samplerate*150//1000,1.0/3)
     sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
     ims = 20.*np.log10(np.abs(sshow)/10e-6)
 
@@ -74,7 +74,7 @@ def plotstft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):
     samples,samplerate,_ = snd.read(audiopath)
     # samplerate, samples = wav.read(audiopath)
     # s = stft(samples, binsize)
-    s = stft(samples, samplerate*150/1000,1.0/3)
+    s = stft(samples, samplerate*150//1000,1.0/3)
     sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
     ims = 20.*np.log10(np.abs(sshow)/10e-6) # amplitude to decibel
 
diff --git a/speech_data.py b/speech_data.py
new file mode 100644
index 0000000..25ea1e2
--- /dev/null
+++ b/speech_data.py
@@ -0,0 +1,76 @@
+"""Load the 'sunflowers' recordings as padded spectrogram arrays."""
+import pandas as pd
+import numpy as np
+from spectro_gen import generate_aiff_spectrogram
+from sklearn.model_selection import train_test_split
+import tensorflow as tf
+
+
+def _load_sunflowers():
+    """Return the 'sunflowers' rows of the audio index with spectrograms.
+
+    The 'file' column of the returned frame holds the value produced by
+    generate_aiff_spectrogram for each recording instead of its file name.
+    """
+    audio_samples = pd.read_csv(
+        './outputs/audio.csv',
+        names=['word', 'voice', 'rate', 'variant', 'file'])
+    sunflowers = audio_samples.loc[
+        audio_samples['word'] == 'sunflowers'].reset_index(drop=True)
+    sunflowers.loc[:, 'file'] = sunflowers.loc[:, 'file'].apply(
+        lambda x: 'outputs/' + x).apply(generate_aiff_spectrogram)
+    return sunflowers
+
+
+def _pad_and_stack(spectrograms, max_samples):
+    """Pad every spectrogram to max_samples rows and stack them on axis 0.
+
+    Rows are appended with numpy's 'median' mode (not zeros), so the padding
+    repeats each column's median value.
+    """
+    padded = [
+        np.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)], 'median')
+        for spgr in spectrograms]
+    return np.stack(padded)
+
+
+def sunflower_data():
+    """Return a train/test split of padded spectrograms and their labels.
+
+    Labels are True for the 'normal' variant and False otherwise. The result
+    is what train_test_split yields for (x, y) with test_size=0.33:
+    x_train, x_test, y_train, y_test.
+    """
+    sunflowers = _load_sunflowers()
+    y_data = (sunflowers['variant'] == 'normal').values
+    max_samples = max(spgr.shape[0] for spgr in sunflowers['file'])
+    x_data = _pad_and_stack(sunflowers['file'], max_samples)
+    return train_test_split(x_data, y_data, test_size=0.33)
+
+
+def sunflower_pairs_data():
+    """Return (tr_pairs, te_pairs, tr_y, te_y) of normal/phoneme pairs.
+
+    Pair arrays have shape (pairs, 2, max_samples, spectrogram width) with
+    the 'normal' spectrogram at index 0 and the 'phoneme' one at index 1.
+    train_test_split requires both variants to have the same sample count.
+    """
+    sunflowers = _load_sunflowers()
+    max_samples = max(spgr.shape[0] for spgr in sunflowers['file'])
+    variant = sunflowers['variant']
+    x_data_pos = _pad_and_stack(
+        sunflowers.loc[variant == 'normal', 'file'], max_samples)
+    x_data_neg = _pad_and_stack(
+        sunflowers.loc[variant == 'phoneme', 'file'], max_samples)
+    x_pos_train, x_pos_test, x_neg_train, x_neg_test = train_test_split(
+        x_data_pos, x_data_neg, test_size=0.33)
+    # Stack on a new axis 1 so that pairs[i] == (pos[i], neg[i]); reshaping
+    # np.array([pos, neg]) to (n, 2, ...) would scramble the pairing.
+    tr_pairs = np.stack([x_pos_train, x_neg_train], axis=1)
+    te_pairs = np.stack([x_pos_test, x_neg_test], axis=1)
+    # NOTE(review): every pair gets the label row [1, 0], giving (n, 2)
+    # targets for a model that outputs one distance per pair -- confirm the
+    # intended labelling before relying on the reported accuracy.
+    tr_y = np.array(x_pos_train.shape[0] * [[1, 0]])
+    te_y = np.array(x_pos_test.shape[0] * [[1, 0]])
+    return tr_pairs, te_pairs, tr_y, te_y
diff --git a/speech_siamese.py b/speech_siamese.py
new file mode 100644
index 0000000..821d29a
--- /dev/null
+++ b/speech_siamese.py
@@ -0,0 +1,107 @@
+'''Train a Siamese RNN on pairs of "sunflowers" speech spectrograms.
+
+It follows Hadsell-et-al.'06 [1] by computing the Euclidean distance on the
+output of the shared network and by optimizing the contrastive loss (see paper
+for more details).
+
+[1] "Dimensionality Reduction by Learning an Invariant Mapping"
+    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
+'''
+from __future__ import absolute_import
+from __future__ import print_function
+import numpy as np
+
+import random
+from speech_data import sunflower_pairs_data
+from keras.models import Model
+from keras.layers import Dense, Dropout, Input, Lambda, LSTM, SimpleRNN
+from keras.optimizers import RMSprop
+from keras import backend as K
+
+
+def euclidean_distance(vects):
+    '''Euclidean distance between the two tensors in vects, along axis 1.
+    '''
+    x, y = vects
+    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))
+
+
+def eucl_dist_output_shape(shapes):
+    '''Output shape of euclidean_distance: one value per sample.
+    '''
+    shape1, shape2 = shapes
+    return (shape1[0], 1)
+
+
+def contrastive_loss(y_true, y_pred):
+    '''Contrastive loss from Hadsell-et-al.'06
+    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
+    '''
+    margin = 1
+    return K.mean(y_true * K.square(y_pred) +
+                  (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))
+
+
+def create_base_network(input_dim):
+    '''Base network to be shared (eq. to feature extraction).
+
+    A SimpleRNN over the input sequence followed by a Dense relu layer.
+    '''
+    inp = Input(shape=input_dim)
+    sr1 = SimpleRNN(128)(inp)
+    x = Dense(128, activation='relu')(sr1)
+    return Model(inp, x)
+
+
+def compute_accuracy(y_true, y_pred):
+    '''Compute classification accuracy with a fixed threshold on distances.
+    '''
+    # NOTE(review): comparing the raveled predictions to y_true assumes one
+    # label per pair; sunflower_pairs_data returns (n, 2) labels -- confirm.
+    pred = y_pred.ravel() < 0.5
+    return np.mean(pred == y_true)
+
+
+def accuracy(y_true, y_pred):
+    '''Compute classification accuracy with a fixed threshold on distances.
+    '''
+    return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))
+
+
+# the data, shuffled and split between train and test sets
+tr_pairs, te_pairs, tr_y, te_y = sunflower_pairs_data()
+input_dim = tr_pairs.shape[2:]
+epochs = 20
+
+# network definition
+base_network = create_base_network(input_dim)
+input_a = Input(shape=input_dim)
+input_b = Input(shape=input_dim)
+
+# because we re-use the same instance `base_network`,
+# the weights of the network
+# will be shared across the two branches
+processed_a = base_network(input_a)
+processed_b = base_network(input_b)
+
+distance = Lambda(euclidean_distance,
+                  output_shape=eucl_dist_output_shape)([processed_a, processed_b])
+
+model = Model([input_a, input_b], distance)
+
+# train
+rms = RMSprop()
+model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])
+model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
+          batch_size=128,
+          epochs=epochs,
+          validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y))
+
+# compute final accuracy on training and test sets
+y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
+tr_acc = compute_accuracy(tr_y, y_pred)
+y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
+te_acc = compute_accuracy(te_y, y_pred)
+
+print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
+print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))