diff --git a/mnist_siamese.py b/mnist_siamese.py
index a5258ca..a6aec8b 100644
--- a/mnist_siamese.py
+++ b/mnist_siamese.py
@@ -17,7 +17,7 @@ import numpy as np
 import random
 from keras.datasets import mnist
 from keras.models import Model
-from keras.layers import Dense, Dropout, Input, Lambda, Recurrent
+from keras.layers import Dense, Dropout, Input, Lambda
 from keras.optimizers import RMSprop
 from keras import backend as K
 
@@ -104,6 +104,7 @@ tr_pairs, tr_y = create_pairs(x_train, digit_indices)
 
 digit_indices = [np.where(y_test == i)[0] for i in range(num_classes)]
 te_pairs, te_y = create_pairs(x_test, digit_indices)
+tr_pairs.shape  # bare expression: the value is discarded unless the line is evaluated interactively
 
 # network definition
 base_network = create_base_network(input_dim)
diff --git a/spectro_gen.py b/spectro_gen.py
index 7df17e4..6a08fbf 100644
--- a/spectro_gen.py
+++ b/spectro_gen.py
@@ -63,7 +63,7 @@ def generate_aiff_spectrogram(audiopath):
     samples,samplerate,_ = snd.read(audiopath)
     # samplerate, samples = wav.read(audiopath)
     # s = stft(samples, binsize)
-    s = stft(samples, samplerate*150/1000,1.0/3)
+    s = stft(samples, samplerate*150//1000,1.0/3)
 
     sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
     ims = 20.*np.log10(np.abs(sshow)/10e-6)
@@ -74,7 +74,8 @@ def plotstft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):
     samples,samplerate,_ = snd.read(audiopath)
     # samplerate, samples = wav.read(audiopath)
     # s = stft(samples, binsize)
-    s = stft(samples, samplerate*150/1000,1.0/3)
+    # print(samplerate*150//1000)
+    s = stft(samples, samplerate*150//1000,1.0/3)
 
     sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
     ims = 20.*np.log10(np.abs(sshow)/10e-6) # amplitude to decibel
diff --git a/speech_data.py b/speech_data.py
new file mode 100644
index 0000000..25ea1e2
--- /dev/null
+++ b/speech_data.py
@@ -0,0 +1,68 @@
+import pandas as pd
+import numpy as np
+from spectro_gen import generate_aiff_spectrogram
+from sklearn.model_selection import train_test_split
+import tensorflow as tf
+
+def sunflower_data():
+    """Load the padded 'sunflowers' spectrograms and split them into train/test sets.
+
+    Reads ./outputs/audio.csv, keeps the rows whose word is 'sunflowers' and
+    replaces each file name with generate_aiff_spectrogram('outputs/' + name).
+    Every spectrogram is padded along axis 0 to the longest one with numpy's
+    'median' mode so that all of them stack into one 3-D array.
+
+    Returns:
+        The result of train_test_split(x_data, y_data, test_size=0.33), where
+        x_data has shape (sample_count, max_samples, sample_size) and y_data is
+        True for rows whose variant is 'normal'.
+    """
+    audio_samples = pd.read_csv('./outputs/audio.csv',names=['word','voice','rate','variant','file'])
+    sunflowers = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True)
+    sunflowers.loc[:,'file'] = sunflowers.loc[:,'file'].apply(lambda x:'outputs/'+x).apply(generate_aiff_spectrogram)
+    y_data = sunflowers['variant'].apply(lambda x:x=='normal').values
+    max_samples = sunflowers['file'].apply(lambda x:x.shape[0]).max()
+    sample_size = sunflowers['file'][0].shape[1]
+    sample_count = sunflowers['file'].shape[0]
+    def pad_to_longest(spgr):
+        # np.pad is the public entry point; np.lib.pad is not exposed by NumPy >= 2.0.
+        return np.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)], 'median')
+    x_data = np.vstack(sunflowers['file'].apply(pad_to_longest).values).reshape((sample_count, max_samples, sample_size))
+    return train_test_split(x_data, y_data, test_size=0.33)
+
+
+def sunflower_pairs_data():
+    """Build (normal, phoneme) spectrogram pairs of the word 'sunflowers'.
+
+    Reads ./outputs/audio.csv, replaces each 'sunflowers' file name with
+    generate_aiff_spectrogram('outputs/' + name) and pads every spectrogram
+    along axis 0 to the longest one with numpy's 'median' mode.  The 'normal'
+    and 'phoneme' arrays go through one train_test_split call, so row i of one
+    is paired with row i of the other and both need the same number of rows.
+
+    Returns:
+        (tr_pairs, te_pairs, tr_y, te_y).  Pair arrays have shape
+        (n, 2, max_samples, sample_size) with [:, 0] the 'normal' and [:, 1]
+        the 'phoneme' spectrogram; every label row is [1, 0].
+    """
+    audio_samples = pd.read_csv('./outputs/audio.csv',names=['word','voice','rate','variant','file'])
+    sunflowers = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True)
+    sunflowers.loc[:,'file'] = sunflowers.loc[:,'file'].apply(lambda x:'outputs/'+x).apply(generate_aiff_spectrogram)
+    max_samples = sunflowers['file'].apply(lambda x:x.shape[0]).max()
+    sample_size = sunflowers['file'][0].shape[1]
+    def pad_to_longest(spgr):
+        # np.pad is the public entry point; np.lib.pad is not exposed by NumPy >= 2.0.
+        return np.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)], 'median')
+    def create_data(variant):
+        specs = sunflowers.loc[sunflowers['variant'] == variant, 'file']
+        return np.vstack(specs.apply(pad_to_longest).values).reshape((specs.shape[0], max_samples, sample_size))
+    x_pos_train, x_pos_test, x_neg_train, x_neg_test = train_test_split(
+        create_data('normal'), create_data('phoneme'), test_size=0.33)
+    # Stack on axis 1: reshaping the (2, n, ...) array to (n, 2, ...) does not
+    # transpose it, so it paired samples of the same variant with each other.
+    tr_pairs = np.stack([x_pos_train, x_neg_train], axis=1)
+    te_pairs = np.stack([x_pos_test, x_neg_test], axis=1)
+    # NOTE(review): all pairs mix the two variants yet carry the label [1, 0]; confirm.
+    tr_y = np.array(x_pos_train.shape[0]*[[1,0]])
+    te_y = np.array(x_pos_test.shape[0]*[[1,0]])
+    return tr_pairs,te_pairs,tr_y,te_y
diff --git a/speech_siamese.py b/speech_siamese.py
new file mode 100644
index 0000000..821d29a
--- /dev/null
+++ b/speech_siamese.py
@@ -0,0 +1,112 @@
+'''Train a Siamese RNN on pairs of spectrograms of the spoken word "sunflowers".
+
+It follows Hadsell-et-al.'06 [1] by computing the Euclidean distance on the
+output of the shared network and by optimizing the contrastive loss (see paper
+for more details).
+
+[1] "Dimensionality Reduction by Learning an Invariant Mapping"
+    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
+
+The MNIST example this script was adapted from gets to 97.2% test accuracy
+after 20 epochs (2 seconds per epoch on a Titan X Maxwell GPU).
+'''
+from __future__ import absolute_import
+from __future__ import print_function
+import numpy as np
+
+import random
+# from keras.datasets import mnist
+from speech_data import sunflower_pairs_data
+from keras.models import Model
+from keras.layers import Dense, Dropout, Input, Lambda, LSTM, SimpleRNN
+from keras.optimizers import RMSprop
+from keras import backend as K
+
+def euclidean_distance(vects):  # vects unpacks into the two tensors to compare
+    lhs, rhs = vects
+    return K.sqrt(K.maximum(K.sum(K.square(lhs - rhs), axis=1, keepdims=True), K.epsilon()))  # squared sum over axis 1, floored at K.epsilon() before the sqrt
+
+
+def eucl_dist_output_shape(shapes):  # shapes unpacks into the shapes of the two inputs
+    first_shape, _ = shapes
+    return (first_shape[0], 1)  # leading dimension of the first shape, then a single column
+
+
+def contrastive_loss(y_true, y_pred):
+    '''Contrastive loss of Hadsell et al. (2006) with the margin fixed at 1.
+    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
+    '''
+    margin = 1
+    pull_term, push_term = K.square(y_pred), K.square(K.maximum(margin - y_pred, 0))
+    return K.mean(y_true * pull_term + (1 - y_true) * push_term)
+
+
+def create_base_network(input_dim):
+    '''Build the base network shared by both branches (feature extraction).
+
+    `input_dim` is used as the Input shape; a 128-unit SimpleRNN feeds a
+    128-unit ReLU Dense layer, and the Dense output is the model output.
+    '''
+    net_input = Input(shape=input_dim)
+    recurrent = SimpleRNN(128)(net_input)
+    return Model(net_input, Dense(128, activation='relu')(recurrent))
+
+
+def compute_accuracy(y_true, y_pred):
+    '''Accuracy when distances below 0.5 are taken as positive predictions.
+    y_pred is flattened first; the result is the mean of its match with y_true.
+    '''
+    return np.mean((y_pred.ravel() < 0.5) == y_true)
+
+
+def accuracy(y_true, y_pred):
+    '''Backend metric: mean agreement of y_true with (y_pred < 0.5) cast to y_true's dtype.'''
+    below_threshold = K.cast(y_pred < 0.5, y_true.dtype)
+    return K.mean(K.equal(y_true, below_threshold))
+
+
+# Load the spectrogram pairs and labels, already split into train and test
+# sets by sunflower_pairs_data().
+tr_pairs,te_pairs,tr_y,te_y = sunflower_pairs_data()
+# Each pair array is indexed below as [:, 0] / [:, 1] for the two branches, so
+# axis 1 selects the pair member and the remaining axes (shape[2:]) describe a
+# single network input.
+#
+# NOTE(review): the MNIST example's reshape/astype/255 scaling was dropped;
+# no scaling is applied to the spectrograms here - confirm that is intended.
+input_dim = tr_pairs.shape[2:]
+# Number of epochs passed to model.fit.
+epochs = 20
+
+# network definition: one shared base network applied to two inputs
+base_network = create_base_network(input_dim)
+input_a = Input(shape=input_dim)
+input_b = Input(shape=input_dim)
+
+# Both branches call the same `base_network` instance, so a single set of
+# weights is shared between them; only the input fed to each branch
+# differs.
+processed_a = base_network(input_a)
+processed_b = base_network(input_b)
+
+distance = Lambda(euclidean_distance,  # distance between the two branch outputs
+                  output_shape=eucl_dist_output_shape)([processed_a, processed_b])
+
+model = Model([input_a, input_b], distance)  # two inputs in, one distance out
+
+# train with a default-constructed RMSprop optimizer on the contrastive loss
+rms = RMSprop()
+model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])  # `accuracy` is the 0.5-threshold metric defined above
+model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,  # pair member 0 -> input_a, member 1 -> input_b
+          batch_size=128,
+          epochs=epochs,
+          validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y))  # the test pairs are used as validation data
+
+# compute final accuracy on training and test sets from the predicted distances
+y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
+tr_acc = compute_accuracy(tr_y, y_pred)  # NOTE(review): sunflower_pairs_data builds two-column [1, 0] labels while compute_accuracy flattens y_pred - confirm the comparison is intended
+y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
+te_acc = compute_accuracy(te_y, y_pred)
+
+print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
+print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))