diff --git a/Siamese.ipynb b/Siamese.ipynb
deleted file mode 100644
index 72c80a6..0000000
--- a/Siamese.ipynb
+++ /dev/null
@@ -1,360 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "'''Train a Siamese MLP on pairs of digits from the MNIST dataset.\n",
-    "\n",
-    "It follows Hadsell-et-al.'06 [1] by computing the Euclidean distance on the\n",
-    "output of the shared network and by optimizing the contrastive loss (see paper\n",
-    "for more details).\n",
-    "\n",
-    "[1] \"Dimensionality Reduction by Learning an Invariant Mapping\"\n",
-    "    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf\n",
-    "\n",
-    "Gets to 97.2% test accuracy after 20 epochs.\n",
-    "2 seconds per epoch on a Titan X Maxwell GPU\n",
-    "'''\n",
-    "from __future__ import absolute_import\n",
-    "from __future__ import print_function\n",
-    "import numpy as np\n",
-    "\n",
-    "# import random\n",
-    "# from keras.datasets import mnist\n",
-    "from speech_data import speech_model_data\n",
-    "from keras.models import Model\n",
-    "from keras.layers import Input, Dense, Dropout, SimpleRNN, LSTM, Lambda\n",
-    "# Dense, Dropout, Input, Lambda, LSTM, SimpleRNN\n",
-    "from keras.optimizers import RMSprop, SGD\n",
-    "from keras.callbacks import TensorBoard\n",
-    "from keras import backend as K\n",
-    "\n",
-    "\n",
-    "def euclidean_distance(vects):\n",
-    "    x, y = vects\n",
-    "    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True),\n",
-    "                            K.epsilon()))\n",
-    "\n",
-    "\n",
-    "def eucl_dist_output_shape(shapes):\n",
-    "    shape1, shape2 = shapes\n",
-    "    return (shape1[0], 1)\n",
-    "\n",
-    "\n",
-    "def contrastive_loss(y_true, y_pred):\n",
-    "    '''Contrastive loss from Hadsell-et-al.'06\n",
-    "    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf\n",
-    "    '''\n",
-    "    margin = 1\n",
-    "    # print(y_true, y_pred)\n",
-    "    return K.mean(y_true * K.square(y_pred) +\n",
-    "                  (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))\n",
-    "\n",
-    "\n",
-    "def create_base_rnn_network(input_dim):\n",
-    "    '''Base network to be shared (eq. to feature extraction).\n",
-    "    '''\n",
-    "    inp = Input(shape=input_dim)\n",
-    "    # d1 = Dense(1024, activation='sigmoid')(inp)\n",
-    "    # # d2 = Dense(2, activation='sigmoid')(d1)\n",
-    "    ls1 = LSTM(1024, return_sequences=True)(inp)\n",
-    "    ls2 = LSTM(512, return_sequences=True)(ls1)\n",
-    "    ls3 = LSTM(32)(ls2) # , return_sequences=True\n",
-    "    # sr2 = SimpleRNN(128, return_sequences=True)(sr1)\n",
-    "    # sr3 = SimpleRNN(32)(sr2)\n",
-    "    # x = Dense(128, activation='relu')(sr1)\n",
-    "    return Model(inp, ls3)\n",
-    "\n",
-    "def create_base_network(input_dim):\n",
-    "    '''Base network to be shared (eq. to feature extraction).\n",
-    "    '''\n",
-    "    input = Input(shape=input_dim)\n",
-    "    x = Dense(128, activation='relu')(input)\n",
-    "    x = Dropout(0.1)(x)\n",
-    "    x = Dense(128, activation='relu')(x)\n",
-    "    x = Dropout(0.1)(x)\n",
-    "    x = Dense(128, activation='relu')(x)\n",
-    "    return Model(input, x)\n",
-    "\n",
-    "def compute_accuracy(y_true, y_pred):\n",
-    "    '''Compute classification accuracy with a fixed threshold on distances.\n",
-    "    '''\n",
-    "    pred = y_pred.ravel() < 0.5\n",
-    "    return np.mean(pred == y_true)\n",
-    "\n",
-    "\n",
-    "def accuracy(y_true, y_pred):\n",
-    "    '''Compute classification accuracy with a fixed threshold on distances.\n",
-    "    '''\n",
-    "    return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))\n",
-    "\n",
-    "\n",
-    "# the data, shuffled and split between train and test sets\n",
-    "tr_pairs, te_pairs, tr_y, te_y = speech_model_data()\n",
-    "\n",
-    "%matplotlib inline\n",
-    "import matplotlib.pyplot as plt\n",
-    "def plot_spec(ims):\n",
-    "    timebins, freqbins = np.shape(ims)\n",
-    "    # import pdb;pdb.set_trace()\n",
-    "#     plt.figure(figsize=(15, 7.5))\n",
-    "    plt.imshow(np.transpose(ims), origin=\"lower\", aspect=\"auto\", cmap=\"jet\", interpolation=\"none\")\n",
-    "    plt.colorbar()\n",
-    "    xlocs = np.float32(np.linspace(0, timebins-1, 5))\n",
-    "    plt.xticks(xlocs, [\"%.02f\" % l for l in ((xlocs*15/timebins)+(0.5*2**10))/22100])\n",
-    "    ylocs = np.int16(np.round(np.linspace(0, freqbins-1, 10)))\n",
-    "#     plt.yticks(ylocs, [\"%.02f\" % freq[i] for i in ylocs])\n",
-    "    \n",
-    "def show_nth(n):\n",
-    "    plt.figure(figsize=(15,7.5))\n",
-    "    plt.subplot(1,2,1)\n",
-    "    plot_spec(te_pairs[n][0].reshape(15,1654))\n",
-    "    print(te_y[n])\n",
-    "    plt.subplot(1,2,2)\n",
-    "    plot_spec(te_pairs[n][1].reshape(15,1654))\n",
-    "show_nth(0)\n",
-    "\n",
-    "# y_train.shape,y_test.shape\n",
-    "# x_train.shape,x_test.shape\n",
-    "# x_train = x_train.reshape(60000, 784)\n",
-    "# x_test = x_test.reshape(10000, 784)\n",
-    "# x_train = x_train.astype('float32')\n",
-    "# x_test = x_test.astype('float32')\n",
-    "# x_train /= 255\n",
-    "# x_test /= 255\n",
-    "\n",
-    "# input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])\n",
-    "# epochs = 20\n",
-    "\n",
-    "# # network definition\n",
-    "# base_network = create_base_rnn_network(input_dim)\n",
-    "# input_a = Input(shape=input_dim)\n",
-    "# input_b = Input(shape=input_dim)\n",
-    "\n",
-    "# # because we re-use the same instance `base_network`,\n",
-    "# # the weights of the network\n",
-    "# # will be shared across the two branches\n",
-    "# processed_a = base_network(input_a)\n",
-    "# processed_b = base_network(input_b)\n",
-    "\n",
-    "# distance = Lambda(euclidean_distance,\n",
-    "#                   output_shape=eucl_dist_output_shape)(\n",
-    "#     [processed_a, processed_b]\n",
-    "# )\n",
-    "\n",
-    "# model = Model([input_a, input_b], distance)\n",
-    "\n",
-    "# tb_cb = TensorBoard(log_dir='./siamese_logs', histogram_freq=1, batch_size=32,\n",
-    "#                     write_graph=True, write_grads=True, write_images=True,\n",
-    "#                     embeddings_freq=0, embeddings_layer_names=None,\n",
-    "#                     embeddings_metadata=None)\n",
-    "# # train\n",
-    "# rms = RMSprop(lr=0.00001) # lr=0.001)\n",
-    "# sgd = SGD(lr=0.001)\n",
-    "# model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])\n",
-    "# model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,\n",
-    "#           batch_size=128,\n",
-    "#           epochs=epochs,\n",
-    "#           validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),\n",
-    "#           callbacks=[tb_cb])\n",
-    "\n",
-    "# # compute final accuracy on training and test sets\n",
-    "# y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])\n",
-    "# tr_acc = compute_accuracy(tr_y, y_pred)\n",
-    "# y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])\n",
-    "# te_acc = compute_accuracy(te_y, y_pred)\n",
-    "\n",
-    "# print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))\n",
-    "# print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Using TensorFlow backend.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Train on 36252 samples, validate on 4028 samples\n"
-     ]
-    }
-   ],
-   "source": [
-    "'''Train a Siamese MLP on pairs of digits from the MNIST dataset.\n",
-    "\n",
-    "It follows Hadsell-et-al.'06 [1] by computing the Euclidean distance on the\n",
-    "output of the shared network and by optimizing the contrastive loss (see paper\n",
-    "for more details).\n",
-    "\n",
-    "[1] \"Dimensionality Reduction by Learning an Invariant Mapping\"\n",
-    "    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf\n",
-    "\n",
-    "Gets to 97.2% test accuracy after 20 epochs.\n",
-    "2 seconds per epoch on a Titan X Maxwell GPU\n",
-    "'''\n",
-    "from __future__ import absolute_import\n",
-    "from __future__ import print_function\n",
-    "import numpy as np\n",
-    "\n",
-    "# import random\n",
-    "# from keras.datasets import mnist\n",
-    "from speech_data import speech_model_data\n",
-    "from keras.models import Model\n",
-    "from keras.layers import Input, Dense, Dropout, SimpleRNN, LSTM, Lambda\n",
-    "# Dense, Dropout, Input, Lambda, LSTM, SimpleRNN\n",
-    "from keras.optimizers import RMSprop, SGD\n",
-    "from keras.callbacks import TensorBoard\n",
-    "from keras import backend as K\n",
-    "\n",
-    "\n",
-    "def euclidean_distance(vects):\n",
-    "    x, y = vects\n",
-    "    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True),\n",
-    "                            K.epsilon()))\n",
-    "\n",
-    "\n",
-    "def eucl_dist_output_shape(shapes):\n",
-    "    shape1, shape2 = shapes\n",
-    "    return (shape1[0], 1)\n",
-    "\n",
-    "\n",
-    "def contrastive_loss(y_true, y_pred):\n",
-    "    '''Contrastive loss from Hadsell-et-al.'06\n",
-    "    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf\n",
-    "    '''\n",
-    "    margin = 1\n",
-    "    # print(y_true, y_pred)\n",
-    "    return K.mean(y_true * K.square(y_pred) +\n",
-    "                  (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))\n",
-    "\n",
-    "\n",
-    "def create_base_rnn_network(input_dim):\n",
-    "    '''Base network to be shared (eq. to feature extraction).\n",
-    "    '''\n",
-    "    inp = Input(shape=input_dim)\n",
-    "    # d1 = Dense(1024, activation='sigmoid')(inp)\n",
-    "    # # d2 = Dense(2, activation='sigmoid')(d1)\n",
-    "    ls1 = LSTM(1024, return_sequences=True)(inp)\n",
-    "    ls2 = LSTM(512, return_sequences=True)(ls1)\n",
-    "    ls3 = LSTM(32)(ls2) # , return_sequences=True\n",
-    "    # sr2 = SimpleRNN(128, return_sequences=True)(sr1)\n",
-    "    # sr3 = SimpleRNN(32)(sr2)\n",
-    "    # x = Dense(128, activation='relu')(sr1)\n",
-    "    return Model(inp, ls3)\n",
-    "\n",
-    "def create_base_network(input_dim):\n",
-    "    '''Base network to be shared (eq. to feature extraction).\n",
-    "    '''\n",
-    "    input = Input(shape=input_dim)\n",
-    "    x = Dense(128, activation='relu')(input)\n",
-    "    x = Dropout(0.1)(x)\n",
-    "    x = Dense(128, activation='relu')(x)\n",
-    "    x = Dropout(0.1)(x)\n",
-    "    x = Dense(128, activation='relu')(x)\n",
-    "    return Model(input, x)\n",
-    "\n",
-    "def compute_accuracy(y_true, y_pred):\n",
-    "    '''Compute classification accuracy with a fixed threshold on distances.\n",
-    "    '''\n",
-    "    pred = y_pred.ravel() < 0.5\n",
-    "    return np.mean(pred == y_true)\n",
-    "\n",
-    "\n",
-    "def accuracy(y_true, y_pred):\n",
-    "    '''Compute classification accuracy with a fixed threshold on distances.\n",
-    "    '''\n",
-    "    return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))\n",
-    "\n",
-    "\n",
-    "# the data, shuffled and split between train and test sets\n",
-    "tr_pairs, te_pairs, tr_y, te_y = speech_model_data()\n",
-    "# y_train.shape,y_test.shape\n",
-    "# x_train.shape,x_test.shape\n",
-    "# x_train = x_train.reshape(60000, 784)\n",
-    "# x_test = x_test.reshape(10000, 784)\n",
-    "# x_train = x_train.astype('float32')\n",
-    "# x_test = x_test.astype('float32')\n",
-    "# x_train /= 255\n",
-    "# x_test /= 255\n",
-    "input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])\n",
-    "epochs = 20\n",
-    "\n",
-    "# network definition\n",
-    "base_network = create_base_rnn_network(input_dim)\n",
-    "input_a = Input(shape=input_dim)\n",
-    "input_b = Input(shape=input_dim)\n",
-    "\n",
-    "# because we re-use the same instance `base_network`,\n",
-    "# the weights of the network\n",
-    "# will be shared across the two branches\n",
-    "processed_a = base_network(input_a)\n",
-    "processed_b = base_network(input_b)\n",
-    "\n",
-    "distance = Lambda(euclidean_distance,\n",
-    "                  output_shape=eucl_dist_output_shape)(\n",
-    "    [processed_a, processed_b]\n",
-    ")\n",
-    "\n",
-    "model = Model([input_a, input_b], distance)\n",
-    "\n",
-    "tb_cb = TensorBoard(log_dir='./logs/siamese_logs', histogram_freq=1, batch_size=32,\n",
-    "                    write_graph=True, write_grads=True, write_images=True,\n",
-    "                    embeddings_freq=0, embeddings_layer_names=None,\n",
-    "                    embeddings_metadata=None)\n",
-    "# train\n",
-    "rms = RMSprop(lr=0.001) # lr=0.001)\n",
-    "sgd = SGD(lr=0.001)\n",
-    "model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])\n",
-    "model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,\n",
-    "          batch_size=128,\n",
-    "          epochs=epochs,\n",
-    "          validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),\n",
-    "          callbacks=[tb_cb])\n",
-    "\n",
-    "model.save('./models/siamese_speech_model.h5')\n",
-    "# compute final accuracy on training and test sets\n",
-    "y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])\n",
-    "tr_acc = compute_accuracy(tr_y, y_pred)\n",
-    "y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])\n",
-    "te_acc = compute_accuracy(te_y, y_pred)\n",
-    "\n",
-    "print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))\n",
-    "print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.5.2"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
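Note on the deleted notebook above: both cells wire the same contrastive setup — two inputs through one shared `base_network`, a `Lambda` computing the Euclidean distance, and `contrastive_loss` pulling similar pairs together while pushing dissimilar pairs past the margin. For quick reference, here is a minimal NumPy mirror of that loss (my sketch, not code from the notebook); the margin of 1 matches the hard-coded value above:

```python
import numpy as np

def contrastive_loss_np(y_true, d, margin=1.0):
    # y_true == 1: similar pair, penalize any nonzero distance d
    # y_true == 0: dissimilar pair, penalize only distances inside the margin
    return np.mean(y_true * d ** 2 +
                   (1 - y_true) * np.maximum(margin - d, 0) ** 2)

print(contrastive_loss_np(np.array([1.0]), np.array([0.9])))  # 0.81: similar but far apart
print(contrastive_loss_np(np.array([0.0]), np.array([0.2])))  # 0.64: dissimilar but close
print(contrastive_loss_np(np.array([0.0]), np.array([1.5])))  # 0.0: dissimilar, beyond margin
```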
diff --git a/mnist_siamese.py b/mnist_siamese.py
deleted file mode 100644
index 88308bb..0000000
--- a/mnist_siamese.py
+++ /dev/null
@@ -1,142 +0,0 @@
-'''Train a Siamese MLP on pairs of digits from the MNIST dataset.
-
-It follows Hadsell-et-al.'06 [1] by computing the Euclidean distance on the
-output of the shared network and by optimizing the contrastive loss (see paper
-for more details).
-
-[1] "Dimensionality Reduction by Learning an Invariant Mapping"
-    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
-
-Gets to 97.2% test accuracy after 20 epochs.
-2 seconds per epoch on a Titan X Maxwell GPU
-'''
-from __future__ import absolute_import
-from __future__ import print_function
-import numpy as np
-
-import random
-from keras.datasets import mnist
-from keras.models import Model
-from keras.layers import Dense, Dropout, Input, Lambda
-from keras.optimizers import RMSprop
-from keras import backend as K
-
-%matplotlib inline
-import matplotlib.pyplot as plt
-
-num_classes = 10
-
-
-def euclidean_distance(vects):
-    x, y = vects
-    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))
-
-
-def eucl_dist_output_shape(shapes):
-    shape1, shape2 = shapes
-    return (shape1[0], 1)
-
-
-def contrastive_loss(y_true, y_pred):
-    '''Contrastive loss from Hadsell-et-al.'06
-    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
-    '''
-    margin = 1
-    return K.mean(y_true * K.square(y_pred) +
-                  (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))
-
-
-def create_pairs(x, digit_indices):
-    '''Positive and negative pair creation.
-    Alternates between positive and negative pairs.
-    '''
-    pairs = []
-    labels = []
-    n = min([len(digit_indices[d]) for d in range(num_classes)]) - 1
-    for d in range(num_classes):
-        for i in range(n):
-            z1, z2 = digit_indices[d][i], digit_indices[d][i + 1]
-            pairs += [[x[z1], x[z2]]]
-            inc = random.randrange(1, num_classes)
-            dn = (d + inc) % num_classes
-            z1, z2 = digit_indices[d][i], digit_indices[dn][i]
-            pairs += [[x[z1], x[z2]]]
-            labels += [1, 0]
-    return np.array(pairs), np.array(labels)
-
-
-def create_base_network(input_dim):
-    '''Base network to be shared (eq. to feature extraction).
-    '''
-    input = Input(shape=(input_dim,))
-    x = Dense(128, activation='relu')(input)
-    x = Dropout(0.1)(x)
-    x = Dense(128, activation='relu')(x)
-    x = Dropout(0.1)(x)
-    x = Dense(128, activation='relu')(x)
-    return Model(input, x)
-
-
-def compute_accuracy(y_true, y_pred):
-    '''Compute classification accuracy with a fixed threshold on distances.
-    '''
-    pred = y_pred.ravel() < 0.5
-    return np.mean(pred == y_true)
-
-
-def accuracy(y_true, y_pred):
-    '''Compute classification accuracy with a fixed threshold on distances.
-    '''
-    return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))
-
-
-# the data, shuffled and split between train and test sets
-(x_train, y_train), (x_test, y_test) = mnist.load_data()
-x_train = x_train.reshape(60000, 784)
-x_test = x_test.reshape(10000, 784)
-x_train = x_train.astype('float32')
-x_test = x_test.astype('float32')
-x_train /= 255
-x_test /= 255
-input_dim = 784
-epochs = 20
-
-# create training+test positive and negative pairs
-digit_indices = [np.where(y_train == i)[0] for i in range(num_classes)]
-tr_pairs, tr_y = create_pairs(x_train, digit_indices)
-
-digit_indices = [np.where(y_test == i)[0] for i in range(num_classes)]
-te_pairs, te_y = create_pairs(x_test, digit_indices)
-# network definition
-base_network = create_base_network(input_dim)
-
-input_a = Input(shape=(input_dim,))
-input_b = Input(shape=(input_dim,))
-
-# because we re-use the same instance `base_network`,
-# the weights of the network
-# will be shared across the two branches
-processed_a = base_network(input_a)
-processed_b = base_network(input_b)
-
-distance = Lambda(euclidean_distance,
-                  output_shape=eucl_dist_output_shape)([processed_a, processed_b])
-
-model = Model([input_a, input_b], distance)
-
-# train
-rms = RMSprop()
-model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])
-model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
-          batch_size=128,
-          epochs=epochs,
-          validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y))
-
-# compute final accuracy on training and test sets
-y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
-tr_acc = compute_accuracy(tr_y, y_pred)
-y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
-te_acc = compute_accuracy(te_y, y_pred)
-
-print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
-print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
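`create_pairs` in the deleted file above is the piece most worth keeping in mind: for each class it pairs consecutive same-class samples (label 1) and pairs each sample with one from a randomly offset other class (label 0), so positives and negatives alternate and stay balanced. A toy run (my illustration, shrunk to 3 classes instead of 10) makes the output concrete:

```python
import random
import numpy as np

random.seed(0)
num_classes = 3  # shrunk from 10 for readability

# 'a0' means class a, occurrence 0, so the printed pairs are easy to read
y = np.array([0, 1, 2, 0, 1, 2, 0, 1, 2])
x = np.array(['a0', 'b0', 'c0', 'a1', 'b1', 'c1', 'a2', 'b2', 'c2'])

digit_indices = [np.where(y == i)[0] for i in range(num_classes)]
n = min(len(d) for d in digit_indices) - 1  # pairs per class, as in create_pairs

for d in range(num_classes):
    for i in range(n):
        # positive: two consecutive samples of class d
        z1, z2 = digit_indices[d][i], digit_indices[d][i + 1]
        # negative: same anchor paired with a sample from a random other class
        dn = (d + random.randrange(1, num_classes)) % num_classes
        z3 = digit_indices[dn][i]
        print('pos:', x[z1], x[z2], ' neg:', x[z1], x[z3])
```

Note the `n = min(...) - 1` cap: every class contributes the same number of pairs, bounded by the rarest class.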
- """ - - def BiRNN(self, x, dropout, scope, embedding_size, sequence_length): - n_input=embedding_size - n_steps=sequence_length - n_hidden=n_steps - n_layers=3 - # Prepare data shape to match `bidirectional_rnn` function requirements - # Current data input shape: (batch_size, n_steps, n_input) (?, seq_len, embedding_size) - # Required shape: 'n_steps' tensors list of shape (batch_size, n_input) - # Permuting batch_size and n_steps - x = tf.transpose(x, [1, 0, 2]) - # Reshape to (n_steps*batch_size, n_input) - x = tf.reshape(x, [-1, n_input]) - print(x) - # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input) - x = tf.split(x, n_steps, 0) - print(x) - # Define lstm cells with tensorflow - # Forward direction cell - with tf.name_scope("fw"+scope),tf.variable_scope("fw"+scope): - stacked_rnn_fw = [] - for _ in range(n_layers): - fw_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True) - lstm_fw_cell = tf.contrib.rnn.DropoutWrapper(fw_cell,output_keep_prob=dropout) - stacked_rnn_fw.append(lstm_fw_cell) - lstm_fw_cell_m = tf.nn.rnn_cell.MultiRNNCell(cells=stacked_rnn_fw, state_is_tuple=True) - - with tf.name_scope("bw"+scope),tf.variable_scope("bw"+scope): - stacked_rnn_bw = [] - for _ in range(n_layers): - bw_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True) - lstm_bw_cell = tf.contrib.rnn.DropoutWrapper(bw_cell,output_keep_prob=dropout) - stacked_rnn_bw.append(lstm_bw_cell) - lstm_bw_cell_m = tf.nn.rnn_cell.MultiRNNCell(cells=stacked_rnn_bw, state_is_tuple=True) - # Get lstm cell output - - with tf.name_scope("bw"+scope),tf.variable_scope("bw"+scope): - outputs, _, _ = tf.nn.static_bidirectional_rnn(lstm_fw_cell_m, lstm_bw_cell_m, x, dtype=tf.float32) - return outputs[-1] - - def contrastive_loss(self, y,d,batch_size): - tmp= y *tf.square(d) - #tmp= tf.mul(y,tf.square(d)) - tmp2 = (1-y) *tf.square(tf.maximum((1 - d),0)) - return tf.reduce_sum(tmp +tmp2)/batch_size/2 - - def __init__( - self, sequence_length, vocab_size, embedding_size, hidden_units, l2_reg_lambda, batch_size): - - # Placeholders for input, output and dropout - self.input_x1 = tf.placeholder(tf.int32, [None, sequence_length], name="input_x1") - self.input_x2 = tf.placeholder(tf.int32, [None, sequence_length], name="input_x2") - self.input_y = tf.placeholder(tf.float32, [None], name="input_y") - self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") - - # Keeping track of l2 regularization loss (optional) - l2_loss = tf.constant(0.0, name="l2_loss") - - # Embedding layer - with tf.name_scope("embedding"): - self.W = tf.Variable( - tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), - trainable=True,name="W") - self.embedded_chars1 = tf.nn.embedding_lookup(self.W, self.input_x1) - #self.embedded_chars_expanded1 = tf.expand_dims(self.embedded_chars1, -1) - self.embedded_chars2 = tf.nn.embedding_lookup(self.W, self.input_x2) - #self.embedded_chars_expanded2 = tf.expand_dims(self.embedded_chars2, -1) - - # Create a convolution + maxpool layer for each filter size - with tf.name_scope("output"): - self.out1=self.BiRNN(self.embedded_chars1, self.dropout_keep_prob, "side1", embedding_size, sequence_length) - self.out2=self.BiRNN(self.embedded_chars2, self.dropout_keep_prob, "side2", embedding_size, sequence_length) - self.distance = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(self.out1,self.out2)),1,keep_dims=True)) - self.distance = tf.div(self.distance, 
-            self.distance = tf.reshape(self.distance, [-1], name="distance")
-        with tf.name_scope("loss"):
-            self.loss = self.contrastive_loss(self.input_y, self.distance, batch_size)
-        #### Accuracy computation is outside of this class.
-        with tf.name_scope("accuracy"):
-            self.temp_sim = tf.subtract(tf.ones_like(self.distance), tf.rint(self.distance), name="temp_sim")  # auto threshold 0.5
-            correct_predictions = tf.equal(self.temp_sim, self.input_y)
-            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
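Unlike the Keras scripts, this deleted TensorFlow class divides the raw Euclidean distance by the sum of the two embedding norms, which by the triangle inequality bounds the result to [0, 1]; that is what lets `tf.rint` act as a fixed 0.5 threshold in the accuracy block. A small NumPy mirror of that distance head (my sketch, not part of the file):

```python
import numpy as np

def normalized_distance(out1, out2):
    # ||a - b|| <= ||a|| + ||b||, so this ratio always lands in [0, 1]
    d = np.sqrt(np.sum((out1 - out2) ** 2, axis=1))
    return d / (np.linalg.norm(out1, axis=1) + np.linalg.norm(out2, axis=1))

a = np.array([[1.0, 0.0], [1.0, 1.0]])
b = np.array([[1.0, 0.1], [-1.0, -1.0]])
print(normalized_distance(a, b))  # ~0.05 for the near pair, 1.0 for opposite vectors
```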
diff --git a/speech_siamese.py b/speech_siamese.py
index 2e37b2f..3eeed4c 100644
--- a/speech_siamese.py
+++ b/speech_siamese.py
@@ -1,27 +1,12 @@
-'''Train a Siamese MLP on pairs of digits from the MNIST dataset.
-It follows Hadsell-et-al.'06 [1] by computing the Euclidean distance on the
-output of the shared network and by optimizing the contrastive loss (see paper
-for more details).
-
-[1] "Dimensionality Reduction by Learning an Invariant Mapping"
-    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
-
-Gets to 97.2% test accuracy after 20 epochs.
-2 seconds per epoch on a Titan X Maxwell GPU
-'''
 from __future__ import absolute_import
 from __future__ import print_function
 import numpy as np
-
-# import random
-# from keras.datasets import mnist
 from speech_data import speech_model_data
 from keras.models import Model
-from keras.layers import Input, Dense, Dropout, SimpleRNN, LSTM, Lambda
-# Dense, Dropout, Input, Lambda, LSTM, SimpleRNN
+from keras.layers import Input, Dense, Dropout, LSTM, Lambda
 from keras.optimizers import RMSprop, SGD
-from keras.callbacks import TensorBoard
+from keras.callbacks import TensorBoard, ModelCheckpoint
 from keras import backend as K
 
 
@@ -40,26 +25,20 @@ def contrastive_loss(y_true, y_pred):
     '''Contrastive loss from Hadsell-et-al.'06
     http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
     '''
-    margin = 1
-    # print(y_true, y_pred)
     return K.mean(y_true * K.square(y_pred) +
-                  (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))
+                  (1 - y_true) * K.square(K.maximum(1 - y_pred, 0)))
 
 
 def create_base_rnn_network(input_dim):
     '''Base network to be shared (eq. to feature extraction).
     '''
     inp = Input(shape=input_dim)
-    # d1 = Dense(1024, activation='sigmoid')(inp)
-    # # d2 = Dense(2, activation='sigmoid')(d1)
     ls1 = LSTM(1024, return_sequences=True)(inp)
     ls2 = LSTM(512, return_sequences=True)(ls1)
-    ls3 = LSTM(32)(ls2) # , return_sequences=True
-    # sr2 = SimpleRNN(128, return_sequences=True)(sr1)
-    # sr3 = SimpleRNN(32)(sr2)
-    # x = Dense(128, activation='relu')(sr1)
+    ls3 = LSTM(32)(ls2)
     return Model(inp, ls3)
 
+
 def create_base_network(input_dim):
     '''Base network to be shared (eq. to feature extraction).
     '''
@@ -71,6 +50,7 @@ def create_base_network(input_dim):
     x = Dense(128, activation='relu')(x)
     return Model(input, x)
 
+
 def compute_accuracy(y_true, y_pred):
     '''Compute classification accuracy with a fixed threshold on distances.
     '''
@@ -86,16 +66,7 @@ def accuracy(y_true, y_pred):
 
 # the data, shuffled and split between train and test sets
 tr_pairs, te_pairs, tr_y, te_y = speech_model_data()
-# y_train.shape,y_test.shape
-# x_train.shape,x_test.shape
-# x_train = x_train.reshape(60000, 784)
-# x_test = x_test.reshape(10000, 784)
-# x_train = x_train.astype('float32')
-# x_test = x_test.astype('float32')
-# x_train /= 255
-# x_test /= 255
 input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])
-epochs = 20
 
 # network definition
 base_network = create_base_rnn_network(input_dim)
@@ -115,20 +86,26 @@ distance = Lambda(euclidean_distance,
 
 model = Model([input_a, input_b], distance)
 
-tb_cb = TensorBoard(log_dir='./siamese_logs', histogram_freq=1, batch_size=32,
-                    write_graph=True, write_grads=True, write_images=True,
-                    embeddings_freq=0, embeddings_layer_names=None,
-                    embeddings_metadata=None)
+tb_cb = TensorBoard(log_dir='./logs/siamese_logs', histogram_freq=1,
+                    batch_size=32, write_graph=True, write_grads=True,
+                    write_images=True, embeddings_freq=0,
+                    embeddings_layer_names=None, embeddings_metadata=None)
+cp_file_fmt = './models/siamese_speech_model-{epoch:02d}-epoch-{val_acc:0.2f}\
+-acc.h5'
+cp_cb = ModelCheckpoint(cp_file_fmt, monitor='val_acc', verbose=0,
+                        save_best_only=False, save_weights_only=False,
+                        mode='auto', period=1)
 # train
-rms = RMSprop(lr=0.001) # lr=0.001)
+rms = RMSprop(lr=0.001)
 sgd = SGD(lr=0.001)
 model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])
 model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
           batch_size=128,
-          epochs=epochs,
+          epochs=50,
           validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),
-          callbacks=[tb_cb])
+          callbacks=[tb_cb, cp_cb])
+model.save('./models/siamese_speech_model-final.h5')
 
 # compute final accuracy on training and test sets
 y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
 tr_acc = compute_accuracy(tr_y, y_pred)
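The speech_siamese.py hunk is cut off here, but two things about the new save/checkpoint lines are worth flagging. First, `ModelCheckpoint` monitors `'val_acc'` while the compiled metric is a custom function named `accuracy`; depending on the Keras version the logged key may be `'val_accuracy'`, so the monitor name is worth verifying against `model.metrics_names`. Second, models written by `model.save()` or the checkpoint reference the custom loss and metric by name only, so reloading needs `custom_objects`. A minimal loading sketch, assuming the same Keras version and redefining the functions rather than importing the training script (importing `speech_siamese` would rerun the fit):

```python
from keras import backend as K
from keras.models import load_model

def contrastive_loss(y_true, y_pred):
    # must match the training-time definition above (margin = 1)
    return K.mean(y_true * K.square(y_pred) +
                  (1 - y_true) * K.square(K.maximum(1 - y_pred, 0)))

def accuracy(y_true, y_pred):
    # same fixed 0.5 distance threshold used during training
    return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))

model = load_model('./models/siamese_speech_model-final.h5',
                   custom_objects={'contrastive_loss': contrastive_loss,
                                   'accuracy': accuracy})
```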