{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Siamese LSTM on pairs of speech samples\n",
    "\n",
    "Trains a Siamese network on the sample pairs returned by `speech_model_data()`. Both inputs go through one shared stacked-LSTM encoder; following Hadsell et al. '06 [1], the model outputs the Euclidean distance between the two embeddings and is trained with the contrastive loss (see the paper for more details). A pair is predicted as label 1 when its distance is below 0.5.\n",
    "\n",
    "Adapted from the Keras MNIST Siamese-MLP example. The figures quoted in that example's header (97.2% test accuracy after 20 epochs, 2 seconds per epoch on a Titan X Maxwell GPU) describe the MNIST MLP, not this model.\n",
    "\n",
    "[1] \"Dimensionality Reduction by Learning an Invariant Mapping\"  \n",
    "http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import absolute_import\n",
    "from __future__ import print_function\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from keras import backend as K\n",
    "from keras.callbacks import TensorBoard\n",
    "from keras.layers import Input, Dense, Dropout, SimpleRNN, LSTM, Lambda\n",
    "from keras.models import Model\n",
    "from keras.optimizers import RMSprop, SGD\n",
    "\n",
    "from speech_data import speech_model_data\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model building blocks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def euclidean_distance(vects):\n",
    "    '''Euclidean distance between two batches of embeddings.\n",
    "\n",
    "    `vects` is a pair `(x, y)` of backend tensors. Squared differences are\n",
    "    summed over axis 1 (kept as a size-1 axis) and clamped from below to\n",
    "    `K.epsilon()` so the square root is never taken at exactly zero.\n",
    "    '''\n",
    "    x, y = vects\n",
    "    squared = K.sum(K.square(x - y), axis=1, keepdims=True)\n",
    "    return K.sqrt(K.maximum(squared, K.epsilon()))\n",
    "\n",
    "\n",
    "def eucl_dist_output_shape(shapes):\n",
    "    '''Output shape of the distance `Lambda` layer: `(batch, 1)`.\n",
    "\n",
    "    `shapes` is the pair of input shapes; only the first one's batch\n",
    "    dimension is used.\n",
    "    '''\n",
    "    shape1, _ = shapes\n",
    "    return (shape1[0], 1)\n",
    "\n",
    "\n",
    "def contrastive_loss(y_true, y_pred):\n",
    "    '''Contrastive loss from Hadsell-et-al.'06\n",
    "    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf\n",
    "\n",
    "    `y_pred` is the predicted distance. Pairs labelled 1 are penalised by\n",
    "    the squared distance; pairs labelled 0 are penalised by the squared\n",
    "    shortfall of the distance below `margin`.\n",
    "    '''\n",
    "    margin = 1\n",
    "    positive_term = y_true * K.square(y_pred)\n",
    "    negative_term = (1 - y_true) * K.square(K.maximum(margin - y_pred, 0))\n",
    "    return K.mean(positive_term + negative_term)\n",
    "\n",
    "\n",
    "def create_base_rnn_network(input_dim):\n",
    "    '''Shared recurrent encoder (feature extractor) for both branches.\n",
    "\n",
    "    Three stacked LSTMs (1024 -> 512 -> 32 units). The first two return the\n",
    "    full sequence; the last returns only its final output, so the model\n",
    "    emits one 32-dimensional vector per input.\n",
    "    '''\n",
    "    inp = Input(shape=input_dim)\n",
    "    ls1 = LSTM(1024, return_sequences=True)(inp)\n",
    "    ls2 = LSTM(512, return_sequences=True)(ls1)\n",
    "    ls3 = LSTM(32)(ls2)\n",
    "    return Model(inp, ls3)\n",
    "\n",
    "\n",
    "def create_base_network(input_dim):\n",
    "    '''Alternative shared encoder: a three-layer dense network.\n",
    "\n",
    "    Three `Dense(128, relu)` layers with `Dropout(0.1)` between them.\n",
    "    Not used by the training cells below, which build the LSTM encoder.\n",
    "    '''\n",
    "    # Named `inp` rather than `input` to avoid shadowing the builtin.\n",
    "    inp = Input(shape=input_dim)\n",
    "    x = Dense(128, activation='relu')(inp)\n",
    "    x = Dropout(0.1)(x)\n",
    "    x = Dense(128, activation='relu')(x)\n",
    "    x = Dropout(0.1)(x)\n",
    "    x = Dense(128, activation='relu')(x)\n",
    "    return Model(inp, x)\n",
    "\n",
    "\n",
    "def compute_accuracy(y_true, y_pred):\n",
    "    '''Classification accuracy with a fixed 0.5 threshold on distances.\n",
    "\n",
    "    NumPy version, for arrays returned by `model.predict`. A pair counts as\n",
    "    label 1 when its distance is below 0.5.\n",
    "    '''\n",
    "    # Flatten both sides: comparing (N,) predictions against (N, 1) labels\n",
    "    # would broadcast to an (N, N) matrix and give a meaningless mean.\n",
    "    pred = np.ravel(y_pred) < 0.5\n",
    "    return np.mean(pred == np.ravel(y_true))\n",
    "\n",
    "\n",
    "def accuracy(y_true, y_pred):\n",
    "    '''Classification accuracy with a fixed 0.5 threshold on distances.\n",
    "\n",
    "    Backend-tensor version, passed to `model.compile` as a metric.\n",
    "    '''\n",
    "    return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train / test pairs and their labels\n",
    "tr_pairs, te_pairs, tr_y, te_y = speech_model_data()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Inspect a sample pair"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_spec(ims):\n",
    "    '''Draw a 2-D array as an image on the current axes, with a colorbar.\n",
    "\n",
    "    `ims` is indexed `(timebins, freqbins)`; it is transposed for display\n",
    "    so the first axis runs along x.\n",
    "    '''\n",
    "    timebins, freqbins = np.shape(ims)\n",
    "    plt.imshow(np.transpose(ims), origin=\"lower\", aspect=\"auto\", cmap=\"jet\", interpolation=\"none\")\n",
    "    plt.colorbar()\n",
    "    xlocs = np.float32(np.linspace(0, timebins - 1, 5))\n",
    "    # NOTE(review): 15, 2**10 and 22100 are kept from the original tick\n",
    "    # formula; they look like a sample count, an FFT bin size and a sample\n",
    "    # rate -- confirm against the preprocessing in speech_data.\n",
    "    plt.xticks(xlocs, [\"%.02f\" % t for t in ((xlocs * 15 / timebins) + (0.5 * 2 ** 10)) / 22100])\n",
    "\n",
    "\n",
    "def show_nth(n):\n",
    "    '''Plot both members of test pair `n` side by side and print its label.'''\n",
    "    plt.figure(figsize=(15, 7.5))\n",
    "    # Each pair member is plotted in its stored 2-D shape; the previous\n",
    "    # hard-coded reshape(15, 1654) only worked for one dataset size.\n",
    "    plt.subplot(1, 2, 1)\n",
    "    plot_spec(te_pairs[n][0])\n",
    "    print(te_y[n])\n",
    "    plt.subplot(1, 2, 2)\n",
    "    plot_spec(te_pairs[n][1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "show_nth(0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build the Siamese network"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])\n",
    "\n",
    "# network definition\n",
    "base_network = create_base_rnn_network(input_dim)\n",
    "input_a = Input(shape=input_dim)\n",
    "input_b = Input(shape=input_dim)\n",
    "\n",
    "# because we re-use the same instance `base_network`,\n",
    "# the weights of the network\n",
    "# will be shared across the two branches\n",
    "processed_a = base_network(input_a)\n",
    "processed_b = base_network(input_b)\n",
    "\n",
    "distance = Lambda(euclidean_distance,\n",
    "                  output_shape=eucl_dist_output_shape)(\n",
    "    [processed_a, processed_b]\n",
    ")\n",
    "\n",
    "model = Model([input_a, input_b], distance)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "epochs = 20\n",
    "\n",
    "tb_cb = TensorBoard(log_dir='./siamese_logs', histogram_freq=1, batch_size=32,\n",
    "                    write_graph=True, write_grads=True, write_images=True,\n",
    "                    embeddings_freq=0, embeddings_layer_names=None,\n",
    "                    embeddings_metadata=None)\n",
    "\n",
    "rms = RMSprop(lr=0.001)  # an earlier, since-removed draft used lr=0.00001\n",
    "sgd = SGD(lr=0.001)  # alternative optimizer; not passed to compile() below\n",
    "model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])\n",
    "model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,\n",
    "          batch_size=128,\n",
    "          epochs=epochs,\n",
    "          validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),\n",
    "          callbacks=[tb_cb])\n",
    "\n",
    "model.save('./siamese_speech_model.h5')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# compute final accuracy on training and test sets\n",
    "y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])\n",
    "tr_acc = compute_accuracy(tr_y, y_pred)\n",
    "y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])\n",
    "te_acc = compute_accuracy(te_y, y_pred)\n",
    "\n",
    "print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))\n",
    "print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}