360 lines
14 KiB
Plaintext
360 lines
14 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"'''Train a Siamese MLP on pairs of digits from the MNIST dataset.\n",
|
|
"\n",
|
|
"It follows Hadsell-et-al.'06 [1] by computing the Euclidean distance on the\n",
|
|
"output of the shared network and by optimizing the contrastive loss (see paper\n",
|
|
"for mode details).\n",
|
|
"\n",
|
|
"[1] \"Dimensionality Reduction by Learning an Invariant Mapping\"\n",
|
|
" http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf\n",
|
|
"\n",
|
|
"Gets to 97.2% test accuracy after 20 epochs.\n",
|
|
"2 seconds per epoch on a Titan X Maxwell GPU\n",
|
|
"'''\n",
|
|
"from __future__ import absolute_import\n",
|
|
"from __future__ import print_function\n",
|
|
"import numpy as np\n",
|
|
"\n",
|
|
"# import random\n",
|
|
"# from keras.datasets import mnist\n",
|
|
"from speech_data import speech_model_data\n",
|
|
"from keras.models import Model\n",
|
|
"from keras.layers import Input, Dense, Dropout, SimpleRNN, LSTM, Lambda\n",
|
|
"# Dense, Dropout, Input, Lambda, LSTM, SimpleRNN\n",
|
|
"from keras.optimizers import RMSprop, SGD\n",
|
|
"from keras.callbacks import TensorBoard\n",
|
|
"from keras import backend as K\n",
|
|
"\n",
|
|
"\n",
|
|
"def euclidean_distance(vects):\n",
|
|
" x, y = vects\n",
|
|
" return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True),\n",
|
|
" K.epsilon()))\n",
|
|
"\n",
|
|
"\n",
|
|
"def eucl_dist_output_shape(shapes):\n",
|
|
" shape1, shape2 = shapes\n",
|
|
" return (shape1[0], 1)\n",
|
|
"\n",
|
|
"\n",
|
|
"def contrastive_loss(y_true, y_pred):\n",
|
|
" '''Contrastive loss from Hadsell-et-al.'06\n",
|
|
" http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf\n",
|
|
" '''\n",
|
|
" margin = 1\n",
|
|
" # print(y_true, y_pred)\n",
|
|
" return K.mean(y_true * K.square(y_pred) +\n",
|
|
" (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))\n",
|
|
"\n",
|
|
"\n",
|
|
"def create_base_rnn_network(input_dim):\n",
|
|
" '''Base network to be shared (eq. to feature extraction).\n",
|
|
" '''\n",
|
|
" inp = Input(shape=input_dim)\n",
|
|
" # d1 = Dense(1024, activation='sigmoid')(inp)\n",
|
|
" # # d2 = Dense(2, activation='sigmoid')(d1)\n",
|
|
" ls1 = LSTM(1024, return_sequences=True)(inp)\n",
|
|
" ls2 = LSTM(512, return_sequences=True)(ls1)\n",
|
|
" ls3 = LSTM(32)(ls2) # , return_sequences=True\n",
|
|
" # sr2 = SimpleRNN(128, return_sequences=True)(sr1)\n",
|
|
" # sr3 = SimpleRNN(32)(sr2)\n",
|
|
" # x = Dense(128, activation='relu')(sr1)\n",
|
|
" return Model(inp, ls3)\n",
|
|
"\n",
|
|
"def create_base_network(input_dim):\n",
|
|
" '''Base network to be shared (eq. to feature extraction).\n",
|
|
" '''\n",
|
|
" input = Input(shape=input_dim)\n",
|
|
" x = Dense(128, activation='relu')(input)\n",
|
|
" x = Dropout(0.1)(x)\n",
|
|
" x = Dense(128, activation='relu')(x)\n",
|
|
" x = Dropout(0.1)(x)\n",
|
|
" x = Dense(128, activation='relu')(x)\n",
|
|
" return Model(input, x)\n",
|
|
"\n",
|
|
"def compute_accuracy(y_true, y_pred):\n",
|
|
" '''Compute classification accuracy with a fixed threshold on distances.\n",
|
|
" '''\n",
|
|
" pred = y_pred.ravel() < 0.5\n",
|
|
" return np.mean(pred == y_true)\n",
|
|
"\n",
|
|
"\n",
|
|
"def accuracy(y_true, y_pred):\n",
|
|
" '''Compute classification accuracy with a fixed threshold on distances.\n",
|
|
" '''\n",
|
|
" return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))\n",
|
|
"\n",
|
|
"\n",
|
|
"# the data, shuffled and split between train and test sets\n",
|
|
"tr_pairs, te_pairs, tr_y, te_y = speech_model_data()\n",
|
|
"\n",
|
|
"%matplotlib inline\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"def plot_spec(ims):\n",
|
|
" timebins, freqbins = np.shape(ims)\n",
|
|
" # import pdb;pdb.set_trace()\n",
|
|
"# plt.figure(figsize=(15, 7.5))\n",
|
|
" plt.imshow(np.transpose(ims), origin=\"lower\", aspect=\"auto\", cmap=\"jet\", interpolation=\"none\")\n",
|
|
" plt.colorbar()\n",
|
|
" xlocs = np.float32(np.linspace(0, timebins-1, 5))\n",
|
|
" plt.xticks(xlocs, [\"%.02f\" % l for l in ((xlocs*15/timebins)+(0.5*2**10))/22100])\n",
|
|
" ylocs = np.int16(np.round(np.linspace(0, freqbins-1, 10)))\n",
|
|
"# plt.yticks(ylocs, [\"%.02f\" % freq[i] for i in ylocs])\n",
|
|
" \n",
|
|
"def show_nth(n):\n",
|
|
" plt.figure(figsize=(15,7.5))\n",
|
|
" plt.subplot(1,2,1)\n",
|
|
" plot_spec(te_pairs[n][0].reshape(15,1654))\n",
|
|
" print(te_y[n])\n",
|
|
" plt.subplot(1,2,2)\n",
|
|
" plot_spec(te_pairs[n][1].reshape(15,1654))\n",
|
|
"show_nth(0)\n",
|
|
"\n",
|
|
"# y_train.shape,y_test.shape\n",
|
|
"# x_train.shape,x_test.shape\n",
|
|
"# x_train = x_train.reshape(60000, 784)\n",
|
|
"# x_test = x_test.reshape(10000, 784)\n",
|
|
"# x_train = x_train.astype('float32')\n",
|
|
"# x_test = x_test.astype('float32')\n",
|
|
"# x_train /= 255\n",
|
|
"# x_test /= 255\n",
|
|
"\n",
|
|
"# input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])\n",
|
|
"# epochs = 20\n",
|
|
"\n",
|
|
"# # network definition\n",
|
|
"# base_network = create_base_rnn_network(input_dim)\n",
|
|
"# input_a = Input(shape=input_dim)\n",
|
|
"# input_b = Input(shape=input_dim)\n",
|
|
"\n",
|
|
"# # because we re-use the same instance `base_network`,\n",
|
|
"# # the weights of the network\n",
|
|
"# # will be shared across the two branches\n",
|
|
"# processed_a = base_network(input_a)\n",
|
|
"# processed_b = base_network(input_b)\n",
|
|
"\n",
|
|
"# distance = Lambda(euclidean_distance,\n",
|
|
"# output_shape=eucl_dist_output_shape)(\n",
|
|
"# [processed_a, processed_b]\n",
|
|
"# )\n",
|
|
"\n",
|
|
"# model = Model([input_a, input_b], distance)\n",
|
|
"\n",
|
|
"# tb_cb = TensorBoard(log_dir='./siamese_logs', histogram_freq=1, batch_size=32,\n",
|
|
"# write_graph=True, write_grads=True, write_images=True,\n",
|
|
"# embeddings_freq=0, embeddings_layer_names=None,\n",
|
|
"# embeddings_metadata=None)\n",
|
|
"# # train\n",
|
|
"# rms = RMSprop(lr=0.00001) # lr=0.001)\n",
|
|
"# sgd = SGD(lr=0.001)\n",
|
|
"# model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])\n",
|
|
"# model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,\n",
|
|
"# batch_size=128,\n",
|
|
"# epochs=epochs,\n",
|
|
"# validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),\n",
|
|
"# callbacks=[tb_cb])\n",
|
|
"\n",
|
|
"# # compute final accuracy on training and test sets\n",
|
|
"# y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])\n",
|
|
"# tr_acc = compute_accuracy(tr_y, y_pred)\n",
|
|
"# y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])\n",
|
|
"# te_acc = compute_accuracy(te_y, y_pred)\n",
|
|
"\n",
|
|
"# print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))\n",
|
|
"# print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Using TensorFlow backend.\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Train on 36252 samples, validate on 4028 samples\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"'''Train a Siamese MLP on pairs of digits from the MNIST dataset.\n",
|
|
"\n",
|
|
"It follows Hadsell-et-al.'06 [1] by computing the Euclidean distance on the\n",
|
|
"output of the shared network and by optimizing the contrastive loss (see paper\n",
|
|
"for mode details).\n",
|
|
"\n",
|
|
"[1] \"Dimensionality Reduction by Learning an Invariant Mapping\"\n",
|
|
" http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf\n",
|
|
"\n",
|
|
"Gets to 97.2% test accuracy after 20 epochs.\n",
|
|
"2 seconds per epoch on a Titan X Maxwell GPU\n",
|
|
"'''\n",
|
|
"from __future__ import absolute_import\n",
|
|
"from __future__ import print_function\n",
|
|
"import numpy as np\n",
|
|
"\n",
|
|
"# import random\n",
|
|
"# from keras.datasets import mnist\n",
|
|
"from speech_data import speech_model_data\n",
|
|
"from keras.models import Model\n",
|
|
"from keras.layers import Input, Dense, Dropout, SimpleRNN, LSTM, Lambda\n",
|
|
"# Dense, Dropout, Input, Lambda, LSTM, SimpleRNN\n",
|
|
"from keras.optimizers import RMSprop, SGD\n",
|
|
"from keras.callbacks import TensorBoard\n",
|
|
"from keras import backend as K\n",
|
|
"\n",
|
|
"\n",
|
|
"def euclidean_distance(vects):\n",
|
|
" x, y = vects\n",
|
|
" return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True),\n",
|
|
" K.epsilon()))\n",
|
|
"\n",
|
|
"\n",
|
|
"def eucl_dist_output_shape(shapes):\n",
|
|
" shape1, shape2 = shapes\n",
|
|
" return (shape1[0], 1)\n",
|
|
"\n",
|
|
"\n",
|
|
"def contrastive_loss(y_true, y_pred):\n",
|
|
" '''Contrastive loss from Hadsell-et-al.'06\n",
|
|
" http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf\n",
|
|
" '''\n",
|
|
" margin = 1\n",
|
|
" # print(y_true, y_pred)\n",
|
|
" return K.mean(y_true * K.square(y_pred) +\n",
|
|
" (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))\n",
|
|
"\n",
|
|
"\n",
|
|
"def create_base_rnn_network(input_dim):\n",
|
|
" '''Base network to be shared (eq. to feature extraction).\n",
|
|
" '''\n",
|
|
" inp = Input(shape=input_dim)\n",
|
|
" # d1 = Dense(1024, activation='sigmoid')(inp)\n",
|
|
" # # d2 = Dense(2, activation='sigmoid')(d1)\n",
|
|
" ls1 = LSTM(1024, return_sequences=True)(inp)\n",
|
|
" ls2 = LSTM(512, return_sequences=True)(ls1)\n",
|
|
" ls3 = LSTM(32)(ls2) # , return_sequences=True\n",
|
|
" # sr2 = SimpleRNN(128, return_sequences=True)(sr1)\n",
|
|
" # sr3 = SimpleRNN(32)(sr2)\n",
|
|
" # x = Dense(128, activation='relu')(sr1)\n",
|
|
" return Model(inp, ls3)\n",
|
|
"\n",
|
|
"def create_base_network(input_dim):\n",
|
|
" '''Base network to be shared (eq. to feature extraction).\n",
|
|
" '''\n",
|
|
" input = Input(shape=input_dim)\n",
|
|
" x = Dense(128, activation='relu')(input)\n",
|
|
" x = Dropout(0.1)(x)\n",
|
|
" x = Dense(128, activation='relu')(x)\n",
|
|
" x = Dropout(0.1)(x)\n",
|
|
" x = Dense(128, activation='relu')(x)\n",
|
|
" return Model(input, x)\n",
|
|
"\n",
|
|
"def compute_accuracy(y_true, y_pred):\n",
|
|
" '''Compute classification accuracy with a fixed threshold on distances.\n",
|
|
" '''\n",
|
|
" pred = y_pred.ravel() < 0.5\n",
|
|
" return np.mean(pred == y_true)\n",
|
|
"\n",
|
|
"\n",
|
|
"def accuracy(y_true, y_pred):\n",
|
|
" '''Compute classification accuracy with a fixed threshold on distances.\n",
|
|
" '''\n",
|
|
" return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))\n",
|
|
"\n",
|
|
"\n",
|
|
"# the data, shuffled and split between train and test sets\n",
|
|
"tr_pairs, te_pairs, tr_y, te_y = speech_model_data()\n",
|
|
"# y_train.shape,y_test.shape\n",
|
|
"# x_train.shape,x_test.shape\n",
|
|
"# x_train = x_train.reshape(60000, 784)\n",
|
|
"# x_test = x_test.reshape(10000, 784)\n",
|
|
"# x_train = x_train.astype('float32')\n",
|
|
"# x_test = x_test.astype('float32')\n",
|
|
"# x_train /= 255\n",
|
|
"# x_test /= 255\n",
|
|
"input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])\n",
|
|
"epochs = 20\n",
|
|
"\n",
|
|
"# network definition\n",
|
|
"base_network = create_base_rnn_network(input_dim)\n",
|
|
"input_a = Input(shape=input_dim)\n",
|
|
"input_b = Input(shape=input_dim)\n",
|
|
"\n",
|
|
"# because we re-use the same instance `base_network`,\n",
|
|
"# the weights of the network\n",
|
|
"# will be shared across the two branches\n",
|
|
"processed_a = base_network(input_a)\n",
|
|
"processed_b = base_network(input_b)\n",
|
|
"\n",
|
|
"distance = Lambda(euclidean_distance,\n",
|
|
" output_shape=eucl_dist_output_shape)(\n",
|
|
" [processed_a, processed_b]\n",
|
|
")\n",
|
|
"\n",
|
|
"model = Model([input_a, input_b], distance)\n",
|
|
"\n",
|
|
"tb_cb = TensorBoard(log_dir='./logs/siamese_logs', histogram_freq=1, batch_size=32,\n",
|
|
" write_graph=True, write_grads=True, write_images=True,\n",
|
|
" embeddings_freq=0, embeddings_layer_names=None,\n",
|
|
" embeddings_metadata=None)\n",
|
|
"# train\n",
|
|
"rms = RMSprop(lr=0.001) # lr=0.001)\n",
|
|
"sgd = SGD(lr=0.001)\n",
|
|
"model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])\n",
|
|
"model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,\n",
|
|
" batch_size=128,\n",
|
|
" epochs=epochs,\n",
|
|
" validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),\n",
|
|
" callbacks=[tb_cb])\n",
|
|
"\n",
|
|
"model.save('./models/siamese_speech_model.h5')\n",
|
|
"# compute final accuracy on training and test sets\n",
|
|
"y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])\n",
|
|
"tr_acc = compute_accuracy(tr_y, y_pred)\n",
|
|
"y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])\n",
|
|
"te_acc = compute_accuracy(te_y, y_pred)\n",
|
|
"\n",
|
|
"print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))\n",
|
|
"print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.5.2"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|