speech-scoring/Siamese.ipynb

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"'''Train a Siamese MLP on pairs of digits from the MNIST dataset.\n",
"\n",
"It follows Hadsell-et-al.'06 [1] by computing the Euclidean distance on the\n",
"output of the shared network and by optimizing the contrastive loss (see paper\n",
"for mode details).\n",
"\n",
"[1] \"Dimensionality Reduction by Learning an Invariant Mapping\"\n",
" http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf\n",
"\n",
"Gets to 97.2% test accuracy after 20 epochs.\n",
"2 seconds per epoch on a Titan X Maxwell GPU\n",
"'''\n",
"from __future__ import absolute_import\n",
"from __future__ import print_function\n",
"import numpy as np\n",
"\n",
"# import random\n",
"# from keras.datasets import mnist\n",
"from speech_data import speech_model_data\n",
"from keras.models import Model\n",
"from keras.layers import Input, Dense, Dropout, SimpleRNN, LSTM, Lambda\n",
"# Dense, Dropout, Input, Lambda, LSTM, SimpleRNN\n",
"from keras.optimizers import RMSprop, SGD\n",
"from keras.callbacks import TensorBoard\n",
"from keras import backend as K\n",
"\n",
"\n",
"def euclidean_distance(vects):\n",
" x, y = vects\n",
" return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True),\n",
" K.epsilon()))\n",
"\n",
"\n",
"def eucl_dist_output_shape(shapes):\n",
" shape1, shape2 = shapes\n",
" return (shape1[0], 1)\n",
"\n",
"\n",
"def contrastive_loss(y_true, y_pred):\n",
" '''Contrastive loss from Hadsell-et-al.'06\n",
" http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf\n",
" '''\n",
" margin = 1\n",
" # print(y_true, y_pred)\n",
" return K.mean(y_true * K.square(y_pred) +\n",
" (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))\n",
"\n",
"\n",
"def create_base_rnn_network(input_dim):\n",
" '''Base network to be shared (eq. to feature extraction).\n",
" '''\n",
" inp = Input(shape=input_dim)\n",
" # d1 = Dense(1024, activation='sigmoid')(inp)\n",
" # # d2 = Dense(2, activation='sigmoid')(d1)\n",
" ls1 = LSTM(1024, return_sequences=True)(inp)\n",
" ls2 = LSTM(512, return_sequences=True)(ls1)\n",
" ls3 = LSTM(32)(ls2) # , return_sequences=True\n",
" # sr2 = SimpleRNN(128, return_sequences=True)(sr1)\n",
" # sr3 = SimpleRNN(32)(sr2)\n",
" # x = Dense(128, activation='relu')(sr1)\n",
" return Model(inp, ls3)\n",
"\n",
"def create_base_network(input_dim):\n",
" '''Base network to be shared (eq. to feature extraction).\n",
" '''\n",
" input = Input(shape=input_dim)\n",
" x = Dense(128, activation='relu')(input)\n",
" x = Dropout(0.1)(x)\n",
" x = Dense(128, activation='relu')(x)\n",
" x = Dropout(0.1)(x)\n",
" x = Dense(128, activation='relu')(x)\n",
" return Model(input, x)\n",
"\n",
"def compute_accuracy(y_true, y_pred):\n",
" '''Compute classification accuracy with a fixed threshold on distances.\n",
" '''\n",
" pred = y_pred.ravel() < 0.5\n",
" return np.mean(pred == y_true)\n",
"\n",
"\n",
"def accuracy(y_true, y_pred):\n",
" '''Compute classification accuracy with a fixed threshold on distances.\n",
" '''\n",
" return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))\n",
"\n",
"\n",
"# the data, shuffled and split between train and test sets\n",
"tr_pairs, te_pairs, tr_y, te_y = speech_model_data()\n",
"\n",
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"def plot_spec(ims):\n",
" timebins, freqbins = np.shape(ims)\n",
" # import pdb;pdb.set_trace()\n",
"# plt.figure(figsize=(15, 7.5))\n",
" plt.imshow(np.transpose(ims), origin=\"lower\", aspect=\"auto\", cmap=\"jet\", interpolation=\"none\")\n",
" plt.colorbar()\n",
" xlocs = np.float32(np.linspace(0, timebins-1, 5))\n",
" plt.xticks(xlocs, [\"%.02f\" % l for l in ((xlocs*15/timebins)+(0.5*2**10))/22100])\n",
" ylocs = np.int16(np.round(np.linspace(0, freqbins-1, 10)))\n",
"# plt.yticks(ylocs, [\"%.02f\" % freq[i] for i in ylocs])\n",
" \n",
"def show_nth(n):\n",
" plt.figure(figsize=(15,7.5))\n",
" plt.subplot(1,2,1)\n",
" plot_spec(te_pairs[n][0].reshape(15,1654))\n",
" print(te_y[n])\n",
" plt.subplot(1,2,2)\n",
" plot_spec(te_pairs[n][1].reshape(15,1654))\n",
"show_nth(0)\n",
"\n",
"# y_train.shape,y_test.shape\n",
"# x_train.shape,x_test.shape\n",
"# x_train = x_train.reshape(60000, 784)\n",
"# x_test = x_test.reshape(10000, 784)\n",
"# x_train = x_train.astype('float32')\n",
"# x_test = x_test.astype('float32')\n",
"# x_train /= 255\n",
"# x_test /= 255\n",
"\n",
"# input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])\n",
"# epochs = 20\n",
"\n",
"# # network definition\n",
"# base_network = create_base_rnn_network(input_dim)\n",
"# input_a = Input(shape=input_dim)\n",
"# input_b = Input(shape=input_dim)\n",
"\n",
"# # because we re-use the same instance `base_network`,\n",
"# # the weights of the network\n",
"# # will be shared across the two branches\n",
"# processed_a = base_network(input_a)\n",
"# processed_b = base_network(input_b)\n",
"\n",
"# distance = Lambda(euclidean_distance,\n",
"# output_shape=eucl_dist_output_shape)(\n",
"# [processed_a, processed_b]\n",
"# )\n",
"\n",
"# model = Model([input_a, input_b], distance)\n",
"\n",
"# tb_cb = TensorBoard(log_dir='./siamese_logs', histogram_freq=1, batch_size=32,\n",
"# write_graph=True, write_grads=True, write_images=True,\n",
"# embeddings_freq=0, embeddings_layer_names=None,\n",
"# embeddings_metadata=None)\n",
"# # train\n",
"# rms = RMSprop(lr=0.00001) # lr=0.001)\n",
"# sgd = SGD(lr=0.001)\n",
"# model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])\n",
"# model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,\n",
"# batch_size=128,\n",
"# epochs=epochs,\n",
"# validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),\n",
"# callbacks=[tb_cb])\n",
"\n",
"# # compute final accuracy on training and test sets\n",
"# y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])\n",
"# tr_acc = compute_accuracy(tr_y, y_pred)\n",
"# y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])\n",
"# te_acc = compute_accuracy(te_y, y_pred)\n",
"\n",
"# print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))\n",
"# print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"'''Train a Siamese MLP on pairs of digits from the MNIST dataset.\n",
"\n",
"It follows Hadsell-et-al.'06 [1] by computing the Euclidean distance on the\n",
"output of the shared network and by optimizing the contrastive loss (see paper\n",
"for mode details).\n",
"\n",
"[1] \"Dimensionality Reduction by Learning an Invariant Mapping\"\n",
" http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf\n",
"\n",
"Gets to 97.2% test accuracy after 20 epochs.\n",
"2 seconds per epoch on a Titan X Maxwell GPU\n",
"'''\n",
"from __future__ import absolute_import\n",
"from __future__ import print_function\n",
"import numpy as np\n",
"\n",
"# import random\n",
"# from keras.datasets import mnist\n",
"from speech_data import speech_model_data\n",
"from keras.models import Model\n",
"from keras.layers import Input, Dense, Dropout, SimpleRNN, LSTM, Lambda\n",
"# Dense, Dropout, Input, Lambda, LSTM, SimpleRNN\n",
"from keras.optimizers import RMSprop, SGD\n",
"from keras.callbacks import TensorBoard\n",
"from keras import backend as K\n",
"\n",
"\n",
"def euclidean_distance(vects):\n",
" x, y = vects\n",
" return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True),\n",
" K.epsilon()))\n",
"\n",
"\n",
"def eucl_dist_output_shape(shapes):\n",
" shape1, shape2 = shapes\n",
" return (shape1[0], 1)\n",
"\n",
"\n",
"def contrastive_loss(y_true, y_pred):\n",
" '''Contrastive loss from Hadsell-et-al.'06\n",
" http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf\n",
" '''\n",
" margin = 1\n",
" # print(y_true, y_pred)\n",
" return K.mean(y_true * K.square(y_pred) +\n",
" (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))\n",
"\n",
"\n",
"def create_base_rnn_network(input_dim):\n",
" '''Base network to be shared (eq. to feature extraction).\n",
" '''\n",
" inp = Input(shape=input_dim)\n",
" # d1 = Dense(1024, activation='sigmoid')(inp)\n",
" # # d2 = Dense(2, activation='sigmoid')(d1)\n",
" ls1 = LSTM(1024, return_sequences=True)(inp)\n",
" ls2 = LSTM(512, return_sequences=True)(ls1)\n",
" ls3 = LSTM(32)(ls2) # , return_sequences=True\n",
" # sr2 = SimpleRNN(128, return_sequences=True)(sr1)\n",
" # sr3 = SimpleRNN(32)(sr2)\n",
" # x = Dense(128, activation='relu')(sr1)\n",
" return Model(inp, ls3)\n",
"\n",
"def create_base_network(input_dim):\n",
" '''Base network to be shared (eq. to feature extraction).\n",
" '''\n",
" input = Input(shape=input_dim)\n",
" x = Dense(128, activation='relu')(input)\n",
" x = Dropout(0.1)(x)\n",
" x = Dense(128, activation='relu')(x)\n",
" x = Dropout(0.1)(x)\n",
" x = Dense(128, activation='relu')(x)\n",
" return Model(input, x)\n",
"\n",
"def compute_accuracy(y_true, y_pred):\n",
" '''Compute classification accuracy with a fixed threshold on distances.\n",
" '''\n",
" pred = y_pred.ravel() < 0.5\n",
" return np.mean(pred == y_true)\n",
"\n",
"\n",
"def accuracy(y_true, y_pred):\n",
" '''Compute classification accuracy with a fixed threshold on distances.\n",
" '''\n",
" return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))\n",
"\n",
"\n",
"# the data, shuffled and split between train and test sets\n",
"tr_pairs, te_pairs, tr_y, te_y = speech_model_data()\n",
"# y_train.shape,y_test.shape\n",
"# x_train.shape,x_test.shape\n",
"# x_train = x_train.reshape(60000, 784)\n",
"# x_test = x_test.reshape(10000, 784)\n",
"# x_train = x_train.astype('float32')\n",
"# x_test = x_test.astype('float32')\n",
"# x_train /= 255\n",
"# x_test /= 255\n",
"input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])\n",
"epochs = 20\n",
"\n",
"# network definition\n",
"base_network = create_base_rnn_network(input_dim)\n",
"input_a = Input(shape=input_dim)\n",
"input_b = Input(shape=input_dim)\n",
"\n",
"# because we re-use the same instance `base_network`,\n",
"# the weights of the network\n",
"# will be shared across the two branches\n",
"processed_a = base_network(input_a)\n",
"processed_b = base_network(input_b)\n",
"\n",
"distance = Lambda(euclidean_distance,\n",
" output_shape=eucl_dist_output_shape)(\n",
" [processed_a, processed_b]\n",
")\n",
"\n",
"model = Model([input_a, input_b], distance)\n",
"\n",
"tb_cb = TensorBoard(log_dir='./siamese_logs', histogram_freq=1, batch_size=32,\n",
" write_graph=True, write_grads=True, write_images=True,\n",
" embeddings_freq=0, embeddings_layer_names=None,\n",
" embeddings_metadata=None)\n",
"# train\n",
"rms = RMSprop(lr=0.001) # lr=0.001)\n",
"sgd = SGD(lr=0.001)\n",
"model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])\n",
"model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,\n",
" batch_size=128,\n",
" epochs=epochs,\n",
" validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),\n",
" callbacks=[tb_cb])\n",
"\n",
"model.save('./siamese_speech_model.h5')\n",
"# compute final accuracy on training and test sets\n",
"y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])\n",
"tr_acc = compute_accuracy(tr_y, y_pred)\n",
"y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])\n",
"te_acc = compute_accuracy(te_y, y_pred)\n",
"\n",
"print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))\n",
"print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))"
]
},
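{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch, not part of the original notebook: reload the model saved above and\n",
"# score held-out pairs. `custom_objects` is required because the model was\n",
"# compiled with the custom contrastive_loss and accuracy defined earlier.\n",
"from keras.models import load_model\n",
"\n",
"scorer = load_model('./siamese_speech_model.h5',\n",
"                    custom_objects={'contrastive_loss': contrastive_loss,\n",
"                                    'accuracy': accuracy})\n",
"\n",
"# distances for the first few test pairs; < 0.5 is treated as a matching pair,\n",
"# mirroring the threshold used in compute_accuracy\n",
"dists = scorer.predict([te_pairs[:5, 0], te_pairs[:5, 1]]).ravel()\n",
"for d, label in zip(dists, te_y[:5]):\n",
"    print('distance=%.3f  predicted_match=%s  true_label=%s' % (d, d < 0.5, label))\n"
]
},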
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}