speech-scoring/Siamese.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''Train a Siamese network on pairs of speech samples (adapted from the MNIST Siamese MLP example).\n",
    "\n",
    "It follows Hadsell-et-al.'06 [1] by computing the Euclidean distance on the\n",
    "output of the shared network and by optimizing the contrastive loss (see paper\n",
    "for more details).\n",
    "\n",
    "[1] \"Dimensionality Reduction by Learning an Invariant Mapping\"\n",
    "    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf\n",
    "\n",
    "Gets to 97.2% test accuracy after 20 epochs.\n",
    "2 seconds per epoch on a Titan X Maxwell GPU\n",
    "'''\n",
    "from __future__ import absolute_import\n",
    "from __future__ import print_function\n",
    "import numpy as np\n",
    "\n",
    "# import random\n",
    "# from keras.datasets import mnist\n",
    "from speech_data import speech_model_data\n",
    "from keras.models import Model\n",
    "from keras.layers import Input, Dense, Dropout, SimpleRNN, LSTM, Lambda\n",
    "# Dense, Dropout, Input, Lambda, LSTM, SimpleRNN\n",
    "from keras.optimizers import RMSprop, SGD\n",
    "from keras.callbacks import TensorBoard\n",
    "from keras import backend as K\n",
    "\n",
    "\n",
    "def euclidean_distance(vects):\n",
    "    '''Return the Euclidean distance between two batches of vectors.\n",
    "\n",
    "    `vects` is a pair of tensors. Squared differences are summed over\n",
    "    axis 1 (keepdims=True) and clamped to at least K.epsilon() before the\n",
    "    square root is taken.\n",
    "    '''\n",
    "    left, right = vects\n",
    "    squared_sum = K.sum(K.square(left - right), axis=1, keepdims=True)\n",
    "    clamped = K.maximum(squared_sum, K.epsilon())\n",
    "    return K.sqrt(clamped)\n",
    "\n",
    "\n",
    "def eucl_dist_output_shape(shapes):\n",
    "    '''Output shape of the distance layer: (leading dim of first shape, 1).\n",
    "\n",
    "    `shapes` must unpack into exactly two shapes; only the first entry of\n",
    "    the first shape is used.\n",
    "    '''\n",
    "    first_shape, _ = shapes\n",
    "    batch_size = first_shape[0]\n",
    "    return (batch_size, 1)\n",
    "\n",
    "\n",
    "def contrastive_loss(y_true, y_pred):\n",
    "    '''Contrastive loss from Hadsell-et-al.'06\n",
    "    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf\n",
    "\n",
    "    Entries with y_true == 1 contribute y_pred squared; entries with\n",
    "    y_true == 0 contribute the squared amount by which y_pred falls short\n",
    "    of the margin (zero once y_pred reaches the margin). The mean over all\n",
    "    entries is returned.\n",
    "    '''\n",
    "    margin = 1\n",
    "    similar_term = y_true * K.square(y_pred)\n",
    "    shortfall = K.maximum(margin - y_pred, 0)\n",
    "    dissimilar_term = (1 - y_true) * K.square(shortfall)\n",
    "    return K.mean(similar_term + dissimilar_term)\n",
    "\n",
    "\n",
    "def create_base_rnn_network(input_dim):\n",
    "    '''Recurrent base network to be shared (eq. to feature extraction).\n",
    "\n",
    "    Stacks three LSTM layers (1024 -> 512 -> 32 units). The first two\n",
    "    return full sequences; the last returns only its final output, which\n",
    "    is the model output.\n",
    "    '''\n",
    "    net_input = Input(shape=input_dim)\n",
    "    hidden = LSTM(1024, return_sequences=True)(net_input)\n",
    "    hidden = LSTM(512, return_sequences=True)(hidden)\n",
    "    embedding = LSTM(32)(hidden)\n",
    "    return Model(net_input, embedding)\n",
    "\n",
    "def create_base_network(input_dim):\n",
    "    '''Fully connected base network to be shared (eq. to feature extraction).\n",
    "\n",
    "    Three Dense(128, relu) layers, with Dropout(0.1) after each of the\n",
    "    first two.\n",
    "    '''\n",
    "    net_input = Input(shape=input_dim)\n",
    "    features = net_input\n",
    "    # two Dense + Dropout stages, then a final Dense without dropout\n",
    "    for _ in range(2):\n",
    "        features = Dense(128, activation='relu')(features)\n",
    "        features = Dropout(0.1)(features)\n",
    "    embedding = Dense(128, activation='relu')(features)\n",
    "    return Model(net_input, embedding)\n",
    "\n",
    "def compute_accuracy(y_true, y_pred):\n",
    "    '''Compute classification accuracy with a fixed threshold on distances.\n",
    "\n",
    "    A pair is predicted as label 1 when its distance is below 0.5.\n",
    "    Both arrays are flattened before the comparison, so a column-vector\n",
    "    `y_true` of shape (n, 1) is compared element-wise instead of being\n",
    "    broadcast against the flattened predictions into an (n, n) matrix.\n",
    "\n",
    "    Returns the fraction of thresholded predictions equal to `y_true`.\n",
    "    '''\n",
    "    pred = np.ravel(y_pred) < 0.5\n",
    "    return np.mean(pred == np.ravel(y_true))\n",
    "\n",
    "\n",
    "def accuracy(y_true, y_pred):\n",
    "    '''Backend-tensor version of the fixed-threshold accuracy.\n",
    "\n",
    "    Predictions below 0.5 are cast to 1 (in y_true's dtype), all others\n",
    "    to 0; the mean agreement with y_true is returned.\n",
    "    '''\n",
    "    predicted_label = K.cast(y_pred < 0.5, y_true.dtype)\n",
    "    return K.mean(K.equal(y_true, predicted_label))\n",
    "\n",
    "\n",
    "# Load the speech pair data: train/test pairs and their labels.\n",
    "# NOTE(review): shuffling and the train/test split presumably happen inside\n",
    "# speech_model_data (not visible here) - confirm against speech_data.py.\n",
    "tr_pairs, te_pairs, tr_y, te_y = speech_model_data()\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "def plot_spec(ims):\n",
    "    '''Draw a 2-D array as an image on the current matplotlib axes.\n",
    "\n",
    "    `ims` must be 2-D; its first axis is treated as time bins. The array is\n",
    "    transposed so that the first axis runs along x, a colorbar is added and\n",
    "    five evenly spaced x ticks are labelled using the formula below.\n",
    "    '''\n",
    "    timebins, _ = np.shape(ims)  # unpacking still enforces a 2-D input\n",
    "    plt.imshow(np.transpose(ims), origin='lower', aspect='auto', cmap='jet', interpolation='none')\n",
    "    plt.colorbar()\n",
    "    xlocs = np.float32(np.linspace(0, timebins-1, 5))\n",
    "    # NOTE(review): 15, 2**10 and 22100 look like a sample count, an FFT bin\n",
    "    # size and a sample rate - confirm the intended units of the tick labels.\n",
    "    tick_values = ((xlocs*15/timebins)+(0.5*2**10))/22100\n",
    "    plt.xticks(xlocs, ['%.02f' % t for t in tick_values])\n",
    "    \n",
    "def show_nth(n):\n",
    "    '''Plot both members of test pair `n` side by side and print its label.\n",
    "\n",
    "    Reads the module-level `te_pairs` and `te_y`; each member is reshaped\n",
    "    to (15, 1654) before being handed to `plot_spec`.\n",
    "    '''\n",
    "    first, second = te_pairs[n][0], te_pairs[n][1]\n",
    "    plt.figure(figsize=(15, 7.5))\n",
    "    plt.subplot(1, 2, 1)\n",
    "    plot_spec(first.reshape(15, 1654))\n",
    "    print(te_y[n])\n",
    "    plt.subplot(1, 2, 2)\n",
    "    plot_spec(second.reshape(15, 1654))\n",
    "# Preview the first test pair: prints its label and plots both members.\n",
    "show_nth(0)\n",
    "\n",
    "# y_train.shape,y_test.shape\n",
    "# x_train.shape,x_test.shape\n",
    "# x_train = x_train.reshape(60000, 784)\n",
    "# x_test = x_test.reshape(10000, 784)\n",
    "# x_train = x_train.astype('float32')\n",
    "# x_test = x_test.astype('float32')\n",
    "# x_train /= 255\n",
    "# x_test /= 255\n",
    "\n",
    "# input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])\n",
    "# epochs = 20\n",
    "\n",
    "# # network definition\n",
    "# base_network = create_base_rnn_network(input_dim)\n",
    "# input_a = Input(shape=input_dim)\n",
    "# input_b = Input(shape=input_dim)\n",
    "\n",
    "# # because we re-use the same instance `base_network`,\n",
    "# # the weights of the network\n",
    "# # will be shared across the two branches\n",
    "# processed_a = base_network(input_a)\n",
    "# processed_b = base_network(input_b)\n",
    "\n",
    "# distance = Lambda(euclidean_distance,\n",
    "#                   output_shape=eucl_dist_output_shape)(\n",
    "#     [processed_a, processed_b]\n",
    "# )\n",
    "\n",
    "# model = Model([input_a, input_b], distance)\n",
    "\n",
    "# tb_cb = TensorBoard(log_dir='./siamese_logs', histogram_freq=1, batch_size=32,\n",
    "#                     write_graph=True, write_grads=True, write_images=True,\n",
    "#                     embeddings_freq=0, embeddings_layer_names=None,\n",
    "#                     embeddings_metadata=None)\n",
    "# # train\n",
    "# rms = RMSprop(lr=0.00001)  # lr=0.001)\n",
    "# sgd = SGD(lr=0.001)\n",
    "# model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])\n",
    "# model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,\n",
    "#           batch_size=128,\n",
    "#           epochs=epochs,\n",
    "#           validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),\n",
    "#           callbacks=[tb_cb])\n",
    "\n",
    "# # compute final accuracy on training and test sets\n",
    "# y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])\n",
    "# tr_acc = compute_accuracy(tr_y, y_pred)\n",
    "# y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])\n",
    "# te_acc = compute_accuracy(te_y, y_pred)\n",
    "\n",
    "# print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))\n",
    "# print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 36252 samples, validate on 4028 samples\n"
     ]
    }
   ],
   "source": [
    "'''Train a Siamese LSTM network on pairs of speech samples (adapted from the MNIST Siamese MLP example).\n",
    "\n",
    "It follows Hadsell-et-al.'06 [1] by computing the Euclidean distance on the\n",
    "output of the shared network and by optimizing the contrastive loss (see paper\n",
    "for more details).\n",
    "\n",
    "[1] \"Dimensionality Reduction by Learning an Invariant Mapping\"\n",
    "    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf\n",
    "\n",
    "Gets to 97.2% test accuracy after 20 epochs.\n",
    "2 seconds per epoch on a Titan X Maxwell GPU\n",
    "'''\n",
    "from __future__ import absolute_import\n",
    "from __future__ import print_function\n",
    "import numpy as np\n",
    "\n",
    "# import random\n",
    "# from keras.datasets import mnist\n",
    "from speech_data import speech_model_data\n",
    "from keras.models import Model\n",
    "from keras.layers import Input, Dense, Dropout, SimpleRNN, LSTM, Lambda\n",
    "# Dense, Dropout, Input, Lambda, LSTM, SimpleRNN\n",
    "from keras.optimizers import RMSprop, SGD\n",
    "from keras.callbacks import TensorBoard\n",
    "from keras import backend as K\n",
    "\n",
    "\n",
    "def euclidean_distance(vects):\n",
    "    '''Return the Euclidean distance between two batches of vectors.\n",
    "\n",
    "    `vects` is a pair of tensors. Squared differences are summed over\n",
    "    axis 1 (keepdims=True) and clamped to at least K.epsilon() before the\n",
    "    square root is taken.\n",
    "    '''\n",
    "    left, right = vects\n",
    "    squared_sum = K.sum(K.square(left - right), axis=1, keepdims=True)\n",
    "    clamped = K.maximum(squared_sum, K.epsilon())\n",
    "    return K.sqrt(clamped)\n",
    "\n",
    "\n",
    "def eucl_dist_output_shape(shapes):\n",
    "    '''Output shape of the distance layer: (leading dim of first shape, 1).\n",
    "\n",
    "    `shapes` must unpack into exactly two shapes; only the first entry of\n",
    "    the first shape is used.\n",
    "    '''\n",
    "    first_shape, _ = shapes\n",
    "    batch_size = first_shape[0]\n",
    "    return (batch_size, 1)\n",
    "\n",
    "\n",
    "def contrastive_loss(y_true, y_pred):\n",
    "    '''Contrastive loss from Hadsell-et-al.'06\n",
    "    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf\n",
    "\n",
    "    Entries with y_true == 1 contribute y_pred squared; entries with\n",
    "    y_true == 0 contribute the squared amount by which y_pred falls short\n",
    "    of the margin (zero once y_pred reaches the margin). The mean over all\n",
    "    entries is returned.\n",
    "    '''\n",
    "    margin = 1\n",
    "    similar_term = y_true * K.square(y_pred)\n",
    "    shortfall = K.maximum(margin - y_pred, 0)\n",
    "    dissimilar_term = (1 - y_true) * K.square(shortfall)\n",
    "    return K.mean(similar_term + dissimilar_term)\n",
    "\n",
    "\n",
    "def create_base_rnn_network(input_dim):\n",
    "    '''Recurrent base network to be shared (eq. to feature extraction).\n",
    "\n",
    "    Stacks three LSTM layers (1024 -> 512 -> 32 units). The first two\n",
    "    return full sequences; the last returns only its final output, which\n",
    "    is the model output.\n",
    "    '''\n",
    "    net_input = Input(shape=input_dim)\n",
    "    hidden = LSTM(1024, return_sequences=True)(net_input)\n",
    "    hidden = LSTM(512, return_sequences=True)(hidden)\n",
    "    embedding = LSTM(32)(hidden)\n",
    "    return Model(net_input, embedding)\n",
    "\n",
    "def create_base_network(input_dim):\n",
    "    '''Fully connected base network to be shared (eq. to feature extraction).\n",
    "\n",
    "    Three Dense(128, relu) layers, with Dropout(0.1) after each of the\n",
    "    first two.\n",
    "    '''\n",
    "    net_input = Input(shape=input_dim)\n",
    "    features = net_input\n",
    "    # two Dense + Dropout stages, then a final Dense without dropout\n",
    "    for _ in range(2):\n",
    "        features = Dense(128, activation='relu')(features)\n",
    "        features = Dropout(0.1)(features)\n",
    "    embedding = Dense(128, activation='relu')(features)\n",
    "    return Model(net_input, embedding)\n",
    "\n",
    "def compute_accuracy(y_true, y_pred):\n",
    "    '''Compute classification accuracy with a fixed threshold on distances.\n",
    "\n",
    "    A pair is predicted as label 1 when its distance is below 0.5.\n",
    "    Both arrays are flattened before the comparison, so a column-vector\n",
    "    `y_true` of shape (n, 1) is compared element-wise instead of being\n",
    "    broadcast against the flattened predictions into an (n, n) matrix.\n",
    "\n",
    "    Returns the fraction of thresholded predictions equal to `y_true`.\n",
    "    '''\n",
    "    pred = np.ravel(y_pred) < 0.5\n",
    "    return np.mean(pred == np.ravel(y_true))\n",
    "\n",
    "\n",
    "def accuracy(y_true, y_pred):\n",
    "    '''Backend-tensor version of the fixed-threshold accuracy.\n",
    "\n",
    "    Predictions below 0.5 are cast to 1 (in y_true's dtype), all others\n",
    "    to 0; the mean agreement with y_true is returned.\n",
    "    '''\n",
    "    predicted_label = K.cast(y_pred < 0.5, y_true.dtype)\n",
    "    return K.mean(K.equal(y_true, predicted_label))\n",
    "\n",
    "\n",
    "import os\n",
    "\n",
    "# Load the speech pair data: train/test pairs and their labels.\n",
    "# NOTE(review): shuffling and the train/test split presumably happen inside\n",
    "# speech_model_data (not visible here) - confirm against speech_data.py.\n",
    "tr_pairs, te_pairs, tr_y, te_y = speech_model_data()\n",
    "# pairs are indexed [pair, member, ...]; axes 2 and 3 give the shape of one\n",
    "# member, which is the input shape of each branch\n",
    "input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])\n",
    "epochs = 20\n",
    "\n",
    "# The HDF5 save at the end fails if the target directory is missing, which\n",
    "# would throw away a finished training run. Create it up front so any\n",
    "# filesystem problem surfaces before training starts.\n",
    "model_path = './models/siamese_speech_model.h5'\n",
    "os.makedirs(os.path.dirname(model_path), exist_ok=True)\n",
    "\n",
    "# network definition\n",
    "base_network = create_base_rnn_network(input_dim)\n",
    "input_a = Input(shape=input_dim)\n",
    "input_b = Input(shape=input_dim)\n",
    "\n",
    "# because we re-use the same instance `base_network`,\n",
    "# the weights of the network\n",
    "# will be shared across the two branches\n",
    "processed_a = base_network(input_a)\n",
    "processed_b = base_network(input_b)\n",
    "\n",
    "# the model output is the distance between the two branch embeddings\n",
    "distance = Lambda(euclidean_distance,\n",
    "                  output_shape=eucl_dist_output_shape)(\n",
    "    [processed_a, processed_b]\n",
    ")\n",
    "\n",
    "model = Model([input_a, input_b], distance)\n",
    "\n",
    "tb_cb = TensorBoard(log_dir='./logs/siamese_logs', histogram_freq=1, batch_size=32,\n",
    "                    write_graph=True, write_grads=True, write_images=True,\n",
    "                    embeddings_freq=0, embeddings_layer_names=None,\n",
    "                    embeddings_metadata=None)\n",
    "# train\n",
    "rms = RMSprop(lr=0.001)\n",
    "sgd = SGD(lr=0.001)  # alternative optimizer, not passed to compile()\n",
    "model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])\n",
    "model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,\n",
    "          batch_size=128,\n",
    "          epochs=epochs,\n",
    "          validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),\n",
    "          callbacks=[tb_cb])\n",
    "\n",
    "model.save(model_path)\n",
    "# compute final accuracy on training and test sets\n",
    "y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])\n",
    "tr_acc = compute_accuracy(tr_y, y_pred)\n",
    "y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])\n",
    "te_acc = compute_accuracy(te_y, y_pred)\n",
    "\n",
    "print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))\n",
    "print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}