diff --git a/speech_data.py b/speech_data.py
index a5c166e..9154f43 100644
--- a/speech_data.py
+++ b/speech_data.py
@@ -107,8 +107,8 @@ def create_speech_pairs_data(audio_group='audio'):
 def speech_model_data():
     tr_pairs = np.load('outputs/tr_pairs.npy') / 255.0
     te_pairs = np.load('outputs/te_pairs.npy') / 255.0
-    # tr_pairs[tr_pairs < 0] = 0
-    # te_pairs[te_pairs < 0] = 0
+    tr_pairs[tr_pairs < 0] = 0
+    te_pairs[te_pairs < 0] = 0
     tr_y = np.load('outputs/tr_y.npy')
     te_y = np.load('outputs/te_y.npy')
     return tr_pairs, te_pairs, tr_y, te_y
diff --git a/speech_siamese.py b/speech_siamese.py
index 260f6e9..c1a5902 100644
--- a/speech_siamese.py
+++ b/speech_siamese.py
@@ -3,7 +3,10 @@ from __future__ import print_function
 import numpy as np
 from speech_data import speech_model_data
 from keras.models import Model,load_model
-from keras.layers import Input, Dense, Dropout, LSTM, Lambda
+from keras.layers import Input, Dense, Dropout, LSTM, Lambda, Concatenate
+# from keras.losses import categorical_crossentropy
+from keras.losses import binary_crossentropy
+# from keras.utils.np_utils import to_categorical
 from keras.optimizers import RMSprop
 from keras.callbacks import TensorBoard, ModelCheckpoint
 from keras import backend as K
@@ -34,20 +37,9 @@ def create_base_rnn_network(input_dim):
     inp = Input(shape=input_dim)
     ls1 = LSTM(1024, return_sequences=True)(inp)
     ls2 = LSTM(512, return_sequences=True)(ls1)
-    ls3 = LSTM(32)(ls2)
-    return Model(inp, ls3)
-
-
-def create_base_network(input_dim):
-    '''Base network to be shared (eq. to feature extraction).
-    '''
-    input = Input(shape=input_dim)
-    x = Dense(128, activation='relu')(input)
-    x = Dropout(0.1)(x)
-    x = Dense(128, activation='relu')(x)
-    x = Dropout(0.1)(x)
-    x = Dense(128, activation='relu')(x)
-    return Model(input, x)
+    # ls3 = LSTM(32, return_sequences=True)(ls2)
+    ls4 = LSTM(32)(ls2)
+    return Model(inp, ls4)
 
 
 def compute_accuracy(y_true, y_pred):
@@ -62,6 +54,13 @@ def accuracy(y_true, y_pred):
     '''
     return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))
 
+def dense_classifier(processed):
+    conc_proc = Concatenate()(processed)
+    d1 = Dense(8, activation='relu')(conc_proc)
+    dr1 = Dropout(0.1)(d1)
+    # d2 = Dense(8, activation='relu')(dr1)
+    # dr2 = Dropout(0.1)(d2)
+    return Dense(1, activation='sigmoid')(dr1)
 
 def siamese_model(input_dim):
     # input_dim = (15, 1654)
@@ -70,11 +69,12 @@ def siamese_model(input_dim):
     input_b = Input(shape=input_dim)
     processed_a = base_network(input_a)
     processed_b = base_network(input_b)
-    distance = Lambda(
-        euclidean_distance,
-        output_shape=eucl_dist_output_shape)([processed_a, processed_b])
-
-    model = Model([input_a, input_b], distance)
+    final_output = dense_classifier([processed_a,processed_b])
+    model = Model([input_a, input_b], final_output)
+    # distance = Lambda(
+    #     euclidean_distance,
+    #     output_shape=eucl_dist_output_shape)([processed_a, processed_b])
+    # model = Model([input_a, input_b], distance)
     return model
 
 
@@ -95,12 +95,12 @@ def train_siamese():
         embeddings_freq=0,
         embeddings_layer_names=None,
         embeddings_metadata=None)
-    cp_file_fmt = './models/siamese_speech_model-{epoch:02d}-epoch-{val_acc:0.2f}\
+    cp_file_fmt = './models/siamese_speech_model-{epoch:02d}-epoch-{val_loss:0.2f}\
 -acc.h5'
     cp_cb = ModelCheckpoint(
         cp_file_fmt,
-        monitor='val_acc',
+        monitor='val_loss',
         verbose=0,
         save_best_only=False,
         save_weights_only=False,
@@ -108,7 +108,7 @@ def train_siamese():
         period=1)
     # train
     rms = RMSprop(lr=0.001)
-    model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])
+    model.compile(loss=binary_crossentropy, optimizer=rms, metrics=[accuracy])
     model.fit(
         [tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
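
Not part of the diff: a minimal, hypothetical smoke test for the reworked head. It assumes the label convention of the classic Keras siamese example (1 = similar pair) and the `(15, 1654)` input shape from the inline comment; `sigmoid_accuracy` is an illustrative helper, not from the repo. Note that the unchanged `accuracy` metric still thresholds with `y_pred < 0.5`, which matched the old distance output (small = similar); with the new sigmoid similarity output (large = similar) the comparison likely needs flipping, as sketched below.

```python
import numpy as np
from keras import backend as K
from speech_siamese import siamese_model

def sigmoid_accuracy(y_true, y_pred):
    # The sigmoid head outputs a similarity score in [0, 1]:
    # predict "same" when y_pred > 0.5 (flipped vs. the distance-based
    # `accuracy`, where small values meant "same").
    return K.mean(K.equal(y_true, K.cast(y_pred > 0.5, y_true.dtype)))

model = siamese_model((15, 1654))  # (time steps, features), per the inline comment
model.compile(loss='binary_crossentropy', optimizer='rmsprop',
              metrics=[sigmoid_accuracy])

# Tiny random batch shaped like tr_pairs: (n_pairs, 2, 15, 1654).
pairs = np.random.rand(2, 2, 15, 1654).astype('float32')
labels = np.array([1.0, 0.0], dtype='float32')  # 1 = same speaker (assumed)
loss, acc = model.train_on_batch([pairs[:, 0], pairs[:, 1]], labels)
```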