diff --git a/speech_data.py b/speech_data.py index 21f0466..9169058 100644 --- a/speech_data.py +++ b/speech_data.py @@ -41,7 +41,7 @@ def create_spectrogram_tfrecords(audio_group='audio',sample_count=0,train_test_r http://warmspringwinds.github.io/tensorflow/tf-slim/2016/12/21/tfrecords-guide/ http://www.machinelearninguru.com/deep_learning/tensorflow/basics/tfrecord/tfrecord.html ''' - audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv',index_col=0) + audio_samples = pd.read_csv( './outputs/' + audio_group + '.fixed.csv',index_col=0) audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x) n_records,n_spec,n_features = 0,0,0 @@ -205,19 +205,19 @@ def record_generator_count(records_file): return record_iterator,count def fix_csv(audio_group='audio'): - audio_csv_lines = open('./outputs/' + audio_group + '.csv.orig','r').readlines() + audio_csv_lines = open('./outputs/' + audio_group + '.csv','r').readlines() audio_csv_data = [i.strip().split(',') for i in audio_csv_lines] proper_rows = [i for i in audio_csv_data if len(i) == 7] - with open('./outputs/' + audio_group + '.csv','w') as fixed_csv: + with open('./outputs/' + audio_group + '.fixed.csv','w') as fixed_csv: fixed_csv_w = csv.writer(fixed_csv, quoting=csv.QUOTE_MINIMAL) fixed_csv_w.writerows(proper_rows) - audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv' + audio_samples = pd.read_csv( './outputs/' + audio_group + '.fixed.csv' , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file']) audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x) audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists) audio_samples = audio_samples[audio_samples['file_exists'] == True] audio_samples = audio_samples.drop(['file_path','file_exists'],axis=1).reset_index(drop=True) - audio_samples.to_csv('./outputs/' + audio_group + '.csv') + audio_samples.to_csv('./outputs/' + audio_group + '.fixed.csv') def convert_old_audio(): audio_samples = pd.read_csv( './outputs/audio.csv.old' @@ -243,7 +243,8 @@ if __name__ == '__main__': # create_spectrogram_tfrecords('audio',sample_count=100) # create_spectrogram_tfrecords('story_all',sample_count=25) # fix_csv('story_words_test') - create_spectrogram_tfrecords('story_words_test',sample_count=100,train_test_ratio=0.0) + fix_csv('story_phrases') + create_spectrogram_tfrecords('story_phrases',sample_count=100,train_test_ratio=0.3) # create_spectrogram_tfrecords('audio',sample_count=50) # read_siamese_tfrecords_generator('audio') # padd_zeros_siamese_tfrecords('audio') diff --git a/speech_model.py b/speech_model.py index 4b2d234..5d5842e 100644 --- a/speech_model.py +++ b/speech_model.py @@ -1,36 +1,16 @@ from __future__ import absolute_import from __future__ import print_function import numpy as np -# from speech_data import speech_model_data from speech_data import read_siamese_tfrecords_generator from keras.models import Model,load_model,model_from_yaml from keras.layers import Input, Dense, Dropout, LSTM, Lambda, Concatenate from keras.losses import categorical_crossentropy -# from keras.losses import binary_crossentropy from keras.utils import to_categorical -# from keras.utils.np_utils import to_categorical from keras.optimizers import RMSprop from keras.callbacks import TensorBoard, ModelCheckpoint from keras import backend as K from speech_tools import create_dir,step_count -# def euclidean_distance(vects): -# x, y = vects -# return K.sqrt( -# K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon())) -# -# -# def eucl_dist_output_shape(shapes): -# shape1, shape2 = shapes -# return (shape1[0], 1) -# -# -# def contrastive_loss(y_true, y_pred): -# '''Contrastive loss from Hadsell-et-al.'06 -# http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf -# ''' -# return K.mean(y_true * K.square(y_pred) + -# (1 - y_true) * K.square(K.maximum(1 - y_pred, 0))) def create_base_rnn_network(input_dim): '''Base network to be shared (eq. to feature extraction). @@ -68,7 +48,6 @@ def dense_classifier(processed): return Dense(2, activation='softmax')(d3) def siamese_model(input_dim): - # input_dim = (15, 1654) base_network = create_base_rnn_network(input_dim) input_a = Input(shape=input_dim) input_b = Input(shape=input_dim) @@ -94,8 +73,6 @@ def load_model_arch(mod_file): return mod def train_siamese(audio_group = 'audio'): - # the data, shuffled and split between train and test sets - # tr_pairs, te_pairs, tr_y_e, te_y_e = speech_model_data() batch_size = 256 model_dir = './models/'+audio_group create_dir(model_dir) @@ -103,8 +80,6 @@ def train_siamese(audio_group = 'audio'): create_dir(log_dir) tr_gen_fn,te_pairs,te_y,n_step,n_features,n_records = read_siamese_tfrecords_generator(audio_group,batch_size=batch_size,test_size=batch_size) tr_gen = tr_gen_fn() - # tr_y = to_categorical(tr_y_e, num_classes=2) - # te_y = to_categorical(te_y_e, num_classes=2) input_dim = (n_step, n_features) model = siamese_model(input_dim) @@ -131,29 +106,17 @@ def train_siamese(audio_group = 'audio'): mode='auto', period=1) # train - rms = RMSprop()#lr=0.001 + rms = RMSprop() model.compile(loss=categorical_crossentropy, optimizer=rms, metrics=[accuracy]) write_model_arch(model,model_dir+'/siamese_speech_model_arch.yaml') - # model.fit( - # [tr_pairs[:, 0], tr_pairs[:, 1]], - # tr_y, - # batch_size=128, - # epochs=100, - # validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y), - # callbacks=[tb_cb, cp_cb]) epoch_n_steps = step_count(n_records,batch_size) model.fit_generator(tr_gen , epochs=1000 , steps_per_epoch=epoch_n_steps , validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y) - # ,use_multiprocessing=True, workers=1 , max_queue_size=32 , callbacks=[tb_cb, cp_cb]) model.save(model_dir+'/siamese_speech_model-final.h5') - # compute final accuracy on training and test sets - # y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]]) - # tr_acc = compute_accuracy(tr_y, y_pred) - # print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc)) y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]]) te_acc = compute_accuracy(te_y, y_pred) @@ -162,5 +125,4 @@ def train_siamese(audio_group = 'audio'): if __name__ == '__main__': - train_siamese('story_words_test') - # train_siamese('audio') + train_siamese('story_phrases')