diff --git a/segment_data.py b/segment_data.py
new file mode 100644
index 0000000..77aba57
--- /dev/null
+++ b/segment_data.py
@@ -0,0 +1,22 @@
+import pandas as pd
+
+def fix_csv(collection_name = 'test'):
+    '''Rewrite a collection's segment CSV with named columns.
+
+    Reads ./outputs/<collection_name>.csv, assigning the six segment column
+    names below, and writes the result to ./outputs/<collection_name>.fixed.csv
+    (to_csv is called with its defaults, so the index column is written too).
+    '''
+    seg_data = pd.read_csv('./outputs/'+collection_name+'.csv',names=['phrase','filename'
+    ,'start_phoneme','end_phoneme','start_time','end_time'])
+    seg_data.to_csv('./outputs/'+collection_name+'.fixed.csv')
+
+
+def segment_data_gen(collection_name = 'test'):
+    '''Load ./outputs/<collection_name>.fixed.csv, using column 0 as the index.
+
+    NOTE(review): currently only reads the CSV and implicitly returns None;
+    the generator that train_segment expects is not implemented yet.
+    '''
+    # collection_name = 'test'
+    seg_data = pd.read_csv('./outputs/'+collection_name+'.fixed.csv',index_col=0)
diff --git a/segment_model.py b/segment_model.py
new file mode 100644
index 0000000..683cdd6
--- /dev/null
+++ b/segment_model.py
@@ -0,0 +1,129 @@
+from __future__ import absolute_import
+from __future__ import print_function
+import numpy as np
+from keras.models import Model,load_model,model_from_yaml
+from keras.layers import Input,Concatenate,Lambda, BatchNormalization, Dropout
+from keras.layers import Dense, LSTM, Bidirectional, GRU
+from keras.losses import categorical_crossentropy
+from keras.utils import to_categorical
+from keras.optimizers import RMSprop
+from keras.callbacks import TensorBoard, ModelCheckpoint
+from keras import backend as K
+from keras.utils import plot_model
+from speech_tools import create_dir,step_count
+from segment_data import segment_data_gen
+
+
+def accuracy(y_true, y_pred):
+    '''Compute classification accuracy with a fixed 0.5 threshold on predictions.
+    '''
+    return K.mean(K.equal(y_true, K.cast(y_pred > 0.5, y_true.dtype)))
+
+def dense_classifier(processed):
+    '''Concatenate the given tensors and classify them into 2 softmax classes.
+    '''
+    conc_proc = Concatenate()(processed)
+    d1 = Dense(64, activation='relu')(conc_proc)
+    # dr1 = Dropout(0.1)(d1)
+    # d2 = Dense(128, activation='relu')(d1)
+    d3 = Dense(8, activation='relu')(d1)
+    # dr2 = Dropout(0.1)(d2)
+    return Dense(2, activation='softmax')(d3)
+
+def segment_model(input_dim):
+    '''Build a stacked-LSTM model with a 2-way softmax output.
+
+    input_dim is the per-sample input shape handed to keras Input.
+    '''
+    inp = Input(shape=input_dim)
+    # ls0 = LSTM(512, return_sequences=True)(inp)
+    ls1 = LSTM(128, return_sequences=True)(inp)
+    ls2 = LSTM(64, return_sequences=True)(ls1)
+    # ls3 = LSTM(32, return_sequences=True)(ls2)
+    ls4 = LSTM(32)(ls2)
+    d1 = Dense(64, activation='relu')(ls4)
+    d3 = Dense(8, activation='relu')(d1)
+    oup = Dense(2, activation='softmax')(d3)
+    return Model(inp, oup)
+
+def write_model_arch(mod,mod_file):
+    '''Write the YAML architecture of model `mod` to the path `mod_file`.
+    '''
+    # `with` closes the file even if to_yaml() or write() raises
+    with open(mod_file,'w') as model_f:
+        model_f.write(mod.to_yaml())
+
+def load_model_arch(mod_file):
+    '''Return a model rebuilt from the YAML architecture stored in `mod_file`.
+    '''
+    # `with` closes the file even if parsing the YAML raises
+    with open(mod_file,'r') as model_f:
+        return model_from_yaml(model_f.read())
+
+def train_segment(collection_name = 'test'):
+    '''Train the segment model for `collection_name` and save it.
+
+    Checkpoints and the final model go to ./models/segment/<collection_name>,
+    TensorBoard logs to ./logs/segment/<collection_name>.
+
+    NOTE(review): work in progress -- n_step, n_features, n_records, te_pairs,
+    te_y and compute_accuracy are not defined or imported in this module, and
+    segment_data_gen does not return a generator factory yet, so this
+    function cannot run until the data pipeline is wired in.
+    '''
+    batch_size = 128
+    model_dir = './models/segment/'+collection_name
+    create_dir(model_dir)
+    log_dir = './logs/segment/'+collection_name
+    create_dir(log_dir)
+    # read the requested collection instead of always the default 'test'
+    tr_gen_fn = segment_data_gen(collection_name)
+    tr_gen = tr_gen_fn()
+    input_dim = (n_step, n_features)
+
+    model = segment_model(input_dim)
+    plot_model(model,show_shapes=True, to_file=model_dir+'/model.png')
+
+    tb_cb = TensorBoard(
+        log_dir=log_dir,
+        histogram_freq=1,
+        batch_size=32,
+        write_graph=True,
+        write_grads=True,
+        write_images=True,
+        embeddings_freq=0,
+        embeddings_layer_names=None,
+        embeddings_metadata=None)
+    # NOTE(review): the number before '-acc' in the name is val_loss, not accuracy
+    cp_file_fmt = model_dir+'/siamese_speech_model-{epoch:02d}-epoch-{val_loss:0.2f}\
+-acc.h5'
+
+    cp_cb = ModelCheckpoint(
+        cp_file_fmt,
+        monitor='val_loss',
+        verbose=0,
+        save_best_only=True,
+        save_weights_only=True,
+        mode='auto',
+        period=1)
+    # train
+    rms = RMSprop()
+    model.compile(loss=categorical_crossentropy, optimizer=rms, metrics=[accuracy])
+    write_model_arch(model,model_dir+'/siamese_speech_model_arch.yaml')
+    epoch_n_steps = step_count(n_records,batch_size)
+    model.fit_generator(tr_gen
+        , epochs=1000
+        , steps_per_epoch=epoch_n_steps
+        , validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y)
+        , max_queue_size=32
+        , callbacks=[tb_cb, cp_cb])
+    model.save(model_dir+'/speech_segment_model-final.h5')
+
+    y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
+    te_acc = compute_accuracy(te_y, y_pred)
+    print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
+
+
+
+if __name__ == '__main__':
+    train_segment('test')
diff --git a/speech_model.py b/speech_model.py
index f083360..ab9145c 100644
--- a/speech_model.py
+++ b/speech_model.py
@@ -3,7 +3,8 @@ from __future__ import print_function
 import numpy as np
 from speech_data import read_siamese_tfrecords_generator
 from keras.models import Model,load_model,model_from_yaml
-from keras.layers import Input, Dense, Dropout, LSTM, Lambda, Concatenate, Bidirectional
+from keras.layers import Input,Concatenate,Lambda, BatchNormalization, Dropout
+from keras.layers import Dense, LSTM, Bidirectional, GRU
 from keras.losses import categorical_crossentropy
 from keras.utils import to_categorical
 from keras.optimizers import RMSprop
diff --git a/speech_tts_queue.py b/speech_segmentgen.py
similarity index 100%
rename from speech_tts_queue.py
rename to speech_segmentgen.py