diff --git a/requirements-linux.txt b/requirements-linux.txt
new file mode 100644
index 0000000..c533525
--- /dev/null
+++ b/requirements-linux.txt
@@ -0,0 +1,77 @@
+bleach==1.5.0
+click==6.7
+cloudpickle==0.4.1
+cycler==0.10.0
+dask==0.15.4
+decorator==4.1.2
+distributed==1.19.3
+entrypoints==0.2.3
+enum34==1.1.6
+futures==3.1.1
+h5py==2.7.1
+HeapDict==1.0.0
+html5lib==0.9999999
+ipykernel==4.6.1
+ipython==6.2.1
+ipython-genutils==0.2.0
+ipywidgets==7.0.3
+jedi==0.11.0
+Jinja2==2.9.6
+jsonschema==2.6.0
+jupyter==1.0.0
+jupyter-client==5.1.0
+jupyter-console==5.2.0
+jupyter-core==4.3.0
+Keras==2.0.8
+locket==0.2.0
+Markdown==2.6.9
+MarkupSafe==1.0
+matplotlib==2.1.0
+mistune==0.7.4
+msgpack-python==0.4.8
+nbconvert==5.3.1
+nbformat==4.4.0
+notebook==5.2.0
+numexpr==2.6.4
+numpy==1.13.3
+pandas==0.20.3
+pandocfilters==1.4.2
+parso==0.1.0
+partd==0.3.8
+pexpect==4.2.1
+pickleshare==0.7.4
+pkg-resources==0.0.0
+progressbar2==3.34.3
+prompt-toolkit==1.0.15
+protobuf==3.4.0
+psutil==5.4.0
+ptyprocess==0.5.2
+PyAudio==0.2.11
+Pygments==2.2.0
+pyparsing==2.2.0
+pysndfile==1.0.0
+python-dateutil==2.6.1
+python-utils==2.2.0
+pytz==2017.2
+PyYAML==3.12
+pyzmq==16.0.2
+qtconsole==4.3.1
+scikit-learn==0.19.0
+scipy==0.19.1
+simplegeneric==0.8.1
+six==1.11.0
+sortedcontainers==1.5.7
+tables==3.4.2
+tblib==1.3.2
+tensorflow==1.3.0
+tensorflow-tensorboard==0.4.0rc1
+terminado==0.6
+testpath==0.3.1
+toolz==0.8.2
+tornado==4.5.2
+tqdm==4.19.4
+traitlets==4.3.2
+wcwidth==0.1.7
+Werkzeug==0.12.2
+widgetsnbextension==3.0.6
+zict==0.1.3
diff --git a/speech_data.py b/speech_data.py
index c67b015..4a85b76 100644
--- a/speech_data.py
+++ b/speech_data.py
@@ -20,9 +20,10 @@ def siamese_pairs(rightGroup, wrongGroup):
     group1 = [r for (i, r) in rightGroup.iterrows()]
     group2 = [r for (i, r) in wrongGroup.iterrows()]
     rightWrongPairs = [(g1, g2) for g2 in group2 for g1 in group1]
-    rightRightPairs = [i for i in itertools.combinations(group1, 2)]
-    random.shuffle(rightWrongPairs)
-    random.shuffle(rightRightPairs)
+    rightRightPairs = [i for i in itertools.combinations(group1, 2)]#+[i for i in itertools.combinations(group2, 2)]
+    # random.shuffle(rightWrongPairs)
+    # random.shuffle(rightRightPairs)
+    # return rightRightPairs[:10],rightWrongPairs[:10]
     return rightRightPairs[:32],rightWrongPairs[:32]

@@ -45,8 +46,7 @@ def create_spectrogram_tfrecords(audio_group='audio',sample_count=0):
     n_records,n_spec,n_features = 0,0,0

     def write_samples(wg,sample_name):
-        wg_sampled = reservoir_sample(wg,sample_count) if sample_count > 0 else wg
-        word_group_prog = tqdm(wg_sampled,desc='Computing spectrogram')
+        word_group_prog = tqdm(wg,desc='Computing spectrogram')
         record_file = './outputs/{}.{}.tfrecords'.format(audio_group,sample_name)
         writer = tf.python_io.TFRecordWriter(record_file)
         for (w, word_group) in word_group_prog:
@@ -100,7 +100,8 @@ def create_spectrogram_tfrecords(audio_group='audio',sample_count=0):
         writer.close()

     word_groups = [i for i in audio_samples.groupby('word')]
-    tr_audio_samples,te_audio_samples = train_test_split(word_groups,test_size=0.1)
+    wg_sampled = reservoir_sample(word_groups,sample_count) if sample_count > 0 else word_groups
+    tr_audio_samples,te_audio_samples = train_test_split(wg_sampled,test_size=0.1)
     write_samples(tr_audio_samples,'train')
     write_samples(te_audio_samples,'test')
     const_file = os.path.join('./outputs',audio_group+'.constants')
@@ -124,7 +125,7 @@
             sample[j] = item # replace item with gradually decreasing probability
     return sample

-def read_siamese_tfrecords_generator(audio_group='audio',batch_size=32,sample_size=100):
+def read_siamese_tfrecords_generator(audio_group='audio',batch_size=32,test_size=100):
     records_file = os.path.join('./outputs',audio_group+'.train.tfrecords')
     input_pairs = []
     output_class = []
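The hunks above move the `reservoir_sample` call from `write_samples` up to the word-group level, so sub-sampling now happens once, before the train/test split, instead of separately per split. Since only fragments of the function appear as diff context, here is a minimal sketch of the Algorithm R-style reservoir sampling it implements, for readers unfamiliar with the technique; this is an illustration consistent with the visible lines, not necessarily the repository's exact body:

import random

def reservoir_sample(iterable, k):
    # Keep a uniform random sample of k items from a stream of unknown length.
    sample = []
    for (n, item) in enumerate(iterable):
        if n < k:
            sample.append(item)       # fill the reservoir with the first k items
        else:
            j = random.randint(0, n)  # uniform slot in [0, n], inclusive
            if j < k:
                sample[j] = item      # replace item with gradually decreasing probability
    return sample

Each item is kept with probability k/n once n items have been seen, which is why a single pass suffices even over a tf_record_iterator that cannot be rewound.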
@@ -160,13 +161,14 @@ def read_siamese_tfrecords_generator(audio_group='audio',batch_size=32,sample_si
     # Read test in one-shot
     te_records_file = os.path.join('./outputs',audio_group+'.test.tfrecords')
     te_re_iterator = tf.python_io.tf_record_iterator(path=records_file)
+    te_n_records = len([i for i in te_re_iterator])
+    te_re_iterator = tf.python_io.tf_record_iterator(path=records_file)
     print('reading tfrecords({}-test)...'.format(audio_group))
-    samples = min([sample_size,n_records])
-    # samples = n_records
-    input_data = np.zeros((samples,2,n_spec,n_features))
-    output_data = np.zeros((samples,2))
-    random_samples = enumerate(reservoir_sample(te_re_iterator,samples))
-    for (i,string_record) in tqdm(random_samples,total=samples):
+    test_size = min([test_size,te_n_records]) if test_size > 0 else te_n_records
+    input_data = np.zeros((test_size,2,n_spec,n_features))
+    output_data = np.zeros((test_size,2))
+    random_samples = enumerate(reservoir_sample(te_re_iterator,test_size))
+    for (i,string_record) in tqdm(random_samples,total=test_size):
         example = tf.train.Example()
         example.ParseFromString(string_record)
         spec_n1 = example.features.feature['spec_n1'].int64_list.value[0]
@@ -187,7 +189,7 @@ def audio_samples_word_count(audio_group='audio'):
     return len(audio_samples.groupby(audio_samples['word']))

 def fix_csv(audio_group='audio'):
-    audio_csv_lines = open('./outputs/' + audio_group + '.csv','r').readlines()
+    audio_csv_lines = open('./outputs/' + audio_group + '.csv.orig','r').readlines()
     audio_csv_data = [i.strip().split(',') for i in audio_csv_lines]
     proper_rows = [i for i in audio_csv_data if len(i) == 7]
     with open('./outputs/' + audio_group + '.csv','w') as fixed_csv:
@@ -220,10 +222,13 @@ if __name__ == '__main__':
     # read_siamese_tfrecords('story_all')
     # read_siamese_tfrecords('story_words_test')
     # padd_zeros_siamese_tfrecords('story_words')
-    # fix_csv()
+    # fix_csv('story_words')
     # pickle_constants('story_words')
     # create_spectrogram_tfrecords('audio',sample_count=100)
-    read_siamese_tfrecords_generator('audio')
+    # create_spectrogram_tfrecords('story_all',sample_count=25)
+    create_spectrogram_tfrecords('story_words',sample_count=10)
+    # create_spectrogram_tfrecords('audio',sample_count=50)
+    # read_siamese_tfrecords_generator('audio')
     # padd_zeros_siamese_tfrecords('audio')
     # create_padded_spectrogram()
     # create_speech_pairs_data()
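One likely oversight in the test-reading hunk above: both `tf_record_iterator` constructions point at `records_file` (the train records), while the freshly built `te_records_file` is never used within this hunk, so the "test" pass still counts and samples training examples. If that is unintended, the fix would presumably look like this (a hypothesis, not confirmed by the diff):

te_records_file = os.path.join('./outputs',audio_group+'.test.tfrecords')
te_re_iterator = tf.python_io.tf_record_iterator(path=te_records_file)  # iterate the test file...
te_n_records = len([i for i in te_re_iterator])                         # ...to count its records
te_re_iterator = tf.python_io.tf_record_iterator(path=te_records_file)  # re-open; the iterator is now exhausted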
diff --git a/speech_siamese.py b/speech_siamese.py
index 298def4..fb89e19 100644
--- a/speech_siamese.py
+++ b/speech_siamese.py
@@ -2,7 +2,7 @@ from __future__ import absolute_import
 from __future__ import print_function
 import numpy as np
 # from speech_data import speech_model_data
-from speech_data import read_siamese_tfrecords_oneshot,read_siamese_tfrecords_generator
+from speech_data import read_siamese_tfrecords_generator
 from keras.models import Model,load_model
 from keras.layers import Input, Dense, Dropout, LSTM, Lambda, Concatenate
 from keras.losses import categorical_crossentropy
@@ -14,42 +14,46 @@ from keras.callbacks import TensorBoard, ModelCheckpoint
 from keras import backend as K

 def create_dir(direc):
+    import os
     if not os.path.exists(direc):
         os.makedirs(direc)

-def euclidean_distance(vects):
-    x, y = vects
-    return K.sqrt(
-        K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))
-
-
-def eucl_dist_output_shape(shapes):
-    shape1, shape2 = shapes
-    return (shape1[0], 1)
-
-
-def contrastive_loss(y_true, y_pred):
-    '''Contrastive loss from Hadsell-et-al.'06
-    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
-    '''
-    return K.mean(y_true * K.square(y_pred) +
-                  (1 - y_true) * K.square(K.maximum(1 - y_pred, 0)))
+# def euclidean_distance(vects):
+#     x, y = vects
+#     return K.sqrt(
+#         K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))
+#
+#
+# def eucl_dist_output_shape(shapes):
+#     shape1, shape2 = shapes
+#     return (shape1[0], 1)
+#
+#
+# def contrastive_loss(y_true, y_pred):
+#     '''Contrastive loss from Hadsell-et-al.'06
+#     http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
+#     '''
+#     return K.mean(y_true * K.square(y_pred) +
+#                   (1 - y_true) * K.square(K.maximum(1 - y_pred, 0)))

 def create_base_rnn_network(input_dim):
     '''Base network to be shared (eq. to feature extraction).
     '''
     inp = Input(shape=input_dim)
-    ls1 = LSTM(256, return_sequences=True)(inp)
+    ls0 = LSTM(512, return_sequences=True)(inp)
+    ls1 = LSTM(256, return_sequences=True)(ls0)
     ls2 = LSTM(128, return_sequences=True)(ls1)
     # ls3 = LSTM(32, return_sequences=True)(ls2)
     ls4 = LSTM(64)(ls2)
+    d1 = Dense(128, activation='relu')(ls4)
+    d2 = Dense(64, activation='relu')(d1)
     return Model(inp, ls4)


 def compute_accuracy(y_true, y_pred):
     '''Compute classification accuracy with a fixed threshold on distances.
     '''
-    pred = y_pred.ravel() < 0.5
+    pred = y_pred.ravel() > 0.5
     return np.mean(pred == y_true)
@@ -60,11 +64,12 @@ def accuracy(y_true, y_pred):

 def dense_classifier(processed):
     conc_proc = Concatenate()(processed)
-    d1 = Dense(16, activation='relu')(conc_proc)
+    d1 = Dense(64, activation='relu')(conc_proc)
     # dr1 = Dropout(0.1)(d1)
-    d2 = Dense(8, activation='relu')(d1)
+    d2 = Dense(128, activation='relu')(d1)
+    d3 = Dense(8, activation='relu')(d2)
     # dr2 = Dropout(0.1)(d2)
-    return Dense(2, activation='softmax')(d2)
+    return Dense(2, activation='softmax')(d3)

 def siamese_model(input_dim):
     # input_dim = (15, 1654)
@@ -85,10 +90,10 @@ def siamese_model(input_dim):
 def train_siamese(audio_group = 'audio'):
     # the data, shuffled and split between train and test sets
     # tr_pairs, te_pairs, tr_y_e, te_y_e = speech_model_data()
-    batch_size = 512
+    batch_size = 128
     model_dir = './models/'+audio_group
     create_dir(model_dir)
-    tr_gen_fn,te_pairs,te_y,n_step,n_features,n_records = read_siamese_tfrecords_generator(audio_group,batch_size,300)
+    tr_gen_fn,te_pairs,te_y,n_step,n_features,n_records = read_siamese_tfrecords_generator(audio_group,batch_size,256)
     tr_gen = tr_gen_fn()
     # tr_y = to_categorical(tr_y_e, num_classes=2)
     # te_y = to_categorical(te_y_e, num_classes=2)
@@ -113,12 +118,12 @@ def train_siamese(audio_group = 'audio'):
         cp_file_fmt,
         monitor='val_loss',
         verbose=0,
-        save_best_only=False,
-        save_weights_only=False,
+        save_best_only=True,
+        save_weights_only=True,
         mode='auto',
         period=1)
     # train
-    rms = RMSprop(lr=0.001)
+    rms = RMSprop()#lr=0.001
     model.compile(loss=categorical_crossentropy, optimizer=rms, metrics=[accuracy])
     # model.fit(
     #     [tr_pairs[:, 0], tr_pairs[:, 1]],
@@ -128,11 +133,11 @@ def train_siamese(audio_group = 'audio'):
     #     validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),
     #     callbacks=[tb_cb, cp_cb])
     model.fit_generator(tr_gen
-        ,epochs=100
+        ,epochs=1000
         ,steps_per_epoch=n_records//batch_size
         ,validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y)
         ,use_multiprocessing=True)
-
+        # ,callbacks=[tb_cb, cp_cb])
     model.save(model_dir+'/siamese_speech_model-final.h5')
     # compute final accuracy on training and test sets
     # y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
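Two reviewer notes on speech_siamese.py as patched above. First, the new `d1`/`d2` Dense layers in `create_base_rnn_network` are dangling: the returned `Model(inp, ls4)` still ends at `ls4`, so Keras silently drops them from the graph. If the intent was to fold the dense projection into the shared encoder, the return value would need to change; a sketch under that assumption, using the imports already at the top of the file:

def create_base_rnn_network(input_dim):
    '''Shared feature extractor, with the dense projection actually attached.'''
    inp = Input(shape=input_dim)
    ls0 = LSTM(512, return_sequences=True)(inp)
    ls1 = LSTM(256, return_sequences=True)(ls0)
    ls2 = LSTM(128, return_sequences=True)(ls1)
    ls4 = LSTM(64)(ls2)
    d1 = Dense(128, activation='relu')(ls4)
    d2 = Dense(64, activation='relu')(d1)
    return Model(inp, d2)  # end the encoder at d2 so the new Dense layers train

Second, `cp_cb` is now configured with save_best_only=True and save_weights_only=True, but the callbacks argument to fit_generator remains commented out, so no checkpoints are written during the 1000-epoch run; only the final model.save() fires. Note also that once save_weights_only=True takes effect, the checkpoint files contain weights only, so restoring one means rebuilding the architecture and calling load_weights rather than load_model. A sketch, assuming the same input shape train_siamese uses and a hypothetical checkpoint filename:

model = siamese_model((n_step, n_features))            # rebuild the exact architecture
model.load_weights(model_dir + '/some-checkpoint.h5')  # hypothetical filename from cp_file_fmt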
@@ -146,4 +151,5 @@

 if __name__ == '__main__':
-    train_siamese()
+    train_siamese('story_words')
+    # train_siamese('audio')
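For orientation, the two entry points as committed now chain together roughly as follows; a minimal run-through, assuming `./outputs/story_words.csv` and the audio it references already exist:

from speech_data import create_spectrogram_tfrecords
from speech_siamese import train_siamese

# Reservoir-sample 10 word groups, split them 90/10, and write
# ./outputs/story_words.{train,test}.tfrecords plus story_words.constants.
create_spectrogram_tfrecords('story_words', sample_count=10)

# Stream the train records in batches of 128, hold out up to 256 test
# pairs for validation, and train the siamese LSTM for 1000 epochs.
train_siamese('story_words')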