From e9b18921eeaa06ce8a41ad5e52cf05f31de2cdf4 Mon Sep 17 00:00:00 2001
From: Malar Kannan
Date: Fri, 10 Nov 2017 14:07:31 +0530
Subject: [PATCH] Implement word-level train/test split; return one-shot validation data from the generator

---
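Notes (ignored by git am, not part of the commit message):

The split below is taken over pandas groupby groups rather than over individual
rows, so every recording (and therefore every siamese pair) of a given word
lands entirely on one side of the split, and test words are never seen during
training. A minimal, self-contained sketch of that idea, assuming
train_test_split is scikit-learn's (the patch calls it inside speech_data.py;
the import itself is outside these hunks) and using made-up toy data:

    import pandas as pd
    from sklearn.model_selection import train_test_split

    # Toy frame standing in for outputs/audio.csv (hypothetical data).
    df = pd.DataFrame({'word':    ['cat', 'cat', 'dog', 'dog', 'owl', 'owl'],
                       'variant': ['low', 'medium'] * 3})

    # One (word, sub-frame) tuple per word; splitting the tuples keeps all
    # rows for a word together on one side of the split.
    word_groups = [g for g in df.groupby('word')]
    train_groups, test_groups = train_test_split(word_groups, test_size=0.1)

With test_size=0.1, roughly one word in ten (not one row in ten) ends up in
the test set, which is what makes the held-out evaluation word-level.
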
 speech_data.py    | 206 +++++++++++++++++++++++++++++++---------------
 speech_siamese.py |  15 ++--
 2 files changed, 149 insertions(+), 72 deletions(-)

diff --git a/speech_data.py b/speech_data.py
index 2730bda..2659eb7 100644
--- a/speech_data.py
+++ b/speech_data.py
@@ -35,72 +35,74 @@ def _int64_feature(value):
 
 def _bytes_feature(value):
     return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))
 
-def create_spectrogram_tfrecords(audio_group='audio'):
+def create_spectrogram_tfrecords(audio_group='audio',sample_count=0):
     '''
    http://warmspringwinds.github.io/tensorflow/tf-slim/2016/12/21/tfrecords-guide/
    http://www.machinelearninguru.com/deep_learning/tensorflow/basics/tfrecord/tfrecord.html
     '''
-    audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv'
-                               , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file']
-                               , quoting=csv.QUOTE_NONE)
+    audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv',index_col=0)
     audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
-    audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists)
-    audio_samples = audio_samples[audio_samples['file_exists'] == True].reset_index()
+    n_records,n_spec,n_features = 0,0,0
 
-    n_records = n_spec = n_features = 0
+    def write_samples(wg,sample_name):
+        wg_sampled = reservoir_sample(wg,sample_count) if sample_count > 0 else wg
+        word_group_prog = tqdm(wg_sampled,desc='Computing spectrogram')
+        record_file = './outputs/{}.{}.tfrecords'.format(audio_group,sample_name)
+        writer = tf.python_io.TFRecordWriter(record_file)
+        for (w, word_group) in word_group_prog:
+            word_group_prog.set_postfix(word=w,sample_name=sample_name)
+            g = word_group.reset_index()
+            g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram)
+            sample_right = g.loc[g['variant'] == 'low']
+            sample_wrong = g.loc[g['variant'] == 'medium']
+            same, diff = siamese_pairs(sample_right, sample_wrong)
+            groups = [([0,1],same),([1,0],diff)]
+            for (output,group) in groups:
+                group_prog = tqdm(group,desc='Writing Spectrogram')
+                for sample1,sample2 in group_prog:
+                    group_prog.set_postfix(output=output
+                                          ,var1=sample1['variant']
+                                          ,var2=sample2['variant'])
+                    spectro1,spectro2 = sample1['spectrogram'],sample2['spectrogram']
+                    spec_n1,spec_n2 = spectro1.shape[0],spectro2.shape[0]
+                    spec_w1,spec_w2 = spectro1.shape[1],spectro2.shape[1]
+                    spec1,spec2 = spectro1.reshape(-1),spectro2.reshape(-1)
+                    nonlocal n_spec,n_records,n_features
+                    n_spec = max([n_spec,spec_n1,spec_n2])
+                    n_features = spec_w1
+                    n_records+=1
+                    example = tf.train.Example(features=tf.train.Features(
+                        feature={
+                            'word': _bytes_feature([w.encode('utf-8')]),
+                            'phoneme1': _bytes_feature([sample1['phonemes'].encode('utf-8')]),
+                            'phoneme2': _bytes_feature([sample2['phonemes'].encode('utf-8')]),
+                            'voice1': _bytes_feature([sample1['voice'].encode('utf-8')]),
+                            'voice2': _bytes_feature([sample2['voice'].encode('utf-8')]),
+                            'language': _bytes_feature([sample1['language'].encode('utf-8')]),
+                            'rate1':_int64_feature([sample1['rate']]),
+                            'rate2':_int64_feature([sample2['rate']]),
+                            'variant1': _bytes_feature([sample1['variant'].encode('utf-8')]),
+                            'variant2': _bytes_feature([sample2['variant'].encode('utf-8')]),
+                            'file1': _bytes_feature([sample1['file'].encode('utf-8')]),
+                            'file2': _bytes_feature([sample2['file'].encode('utf-8')]),
+                            'spec1':_float_feature(spec1),
+                            'spec2':_float_feature(spec2),
+                            'spec_n1':_int64_feature([spec_n1]),
+                            'spec_w1':_int64_feature([spec_w1]),
+                            'spec_n2':_int64_feature([spec_n2]),
+                            'spec_w2':_int64_feature([spec_w2]),
+                            'output':_int64_feature(output)
+                        }
+                    ))
+                    writer.write(example.SerializeToString())
+                group_prog.close()
+        word_group_prog.close()
+        writer.close()
-    writer = tf.python_io.TFRecordWriter('./outputs/' + audio_group + '.tfrecords')
-    prog = tqdm(audio_samples.groupby(audio_samples['word']),desc='Computing spectrogram')
-    for (w, word_group) in prog:
-        prog.set_postfix(word=w)
-        g = word_group.reset_index()
-        g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram)
-        sample_right = g.loc[g['variant'] == 'low']
-        sample_wrong = g.loc[g['variant'] == 'medium']
-        same, diff = siamese_pairs(sample_right, sample_wrong)
-        groups = [([0,1],same),([1,0],diff)]
-        for (output,group) in groups:
-            group_prog = tqdm(group,desc='Writing Spectrogram')
-            for sample1,sample2 in group_prog:
-                group_prog.set_postfix(output=output
-                                      ,var1=sample1['variant']
-                                      ,var2=sample2['variant'])
-                spectro1,spectro2 = sample1['spectrogram'],sample2['spectrogram']
-                spec_n1,spec_n2 = spectro1.shape[0],spectro2.shape[0]
-                spec_w1,spec_w2 = spectro1.shape[1],spectro2.shape[1]
-                spec1,spec2 = spectro1.reshape(-1),spectro2.reshape(-1)
-
-                n_spec = max([n_spec,spec_n1,spec_n2])
-                n_features = spec_w1
-                n_records+=1
-
-                example = tf.train.Example(features=tf.train.Features(
-                    feature={
-                        'word': _bytes_feature([w.encode('utf-8')]),
-                        'phoneme1': _bytes_feature([sample1['phonemes'].encode('utf-8')]),
-                        'phoneme2': _bytes_feature([sample2['phonemes'].encode('utf-8')]),
-                        'voice1': _bytes_feature([sample1['voice'].encode('utf-8')]),
-                        'voice2': _bytes_feature([sample2['voice'].encode('utf-8')]),
-                        'language': _bytes_feature([sample1['language'].encode('utf-8')]),
-                        'rate1':_int64_feature([sample1['rate']]),
-                        'rate2':_int64_feature([sample2['rate']]),
-                        'variant1': _bytes_feature([sample1['variant'].encode('utf-8')]),
-                        'variant2': _bytes_feature([sample2['variant'].encode('utf-8')]),
-                        'file1': _bytes_feature([sample1['file'].encode('utf-8')]),
-                        'file2': _bytes_feature([sample2['file'].encode('utf-8')]),
-                        'spec1':_float_feature(spec1),
-                        'spec2':_float_feature(spec2),
-                        'spec_n1':_int64_feature([spec_n1]),
-                        'spec_w1':_int64_feature([spec_w1]),
-                        'spec_n2':_int64_feature([spec_n2]),
-                        'spec_w2':_int64_feature([spec_w2]),
-                        'output':_int64_feature(output)
-                    }
-                ))
-                writer.write(example.SerializeToString())
-            group_prog.close()
-    prog.close()
-    writer.close()
+    word_groups = [i for i in audio_samples.groupby('word')]
+    tr_audio_samples,te_audio_samples = train_test_split(word_groups,test_size=0.1)
+    write_samples(tr_audio_samples,'train')
+    write_samples(te_audio_samples,'test')
 
     const_file = os.path.join('./outputs',audio_group+'.constants')
     pickle.dump((n_spec,n_features,n_records),open(const_file,'wb'))
@@ -196,12 +198,15 @@ def read_siamese_tfrecords_oneshot(audio_group='audio',sample_size=3000):
     output_data = np.zeros((samples,2))
     random_samples = enumerate(reservoir_sample(record_iterator,samples))
     for (i,string_record) in tqdm(random_samples,total=samples):
-        # if i == samples:
-        #     break
         example = tf.train.Example()
         example.ParseFromString(string_record)
-        spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(n_spec,n_features)
-        spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(n_spec,n_features)
+        spec_n1 = example.features.feature['spec_n1'].int64_list.value[0]
+        spec_n2 = example.features.feature['spec_n2'].int64_list.value[0]
+        spec_w1 = example.features.feature['spec_w1'].int64_list.value[0]
+        spec_w2 = example.features.feature['spec_w2'].int64_list.value[0]
+        spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1)
+        spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2)
+        p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec)
-        input_data[i] = np.asarray([spec1,spec2])
+        # use the zero-padded spectrograms so every sample matches (n_spec,n_features)
+        input_data[i] = np.asarray([p_spec1,p_spec2])
         output = example.features.feature['output'].int64_list.value
         output_data[i] = np.asarray(output)
@@ -210,7 +215,65 @@
     # result = (tr_pairs,te_pairs,tr_y,te_y,n_spec,n_features)
     return input_data,output_data
 
-def read_siamese_tfrecords_generator(audio_group='audio',batch_size=32):
+def read_siamese_tfrecords_generator(audio_group='audio',batch_size=32,sample_size=100):
+    records_file = os.path.join('./outputs',audio_group+'.train.tfrecords')
+    input_pairs = []
+    output_class = []
+    const_file = os.path.join('./outputs',audio_group+'.constants')
+    (n_spec,n_features,n_records) = pickle.load(open(const_file,'rb'))
+    print('reading tfrecords({}-train)...'.format(audio_group))
+    def record_generator():
+        input_data = []
+        output_data = []
+        while True:
+            record_iterator = tf.python_io.tf_record_iterator(path=records_file)
+            #tqdm(enumerate(record_iterator),total=n_records)
+            for (i,string_record) in enumerate(record_iterator):
+                example = tf.train.Example()
+                example.ParseFromString(string_record)
+                spec_n1 = example.features.feature['spec_n1'].int64_list.value[0]
+                spec_n2 = example.features.feature['spec_n2'].int64_list.value[0]
+                spec_w1 = example.features.feature['spec_w1'].int64_list.value[0]
+                spec_w2 = example.features.feature['spec_w2'].int64_list.value[0]
+                spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1)
+                spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2)
+                p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec)
+                input_data.append(np.asarray([p_spec1,p_spec2]))
+                output = example.features.feature['output'].int64_list.value
+                output_data.append(np.asarray(output))
+                if len(input_data) == batch_size:
+                    input_arr = np.asarray(input_data)
+                    output_arr = np.asarray(output_data)
+                    yield ([input_arr[:, 0], input_arr[:, 1]],output_arr)
+                    input_data = []
+                    output_data = []
+
+    # Read test in one-shot
+    te_records_file = os.path.join('./outputs',audio_group+'.test.tfrecords')
+    te_re_iterator = tf.python_io.tf_record_iterator(path=te_records_file)
+    print('reading tfrecords({}-test)...'.format(audio_group))
+    samples = min([sample_size,n_records])
+    # samples = n_records
+    input_data = np.zeros((samples,2,n_spec,n_features))
+    output_data = np.zeros((samples,2))
+    random_samples = enumerate(reservoir_sample(te_re_iterator,samples))
+    for (i,string_record) in tqdm(random_samples,total=samples):
+        example = tf.train.Example()
+        example.ParseFromString(string_record)
+        spec_n1 = example.features.feature['spec_n1'].int64_list.value[0]
+        spec_n2 = example.features.feature['spec_n2'].int64_list.value[0]
+        spec_w1 = example.features.feature['spec_w1'].int64_list.value[0]
+        spec_w2 = example.features.feature['spec_w2'].int64_list.value[0]
+        spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1)
+        spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2)
+        p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec)
+        input_data[i] = np.asarray([p_spec1,p_spec2])
+        output = example.features.feature['output'].int64_list.value
+        output_data[i] = np.asarray(output)
+
+    return record_generator,input_data,output_data,n_spec,n_features,n_records
+
+def read_siamese_tfrecords_generator_old(audio_group='audio',batch_size=32):
     records_file = os.path.join('./outputs',audio_group+'_padded.tfrecords')
     input_pairs = []
     output_class = []
@@ -330,9 +393,16 @@ def fix_csv(audio_group='audio'):
     audio_csv_lines = open('./outputs/' + audio_group + '.csv','r').readlines()
     audio_csv_data = [i.strip().split(',') for i in audio_csv_lines]
     proper_rows = [i for i in audio_csv_data if len(i) == 7]
-    with open('./outputs/' + audio_group + '-new.csv','w') as fixed_csv:
+    with open('./outputs/' + audio_group + '.csv','w') as fixed_csv:
         fixed_csv_w = csv.writer(fixed_csv, quoting=csv.QUOTE_MINIMAL)
         fixed_csv_w.writerows(proper_rows)
+    audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv'
+                               , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file'])
+    audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
+    audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists)
+    audio_samples = audio_samples[audio_samples['file_exists'] == True]
+    audio_samples = audio_samples.drop(['file_path','file_exists'],axis=1).reset_index(drop=True)
+    audio_samples.to_csv('./outputs/' + audio_group + '.csv')
 
 def convert_old_audio():
     audio_samples = pd.read_csv( './outputs/audio.csv.old'
@@ -352,9 +422,11 @@ if __name__ == '__main__':
     # create_spectrogram_tfrecords('story_words_test')
     # read_siamese_tfrecords('story_all')
     # read_siamese_tfrecords('story_words_test')
-    padd_zeros_siamese_tfrecords('story_words')
+    # padd_zeros_siamese_tfrecords('story_words')
+    # fix_csv()
     # pickle_constants('story_words')
-    # create_spectrogram_tfrecords('audio')
+    # create_spectrogram_tfrecords('audio',sample_count=100)
+    read_siamese_tfrecords_generator('audio')
     # padd_zeros_siamese_tfrecords('audio')
     # create_padded_spectrogram()
     # create_speech_pairs_data()
diff --git a/speech_siamese.py b/speech_siamese.py
index 74d9566..298def4 100644
--- a/speech_siamese.py
+++ b/speech_siamese.py
@@ -13,6 +13,9 @@ from keras.optimizers import RMSprop
 from keras.callbacks import TensorBoard, ModelCheckpoint
 from keras import backend as K
 
+def create_dir(direc):
+    if not os.path.exists(direc):
+        os.makedirs(direc)
 
 def euclidean_distance(vects):
     x, y = vects
@@ -79,13 +82,14 @@
 
     return model
 
-def train_siamese():
+def train_siamese(audio_group = 'audio'):
     # the data, shuffled and split between train and test sets
     # tr_pairs, te_pairs, tr_y_e, te_y_e = speech_model_data()
     batch_size = 512
-    tr_gen_fn,n_step,n_features,n_records = read_siamese_tfrecords_generator('audio',batch_size)
+    model_dir = './models/'+audio_group
+    create_dir(model_dir)
+    tr_gen_fn,te_pairs,te_y,n_step,n_features,n_records = read_siamese_tfrecords_generator(audio_group,batch_size,300)
     tr_gen = tr_gen_fn()
-    (te_pairs,te_y) = read_siamese_tfrecords_oneshot('audio',1000)
     # tr_y = to_categorical(tr_y_e, num_classes=2)
     # te_y = to_categorical(te_y_e, num_classes=2)
     input_dim = (n_step, n_features)
@@ -102,7 +106,7 @@ def train_siamese():
         embeddings_freq=0,
         embeddings_layer_names=None,
         embeddings_metadata=None)
-    cp_file_fmt = './models/siamese_speech_model-{epoch:02d}-epoch-{val_loss:0.2f}\
+    cp_file_fmt = model_dir+'/siamese_speech_model-{epoch:02d}-epoch-{val_loss:0.2f}\
     -acc.h5'
 
     cp_cb = ModelCheckpoint(
@@ -126,9 +130,10 @@ def train_siamese():
     model.fit_generator(tr_gen
         ,epochs=100
         ,steps_per_epoch=n_records//batch_size
+        ,validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y)
        ,use_multiprocessing=True)
-    model.save('./models/siamese_speech_model-final.h5')
+    model.save(model_dir+'/siamese_speech_model-final.h5')
 
     # compute final accuracy on training and test sets
     # y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
     # tr_acc = compute_accuracy(tr_y, y_pred)
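
Notes on helpers used but not shown in these hunks: reservoir_sample and
padd_zeros are defined elsewhere in speech_data.py, and create_dir assumes os
is already imported in speech_siamese.py. The sketch below is only a plausible
reading of their contracts as used above (uniform sampling from an iterator of
unknown length, and zero-padding spectrograms to a fixed number of frames);
the bodies are illustrative assumptions, not the repository's implementations:

    import random
    import numpy as np

    def reservoir_sample(iterable, k):
        '''Uniformly sample k items from an iterable of unknown length
        (Algorithm R: keep the first k, then replace with decreasing odds).'''
        reservoir = []
        for i, item in enumerate(iterable):
            if i < k:
                reservoir.append(item)
            else:
                j = random.randint(0, i)  # inclusive upper bound
                if j < k:
                    reservoir[j] = item
        return reservoir

    def padd_zeros(spec, n_spec):
        '''Pad a (t, n_features) spectrogram with zero rows so t becomes n_spec,
        matching the fixed (n_spec, n_features) shape the model expects.'''
        return np.pad(spec, ((0, n_spec - spec.shape[0]), (0, 0)), mode='constant')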