diff --git a/speech_data.py b/speech_data.py index 7eff4fb..183cc19 100644 --- a/speech_data.py +++ b/speech_data.py @@ -6,6 +6,7 @@ import tensorflow as tf from tensorflow.python.ops import data_flow_ops import numpy as np from speech_spectrum import generate_aiff_spectrogram +from speech_pitch import pitch_array from speech_pitch import compute_mfcc from sklearn.model_selection import train_test_split import itertools @@ -20,7 +21,7 @@ from tqdm import tqdm def siamese_pairs(rightGroup, wrongGroup): group1 = [r for (i, r) in rightGroup.iterrows()] group2 = [r for (i, r) in wrongGroup.iterrows()] - rightWrongPairs = [(g1, g2) for g2 in group2 for g1 in group1]#+[(g2, g1) for g2 in group2 for g1 in group1] + rightWrongPairs = [(g1, g2) for g2 in group2 for g1 in group1]+[(g2, g1) for g2 in group2 for g1 in group1] rightRightPairs = [i for i in itertools.permutations(group1, 2)]#+[i for i in itertools.combinations(group2, 2)] def filter_criteria(s1,s2): same = s1['variant'] == s2['variant'] @@ -36,7 +37,7 @@ def siamese_pairs(rightGroup, wrongGroup): random.shuffle(validRWPairs) random.shuffle(validRRPairs) # return rightRightPairs[:10],rightWrongPairs[:10] - return validRWPairs[:32],validRRPairs[:32] + return validRRPairs[:32],validRWPairs[:32] def _float_feature(value): @@ -64,7 +65,8 @@ def create_spectrogram_tfrecords(audio_group='audio',sample_count=0,train_test_r for (w, word_group) in word_group_prog: word_group_prog.set_postfix(word=w,sample_name=sample_name) g = word_group.reset_index() - g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram) + g['spectrogram'] = apply_by_multiprocessing(g['file_path'],pitch_array) + # g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram) # g['spectrogram'] = apply_by_multiprocessing(g['file_path'],compute_mfcc) sample_right = g.loc[g['variant'] == 'low'] sample_wrong = g.loc[g['variant'] == 'medium'] @@ -259,7 +261,7 @@ if __name__ == '__main__': # create_spectrogram_tfrecords('story_all',sample_count=25) # fix_csv('story_words_test') #fix_csv('audio') - create_spectrogram_tfrecords('story_words_test',sample_count=10,train_test_ratio=0.1) + create_spectrogram_tfrecords('story_words_pitch',sample_count=0,train_test_ratio=0.1) #record_generator_count() # create_spectrogram_tfrecords('audio',sample_count=50) # read_siamese_tfrecords_generator('audio') diff --git a/speech_model.py b/speech_model.py index 23168cb..949ff08 100644 --- a/speech_model.py +++ b/speech_model.py @@ -46,10 +46,12 @@ def dense_classifier(processed): conc_proc = Concatenate()(processed) d1 = Dense(64, activation='relu')(conc_proc) # dr1 = Dropout(0.1)(d1) + bn_d1 = BatchNormalization(momentum=0.98)(d1) # d2 = Dense(128, activation='relu')(d1) - d3 = Dense(8, activation='relu')(d1) + d3 = Dense(8, activation='relu')(bn_d1) + bn_d3 = BatchNormalization(momentum=0.98)(d3) # dr2 = Dropout(0.1)(d2) - return Dense(2, activation='softmax')(d3) + return Dense(2, activation='softmax')(bn_d3) def siamese_model(input_dim): base_network = create_base_rnn_network(input_dim) @@ -127,4 +129,4 @@ def train_siamese(audio_group = 'audio'): if __name__ == '__main__': - train_siamese('story_words_test') + train_siamese('story_words_pitch') diff --git a/speech_test.py b/speech_test.py index 4fcd2fc..e427e06 100644 --- a/speech_test.py +++ b/speech_test.py @@ -177,7 +177,7 @@ def visualize_results(audio_group='audio'): if __name__ == '__main__': # evaluate_siamese('./outputs/story_words_test.train.tfrecords',audio_group='story_words.gpu',weights ='siamese_speech_model-58-epoch-0.00-acc.h5') # evaluate_siamese('./outputs/story_words.test.tfrecords',audio_group='story_words',weights ='siamese_speech_model-675-epoch-0.00-acc.h5') - evaluate_siamese('./outputs/story_words_test.train.tfrecords',audio_group='story_words_test.10',weights ='siamese_speech_model-891-epoch-0.02-acc.h5') + evaluate_siamese('./outputs/story_words_pitch.test.tfrecords',audio_group='story_words_pitch',weights ='siamese_speech_model-867-epoch-0.12-acc.h5') # play_results('story_words') #inspect_tfrecord('./outputs/story_phrases.test.tfrecords',audio_group='story_phrases') # visualize_results('story_words.gpu')