from speech_model import load_model_arch
from speech_tools import record_spectrogram, file_player, pair_for_word
# padd_zeros comes from speech_data; the duplicate speech_tools import was
# shadowed by it in the original import order, so only this one is kept.
from speech_data import record_generator_count, padd_zeros
# from importlib import reload
# import speech_data
# reload(speech_data)
import numpy as np
import pandas as pd
import os
import pickle
import tensorflow as tf
from tqdm import tqdm
import seaborn as sns


def predict_recording_with(m, sample_size=15):
    """Record two 1.4s microphone samples and compare them with model `m`."""
    spec1 = record_spectrogram(n_sec=1.4)
    spec2 = record_spectrogram(n_sec=1.4)
    inp = create_test_pair(spec1, spec2, sample_size)
    return m.predict([inp[:, 0], inp[:, 1]])


def predict_tts_sample(sample_word='able', audio_group='story_words',
                       weights='siamese_speech_model-153-epoch-0.55-acc.h5'):
    """Compare two TTS renderings of `sample_word`; return 'same' or 'diff'."""
    # sample_word = 'able'; audio_group = 'story_words'; weights = 'siamese_speech_model-153-epoch-0.55-acc.h5'
    const_file = './models/' + audio_group + '/constants.pkl'
    arch_file = './models/' + audio_group + '/siamese_speech_model_arch.yaml'
    weight_file = './models/' + audio_group + '/' + weights
    (sample_size, n_features, n_records) = pickle.load(open(const_file, 'rb'))
    model = load_model_arch(arch_file)
    model.load_weights(weight_file)
    spec1, spec2 = pair_for_word(sample_word)
    p_spec1 = padd_zeros(spec1, sample_size)
    p_spec2 = padd_zeros(spec2, sample_size)
    inp = np.array([[p_spec1, p_spec2]])
    result = model.predict([inp[:, 0], inp[:, 1]])[0]
    return 'same' if result[0] < result[1] else 'diff'


def test_with(audio_group):
    # NOTE: leftover debug helper — it references a `speech_data` loader
    # function and a module-level `model`, neither of which is defined in this
    # file (speech_data is a module we import names from, not a callable).
    X, Y = speech_data(audio_group)
    print(np.argmax(model.predict([X[:, 0], X[:, 1]]), axis=1))
    print(Y.astype(np.int8))
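
# NOTE: `create_test_pair` is called by `predict_recording_with` above but is
# neither defined in this file nor imported. The sketch below mirrors the
# padding/stacking done in `predict_tts_sample` and is an assumption, not the
# original helper:
def create_test_pair(spec1, spec2, sample_size):
    # pad both spectrograms to `sample_size` rows and stack them as a batch
    # holding a single pair
    p_spec1 = padd_zeros(spec1, sample_size)
    p_spec2 = padd_zeros(spec2, sample_size)
    return np.array([[p_spec1, p_spec2]])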


def evaluate_siamese(records_file, audio_group='audio', weights='siamese_speech_model-final.h5'):
    """Run the siamese model over a tfrecords file and write per-pair results to CSV."""
    # audio_group = 'audio'; model_file = 'siamese_speech_model-305-epoch-0.20-acc.h5'
    # records_file = os.path.join('./outputs', eval_group + '.train.tfrecords')
    const_file = os.path.join('./models/' + audio_group + '/', 'constants.pkl')
    arch_file = './models/' + audio_group + '/siamese_speech_model_arch.yaml'
    weight_file = './models/' + audio_group + '/' + weights
    (n_spec, n_features, n_records) = pickle.load(open(const_file, 'rb'))
    print('evaluating {}...'.format(records_file))
    model = load_model_arch(arch_file)
    # model = siamese_model((n_spec, n_features))
    model.load_weights(weight_file)
    record_iterator, records_count = record_generator_count(records_file)
    total, same_success, diff_success, skipped, same_failed, diff_failed = 0, 0, 0, 0, 0, 0
    all_results = []
    for (i, string_record) in tqdm(enumerate(record_iterator), total=records_count):
        total += 1
        example = tf.train.Example()
        example.ParseFromString(string_record)
        spec_n1 = example.features.feature['spec_n1'].int64_list.value[0]
        spec_n2 = example.features.feature['spec_n2'].int64_list.value[0]
        # skip pairs whose spectrograms are longer than the model input
        if n_spec < spec_n1 or n_spec < spec_n2:
            skipped += 1
            continue
        spec_w1 = example.features.feature['spec_w1'].int64_list.value[0]
        spec_w2 = example.features.feature['spec_w2'].int64_list.value[0]
        spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1, spec_w1)
        spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2, spec_w2)
        word = example.features.feature['word'].bytes_list.value[0].decode()
        phoneme1 = example.features.feature['phoneme1'].bytes_list.value[0].decode()
        phoneme2 = example.features.feature['phoneme2'].bytes_list.value[0].decode()
        voice1 = example.features.feature['voice1'].bytes_list.value[0].decode()
        voice2 = example.features.feature['voice2'].bytes_list.value[0].decode()
        language = example.features.feature['language'].bytes_list.value[0].decode()
        rate1 = example.features.feature['rate1'].int64_list.value[0]
        rate2 = example.features.feature['rate2'].int64_list.value[0]
        variant1 = example.features.feature['variant1'].bytes_list.value[0].decode()
        variant2 = example.features.feature['variant2'].bytes_list.value[0].decode()
        file1 = example.features.feature['file1'].bytes_list.value[0].decode()
        file2 = example.features.feature['file2'].bytes_list.value[0].decode()
        p_spec1, p_spec2 = padd_zeros(spec1, n_spec), padd_zeros(spec2, n_spec)
        input_arr = np.asarray([[p_spec1, p_spec2]])
        output_arr = np.asarray([example.features.feature['output'].int64_list.value])
        y_pred = model.predict([input_arr[:, 0], input_arr[:, 1]])
        predicted = np.asarray(y_pred[0] > 0.5).astype(output_arr.dtype)
        expected = output_arr[0]
        status = np.all(predicted == expected)
        result = {"phoneme1": phoneme1, "phoneme2": phoneme2, "voice1": voice1,
                  "voice2": voice2, "rate1": rate1, "rate2": rate2,
                  "variant1": variant1, "variant2": variant2, "file1": file1,
                  "file2": file2, "expected": expected[0], "predicted": y_pred[0][0],
                  "success": status}
        all_results.append(result)
        # tally by whether the pair was the same variant or different variants
        if status:
            if variant1 == variant2:
                same_success += 1
            else:
                diff_success += 1
        else:
            if variant1 == variant2:
                same_failed += 1
            else:
                diff_failed += 1
    print('total-{},same_success-{},diff_success-{},skipped-{},same_failed-{},diff_failed-{}'
          .format(total, same_success, diff_success, skipped, same_failed, diff_failed))
    success = same_success + diff_success
    failure = same_failed + diff_failed
    print('accuracy-{:.3f}'.format(success * 100 / (success + failure)))
    print('same_accuracy-{:.3f}'.format(same_success * 100 / (same_success + same_failed)))
    print('diff_accuracy-{:.3f}'.format(diff_success * 100 / (diff_success + diff_failed)))
    result_data = pd.DataFrame(all_results, columns=[
        "phoneme1", "phoneme2", "voice1", "voice2", "rate1", "rate2",
        "variant1", "variant2", "file1", "file2", "expected", "predicted", "success"])
    result_data.to_csv('./outputs/' + audio_group + '.results.csv')


def inspect_tfrecord(records_file, audio_group='audio'):
    """Dump the metadata of every pair in a tfrecords file to a .pairs.csv."""
    record_iterator, records_count = record_generator_count(records_file)
    all_results = []
    for (i, string_record) in tqdm(enumerate(record_iterator), total=records_count):
        # string_record = next(record_iterator)
        example = tf.train.Example()
        example.ParseFromString(string_record)
        spec_n1 = example.features.feature['spec_n1'].int64_list.value[0]
        spec_n2 = example.features.feature['spec_n2'].int64_list.value[0]
        word = example.features.feature['word'].bytes_list.value[0].decode()
        phoneme1 = example.features.feature['phoneme1'].bytes_list.value[0].decode()
        phoneme2 = example.features.feature['phoneme2'].bytes_list.value[0].decode()
        voice1 = example.features.feature['voice1'].bytes_list.value[0].decode()
        voice2 = example.features.feature['voice2'].bytes_list.value[0].decode()
        language = example.features.feature['language'].bytes_list.value[0].decode()
        rate1 = example.features.feature['rate1'].int64_list.value[0]
        rate2 = example.features.feature['rate2'].int64_list.value[0]
        variant1 = example.features.feature['variant1'].bytes_list.value[0].decode()
        variant2 = example.features.feature['variant2'].bytes_list.value[0].decode()
        file1 = example.features.feature['file1'].bytes_list.value[0].decode()
        file2 = example.features.feature['file2'].bytes_list.value[0].decode()
        output_arr = np.asarray([example.features.feature['output'].int64_list.value])
        expected = output_arr[0]
        result = {"phoneme1": phoneme1, "phoneme2": phoneme2, "voice1": voice1,
                  "voice2": voice2, "rate1": rate1, "rate2": rate2, "spec_n1": spec_n1,
                  "spec_n2": spec_n2, "variant1": variant1, "variant2": variant2,
                  "file1": file1, "file2": file2, "expected": expected[0]}
        all_results.append(result)
    result_data = pd.DataFrame(all_results, columns=[
        "phoneme1", "phoneme2", "voice1", "voice2", "rate1", "rate2",
        "spec_n1", "spec_n2", "variant1", "variant2", "file1", "file2", "expected"])
    result_data.to_csv('./outputs/' + audio_group + '.pairs.csv')
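

# The field-by-field `example.features.feature[...]` access in the two
# functions above is verbose. A helper along these lines could collapse it
# (sketch only; the name `_feature` and its interface are mine, not part of
# the original code):
def _feature(example, key, kind='bytes'):
    feat = example.features.feature[key]
    if kind == 'int':
        return feat.int64_list.value[0]          # scalar int64 feature
    if kind == 'float':
        return np.array(feat.float_list.value)   # float vector feature
    return feat.bytes_list.value[0].decode()     # utf-8 string feature

# e.g. rate1 = _feature(example, 'rate1', 'int')
#      voice1 = _feature(example, 'voice1')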
,"file1":file1,"file2":file2,"expected":expected[0]} all_results.append(result) result_data = pd.DataFrame(all_results,columns=["phoneme1","phoneme2" ,"voice1","voice2","rate1","rate2","spec_n1","spec_n2","variant1","variant2","file1","file2", "expected"]) result_data.to_csv('./outputs/' + audio_group + '.pairs.csv') def play_results(audio_group='audio'): result_data = pd.read_csv('./outputs/' + audio_group + '.results.csv') play_file,close_player = file_player() quit = False for (i,r) in result_data.iterrows(): if quit: break keys = ["phoneme1","phoneme2","voice1","voice2","rate1","rate2","variant1","variant2"] row_vals = [str(r[k]) for k in keys] h_str = '\t'.join(keys) row_str = '\t'.join(row_vals) while True: print(h_str) print(row_str) play_file('./outputs/'+audio_group+'/'+r['file1'],True) play_file('./outputs/'+audio_group+'/'+r['file2'],True) a = input("press 'r/q/[Enter]' to replay/quit/continue:\t") if a == 'r': continue if a == 'q': quit = True break else: break close_player() def visualize_results(audio_group='audio'): # %matplotlib inline audio_group = 'story_phrases' source = pd.read_csv('./outputs/'+audio_group+'.pairs.csv',index_col=0) source.groupby(['voice1','voice2']).size() result = pd.read_csv('./outputs/' + audio_group + '.results.csv',index_col=0) # result.groupby('success').size().plot(kind='bar') result.describe(include=['object']) failed = result[result['success'] == False] same_failed = failed[failed['variant1'] == failed['variant2']] diff_failed = failed[failed['variant1'] != failed['variant2']] result.groupby(['voice1','voice2']).size() if __name__ == '__main__': # evaluate_siamese('./outputs/story_words_test.train.tfrecords',audio_group='story_words.gpu',weights ='siamese_speech_model-58-epoch-0.00-acc.h5') # evaluate_siamese('./outputs/story_words.test.tfrecords',audio_group='story_words',weights ='siamese_speech_model-675-epoch-0.00-acc.h5') evaluate_siamese('./outputs/story_words.test.tfrecords',audio_group='story_words',weights ='siamese_speech_model-153-epoch-0.55-acc.h5') # play_results('story_words') #inspect_tfrecord('./outputs/story_phrases.test.tfrecords',audio_group='story_phrases') # visualize_results('story_words.gpu') # test_with('rand_edu') # sunflower_data,sunflower_result = get_word_pairs_data('sweater',15) # print(np.argmax(model.predict([sunflower_data[:, 0], sunflower_data[:, 1]]),axis=1)) # print(sunflower_result)