# speech-scoring/speech_test.py
# (repository-viewer listing: 203 lines, 10 KiB, Python)

from speech_model import load_model_arch
from speech_tools import record_spectrogram, file_player, padd_zeros, pair_for_word
from speech_data import record_generator_count
# 2017-11-13 12:03:37 +00:00
# from importlib import reload
# 2017-11-03 08:49:19 +00:00
# import speech_data
# reload(speech_data)
import numpy as np
import pandas as pd
# 2017-11-13 12:03:37 +00:00
import os
import pickle
import tensorflow as tf
import csv
from tqdm import tqdm
# 2017-11-13 12:03:37 +00:00
from speech_data import padd_zeros
# 2017-11-17 06:27:38 +00:00
import seaborn as sns
# 2017-10-26 07:18:31 +00:00
def predict_recording_with(m, sample_size=15):
    """Record two clips and return the model's prediction for the pair.

    Args:
        m: Object exposing ``predict``; it is called with a two-element list
            holding the first and second member of the prepared pair.
        sample_size: Forwarded unchanged to ``create_test_pair``.

    Returns:
        Whatever ``m.predict`` returns for the recorded pair.
    """
    # NOTE(review): ``create_test_pair`` is not imported in the visible part
    # of this module -- confirm where it is supposed to come from.
    first_take = record_spectrogram(n_sec=1.4)
    second_take = record_spectrogram(n_sec=1.4)
    pair = create_test_pair(first_take, second_take, sample_size)
    left, right = pair[:, 0], pair[:, 1]
    return m.predict([left, right])
def predict_tts_sample(sample_word = 'able',audio_group='story_words',weights = 'siamese_speech_model-153-epoch-0.55-acc.h5'):
    """Classify the spectrogram pair produced for ``sample_word``.

    Loads the model architecture, weights and pickled constants stored under
    ``./models/<audio_group>/`` and runs a single prediction on the pair
    returned by ``pair_for_word``.

    Args:
        sample_word: Word handed to ``pair_for_word``.
        audio_group: Sub-directory of ``./models`` holding the model files.
        weights: File name of the weights file inside that directory.

    Returns:
        ``'same'`` when the second output score is strictly greater than the
        first one, otherwise ``'diff'``.
    """
    model_dir = os.path.join('./models', audio_group)
    const_file = os.path.join(model_dir, 'constants.pkl')
    arch_file = os.path.join(model_dir, 'siamese_speech_model_arch.yaml')
    weight_file = os.path.join(model_dir, weights)

    # NOTE: pickle must only be used on trusted files; the constants file is
    # expected to hold a 3-tuple of which only the first entry is used here.
    with open(const_file, 'rb') as const_handle:
        sample_size, _n_features, _n_records = pickle.load(const_handle)

    model = load_model_arch(arch_file)
    model.load_weights(weight_file)

    spec1, spec2 = pair_for_word(sample_word)
    # Both spectrograms go through padd_zeros with the stored sample size
    # (presumably zero-padding to a common length -- confirm in speech_tools).
    p_spec1 = padd_zeros(spec1, sample_size)
    p_spec2 = padd_zeros(spec2, sample_size)

    # Batch of one pair; the model takes the two pair members as two inputs.
    inp = np.array([[p_spec1, p_spec2]])
    result = model.predict([inp[:, 0], inp[:, 1]])[0]
    return 'same' if result[0] < result[1] else 'diff'
# 2017-11-03 08:49:19 +00:00
def test_with(audio_group):
    """Print predicted class indices next to the expected labels.

    Args:
        audio_group: Passed unchanged to ``speech_data``.
    """
    # NOTE(review): neither ``speech_data`` (as a callable) nor ``model`` is
    # bound in the visible part of this module; as written this raises
    # NameError unless both are defined elsewhere -- confirm before use.
    X, Y = speech_data(audio_group)
    scores = model.predict([X[:, 0], X[:, 1]])
    # Index of the highest score per row, followed by the labels as int8.
    print(np.argmax(scores, axis=1))
    print(Y.astype(np.int8))
def evaluate_siamese(records_file,audio_group='audio',weights = 'siamese_speech_model-final.h5'):
    """Evaluate a saved siamese model on every pair stored in a TFRecord file.

    For each record the two spectrograms are rebuilt, passed through
    ``padd_zeros`` and fed to the model; the prediction is thresholded at 0.5
    and compared with the stored ``output`` feature. Counters are split by
    whether ``variant1`` equals ``variant2``. A summary is printed and the
    per-record results are written to ``./outputs/<audio_group>.results.csv``.

    Args:
        records_file: Path handed to ``record_generator_count``.
        audio_group: Sub-directory of ``./models`` holding the model files;
            also used as the prefix of the results CSV.
        weights: File name of the weights file inside the model directory.

    Returns:
        None. Results are reported through stdout and the CSV file.
    """
    # audio_group='audio';model_file = 'siamese_speech_model-305-epoch-0.20-acc.h5'
    # records_file = os.path.join('./outputs',eval_group+'.train.tfrecords')
    model_dir = os.path.join('./models', audio_group)
    const_file = os.path.join(model_dir, 'constants.pkl')
    arch_file = os.path.join(model_dir, 'siamese_speech_model_arch.yaml')
    weight_file = os.path.join(model_dir, weights)

    # NOTE: pickle must only be used on trusted files. Only the first of the
    # three stored constants is needed in this function.
    with open(const_file, 'rb') as const_handle:
        n_spec, _n_features, _n_records = pickle.load(const_handle)

    print('evaluating {}...'.format(records_file))
    model = load_model_arch(arch_file)
    # model = siamese_model((n_spec, n_features))
    model.load_weights(weight_file)
    record_iterator, records_count = record_generator_count(records_file)

    def _percentage(hits, misses):
        # An empty category yields NaN instead of raising ZeroDivisionError,
        # so the summary and the CSV are still produced.
        attempts = hits + misses
        return hits * 100 / attempts if attempts else float('nan')

    text_keys = ("phoneme1", "phoneme2", "voice1", "voice2",
                 "variant1", "variant2", "file1", "file2")
    int_keys = ("rate1", "rate2")

    total = skipped = 0
    same_success = diff_success = same_failed = diff_failed = 0
    all_results = []
    for string_record in tqdm(record_iterator, total=records_count):
        total += 1
        example = tf.train.Example()
        example.ParseFromString(string_record)
        feats = example.features.feature

        spec_n1 = feats['spec_n1'].int64_list.value[0]
        spec_n2 = feats['spec_n2'].int64_list.value[0]
        # Pairs with more rows than the n_spec constant are not evaluated.
        if n_spec < spec_n1 or n_spec < spec_n2:
            skipped += 1
            continue

        spec_w1 = feats['spec_w1'].int64_list.value[0]
        spec_w2 = feats['spec_w2'].int64_list.value[0]
        # Spectrograms are stored flat; restore their (rows, width) shape.
        spec1 = np.array(feats['spec1'].float_list.value).reshape(spec_n1, spec_w1)
        spec2 = np.array(feats['spec2'].float_list.value).reshape(spec_n2, spec_w2)

        result = {key: feats[key].bytes_list.value[0].decode() for key in text_keys}
        for key in int_keys:
            result[key] = feats[key].int64_list.value[0]

        p_spec1, p_spec2 = padd_zeros(spec1, n_spec), padd_zeros(spec2, n_spec)
        input_arr = np.asarray([[p_spec1, p_spec2]])
        output_arr = np.asarray([feats['output'].int64_list.value])
        y_pred = model.predict([input_arr[:, 0], input_arr[:, 1]])
        predicted = np.asarray(y_pred[0] > 0.5).astype(output_arr.dtype)
        expected = output_arr[0]
        status = np.all(predicted == expected)

        result["expected"] = expected[0]
        result["predicted"] = y_pred[0][0]
        result["success"] = status
        all_results.append(result)

        same_variant = result["variant1"] == result["variant2"]
        if status and same_variant:
            same_success += 1
        elif status:
            diff_success += 1
        elif same_variant:
            same_failed += 1
        else:
            diff_failed += 1

    print('total-{},same_success-{},diff_success-{},skipped-{},same_failed-{},diff_failed-{}'.format(total,same_success,diff_success,skipped,same_failed,diff_failed))
    success = same_success + diff_success
    failure = same_failed + diff_failed
    print('accuracy-{:.3f}'.format(_percentage(success, failure)))
    print('same_accuracy-{:.3f}'.format(_percentage(same_success, same_failed)))
    print('diff_accuracy-{:.3f}'.format(_percentage(diff_success, diff_failed)))

    result_data = pd.DataFrame(all_results, columns=[
        "phoneme1", "phoneme2", "voice1", "voice2", "rate1", "rate2",
        "variant1", "variant2", "file1", "file2",
        "expected", "predicted", "success"])
    result_data.to_csv('./outputs/' + audio_group + '.results.csv')
# 2017-11-17 11:59:48 +00:00
def inspect_tfrecord(records_file,audio_group='audio'):
    """Dump the metadata of every pair in a TFRecord file to a CSV.

    Only scalar metadata is read (phonemes, voices, rates, variants, file
    names, spectrogram row counts and the expected output); the spectrogram
    data itself is not decoded. The table is written to
    ``./outputs/<audio_group>.pairs.csv``.

    Args:
        records_file: Path handed to ``record_generator_count``.
        audio_group: Prefix of the CSV file written under ``./outputs``.

    Returns:
        None.
    """
    record_iterator, records_count = record_generator_count(records_file)

    text_keys = ("phoneme1", "phoneme2", "voice1", "voice2",
                 "variant1", "variant2", "file1", "file2")
    int_keys = ("rate1", "rate2", "spec_n1", "spec_n2")

    all_results = []
    for string_record in tqdm(record_iterator, total=records_count):
        example = tf.train.Example()
        example.ParseFromString(string_record)
        feats = example.features.feature

        result = {key: feats[key].bytes_list.value[0].decode() for key in text_keys}
        for key in int_keys:
            result[key] = feats[key].int64_list.value[0]
        # 'output' is stored as an int64 list; its first entry is kept.
        output_arr = np.asarray([feats['output'].int64_list.value])
        result["expected"] = output_arr[0][0]
        all_results.append(result)

    result_data = pd.DataFrame(all_results, columns=[
        "phoneme1", "phoneme2", "voice1", "voice2", "rate1", "rate2",
        "spec_n1", "spec_n2", "variant1", "variant2", "file1", "file2",
        "expected"])
    result_data.to_csv('./outputs/' + audio_group + '.pairs.csv')
# 2017-11-13 12:03:37 +00:00
def play_results(audio_group='audio'):
    """Interactively play back the audio pairs listed in a results CSV.

    Reads ``./outputs/<audio_group>.results.csv`` and, for each row, prints a
    tab-separated header and the row's metadata, plays ``file1`` then
    ``file2`` from ``./outputs/<audio_group>/`` and prompts the user:
    ``r`` replays the pair, ``q`` stops the session, anything else moves on
    to the next row.

    Args:
        audio_group: Prefix of the results CSV and name of the audio
            sub-directory under ``./outputs``.

    Returns:
        None.
    """
    result_data = pd.read_csv('./outputs/' + audio_group + '.results.csv')
    play_file, close_player = file_player()
    keys = ["phoneme1","phoneme2","voice1","voice2","rate1","rate2","variant1","variant2"]
    h_str = '\t'.join(keys)
    try:
        for _, r in result_data.iterrows():
            row_str = '\t'.join(str(r[k]) for k in keys)
            while True:
                print(h_str)
                print(row_str)
                play_file('./outputs/'+audio_group+'/'+r['file1'],True)
                play_file('./outputs/'+audio_group+'/'+r['file2'],True)
                answer = input("press 'r/q/[Enter]' to replay/quit/continue:\t")
                if answer != 'r':
                    break
            if answer == 'q':
                break
    finally:
        # Close the player even when playback, input or Ctrl-C raises.
        close_player()
# 2017-11-13 12:03:37 +00:00
def visualize_results(audio_group='audio'):
    """Summarise the pairs CSV and the results CSV of an audio group.

    Reads ``./outputs/<audio_group>.pairs.csv`` and
    ``./outputs/<audio_group>.results.csv`` (first column as index) and
    computes the per-voice-pair counts of both tables, a description of the
    object-typed result columns, and the failed rows split by whether
    ``variant1`` equals ``variant2``.

    Args:
        audio_group: Prefix of the two CSV files under ``./outputs``. The
            value passed by the caller is honoured; it is no longer
            overwritten by a hard-coded group name.

    Returns:
        dict with the keys ``'pair_counts'``, ``'result_counts'``,
        ``'description'``, ``'same_failed'`` and ``'diff_failed'``.
    """
    # %matplotlib inline
    source = pd.read_csv('./outputs/'+audio_group+'.pairs.csv',index_col=0)
    pair_counts = source.groupby(['voice1','voice2']).size()
    result = pd.read_csv('./outputs/' + audio_group + '.results.csv',index_col=0)
    # result.groupby('success').size().plot(kind='bar')
    description = result.describe(include=['object'])
    # Element-wise comparison on the column, hence '== False' rather than 'not'.
    failed = result[result['success'] == False]  # noqa: E712
    same_failed = failed[failed['variant1'] == failed['variant2']]
    diff_failed = failed[failed['variant1'] != failed['variant2']]
    result_counts = result.groupby(['voice1','voice2']).size()
    return {
        'pair_counts': pair_counts,
        'result_counts': result_counts,
        'description': description,
        'same_failed': same_failed,
        'diff_failed': diff_failed,
    }
if __name__ == '__main__':
    # Script entry point: evaluate the 'story_words' model on its test
    # records. Earlier experiments are kept below, commented out.
    # evaluate_siamese('./outputs/story_words_test.train.tfrecords',audio_group='story_words.gpu',weights ='siamese_speech_model-58-epoch-0.00-acc.h5')
    # evaluate_siamese('./outputs/story_words.test.tfrecords',audio_group='story_words',weights ='siamese_speech_model-675-epoch-0.00-acc.h5')
    evaluate_siamese(
        './outputs/story_words.test.tfrecords',
        audio_group='story_words',
        weights='siamese_speech_model-153-epoch-0.55-acc.h5',
    )
    # play_results('story_words')
    # inspect_tfrecord('./outputs/story_phrases.test.tfrecords',audio_group='story_phrases')
    # visualize_results('story_words.gpu')
    # test_with('rand_edu')
    # sunflower_data,sunflower_result = get_word_pairs_data('sweater',15)
    # print(np.argmax(model.predict([sunflower_data[:, 0], sunflower_data[:, 1]]),axis=1))
    # print(sunflower_result)