diff --git a/speech_data.py b/speech_data.py
index d11c4ee..58430c1 100644
--- a/speech_data.py
+++ b/speech_data.py
@@ -6,7 +6,7 @@ from speech_utils import threadsafe_iter
 import tensorflow as tf
 from tensorflow.python.ops import data_flow_ops
 import numpy as np
-from spectro_gen import generate_aiff_spectrogram
+from speech_spectrum import generate_aiff_spectrogram
 from sklearn.model_selection import train_test_split
 import itertools
 import os
diff --git a/spectro_gen.py b/speech_spectrum.py
similarity index 100%
rename from spectro_gen.py
rename to speech_spectrum.py
diff --git a/record_mic_speech.py b/speech_tools.py
similarity index 61%
rename from record_mic_speech.py
rename to speech_tools.py
index 4ed11e2..fd48f34 100644
--- a/record_mic_speech.py
+++ b/speech_tools.py
@@ -1,15 +1,50 @@
 import pyaudio
+from pysndfile import sndio as snd
 import numpy as np
 # from matplotlib import pyplot as plt
-from spectro_gen import plot_stft, generate_spec_frec
+from speech_spectrum import plot_stft, generate_spec_frec
 
+SAMPLE_RATE = 22050
+N_CHANNELS = 2
+
+def file_player():
+    """Build a file player around one shared PyAudio instance.
+
+    Returns a ``(play_file, close_player)`` pair of closures.
+    """
+    p_oup = pyaudio.PyAudio()
+
+    def play_file(audiopath,plot=False):
+        """Play ``audiopath``; when ``plot`` is true, plot its STFT afterwards."""
+        print('playing',audiopath)
+        samples, samplerate, form = snd.read(audiopath)
+        stream = p_oup.open(
+            format=pyaudio.paFloat32,
+            channels=N_CHANNELS,
+            rate=samplerate,
+            output=True)
+        try:
+            # One copy of the samples per output channel, interleaved frame by frame
+            # (assumes ``samples`` is mono/1-D -- TODO confirm).
+            interleaved = np.asarray([samples] * N_CHANNELS).T.reshape(-1)
+            stream.write(interleaved.astype(np.float32).tobytes())
+        finally:
+            # Release the output stream even when the write fails.
+            stream.close()
+        if plot:
+            # Plot with the rate read from the file, not the recording constant.
+            plot_stft(samples, samplerate)
+
+    def close_player():
+        """Terminate the shared PyAudio instance."""
+        p_oup.terminate()
+
+    return play_file,close_player
 
 def record_spectrogram(n_sec, plot=False, playback=False):
-    SAMPLE_RATE = 22050
-    N_CHANNELS = 2
+    # show_record_prompt()
     N_SEC = n_sec
     CHUNKSIZE = int(SAMPLE_RATE * N_SEC / N_CHANNELS)  # fixed chunk size
-    # show_record_prompt()
     input('Press [Enter] to start recording sample... ')
     p_inp = pyaudio.PyAudio()
     stream = p_inp.open(
diff --git a/test_siamese.py b/test_siamese.py
index c228cec..0d7098e 100644
--- a/test_siamese.py
+++ b/test_siamese.py
@@ -1,9 +1,10 @@
-from speech_siamese import siamese_model
-from record_mic_speech import record_spectrogram
+# from speech_siamese import siamese_model
+from speech_tools import record_spectrogram, file_player
 # from importlib import reload
 # import speech_data
 # reload(speech_data)
 import numpy as np
+import pandas as pd
 import os
 import pickle
 import tensorflow as tf
@@ -25,7 +26,8 @@ def test_with(audio_group):
     print(np.argmax(model.predict([X[:, 0], X[:, 1]]),axis=1))
     print(Y.astype(np.int8))
 
-def evaluate_siamese(audio_group='audio',model_file = 'siamese_speech_model-46-epoch-0.29-acc.h5'):
+def evaluate_siamese(audio_group='audio',model_file = 'siamese_speech_model-305-epoch-0.20-acc.h5'):
+    # audio_group='audio';model_file = 'siamese_speech_model-305-epoch-0.20-acc.h5'
     records_file = os.path.join('./outputs',audio_group+'.train.tfrecords')
     const_file = os.path.join('./outputs',audio_group+'.constants')
     model_weights_path =os.path.join('./models/story_words/',model_file)
@@ -36,43 +38,74 @@ def evaluate_siamese(audio_group='audio',model_file = 'siamese_speech_model-46-e
     model.load_weights(model_weights_path)
     record_iterator = tf.python_io.tf_record_iterator(path=records_file)
     #tqdm(enumerate(record_iterator),total=n_records)
-    with open('./outputs/' + audio_group + '.results.csv','w') as result_csv:
-        result_csv_w = csv.writer(result_csv, quoting=csv.QUOTE_MINIMAL)
-        for (i,string_record) in enumerate(record_iterator):
-            # string_record = next(record_iterator)
-            example = tf.train.Example()
-            example.ParseFromString(string_record)
-            spec_n1 = example.features.feature['spec_n1'].int64_list.value[0]
-            spec_n2 = example.features.feature['spec_n2'].int64_list.value[0]
-            spec_w1 = example.features.feature['spec_w1'].int64_list.value[0]
-            spec_w2 = example.features.feature['spec_w2'].int64_list.value[0]
-            spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1)
-            spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2)
-            p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec)
-            input_arr = np.asarray([[p_spec1,p_spec2]])
-            output_arr = np.asarray([example.features.feature['output'].int64_list.value])
-            y_pred = model.predict([input_arr[:, 0], input_arr[:, 1]])
-            predicted = np.asarray(y_pred[0]>0.5).astype(output_arr.dtype)
-            expected = output_arr[0]
-            if np.all(predicted == expected):
-                continue
-            word = example.features.feature['word'].bytes_list.value[0].decode()
-            phoneme1 = example.features.feature['phoneme1'].bytes_list.value[0].decode()
-            phoneme2 = example.features.feature['phoneme2'].bytes_list.value[0].decode()
-            voice1 = example.features.feature['voice1'].bytes_list.value[0].decode()
-            voice2 = example.features.feature['voice2'].bytes_list.value[0].decode()
-            language = example.features.feature['language'].bytes_list.value[0].decode()
-            rate1 = example.features.feature['rate1'].int64_list.value[0]
-            rate2 = example.features.feature['rate2'].int64_list.value[0]
-            variant1 = example.features.feature['variant1'].bytes_list.value[0].decode()
-            variant2 = example.features.feature['variant2'].bytes_list.value[0].decode()
-            file1 = example.features.feature['file1'].bytes_list.value[0].decode()
-            file2 = example.features.feature['file2'].bytes_list.value[0].decode()
-            print(phoneme1,phoneme2,voice1,voice2,rate1,rate2,variant1,variant2,file1,file2)
-            result_csv_w.writerow([phoneme1,phoneme2,voice1,voice2,rate1,rate2,variant1,variant2,file1,file2])
+    result_csv = open('./outputs/' + audio_group + '.results.csv','w')
+    result_csv_w = csv.writer(result_csv, quoting=csv.QUOTE_MINIMAL)
+    result_csv_w.writerow(["phoneme1","phoneme2","voice1","voice2","rate1","rate2","variant1","variant2","file1","file2"])
+    for (i,string_record) in enumerate(record_iterator):
+        # string_record = next(record_iterator)
+        example = tf.train.Example()
+        example.ParseFromString(string_record)
+        spec_n1 = example.features.feature['spec_n1'].int64_list.value[0]
+        spec_n2 = example.features.feature['spec_n2'].int64_list.value[0]
+        spec_w1 = example.features.feature['spec_w1'].int64_list.value[0]
+        spec_w2 = example.features.feature['spec_w2'].int64_list.value[0]
+        spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1)
+        spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2)
+        p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec)
+        input_arr = np.asarray([[p_spec1,p_spec2]])
+        output_arr = np.asarray([example.features.feature['output'].int64_list.value])
+        y_pred = model.predict([input_arr[:, 0], input_arr[:, 1]])
+        predicted = np.asarray(y_pred[0]>0.5).astype(output_arr.dtype)
+        expected = output_arr[0]
+        if np.all(predicted == expected):
+            continue
+        word = example.features.feature['word'].bytes_list.value[0].decode()
+        phoneme1 = example.features.feature['phoneme1'].bytes_list.value[0].decode()
+        phoneme2 = example.features.feature['phoneme2'].bytes_list.value[0].decode()
+        voice1 = example.features.feature['voice1'].bytes_list.value[0].decode()
+        voice2 = example.features.feature['voice2'].bytes_list.value[0].decode()
+        language = example.features.feature['language'].bytes_list.value[0].decode()
+        rate1 = example.features.feature['rate1'].int64_list.value[0]
+        rate2 = example.features.feature['rate2'].int64_list.value[0]
+        variant1 = example.features.feature['variant1'].bytes_list.value[0].decode()
+        variant2 = example.features.feature['variant2'].bytes_list.value[0].decode()
+        file1 = example.features.feature['file1'].bytes_list.value[0].decode()
+        file2 = example.features.feature['file2'].bytes_list.value[0].decode()
+        print(phoneme1,phoneme2,voice1,voice2,rate1,rate2,variant1,variant2,file1,file2)
+        result_csv_w.writerow([phoneme1,phoneme2,voice1,voice2,rate1,rate2,variant1,variant2,file1,file2])
+    result_csv.close()
+
+
+def play_results(audio_group='audio'):
+    """Replay the audio pairs listed in ``./outputs/<audio_group>.results.csv``.
+
+    For every row: print the pair's metadata, play ``file1`` then ``file2``
+    (plotting each), then prompt -- 'r' replays the pair, 'q' stops, anything else moves on.
+    """
+    result_data = pd.read_csv('./outputs/' + audio_group + '.results.csv')
+    keys = ["phoneme1","phoneme2","voice1","voice2","rate1","rate2","variant1","variant2"]
+    h_str = '\t'.join(keys)
+    audio_dir = './outputs/'+audio_group+'/'
+    play_file,close_player = file_player()
+    try:
+        for (i,r) in result_data.iterrows():
+            row_str = '\t'.join(str(r[k]) for k in keys)
+            while True:
+                print(h_str)
+                print(row_str)
+                play_file(audio_dir+r['file1'],True)
+                play_file(audio_dir+r['file2'],True)
+                a = input("press 'r/q/[Enter]' to replay/quit/continue:\t")
+                if a != 'r':
+                    break
+            if a == 'q':
+                break
+    finally:
+        # Always terminate the player, even on Ctrl-C or a playback error.
+        close_player()
 
-
-evaluate_siamese('story_words',model_file='siamese_speech_model-92-epoch-0.20-acc.h5')
+# evaluate_siamese('story_words',model_file='siamese_speech_model-305-epoch-0.20-acc.h5')
+play_results('story_words')
 # test_with('rand_edu')
 # sunflower_data,sunflower_result = get_word_pairs_data('sweater',15)
 # print(np.argmax(model.predict([sunflower_data[:, 0], sunflower_data[:, 1]]),axis=1))