diff --git a/pandas_parallel.py b/pandas_parallel.py deleted file mode 100644 index 245da38..0000000 --- a/pandas_parallel.py +++ /dev/null @@ -1,25 +0,0 @@ -import multiprocessing -import pandas as pd -import numpy as np - - - -def _apply_df(args): - df, func, num, kwargs = args - return num, df.apply(func, **kwargs) - -def apply_by_multiprocessing(df,func,**kwargs): - cores = multiprocessing.cpu_count() - workers=kwargs.pop('workers') if 'workers' in kwargs else cores - pool = multiprocessing.Pool(processes=workers) - result = pool.map(_apply_df, [(d, func, i, kwargs) for i,d in enumerate(np.array_split(df, workers))]) - pool.close() - result=sorted(result,key=lambda x:x[0]) - return pd.concat([i[1] for i in result]) - -def square(x): - return x**x - -if __name__ == '__main__': - df = pd.DataFrame({'a':range(10), 'b':range(10)}) - apply_by_multiprocessing(df, square, axis=1, workers=4) diff --git a/record_mic_speech.py b/record_mic_speech.py deleted file mode 100644 index 4ed11e2..0000000 --- a/record_mic_speech.py +++ /dev/null @@ -1,42 +0,0 @@ -import pyaudio -import numpy as np -# from matplotlib import pyplot as plt -from spectro_gen import plot_stft, generate_spec_frec - - -def record_spectrogram(n_sec, plot=False, playback=False): - SAMPLE_RATE = 22050 - N_CHANNELS = 2 - N_SEC = n_sec - CHUNKSIZE = int(SAMPLE_RATE * N_SEC / N_CHANNELS) # fixed chunk size - # show_record_prompt() - input('Press [Enter] to start recording sample... ') - p_inp = pyaudio.PyAudio() - stream = p_inp.open( - format=pyaudio.paFloat32, - channels=N_CHANNELS, - rate=SAMPLE_RATE, - input=True, - frames_per_buffer=CHUNKSIZE) - data = stream.read(CHUNKSIZE) - numpydata = np.frombuffer(data, dtype=np.float32) - multi_channel = np.abs(np.reshape(numpydata, (-1, 2))).mean(axis=1) - one_channel = np.asarray([multi_channel, -1 * multi_channel]).T.reshape(-1) - mean_channel_data = one_channel.tobytes() - stream.stop_stream() - stream.close() - p_inp.terminate() - if plot: - plot_stft(one_channel, SAMPLE_RATE) - if playback: - p_oup = pyaudio.PyAudio() - stream = p_oup.open( - format=pyaudio.paFloat32, - channels=2, - rate=SAMPLE_RATE, - output=True) - stream.write(mean_channel_data) - stream.close() - p_oup.terminate() - ims, _ = generate_spec_frec(one_channel, SAMPLE_RATE) - return ims diff --git a/requirements-linux.txt b/requirements-linux.txt new file mode 100644 index 0000000..dba434a --- /dev/null +++ b/requirements-linux.txt @@ -0,0 +1,76 @@ +bleach==1.5.0 +click==6.7 +cloudpickle==0.4.1 +cycler==0.10.0 +dask==0.15.4 +decorator==4.1.2 +distributed==1.19.3 +entrypoints==0.2.3 +enum34==1.1.6 +futures==3.1.1 +h5py==2.7.1 +HeapDict==1.0.0 +html5lib==0.9999999 +ipykernel==4.6.1 +ipython==6.2.1 +ipython-genutils==0.2.0 +ipywidgets==7.0.3 +jedi==0.11.0 +Jinja2==2.9.6 +jsonschema==2.6.0 +jupyter==1.0.0 +jupyter-client==5.1.0 +jupyter-console==5.2.0 +jupyter-core==4.3.0 +Keras==2.0.8 +locket==0.2.0 +Markdown==2.6.9 +MarkupSafe==1.0 +matplotlib==2.1.0 +mistune==0.7.4 +msgpack-python==0.4.8 +nbconvert==5.3.1 +nbformat==4.4.0 +notebook==5.2.0 +numexpr==2.6.4 +numpy==1.13.3 +pandas==0.20.3 +pandocfilters==1.4.2 +parso==0.1.0 +partd==0.3.8 +pexpect==4.2.1 +pickleshare==0.7.4 +progressbar2==3.34.3 +prompt-toolkit==1.0.15 +protobuf==3.4.0 +psutil==5.4.0 +ptyprocess==0.5.2 +PyAudio==0.2.11 +Pygments==2.2.0 +pyparsing==2.2.0 +pysndfile==1.0.0 +python-dateutil==2.6.1 +python-utils==2.2.0 +pytz==2017.2 +PyYAML==3.12 +pyzmq==16.0.2 +qtconsole==4.3.1 +scikit-learn==0.19.0 +scipy==0.19.1 +simplegeneric==0.8.1 +six==1.11.0 +sortedcontainers==1.5.7 +tables==3.4.2 +tblib==1.3.2 +tensorflow==1.3.0 +tensorflow-tensorboard==0.4.0rc1 +terminado==0.6 +testpath==0.3.1 +toolz==0.8.2 +tornado==4.5.2 +tqdm==4.19.4 +traitlets==4.3.2 +wcwidth==0.1.7 +Werkzeug==0.12.2 +widgetsnbextension==3.0.6 +zict==0.1.3 diff --git a/speech_data.py b/speech_data.py index 8876480..4242998 100644 --- a/speech_data.py +++ b/speech_data.py @@ -1,220 +1,252 @@ import pandas as pd -from pandas_parallel import apply_by_multiprocessing +from speech_tools import apply_by_multiprocessing,threadsafe_iter # import dask as dd # import dask.dataframe as ddf import tensorflow as tf +from tensorflow.python.ops import data_flow_ops import numpy as np -from spectro_gen import generate_aiff_spectrogram +from speech_spectrum import generate_aiff_spectrogram from sklearn.model_selection import train_test_split import itertools import os import random import csv import gc +import pickle +from tqdm import tqdm -def get_siamese_pairs(groupF1, groupF2): - group1 = [r for (i, r) in groupF1.iterrows()] - group2 = [r for (i, r) in groupF2.iterrows()] - diff = [(g1, g2) for g2 in group2 for g1 in group1] - same = [i for i in itertools.combinations(group1, 2) - ] + [i for i in itertools.combinations(group2, 2)] - random.shuffle(same) - random.shuffle(diff) - # return (random.sample(same,10), random.sample(diff,10)) - return same[:10],diff[:10] def siamese_pairs(rightGroup, wrongGroup): group1 = [r for (i, r) in rightGroup.iterrows()] group2 = [r for (i, r) in wrongGroup.iterrows()] rightWrongPairs = [(g1, g2) for g2 in group2 for g1 in group1] - rightRightPairs = [i for i in itertools.combinations(group1, 2)] - random.shuffle(rightWrongPairs) - random.shuffle(rightRightPairs) - # return (random.sample(same,10), random.sample(diff,10)) - return rightRightPairs[:10],rightWrongPairs[:10] + rightRightPairs = [i for i in itertools.combinations(group1, 2)]#+[i for i in itertools.combinations(group2, 2)] + # random.shuffle(rightWrongPairs) + # random.shuffle(rightRightPairs) + # return rightRightPairs[:10],rightWrongPairs[:10] + return rightRightPairs[:32],rightWrongPairs[:32] -def append_zeros(spgr, max_samples): - return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)], - 'median') + +def _float_feature(value): + return tf.train.Feature(float_list=tf.train.FloatList(value=value)) + +def _int64_feature(value): + return tf.train.Feature(int64_list=tf.train.Int64List(value=value)) + +def _bytes_feature(value): + return tf.train.Feature(bytes_list=tf.train.BytesList(value=value)) + +def create_spectrogram_tfrecords(audio_group='audio',sample_count=0,train_test_ratio=0.1): + ''' + http://warmspringwinds.github.io/tensorflow/tf-slim/2016/12/21/tfrecords-guide/ + http://www.machinelearninguru.com/deep_learning/tensorflow/basics/tfrecord/tfrecord.html + ''' + audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv',index_col=0) + audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x) + n_records,n_spec,n_features = 0,0,0 + + def write_samples(wg,sample_name): + word_group_prog = tqdm(wg,desc='Computing spectrogram') + record_file = './outputs/{}.{}.tfrecords'.format(audio_group,sample_name) + writer = tf.python_io.TFRecordWriter(record_file) + for (w, word_group) in word_group_prog: + word_group_prog.set_postfix(word=w,sample_name=sample_name) + g = word_group.reset_index() + g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram) + sample_right = g.loc[g['variant'] == 'low'] + sample_wrong = g.loc[g['variant'] == 'medium'] + same, diff = siamese_pairs(sample_right, sample_wrong) + groups = [([0,1],same),([1,0],diff)] + for (output,group) in groups: + group_prog = tqdm(group,desc='Writing Spectrogram') + for sample1,sample2 in group_prog: + same = sample1['variant'] == sample2['variant'] + phon_same = sample1['phonemes'] == sample2['phonemes'] + voice_diff = sample1['voice'] != sample2['voice'] + if not same and phon_same: + continue + if same and not voice_diff: + continue + group_prog.set_postfix(output=output + ,var1=sample1['variant'] + ,var2=sample2['variant']) + spectro1,spectro2 = sample1['spectrogram'],sample2['spectrogram'] + spec_n1,spec_n2 = spectro1.shape[0],spectro2.shape[0] + spec_w1,spec_w2 = spectro1.shape[1],spectro2.shape[1] + spec1,spec2 = spectro1.reshape(-1),spectro2.reshape(-1) + nonlocal n_spec,n_records,n_features + n_spec = max([n_spec,spec_n1,spec_n2]) + n_features = spec_w1 + n_records+=1 + example = tf.train.Example(features=tf.train.Features( + feature={ + 'word': _bytes_feature([w.encode('utf-8')]), + 'phoneme1': _bytes_feature([sample1['phonemes'].encode('utf-8')]), + 'phoneme2': _bytes_feature([sample2['phonemes'].encode('utf-8')]), + 'voice1': _bytes_feature([sample1['voice'].encode('utf-8')]), + 'voice2': _bytes_feature([sample2['voice'].encode('utf-8')]), + 'language': _bytes_feature([sample1['language'].encode('utf-8')]), + 'rate1':_int64_feature([sample1['rate']]), + 'rate2':_int64_feature([sample2['rate']]), + 'variant1': _bytes_feature([sample1['variant'].encode('utf-8')]), + 'variant2': _bytes_feature([sample2['variant'].encode('utf-8')]), + 'file1': _bytes_feature([sample1['file'].encode('utf-8')]), + 'file2': _bytes_feature([sample2['file'].encode('utf-8')]), + 'spec1':_float_feature(spec1), + 'spec2':_float_feature(spec2), + 'spec_n1':_int64_feature([spec_n1]), + 'spec_w1':_int64_feature([spec_w1]), + 'spec_n2':_int64_feature([spec_n2]), + 'spec_w2':_int64_feature([spec_w2]), + 'output':_int64_feature(output) + } + )) + writer.write(example.SerializeToString()) + group_prog.close() + word_group_prog.close() + writer.close() + + word_groups = [i for i in audio_samples.groupby('word')] + wg_sampled = reservoir_sample(word_groups,sample_count) if sample_count > 0 else word_groups + tr_audio_samples,te_audio_samples = train_test_split(wg_sampled,test_size=train_test_ratio) + write_samples(tr_audio_samples,'train') + write_samples(te_audio_samples,'test') + const_file = os.path.join('./outputs',audio_group+'.constants') + pickle.dump((n_spec,n_features,n_records),open(const_file,'wb')) def padd_zeros(spgr, max_samples): return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)], 'constant') -def to_onehot(a,class_count=2): - # >>> a = np.array([1, 0, 3]) - a_row_n = a.shape[0] - b = np.zeros((a_row_n, class_count)) - b[np.arange(a_row_n), a] = 1 - return b +def reservoir_sample(iterable, k): + it = iter(iterable) + if not (k > 0): + raise ValueError("sample size must be positive") -def create_pair(l, r, max_samples): - l_sample = padd_zeros(l, max_samples) - r_sample = padd_zeros(r, max_samples) - return np.asarray([l_sample, r_sample]) + sample = list(itertools.islice(it, k)) # fill the reservoir + random.shuffle(sample) # if number of items less then *k* then + # return all items in random order. + for i, item in enumerate(it, start=k+1): + j = random.randrange(i) # random [0..i) + if j < k: + sample[j] = item # replace item with gradually decreasing probability + return sample -def create_test_pair(l, r, max_samples): - l_sample = append_zeros(l, max_samples) - r_sample = append_zeros(r, max_samples) - return np.asarray([[l_sample, r_sample]]) +def read_siamese_tfrecords_generator(audio_group='audio',batch_size=32,test_size=0): + records_file = os.path.join('./outputs',audio_group+'.train.tfrecords') + input_pairs = [] + output_class = [] + const_file = os.path.join('./outputs',audio_group+'.constants') + (n_spec,n_features,n_records) = pickle.load(open(const_file,'rb')) + print('reading tfrecords({}-train)...'.format(audio_group)) + # @threadsafe_iter + def record_generator(): + input_data = [] + output_data = [] + while True: + record_iterator = tf.python_io.tf_record_iterator(path=records_file) + #tqdm(enumerate(record_iterator),total=n_records) + for (i,string_record) in enumerate(record_iterator): + example = tf.train.Example() + example.ParseFromString(string_record) + spec_n1 = example.features.feature['spec_n1'].int64_list.value[0] + spec_n2 = example.features.feature['spec_n2'].int64_list.value[0] + spec_w1 = example.features.feature['spec_w1'].int64_list.value[0] + spec_w2 = example.features.feature['spec_w2'].int64_list.value[0] + spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1) + spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2) + p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec) + input_data.append(np.asarray([p_spec1,p_spec2])) + output = example.features.feature['output'].int64_list.value + output_data.append(np.asarray(output)) + if len(input_data) == batch_size: + input_arr = np.asarray(input_data) + output_arr = np.asarray(output_data) + yield ([input_arr[:, 0], input_arr[:, 1]],output_arr) + input_data = [] + output_data = [] -def create_X(sp, max_samples): - return create_pair(sp[0]['spectrogram'], sp[1]['spectrogram'], max_samples) + # Read test in one-shot + te_records_file = os.path.join('./outputs',audio_group+'.test.tfrecords') + te_re_iterator = tf.python_io.tf_record_iterator(path=records_file) + te_n_records = len([i for i in te_re_iterator]) + te_re_iterator = tf.python_io.tf_record_iterator(path=records_file) + print('reading tfrecords({}-test)...'.format(audio_group)) + test_size = min([test_size,te_n_records]) if test_size > 0 else te_n_records + input_data = np.zeros((test_size,2,n_spec,n_features)) + output_data = np.zeros((test_size,2)) + random_samples = enumerate(reservoir_sample(te_re_iterator,test_size)) + for (i,string_record) in tqdm(random_samples,total=test_size): + example = tf.train.Example() + example.ParseFromString(string_record) + spec_n1 = example.features.feature['spec_n1'].int64_list.value[0] + spec_n2 = example.features.feature['spec_n2'].int64_list.value[0] + spec_w1 = example.features.feature['spec_w1'].int64_list.value[0] + spec_w2 = example.features.feature['spec_w2'].int64_list.value[0] + spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1) + spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2) + p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec) + input_data[i] = np.asarray([p_spec1,p_spec2]) + output = example.features.feature['output'].int64_list.value + output_data[i] = np.asarray(output) + return record_generator,input_data,output_data,n_spec,n_features,n_records -# def get_word_pairs_data(word, max_samples): -# audio_samples = pd.read_csv( -# './outputs/audio.csv', -# names=['word', 'voice', 'rate', 'variant', 'file']) -# audio_samples = audio_samples.loc[audio_samples['word'] == -# word].reset_index(drop=True) -# audio_samples.loc[:, 'spectrogram'] = audio_samples.loc[:, 'file'].apply( -# lambda x: 'outputs/audio/' + x).apply(generate_aiff_spectrogram) -# max_samples = audio_samples['spectrogram'].apply( -# lambda x: x.shape[0]).max() -# same_data, diff_data = [], [] -# for (w, g) in audio_samples.groupby(audio_samples['word']): -# sample_norm = g.loc[audio_samples['variant'] == 'normal'] -# sample_phon = g.loc[audio_samples['variant'] == 'phoneme'] -# same, diff = get_siamese_pairs(sample_norm, sample_phon) -# same_data.extend([create_X(s, max_samples) for s in same]) -# diff_data.extend([create_X(d, max_samples) for d in diff]) -# Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))]) -# X = np.asarray(same_data + diff_data) -# # tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1) -# return (X, Y) +def audio_samples_word_count(audio_group='audio'): + audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv') + return len(audio_samples.groupby(audio_samples['word'])) +def record_generator_count(records_file): + record_iterator = tf.python_io.tf_record_iterator(path=records_file) + count = len([i for i in record_iterator]) + record_iterator = tf.python_io.tf_record_iterator(path=records_file) + return record_iterator,count -def create_spectrogram_data(audio_group='audio'): +def fix_csv(audio_group='audio'): + audio_csv_lines = open('./outputs/' + audio_group + '.csv.orig','r').readlines() + audio_csv_data = [i.strip().split(',') for i in audio_csv_lines] + proper_rows = [i for i in audio_csv_data if len(i) == 7] + with open('./outputs/' + audio_group + '.csv','w') as fixed_csv: + fixed_csv_w = csv.writer(fixed_csv, quoting=csv.QUOTE_MINIMAL) + fixed_csv_w.writerows(proper_rows) audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv' - , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file'] - , quoting=csv.QUOTE_NONE) - # audio_samples = audio_samples.loc[audio_samples['word'] == - # 'sunflowers'].reset_index(drop=True) - audio_samples['file_paths'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x) - audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_paths'], os.path.exists) - audio_samples = audio_samples[audio_samples['file_exists'] == True].reset_index() - audio_samples['spectrogram'] = apply_by_multiprocessing(audio_samples['file_paths'],generate_aiff_spectrogram)#.apply( - audio_samples['window_count'] = audio_samples.loc[:,'spectrogram'].apply(lambda x: x.shape[0]) - audio_samples.to_pickle('outputs/{}-spectrogram.pkl'.format(audio_group)) - -def create_spectrogram_tfrecords(audio_group='audio'): - audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv' - , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file'] - , quoting=csv.QUOTE_NONE) - # audio_samples = audio_samples.loc[audio_samples['word'] == - # 'sunflowers'].reset_index(drop=True) + , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file']) audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x) audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists) - audio_samples = audio_samples[audio_samples['file_exists'] == True].reset_index() - - def _float_feature(value): - return tf.train.Feature(float_list=tf.train.FloatList(value=value)) - - def _int64_feature(value): - return tf.train.Feature(int64_list=tf.train.Int64List(value=value)) - - def _bytes_feature(value): - return tf.train.Feature(bytes_list=tf.train.BytesList(value=value)) - - writer = tf.python_io.TFRecordWriter('./outputs/' + audio_group + '.tfrecords') - # audio_samples = audio_samples[:100] - for (w, word_group) in audio_samples.groupby(audio_samples['word']): - g = word_group.reset_index() - g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram) - sample_right = g.loc[audio_samples['variant'] == 'low'] - sample_wrong = g.loc[audio_samples['variant'] == 'medium'] - same, diff = siamese_pairs(sample_right, sample_wrong) - groups = [([0,1],same),([1,0],diff)] - for (output,group) in groups: - for sample1,sample2 in group: - spectro1,spectro2 = sample1['spectrogram'],sample2['spectrogram'] - spec_n1,spec_n2 = spectro1.shape[0],spectro2.shape[0] - spec_w1,spec_w2 = spectro1.shape[1],spectro2.shape[1] - spec1,spec2 = spectro1.reshape(-1),spectro2.reshape(-1) - example = tf.train.Example(features=tf.train.Features( - feature={ - 'word': _bytes_feature([w.encode('utf-8')]), - 'phoneme1': _bytes_feature([sample1['phonemes'].encode('utf-8')]), - 'phoneme2': _bytes_feature([sample2['phonemes'].encode('utf-8')]), - 'voice1': _bytes_feature([sample1['voice'].encode('utf-8')]), - 'voice2': _bytes_feature([sample2['voice'].encode('utf-8')]), - 'language': _bytes_feature([sample1['language'].encode('utf-8')]), - 'rate1':_int64_feature([sample1['rate']]), - 'rate2':_int64_feature([sample2['rate']]), - 'variant1': _bytes_feature([sample1['variant'].encode('utf-8')]), - 'variant2': _bytes_feature([sample2['variant'].encode('utf-8')]), - 'file1': _bytes_feature([sample1['file'].encode('utf-8')]), - 'file2': _bytes_feature([sample2['file'].encode('utf-8')]), - 'spec1':_float_feature(spec1), - 'spec2':_float_feature(spec2), - 'spec_n1':_int64_feature([spec_n1]), - 'spec_w1':_int64_feature([spec_w1]), - 'spec_n2':_int64_feature([spec_n2]), - 'spec_w2':_int64_feature([spec_w2]), - 'output':_int64_feature(output) - } - )) - writer.write(example.SerializeToString()) - writer.close() - -def create_tagged_data(audio_samples): - same_data, diff_data = [], [] - for (w, g) in audio_samples.groupby(audio_samples['word']): - # sample_norm = g.loc[audio_samples['variant'] == 'low'] - # sample_phon = g.loc[audio_samples['variant'] == 'medium'] - sample_norm = g.loc[audio_samples['variant'] == 'normal'] - sample_phon = g.loc[audio_samples['variant'] == 'phoneme'] - same, diff = get_siamese_pairs(sample_norm, sample_phon) - same_data.extend([create_X(s) for s in same]) - diff_data.extend([create_X(d) for d in diff]) - print('creating all speech pairs') - Y_f = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))]) - Y = to_onehot(Y_f.astype(np.int8)) - print('casting as array speech pairs') - X = np.asarray(same_data + diff_data) - return X,Y - -def create_speech_pairs_data(audio_group='audio'): - audio_samples = pd.read_pickle('outputs/{}-spectrogram.pkl'.format(audio_group)) - # sample_size = audio_samples['spectrogram'][0].shape[1] - tr_audio_samples,te_audio_samples = train_test_split(audio_samples, test_size=0.1) - def save_samples_for(sample_name,samples): - print('generating {} siamese speech pairs'.format(sample_name)) - X,Y = create_tagged_data(samples) - print('shuffling array speech pairs') - rng_state = np.random.get_state() - np.random.shuffle(X) - np.random.set_state(rng_state) - np.random.shuffle(Y) - print('pickling X/Y') - np.save('outputs/{}-train-X.npy'.format(audio_group), X) - np.save('outputs/{}-train-Y.npy'.format(audio_group), Y) - save_samples_for('train',tr_audio_samples) - save_samples_for('test',te_audio_samples) - -def speech_data(audio_group='audio'): - X = np.load('outputs/{}-X.npy'.format(audio_group)) / 255.0 - Y = np.load('outputs/{}-Y.npy'.format(audio_group)) - return (X,Y) - -def speech_model_data(): - tr_pairs = np.load('outputs/tr_pairs.npy') / 255.0 - te_pairs = np.load('outputs/te_pairs.npy') / 255.0 - tr_pairs[tr_pairs < 0] = 0 - te_pairs[te_pairs < 0] = 0 - tr_y = np.load('outputs/tr_y.npy') - te_y = np.load('outputs/te_y.npy') - return tr_pairs, te_pairs, tr_y, te_y + audio_samples = audio_samples[audio_samples['file_exists'] == True] + audio_samples = audio_samples.drop(['file_path','file_exists'],axis=1).reset_index(drop=True) + audio_samples.to_csv('./outputs/' + audio_group + '.csv') +def convert_old_audio(): + audio_samples = pd.read_csv( './outputs/audio.csv.old' + , names=['word', 'voice', 'rate', 'variant', 'file']) + audio_samples['phonemes'] = 'unknown' + audio_samples['language'] = 'en-US' + audio_samples.loc[audio_samples['variant'] == 'normal','variant'] = 'low' + audio_samples.loc[audio_samples['variant'] == 'phoneme','variant'] = 'medium' + audio_samples = audio_samples[['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file']] + audio_samples.to_csv('./outputs/audio_new.csv',index=False,header=False) if __name__ == '__main__': # sunflower_pairs_data() # create_spectrogram_data() # create_spectrogram_data('story_words') - create_spectrogram_tfrecords('story_words') + # create_spectrogram_tfrecords('story_words') + # create_spectrogram_tfrecords('story_words_test') + # read_siamese_tfrecords('story_all') + # read_siamese_tfrecords('story_words_test') + # padd_zeros_siamese_tfrecords('story_words') + # fix_csv('story_words') + # pickle_constants('story_words') + # create_spectrogram_tfrecords('audio',sample_count=100) + # create_spectrogram_tfrecords('story_all',sample_count=25) + # fix_csv('story_words_test') + create_spectrogram_tfrecords('story_words_test',sample_count=100,train_test_ratio=0.0) + # create_spectrogram_tfrecords('audio',sample_count=50) + # read_siamese_tfrecords_generator('audio') + # padd_zeros_siamese_tfrecords('audio') # create_padded_spectrogram() # create_speech_pairs_data() # print(speech_model_data()) diff --git a/speech_model.py b/speech_model.py new file mode 100644 index 0000000..5136398 --- /dev/null +++ b/speech_model.py @@ -0,0 +1,164 @@ +from __future__ import absolute_import +from __future__ import print_function +import numpy as np +# from speech_data import speech_model_data +from speech_data import read_siamese_tfrecords_generator +from keras.models import Model,load_model,model_from_yaml +from keras.layers import Input, Dense, Dropout, LSTM, Lambda, Concatenate +from keras.losses import categorical_crossentropy +# from keras.losses import binary_crossentropy +from keras.utils import to_categorical +# from keras.utils.np_utils import to_categorical +from keras.optimizers import RMSprop +from keras.callbacks import TensorBoard, ModelCheckpoint +from keras import backend as K +from speech_tools import create_dir + +# def euclidean_distance(vects): +# x, y = vects +# return K.sqrt( +# K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon())) +# +# +# def eucl_dist_output_shape(shapes): +# shape1, shape2 = shapes +# return (shape1[0], 1) +# +# +# def contrastive_loss(y_true, y_pred): +# '''Contrastive loss from Hadsell-et-al.'06 +# http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf +# ''' +# return K.mean(y_true * K.square(y_pred) + +# (1 - y_true) * K.square(K.maximum(1 - y_pred, 0))) + +def create_base_rnn_network(input_dim): + '''Base network to be shared (eq. to feature extraction). + ''' + inp = Input(shape=input_dim) + # ls0 = LSTM(512, return_sequences=True)(inp) + ls1 = LSTM(256, return_sequences=True)(inp) + ls2 = LSTM(128, return_sequences=True)(ls1) + # ls3 = LSTM(32, return_sequences=True)(ls2) + ls4 = LSTM(64)(ls2) + # d1 = Dense(128, activation='relu')(ls4) + d2 = Dense(64, activation='relu')(ls4) + return Model(inp, ls4) + + +def compute_accuracy(y_true, y_pred): + '''Compute classification accuracy with a fixed threshold on distances. + ''' + pred = y_pred.ravel() > 0.5 + return np.mean(pred == y_true) + + +def accuracy(y_true, y_pred): + '''Compute classification accuracy with a fixed threshold on distances. + ''' + return K.mean(K.equal(y_true, K.cast(y_pred > 0.5, y_true.dtype))) + +def dense_classifier(processed): + conc_proc = Concatenate()(processed) + d1 = Dense(64, activation='relu')(conc_proc) + # dr1 = Dropout(0.1)(d1) + # d2 = Dense(128, activation='relu')(d1) + d3 = Dense(8, activation='relu')(d1) + # dr2 = Dropout(0.1)(d2) + return Dense(2, activation='softmax')(d3) + +def siamese_model(input_dim): + # input_dim = (15, 1654) + base_network = create_base_rnn_network(input_dim) + input_a = Input(shape=input_dim) + input_b = Input(shape=input_dim) + processed_a = base_network(input_a) + processed_b = base_network(input_b) + final_output = dense_classifier([processed_a,processed_b]) + model = Model([input_a, input_b], final_output) + # distance = Lambda( + # euclidean_distance, + # output_shape=eucl_dist_output_shape)([processed_a, processed_b]) + # model = Model([input_a, input_b], distance) + return model + +def write_model_arch(mod,mod_file): + model_f = open(mod_file,'w') + model_f.write(mod.to_yaml()) + model_f.close() + +def load_model_arch(mod_file): + model_f = open(mod_file,'r') + mod = model_from_yaml(model_f.read()) + model_f.close() + return mod + +def train_siamese(audio_group = 'audio'): + # the data, shuffled and split between train and test sets + # tr_pairs, te_pairs, tr_y_e, te_y_e = speech_model_data() + batch_size = 128 + model_dir = './models/'+audio_group + create_dir(model_dir) + log_dir = './logs/'+audio_group + create_dir(log_dir) + tr_gen_fn,te_pairs,te_y,n_step,n_features,n_records = read_siamese_tfrecords_generator(audio_group,batch_size=batch_size,test_size=batch_size) + tr_gen = tr_gen_fn() + # tr_y = to_categorical(tr_y_e, num_classes=2) + # te_y = to_categorical(te_y_e, num_classes=2) + input_dim = (n_step, n_features) + + model = siamese_model(input_dim) + + tb_cb = TensorBoard( + log_dir=log_dir, + histogram_freq=1, + batch_size=32, + write_graph=True, + write_grads=True, + write_images=True, + embeddings_freq=0, + embeddings_layer_names=None, + embeddings_metadata=None) + cp_file_fmt = model_dir+'/siamese_speech_model-{epoch:02d}-epoch-{val_loss:0.2f}\ +-acc.h5' + + cp_cb = ModelCheckpoint( + cp_file_fmt, + monitor='val_loss', + verbose=0, + save_best_only=True, + save_weights_only=True, + mode='auto', + period=1) + # train + rms = RMSprop()#lr=0.001 + model.compile(loss=categorical_crossentropy, optimizer=rms, metrics=[accuracy]) + write_model_arch(model,model_dir+'/siamese_speech_model_arch.yaml') + # model.fit( + # [tr_pairs[:, 0], tr_pairs[:, 1]], + # tr_y, + # batch_size=128, + # epochs=100, + # validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y), + # callbacks=[tb_cb, cp_cb]) + model.fit_generator(tr_gen + ,epochs=1000 + ,steps_per_epoch=n_records//batch_size + ,validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y) + ,use_multiprocessing=True, workers=1 + ,callbacks=[tb_cb, cp_cb]) + model.save(model_dir+'/siamese_speech_model-final.h5') + # compute final accuracy on training and test sets + # y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]]) + # tr_acc = compute_accuracy(tr_y, y_pred) + # print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc)) + + y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]]) + te_acc = compute_accuracy(te_y, y_pred) + print('* Accuracy on test set: %0.2f%%' % (100 * te_acc)) + + + +if __name__ == '__main__': + train_siamese('story_words') + # train_siamese('audio') diff --git a/tts_samplegen.py b/speech_samplegen.py similarity index 94% rename from tts_samplegen.py rename to speech_samplegen.py index 01c37cf..eefb63b 100644 --- a/tts_samplegen.py +++ b/speech_samplegen.py @@ -3,6 +3,7 @@ from AppKit import NSSpeechSynthesizer, NSSpeechInputModeProperty from AppKit import NSSpeechModePhoneme from Foundation import NSURL import json +import csv import random import os import re @@ -11,6 +12,7 @@ import time import progressbar from generate_similar import similar_phoneme_phrase,similar_phrase +from speech_tools import format_filename OUTPUT_NAME = 'story_words_test' dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/' @@ -39,7 +41,10 @@ def create_dir(direc): def dest_filename(w, v, r, t): - return '{}-{}-{}-{}-{}.aiff'.format(w, v, r, t, str(random.randint(0, 10000))) + rand_no = str(random.randint(0, 10000)) + fname = '{}-{}-{}-{}-{}.aiff'.format(w, v, r, t, rand_no) + sanitized = format_filename(fname) + return sanitized def dest_path(v, r, n): @@ -81,6 +86,11 @@ class SynthFile(object): return ','.join([str(c) for c in cols])+'\n' + def get_values(self): + cols = [self.word, self.phoneme, self.voice, + self.voice_lang, self.rate, self.variant, + self.filename] + return [str(c) for c in cols] class SynthVariant(object): """docstring for SynthVariant.""" @@ -191,22 +201,11 @@ def synth_generator(): print("It took {} to synthsize all variants.".format(time_str)) return synth_for_words - -def write_synths(synth_list, fname, csv=False): - f = open(fname, 'w') - if csv: - for s in synth_list: - f.write(s.get_csv()) - else: - json.dump([s.get_json() for s in synth_list], f) - f.close() - - def synth_logger(fname, csv=False): f = open(fname, 'w') - + s_csv_w = csv.writer(f, quoting=csv.QUOTE_MINIMAL) def csv_writer(s): - f.write(s.get_csv()) + s_csv_w.writerow(s.get_values()) synth_list = [] def json_writer(s): diff --git a/speech_siamese.py b/speech_siamese.py deleted file mode 100644 index e07c6cf..0000000 --- a/speech_siamese.py +++ /dev/null @@ -1,135 +0,0 @@ -from __future__ import absolute_import -from __future__ import print_function -import numpy as np -from speech_data import speech_model_data -from keras.models import Model,load_model -from keras.layers import Input, Dense, Dropout, LSTM, Lambda, Concatenate -from keras.losses import categorical_crossentropy -# from keras.losses import binary_crossentropy -from keras.utils import to_categorical -# from keras.utils.np_utils import to_categorical -from keras.optimizers import RMSprop -from keras.callbacks import TensorBoard, ModelCheckpoint -from keras import backend as K - - -def euclidean_distance(vects): - x, y = vects - return K.sqrt( - K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon())) - - -def eucl_dist_output_shape(shapes): - shape1, shape2 = shapes - return (shape1[0], 1) - - -def contrastive_loss(y_true, y_pred): - '''Contrastive loss from Hadsell-et-al.'06 - http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf - ''' - return K.mean(y_true * K.square(y_pred) + - (1 - y_true) * K.square(K.maximum(1 - y_pred, 0))) - -def create_base_rnn_network(input_dim): - '''Base network to be shared (eq. to feature extraction). - ''' - inp = Input(shape=input_dim) - ls1 = LSTM(256, return_sequences=True)(inp) - ls2 = LSTM(128, return_sequences=True)(ls1) - # ls3 = LSTM(32, return_sequences=True)(ls2) - ls4 = LSTM(64)(ls2) - return Model(inp, ls4) - - -def compute_accuracy(y_true, y_pred): - '''Compute classification accuracy with a fixed threshold on distances. - ''' - pred = y_pred.ravel() < 0.5 - return np.mean(pred == y_true) - - -def accuracy(y_true, y_pred): - '''Compute classification accuracy with a fixed threshold on distances. - ''' - return K.mean(K.equal(y_true, K.cast(y_pred > 0.5, y_true.dtype))) - -def dense_classifier(processed): - conc_proc = Concatenate()(processed) - d1 = Dense(16, activation='relu')(conc_proc) - # dr1 = Dropout(0.1)(d1) - d2 = Dense(8, activation='relu')(d1) - # dr2 = Dropout(0.1)(d2) - return Dense(2, activation='softmax')(d2) - -def siamese_model(input_dim): - # input_dim = (15, 1654) - base_network = create_base_rnn_network(input_dim) - input_a = Input(shape=input_dim) - input_b = Input(shape=input_dim) - processed_a = base_network(input_a) - processed_b = base_network(input_b) - final_output = dense_classifier([processed_a,processed_b]) - model = Model([input_a, input_b], final_output) - # distance = Lambda( - # euclidean_distance, - # output_shape=eucl_dist_output_shape)([processed_a, processed_b]) - # model = Model([input_a, input_b], distance) - return model - - -def train_siamese(): - # the data, shuffled and split between train and test sets - tr_pairs, te_pairs, tr_y_e, te_y_e = speech_model_data() - tr_y = to_categorical(tr_y_e, num_classes=2) - te_y = to_categorical(te_y_e, num_classes=2) - input_dim = (tr_pairs.shape[2], tr_pairs.shape[3]) - - model = siamese_model(input_dim) - - tb_cb = TensorBoard( - log_dir='./logs/siamese_logs', - histogram_freq=1, - batch_size=32, - write_graph=True, - write_grads=True, - write_images=True, - embeddings_freq=0, - embeddings_layer_names=None, - embeddings_metadata=None) - cp_file_fmt = './models/siamese_speech_model-{epoch:02d}-epoch-{val_loss:0.2f}\ --acc.h5' - - cp_cb = ModelCheckpoint( - cp_file_fmt, - monitor='val_loss', - verbose=0, - save_best_only=False, - save_weights_only=False, - mode='auto', - period=1) - # train - rms = RMSprop(lr=0.001) - model.compile(loss=categorical_crossentropy, optimizer=rms, metrics=[accuracy]) - model.fit( - [tr_pairs[:, 0], tr_pairs[:, 1]], - tr_y, - batch_size=128, - epochs=50, - validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y), - callbacks=[tb_cb, cp_cb]) - - model.save('./models/siamese_speech_model-final.h5') - # compute final accuracy on training and test sets - y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]]) - tr_acc = compute_accuracy(tr_y, y_pred) - y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]]) - te_acc = compute_accuracy(te_y, y_pred) - - print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc)) - print('* Accuracy on test set: %0.2f%%' % (100 * te_acc)) - - - -if __name__ == '__main__': - train_siamese() diff --git a/spectro_gen.py b/speech_spectrum.py similarity index 91% rename from spectro_gen.py rename to speech_spectrum.py index 2e397e8..794586f 100644 --- a/spectro_gen.py +++ b/speech_spectrum.py @@ -13,6 +13,8 @@ from pysndfile import sndio as snd from numpy.lib import stride_tricks """ short time fourier transform of audio signal """ +STFT_WINDOWS_MSEC = 20 +STFT_WINDOW_OVERLAP = 1.0 / 3 def stft(sig, frameSize, overlapFac=0.5, window=np.hanning): win = window(frameSize) @@ -74,7 +76,7 @@ def logscale_spec(spec, sr=44100, factor=20.): def generate_spec_frec(samples, samplerate): # samplerate, samples = wav.read(audiopath) # s = stft(samples, binsize) - s = stft(samples, samplerate * 150 // 1000, 1.0 / 3) + s = stft(samples, samplerate * STFT_WINDOWS_MSEC // 1000, STFT_WINDOW_OVERLAP) sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate) ims = 20. * np.log10(np.abs(sshow) / 10e-6) @@ -141,8 +143,11 @@ def play_sunflower(): if __name__ == '__main__': - play_sunflower() - # plot_aiff_stft('./outputs/sunflowers-Alex-150-normal-589.aiff') + # play_sunflower() + plot_aiff_stft('./outputs/story_words/Agnes/150/chicken-Agnes-150-low-1077.aiff') + plot_aiff_stft('./outputs/story_words/Agnes/150/chicken-Agnes-150-medium-1762.aiff') + # spec = generate_aiff_spectrogram('./outputs/story_words/Agnes/150/chicken-Agnes-150-low-1077.aiff') + # print(spec.shape) # plot_aiff_stft('./outputs/sunflowers-Alex-180-normal-4763.aiff') # plot_aiff_stft('./outputs/sunflowers-Victoria-180-normal-870.aiff') # plot_aiff_stft('./outputs/sunflowers-Fred-180-phoneme-9733.aiff') diff --git a/speech_test.py b/speech_test.py new file mode 100644 index 0000000..1ee7789 --- /dev/null +++ b/speech_test.py @@ -0,0 +1,138 @@ +from speech_model import load_model_arch +from speech_tools import record_spectrogram, file_player +from speech_data import record_generator_count +# from importlib import reload +# import speech_data +# reload(speech_data) +import numpy as np +import pandas as pd +import os +import pickle +import tensorflow as tf +import csv +from tqdm import tqdm +from speech_data import padd_zeros + +def predict_recording_with(m,sample_size=15): + spec1 = record_spectrogram(n_sec=1.4) + spec2 = record_spectrogram(n_sec=1.4) + inp = create_test_pair(spec1,spec2,sample_size) + return m.predict([inp[:, 0], inp[:, 1]]) + + +def test_with(audio_group): + X,Y = speech_data(audio_group) + print(np.argmax(model.predict([X[:, 0], X[:, 1]]),axis=1)) + print(Y.astype(np.int8)) + +def evaluate_siamese(records_file,audio_group='audio',weights = 'siamese_speech_model-final.h5'): + # audio_group='audio';model_file = 'siamese_speech_model-305-epoch-0.20-acc.h5' + # records_file = os.path.join('./outputs',eval_group+'.train.tfrecords') + const_file = os.path.join('./outputs',audio_group+'.constants') + arch_file='./models/'+audio_group+'/siamese_speech_model_arch.yaml' + weight_file='./models/'+audio_group+'/'+weights + (n_spec,n_features,n_records) = pickle.load(open(const_file,'rb')) + print('evaluating {}...'.format(records_file)) + model = load_model_arch(arch_file) + # model = siamese_model((n_spec, n_features)) + model.load_weights(weight_file) + record_iterator,records_count = record_generator_count(records_file) + total,same_success,diff_success,skipped,same_failed,diff_failed = 0,0,0,0,0,0 + all_results = [] + for (i,string_record) in tqdm(enumerate(record_iterator),total=records_count): + # string_record = next(record_iterator) + total+=1 + example = tf.train.Example() + example.ParseFromString(string_record) + spec_n1 = example.features.feature['spec_n1'].int64_list.value[0] + spec_n2 = example.features.feature['spec_n2'].int64_list.value[0] + if n_spec < spec_n1 or n_spec < spec_n2: + skipped+=1 + continue + spec_w1 = example.features.feature['spec_w1'].int64_list.value[0] + spec_w2 = example.features.feature['spec_w2'].int64_list.value[0] + spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1) + spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2) + word = example.features.feature['word'].bytes_list.value[0].decode() + phoneme1 = example.features.feature['phoneme1'].bytes_list.value[0].decode() + phoneme2 = example.features.feature['phoneme2'].bytes_list.value[0].decode() + voice1 = example.features.feature['voice1'].bytes_list.value[0].decode() + voice2 = example.features.feature['voice2'].bytes_list.value[0].decode() + language = example.features.feature['language'].bytes_list.value[0].decode() + rate1 = example.features.feature['rate1'].int64_list.value[0] + rate2 = example.features.feature['rate2'].int64_list.value[0] + variant1 = example.features.feature['variant1'].bytes_list.value[0].decode() + variant2 = example.features.feature['variant2'].bytes_list.value[0].decode() + file1 = example.features.feature['file1'].bytes_list.value[0].decode() + file2 = example.features.feature['file2'].bytes_list.value[0].decode() + + p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec) + input_arr = np.asarray([[p_spec1,p_spec2]]) + output_arr = np.asarray([example.features.feature['output'].int64_list.value]) + y_pred = model.predict([input_arr[:, 0], input_arr[:, 1]]) + predicted = np.asarray(y_pred[0]>0.5).astype(output_arr.dtype) + expected = output_arr[0] + status = np.all(predicted == expected) + result = {"phoneme1":phoneme1,"phoneme2":phoneme2,"voice1":voice1 + ,"voice2":voice2,"rate1":rate1,"rate2":rate2 + ,"variant1":variant1,"variant2":variant2,"file1":file1 + ,"file2":file2,"expected":expected[0],"predicted":y_pred[0][0] + ,"success":status} + all_results.append(result) + if status: + if variant1 == variant2: + same_success+=1 + else: + diff_success+=1 + continue + else: + if variant1 == variant2: + same_failed+=1 + else: + diff_failed+=1 + print('total-{},same_success-{},diff_success-{},skipped-{},same_failed-{},diff_failed-{}'.format(total,same_success,diff_success,skipped,same_failed,diff_failed)) + success = same_success+diff_success + failure = same_failed+diff_failed + print('accuracy-{:.3f}'.format(success*100/(success+failure))) + print('same_accuracy-{:.3f}'.format(same_success*100/(same_success+same_failed))) + print('diff_accuracy-{:.3f}'.format(diff_success*100/(diff_success+diff_failed))) + result_data = pd.DataFrame(all_results,columns=["phoneme1","phoneme2" + ,"voice1","voice2","rate1","rate2","variant1","variant2","file1","file2", + "expected","predicted","success"]) + result_data.to_csv('./outputs/' + audio_group + '.results.csv') + + +def play_results(audio_group='audio'): + result_data = pd.read_csv('./outputs/' + audio_group + '.results.csv') + play_file,close_player = file_player() + quit = False + for (i,r) in result_data.iterrows(): + if quit: + break + keys = ["phoneme1","phoneme2","voice1","voice2","rate1","rate2","variant1","variant2"] + row_vals = [str(r[k]) for k in keys] + h_str = '\t'.join(keys) + row_str = '\t'.join(row_vals) + while True: + print(h_str) + print(row_str) + play_file('./outputs/'+audio_group+'/'+r['file1'],True) + play_file('./outputs/'+audio_group+'/'+r['file2'],True) + a = input("press 'r/q/[Enter]' to replay/quit/continue:\t") + if a == 'r': + continue + if a == 'q': + quit = True + break + else: + break + close_player() + +if __name__ == '__main__': + evaluate_siamese('./outputs/story_words_test.train.tfrecords',audio_group='story_words',weights ='siamese_speech_model-712-epoch-0.00-acc.h5') + # evaluate_siamese('./outputs/story_words.test.tfrecords',audio_group='story_words',weights ='siamese_speech_model-675-epoch-0.00-acc.h5') + # play_results('story_words') +# test_with('rand_edu') +# sunflower_data,sunflower_result = get_word_pairs_data('sweater',15) +# print(np.argmax(model.predict([sunflower_data[:, 0], sunflower_data[:, 1]]),axis=1)) +# print(sunflower_result) diff --git a/speech_tools.py b/speech_tools.py new file mode 100644 index 0000000..c252ac5 --- /dev/null +++ b/speech_tools.py @@ -0,0 +1,152 @@ +import os +import threading +import multiprocessing +import pandas as pd +import numpy as np +import pyaudio +from pysndfile import sndio as snd +# from matplotlib import pyplot as plt +from speech_spectrum import plot_stft, generate_spec_frec + +SAMPLE_RATE = 22050 +N_CHANNELS = 2 + +def file_player(): + p_oup = pyaudio.PyAudio() + def play_file(audiopath,plot=False): + print('playing',audiopath) + samples, samplerate, form = snd.read(audiopath) + stream = p_oup.open( + format=pyaudio.paFloat32, + channels=2, + rate=samplerate, + output=True) + one_channel = np.asarray([samples, samples]).T.reshape(-1) + audio_data = one_channel.astype(np.float32).tobytes() + stream.write(audio_data) + stream.close() + if plot: + plot_stft(samples, SAMPLE_RATE) + def close_player(): + p_oup.terminate() + return play_file,close_player + +def record_spectrogram(n_sec, plot=False, playback=False): + # show_record_prompt() + N_SEC = n_sec + CHUNKSIZE = int(SAMPLE_RATE * N_SEC / N_CHANNELS) # fixed chunk size + input('Press [Enter] to start recording sample... ') + p_inp = pyaudio.PyAudio() + stream = p_inp.open( + format=pyaudio.paFloat32, + channels=N_CHANNELS, + rate=SAMPLE_RATE, + input=True, + frames_per_buffer=CHUNKSIZE) + data = stream.read(CHUNKSIZE) + numpydata = np.frombuffer(data, dtype=np.float32) + multi_channel = np.abs(np.reshape(numpydata, (-1, 2))).mean(axis=1) + one_channel = np.asarray([multi_channel, -1 * multi_channel]).T.reshape(-1) + mean_channel_data = one_channel.tobytes() + stream.stop_stream() + stream.close() + p_inp.terminate() + if plot: + plot_stft(one_channel, SAMPLE_RATE) + if playback: + p_oup = pyaudio.PyAudio() + stream = p_oup.open( + format=pyaudio.paFloat32, + channels=2, + rate=SAMPLE_RATE, + output=True) + stream.write(mean_channel_data) + stream.close() + p_oup.terminate() + ims, _ = generate_spec_frec(one_channel, SAMPLE_RATE) + return ims + + +def _apply_df(args): + df, func, num, kwargs = args + return num, df.apply(func, **kwargs) + +def apply_by_multiprocessing(df,func,**kwargs): + cores = multiprocessing.cpu_count() + workers=kwargs.pop('workers') if 'workers' in kwargs else cores + pool = multiprocessing.Pool(processes=workers) + result = pool.map(_apply_df, [(d, func, i, kwargs) for i,d in enumerate(np.array_split(df, workers))]) + pool.close() + result=sorted(result,key=lambda x:x[0]) + return pd.concat([i[1] for i in result]) + +def square(x): + return x**x + +if __name__ == '__main__': + df = pd.DataFrame({'a':range(10), 'b':range(10)}) + apply_by_multiprocessing(df, square, axis=1, workers=4) + + +def rm_rf(d): + for path in (os.path.join(d,f) for f in os.listdir(d)): + if os.path.isdir(path): + rm_rf(path) + else: + os.unlink(path) + os.rmdir(d) + +def create_dir(direc): + if not os.path.exists(direc): + os.makedirs(direc) + else: + rm_rf(direc) + create_dir(direc) + + +#################### Now make the data generator threadsafe #################### + +class threadsafe_iter: + """Takes an iterator/generator and makes it thread-safe by + serializing call to the `next` method of given iterator/generator. + """ + def __init__(self, it): + self.it = it + self.lock = threading.Lock() + + def __iter__(self): + return self + + def __next__(self): # Py3 + with self.lock: + return next(self.it) + + def next(self): # Py2 + with self.lock: + return self.it.next() + + +def threadsafe_generator(f): + """A decorator that takes a generator function and makes it thread-safe. + """ + def g(*a, **kw): + return threadsafe_iter(f(*a, **kw)) + return g + + + +def format_filename(s): + """ + Take a string and return a valid filename constructed from the string. + Uses a whitelist approach: any characters not present in valid_chars are + removed. Also spaces are replaced with underscores. + + Note: this method may produce invalid filenames such as ``, `.` or `..` + When I use this method I prepend a date string like '2009_01_15_19_46_32_' + and append a file extension like '.txt', so I avoid the potential of using + an invalid filename. + """ + valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits) + filename = ''.join(c for c in s if c in valid_chars) + filename = filename.replace(' ','_') # I don't like spaces in filenames. + return filename diff --git a/test_siamese.py b/test_siamese.py deleted file mode 100644 index 35980d6..0000000 --- a/test_siamese.py +++ /dev/null @@ -1,30 +0,0 @@ -from speech_siamese import siamese_model -from record_mic_speech import record_spectrogram -from importlib import reload -# import speech_data -# reload(speech_data) -from speech_data import create_test_pair,get_word_pairs_data,speech_data -import numpy as np - -model = siamese_model((15, 1654)) -model.load_weights('./models/siamese_speech_model-final.h5') - -def predict_recording_with(m,sample_size=15): - spec1 = record_spectrogram(n_sec=1.4) - spec2 = record_spectrogram(n_sec=1.4) - inp = create_test_pair(spec1,spec2,sample_size) - return m.predict([inp[:, 0], inp[:, 1]]) - -# while(True): -# print(predict_recording_with(model)) - - -def test_with(audio_group): - X,Y = speech_data(audio_group) - print(np.argmax(model.predict([X[:, 0], X[:, 1]]),axis=1)) - print(Y.astype(np.int8)) - -test_with('rand_edu') -# sunflower_data,sunflower_result = get_word_pairs_data('sweater',15) -# print(np.argmax(model.predict([sunflower_data[:, 0], sunflower_data[:, 1]]),axis=1)) -# print(sunflower_result)