From 82d0398d2c1b099d069ab8040f8f2a74b16d4155 Mon Sep 17 00:00:00 2001 From: Malar Kannan Date: Wed, 25 Oct 2017 13:36:41 +0530 Subject: [PATCH] formatted --- arpabet-to-apple.py | 55 --------------- create_triplets.py | 10 --- record_mic_speech.py | 41 +++++++++-- snippets.py | 12 ---- spectro_gen.py | 80 +++++++++++++++------- speech_data.py | 157 ++++++++++++++++++++----------------------- speech_siamese.py | 50 ++++++++------ tts_samplegen.py | 122 +++++++++++++++++++++------------ 8 files changed, 275 insertions(+), 252 deletions(-) delete mode 100644 arpabet-to-apple.py delete mode 100644 create_triplets.py delete mode 100644 snippets.py diff --git a/arpabet-to-apple.py b/arpabet-to-apple.py deleted file mode 100644 index dc542e4..0000000 --- a/arpabet-to-apple.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 - -""" -Convert ARPABET -to Apple's codes -""" - -import sys - - -mapping = {s.split()[0]: s.split()[1] for s in """ -AA AA -AE AE -AH UX -AO AO -AW AW -AY AY -B b -CH C -D d -DH D -EH EH -ER UXr -EY EY -F f -G g -HH h -IH IH -IY IY -JH J -K k -L l -M m -N n -NG N -OW OW -OY OY -P p -R r -S s -SH S -T t -TH T -UH UH -UW UW -V v -W w -Y y -Z z -ZH Z -""".strip().split('\n')} - -arpabet_phonemes = sys.stdin.read().split() -apple_phonemes = [mapping[p.upper()] for p in arpabet_phonemes] -print('[[inpt PHON]] ' + ''.join(apple_phonemes)) diff --git a/create_triplets.py b/create_triplets.py deleted file mode 100644 index ff1c04f..0000000 --- a/create_triplets.py +++ /dev/null @@ -1,10 +0,0 @@ -import pandas as pd - -audio_file = pd.read_csv('./outputs/audio.csv',names=['word','voice','rate','type','filename']) -word_goups = audio_file.groupby('word') -# audio -lst = [1, 2, 3, 1, 2, 3] -s = pd.Series([1, 2, 3, 10, 20, 30], lst) -df3 = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]}) - -s.groupby(level=0).sum() diff --git a/record_mic_speech.py b/record_mic_speech.py index fb2b63e..8ffacb4 100644 --- a/record_mic_speech.py +++ b/record_mic_speech.py @@ -2,14 +2,24 @@ import pyaudio import numpy as np from matplotlib import pyplot as plt -CHUNKSIZE = 1024 # fixed chunk size +CHUNKSIZE = 44100 * 10 # fixed chunk size # initialize portaudio -p = pyaudio.PyAudio() -stream = p.open(format=pyaudio.paInt16, channels=1, rate=44100, input=True, frames_per_buffer=CHUNKSIZE) +p_inp = pyaudio.PyAudio() +# dev_n = p.get_device_count() +# dev_infos = [p.get_device_info_by_index(index) for index in range(dev_n)] +# [i for i in dev_infos] # if i['name'] == 'record'] +stream = p_inp.open( + format=pyaudio.paInt24, + channels=2, + rate=44100, + input=True, + frames_per_buffer=CHUNKSIZE) # do this as long as you want fresh samples data = stream.read(CHUNKSIZE) +len(data) +CHUNKSIZE*10 numpydata = np.fromstring(data, dtype=np.int16) # plot data @@ -19,4 +29,27 @@ plt.show() # close stream stream.stop_stream() stream.close() -p.terminate() +p_inp.terminate() +# open the file for reading. +# wf = wave.open(sys.argv[1], 'rb') + +# create an audio object +# p = pyaudio.PyAudio() + +# open stream based on the wave object which has been input. +p_oup = pyaudio.PyAudio() +stream = p_oup.open( + format=pyaudio.paInt24, channels=2, rate=44100, output=True) + +# read data (based on the chunk size) +# data = wf.readframes(CHUNKSIZE) + +# play stream (looping from beginning of file to the end) +# while data != '': +# writing to the stream is what *actually* plays the sound. +stream.write(data) +# data = wf.readframes(chunk) + +# cleanup stuff. 
+stream.close() +p_oup.terminate() diff --git a/snippets.py b/snippets.py deleted file mode 100644 index 5726d21..0000000 --- a/snippets.py +++ /dev/null @@ -1,12 +0,0 @@ -# import scipy.signal as sg -# import pysndfile.sndio as snd -# -# snd_data,samples,_ = snd.read('./outputs/sunflowers-Alex-150-normal-589.aiff') -# samples_per_seg = 3*int(samples*150/(3*1000)) -# # samples/(len(snd_data)*1000.0) -# len(snd_data) -# samples_per_seg/2 -# -# len(sg.spectrogram(snd_data,nperseg=samples_per_seg,noverlap=samples_per_seg/3)[2]) -# -# from spectro_gen import generate_aiff_spectrogram diff --git a/spectro_gen.py b/spectro_gen.py index 6a08fbf..c0da090 100644 --- a/spectro_gen.py +++ b/spectro_gen.py @@ -1,6 +1,7 @@ #!/usr/bin/env python -#coding: utf-8 -""" This work is licensed under a Creative Commons Attribution 3.0 Unported License. + +""" This work is licensed under a Creative Commons Attribution 3.0 Unported + License. Frank Zalkow, 2012-2013 http://www.frank-zalkow.de/en/code-snippets/create-audio-spectrograms-with-python.html?i=1 """ @@ -9,8 +10,9 @@ import numpy as np from matplotlib import pyplot as plt from pysndfile import sndio as snd from numpy.lib import stride_tricks - """ short time fourier transform of audio signal """ + + def stft(sig, frameSize, overlapFac=0.5, window=np.hanning): win = window(frameSize) hopSize = int(frameSize - np.floor(overlapFac * frameSize)) @@ -18,82 +20,103 @@ def stft(sig, frameSize, overlapFac=0.5, window=np.hanning): # zeros at beginning (thus center of 1st window should be for sample nr. 0) # sig = (sig*255).astype(np.uint8) # import pdb;pdb.set_trace() - count = int(np.floor(frameSize/2.0)) + count = int(np.floor(frameSize / 2.0)) # import pdb;pdb.set_trace() samples = np.append(np.zeros(count), sig) # cols for windowing - cols = int(np.ceil( (len(samples) - frameSize) / float(hopSize)) + 1) + cols = int(np.ceil((len(samples) - frameSize) / float(hopSize)) + 1) # zeros at end (thus samples can be fully covered by frames) samples = np.append(samples, np.zeros(frameSize)) - frames = stride_tricks.as_strided(samples, shape=(cols, frameSize), strides=(samples.strides[0]*hopSize, samples.strides[0])).copy() + frames = stride_tricks.as_strided( + samples, + shape=(cols, frameSize), + strides=(samples.strides[0] * hopSize, samples.strides[0])).copy() frames *= win return np.fft.rfft(frames) + """ scale frequency axis logarithmically """ + + def logscale_spec(spec, sr=44100, factor=20.): timebins, freqbins = np.shape(spec) - scale = np.linspace(0, 1, freqbins) ** factor - scale *= (freqbins-1)/max(scale) + scale = np.linspace(0, 1, freqbins)**factor + scale *= (freqbins - 1) / max(scale) scale = np.unique(np.round(scale)).astype(np.uint32) # import pdb;pdb.set_trace() # create spectrogram with new freq bins newspec = np.complex128(np.zeros([timebins, len(scale)])) for i in range(0, len(scale)): - if i == len(scale)-1: - newspec[:,i] = np.sum(spec[:,scale[i]:], axis=1) + if i == len(scale) - 1: + newspec[:, i] = np.sum(spec[:, scale[i]:], axis=1) else: - newspec[:,i] = np.sum(spec[:,scale[i]:scale[i+1]], axis=1) + newspec[:, i] = np.sum(spec[:, scale[i]:scale[i + 1]], axis=1) # list center freq of bins - allfreqs = np.abs(np.fft.fftfreq(freqbins*2, 1./sr)[:freqbins+1]) + allfreqs = np.abs(np.fft.fftfreq(freqbins * 2, 1. 
/ sr)[:freqbins + 1]) freqs = [] for i in range(0, len(scale)): - if i == len(scale)-1: + if i == len(scale) - 1: freqs += [np.mean(allfreqs[scale[i]:])] else: - freqs += [np.mean(allfreqs[scale[i]:scale[i+1]])] + freqs += [np.mean(allfreqs[scale[i]:scale[i + 1]])] return newspec, freqs + """ generate spectrogram for aiff audio with 150ms windows and 50ms overlap""" + + def generate_aiff_spectrogram(audiopath): - samples,samplerate,_ = snd.read(audiopath) + samples, samplerate, _ = snd.read(audiopath) # samplerate, samples = wav.read(audiopath) # s = stft(samples, binsize) - s = stft(samples, samplerate*150//1000,1.0/3) + s = stft(samples, samplerate * 150 // 1000, 1.0 / 3) sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate) - ims = 20.*np.log10(np.abs(sshow)/10e-6) + ims = 20. * np.log10(np.abs(sshow) / 10e-6) return ims + """ plot spectrogram""" + + def plotstft(audiopath, binsize=2**10, plotpath=None, colormap="jet"): - samples,samplerate,_ = snd.read(audiopath) + samples, samplerate, _ = snd.read(audiopath) # samplerate, samples = wav.read(audiopath) # s = stft(samples, binsize) # print(samplerate*150//1000) - s = stft(samples, samplerate*150//1000,1.0/3) + s = stft(samples, samplerate * 150 // 1000, 1.0 / 3) sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate) - ims = 20.*np.log10(np.abs(sshow)/10e-6) # amplitude to decibel + ims = 20. * np.log10(np.abs(sshow) / 10e-6) # amplitude to decibel timebins, freqbins = np.shape(ims) # import pdb;pdb.set_trace() plt.figure(figsize=(15, 7.5)) - plt.imshow(np.transpose(ims), origin="lower", aspect="auto", cmap=colormap, interpolation="none") + plt.imshow( + np.transpose(ims), + origin="lower", + aspect="auto", + cmap=colormap, + interpolation="none") plt.colorbar() plt.xlabel("time (s)") plt.ylabel("frequency (hz)") - plt.xlim([0, timebins-1]) + plt.xlim([0, timebins - 1]) plt.ylim([0, freqbins]) - xlocs = np.float32(np.linspace(0, timebins-1, 5)) - plt.xticks(xlocs, ["%.02f" % l for l in ((xlocs*len(samples)/timebins)+(0.5*binsize))/samplerate]) - ylocs = np.int16(np.round(np.linspace(0, freqbins-1, 10))) + xlocs = np.float32(np.linspace(0, timebins - 1, 5)) + plt.xticks(xlocs, [ + "%.02f" % l + for l in ( + (xlocs * len(samples) / timebins) + (0.5 * binsize)) / samplerate + ]) + ylocs = np.int16(np.round(np.linspace(0, freqbins - 1, 10))) plt.yticks(ylocs, ["%.02f" % freq[i] for i in ylocs]) if plotpath: @@ -103,6 +126,13 @@ def plotstft(audiopath, binsize=2**10, plotpath=None, colormap="jet"): plt.clf() + +snd.get_info('./outputs/sunflowers-Alex-150-normal-589.aiff') +snd_data_arr = snd.read('./outputs/sunflowers-Alex-150-normal-589.aiff')[0] +snd_data = snd_data_arr.tobytes() +snd_data_arr.dtype +len(snd_data) + if __name__ == '__main__': plotstft('./outputs/sunflowers-Alex-150-normal-589.aiff') plotstft('./outputs/sunflowers-Alex-180-normal-4763.aiff') diff --git a/speech_data.py b/speech_data.py index f1a8c94..6ed90f6 100644 --- a/speech_data.py +++ b/speech_data.py @@ -3,135 +3,124 @@ import numpy as np from spectro_gen import generate_aiff_spectrogram from sklearn.model_selection import train_test_split import itertools -import pickle,gc +import gc -def sunflower_data(): - audio_samples = pd.read_csv('./outputs/audio.csv',names=['word','voice','rate','variant','file']) - sunflowers = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True) - sunflowers.loc[:,'file'] = sunflowers.loc[:,'file'].apply(lambda x:'outputs/'+x).apply(generate_aiff_spectrogram) - y_data = sunflowers['variant'].apply(lambda 
x:x=='normal').values - max_samples = sunflowers['file'].apply(lambda x:x.shape[0]).max() - sample_size = sunflowers['file'][0].shape[1] - sample_count = sunflowers['file'].shape[0] - sunflowers['file'][0].shape[0] - def append_zeros(spgr): - orig = spgr.shape[0] - return np.lib.pad(spgr,[(0, max_samples-orig), (0,0)],'median') - pad_sun = sunflowers['file'].apply(append_zeros).values - x_data = np.vstack(pad_sun).reshape((sample_count,max_samples,sample_size,)) - return train_test_split(x_data,y_data,test_size=0.33) -def get_siamese_pairs(groupF1,groupF2): - group1 = [r for (i,r) in groupF1.iterrows()] - group2 = [r for (i,r) in groupF2.iterrows()] - f = [(g1,g2) for g2 in group2 for g1 in group1] - t = [i for i in itertools.combinations(group1,2)]+[i for i in itertools.combinations(group2,2)] - return (t,f) +def get_siamese_pairs(groupF1, groupF2): + group1 = [r for (i, r) in groupF1.iterrows()] + group2 = [r for (i, r) in groupF2.iterrows()] + f = [(g1, g2) for g2 in group2 for g1 in group1] + t = [i for i in itertools.combinations(group1, 2) + ] + [i for i in itertools.combinations(group2, 2)] + return (t, f) + def sunflower_pairs_data(): - audio_samples = pd.read_csv('./outputs/audio.csv',names=['word','voice','rate','variant','file']) - audio_samples = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True) - audio_samples.loc[:,'spectrogram'] = audio_samples.loc[:,'file'].apply(lambda x:'outputs/audio/'+x).apply(generate_aiff_spectrogram) - max_samples = audio_samples['spectrogram'].apply(lambda x:x.shape[0]).max() - sample_size = audio_samples['spectrogram'][0].shape[1] - same_data,diff_data = [],[] - for (w,g) in audio_samples.groupby(audio_samples['word']): + audio_samples = pd.read_csv( + './outputs/audio.csv', + names=['word', 'voice', 'rate', 'variant', 'file']) + audio_samples = audio_samples.loc[audio_samples['word'] == + 'sunflowers'].reset_index(drop=True) + audio_samples.loc[:, 'spectrogram'] = audio_samples.loc[:, 'file'].apply( + lambda x: 'outputs/audio/' + x).apply(generate_aiff_spectrogram) + max_samples = audio_samples['spectrogram'].apply( + lambda x: x.shape[0]).max() + same_data, diff_data = [], [] + for (w, g) in audio_samples.groupby(audio_samples['word']): sample_norm = g.loc[audio_samples['variant'] == 'normal'] sample_phon = g.loc[audio_samples['variant'] == 'phoneme'] - same , diff = get_siamese_pairs(sample_norm,sample_phon) + same, diff = get_siamese_pairs(sample_norm, sample_phon) same_data.extend(same) diff_data.extend(diff) - Y = np.hstack([np.ones(len(same_data)),np.zeros(len(diff_data))]) - X_sample_pairs = same_data+diff_data + Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))]) + X_sample_pairs = same_data + diff_data + def append_zeros(spgr): - sample = np.lib.pad(spgr,[(0, max_samples-spgr.shape[0]), (0,0)],'median') - return np.expand_dims(sample,axis=0) + sample = np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)], + 'median') + return np.expand_dims(sample, axis=0) + def create_X(sp): # sample_count = sp[0]['file'].shape[0] l_sample = append_zeros(sp[0]['spectrogram']) - r_sample = append_zeros(sp[1]['spectrogram'])#.apply(append_zeros).values - # x_data = np.vstack(pad_sun).reshape((sample_count,max_samples,sample_size)) - return np.expand_dims(np.vstack([l_sample,r_sample]),axis=0) + r_sample = append_zeros( + sp[1]['spectrogram']) + return np.expand_dims(np.vstack([l_sample, r_sample]), axis=0) + X_list = (create_X(sp) for sp in X_sample_pairs) X = np.vstack(X_list) - 
tr_pairs,te_pairs,tr_y,te_y = train_test_split(X,Y,test_size=0.1) - return train_test_split(X,Y,test_size=0.1) + tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1) + return train_test_split(X, Y, test_size=0.1) + def create_spectrogram_data(audio_group='audio'): - audio_samples = pd.read_csv('./outputs/'+audio_group+'.csv',names=['word','voice','rate','variant','file']) - # audio_samples = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True) - audio_samples.loc[:,'spectrogram'] = audio_samples.loc[:,'file'].apply(lambda x:'outputs/'+audio_group+'/'+x).apply(generate_aiff_spectrogram) + audio_samples = pd.read_csv( + './outputs/' + audio_group + '.csv', + names=['word', 'voice', 'rate', 'variant', 'file']) + # audio_samples = audio_samples.loc[audio_samples['word'] == + # 'sunflowers'].reset_index(drop=True) + audio_samples.loc[:, 'spectrogram'] = audio_samples.loc[:, 'file'].apply( + lambda x: 'outputs/' + audio_group + '/' + x).apply( + generate_aiff_spectrogram) audio_samples.to_pickle('outputs/spectrogram.pkl') + def create_speech_pairs_data(audio_group='audio'): audio_samples = pd.read_pickle('outputs/spectrogram.pkl') - max_samples = audio_samples['spectrogram'].apply(lambda x:x.shape[0]).max() - sample_size = audio_samples['spectrogram'][0].shape[1] + max_samples = audio_samples['spectrogram'].apply( + lambda x: x.shape[0]).max() + + # sample_size = audio_samples['spectrogram'][0].shape[1] def append_zeros(spgr): - sample = np.lib.pad(spgr,[(0, max_samples-spgr.shape[0]), (0,0)],'median') + sample = np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)], + 'median') return sample + def create_X(sp): l_sample = append_zeros(sp[0]['spectrogram']) r_sample = append_zeros(sp[1]['spectrogram']) - return np.asarray([l_sample,r_sample]) + return np.asarray([l_sample, r_sample]) print('generating siamese speech pairs') - same_data,diff_data = [],[] - for (w,g) in audio_samples.groupby(audio_samples['word']): - sample_norm = g.loc[audio_samples['variant'] == 'normal']#.reset_index(drop=True) - sample_phon = g.loc[audio_samples['variant'] == 'phoneme']#.reset_index(drop=True) - same , diff = get_siamese_pairs(sample_norm,sample_phon) + same_data, diff_data = [], [] + for (w, g) in audio_samples.groupby(audio_samples['word']): + sample_norm = g.loc[audio_samples['variant'] == 'normal'] + sample_phon = g.loc[audio_samples['variant'] == 'phoneme'] + same, diff = get_siamese_pairs(sample_norm, sample_phon) same_data.extend([create_X(s) for s in same[:10]]) diff_data.extend([create_X(d) for d in diff[:10]]) print('creating all speech pairs') - Y = np.hstack([np.ones(len(same_data)),np.zeros(len(diff_data))]) + Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))]) print('casting as array speech pairs') - X = np.asarray(same_data+diff_data) + X = np.asarray(same_data + diff_data) print('pickling X/Y') - np.save('outputs/X.npy',X) - np.save('outputs/Y.npy',Y) - del X + np.save('outputs/X.npy', X) + np.save('outputs/Y.npy', Y) + del same_data + del diff_data gc.collect() print('train/test splitting speech pairs') - tr_pairs,te_pairs,tr_y,te_y = train_test_split(X,Y,test_size=0.1) + tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1) print('pickling train/test') - np.save('outputs/tr_pairs.npy',tr_pairs) - np.save('outputs/te_pairs.npy',te_pairs) - np.save('outputs/tr_y.npy',tr_y) - np.save('outputs/te_y.npy',te_y) + np.save('outputs/tr_pairs.npy', tr_pairs) + np.save('outputs/te_pairs.npy', te_pairs) + 
np.save('outputs/tr_y.npy', tr_y) + np.save('outputs/te_y.npy', te_y) -# def create_speech_model_data(): -# (max_samples,sample_size) = pickle.load(open('./spectrogram_vars.pkl','rb')) -# x_data_pos = np.load('outputs/x_data_pos.npy') -# x_data_neg = np.load('outputs/x_data_neg.npy') -# x_pos_train, x_pos_test, x_neg_train, x_neg_test =train_test_split(x_data_pos,x_data_neg,test_size=0.1) -# del x_data_pos -# del x_data_neg -# gc.collect() -# print('split train and test') -# tr_y = np.array(x_pos_train.shape[0]*[1]) -# te_y = np.array(x_pos_test.shape[0]*[[1,0]]) -# tr_pairs = np.array([x_pos_train,x_neg_train]).reshape(x_pos_train.shape[0],2,max_samples,sample_size) -# te_pairs = np.array([x_pos_test,x_neg_test]).reshape(x_pos_test.shape[0],2,max_samples,sample_size) -# print('reshaped to input dim') -# np.save('outputs/tr_pairs.npy',tr_pairs) -# np.save('outputs/te_pairs.npy',te_pairs) -# np.save('outputs/tr_y.npy',tr_y) -# np.save('outputs/te_y.npy',te_y) -# print('pickled speech model data') def speech_model_data(): - tr_pairs = np.load('outputs/tr_pairs.npy')/255.0 - te_pairs = np.load('outputs/te_pairs.npy')/255.0 + tr_pairs = np.load('outputs/tr_pairs.npy') / 255.0 + te_pairs = np.load('outputs/te_pairs.npy') / 255.0 tr_pairs[tr_pairs < 0] = 0 te_pairs[te_pairs < 0] = 0 tr_y = np.load('outputs/tr_y.npy') te_y = np.load('outputs/te_y.npy') - return tr_pairs,te_pairs,tr_y,te_y + return tr_pairs, te_pairs, tr_y, te_y + if __name__ == '__main__': # sunflower_pairs_data() - #create_spectrogram_data() + # create_spectrogram_data() create_speech_pairs_data() # print(speech_model_data()) diff --git a/speech_siamese.py b/speech_siamese.py index 3eeed4c..c632fe4 100644 --- a/speech_siamese.py +++ b/speech_siamese.py @@ -1,4 +1,3 @@ - from __future__ import absolute_import from __future__ import print_function import numpy as np @@ -12,8 +11,8 @@ from keras import backend as K def euclidean_distance(vects): x, y = vects - return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), - K.epsilon())) + return K.sqrt( + K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon())) def eucl_dist_output_shape(shapes): @@ -79,31 +78,44 @@ input_b = Input(shape=input_dim) processed_a = base_network(input_a) processed_b = base_network(input_b) -distance = Lambda(euclidean_distance, - output_shape=eucl_dist_output_shape)( - [processed_a, processed_b] -) +distance = Lambda( + euclidean_distance, + output_shape=eucl_dist_output_shape)([processed_a, processed_b]) model = Model([input_a, input_b], distance) -tb_cb = TensorBoard(log_dir='./logs/siamese_logs', histogram_freq=1, - batch_size=32, write_graph=True, write_grads=True, - write_images=True, embeddings_freq=0, - embeddings_layer_names=None, embeddings_metadata=None) +tb_cb = TensorBoard( + log_dir='./logs/siamese_logs', + histogram_freq=1, + batch_size=32, + write_graph=True, + write_grads=True, + write_images=True, + embeddings_freq=0, + embeddings_layer_names=None, + embeddings_metadata=None) cp_file_fmt = './models/siamese_speech_model-{epoch:02d}-epoch-{val_acc:0.2f}\ -acc.h5' -cp_cb = ModelCheckpoint(cp_file_fmt, monitor='val_acc', verbose=0, - save_best_only=False, save_weights_only=False, - mode='auto', period=1) + +cp_cb = ModelCheckpoint( + cp_file_fmt, + monitor='val_acc', + verbose=0, + save_best_only=False, + save_weights_only=False, + mode='auto', + period=1) # train rms = RMSprop(lr=0.001) sgd = SGD(lr=0.001) model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy]) -model.fit([tr_pairs[:, 0], 
tr_pairs[:, 1]], tr_y, - batch_size=128, - epochs=50, - validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y), - callbacks=[tb_cb, cp_cb]) +model.fit( + [tr_pairs[:, 0], tr_pairs[:, 1]], + tr_y, + batch_size=128, + epochs=50, + validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y), + callbacks=[tb_cb, cp_cb]) model.save('./models/siamese_speech_model-final.h5') # compute final accuracy on training and test sets diff --git a/tts_samplegen.py b/tts_samplegen.py index e9c9282..022a16a 100644 --- a/tts_samplegen.py +++ b/tts_samplegen.py @@ -1,29 +1,42 @@ import objc -from AppKit import NSSpeechSynthesizer,NSSpeechInputModeProperty,NSSpeechModePhoneme -from Foundation import NSURL,NSError,NSObject +from AppKit import NSSpeechSynthesizer, NSSpeechInputModeProperty +from AppKit import NSSpeechModePhoneme +from Foundation import NSURL import json import random import os import re import subprocess - OUTPUT_NAME = 'audio' -dest_dir = os.path.abspath('.')+'/outputs/'+OUTPUT_NAME+'/' -dest_file = './outputs/'+OUTPUT_NAME+'.csv' +dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/' +dest_file = './outputs/' + OUTPUT_NAME + '.csv' + + def create_dir(direc): if not os.path.exists(direc): os.mkdir(direc) -dest_filename = lambda n,v,r,t: '{}-{}-{}-{}-'.format(n,v,r,t)+str(random.randint(0,10000))+'.aiff' -dest_path = lambda v,r,n: dest_dir+v+'/'+r+'/'+n -dest_url = lambda p: NSURL.fileURLWithPath_(p) -def cli_gen_audio(speech_cmd,rate,voice,out_path): - subprocess.call(['say','-v',voice,'-r',str(rate),'-o',out_path,speech_cmd]) + +def dest_filename(n, v, r, t): + return '{}-{}-{}-{}-'.format(n, v, r, + t) + str(random.randint(0, 10000)) + '.aiff' + + +def dest_path(v, r, n): + return dest_dir + v + '/' + r + '/' + n + + +def cli_gen_audio(speech_cmd, rate, voice, out_path): + subprocess.call( + ['say', '-v', voice, '-r', + str(rate), '-o', out_path, speech_cmd]) + class SynthFile(object): """docstring for SynthFile.""" - def __init__(self,word,phon, filename,voice,rate,operation): + + def __init__(self, word, phon, filename, voice, rate, operation): super(SynthFile, self).__init__() self.word = word self.phoneme = phon @@ -33,91 +46,114 @@ class SynthFile(object): self.variant = operation def get_json(self): - return {'filename':self.filename,'voice':self.voice, - 'rate':self.rate,'operation':self.operation} + return { + 'filename': self.filename, + 'voice': self.voice, + 'rate': self.rate, + 'operation': self.operation + } def get_csv(self): - return '{},{},{},{},{}\n'.format(self.word,self.phoneme,self.voice,self.rate,self.variant,self.filename) + return '{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice, + self.rate, self.variant, + self.filename) + class SynthVariant(object): """docstring for SynthVariant.""" - def __init__(self,identifier,rate): + + def __init__(self, identifier, rate): super(SynthVariant, self).__init__() self.synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier) self.synth.setVolume_(100) self.synth.setRate_(rate) - self.phone_synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier) + self.phone_synth = NSSpeechSynthesizer.alloc().initWithVoice_( + identifier) self.phone_synth.setVolume_(100) self.phone_synth.setRate_(rate) - self.phone_synth.setObject_forProperty_error_(NSSpeechModePhoneme,NSSpeechInputModeProperty,None) + self.phone_synth.setObject_forProperty_error_( + NSSpeechModePhoneme, NSSpeechInputModeProperty, None) self.identifier = identifier self.rate = rate self.name = identifier.split('.')[-1] def __repr__(self): - return 
'Synthesizer[{} - {}]({})'.format(self.name,self.rate) + return 'Synthesizer[{} - {}]({})'.format(self.name, self.rate) - def generate_audio(self,word,variant): - orig_phon,phoneme,phon_cmd = self.synth.phonemesFromText_(word),'',word + def generate_audio(self, word, variant): + orig_phon, phoneme, phon_cmd = self.synth.phonemesFromText_( + word), '', word if variant == 'low': # self.synth.startSpeakingString_toURL_(word,d_url) phoneme = orig_phon elif variant == 'medium': - phoneme = re.sub('[0-9]','',orig_phon) - phon_cmd = '[[inpt PHON]] '+phoneme + phoneme = re.sub('[0-9]', '', orig_phon) + phon_cmd = '[[inpt PHON]] ' + phoneme elif variant == 'high': phoneme = orig_phon phon_cmd = word # elif variant == 'long': - # if phon != '': - # self.phone_synth.startSpeakingString_toURL_(phon,d_url) - # else: - # self.synth.startSpeakingString_toURL_(word,d_url) - fname = dest_filename(word,phoneme,self.name,self.rate) - d_path = dest_path(self.name,self.rate,fname) - d_url = dest_url(d_path) - cli_gen_audio(phon_cmd,self.rate,self.name,d_path) - return SynthFile(word,phoneme,fname,self.name,self.rate,variant) + # if phon != '': + # self.phone_synth.startSpeakingString_toURL_(phon,d_url) + # else: + # self.synth.startSpeakingString_toURL_(word,d_url) + fname = dest_filename(word, phoneme, self.name, self.rate) + d_path = dest_path(self.name, self.rate, fname) + # d_url = NSURL.fileURLWithPath_(d_path) + cli_gen_audio(phon_cmd, self.rate, self.name, d_path) + return SynthFile(word, phoneme, fname, self.name, self.rate, variant) def synth_generator(): voices_installed = NSSpeechSynthesizer.availableVoices() - voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed] - us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs if v['VoiceLanguage'] == 'en-US' and v['VoiceIdentifier'].split('.')[-1][0].isupper()] - # us_voices_ids = ['com.apple.speech.synthesis.voice.Fred','com.apple.speech.synthesis.voice.Alex', + voice_attrs = [ + NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed + ] + us_voices_ids = [ + v['VoiceIdentifier'] for v in voice_attrs + if v['VoiceLanguage'] == 'en-US' + and v['VoiceIdentifier'].split('.')[-1][0].isupper() + ] + # us_voices_ids = ['com.apple.speech.synthesis.voice.Fred', + # 'com.apple.speech.synthesis.voice.Alex', # 'com.apple.speech.synthesis.voice.Victoria'] # voice_rates = list(range(150,221,(220-180)//4)) - voice_rates = [150,180,210,250] + voice_rates = [150, 180, 210, 250] voice_synths = [] create_dir(dest_dir) for v in us_voices_ids: for r in voice_rates: - create_dir(dest_dir+v+'/'+r) - voice_synths.append(SynthVariant(v,r)) + create_dir(dest_dir + v + '/' + r) + voice_synths.append(SynthVariant(v, r)) + def synth_for_words(words): all_synths = [] for w in words: for s in voice_synths: - for v in ['low','medium','high']: - all_synths.append(s.generate_audio(w,v)) + for v in ['low', 'medium', 'high']: + all_synths.append(s.generate_audio(w, v)) return all_synths + return synth_for_words -def write_synths(synth_list,fname,csv=False): - f = open(fname,'w') + +def write_synths(synth_list, fname, csv=False): + f = open(fname, 'w') if csv: for s in synth_list: f.write(s.get_csv()) else: - json.dump([s.get_json() for s in synth_list],f) + json.dump([s.get_json() for s in synth_list], f) f.close() + def generate_audio_for_stories(): stories_data = json.load(open('./inputs/all_stories_hs.json')) word_list = [t[0] for i in stories_data.values() for t in i] words_audio_synth = synth_generator() return 
words_audio_synth(word_list) + # words_audio_synth = synth_generator() # synth = NSSpeechSynthesizer.alloc().init() # voices_installed = NSSpeechSynthesizer.availableVoices() @@ -131,5 +167,5 @@ def generate_audio_for_stories(): synths = synth_generator()([OUTPUT_NAME]) # synths = generate_audio_for_stories() -write_synths(synths,dest_file,True) +write_synths(synths, dest_file, True) # write_synths(synths,'./outputs/synths.json')
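
The modules touched by this patch form a small pipeline: tts_samplegen.py drives the macOS `say` voices to produce AIFF samples plus outputs/audio.csv, spectro_gen.py turns each AIFF into a log-scaled STFT spectrogram, speech_data.py pads the spectrograms and builds same/different siamese pairs, and speech_siamese.py fits the contrastive-loss model on those pairs. A minimal usage sketch follows, assuming the patched modules are importable from the repository root and that outputs/audio.csv and the generated AIFF files already exist; the entry points, file paths, and return values are taken from the diffs above, and nothing in the sketch is part of the commit itself.

# sketch: hedged end-to-end usage example, not part of the patch
from speech_data import (create_spectrogram_data, create_speech_pairs_data,
                         speech_model_data)

# 1. cache one spectrogram per audio file into outputs/spectrogram.pkl
create_spectrogram_data(audio_group='audio')

# 2. build padded same/different pairs and write the train/test .npy files
create_speech_pairs_data(audio_group='audio')

# 3. reload the scaled pairs in the shape speech_siamese.py expects
tr_pairs, te_pairs, tr_y, te_y = speech_model_data()
print(tr_pairs.shape, tr_y.shape)  # (n_pairs, 2, max_samples, sample_size), (n_pairs,)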