diff --git a/.gitignore b/.gitignore
index 6bc0dad..ac08111 100644
--- a/.gitignore
+++ b/.gitignore
@@ -138,3 +138,4 @@ Temporary Items
 # End of https://www.gitignore.io/api/macos
 outputs/*
+inputs/mnist
diff --git a/TODO.md b/TODO.md
new file mode 100644
index 0000000..7cd8d97
--- /dev/null
+++ b/TODO.md
@@ -0,0 +1,4 @@
+0. generate samples of phoneme-similarity variants.
+1. create spectrograms of 150ms windows with 50ms overlap for each word.
+2. train an RNN to output a vector from the spectrograms.
+3. train an NN to output True/False based on the acceptability of the RNN output -> Siamese network (implementation detail).
diff --git a/create_triplets.py b/create_triplets.py
index 068f3a0..ff1c04f 100644
--- a/create_triplets.py
+++ b/create_triplets.py
@@ -6,5 +6,5 @@ word_goups = audio_file.groupby('word')
 lst = [1, 2, 3, 1, 2, 3]
 s = pd.Series([1, 2, 3, 10, 20, 30], lst)
 df3 = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})
-df3
+s.groupby(level=0).sum()
diff --git a/generate_similar.py b/generate_similar.py
index 6c4145d..8a8aada 100644
--- a/generate_similar.py
+++ b/generate_similar.py
@@ -38,11 +38,40 @@ UW UW
 V v
 W w
 Y y
+X x
 Z z
 ZH Z
 """.strip().split('\n')}
-mapping
 sim_mat = pd.read_csv('./similarity.csv',header=0,index_col=0)
-[mapping[re.sub('[0-9]','',i)] for i in sim_mat.index.tolist()]
-# sim_mat.loc
+
+def convert_ph(ph):
+    # Split a stressed phoneme like "AE1" into symbol + stress digit and
+    # translate the symbol to its Apple equivalent via `mapping`.
+    stress_level = re.search("(\w+)([0-9])",ph)
+    if stress_level:
+        return stress_level.group(2)+mapping[stress_level.group(1)]
+    else:
+        return mapping[ph]
+
+def sim_mat_to_apple_table(smt):
+    # Mirror the triangular similarity matrix, set self-similarity to 100,
+    # and flatten it into a (query, result, score) lookup table.
+    colnames = [convert_ph(ph) for ph in smt.index.tolist()]
+    smt = pd.DataFrame(np.nan_to_num(smt.values))
+    fsmt = (smt.T+smt)
+    np.fill_diagonal(fsmt.values,100.0)
+    asmt = pd.DataFrame.copy(fsmt)
+    asmt.columns = colnames
+    asmt.index = colnames
+    apple_sim_lookup = asmt.stack().reset_index()
+    apple_sim_lookup.columns = ['q','r','s']
+    return apple_sim_lookup
+
+apple_sim_lookup = sim_mat_to_apple_table(sim_mat)
+
+def top_match(ph):
+    # Best-scoring distinct phoneme with similarity in [70, 100); falls back
+    # to the input phoneme when nothing is close enough.
+    selected = apple_sim_lookup[(apple_sim_lookup.q == ph) & (apple_sim_lookup.s < 100) & (apple_sim_lookup.s >= 70)]
+    tm = ph
+    if len(selected) > 0:
+        tm = pd.DataFrame.sort_values(selected,'s',ascending=False).iloc[0].r
+    return tm
+
+def similar_phoneme(ph_str):
+    return ph_str
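Note that `similar_phoneme` is still a stub in this commit: it returns its input unchanged. A hypothetical completion using the `top_match` helper above; the whitespace tokenisation of the PHON string is an assumption for illustration, since real strings come from `phonemesFromText_`:

```python
# Hypothetical completion of similar_phoneme (not part of this commit):
# swap each phoneme for its closest acceptable neighbour via top_match,
# assuming whitespace-separated phoneme tokens.
def similar_phoneme_sketch(ph_str):
    return ' '.join(top_match(ph) for ph in ph_str.split())
```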
+ """ + + def BiRNN(self, x, dropout, scope, embedding_size, sequence_length): + n_input=embedding_size + n_steps=sequence_length + n_hidden=n_steps + n_layers=3 + # Prepare data shape to match `bidirectional_rnn` function requirements + # Current data input shape: (batch_size, n_steps, n_input) (?, seq_len, embedding_size) + # Required shape: 'n_steps' tensors list of shape (batch_size, n_input) + # Permuting batch_size and n_steps + x = tf.transpose(x, [1, 0, 2]) + # Reshape to (n_steps*batch_size, n_input) + x = tf.reshape(x, [-1, n_input]) + print(x) + # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input) + x = tf.split(x, n_steps, 0) + print(x) + # Define lstm cells with tensorflow + # Forward direction cell + with tf.name_scope("fw"+scope),tf.variable_scope("fw"+scope): + stacked_rnn_fw = [] + for _ in range(n_layers): + fw_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True) + lstm_fw_cell = tf.contrib.rnn.DropoutWrapper(fw_cell,output_keep_prob=dropout) + stacked_rnn_fw.append(lstm_fw_cell) + lstm_fw_cell_m = tf.nn.rnn_cell.MultiRNNCell(cells=stacked_rnn_fw, state_is_tuple=True) + + with tf.name_scope("bw"+scope),tf.variable_scope("bw"+scope): + stacked_rnn_bw = [] + for _ in range(n_layers): + bw_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True) + lstm_bw_cell = tf.contrib.rnn.DropoutWrapper(bw_cell,output_keep_prob=dropout) + stacked_rnn_bw.append(lstm_bw_cell) + lstm_bw_cell_m = tf.nn.rnn_cell.MultiRNNCell(cells=stacked_rnn_bw, state_is_tuple=True) + # Get lstm cell output + + with tf.name_scope("bw"+scope),tf.variable_scope("bw"+scope): + outputs, _, _ = tf.nn.static_bidirectional_rnn(lstm_fw_cell_m, lstm_bw_cell_m, x, dtype=tf.float32) + return outputs[-1] + + def contrastive_loss(self, y,d,batch_size): + tmp= y *tf.square(d) + #tmp= tf.mul(y,tf.square(d)) + tmp2 = (1-y) *tf.square(tf.maximum((1 - d),0)) + return tf.reduce_sum(tmp +tmp2)/batch_size/2 + + def __init__( + self, sequence_length, vocab_size, embedding_size, hidden_units, l2_reg_lambda, batch_size): + + # Placeholders for input, output and dropout + self.input_x1 = tf.placeholder(tf.int32, [None, sequence_length], name="input_x1") + self.input_x2 = tf.placeholder(tf.int32, [None, sequence_length], name="input_x2") + self.input_y = tf.placeholder(tf.float32, [None], name="input_y") + self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") + + # Keeping track of l2 regularization loss (optional) + l2_loss = tf.constant(0.0, name="l2_loss") + + # Embedding layer + with tf.name_scope("embedding"): + self.W = tf.Variable( + tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), + trainable=True,name="W") + self.embedded_chars1 = tf.nn.embedding_lookup(self.W, self.input_x1) + #self.embedded_chars_expanded1 = tf.expand_dims(self.embedded_chars1, -1) + self.embedded_chars2 = tf.nn.embedding_lookup(self.W, self.input_x2) + #self.embedded_chars_expanded2 = tf.expand_dims(self.embedded_chars2, -1) + + # Create a convolution + maxpool layer for each filter size + with tf.name_scope("output"): + self.out1=self.BiRNN(self.embedded_chars1, self.dropout_keep_prob, "side1", embedding_size, sequence_length) + self.out2=self.BiRNN(self.embedded_chars2, self.dropout_keep_prob, "side2", embedding_size, sequence_length) + self.distance = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(self.out1,self.out2)),1,keep_dims=True)) + self.distance = tf.div(self.distance, 
diff --git a/snippets.py b/snippets.py
new file mode 100644
index 0000000..5726d21
--- /dev/null
+++ b/snippets.py
@@ -0,0 +1,12 @@
+# import scipy.signal as sg
+# import pysndfile.sndio as snd
+#
+# snd_data,samples,_ = snd.read('./outputs/sunflowers-Alex-150-normal-589.aiff')
+# samples_per_seg = 3*int(samples*150/(3*1000))
+# # samples/(len(snd_data)*1000.0)
+# len(snd_data)
+# samples_per_seg/2
+#
+# len(sg.spectrogram(snd_data,nperseg=samples_per_seg,noverlap=samples_per_seg/3)[2])
+#
+# from spectro_gen import generate_aiff_spectrogram
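In the scratch above, `samples` is actually the sample rate returned by `snd.read`, and `samples_per_seg` is rounded down to a multiple of 3 so that the one-third overlap (50 ms of the 150 ms window) stays an integer. A runnable version of the same arithmetic; the 44.1 kHz rate and the random test signal are placeholders for a real AIFF:

```python
import numpy as np
import scipy.signal as sg

rate = 44100
samples_per_seg = 3 * int(rate * 150 // (3 * 1000))  # 150 ms, multiple of 3
noverlap = samples_per_seg // 3                      # one third = 50 ms overlap
f, t, spec = sg.spectrogram(np.random.randn(2 * rate), fs=rate,
                            nperseg=samples_per_seg, noverlap=noverlap)
print(spec.shape)  # (freq bins, time bins); one column per 100 ms hop
```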
diff --git a/spectro_gen.py b/spectro_gen.py
new file mode 100644
index 0000000..7df17e4
--- /dev/null
+++ b/spectro_gen.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python
+#coding: utf-8
+""" This work is licensed under a Creative Commons Attribution 3.0 Unported License.
+    Frank Zalkow, 2012-2013
+    http://www.frank-zalkow.de/en/code-snippets/create-audio-spectrograms-with-python.html?i=1
+"""
+# %matplotlib inline
+import numpy as np
+from matplotlib import pyplot as plt
+from pysndfile import sndio as snd
+from numpy.lib import stride_tricks
+
+""" short time fourier transform of audio signal """
+def stft(sig, frameSize, overlapFac=0.5, window=np.hanning):
+    win = window(frameSize)
+    hopSize = int(frameSize - np.floor(overlapFac * frameSize))
+
+    # zeros at beginning (thus center of 1st window should be for sample nr. 0)
+    # sig = (sig*255).astype(np.uint8)
+    count = int(np.floor(frameSize/2.0))
+    samples = np.append(np.zeros(count), sig)
+    # cols for windowing
+    cols = int(np.ceil( (len(samples) - frameSize) / float(hopSize)) + 1)
+    # zeros at end (thus samples can be fully covered by frames)
+    samples = np.append(samples, np.zeros(frameSize))
+
+    frames = stride_tricks.as_strided(samples, shape=(cols, frameSize), strides=(samples.strides[0]*hopSize, samples.strides[0])).copy()
+    frames *= win
+
+    return np.fft.rfft(frames)
+
+""" scale frequency axis logarithmically """
+def logscale_spec(spec, sr=44100, factor=20.):
+    timebins, freqbins = np.shape(spec)
+
+    scale = np.linspace(0, 1, freqbins) ** factor
+    scale *= (freqbins-1)/max(scale)
+    scale = np.unique(np.round(scale)).astype(np.uint32)
+
+    # create spectrogram with new freq bins
+    newspec = np.complex128(np.zeros([timebins, len(scale)]))
+    for i in range(0, len(scale)):
+        if i == len(scale)-1:
+            newspec[:,i] = np.sum(spec[:,scale[i]:], axis=1)
+        else:
+            newspec[:,i] = np.sum(spec[:,scale[i]:scale[i+1]], axis=1)
+
+    # list center freq of bins
+    allfreqs = np.abs(np.fft.fftfreq(freqbins*2, 1./sr)[:freqbins+1])
+    freqs = []
+    for i in range(0, len(scale)):
+        if i == len(scale)-1:
+            freqs += [np.mean(allfreqs[scale[i]:])]
+        else:
+            freqs += [np.mean(allfreqs[scale[i]:scale[i+1]])]
+
+    return newspec, freqs
+
+""" generate spectrogram for aiff audio with 150ms windows and 50ms overlap """
+def generate_aiff_spectrogram(audiopath):
+    samples,samplerate,_ = snd.read(audiopath)
+    # 150 ms frames; overlapFac=1/3 gives a 100 ms hop, i.e. 50 ms overlap
+    s = stft(samples, samplerate*150/1000, 1.0/3)
+
+    sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
+    ims = 20.*np.log10(np.abs(sshow)/10e-6)  # amplitude to decibel
+    return ims
+
+""" plot spectrogram """
+def plotstft(audiopath, plotpath=None, colormap="jet"):
+    samples,samplerate,_ = snd.read(audiopath)
+    frameSize = samplerate*150/1000
+    s = stft(samples, frameSize, 1.0/3)
+
+    sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
+    ims = 20.*np.log10(np.abs(sshow)/10e-6)  # amplitude to decibel
+
+    timebins, freqbins = np.shape(ims)
+    plt.figure(figsize=(15, 7.5))
+    plt.imshow(np.transpose(ims), origin="lower", aspect="auto", cmap=colormap, interpolation="none")
+    plt.colorbar()
+
+    plt.xlabel("time (s)")
+    plt.ylabel("frequency (hz)")
+    plt.xlim([0, timebins-1])
+    plt.ylim([0, freqbins])
+
+    xlocs = np.float32(np.linspace(0, timebins-1, 5))
+    plt.xticks(xlocs, ["%.02f" % l for l in ((xlocs*len(samples)/timebins)+(0.5*frameSize))/samplerate])
+    ylocs = np.int16(np.round(np.linspace(0, freqbins-1, 10)))
+    plt.yticks(ylocs, ["%.02f" % freq[i] for i in ylocs])
+
+    if plotpath:
+        plt.savefig(plotpath, bbox_inches="tight")
+    else:
+        plt.show()
+
+    plt.clf()
+
+if __name__ == '__main__':
+    plotstft('./outputs/sunflowers-Alex-150-normal-589.aiff')
+    plotstft('./outputs/sunflowers-Alex-180-normal-4763.aiff')
+    plotstft('./outputs/sunflowers-Victoria-180-normal-870.aiff')
+    plotstft('./outputs/sunflowers-Fred-180-phoneme-9733.aiff')
+    plotstft('./outputs/sunflowers-Fred-180-normal-6515.aiff')
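A minimal usage sketch tying `spectro_gen` into step 1 of TODO.md; the file name is one of the samples listed above and must exist locally:

```python
from spectro_gen import generate_aiff_spectrogram

ims = generate_aiff_spectrogram('./outputs/sunflowers-Alex-150-normal-589.aiff')
# One row per 150 ms frame (100 ms hop / 50 ms overlap); columns are
# log-scaled frequency bins in dB.
print(ims.shape)  # (timebins, freqbins)
```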
diff --git a/tts-wav-gen.py b/tts_samplegen.py
similarity index 70%
rename from tts-wav-gen.py
rename to tts_samplegen.py
index 90ac549..e9c9282 100644
--- a/tts-wav-gen.py
+++ b/tts_samplegen.py
@@ -9,38 +9,39 @@ import subprocess
 
 OUTPUT_NAME = 'audio'
 
-def create_output_dir():
-    direc = os.path.abspath('.')+'/outputs/'+OUTPUT_NAME+'/'
+dest_dir = os.path.abspath('.')+'/outputs/'+OUTPUT_NAME+'/'
+dest_file = './outputs/'+OUTPUT_NAME+'.csv'
+def create_dir(direc):
     if not os.path.exists(direc):
-        os.mkdir(direc)
+        os.makedirs(direc)  # makedirs: the voice/rate subdirectories are nested
-create_output_dir()
 
 dest_filename = lambda n,v,r,t: '{}-{}-{}-{}-'.format(n,v,r,t)+str(random.randint(0,10000))+'.aiff'
-dest_path = lambda n: os.path.abspath('.')+'/outputs/'+OUTPUT_NAME+'/'+n
+dest_path = lambda v,r,n: dest_dir+v+'/'+str(r)+'/'+n
 dest_url = lambda p: NSURL.fileURLWithPath_(p)
 
-def cli_gen_audio(word,rate,voice,out_path):
-    subprocess.call(['say','-v',voice,'-r',str(rate),'-o',out_path,word])
+def cli_gen_audio(speech_cmd,rate,voice,out_path):
+    subprocess.call(['say','-v',voice,'-r',str(rate),'-o',out_path,speech_cmd])
 
 class SynthFile(object):
     """docstring for SynthFile."""
-    def __init__(self,word, filename,voice,rate,operation):
+    def __init__(self,word,phon,filename,voice,rate,variant):
         super(SynthFile, self).__init__()
         self.word = word
+        self.phoneme = phon
         self.filename = filename
         self.voice = voice
         self.rate = rate
-        self.operation = operation
+        self.variant = variant
 
     def get_json(self):
-        return {'filename':self.filename,'voice':self.voice, 'rate':self.rate,'operation':self.operation}
+        return {'filename':self.filename,'voice':self.voice, 'rate':self.rate,'variant':self.variant}
 
     def get_csv(self):
-        return '{},{},{},{},{}\n'.format(self.word,self.voice,self.rate,self.operation,self.filename)
+        return '{},{},{},{},{},{}\n'.format(self.word,self.phoneme,self.voice,self.rate,self.variant,self.filename)
 
 class SynthVariant(object):
     """docstring for SynthVariant."""
-    def __init__(self,identifier,rate,op):
+    def __init__(self,identifier,rate):
         super(SynthVariant, self).__init__()
         self.synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
         self.synth.setVolume_(100)
@@ -52,28 +53,31 @@
         self.identifier = identifier
         self.rate = rate
         self.name = identifier.split('.')[-1]
-        self.operation = op
 
     def __repr__(self):
-        return 'Synthesizer[{} - {}]({})'.format(self.name,self.rate,self.operation)
+        return 'Synthesizer[{} - {}]'.format(self.name,self.rate)
 
-    def generate_audio(self,word):
-        fname = dest_filename(word,self.name,self.rate,self.operation)
-        d_path = dest_path(fname)
-        d_url = dest_url(d_path)
-        if self.operation == 'normal':
-            self.synth.startSpeakingString_toURL_(word,d_url)
-            # cli_gen_audio(word,self.rate,self.name,d_path)
-        else:
-            orig_phon = self.synth.phonemesFromText_(word)
-            phon = '[[inpt PHON]] '+re.sub('[0-9]','',orig_phon)
-            # phon = re.sub('[0-9]','',orig_phon)
-            cli_gen_audio(phon,self.rate,self.name,d_path)
+    def generate_audio(self,word,variant):
+        orig_phon,phoneme,phon_cmd = self.synth.phonemesFromText_(word),'',word
+        if variant == 'low':
+            # self.synth.startSpeakingString_toURL_(word,d_url)
+            phoneme = orig_phon
+        elif variant == 'medium':
+            phoneme = re.sub('[0-9]','',orig_phon)
+            phon_cmd = '[[inpt PHON]] '+phoneme
+        elif variant == 'high':
+            phoneme = orig_phon
+            phon_cmd = word
+        # elif variant == 'long':
         #     if phon != '':
         #         self.phone_synth.startSpeakingString_toURL_(phon,d_url)
         #     else:
         #         self.synth.startSpeakingString_toURL_(word,d_url)
-        return SynthFile(word,fname,self.name,self.rate,self.operation)
+        fname = dest_filename(word,phoneme,self.name,self.rate)
+        d_path = dest_path(self.name,self.rate,fname)
+        d_url = dest_url(d_path)
+        cli_gen_audio(phon_cmd,self.rate,self.name,d_path)
+        return SynthFile(word,phoneme,fname,self.name,self.rate,variant)
 
 
 def synth_generator():
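The 'medium' variant above strips Apple's stress digits from the phoneme string before resynthesising it through `say`'s phoneme-input mode. An illustrative check of that regex; the phoneme string is made up, not real `phonemesFromText_` output:

```python
import re

orig_phon = '2mAY 1nAYm'   # illustrative only
print(re.sub('[0-9]', '', orig_phon))  # -> 'mAY nAYm'
```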
@@ -83,18 +87,19 @@
     # us_voices_ids = ['com.apple.speech.synthesis.voice.Fred','com.apple.speech.synthesis.voice.Alex',
     #                  'com.apple.speech.synthesis.voice.Victoria']
     # voice_rates = list(range(150,221,(220-180)//4))
-    voice_rates = [150,180,210]
+    voice_rates = [150,180,210,250]
     voice_synths = []
-    variants = ['normal','phoneme']
+    create_dir(dest_dir)
     for v in us_voices_ids:
         for r in voice_rates:
-            for o in variants:
-                voice_synths.append(SynthVariant(v,r,o))
+            create_dir(dest_dir+v.split('.')[-1]+'/'+str(r))
+            voice_synths.append(SynthVariant(v,r))
     def synth_for_words(words):
         all_synths = []
         for w in words:
             for s in voice_synths:
-                all_synths.append(s.generate_audio(w))
+                for variant in ['low','medium','high']:
+                    all_synths.append(s.generate_audio(w,variant))
         return all_synths
     return synth_for_words
@@ -126,5 +131,5 @@ def generate_audio_for_stories():
 
 synths = synth_generator()([OUTPUT_NAME])
 # synths = generate_audio_for_stories()
-write_synths(synths,'./outputs/'+OUTPUT_NAME+'.csv',True)
+write_synths(synths,dest_file,True)
 # write_synths(synths,'./outputs/synths.json')
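`write_synths` itself is outside this hunk. A plausible sketch matching the six-field `get_csv` above (an assumption for reference, not the repo's actual implementation):

```python
# Hypothetical sketch of write_synths (not part of this diff): one CSV row
# per SynthFile, with an optional header matching get_csv's six fields.
def write_synths_sketch(synths, path, header=False):
    with open(path, 'w') as f:
        if header:
            f.write('word,phoneme,voice,rate,variant,filename\n')
        for s in synths:
            f.write(s.get_csv())
```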