From dccbec7cbae54e66c7c4d945db0609be5df0055f Mon Sep 17 00:00:00 2001
From: Malar Kannan
Date: Fri, 13 Oct 2017 16:40:57 +0530
Subject: [PATCH] 1. implemented spectrogram generator for audio files
 2. imported siamese network class (wip)
 3. added similarity-measure-based phoneme neighbor generator
 4. fixed samplegen variants code
 5. created triplets (wip)
 6. misc updates

---
 .gitignore | 1 +
 TODO.md | 4 ++
 create_triplets.py | 2 +-
 generate_similar.py | 35 ++++++++-
 siamese_network.py | 90 +++++++++++++++++++++++
 snippets.py | 12 ++++
 spectro_gen.py | 110 +++++++++++++++++++++++++++++
 tts-wav-gen.py => tts_samplegen.py | 67 ++++++++++--------
 8 files changed, 286 insertions(+), 35 deletions(-)
 create mode 100644 TODO.md
 create mode 100644 siamese_network.py
 create mode 100644 snippets.py
 create mode 100644 spectro_gen.py
 rename tts-wav-gen.py => tts_samplegen.py (70%)

diff --git a/.gitignore b/.gitignore
index 6bc0dad..ac08111 100644
--- a/.gitignore
+++ b/.gitignore
@@ -138,3 +138,4 @@ Temporary Items
 # End of https://www.gitignore.io/api/macos
 
 outputs/*
+inputs/mnist
diff --git a/TODO.md b/TODO.md
new file mode 100644
index 0000000..7cd8d97
--- /dev/null
+++ b/TODO.md
@@ -0,0 +1,4 @@
+0. generate samples of phoneme-similarity variants.
+1. create spectrograms with 150 ms windows and 50 ms overlap for each word.
+2. train an RNN to output a vector from the spectrograms.
+3. train an NN to output True/False based on the acceptability of the RNN output. -> Siamese network (implementation detail)
diff --git a/create_triplets.py b/create_triplets.py
index 068f3a0..ff1c04f 100644
--- a/create_triplets.py
+++ b/create_triplets.py
@@ -6,5 +6,5 @@
 word_groups = audio_file.groupby('word')
 lst = [1, 2, 3, 1, 2, 3]
 s = pd.Series([1, 2, 3, 10, 20, 30], lst)
 df3 = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})
-df3
+s.groupby(level=0).sum()
diff --git a/generate_similar.py b/generate_similar.py
index 6c4145d..8a8aada 100644
--- a/generate_similar.py
+++ b/generate_similar.py
@@ -38,11 +38,40 @@
 UW UW
 V v
 W w
 Y y
+X x
 Z z
 ZH Z
 """.strip().split('\n')}
-mapping
 sim_mat = pd.read_csv('./similarity.csv',header=0,index_col=0)
-[mapping[re.sub('[0-9]','',i)] for i in sim_mat.index.tolist()]
-# sim_mat.loc
+
+def convert_ph(ph):
+    # convert a stress-marked phoneme (e.g. 'AA1') to its Apple symbol
+    stress_level = re.search(r"(\w+)([0-9])",ph)
+    if stress_level:
+        return stress_level.group(2)+mapping[stress_level.group(1)]
+    else:
+        return mapping[ph]
+
+def sim_mat_to_apple_table(smt):
+    colnames = [convert_ph(ph) for ph in smt.index.tolist()]
+    smt = pd.DataFrame(np.nan_to_num(smt.values))
+    fsmt = (smt.T+smt)  # symmetrize the triangular similarity matrix
+    np.fill_diagonal(fsmt.values,100.0)  # self-similarity pinned at 100
+    asmt = pd.DataFrame.copy(fsmt)
+    asmt.columns = colnames
+    asmt.index = colnames
+    apple_sim_lookup = asmt.stack().reset_index()
+    apple_sim_lookup.columns = ['q','r','s']
+    return apple_sim_lookup
+
+apple_sim_lookup = sim_mat_to_apple_table(sim_mat)
+
+def top_match(ph):
+    selected = apple_sim_lookup[(apple_sim_lookup.q == ph) & (apple_sim_lookup.s < 100) & (apple_sim_lookup.s >= 70)]
+    tm = ph
+    if len(selected) > 0:
+        tm = selected.sort_values('s',ascending=False).iloc[0].r
+    return tm
+
+def similar_phoneme(ph_str):
+    return ph_str  # placeholder (wip): will substitute phonemes via top_match
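For reference, the lookup built above can be exercised like this (a minimal sketch, assuming similarity.csv holds scores on a 0-100 scale, as the 100.0 diagonal and the 70-100 band in top_match imply):

    from generate_similar import apple_sim_lookup, top_match

    print(apple_sim_lookup.head())  # columns: q (query phoneme), r (candidate), s (score)
    print(top_match('f'))           # best neighbor with 70 <= s < 100, else 'f' itself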
+ """ + + def BiRNN(self, x, dropout, scope, embedding_size, sequence_length): + n_input=embedding_size + n_steps=sequence_length + n_hidden=n_steps + n_layers=3 + # Prepare data shape to match `bidirectional_rnn` function requirements + # Current data input shape: (batch_size, n_steps, n_input) (?, seq_len, embedding_size) + # Required shape: 'n_steps' tensors list of shape (batch_size, n_input) + # Permuting batch_size and n_steps + x = tf.transpose(x, [1, 0, 2]) + # Reshape to (n_steps*batch_size, n_input) + x = tf.reshape(x, [-1, n_input]) + print(x) + # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input) + x = tf.split(x, n_steps, 0) + print(x) + # Define lstm cells with tensorflow + # Forward direction cell + with tf.name_scope("fw"+scope),tf.variable_scope("fw"+scope): + stacked_rnn_fw = [] + for _ in range(n_layers): + fw_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True) + lstm_fw_cell = tf.contrib.rnn.DropoutWrapper(fw_cell,output_keep_prob=dropout) + stacked_rnn_fw.append(lstm_fw_cell) + lstm_fw_cell_m = tf.nn.rnn_cell.MultiRNNCell(cells=stacked_rnn_fw, state_is_tuple=True) + + with tf.name_scope("bw"+scope),tf.variable_scope("bw"+scope): + stacked_rnn_bw = [] + for _ in range(n_layers): + bw_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True) + lstm_bw_cell = tf.contrib.rnn.DropoutWrapper(bw_cell,output_keep_prob=dropout) + stacked_rnn_bw.append(lstm_bw_cell) + lstm_bw_cell_m = tf.nn.rnn_cell.MultiRNNCell(cells=stacked_rnn_bw, state_is_tuple=True) + # Get lstm cell output + + with tf.name_scope("bw"+scope),tf.variable_scope("bw"+scope): + outputs, _, _ = tf.nn.static_bidirectional_rnn(lstm_fw_cell_m, lstm_bw_cell_m, x, dtype=tf.float32) + return outputs[-1] + + def contrastive_loss(self, y,d,batch_size): + tmp= y *tf.square(d) + #tmp= tf.mul(y,tf.square(d)) + tmp2 = (1-y) *tf.square(tf.maximum((1 - d),0)) + return tf.reduce_sum(tmp +tmp2)/batch_size/2 + + def __init__( + self, sequence_length, vocab_size, embedding_size, hidden_units, l2_reg_lambda, batch_size): + + # Placeholders for input, output and dropout + self.input_x1 = tf.placeholder(tf.int32, [None, sequence_length], name="input_x1") + self.input_x2 = tf.placeholder(tf.int32, [None, sequence_length], name="input_x2") + self.input_y = tf.placeholder(tf.float32, [None], name="input_y") + self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") + + # Keeping track of l2 regularization loss (optional) + l2_loss = tf.constant(0.0, name="l2_loss") + + # Embedding layer + with tf.name_scope("embedding"): + self.W = tf.Variable( + tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), + trainable=True,name="W") + self.embedded_chars1 = tf.nn.embedding_lookup(self.W, self.input_x1) + #self.embedded_chars_expanded1 = tf.expand_dims(self.embedded_chars1, -1) + self.embedded_chars2 = tf.nn.embedding_lookup(self.W, self.input_x2) + #self.embedded_chars_expanded2 = tf.expand_dims(self.embedded_chars2, -1) + + # Create a convolution + maxpool layer for each filter size + with tf.name_scope("output"): + self.out1=self.BiRNN(self.embedded_chars1, self.dropout_keep_prob, "side1", embedding_size, sequence_length) + self.out2=self.BiRNN(self.embedded_chars2, self.dropout_keep_prob, "side2", embedding_size, sequence_length) + self.distance = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(self.out1,self.out2)),1,keep_dims=True)) + self.distance = tf.div(self.distance, 
diff --git a/snippets.py b/snippets.py
new file mode 100644
index 0000000..5726d21
--- /dev/null
+++ b/snippets.py
@@ -0,0 +1,12 @@
+# scratch: sizing STFT segments for 150 ms windows / 50 ms overlap
+# import scipy.signal as sg
+# import pysndfile.sndio as snd
+#
+# snd_data,samplerate,_ = snd.read('./outputs/sunflowers-Alex-150-normal-589.aiff')
+# samples_per_seg = 3*int(samplerate*150/(3*1000))
+# len(snd_data)
+# samples_per_seg/2
+#
+# len(sg.spectrogram(snd_data,nperseg=samples_per_seg,noverlap=samples_per_seg/3)[2])
+#
+# from spectro_gen import generate_aiff_spectrogram
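For concreteness, the segment sizing in the scratch above works out as follows at a 44.1 kHz sample rate (rate assumed for illustration; the AIFFs' actual rate is not stated in this patch):

    samplerate = 44100
    samples_per_seg = 3*int(samplerate*150/(3*1000))  # 150 ms, kept divisible by 3
    overlap = samples_per_seg//3                      # 50 ms
    hop = samples_per_seg - overlap                   # 100 ms
    print(samples_per_seg, overlap, hop)              # 6615 2205 4410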
diff --git a/spectro_gen.py b/spectro_gen.py
new file mode 100644
index 0000000..7df17e4
--- /dev/null
+++ b/spectro_gen.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python
+#coding: utf-8
+""" This work is licensed under a Creative Commons Attribution 3.0 Unported License.
+    Frank Zalkow, 2012-2013
+    http://www.frank-zalkow.de/en/code-snippets/create-audio-spectrograms-with-python.html?i=1
+"""
+import numpy as np
+from matplotlib import pyplot as plt
+from pysndfile import sndio as snd
+from numpy.lib import stride_tricks
+
+""" short time fourier transform of audio signal """
+def stft(sig, frameSize, overlapFac=0.5, window=np.hanning):
+    win = window(frameSize)
+    hopSize = int(frameSize - np.floor(overlapFac * frameSize))
+
+    # zeros at beginning (thus center of 1st window should be for sample nr. 0)
+    count = int(np.floor(frameSize/2.0))
+    samples = np.append(np.zeros(count), sig)
+    # cols for windowing
+    cols = int(np.ceil( (len(samples) - frameSize) / float(hopSize)) + 1)
+    # zeros at end (thus samples can be fully covered by frames)
+    samples = np.append(samples, np.zeros(frameSize))
+
+    frames = stride_tricks.as_strided(samples, shape=(cols, frameSize), strides=(samples.strides[0]*hopSize, samples.strides[0])).copy()
+    frames *= win
+
+    return np.fft.rfft(frames)
+
+""" scale frequency axis logarithmically """
+def logscale_spec(spec, sr=44100, factor=20.):
+    timebins, freqbins = np.shape(spec)
+
+    scale = np.linspace(0, 1, freqbins) ** factor
+    scale *= (freqbins-1)/max(scale)
+    scale = np.unique(np.round(scale)).astype(np.uint32)
+
+    # create spectrogram with new freq bins
+    newspec = np.complex128(np.zeros([timebins, len(scale)]))
+    for i in range(0, len(scale)):
+        if i == len(scale)-1:
+            newspec[:,i] = np.sum(spec[:,scale[i]:], axis=1)
+        else:
+            newspec[:,i] = np.sum(spec[:,scale[i]:scale[i+1]], axis=1)
+
+    # list center freq of bins
+    allfreqs = np.abs(np.fft.fftfreq(freqbins*2, 1./sr)[:freqbins+1])
+    freqs = []
+    for i in range(0, len(scale)):
+        if i == len(scale)-1:
+            freqs += [np.mean(allfreqs[scale[i]:])]
+        else:
+            freqs += [np.mean(allfreqs[scale[i]:scale[i+1]])]
+
+    return newspec, freqs
+
+""" generate spectrogram for aiff audio with 150ms windows and 50ms overlap """
+def generate_aiff_spectrogram(audiopath):
+    samples,samplerate,_ = snd.read(audiopath)
+    # 150 ms frames; overlapFac=1/3 of the frame -> 50 ms overlap
+    s = stft(samples, samplerate*150//1000, 1.0/3)
+
+    sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
+    ims = 20.*np.log10(np.abs(sshow)/10e-6)  # amplitude to decibel
+    return ims
+
+""" plot spectrogram """
+def plotstft(audiopath, plotpath=None, colormap="jet"):
+    samples,samplerate,_ = snd.read(audiopath)
+    framesize = samplerate*150//1000
+    s = stft(samples, framesize, 1.0/3)
+
+    sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
+    ims = 20.*np.log10(np.abs(sshow)/10e-6)  # amplitude to decibel
+
+    timebins, freqbins = np.shape(ims)
+    plt.figure(figsize=(15, 7.5))
+    plt.imshow(np.transpose(ims), origin="lower", aspect="auto", cmap=colormap, interpolation="none")
+    plt.colorbar()
+
+    plt.xlabel("time (s)")
+    plt.ylabel("frequency (hz)")
+    plt.xlim([0, timebins-1])
+    plt.ylim([0, freqbins])
+
+    xlocs = np.float32(np.linspace(0, timebins-1, 5))
+    plt.xticks(xlocs, ["%.02f" % l for l in ((xlocs*len(samples)/timebins)+(0.5*framesize))/samplerate])
+    ylocs = np.int16(np.round(np.linspace(0, freqbins-1, 10)))
+    plt.yticks(ylocs, ["%.02f" % freq[i] for i in ylocs])
+
+    if plotpath:
+        plt.savefig(plotpath, bbox_inches="tight")
+    else:
+        plt.show()
+
+    plt.clf()
+
+if __name__ == '__main__':
+    plotstft('./outputs/sunflowers-Alex-150-normal-589.aiff')
+    plotstft('./outputs/sunflowers-Alex-180-normal-4763.aiff')
+    plotstft('./outputs/sunflowers-Victoria-180-normal-870.aiff')
+    plotstft('./outputs/sunflowers-Fred-180-phoneme-9733.aiff')
+    plotstft('./outputs/sunflowers-Fred-180-normal-6515.aiff')
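A quick smoke test of the generator above (the path is one of the samples already referenced in __main__; it must exist on disk):

    from spectro_gen import generate_aiff_spectrogram

    ims = generate_aiff_spectrogram('./outputs/sunflowers-Alex-150-normal-589.aiff')
    print(ims.shape)  # (time bins, log-scaled frequency bins), values in dB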
diff --git a/tts-wav-gen.py b/tts_samplegen.py
similarity index 70%
rename from tts-wav-gen.py
rename to tts_samplegen.py
index 90ac549..e9c9282 100644
--- a/tts-wav-gen.py
+++ b/tts_samplegen.py
@@ -9,38 +9,39 @@ import subprocess
 
 OUTPUT_NAME = 'audio'
 
-def create_output_dir():
-    direc = os.path.abspath('.')+'/outputs/'+OUTPUT_NAME+'/'
+dest_dir = os.path.abspath('.')+'/outputs/'+OUTPUT_NAME+'/'
+dest_file = './outputs/'+OUTPUT_NAME+'.csv'
+def create_dir(direc):
     if not os.path.exists(direc):
-        os.mkdir(direc)
-create_output_dir()
+        os.makedirs(direc)  # nested <voice>/<rate> dirs need makedirs, not mkdir
 
 dest_filename = lambda n,v,r,t: '{}-{}-{}-{}-'.format(n,v,r,t)+str(random.randint(0,10000))+'.aiff'
-dest_path = lambda n: os.path.abspath('.')+'/outputs/'+OUTPUT_NAME+'/'+n
+dest_path = lambda v,r,n: dest_dir+v+'/'+str(r)+'/'+n
 dest_url = lambda p: NSURL.fileURLWithPath_(p)
 
-def cli_gen_audio(word,rate,voice,out_path):
-    subprocess.call(['say','-v',voice,'-r',str(rate),'-o',out_path,word])
+def cli_gen_audio(speech_cmd,rate,voice,out_path):
+    subprocess.call(['say','-v',voice,'-r',str(rate),'-o',out_path,speech_cmd])
 
 class SynthFile(object):
     """docstring for SynthFile."""
-    def __init__(self,word, filename,voice,rate,operation):
+    def __init__(self,word,phon,filename,voice,rate,variant):
         super(SynthFile, self).__init__()
         self.word = word
+        self.phoneme = phon
         self.filename = filename
         self.voice = voice
         self.rate = rate
-        self.operation = operation
+        self.variant = variant
     def get_json(self):
         return {'filename':self.filename,'voice':self.voice,
-        'rate':self.rate,'operation':self.operation}
+        'rate':self.rate,'variant':self.variant}
 
     def get_csv(self):
-        return '{},{},{},{},{}\n'.format(self.word,self.voice,self.rate,self.operation,self.filename)
+        return '{},{},{},{},{},{}\n'.format(self.word,self.phoneme,self.voice,self.rate,self.variant,self.filename)
 
 class SynthVariant(object):
     """docstring for SynthVariant."""
-    def __init__(self,identifier,rate,op):
+    def __init__(self,identifier,rate):
         super(SynthVariant, self).__init__()
         self.synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
         self.synth.setVolume_(100)
@@ -52,28 +53,31 @@ class SynthVariant(object):
         self.identifier = identifier
         self.rate = rate
         self.name = identifier.split('.')[-1]
-        self.operation = op
 
     def __repr__(self):
-        return 'Synthesizer[{} - {}]({})'.format(self.name,self.rate,self.operation)
+        return 'Synthesizer[{} - {}]'.format(self.name,self.rate)
 
-    def generate_audio(self,word):
-        fname = dest_filename(word,self.name,self.rate,self.operation)
-        d_path = dest_path(fname)
-        d_url = dest_url(d_path)
-        if self.operation == 'normal':
-            self.synth.startSpeakingString_toURL_(word,d_url)
-            # cli_gen_audio(word,self.rate,self.name,d_path)
-        else:
-            orig_phon = self.synth.phonemesFromText_(word)
-            phon = '[[inpt PHON]] '+re.sub('[0-9]','',orig_phon)
-            # phon = re.sub('[0-9]','',orig_phon)
-            cli_gen_audio(phon,self.rate,self.name,d_path)
+    def generate_audio(self,word,variant):
+        # variant controls how much phoneme detail is fed to `say`
+        orig_phon,phoneme,phon_cmd = self.synth.phonemesFromText_(word),'',word
+        if variant == 'low':
+            phoneme = orig_phon  # plain-text input; phonemes kept only as metadata
+        elif variant == 'medium':
+            phoneme = re.sub('[0-9]','',orig_phon)  # stress levels stripped
+            phon_cmd = '[[inpt PHON]] '+phoneme
+        elif variant == 'high':
+            phoneme = orig_phon  # stress levels kept
+            phon_cmd = '[[inpt PHON]] '+phoneme
+        fname = dest_filename(word,phoneme,self.name,self.rate)
+        d_path = dest_path(self.name,self.rate,fname)
+        d_url = dest_url(d_path)
+        cli_gen_audio(phon_cmd,self.rate,self.name,d_path)
+        return SynthFile(word,phoneme,fname,self.name,self.rate,variant)
 
 
 def synth_generator():
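For orientation, a 'medium' variant above reduces to a call like the following (the phoneme string is a made-up illustration, not Alex's actual phonemesFromText_ output):

    import subprocess
    # [[inpt PHON]] switches `say` to phoneme input instead of plain text
    subprocess.call(['say','-v','Alex','-r','180','-o','sunflowers.aiff',
                     '[[inpt PHON]] sUWnflAWrz'])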
@@ -83,18 +87,19 @@
     # us_voices_ids = ['com.apple.speech.synthesis.voice.Fred','com.apple.speech.synthesis.voice.Alex',
     #     'com.apple.speech.synthesis.voice.Victoria']
     # voice_rates = list(range(150,221,(220-180)//4))
-    voice_rates = [150,180,210]
+    voice_rates = [150,180,210,250]
     voice_synths = []
-    variants = ['normal','phoneme']
+    create_dir(dest_dir)
     for v in us_voices_ids:
         for r in voice_rates:
-            for o in variants:
-                voice_synths.append(SynthVariant(v,r,o))
+            # dirs must match dest_path: <dest_dir>/<short voice name>/<rate>/
+            create_dir(dest_dir+v.split('.')[-1]+'/'+str(r))
+            voice_synths.append(SynthVariant(v,r))
     def synth_for_words(words):
         all_synths = []
         for w in words:
             for s in voice_synths:
-                all_synths.append(s.generate_audio(w))
+                for var in ['low','medium','high']:
+                    all_synths.append(s.generate_audio(w,var))
         return all_synths
     return synth_for_words
@@ -126,5 +131,5 @@ def generate_audio_for_stories():
 
 synths = synth_generator()([OUTPUT_NAME])
 # synths = generate_audio_for_stories()
-write_synths(synths,'./outputs/'+OUTPUT_NAME+'.csv',True)
+write_synths(synths,dest_file,True)
 # write_synths(synths,'./outputs/synths.json')
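Taken together, a downstream consumer of these pieces might look like this rough sketch (CSV column order and directory layout are assumed from the code above; a headerless CSV is assumed since write_synths is not shown in this patch):

    import pandas as pd
    from spectro_gen import generate_aiff_spectrogram

    meta = pd.read_csv('./outputs/audio.csv',
                       names=['word','phoneme','voice','rate','variant','filename'])
    for row in meta.itertuples():
        path = './outputs/audio/{}/{}/{}'.format(row.voice, row.rate, row.filename)
        spec = generate_aiff_spectrogram(path)  # dB spectrogram per sample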