diff --git a/generate_similar.py b/generate_similar.py
index 8a8aada..967d9df 100644
--- a/generate_similar.py
+++ b/generate_similar.py
@@ -1,6 +1,12 @@
 import pandas as pd
+import pronouncing
 import re
-mapping = {s.split()[0]: s.split()[1] for s in """
+import numpy as np
+import random
+
+mapping = {
+    s.split()[0]: s.split()[1]
+    for s in """
 AA AA
 AE AE
 AH UX
@@ -38,40 +44,93 @@ UW UW
 V v
 W w
 Y y
-X x
 Z z
 ZH Z
-""".strip().split('\n')}
+""".strip().split('\n')
+}
+
+apple_phonemes = [
+    '%', '@', 'AE', 'EY', 'AO', 'AX', 'IY', 'EH', 'IH', 'AY', 'IX', 'AA', 'UW',
+    'UH', 'UX', 'OW', 'AW', 'OY', 'b', 'C', 'd', 'D', 'f', 'g', 'h', 'J', 'k',
+    'l', 'm', 'n', 'N', 'p', 'r', 's', 'S', 't', 'T', 'v', 'w', 'y', 'z', 'Z'
+]
+
+sim_mat = pd.read_csv('./similarity.csv', header=0, index_col=0)
 
-sim_mat = pd.read_csv('./similarity.csv',header=0,index_col=0)
 def convert_ph(ph):
-    stress_level = re.search("(\w+)([0-9])",ph)
+    stress_level = re.search("(\w+)([0-9])", ph)
     if stress_level:
-        return stress_level.group(2)+mapping[stress_level.group(1)]
+        return stress_level.group(2) + mapping[stress_level.group(1)]
     else:
         return mapping[ph]
 
+
 def sim_mat_to_apple_table(smt):
     colnames = [convert_ph(ph) for ph in smt.index.tolist()]
     smt = pd.DataFrame(np.nan_to_num(smt.values))
-    fsmt = (smt.T+smt)
-    np.fill_diagonal(fsmt.values,100.0)
+    fsmt = (smt.T + smt)
+    np.fill_diagonal(fsmt.values, 100.0)
     asmt = pd.DataFrame.copy(fsmt)
     asmt.columns = colnames
     asmt.index = colnames
-    apple_sim_lookup = asmt.stack().reset_index()
-    apple_sim_lookup.columns = ['q','r','s']
-    return apple_sim_lookup
+    apple_sim_table = asmt.stack().reset_index()
+    apple_sim_table.columns = ['q', 'r', 's']
+    return apple_sim_table
+
+
+apple_sim_table = sim_mat_to_apple_table(sim_mat)
 
-apple_sim_lookup = sim_mat_to_apple_table(sim_mat)
 def top_match(ph):
-    selected = apple_sim_lookup[(apple_sim_lookup.q == ph) & (apple_sim_lookup.s < 100) & (apple_sim_lookup.s >= 70)]
+    selected = apple_sim_table[(apple_sim_table.q == ph)
+                               & (apple_sim_table.s < 100) &
+                               (apple_sim_table.s >= 70)]
     tm = ph
     if len(selected) > 0:
-        tm = pd.DataFrame.sort_values(selected,'s',ascending=False).iloc[0].r
+        tm = pd.DataFrame.sort_values(selected, 's', ascending=False).iloc[0].r
     return tm
+
+
+class ApplePhoneme(object):
+    """One Apple TTS (PHON input) phoneme, with optional vowel stress."""
+
+    def __init__(self, phone, stress, vowel=False):
+        super(ApplePhoneme, self).__init__()
+        self.phone = phone
+        self.stress = stress
+        self.vowel = vowel
+
+    def __str__(self):
+        return (str(self.stress) if (self.vowel and self.stress > 0) else '') + self.phone
+
+    def __repr__(self):
+        return "'{}'".format(str(self))
+
+    def adjust_stress(self):
+        self.stress = random.choice([i for i in range(3) if i != self.stress])
+
+
+def parse_apple_phonemes(ph_str):
+    for i in range(len(ph_str)):
+        pref, rest = ph_str[:i + 1], ph_str[i + 1:]
+        if pref in apple_phonemes:
+            vowel = pref[0] in 'AEIOU'
+            return [ApplePhoneme(pref, 0, vowel)] + parse_apple_phonemes(rest)
+        elif pref[0].isdigit() and pref[1:] in apple_phonemes:
+            return [ApplePhoneme(pref[1:], int(pref[0]), True)] + parse_apple_phonemes(rest)
+        elif not pref.isalnum():
+            return [ApplePhoneme(pref, 0, False)] + parse_apple_phonemes(rest)
+    return []
+
+
 def similar_phoneme(ph_str):
-    return ph_str
+    phons = parse_apple_phonemes(ph_str)
+    vowels = [i for i in phons if i.vowel]
+    random.choice(vowels).adjust_stress()
+    return ''.join([str(i) for i in phons])
+
+
+def similar_word(word_str):
+    similar = pronouncing.rhymes(word_str)
+    return random.choice(similar) if len(similar) > 0 else word_str
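Reviewer note: a minimal usage sketch for the new helpers above (not part of the patch). It assumes the pronouncing package's CMU dictionary data is installed and that ./similarity.csv is in the working directory, since generate_similar.py loads it at import time; the PHON string below is only an illustrative guess.

    # Hypothetical smoke test for generate_similar.py
    from generate_similar import similar_phoneme, similar_word

    # similar_word() picks a random rhyme from pronouncing.rhymes(),
    # falling back to the input word when no rhyme is found.
    print(similar_word('sunflower'))

    # similar_phoneme() re-stresses one randomly chosen vowel in an Apple
    # PHON string, e.g. '1AA' may come back as 'AA' or '2AA'.
    print(similar_phoneme('s1AAnfl2AWrz'))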
diff --git a/record_mic_speech.py b/record_mic_speech.py
index 8a73374..4ed11e2 100644
--- a/record_mic_speech.py
+++ b/record_mic_speech.py
@@ -1,7 +1,7 @@
 import pyaudio
 import numpy as np
 # from matplotlib import pyplot as plt
-from spectro_gen import plot_stft, generate_spectrogram
+from spectro_gen import plot_stft, generate_spec_frec
 
 
 def record_spectrogram(n_sec, plot=False, playback=False):
@@ -38,5 +38,5 @@ def record_spectrogram(n_sec, plot=False, playback=False):
         stream.write(mean_channel_data)
         stream.close()
         p_oup.terminate()
-    ims, _ = generate_spectrogram(one_channel, SAMPLE_RATE)
+    ims, _ = generate_spec_frec(one_channel, SAMPLE_RATE)
     return ims
diff --git a/spectro_gen.py b/spectro_gen.py
index 6ae6750..2e397e8 100644
--- a/spectro_gen.py
+++ b/spectro_gen.py
@@ -71,7 +71,7 @@ def logscale_spec(spec, sr=44100, factor=20.):
 
 
 """ generate spectrogram for aiff audio with 150ms windows and 50ms overlap"""
-def generate_spectrogram(samples, samplerate):
+def generate_spec_frec(samples, samplerate):
     # samplerate, samples = wav.read(audiopath)
     # s = stft(samples, binsize)
     s = stft(samples, samplerate * 150 // 1000, 1.0 / 3)
@@ -84,12 +84,12 @@ def generate_aiff_spectrogram(audiopath):
     samples, samplerate, _ = snd.read(audiopath)
-    ims, _ = generate_spectrogram(samples, samplerate)
+    ims, _ = generate_spec_frec(samples, samplerate)
     return ims
 
 
 def plot_stft(samples, samplerate, binsize=2**10, plotpath=None, colormap="jet"):
-    (ims, freq) = generate_spectrogram(samples, samplerate)
+    (ims, freq) = generate_spec_frec(samples, samplerate)
     timebins, freqbins = np.shape(ims)
     plt.figure(figsize=(15, 7.5))
     plt.imshow(
@@ -126,8 +126,8 @@ def plot_aiff_stft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):
 
 
 def play_sunflower():
-    sample_r = snd.get_info('./outputs/sunflowers-Alex-150-normal-589.aiff')[0]
-    snd_data_f64 = snd.read('./outputs/sunflowers-Alex-150-normal-589.aiff')[0]
+    sample_r = snd.get_info('./outputs/audio/sunflowers-Alex-150-normal-589.aiff')[0]
+    snd_data_f64 = snd.read('./outputs/audio/sunflowers-Alex-150-normal-589.aiff')[0]
     snd_data_f32 = snd_data_f64.astype(np.float32)
     print(snd_data_f32.shape)
     snd_data = snd_data_f32.tobytes()
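Reviewer note: a quick sketch exercising the renamed generate_spec_frec (not part of the patch). It assumes the function keeps the old (spectrogram, frequency bins) return pair and accepts a plain float sample array, matching the existing callers.

    # Hypothetical smoke test for the generate_spectrogram -> generate_spec_frec rename
    import numpy as np
    from spectro_gen import generate_spec_frec

    SAMPLE_RATE = 44100
    t = np.linspace(0, 1.0, SAMPLE_RATE, endpoint=False)
    tone = 0.5 * np.sin(2 * np.pi * 440 * t)  # one second of A440

    ims, _ = generate_spec_frec(tone, SAMPLE_RATE)  # 150 ms analysis windows
    print(np.shape(ims))  # (time bins, frequency bins), as plot_stft expects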
diff --git a/tts_samplegen.py b/tts_samplegen.py
index d83c86e..3be585c 100644
--- a/tts_samplegen.py
+++ b/tts_samplegen.py
@@ -9,7 +9,7 @@
 import re
 import subprocess
 import progressbar
-from generate_similar import similar_phonemes
+from generate_similar import similar_phoneme, similar_word
 
 OUTPUT_NAME = 'story_sents'
 dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
@@ -74,10 +74,6 @@ class SynthFile(object):
                 self.filename]
         return ','.join([str(c) for c in cols])+'\n'
 
-    # return '{},{},{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,
-#                                            self.voice_lang, self.rate, self.variant,
-#                                            self.filename)
-
 
 
 class SynthVariant(object):
@@ -116,11 +112,11 @@ class SynthVariant(object):
             # self.synth.startSpeakingString_toURL_(word,d_url)
             phoneme = orig_phon
         elif variant == 'medium':
-            phoneme = re.sub('[0-9]', '', orig_phon)
+            phoneme = similar_phoneme(orig_phon)
             phon_cmd = '[[inpt PHON]] ' + phoneme
         elif variant == 'high':
-            phoneme = orig_phon
-            phon_cmd = '[[inpt PHON]] ' + phoneme
+            phoneme = similar_word(word)
+            phon_cmd = phoneme
         # elif variant == 'long':
         #     if phon != '':
         #         self.phone_synth.startSpeakingString_toURL_(phon,d_url)
@@ -132,35 +128,38 @@ class SynthVariant(object):
         cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
         return SynthFile(word, phoneme, r_path, self.name, self.lang, self.rate, variant)
 
+    @staticmethod
+    def voices_for_lang(lang):
+        voices_installed = NSSpeechSynthesizer.availableVoices()
+        voice_attrs = [
+            NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed
+        ]
+        # sk = [k for k in voice_attrs[0].keys() if k not in [
+        #     'VoiceIndividuallySpokenCharacters', 'VoiceSupportedCharacters']]
+        # s_attrs = [[v[i] for i in sk] for v in voice_attrs if 'VoiceShowInFullListOnly' in v
+        #            and 'VoiceRelativeDesirability' in v]
+        return [
+            (v['VoiceIdentifier'],
+             v['VoiceName'],
+             v['VoiceLanguage']) for v in voice_attrs
+            if v['VoiceLanguage'] == lang
+            and v['VoiceGender'] != 'VoiceGenderNeuter'
+        ]
+
+    @classmethod
+    def synth_with(cls, voice_params, rate=180):
+        identifier, voice, lang = voice_params
+        return cls(identifier, voice, lang, rate)
+
 
 def synth_generator():
-    voices_installed = NSSpeechSynthesizer.availableVoices()
-    voice_attrs = [
-        NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed
-    ]
-    # sk = [k for k in voice_attrs[0].keys() if k not in [
-    #     'VoiceIndividuallySpokenCharacters', 'VoiceSupportedCharacters']]
-    # s_attrs = [[v[i] for i in sk] for v in voice_attrs if 'VoiceShowInFullListOnly' in v
-    #            and 'VoiceRelativeDesirability' in v]
-    us_voices_ids = [
-        (v['VoiceIdentifier'],
-         v['VoiceName'],
-         v['VoiceLanguage']) for v in voice_attrs
-        if v['VoiceLanguage'] == 'en-US'
-        and v['VoiceGender'] != 'VoiceGenderNeuter'
-    ]
-    # import pdb
-    # pdb.set_trace()
-    # us_voices_ids = ['com.apple.speech.synthesis.voice.Fred',
-    #                  'com.apple.speech.synthesis.voice.Alex',
-    #                  'com.apple.speech.synthesis.voice.Victoria']
-    # voice_rates = list(range(150,221,(220-180)//4))
+    us_voices_ids = SynthVariant.voices_for_lang('en-US')
    voice_rates = [150, 180, 210, 250]
     voice_synths = []
     create_dir(dest_dir)
-    for (i, v, l) in us_voices_ids:
+    for vp in us_voices_ids:
         for r in voice_rates:
-            s = SynthVariant(i, v, l, r)
+            s = SynthVariant.synth_with(vp, r)
             if s.phoneme_capable:
                 print('Adding ', s)
                 voice_synths.append(s)
@@ -212,11 +211,11 @@ def synth_logger(fname, csv=False):
 
 
 def generate_audio_for_stories():
-    # story_file = './inputs/all_stories_hs.json'
-    story_file = './inputs/all_stories.json'
+    story_file = './inputs/all_stories_hs.json'
+    # story_file = './inputs/all_stories.json'
     stories_data = json.load(open(story_file))
-    # word_list = [t[0] for i in stories_data.values() for t in i]
-    word_list = [i for g in stories_data.values() for i in g]
+    word_list = [t[0] for i in stories_data.values() for t in i]
+    # word_list = [i for g in stories_data.values() for i in g]
     (writer, closer) = synth_logger(dest_file, csv=True)
     synth_for_words = synth_generator()
     try:
@@ -228,11 +227,6 @@ def generate_audio_for_stories():
         pass
     closer()
 
-# synths = synth_generator()([OUTPUT_NAME])
-
-# write_synths(synths, dest_file, True)
-# write_synths(synths,'./outputs/synths.json')
-
 
 if __name__ == '__main__':
     generate_audio_for_stories()
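Reviewer note: a rough sketch of how the extracted voices_for_lang / synth_with helpers compose (not part of the patch). It only runs on macOS with the pyobjc AppKit bindings that tts_samplegen.py already relies on, and importing the module executes its output-path setup at the top of the file.

    # Hypothetical illustration of the SynthVariant refactor
    from tts_samplegen import SynthVariant

    # (identifier, name, language) tuples for installed en-US voices,
    # skipping neuter-gender voices, as synth_generator() now consumes them.
    for voice_params in SynthVariant.voices_for_lang('en-US'):
        synth = SynthVariant.synth_with(voice_params, rate=180)
        if synth.phoneme_capable:
            print('usable voice:', synth)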