Implemented TTS generation variants (stress-shifted phonemes and rhyming-word substitutes)

2017-10-27 18:53:22 +05:30
parent eb3ce8b7e5
commit 938a9cf0a8
4 changed files with 115 additions and 62 deletions
--- a/generate_similar.py
+++ b/generate_similar.py
@@ -1,6 +1,12 @@
 import pandas as pd
+import pronouncing
 import re
-mapping = {s.split()[0]: s.split()[1] for s in """
+import numpy as np
+import random
+
+mapping = {
+    s.split()[0]: s.split()[1]
+    for s in """
 AA AA
 AE AE
 AH UX
@@ -38,40 +44,93 @@ UW UW
 V  v
 W  w
 Y  y
-X  x
 Z  z
 ZH Z
-""".strip().split('\n')}
+""".strip().split('\n')
+}
+
+apple_phonemes = [
+    '%', '@', 'AE', 'EY', 'AO', 'AX', 'IY', 'EH', 'IH', 'AY', 'IX', 'AA', 'UW',
+    'UH', 'UX', 'OW', 'AW', 'OY', 'b', 'C', 'd', 'D', 'f', 'g', 'h', 'J', 'k',
+    'l', 'm', 'n', 'N', 'p', 'r', 's', 'S', 't', 'T', 'v', 'w', 'y', 'z', 'Z'
+]
+
+sim_mat = pd.read_csv('./similarity.csv', header=0, index_col=0)

-sim_mat = pd.read_csv('./similarity.csv',header=0,index_col=0)

 def convert_ph(ph):
-    stress_level = re.search("(\w+)([0-9])",ph)
+    stress_level = re.search("(\w+)([0-9])", ph)
    if stress_level:
-        return stress_level.group(2)+mapping[stress_level.group(1)]
+        return stress_level.group(2) + mapping[stress_level.group(1)]
    else:
        return mapping[ph]

+
 def sim_mat_to_apple_table(smt):
    colnames = [convert_ph(ph) for ph in smt.index.tolist()]
    smt = pd.DataFrame(np.nan_to_num(smt.values))
-    fsmt = (smt.T+smt)
-    np.fill_diagonal(fsmt.values,100.0)
+    fsmt = (smt.T + smt)
+    np.fill_diagonal(fsmt.values, 100.0)
    asmt = pd.DataFrame.copy(fsmt)
    asmt.columns = colnames
    asmt.index = colnames
-    apple_sim_lookup = asmt.stack().reset_index()
-    apple_sim_lookup.columns = ['q','r','s']
-    return apple_sim_lookup
+    apple_sim_table = asmt.stack().reset_index()
+    apple_sim_table.columns = ['q', 'r', 's']
+    return apple_sim_table
+
+
+apple_sim_table = sim_mat_to_apple_table(sim_mat)

-apple_sim_lookup = sim_mat_to_apple_table(sim_mat)

 def top_match(ph):
-    selected = apple_sim_lookup[(apple_sim_lookup.q == ph) & (apple_sim_lookup.s < 100) & (apple_sim_lookup.s >= 70)]
+    selected = apple_sim_table[(apple_sim_table.q == ph)
+                               & (apple_sim_table.s < 100) &
+                               (apple_sim_table.s >= 70)]
    tm = ph
    if len(selected) > 0:
-        tm = pd.DataFrame.sort_values(selected,'s',ascending=False).iloc[0].r
+        tm = pd.DataFrame.sort_values(selected, 's', ascending=False).iloc[0].r
    return tm

+
+class ApplePhoneme(object):
+    """docstring for ApplePhoneme."""
+
+    def __init__(self, phone, stress, vowel=False):
+        super(ApplePhoneme, self).__init__()
+        self.phone = phone
+        self.stress = stress
+        self.vowel = vowel
+
+    def __str__(self):
+        return (str(self.stress) if (self.vowel and self.stress>0) else '') + self.phone
+
+    def __repr__(self):
+        return "'{}'".format(str(self))
+
+    def adjust_stress(self):
+        self.stress = random.choice([i for i in range(3) if i != self.stress])
+
+
+def parse_apple_phonemes(ph_str):
+    for i in range(len(ph_str)):
+        pref, rest = ph_str[:i + 1], ph_str[i + 1:]
+        if pref in apple_phonemes:
+            vowel = pref[0] in 'AEIOU'
+            return [ApplePhoneme(pref, 0, vowel)] + parse_apple_phonemes(rest)
+        elif pref[0].isdigit() and pref[1:] in apple_phonemes:
+            return [ApplePhoneme(pref[1:], int(pref[0]) , True)] + parse_apple_phonemes(rest)
+        elif not pref.isalnum():
+            return [ApplePhoneme(pref, 0, False)] + parse_apple_phonemes(rest)
+    return []
+
+
 def similar_phoneme(ph_str):
-    return ph_str
+    phons = parse_apple_phonemes(ph_str)
+    vowels = [i for i in phons if i.vowel]
+    random.choice(vowels).adjust_stress()
+    return ''.join([str(i) for i in phons])
+
+
+def similar_word(word_str):
+    similar = pronouncing.rhymes(word_str)
+    return random.choice(similar) if len(similar) > 0 else word_str
--- a/record_mic_speech.py
+++ b/record_mic_speech.py
@@ -1,7 +1,7 @@
 import pyaudio
 import numpy as np
 # from matplotlib import pyplot as plt
-from spectro_gen import plot_stft, generate_spectrogram
+from spectro_gen import plot_stft, generate_spec_frec


 def record_spectrogram(n_sec, plot=False, playback=False):
@@ -38,5 +38,5 @@ def record_spectrogram(n_sec, plot=False, playback=False):
        stream.write(mean_channel_data)
        stream.close()
        p_oup.terminate()
-    ims, _ = generate_spectrogram(one_channel, SAMPLE_RATE)
+    ims, _ = generate_spec_frec(one_channel, SAMPLE_RATE)
    return ims
--- a/spectro_gen.py
+++ b/spectro_gen.py
@@ -71,7 +71,7 @@ def logscale_spec(spec, sr=44100, factor=20.):
 """ generate spectrogram for aiff audio with 150ms windows and 50ms overlap"""


-def generate_spectrogram(samples, samplerate):
+def generate_spec_frec(samples, samplerate):
    # samplerate, samples = wav.read(audiopath)
    # s = stft(samples, binsize)
    s = stft(samples, samplerate * 150 // 1000, 1.0 / 3)
@@ -84,12 +84,12 @@ def generate_spectrogram(samples, samplerate):

 def generate_aiff_spectrogram(audiopath):
    samples, samplerate, _ = snd.read(audiopath)
-    ims, _ = generate_spectrogram(samples, samplerate)
+    ims, _ = generate_spec_frec(samples, samplerate)
    return ims


 def plot_stft(samples, samplerate, binsize=2**10, plotpath=None, colormap="jet"):
-    (ims, freq) = generate_spectrogram(samples, samplerate)
+    (ims, freq) = generate_spec_frec(samples, samplerate)
    timebins, freqbins = np.shape(ims)
    plt.figure(figsize=(15, 7.5))
    plt.imshow(
@@ -126,8 +126,8 @@ def plot_aiff_stft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):


 def play_sunflower():
-    sample_r = snd.get_info('./outputs/sunflowers-Alex-150-normal-589.aiff')[0]
-    snd_data_f64 = snd.read('./outputs/sunflowers-Alex-150-normal-589.aiff')[0]
+    sample_r = snd.get_info('./outputs/audio/sunflowers-Alex-150-normal-589.aiff')[0]
+    snd_data_f64 = snd.read('./outputs/audio/sunflowers-Alex-150-normal-589.aiff')[0]
    snd_data_f32 = snd_data_f64.astype(np.float32)
    print(snd_data_f32.shape)
    snd_data = snd_data_f32.tobytes()
--- a/tts_samplegen.py
+++ b/tts_samplegen.py
@@ -9,7 +9,7 @@ import re
 import subprocess
 import progressbar

-from generate_similar import similar_phonemes
+from generate_similar import similar_phoneme, similar_word

 OUTPUT_NAME = 'story_sents'
 dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
@@ -74,10 +74,6 @@ class SynthFile(object):
                self.filename]

        return ','.join([str(c) for c in cols])+'\n'
-        # return '{},{},{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,
-#                                        self.voice_lang, self.rate, self.variant,
-#                                        self.filename)
-


 class SynthVariant(object):
@@ -116,11 +112,11 @@ class SynthVariant(object):
            # self.synth.startSpeakingString_toURL_(word,d_url)
            phoneme = orig_phon
        elif variant == 'medium':
-            phoneme = re.sub('[0-9]', '', orig_phon)
+            phoneme = similar_phoneme(orig_phon)
            phon_cmd = '[[inpt PHON]] ' + phoneme
        elif variant == 'high':
-            phoneme = orig_phon
-            phon_cmd = '[[inpt PHON]] ' + phoneme
+            phoneme = similar_word(word)
+            phon_cmd = phoneme
        # elif variant == 'long':
        # if phon != '':
        # self.phone_synth.startSpeakingString_toURL_(phon,d_url)
@@ -132,35 +128,38 @@ class SynthVariant(object):
        cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
        return SynthFile(word, phoneme, r_path, self.name, self.lang, self.rate, variant)

+    @staticmethod
+    def voices_for_lang(lang):
+        voices_installed = NSSpeechSynthesizer.availableVoices()
+        voice_attrs = [
+            NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed
+        ]
+        # sk = [k for k in voice_attrs[0].keys() if k not in [
+        #     'VoiceIndividuallySpokenCharacters', 'VoiceSupportedCharacters']]
+        # s_attrs = [[v[i] for i in sk] for v in voice_attrs if 'VoiceShowInFullListOnly' in v
+        #            and 'VoiceRelativeDesirability' in v]
+        return [
+            (v['VoiceIdentifier'],
+             v['VoiceName'],
+             v['VoiceLanguage']) for v in voice_attrs
+            if v['VoiceLanguage'] == lang
+            and v['VoiceGender'] != 'VoiceGenderNeuter'
+        ]
+
+    @classmethod
+    def synth_with(cls,voice_params,rate=180):
+        identifier,voice,lang = voice_params
+        return cls(identifier,voice,lang,rate)
+

 def synth_generator():
-    voices_installed = NSSpeechSynthesizer.availableVoices()
-    voice_attrs = [
-        NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed
-    ]
-    # sk = [k for k in voice_attrs[0].keys() if k not in [
-    #     'VoiceIndividuallySpokenCharacters', 'VoiceSupportedCharacters']]
-    # s_attrs = [[v[i] for i in sk] for v in voice_attrs if 'VoiceShowInFullListOnly' in v
-    #            and 'VoiceRelativeDesirability' in v]
-    us_voices_ids = [
-        (v['VoiceIdentifier'],
-         v['VoiceName'],
-         v['VoiceLanguage']) for v in voice_attrs
-        if v['VoiceLanguage'] == 'en-US'
-        and v['VoiceGender'] != 'VoiceGenderNeuter'
-    ]
-    # import pdb
-    # pdb.set_trace()
-    # us_voices_ids = ['com.apple.speech.synthesis.voice.Fred',
-    #                  'com.apple.speech.synthesis.voice.Alex',
-    #                  'com.apple.speech.synthesis.voice.Victoria']
-    # voice_rates = list(range(150,221,(220-180)//4))
+    us_voices_ids = SynthVariant.voices_for_lang('en-US')
    voice_rates = [150, 180, 210, 250]
    voice_synths = []
    create_dir(dest_dir)
-    for (i, v, l) in us_voices_ids:
+    for vp in us_voices_ids:
        for r in voice_rates:
-            s = SynthVariant(i, v, l, r)
+            s = SynthVariant.synth_with(vp,r)
            if s.phoneme_capable:
                print('Adding ', s)
                voice_synths.append(s)
@@ -212,11 +211,11 @@ def synth_logger(fname, csv=False):


 def generate_audio_for_stories():
-    # story_file = './inputs/all_stories_hs.json'
-    story_file = './inputs/all_stories.json'
+    story_file = './inputs/all_stories_hs.json'
+    # story_file = './inputs/all_stories.json'
    stories_data = json.load(open(story_file))
-    # word_list = [t[0] for i in stories_data.values() for t in i]
-    word_list = [i for g in stories_data.values() for i in g]
+    word_list = [t[0] for i in stories_data.values() for t in i]
+    # word_list = [i for g in stories_data.values() for i in g]
    (writer, closer) = synth_logger(dest_file, csv=True)
    synth_for_words = synth_generator()
    try:
@@ -228,11 +227,6 @@ def generate_audio_for_stories():
        pass
    closer()

-# synths = synth_generator()([OUTPUT_NAME])
-
-# write_synths(synths, dest_file, True)
-# write_synths(synths,'./outputs/synths.json')
-

 if __name__ == '__main__':
    generate_audio_for_stories()