implemented tts gen variants
parent
eb3ce8b7e5
commit
938a9cf0a8
|
|
@ -1,6 +1,12 @@
|
|||
import pandas as pd
|
||||
import pronouncing
|
||||
import re
|
||||
mapping = {s.split()[0]: s.split()[1] for s in """
|
||||
import numpy as np
|
||||
import random
|
||||
|
||||
mapping = {
|
||||
s.split()[0]: s.split()[1]
|
||||
for s in """
|
||||
AA AA
|
||||
AE AE
|
||||
AH UX
|
||||
|
|
@ -38,40 +44,93 @@ UW UW
|
|||
V v
|
||||
W w
|
||||
Y y
|
||||
X x
|
||||
Z z
|
||||
ZH Z
|
||||
""".strip().split('\n')}
|
||||
""".strip().split('\n')
|
||||
}
|
||||
|
||||
apple_phonemes = [
|
||||
'%', '@', 'AE', 'EY', 'AO', 'AX', 'IY', 'EH', 'IH', 'AY', 'IX', 'AA', 'UW',
|
||||
'UH', 'UX', 'OW', 'AW', 'OY', 'b', 'C', 'd', 'D', 'f', 'g', 'h', 'J', 'k',
|
||||
'l', 'm', 'n', 'N', 'p', 'r', 's', 'S', 't', 'T', 'v', 'w', 'y', 'z', 'Z'
|
||||
]
|
||||
|
||||
sim_mat = pd.read_csv('./similarity.csv', header=0, index_col=0)
|
||||
|
||||
sim_mat = pd.read_csv('./similarity.csv',header=0,index_col=0)
|
||||
|
||||
def convert_ph(ph):
    """Translate one phoneme symbol into its spelling from ``mapping``.

    A symbol that ends in a stress digit (for example ``"AH1"``) is split
    into its base symbol and the digit; the result is the digit followed by
    the mapped base symbol.  A symbol without a digit is looked up directly.

    Args:
        ph: Phoneme symbol, optionally carrying a single stress digit.

    Returns:
        The mapped symbol, prefixed with the stress digit when one was found.

    Raises:
        KeyError: If the base symbol is not a key of ``mapping``.
    """
    # Raw string: "\w" in a plain literal is an invalid escape sequence.
    stress_level = re.search(r"(\w+)([0-9])", ph)
    if stress_level:
        return stress_level.group(2) + mapping[stress_level.group(1)]
    return mapping[ph]
|
||||
|
||||
|
||||
def sim_mat_to_apple_table(smt):
    """Flatten a phoneme similarity matrix into a long (q, r, s) table.

    The row labels of ``smt`` are converted with ``convert_ph`` and used for
    both axes of the result, so the matrix is assumed to be square with rows
    and columns in the same order (TODO confirm against similarity.csv).

    Args:
        smt: DataFrame of similarity scores indexed by phoneme symbol.

    Returns:
        DataFrame with columns ``q`` (row phoneme), ``r`` (column phoneme)
        and ``s`` (score), one row per matrix cell.
    """
    colnames = [convert_ph(ph) for ph in smt.index.tolist()]
    # Work on plain NumPy arrays: writing through ``DataFrame.values`` is not
    # reliable because pandas may hand back a copy or a read-only array.
    scores = np.nan_to_num(smt.values)
    # Adding the transpose mirrors each score across the diagonal.
    symmetric = scores.T + scores
    # Every phoneme is given a score of 100 against itself.
    np.fill_diagonal(symmetric, 100.0)
    asmt = pd.DataFrame(symmetric, index=colnames, columns=colnames)
    apple_sim_table = asmt.stack().reset_index()
    apple_sim_table.columns = ['q', 'r', 's']
    return apple_sim_table
|
||||
|
||||
|
||||
apple_sim_table = sim_mat_to_apple_table(sim_mat)
|
||||
|
||||
apple_sim_lookup = sim_mat_to_apple_table(sim_mat)
|
||||
|
||||
def top_match(ph):
    """Return the best-scoring substitute for ``ph`` from ``apple_sim_table``.

    Only rows whose ``q`` equals ``ph`` and whose score ``s`` lies in
    [70, 100) are considered.  When no row qualifies, ``ph`` itself is
    returned.
    """
    same_query = apple_sim_table.q == ph
    below_self = apple_sim_table.s < 100
    similar_enough = apple_sim_table.s >= 70
    candidates = apple_sim_table[same_query & below_self & similar_enough]
    if len(candidates) == 0:
        return ph
    # Highest score first; the ``r`` value of the leading row is the match.
    ranked = candidates.sort_values('s', ascending=False)
    return ranked.iloc[0].r
|
||||
|
||||
|
||||
class ApplePhoneme(object):
    """A single phoneme symbol with a stress level and a vowel flag."""

    def __init__(self, phone, stress, vowel=False):
        """Store the symbol text, its numeric stress and whether it is a vowel."""
        super(ApplePhoneme, self).__init__()
        self.phone = phone
        self.stress = stress
        self.vowel = vowel

    def __str__(self):
        """Return the symbol, prefixed by the stress digit for stressed vowels."""
        # The digit is written only for vowels whose stress is above zero.
        if self.vowel and self.stress > 0:
            prefix = str(self.stress)
        else:
            prefix = ''
        return prefix + self.phone

    def __repr__(self):
        """Return the string form wrapped in single quotes."""
        text = str(self)
        return "'{}'".format(text)

    def adjust_stress(self):
        """Replace the stress with a random level from 0-2 other than the current one."""
        alternatives = [level for level in range(3) if level != self.stress]
        self.stress = random.choice(alternatives)
|
||||
|
||||
|
||||
def parse_apple_phonemes(ph_str):
    """Split a phoneme string into a list of ``ApplePhoneme`` objects.

    At each position the shortest prefix that satisfies one of the rules
    below becomes the next token (rules are tried in this order for every
    candidate prefix):

    1. the prefix is listed in ``apple_phonemes``: stress 0, flagged as a
       vowel when its first character is one of ``AEIOU``;
    2. the prefix is a digit followed by a listed phoneme: that digit is the
       stress and the token is flagged as a vowel;
    3. the prefix is not purely alphanumeric: kept verbatim as a non-vowel
       token with stress 0.

    Parsing stops at the first position where no prefix matches; the
    remaining text is dropped without raising.

    Args:
        ph_str: Phoneme string to tokenise.

    Returns:
        List of ``ApplePhoneme`` in input order (empty for an empty string).
    """
    # Iterative scan instead of one recursive call per token, so long inputs
    # cannot exhaust the recursion limit or repeatedly re-slice the tail.
    known = set(apple_phonemes)
    parsed = []
    start = 0
    total = len(ph_str)
    while start < total:
        token = None
        for end in range(start + 1, total + 1):
            pref = ph_str[start:end]
            if pref in known:
                token = ApplePhoneme(pref, 0, pref[0] in 'AEIOU')
            elif pref[0].isdigit() and pref[1:] in known:
                token = ApplePhoneme(pref[1:], int(pref[0]), True)
            elif not pref.isalnum():
                token = ApplePhoneme(pref, 0, False)
            if token is not None:
                start = end
                break
        if token is None:
            # Nothing from this position onwards can be tokenised.
            break
        parsed.append(token)
    return parsed
|
||||
|
||||
|
||||
def similar_phoneme(ph_str):
    """Return a variant of ``ph_str`` with one vowel's stress changed.

    The string is tokenised with ``parse_apple_phonemes``, one vowel token is
    picked at random and its stress is re-rolled with ``adjust_stress``, and
    the tokens are joined back together.  Because the result is rebuilt from
    the parsed tokens, any text the parser did not tokenise is absent from it.

    Args:
        ph_str: Phoneme string to vary.

    Returns:
        The re-serialised phoneme string, or ``ph_str`` unchanged when it
        contains no vowel token.
    """
    phons = parse_apple_phonemes(ph_str)
    vowels = [i for i in phons if i.vowel]
    if not vowels:
        # random.choice raises IndexError on an empty list; with no vowel
        # there is no stress to alter, so hand the input back untouched.
        return ph_str
    random.choice(vowels).adjust_stress()
    return ''.join([str(i) for i in phons])
|
||||
|
||||
|
||||
def similar_word(word_str):
    """Return a random entry of ``pronouncing.rhymes(word_str)``.

    Falls back to ``word_str`` itself when no rhymes are returned.
    """
    rhyming = pronouncing.rhymes(word_str)
    if len(rhyming) == 0:
        return word_str
    return random.choice(rhyming)
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import pyaudio
|
||||
import numpy as np
|
||||
# from matplotlib import pyplot as plt
|
||||
from spectro_gen import plot_stft, generate_spectrogram
|
||||
from spectro_gen import plot_stft, generate_spec_frec
|
||||
|
||||
|
||||
def record_spectrogram(n_sec, plot=False, playback=False):
|
||||
|
|
@ -38,5 +38,5 @@ def record_spectrogram(n_sec, plot=False, playback=False):
|
|||
stream.write(mean_channel_data)
|
||||
stream.close()
|
||||
p_oup.terminate()
|
||||
ims, _ = generate_spectrogram(one_channel, SAMPLE_RATE)
|
||||
ims, _ = generate_spec_frec(one_channel, SAMPLE_RATE)
|
||||
return ims
|
||||
|
|
|
|||
|
|
@ -71,7 +71,7 @@ def logscale_spec(spec, sr=44100, factor=20.):
|
|||
""" generate spectrogram for aiff audio with 150ms windows and 50ms overlap"""
|
||||
|
||||
|
||||
def generate_spectrogram(samples, samplerate):
|
||||
def generate_spec_frec(samples, samplerate):
|
||||
# samplerate, samples = wav.read(audiopath)
|
||||
# s = stft(samples, binsize)
|
||||
s = stft(samples, samplerate * 150 // 1000, 1.0 / 3)
|
||||
|
|
@ -84,12 +84,12 @@ def generate_spectrogram(samples, samplerate):
|
|||
|
||||
def generate_aiff_spectrogram(audiopath):
    """Read the audio file at ``audiopath`` and return its spectrogram.

    The samples and sample rate from ``snd.read`` are passed to
    ``generate_spec_frec``; only the first element of the returned pair is
    handed back.
    """
    samples, samplerate, _unused = snd.read(audiopath)
    spec_and_freq = generate_spec_frec(samples, samplerate)
    ims, _ = spec_and_freq
    return ims
|
||||
|
||||
|
||||
def plot_stft(samples, samplerate, binsize=2**10, plotpath=None, colormap="jet"):
|
||||
(ims, freq) = generate_spectrogram(samples, samplerate)
|
||||
(ims, freq) = generate_spec_frec(samples, samplerate)
|
||||
timebins, freqbins = np.shape(ims)
|
||||
plt.figure(figsize=(15, 7.5))
|
||||
plt.imshow(
|
||||
|
|
@ -126,8 +126,8 @@ def plot_aiff_stft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):
|
|||
|
||||
|
||||
def play_sunflower():
|
||||
sample_r = snd.get_info('./outputs/sunflowers-Alex-150-normal-589.aiff')[0]
|
||||
snd_data_f64 = snd.read('./outputs/sunflowers-Alex-150-normal-589.aiff')[0]
|
||||
sample_r = snd.get_info('./outputs/audio/sunflowers-Alex-150-normal-589.aiff')[0]
|
||||
snd_data_f64 = snd.read('./outputs/audio/sunflowers-Alex-150-normal-589.aiff')[0]
|
||||
snd_data_f32 = snd_data_f64.astype(np.float32)
|
||||
print(snd_data_f32.shape)
|
||||
snd_data = snd_data_f32.tobytes()
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ import re
|
|||
import subprocess
|
||||
import progressbar
|
||||
|
||||
from generate_similar import similar_phonemes
|
||||
from generate_similar import similar_phonemes,similar_word
|
||||
|
||||
OUTPUT_NAME = 'story_sents'
|
||||
dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
|
||||
|
|
@ -74,10 +74,6 @@ class SynthFile(object):
|
|||
self.filename]
|
||||
|
||||
return ','.join([str(c) for c in cols])+'\n'
|
||||
# return '{},{},{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,
|
||||
# self.voice_lang, self.rate, self.variant,
|
||||
# self.filename)
|
||||
|
||||
|
||||
|
||||
class SynthVariant(object):
|
||||
|
|
@ -116,11 +112,11 @@ class SynthVariant(object):
|
|||
# self.synth.startSpeakingString_toURL_(word,d_url)
|
||||
phoneme = orig_phon
|
||||
elif variant == 'medium':
|
||||
phoneme = re.sub('[0-9]', '', orig_phon)
|
||||
phoneme = similar_phoneme(orig_phon)
|
||||
phon_cmd = '[[inpt PHON]] ' + phoneme
|
||||
elif variant == 'high':
|
||||
phoneme = orig_phon
|
||||
phon_cmd = '[[inpt PHON]] ' + phoneme
|
||||
phoneme = similar_word(word)
|
||||
phon_cmd = phoneme
|
||||
# elif variant == 'long':
|
||||
# if phon != '':
|
||||
# self.phone_synth.startSpeakingString_toURL_(phon,d_url)
|
||||
|
|
@ -132,35 +128,38 @@ class SynthVariant(object):
|
|||
cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
|
||||
return SynthFile(word, phoneme, r_path, self.name, self.lang, self.rate, variant)
|
||||
|
||||
@staticmethod
def voices_for_lang(lang):
    """List the installed voices whose ``VoiceLanguage`` equals ``lang``.

    Voices whose ``VoiceGender`` is ``'VoiceGenderNeuter'`` are left out.

    Returns:
        List of ``(VoiceIdentifier, VoiceName, VoiceLanguage)`` tuples.
    """
    voices_installed = NSSpeechSynthesizer.availableVoices()
    # Collect the attribute dictionaries for every voice before filtering.
    voice_attrs = []
    for installed in voices_installed:
        voice_attrs.append(NSSpeechSynthesizer.attributesForVoice_(installed))

    matching = []
    for attrs in voice_attrs:
        if (attrs['VoiceLanguage'] == lang
                and attrs['VoiceGender'] != 'VoiceGenderNeuter'):
            matching.append((attrs['VoiceIdentifier'],
                             attrs['VoiceName'],
                             attrs['VoiceLanguage']))
    return matching
|
||||
|
||||
@classmethod
def synth_with(cls, voice_params, rate=180):
    """Build an instance from an ``(identifier, voice, lang)`` triple.

    Args:
        voice_params: Three-item sequence unpacked as identifier, voice name
            and language.
        rate: Passed through as the fourth constructor argument
            (default 180).
    """
    voice_id, voice_name, voice_lang = voice_params
    return cls(voice_id, voice_name, voice_lang, rate)
|
||||
|
||||
|
||||
def synth_generator():
|
||||
voices_installed = NSSpeechSynthesizer.availableVoices()
|
||||
voice_attrs = [
|
||||
NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed
|
||||
]
|
||||
# sk = [k for k in voice_attrs[0].keys() if k not in [
|
||||
# 'VoiceIndividuallySpokenCharacters', 'VoiceSupportedCharacters']]
|
||||
# s_attrs = [[v[i] for i in sk] for v in voice_attrs if 'VoiceShowInFullListOnly' in v
|
||||
# and 'VoiceRelativeDesirability' in v]
|
||||
us_voices_ids = [
|
||||
(v['VoiceIdentifier'],
|
||||
v['VoiceName'],
|
||||
v['VoiceLanguage']) for v in voice_attrs
|
||||
if v['VoiceLanguage'] == 'en-US'
|
||||
and v['VoiceGender'] != 'VoiceGenderNeuter'
|
||||
]
|
||||
# import pdb
|
||||
# pdb.set_trace()
|
||||
# us_voices_ids = ['com.apple.speech.synthesis.voice.Fred',
|
||||
# 'com.apple.speech.synthesis.voice.Alex',
|
||||
# 'com.apple.speech.synthesis.voice.Victoria']
|
||||
# voice_rates = list(range(150,221,(220-180)//4))
|
||||
us_voices_ids = SynthVariant.voices_for_lang('en-US')
|
||||
voice_rates = [150, 180, 210, 250]
|
||||
voice_synths = []
|
||||
create_dir(dest_dir)
|
||||
for (i, v, l) in us_voices_ids:
|
||||
for vp in us_voices_ids:
|
||||
for r in voice_rates:
|
||||
s = SynthVariant(i, v, l, r)
|
||||
s = SynthVariant.synth_with(vp,r)
|
||||
if s.phoneme_capable:
|
||||
print('Adding ', s)
|
||||
voice_synths.append(s)
|
||||
|
|
@ -212,11 +211,11 @@ def synth_logger(fname, csv=False):
|
|||
|
||||
|
||||
def generate_audio_for_stories():
|
||||
# story_file = './inputs/all_stories_hs.json'
|
||||
story_file = './inputs/all_stories.json'
|
||||
story_file = './inputs/all_stories_hs.json'
|
||||
# story_file = './inputs/all_stories.json'
|
||||
stories_data = json.load(open(story_file))
|
||||
# word_list = [t[0] for i in stories_data.values() for t in i]
|
||||
word_list = [i for g in stories_data.values() for i in g]
|
||||
word_list = [t[0] for i in stories_data.values() for t in i]
|
||||
# word_list = [i for g in stories_data.values() for i in g]
|
||||
(writer, closer) = synth_logger(dest_file, csv=True)
|
||||
synth_for_words = synth_generator()
|
||||
try:
|
||||
|
|
@ -228,11 +227,6 @@ def generate_audio_for_stories():
|
|||
pass
|
||||
closer()
|
||||
|
||||
# synths = synth_generator()([OUTPUT_NAME])
|
||||
|
||||
# write_synths(synths, dest_file, True)
|
||||
# write_synths(synths,'./outputs/synths.json')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
generate_audio_for_stories()
|
||||
|
|
|
|||
Loading…
Reference in New Issue