implemented tts gen variants
parent
eb3ce8b7e5
commit
938a9cf0a8
|
|
@ -1,6 +1,12 @@
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import pronouncing
|
||||||
import re
|
import re
|
||||||
mapping = {s.split()[0]: s.split()[1] for s in """
|
import numpy as np
|
||||||
|
import random
|
||||||
|
|
||||||
|
mapping = {
|
||||||
|
s.split()[0]: s.split()[1]
|
||||||
|
for s in """
|
||||||
AA AA
|
AA AA
|
||||||
AE AE
|
AE AE
|
||||||
AH UX
|
AH UX
|
||||||
|
|
@ -38,40 +44,93 @@ UW UW
|
||||||
V v
|
V v
|
||||||
W w
|
W w
|
||||||
Y y
|
Y y
|
||||||
X x
|
|
||||||
Z z
|
Z z
|
||||||
ZH Z
|
ZH Z
|
||||||
""".strip().split('\n')}
|
""".strip().split('\n')
|
||||||
|
}
|
||||||
|
|
||||||
|
apple_phonemes = [
|
||||||
|
'%', '@', 'AE', 'EY', 'AO', 'AX', 'IY', 'EH', 'IH', 'AY', 'IX', 'AA', 'UW',
|
||||||
|
'UH', 'UX', 'OW', 'AW', 'OY', 'b', 'C', 'd', 'D', 'f', 'g', 'h', 'J', 'k',
|
||||||
|
'l', 'm', 'n', 'N', 'p', 'r', 's', 'S', 't', 'T', 'v', 'w', 'y', 'z', 'Z'
|
||||||
|
]
|
||||||
|
|
||||||
|
sim_mat = pd.read_csv('./similarity.csv', header=0, index_col=0)
|
||||||
|
|
||||||
sim_mat = pd.read_csv('./similarity.csv',header=0,index_col=0)
|
|
||||||
|
|
||||||
def convert_ph(ph):
    """Translate one phoneme label into the notation defined by ``mapping``.

    A label that carries a stress digit (for example ``AH0``) is split into
    its symbol and digit; the result is the digit followed by the mapped
    symbol. A label without a digit is looked up in ``mapping`` directly.

    Raises:
        KeyError: if the symbol has no entry in ``mapping``.
    """
    # Raw string on purpose: "\w" inside a plain literal is an invalid escape
    # sequence, which newer interpreters warn about and will eventually reject.
    stressed = re.search(r"(\w+)([0-9])", ph)
    if stressed is None:
        return mapping[ph]
    symbol, stress = stressed.groups()
    return stress + mapping[symbol]
|
||||||
|
|
||||||
|
|
||||||
def sim_mat_to_apple_table(smt):
    """Flatten a phoneme similarity matrix into a long lookup table.

    The labels in ``smt.index`` are converted with ``convert_ph`` and used for
    both axes. Missing scores become 0, the matrix is added to its transpose,
    and the diagonal is forced to 100.0.

    Args:
        smt: square ``pandas.DataFrame`` of similarity scores; its index
            holds the phoneme labels (assumes rows and columns share the
            same order -- TODO confirm against similarity.csv).

    Returns:
        A ``pandas.DataFrame`` with one row per (row label, column label)
        pair and the columns ``q`` (row label), ``r`` (column label) and
        ``s`` (score).
    """
    labels = [convert_ph(ph) for ph in smt.index.tolist()]

    # Work on a plain NumPy array. ``np.nan_to_num`` returns a fresh array, so
    # neither the caller's frame nor a possibly read-only ``DataFrame.values``
    # buffer (pandas Copy-on-Write) is written to by ``fill_diagonal``.
    scores = np.nan_to_num(smt.values)
    symmetric = scores.T + scores
    np.fill_diagonal(symmetric, 100.0)

    labelled = pd.DataFrame(symmetric, index=labels, columns=labels)
    apple_sim_table = labelled.stack().reset_index()
    apple_sim_table.columns = ['q', 'r', 's']
    return apple_sim_table
|
||||||
|
|
||||||
|
|
||||||
|
apple_sim_table = sim_mat_to_apple_table(sim_mat)
|
||||||
|
|
||||||
apple_sim_lookup = sim_mat_to_apple_table(sim_mat)
|
|
||||||
|
|
||||||
def top_match(ph):
    """Return the phoneme most similar to ``ph`` according to ``apple_sim_table``.

    Only rows whose query column ``q`` equals ``ph`` and whose score ``s``
    lies in the range [70, 100) are considered; the ``r`` value of the
    highest-scoring row is returned. When no row qualifies, ``ph`` itself is
    returned.
    """
    is_query = apple_sim_table.q == ph
    below_identity = apple_sim_table.s < 100
    close_enough = apple_sim_table.s >= 70
    candidates = apple_sim_table[is_query & below_identity & close_enough]

    if len(candidates) == 0:
        return ph

    ranked = candidates.sort_values('s', ascending=False)
    return ranked.iloc[0].r
|
||||||
|
|
||||||
|
|
||||||
|
class ApplePhoneme(object):
    """One parsed phoneme token together with its stress level.

    Attributes are plain and public:
      phone  -- the phoneme symbol text
      stress -- integer stress level
      vowel  -- True when the token is treated as a vowel
    """

    def __init__(self, phone, stress, vowel=False):
        super(ApplePhoneme, self).__init__()
        self.phone, self.stress, self.vowel = phone, stress, vowel

    def __str__(self):
        # The stress digit is only written for vowels with a non-zero stress.
        if self.vowel and self.stress > 0:
            return str(self.stress) + self.phone
        return '' + self.phone

    def __repr__(self):
        return "'%s'" % (str(self),)

    def adjust_stress(self):
        """Replace the stress with a randomly chosen different level from 0-2."""
        other_levels = [level for level in range(3) if level != self.stress]
        self.stress = random.choice(other_levels)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_apple_phonemes(ph_str):
    """Split a phoneme string into a list of ``ApplePhoneme`` tokens.

    Starting at the current position, prefixes are tried from shortest to
    longest and the first one that matches a rule becomes the next token:

    1. the prefix is listed in ``apple_phonemes`` -> stress 0, flagged as a
       vowel when its first character is one of ``AEIOU``;
    2. a leading digit followed by a listed phoneme -> that digit is the
       stress and the token is flagged as a vowel;
    3. the prefix is not purely alphanumeric -> kept verbatim as a
       non-vowel token with stress 0.

    If no prefix of the remaining text matches, parsing stops and the
    remainder is dropped.

    The scan is iterative, so long inputs cannot exhaust the interpreter's
    recursion limit (the recursive form used one frame per token).
    """
    tokens = []
    start, total = 0, len(ph_str)
    while start < total:
        token = None
        for end in range(start + 1, total + 1):
            pref = ph_str[start:end]
            if pref in apple_phonemes:
                token = ApplePhoneme(pref, 0, pref[0] in 'AEIOU')
            elif pref[0].isdigit() and pref[1:] in apple_phonemes:
                token = ApplePhoneme(pref[1:], int(pref[0]), True)
            elif not pref.isalnum():
                token = ApplePhoneme(pref, 0, False)
            if token is not None:
                break
        if token is None:
            # Nothing recognisable from here on; stop with what was parsed.
            break
        tokens.append(token)
        start = end
    return tokens
|
||||||
|
|
||||||
|
|
||||||
def similar_phoneme(ph_str):
    """Return a variant of ``ph_str`` with one vowel's stress changed.

    The string is tokenised with ``parse_apple_phonemes``, one vowel token is
    picked at random and given a different stress level, and the tokens are
    joined back into a string.

    When the parsed string contains no vowel there is nothing to alter, so
    ``ph_str`` is returned unchanged instead of letting ``random.choice``
    fail on an empty list.
    """
    phons = parse_apple_phonemes(ph_str)
    vowels = [p for p in phons if p.vowel]
    if not vowels:
        return ph_str
    random.choice(vowels).adjust_stress()
    return ''.join(str(p) for p in phons)
|
||||||
|
|
||||||
|
|
||||||
|
def similar_word(word_str):
    """Return a random rhyme for ``word_str``, or the word itself if none exist.

    Rhymes are taken from ``pronouncing.rhymes``.
    """
    rhymes = pronouncing.rhymes(word_str)
    if len(rhymes) == 0:
        return word_str
    return random.choice(rhymes)
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
import pyaudio
|
import pyaudio
|
||||||
import numpy as np
|
import numpy as np
|
||||||
# from matplotlib import pyplot as plt
|
# from matplotlib import pyplot as plt
|
||||||
from spectro_gen import plot_stft, generate_spectrogram
|
from spectro_gen import plot_stft, generate_spec_frec
|
||||||
|
|
||||||
|
|
||||||
def record_spectrogram(n_sec, plot=False, playback=False):
|
def record_spectrogram(n_sec, plot=False, playback=False):
|
||||||
|
|
@ -38,5 +38,5 @@ def record_spectrogram(n_sec, plot=False, playback=False):
|
||||||
stream.write(mean_channel_data)
|
stream.write(mean_channel_data)
|
||||||
stream.close()
|
stream.close()
|
||||||
p_oup.terminate()
|
p_oup.terminate()
|
||||||
ims, _ = generate_spectrogram(one_channel, SAMPLE_RATE)
|
ims, _ = generate_spec_frec(one_channel, SAMPLE_RATE)
|
||||||
return ims
|
return ims
|
||||||
|
|
|
||||||
|
|
@ -71,7 +71,7 @@ def logscale_spec(spec, sr=44100, factor=20.):
|
||||||
""" generate spectrogram for aiff audio with 150ms windows and 50ms overlap"""
|
""" generate spectrogram for aiff audio with 150ms windows and 50ms overlap"""
|
||||||
|
|
||||||
|
|
||||||
def generate_spectrogram(samples, samplerate):
|
def generate_spec_frec(samples, samplerate):
|
||||||
# samplerate, samples = wav.read(audiopath)
|
# samplerate, samples = wav.read(audiopath)
|
||||||
# s = stft(samples, binsize)
|
# s = stft(samples, binsize)
|
||||||
s = stft(samples, samplerate * 150 // 1000, 1.0 / 3)
|
s = stft(samples, samplerate * 150 // 1000, 1.0 / 3)
|
||||||
|
|
@ -84,12 +84,12 @@ def generate_spectrogram(samples, samplerate):
|
||||||
|
|
||||||
def generate_aiff_spectrogram(audiopath):
|
def generate_aiff_spectrogram(audiopath):
|
||||||
samples, samplerate, _ = snd.read(audiopath)
|
samples, samplerate, _ = snd.read(audiopath)
|
||||||
ims, _ = generate_spectrogram(samples, samplerate)
|
ims, _ = generate_spec_frec(samples, samplerate)
|
||||||
return ims
|
return ims
|
||||||
|
|
||||||
|
|
||||||
def plot_stft(samples, samplerate, binsize=2**10, plotpath=None, colormap="jet"):
|
def plot_stft(samples, samplerate, binsize=2**10, plotpath=None, colormap="jet"):
|
||||||
(ims, freq) = generate_spectrogram(samples, samplerate)
|
(ims, freq) = generate_spec_frec(samples, samplerate)
|
||||||
timebins, freqbins = np.shape(ims)
|
timebins, freqbins = np.shape(ims)
|
||||||
plt.figure(figsize=(15, 7.5))
|
plt.figure(figsize=(15, 7.5))
|
||||||
plt.imshow(
|
plt.imshow(
|
||||||
|
|
@ -126,8 +126,8 @@ def plot_aiff_stft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):
|
||||||
|
|
||||||
|
|
||||||
def play_sunflower():
|
def play_sunflower():
|
||||||
sample_r = snd.get_info('./outputs/sunflowers-Alex-150-normal-589.aiff')[0]
|
sample_r = snd.get_info('./outputs/audio/sunflowers-Alex-150-normal-589.aiff')[0]
|
||||||
snd_data_f64 = snd.read('./outputs/sunflowers-Alex-150-normal-589.aiff')[0]
|
snd_data_f64 = snd.read('./outputs/audio/sunflowers-Alex-150-normal-589.aiff')[0]
|
||||||
snd_data_f32 = snd_data_f64.astype(np.float32)
|
snd_data_f32 = snd_data_f64.astype(np.float32)
|
||||||
print(snd_data_f32.shape)
|
print(snd_data_f32.shape)
|
||||||
snd_data = snd_data_f32.tobytes()
|
snd_data = snd_data_f32.tobytes()
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ import re
|
||||||
import subprocess
|
import subprocess
|
||||||
import progressbar
|
import progressbar
|
||||||
|
|
||||||
from generate_similar import similar_phonemes
|
from generate_similar import similar_phonemes,similar_word
|
||||||
|
|
||||||
OUTPUT_NAME = 'story_sents'
|
OUTPUT_NAME = 'story_sents'
|
||||||
dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
|
dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
|
||||||
|
|
@ -74,10 +74,6 @@ class SynthFile(object):
|
||||||
self.filename]
|
self.filename]
|
||||||
|
|
||||||
return ','.join([str(c) for c in cols])+'\n'
|
return ','.join([str(c) for c in cols])+'\n'
|
||||||
# return '{},{},{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,
|
|
||||||
# self.voice_lang, self.rate, self.variant,
|
|
||||||
# self.filename)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class SynthVariant(object):
|
class SynthVariant(object):
|
||||||
|
|
@ -116,11 +112,11 @@ class SynthVariant(object):
|
||||||
# self.synth.startSpeakingString_toURL_(word,d_url)
|
# self.synth.startSpeakingString_toURL_(word,d_url)
|
||||||
phoneme = orig_phon
|
phoneme = orig_phon
|
||||||
elif variant == 'medium':
|
elif variant == 'medium':
|
||||||
phoneme = re.sub('[0-9]', '', orig_phon)
|
phoneme = similar_phoneme(orig_phon)
|
||||||
phon_cmd = '[[inpt PHON]] ' + phoneme
|
phon_cmd = '[[inpt PHON]] ' + phoneme
|
||||||
elif variant == 'high':
|
elif variant == 'high':
|
||||||
phoneme = orig_phon
|
phoneme = similar_word(word)
|
||||||
phon_cmd = '[[inpt PHON]] ' + phoneme
|
phon_cmd = phoneme
|
||||||
# elif variant == 'long':
|
# elif variant == 'long':
|
||||||
# if phon != '':
|
# if phon != '':
|
||||||
# self.phone_synth.startSpeakingString_toURL_(phon,d_url)
|
# self.phone_synth.startSpeakingString_toURL_(phon,d_url)
|
||||||
|
|
@ -132,8 +128,8 @@ class SynthVariant(object):
|
||||||
cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
|
cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
|
||||||
return SynthFile(word, phoneme, r_path, self.name, self.lang, self.rate, variant)
|
return SynthFile(word, phoneme, r_path, self.name, self.lang, self.rate, variant)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
def synth_generator():
|
def voices_for_lang(lang):
|
||||||
voices_installed = NSSpeechSynthesizer.availableVoices()
|
voices_installed = NSSpeechSynthesizer.availableVoices()
|
||||||
voice_attrs = [
|
voice_attrs = [
|
||||||
NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed
|
NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed
|
||||||
|
|
@ -142,25 +138,28 @@ def synth_generator():
|
||||||
# 'VoiceIndividuallySpokenCharacters', 'VoiceSupportedCharacters']]
|
# 'VoiceIndividuallySpokenCharacters', 'VoiceSupportedCharacters']]
|
||||||
# s_attrs = [[v[i] for i in sk] for v in voice_attrs if 'VoiceShowInFullListOnly' in v
|
# s_attrs = [[v[i] for i in sk] for v in voice_attrs if 'VoiceShowInFullListOnly' in v
|
||||||
# and 'VoiceRelativeDesirability' in v]
|
# and 'VoiceRelativeDesirability' in v]
|
||||||
us_voices_ids = [
|
return [
|
||||||
(v['VoiceIdentifier'],
|
(v['VoiceIdentifier'],
|
||||||
v['VoiceName'],
|
v['VoiceName'],
|
||||||
v['VoiceLanguage']) for v in voice_attrs
|
v['VoiceLanguage']) for v in voice_attrs
|
||||||
if v['VoiceLanguage'] == 'en-US'
|
if v['VoiceLanguage'] == lang
|
||||||
and v['VoiceGender'] != 'VoiceGenderNeuter'
|
and v['VoiceGender'] != 'VoiceGenderNeuter'
|
||||||
]
|
]
|
||||||
# import pdb
|
|
||||||
# pdb.set_trace()
|
@classmethod
|
||||||
# us_voices_ids = ['com.apple.speech.synthesis.voice.Fred',
|
def synth_with(cls,voice_params,rate=180):
|
||||||
# 'com.apple.speech.synthesis.voice.Alex',
|
identifier,voice,lang = voice_params
|
||||||
# 'com.apple.speech.synthesis.voice.Victoria']
|
return cls(identifier,voice,lang,rate)
|
||||||
# voice_rates = list(range(150,221,(220-180)//4))
|
|
||||||
|
|
||||||
|
def synth_generator():
|
||||||
|
us_voices_ids = SynthVariant.voices_for_lang('en-US')
|
||||||
voice_rates = [150, 180, 210, 250]
|
voice_rates = [150, 180, 210, 250]
|
||||||
voice_synths = []
|
voice_synths = []
|
||||||
create_dir(dest_dir)
|
create_dir(dest_dir)
|
||||||
for (i, v, l) in us_voices_ids:
|
for vp in us_voices_ids:
|
||||||
for r in voice_rates:
|
for r in voice_rates:
|
||||||
s = SynthVariant(i, v, l, r)
|
s = SynthVariant.synth_with(vp,r)
|
||||||
if s.phoneme_capable:
|
if s.phoneme_capable:
|
||||||
print('Adding ', s)
|
print('Adding ', s)
|
||||||
voice_synths.append(s)
|
voice_synths.append(s)
|
||||||
|
|
@ -212,11 +211,11 @@ def synth_logger(fname, csv=False):
|
||||||
|
|
||||||
|
|
||||||
def generate_audio_for_stories():
|
def generate_audio_for_stories():
|
||||||
# story_file = './inputs/all_stories_hs.json'
|
story_file = './inputs/all_stories_hs.json'
|
||||||
story_file = './inputs/all_stories.json'
|
# story_file = './inputs/all_stories.json'
|
||||||
stories_data = json.load(open(story_file))
|
stories_data = json.load(open(story_file))
|
||||||
# word_list = [t[0] for i in stories_data.values() for t in i]
|
word_list = [t[0] for i in stories_data.values() for t in i]
|
||||||
word_list = [i for g in stories_data.values() for i in g]
|
# word_list = [i for g in stories_data.values() for i in g]
|
||||||
(writer, closer) = synth_logger(dest_file, csv=True)
|
(writer, closer) = synth_logger(dest_file, csv=True)
|
||||||
synth_for_words = synth_generator()
|
synth_for_words = synth_generator()
|
||||||
try:
|
try:
|
||||||
|
|
@ -228,11 +227,6 @@ def generate_audio_for_stories():
|
||||||
pass
|
pass
|
||||||
closer()
|
closer()
|
||||||
|
|
||||||
# synths = synth_generator()([OUTPUT_NAME])
|
|
||||||
|
|
||||||
# write_synths(synths, dest_file, True)
|
|
||||||
# write_synths(synths,'./outputs/synths.json')
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
generate_audio_for_stories()
|
generate_audio_for_stories()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue