Implemented TTS generation variants

master
Malar Kannan 2017-10-27 18:53:22 +05:30
parent eb3ce8b7e5
commit 938a9cf0a8
4 changed files with 115 additions and 62 deletions

View File

@ -1,6 +1,12 @@
import pandas as pd import pandas as pd
import pronouncing
import re import re
mapping = {s.split()[0]: s.split()[1] for s in """ import numpy as np
import random
mapping = {
s.split()[0]: s.split()[1]
for s in """
AA AA AA AA
AE AE AE AE
AH UX AH UX
@ -38,40 +44,93 @@ UW UW
V v V v
W w W w
Y y Y y
X x
Z z Z z
ZH Z ZH Z
""".strip().split('\n')} """.strip().split('\n')
}
# Phoneme symbols that parse_apple_phonemes accepts as tokens when it splits a
# phoneme string.  Entries whose first character is one of A/E/I/O/U are the
# ones the parser flags as vowels.
# NOTE(review): presumably the phoneme alphabet of the Apple speech
# synthesizer -- confirm against the synthesizer documentation.
apple_phonemes = [
'%', '@', 'AE', 'EY', 'AO', 'AX', 'IY', 'EH', 'IH', 'AY', 'IX', 'AA', 'UW',
'UH', 'UX', 'OW', 'AW', 'OY', 'b', 'C', 'd', 'D', 'f', 'g', 'h', 'J', 'k',
'l', 'm', 'n', 'N', 'p', 'r', 's', 'S', 't', 'T', 'v', 'w', 'y', 'z', 'Z'
]
sim_mat = pd.read_csv('./similarity.csv', header=0, index_col=0)
sim_mat = pd.read_csv('./similarity.csv',header=0,index_col=0)
def convert_ph(ph):
    """Translate one phoneme symbol through ``mapping``, keeping its stress digit.

    If ``ph`` contains a run of word characters followed by a digit
    (e.g. ``"AA1"``), the digit is moved to the front of the translated
    symbol; otherwise ``ph`` is looked up in ``mapping`` unchanged.

    Raises:
        KeyError: if the symbol (without its stress digit) is not a key of
            ``mapping``.
    """
    # Raw string: "\w" in a normal literal is an invalid escape sequence.
    stress_level = re.search(r"(\w+)([0-9])", ph)
    if stress_level:
        return stress_level.group(2) + mapping[stress_level.group(1)]
    return mapping[ph]
def sim_mat_to_apple_table(smt):
    """Turn a phoneme similarity matrix into a long-form lookup table.

    Args:
        smt: square ``DataFrame`` whose index holds phoneme symbols that
            ``convert_ph`` can translate.  Missing cells are treated as 0.
            NOTE(review): the matrix is added to its own transpose, so it is
            presumably stored as one triangle only -- confirm, otherwise
            off-diagonal scores are doubled.

    Returns:
        ``DataFrame`` with one row per (row label, column label) pair and the
        columns ``q`` (row phoneme), ``r`` (column phoneme) and ``s`` (score).
        Every diagonal entry is forced to ``100.0``.
    """
    colnames = [convert_ph(ph) for ph in smt.index.tolist()]
    # Work on a plain ndarray: writing through ``DataFrame.values`` relies on
    # it being a writable view, which is not guaranteed (Copy-on-Write).
    scores = np.nan_to_num(smt.values)
    symmetric = scores.T + scores
    np.fill_diagonal(symmetric, 100.0)
    asmt = pd.DataFrame(symmetric, index=colnames, columns=colnames)
    apple_sim_table = asmt.stack().reset_index()
    apple_sim_table.columns = ['q', 'r', 's']
    return apple_sim_table
apple_sim_table = sim_mat_to_apple_table(sim_mat)
apple_sim_lookup = sim_mat_to_apple_table(sim_mat)
def top_match(ph, min_score=70, max_score=100):
    """Return the phoneme most similar to ``ph`` according to ``apple_sim_table``.

    Only rows whose ``q`` equals ``ph`` and whose score ``s`` satisfies
    ``min_score <= s < max_score`` are considered; the exclusive upper bound
    leaves out the 100-valued self-match on the diagonal.

    Args:
        ph: phoneme symbol to look up in the ``q`` column.
        min_score: lowest score (inclusive) that counts as a match.
        max_score: score (exclusive) at or above which rows are ignored.

    Returns:
        The ``r`` value of the highest-scoring candidate, or ``ph`` itself
        when no row qualifies.
    """
    selected = apple_sim_table[(apple_sim_table.q == ph)
                               & (apple_sim_table.s < max_score)
                               & (apple_sim_table.s >= min_score)]
    if selected.empty:
        return ph
    return selected.sort_values('s', ascending=False).iloc[0].r
class ApplePhoneme(object):
    """One token of a phoneme string: a symbol plus its stress level.

    Attributes:
        phone: the phoneme symbol (or any other token text) as a string.
        stress: integer stress level; ``adjust_stress`` picks from 0, 1, 2.
        vowel: whether the token is treated as a vowel.  Only vowels render
            their stress digit.
    """

    def __init__(self, phone, stress, vowel=False):
        super(ApplePhoneme, self).__init__()
        self.phone = phone
        self.stress = stress
        self.vowel = vowel

    def __str__(self):
        # The stress digit is written in front of the symbol, and only for
        # vowels with a non-zero stress.
        if self.vowel and self.stress > 0:
            return str(self.stress) + self.phone
        return self.phone

    def __repr__(self):
        return "'{}'".format(str(self))

    def adjust_stress(self):
        """Replace ``stress`` with a random level from 0-2 other than the current one."""
        self.stress = random.choice([i for i in range(3) if i != self.stress])
def parse_apple_phonemes(ph_str):
    """Split a phoneme string into a list of ``ApplePhoneme`` tokens.

    Starting at the current position, prefixes of growing length are tried
    and the first one that matches a rule becomes the next token:

    * the prefix is listed in ``apple_phonemes``: stress 0, flagged as a
      vowel when its first character is one of ``AEIOU``;
    * a digit followed by a symbol from ``apple_phonemes``: the digit is the
      stress and the token is flagged as a vowel;
    * the prefix is not purely alphanumeric: kept as a stress-0 non-vowel
      token so separators survive a round trip.

    Parsing stops at the first position where no prefix matches; whatever
    follows is dropped.

    Args:
        ph_str: the phoneme string to tokenise.

    Returns:
        List of ``ApplePhoneme`` objects in input order (empty for ``""``).
    """
    # Iterative on purpose: one recursion level per token can exhaust the
    # recursion limit on long inputs and copies the result list repeatedly.
    tokens = []
    start, total = 0, len(ph_str)
    while start < total:
        token = None
        for end in range(start + 1, total + 1):
            pref = ph_str[start:end]
            if pref in apple_phonemes:
                token = ApplePhoneme(pref, 0, pref[0] in 'AEIOU')
            elif pref[0].isdigit() and pref[1:] in apple_phonemes:
                token = ApplePhoneme(pref[1:], int(pref[0]), True)
            elif not pref.isalnum():
                token = ApplePhoneme(pref, 0, False)
            if token is not None:
                start = end
                break
        if token is None:
            # Nothing recognisable from here on; give up on the remainder.
            break
        tokens.append(token)
    return tokens
def similar_phoneme(ph_str):
    """Return a variant of ``ph_str`` with the stress of one random vowel changed.

    The string is tokenised with ``parse_apple_phonemes``, one vowel token is
    picked at random and given a different stress level, and the tokens are
    joined back together.

    Args:
        ph_str: phoneme string to vary.

    Returns:
        The modified phoneme string, or ``ph_str`` unchanged when it contains
        no vowel token to vary.
    """
    phons = parse_apple_phonemes(ph_str)
    vowels = [p for p in phons if p.vowel]
    if not vowels:
        # random.choice raises IndexError on an empty list.
        return ph_str
    random.choice(vowels).adjust_stress()
    return ''.join(str(p) for p in phons)


# NOTE(review): a caller visible in this change imports the plural name
# ``similar_phonemes`` from ``generate_similar`` but calls ``similar_phoneme``.
# This alias keeps that import resolvable -- confirm and fix the import.
similar_phonemes = similar_phoneme
def similar_word(word_str):
    """Return a random rhyme of ``word_str``, or ``word_str`` if none is found.

    Rhymes come from ``pronouncing.rhymes``.
    """
    similar = pronouncing.rhymes(word_str)
    return random.choice(similar) if similar else word_str

View File

@ -1,7 +1,7 @@
import pyaudio import pyaudio
import numpy as np import numpy as np
# from matplotlib import pyplot as plt # from matplotlib import pyplot as plt
from spectro_gen import plot_stft, generate_spectrogram from spectro_gen import plot_stft, generate_spec_frec
def record_spectrogram(n_sec, plot=False, playback=False): def record_spectrogram(n_sec, plot=False, playback=False):
@ -38,5 +38,5 @@ def record_spectrogram(n_sec, plot=False, playback=False):
stream.write(mean_channel_data) stream.write(mean_channel_data)
stream.close() stream.close()
p_oup.terminate() p_oup.terminate()
ims, _ = generate_spectrogram(one_channel, SAMPLE_RATE) ims, _ = generate_spec_frec(one_channel, SAMPLE_RATE)
return ims return ims

View File

@ -71,7 +71,7 @@ def logscale_spec(spec, sr=44100, factor=20.):
""" generate spectrogram for aiff audio with 150ms windows and 50ms overlap""" """ generate spectrogram for aiff audio with 150ms windows and 50ms overlap"""
def generate_spectrogram(samples, samplerate): def generate_spec_frec(samples, samplerate):
# samplerate, samples = wav.read(audiopath) # samplerate, samples = wav.read(audiopath)
# s = stft(samples, binsize) # s = stft(samples, binsize)
s = stft(samples, samplerate * 150 // 1000, 1.0 / 3) s = stft(samples, samplerate * 150 // 1000, 1.0 / 3)
@ -84,12 +84,12 @@ def generate_spectrogram(samples, samplerate):
def generate_aiff_spectrogram(audiopath):
    """Read the audio file at ``audiopath`` and return its spectrogram.

    The samples and sample rate returned by ``snd.read`` are passed to
    ``generate_spec_frec``; only the first element of its result is returned.
    """
    samples, samplerate, _ = snd.read(audiopath)
    ims, _ = generate_spec_frec(samples, samplerate)
    return ims
def plot_stft(samples, samplerate, binsize=2**10, plotpath=None, colormap="jet"): def plot_stft(samples, samplerate, binsize=2**10, plotpath=None, colormap="jet"):
(ims, freq) = generate_spectrogram(samples, samplerate) (ims, freq) = generate_spec_frec(samples, samplerate)
timebins, freqbins = np.shape(ims) timebins, freqbins = np.shape(ims)
plt.figure(figsize=(15, 7.5)) plt.figure(figsize=(15, 7.5))
plt.imshow( plt.imshow(
@ -126,8 +126,8 @@ def plot_aiff_stft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):
def play_sunflower(): def play_sunflower():
sample_r = snd.get_info('./outputs/sunflowers-Alex-150-normal-589.aiff')[0] sample_r = snd.get_info('./outputs/audio/sunflowers-Alex-150-normal-589.aiff')[0]
snd_data_f64 = snd.read('./outputs/sunflowers-Alex-150-normal-589.aiff')[0] snd_data_f64 = snd.read('./outputs/audio/sunflowers-Alex-150-normal-589.aiff')[0]
snd_data_f32 = snd_data_f64.astype(np.float32) snd_data_f32 = snd_data_f64.astype(np.float32)
print(snd_data_f32.shape) print(snd_data_f32.shape)
snd_data = snd_data_f32.tobytes() snd_data = snd_data_f32.tobytes()

View File

@ -9,7 +9,7 @@ import re
import subprocess import subprocess
import progressbar import progressbar
from generate_similar import similar_phonemes from generate_similar import similar_phonemes,similar_word
OUTPUT_NAME = 'story_sents' OUTPUT_NAME = 'story_sents'
dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/' dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
@ -74,10 +74,6 @@ class SynthFile(object):
self.filename] self.filename]
return ','.join([str(c) for c in cols])+'\n' return ','.join([str(c) for c in cols])+'\n'
# return '{},{},{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,
# self.voice_lang, self.rate, self.variant,
# self.filename)
class SynthVariant(object): class SynthVariant(object):
@ -116,11 +112,11 @@ class SynthVariant(object):
# self.synth.startSpeakingString_toURL_(word,d_url) # self.synth.startSpeakingString_toURL_(word,d_url)
phoneme = orig_phon phoneme = orig_phon
elif variant == 'medium': elif variant == 'medium':
phoneme = re.sub('[0-9]', '', orig_phon) phoneme = similar_phoneme(orig_phon)
phon_cmd = '[[inpt PHON]] ' + phoneme phon_cmd = '[[inpt PHON]] ' + phoneme
elif variant == 'high': elif variant == 'high':
phoneme = orig_phon phoneme = similar_word(word)
phon_cmd = '[[inpt PHON]] ' + phoneme phon_cmd = phoneme
# elif variant == 'long': # elif variant == 'long':
# if phon != '': # if phon != '':
# self.phone_synth.startSpeakingString_toURL_(phon,d_url) # self.phone_synth.startSpeakingString_toURL_(phon,d_url)
@ -132,35 +128,38 @@ class SynthVariant(object):
cli_gen_audio(phon_cmd, self.rate, self.name, d_path) cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
return SynthFile(word, phoneme, r_path, self.name, self.lang, self.rate, variant) return SynthFile(word, phoneme, r_path, self.name, self.lang, self.rate, variant)
@staticmethod
def voices_for_lang(lang):
    """List the installed, non-neuter voices whose language equals ``lang``.

    Args:
        lang: value compared against each voice's ``'VoiceLanguage'``
            attribute, e.g. ``'en-US'``.

    Returns:
        List of ``(VoiceIdentifier, VoiceName, VoiceLanguage)`` tuples, one
        per matching voice, in the order ``availableVoices()`` reports them.
    """
    voice_attrs = [
        NSSpeechSynthesizer.attributesForVoice_(v)
        for v in NSSpeechSynthesizer.availableVoices()
    ]
    return [
        (v['VoiceIdentifier'], v['VoiceName'], v['VoiceLanguage'])
        for v in voice_attrs
        if v['VoiceLanguage'] == lang
        and v['VoiceGender'] != 'VoiceGenderNeuter'
    ]
@classmethod
def synth_with(cls, voice_params, rate=180):
    """Build an instance from a voice tuple as returned by ``voices_for_lang``.

    Args:
        voice_params: ``(identifier, voice name, language)`` triple.
        rate: value passed as the fourth constructor argument; defaults to 180.

    Returns:
        ``cls(identifier, voice, lang, rate)``.
    """
    identifier, voice, lang = voice_params
    return cls(identifier, voice, lang, rate)
def synth_generator(): def synth_generator():
voices_installed = NSSpeechSynthesizer.availableVoices() us_voices_ids = SynthVariant.voices_for_lang('en-US')
voice_attrs = [
NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed
]
# sk = [k for k in voice_attrs[0].keys() if k not in [
# 'VoiceIndividuallySpokenCharacters', 'VoiceSupportedCharacters']]
# s_attrs = [[v[i] for i in sk] for v in voice_attrs if 'VoiceShowInFullListOnly' in v
# and 'VoiceRelativeDesirability' in v]
us_voices_ids = [
(v['VoiceIdentifier'],
v['VoiceName'],
v['VoiceLanguage']) for v in voice_attrs
if v['VoiceLanguage'] == 'en-US'
and v['VoiceGender'] != 'VoiceGenderNeuter'
]
# import pdb
# pdb.set_trace()
# us_voices_ids = ['com.apple.speech.synthesis.voice.Fred',
# 'com.apple.speech.synthesis.voice.Alex',
# 'com.apple.speech.synthesis.voice.Victoria']
# voice_rates = list(range(150,221,(220-180)//4))
voice_rates = [150, 180, 210, 250] voice_rates = [150, 180, 210, 250]
voice_synths = [] voice_synths = []
create_dir(dest_dir) create_dir(dest_dir)
for (i, v, l) in us_voices_ids: for vp in us_voices_ids:
for r in voice_rates: for r in voice_rates:
s = SynthVariant(i, v, l, r) s = SynthVariant.synth_with(vp,r)
if s.phoneme_capable: if s.phoneme_capable:
print('Adding ', s) print('Adding ', s)
voice_synths.append(s) voice_synths.append(s)
@ -212,11 +211,11 @@ def synth_logger(fname, csv=False):
def generate_audio_for_stories(): def generate_audio_for_stories():
# story_file = './inputs/all_stories_hs.json' story_file = './inputs/all_stories_hs.json'
story_file = './inputs/all_stories.json' # story_file = './inputs/all_stories.json'
stories_data = json.load(open(story_file)) stories_data = json.load(open(story_file))
# word_list = [t[0] for i in stories_data.values() for t in i] word_list = [t[0] for i in stories_data.values() for t in i]
word_list = [i for g in stories_data.values() for i in g] # word_list = [i for g in stories_data.values() for i in g]
(writer, closer) = synth_logger(dest_file, csv=True) (writer, closer) = synth_logger(dest_file, csv=True)
synth_for_words = synth_generator() synth_for_words = synth_generator()
try: try:
@ -228,11 +227,6 @@ def generate_audio_for_stories():
pass pass
closer() closer()
# synths = synth_generator()([OUTPUT_NAME])
# write_synths(synths, dest_file, True)
# write_synths(synths,'./outputs/synths.json')
if __name__ == '__main__': if __name__ == '__main__':
generate_audio_for_stories() generate_audio_for_stories()