Compare commits
5 Commits
49e6a46efd
...
eb3ce8b7e5
| Author | SHA1 | Date |
|---|---|---|
|
|
eb3ce8b7e5 | |
|
|
e57576d6fa | |
|
|
a953fa3355 | |
|
|
7a520b79f4 | |
|
|
05f36daf7e |
|
|
@ -138,7 +138,7 @@ Temporary Items
|
||||||
# End of https://www.gitignore.io/api/macos
|
# End of https://www.gitignore.io/api/macos
|
||||||
|
|
||||||
outputs/*
|
outputs/*
|
||||||
inputs/mnist
|
inputs/*
|
||||||
inputs/audio*
|
inputs/audio*
|
||||||
logs/*
|
logs/*
|
||||||
models/*
|
models/*
|
||||||
|
|
|
||||||
7
TODO.md
7
TODO.md
|
|
@ -2,3 +2,10 @@
|
||||||
1. create spectrograms of 150ms windows with 50ms overlap for each word.
|
1. create spectrograms of 150ms windows with 50ms overlap for each word.
|
||||||
2. train an RNN to output a vector using the spectrograms
|
2. train an RNN to output a vector using the spectrograms
|
||||||
3. train an NN to output True/False based on the acceptability of the RNN output. -> Siamese network (implementation detail)
|
3. train an NN to output True/False based on the acceptability of the RNN output. -> Siamese network (implementation detail)
|
||||||
|
4. validate with real-world samples
|
||||||
|
|
||||||
|
The same word spoken by multiple people, etc., will have a low distance. Two words that are very different (you can use the similarity measure given in the speech_recognition repo) will have a high distance.
|
||||||
|
|
||||||
|
The one with the wrong pronunciation will have a medium distance from the one with the right pronunciation.
|
||||||
|
|
||||||
|
I also had a good experience getting non-English voices to speak the English words to get "wrong" pronunciations - so those will be subtly different too.
|
||||||
|
|
|
||||||
159
tts_samplegen.py
159
tts_samplegen.py
|
|
@ -7,24 +7,38 @@ import random
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import progressbar
|
||||||
|
|
||||||
OUTPUT_NAME = 'audio'
|
from generate_similar import similar_phonemes
|
||||||
|
|
||||||
|
OUTPUT_NAME = 'story_sents'
|
||||||
dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
|
dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
|
||||||
dest_file = './outputs/' + OUTPUT_NAME + '.csv'
|
dest_file = './outputs/' + OUTPUT_NAME + '.csv'
|
||||||
|
|
||||||
|
|
||||||
|
def prog_bar(title):
|
||||||
|
widgets = [title, progressbar.Counter(), 'th entry - ', progressbar.FormatLabel(
|
||||||
|
''), ' [', progressbar.Bar(), '] - ', progressbar.ETA()]
|
||||||
|
prog = progressbar.ProgressBar(widgets=widgets)
|
||||||
|
|
||||||
|
def update_prog(current):
|
||||||
|
widgets[3] = progressbar.FormatLabel(current)
|
||||||
|
prog.update()
|
||||||
|
return (update_prog, prog)
|
||||||
|
|
||||||
|
|
||||||
def create_dir(direc):
|
def create_dir(direc):
|
||||||
if not os.path.exists(direc):
|
if not os.path.exists(direc):
|
||||||
os.mkdir(direc)
|
os.makedirs(direc)
|
||||||
|
|
||||||
|
|
||||||
def dest_filename(n, v, r, t):
|
def dest_filename(w, v, r, t):
|
||||||
return '{}-{}-{}-{}-'.format(n, v, r,
|
return '{}-{}-{}-{}-{}.aiff'.format(w, v, r, t, str(random.randint(0, 10000)))
|
||||||
t) + str(random.randint(0, 10000)) + '.aiff'
|
|
||||||
|
|
||||||
|
|
||||||
def dest_path(v, r, n):
|
def dest_path(v, r, n):
|
||||||
return dest_dir + v + '/' + r + '/' + n
|
rel = v + '/' + str(r) + '/' + n
|
||||||
|
return (dest_dir + rel), rel
|
||||||
|
|
||||||
|
|
||||||
def cli_gen_audio(speech_cmd, rate, voice, out_path):
|
def cli_gen_audio(speech_cmd, rate, voice, out_path):
|
||||||
|
|
@ -36,12 +50,13 @@ def cli_gen_audio(speech_cmd, rate, voice, out_path):
|
||||||
class SynthFile(object):
|
class SynthFile(object):
|
||||||
"""docstring for SynthFile."""
|
"""docstring for SynthFile."""
|
||||||
|
|
||||||
def __init__(self, word, phon, filename, voice, rate, operation):
|
def __init__(self, word, phon, filename, voice, voice_lang, rate, operation):
|
||||||
super(SynthFile, self).__init__()
|
super(SynthFile, self).__init__()
|
||||||
self.word = word
|
self.word = word
|
||||||
self.phoneme = phon
|
self.phoneme = phon
|
||||||
self.filename = filename
|
self.filename = filename
|
||||||
self.voice = voice
|
self.voice = voice
|
||||||
|
self.voice_lang = voice_lang
|
||||||
self.rate = rate
|
self.rate = rate
|
||||||
self.variant = operation
|
self.variant = operation
|
||||||
|
|
||||||
|
|
@ -54,15 +69,21 @@ class SynthFile(object):
|
||||||
}
|
}
|
||||||
|
|
||||||
def get_csv(self):
|
def get_csv(self):
|
||||||
return '{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,
|
cols = [self.word, self.phoneme, self.voice,
|
||||||
self.rate, self.variant,
|
self.voice_lang, self.rate, self.variant,
|
||||||
self.filename)
|
self.filename]
|
||||||
|
|
||||||
|
return ','.join([str(c) for c in cols])+'\n'
|
||||||
|
# return '{},{},{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,
|
||||||
|
# self.voice_lang, self.rate, self.variant,
|
||||||
|
# self.filename)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class SynthVariant(object):
|
class SynthVariant(object):
|
||||||
"""docstring for SynthVariant."""
|
"""docstring for SynthVariant."""
|
||||||
|
|
||||||
def __init__(self, identifier, rate):
|
def __init__(self, identifier, voice, lang, rate):
|
||||||
super(SynthVariant, self).__init__()
|
super(SynthVariant, self).__init__()
|
||||||
self.synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
|
self.synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
|
||||||
self.synth.setVolume_(100)
|
self.synth.setVolume_(100)
|
||||||
|
|
@ -75,10 +96,18 @@ class SynthVariant(object):
|
||||||
NSSpeechModePhoneme, NSSpeechInputModeProperty, None)
|
NSSpeechModePhoneme, NSSpeechInputModeProperty, None)
|
||||||
self.identifier = identifier
|
self.identifier = identifier
|
||||||
self.rate = rate
|
self.rate = rate
|
||||||
self.name = identifier.split('.')[-1]
|
self.name = voice
|
||||||
|
self.lang = lang
|
||||||
|
self.phoneme_capable = self.is_phoneme_capable()
|
||||||
|
if self.phoneme_capable:
|
||||||
|
create_dir(dest_dir + self.name + '/' + str(self.rate))
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return 'Synthesizer[{} - {}]({})'.format(self.name, self.rate)
|
return 'Synthesizer[{} - {}]'.format(self.name, self.rate)
|
||||||
|
|
||||||
|
def is_phoneme_capable(self):
|
||||||
|
orig_phon = self.synth.phonemesFromText_('water')
|
||||||
|
return orig_phon != ''
|
||||||
|
|
||||||
def generate_audio(self, word, variant):
|
def generate_audio(self, word, variant):
|
||||||
orig_phon, phoneme, phon_cmd = self.synth.phonemesFromText_(
|
orig_phon, phoneme, phon_cmd = self.synth.phonemesFromText_(
|
||||||
|
|
@ -91,17 +120,17 @@ class SynthVariant(object):
|
||||||
phon_cmd = '[[inpt PHON]] ' + phoneme
|
phon_cmd = '[[inpt PHON]] ' + phoneme
|
||||||
elif variant == 'high':
|
elif variant == 'high':
|
||||||
phoneme = orig_phon
|
phoneme = orig_phon
|
||||||
phon_cmd = word
|
phon_cmd = '[[inpt PHON]] ' + phoneme
|
||||||
# elif variant == 'long':
|
# elif variant == 'long':
|
||||||
# if phon != '':
|
# if phon != '':
|
||||||
# self.phone_synth.startSpeakingString_toURL_(phon,d_url)
|
# self.phone_synth.startSpeakingString_toURL_(phon,d_url)
|
||||||
# else:
|
# else:
|
||||||
# self.synth.startSpeakingString_toURL_(word,d_url)
|
# self.synth.startSpeakingString_toURL_(word,d_url)
|
||||||
fname = dest_filename(word, phoneme, self.name, self.rate)
|
fname = dest_filename(word, self.name, self.rate, variant)
|
||||||
d_path = dest_path(self.name, self.rate, fname)
|
d_path, r_path = dest_path(self.name, self.rate, fname)
|
||||||
# d_url = NSURL.fileURLWithPath_(d_path)
|
# d_url = NSURL.fileURLWithPath_(d_path)
|
||||||
cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
|
cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
|
||||||
return SynthFile(word, phoneme, fname, self.name, self.rate, variant)
|
return SynthFile(word, phoneme, r_path, self.name, self.lang, self.rate, variant)
|
||||||
|
|
||||||
|
|
||||||
def synth_generator():
|
def synth_generator():
|
||||||
|
|
@ -109,11 +138,19 @@ def synth_generator():
|
||||||
voice_attrs = [
|
voice_attrs = [
|
||||||
NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed
|
NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed
|
||||||
]
|
]
|
||||||
|
# sk = [k for k in voice_attrs[0].keys() if k not in [
|
||||||
|
# 'VoiceIndividuallySpokenCharacters', 'VoiceSupportedCharacters']]
|
||||||
|
# s_attrs = [[v[i] for i in sk] for v in voice_attrs if 'VoiceShowInFullListOnly' in v
|
||||||
|
# and 'VoiceRelativeDesirability' in v]
|
||||||
us_voices_ids = [
|
us_voices_ids = [
|
||||||
v['VoiceIdentifier'] for v in voice_attrs
|
(v['VoiceIdentifier'],
|
||||||
|
v['VoiceName'],
|
||||||
|
v['VoiceLanguage']) for v in voice_attrs
|
||||||
if v['VoiceLanguage'] == 'en-US'
|
if v['VoiceLanguage'] == 'en-US'
|
||||||
and v['VoiceIdentifier'].split('.')[-1][0].isupper()
|
and v['VoiceGender'] != 'VoiceGenderNeuter'
|
||||||
]
|
]
|
||||||
|
# import pdb
|
||||||
|
# pdb.set_trace()
|
||||||
# us_voices_ids = ['com.apple.speech.synthesis.voice.Fred',
|
# us_voices_ids = ['com.apple.speech.synthesis.voice.Fred',
|
||||||
# 'com.apple.speech.synthesis.voice.Alex',
|
# 'com.apple.speech.synthesis.voice.Alex',
|
||||||
# 'com.apple.speech.synthesis.voice.Victoria']
|
# 'com.apple.speech.synthesis.voice.Victoria']
|
||||||
|
|
@ -121,19 +158,24 @@ def synth_generator():
|
||||||
voice_rates = [150, 180, 210, 250]
|
voice_rates = [150, 180, 210, 250]
|
||||||
voice_synths = []
|
voice_synths = []
|
||||||
create_dir(dest_dir)
|
create_dir(dest_dir)
|
||||||
for v in us_voices_ids:
|
for (i, v, l) in us_voices_ids:
|
||||||
for r in voice_rates:
|
for r in voice_rates:
|
||||||
create_dir(dest_dir + v + '/' + r)
|
s = SynthVariant(i, v, l, r)
|
||||||
voice_synths.append(SynthVariant(v, r))
|
if s.phoneme_capable:
|
||||||
|
print('Adding ', s)
|
||||||
|
voice_synths.append(s)
|
||||||
|
else:
|
||||||
|
print('Discarding phoneme incapable ', s)
|
||||||
|
|
||||||
def synth_for_words(words):
|
def synth_for_words(words, writer):
|
||||||
all_synths = []
|
prog_title = "Synthesizing {} words : ".format(len(words))
|
||||||
for w in words:
|
(update, prog) = prog_bar(prog_title)
|
||||||
|
for w in prog(words):
|
||||||
for s in voice_synths:
|
for s in voice_synths:
|
||||||
for v in ['low', 'medium', 'high']:
|
for v in ['low', 'medium', 'high']:
|
||||||
all_synths.append(s.generate_audio(w, v))
|
update('"{}" with {} variant ({})'.format(w, s, v))
|
||||||
return all_synths
|
synthed = s.generate_audio(w, v)
|
||||||
|
writer(synthed)
|
||||||
return synth_for_words
|
return synth_for_words
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -147,25 +189,50 @@ def write_synths(synth_list, fname, csv=False):
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
|
|
||||||
|
def synth_logger(fname, csv=False):
|
||||||
|
f = open(fname, 'w')
|
||||||
|
|
||||||
|
def csv_writer(s):
|
||||||
|
f.write(s.get_csv())
|
||||||
|
synth_list = []
|
||||||
|
|
||||||
|
def json_writer(s):
|
||||||
|
synth_list.append(s)
|
||||||
|
|
||||||
|
def close_file():
|
||||||
|
if csv:
|
||||||
|
f.close()
|
||||||
|
else:
|
||||||
|
json.dump([s.get_json() for s in synth_list], f)
|
||||||
|
f.close()
|
||||||
|
if csv:
|
||||||
|
return csv_writer, close_file
|
||||||
|
else:
|
||||||
|
return json_writer, close_file
|
||||||
|
|
||||||
|
|
||||||
def generate_audio_for_stories():
|
def generate_audio_for_stories():
|
||||||
stories_data = json.load(open('./inputs/all_stories_hs.json'))
|
# story_file = './inputs/all_stories_hs.json'
|
||||||
word_list = [t[0] for i in stories_data.values() for t in i]
|
story_file = './inputs/all_stories.json'
|
||||||
words_audio_synth = synth_generator()
|
stories_data = json.load(open(story_file))
|
||||||
return words_audio_synth(word_list)
|
# word_list = [t[0] for i in stories_data.values() for t in i]
|
||||||
|
word_list = [i for g in stories_data.values() for i in g]
|
||||||
|
(writer, closer) = synth_logger(dest_file, csv=True)
|
||||||
|
synth_for_words = synth_generator()
|
||||||
|
try:
|
||||||
|
synth_for_words(word_list, writer)
|
||||||
|
except:
|
||||||
|
import traceback
|
||||||
|
import sys
|
||||||
|
traceback.print_exc(file=sys.stdout)
|
||||||
|
pass
|
||||||
|
closer()
|
||||||
|
|
||||||
|
# synths = synth_generator()([OUTPUT_NAME])
|
||||||
|
|
||||||
# words_audio_synth = synth_generator()
|
# write_synths(synths, dest_file, True)
|
||||||
# synth = NSSpeechSynthesizer.alloc().init()
|
|
||||||
# voices_installed = NSSpeechSynthesizer.availableVoices()
|
|
||||||
# voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed]
|
|
||||||
# us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs if v['VoiceLanguage'] == 'en-US' and v['VoiceIdentifier'].split('.')[-1][0].isupper()]
|
|
||||||
# synth.setVoice_(us_voices_ids[2])
|
|
||||||
# synth.startSpeakingString_('your')
|
|
||||||
# fname = dest_filename(word,self.name,self.rate,self.operation)
|
|
||||||
# d_path = dest_path(fname)
|
|
||||||
# d_url = dest_url(d_path)
|
|
||||||
|
|
||||||
synths = synth_generator()([OUTPUT_NAME])
|
|
||||||
# synths = generate_audio_for_stories()
|
|
||||||
write_synths(synths, dest_file, True)
|
|
||||||
# write_synths(synths,'./outputs/synths.json')
|
# write_synths(synths,'./outputs/synths.json')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
generate_audio_for_stories()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue