wip high variant phoneme
parent
e57576d6fa
commit
eb3ce8b7e5
6
TODO.md
6
TODO.md
|
|
@ -3,3 +3,9 @@
|
|||
2. train a rnn to output a vector using the spectrograms
|
||||
3. train an nn to output True/False based on the acceptability of the rnn output. -> Siamese network (implementation detail)
|
||||
4. validate with real world samples
|
||||
|
||||
The same word spoken by multiple people, etc., will have a low distance. Two words which are very different (you can use the similarity measure given in the speech_recognition repo) will have a high distance.
|
||||
|
||||
The one with the wrong pronunciation will have a medium distance from the one with the right pronunciation.
|
||||
|
||||
I also had a good experience getting non-English voices to speak the English words to get a "wrong" pronunciation - so that will be subtly different too.
|
||||
|
|
|
|||
|
|
@ -9,13 +9,15 @@ import re
|
|||
import subprocess
|
||||
import progressbar
|
||||
|
||||
from generate_similar import similar_phonemes
|
||||
|
||||
OUTPUT_NAME = 'story_sents'
|
||||
dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
|
||||
dest_file = './outputs/' + OUTPUT_NAME + '.csv'
|
||||
|
||||
|
||||
def prog_bar(title):
|
||||
widgets = [title,progressbar.Counter(),'th entry - ', progressbar.FormatLabel(
|
||||
widgets = [title, progressbar.Counter(), 'th entry - ', progressbar.FormatLabel(
|
||||
''), ' [', progressbar.Bar(), '] - ', progressbar.ETA()]
|
||||
prog = progressbar.ProgressBar(widgets=widgets)
|
||||
|
||||
|
|
@ -48,12 +50,13 @@ def cli_gen_audio(speech_cmd, rate, voice, out_path):
|
|||
class SynthFile(object):
|
||||
"""docstring for SynthFile."""
|
||||
|
||||
def __init__(self, word, phon, filename, voice, rate, operation):
|
||||
def __init__(self, word, phon, filename, voice, voice_lang, rate, operation):
|
||||
super(SynthFile, self).__init__()
|
||||
self.word = word
|
||||
self.phoneme = phon
|
||||
self.filename = filename
|
||||
self.voice = voice
|
||||
self.voice_lang = voice_lang
|
||||
self.rate = rate
|
||||
self.variant = operation
|
||||
|
||||
|
|
@ -66,9 +69,15 @@ class SynthFile(object):
|
|||
}
|
||||
|
||||
def get_csv(self):
|
||||
return '{},{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,
|
||||
self.rate, self.variant,
|
||||
self.filename)
|
||||
cols = [self.word, self.phoneme, self.voice,
|
||||
self.voice_lang, self.rate, self.variant,
|
||||
self.filename]
|
||||
|
||||
return ','.join([str(c) for c in cols])+'\n'
|
||||
# return '{},{},{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,
|
||||
# self.voice_lang, self.rate, self.variant,
|
||||
# self.filename)
|
||||
|
||||
|
||||
|
||||
class SynthVariant(object):
|
||||
|
|
@ -90,7 +99,8 @@ class SynthVariant(object):
|
|||
self.name = voice
|
||||
self.lang = lang
|
||||
self.phoneme_capable = self.is_phoneme_capable()
|
||||
create_dir(dest_dir + self.name + '/' + str(self.rate))
|
||||
if self.phoneme_capable:
|
||||
create_dir(dest_dir + self.name + '/' + str(self.rate))
|
||||
|
||||
def __repr__(self):
|
||||
return 'Synthesizer[{} - {}]'.format(self.name, self.rate)
|
||||
|
|
@ -120,7 +130,7 @@ class SynthVariant(object):
|
|||
d_path, r_path = dest_path(self.name, self.rate, fname)
|
||||
# d_url = NSURL.fileURLWithPath_(d_path)
|
||||
cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
|
||||
return SynthFile(word, phoneme, r_path, self.name, self.rate, variant)
|
||||
return SynthFile(word, phoneme, r_path, self.name, self.lang, self.rate, variant)
|
||||
|
||||
|
||||
def synth_generator():
|
||||
|
|
@ -136,14 +146,8 @@ def synth_generator():
|
|||
(v['VoiceIdentifier'],
|
||||
v['VoiceName'],
|
||||
v['VoiceLanguage']) for v in voice_attrs
|
||||
# v['VoiceDemoText'],
|
||||
# v['VoiceShowInFullListOnly'],
|
||||
# v['VoiceRelativeDesirability'])
|
||||
if v['VoiceLanguage'] == 'en-US'
|
||||
and v['VoiceGender'] != 'VoiceGenderNeuter'
|
||||
# and v['VoiceIdentifier'].split('.')[-1][0].isupper()
|
||||
# and 'VoiceShowInFullListOnly' in v
|
||||
# and 'VoiceRelativeDesirability' in v
|
||||
]
|
||||
# import pdb
|
||||
# pdb.set_trace()
|
||||
|
|
@ -164,7 +168,6 @@ def synth_generator():
|
|||
print('Discarding phoneme incapable ', s)
|
||||
|
||||
def synth_for_words(words, writer):
|
||||
# all_synths = []
|
||||
prog_title = "Synthesizing {} words : ".format(len(words))
|
||||
(update, prog) = prog_bar(prog_title)
|
||||
for w in prog(words):
|
||||
|
|
@ -173,9 +176,6 @@ def synth_generator():
|
|||
update('"{}" with {} variant ({})'.format(w, s, v))
|
||||
synthed = s.generate_audio(w, v)
|
||||
writer(synthed)
|
||||
# all_synths.append(synthed)
|
||||
# return all_synths
|
||||
|
||||
return synth_for_words
|
||||
|
||||
|
||||
|
|
@ -228,22 +228,11 @@ def generate_audio_for_stories():
|
|||
pass
|
||||
closer()
|
||||
|
||||
# words_audio_synth = synth_generator()
|
||||
# synth = NSSpeechSynthesizer.alloc().init()
|
||||
# voices_installed = NSSpeechSynthesizer.availableVoices()
|
||||
# voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed]
|
||||
# us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs if v['VoiceLanguage'] == 'en-US' and v['VoiceIdentifier'].split('.')[-1][0].isupper()]
|
||||
# synth.setVoice_(us_voices_ids[2])
|
||||
# synth.startSpeakingString_('your')
|
||||
# fname = dest_filename(word,self.name,self.rate,self.operation)
|
||||
# d_path = dest_path(fname)
|
||||
# d_url = dest_url(d_path)
|
||||
|
||||
|
||||
# synths = synth_generator()([OUTPUT_NAME])
|
||||
|
||||
# write_synths(synths, dest_file, True)
|
||||
# write_synths(synths,'./outputs/synths.json')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
generate_audio_for_stories()
|
||||
|
|
|
|||
Loading…
Reference in New Issue