Compare commits
No commits in common. "8f793168938da38c567e4121e1f05ceffef7006a" and "ec08cc7d624f50d205c512e67ff90d3370887a09" have entirely different histories.
8f79316893
...
ec08cc7d62
|
|
@ -3,214 +3,61 @@ from AppKit import *
|
||||||
from Foundation import NSURL
|
from Foundation import NSURL
|
||||||
from PyObjCTools import AppHelper
|
from PyObjCTools import AppHelper
|
||||||
from time import time
|
from time import time
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import random
|
|
||||||
import json
|
|
||||||
import csv
|
|
||||||
import subprocess
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
from speech_samplegen import SynthVariant, format_filename
|
|
||||||
from speech_tools import create_dir
|
|
||||||
|
|
||||||
apple_phonemes = [
|
apple_phonemes = [
|
||||||
'%', '@', 'AE', 'EY', 'AO', 'AX', 'IY', 'EH', 'IH', 'AY', 'IX', 'AA', 'UW',
|
'%', '@', 'AE', 'EY', 'AO', 'AX', 'IY', 'EH', 'IH', 'AY', 'IX', 'AA', 'UW',
|
||||||
'UH', 'UX', 'OW', 'AW', 'OY', 'b', 'C', 'd', 'D', 'f', 'g', 'h', 'J', 'k',
|
'UH', 'UX', 'OW', 'AW', 'OY', 'b', 'C', 'd', 'D', 'f', 'g', 'h', 'J', 'k',
|
||||||
'l', 'm', 'n', 'N', 'p', 'r', 's', 'S', 't', 'T', 'v', 'w', 'y', 'z', 'Z'
|
'l', 'm', 'n', 'N', 'p', 'r', 's', 'S', 't', 'T', 'v', 'w', 'y', 'z', 'Z'
|
||||||
]
|
]
|
||||||
|
len(apple_phonemes)
|
||||||
|
|
||||||
OUTPUT_NAME = 'test'
|
speech_phoneme_data = []
|
||||||
|
|
||||||
dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
|
|
||||||
csv_dest_file = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '.csv'
|
|
||||||
create_dir(dest_dir)
|
|
||||||
|
|
||||||
|
|
||||||
def cli_gen_audio(speech_cmd, out_path):
|
|
||||||
subprocess.call(
|
|
||||||
['say', '-o', out_path, "'" + speech_cmd + "'"])
|
|
||||||
|
|
||||||
|
|
||||||
class SpeechDelegate (NSObject):
|
class SpeechDelegate (NSObject):
|
||||||
def speechSynthesizer_willSpeakWord_ofString_(self, sender, word, text):
|
def speechSynthesizer_willSpeakWord_ofString_(self, sender, word, text):
|
||||||
'''Called automatically when the application has launched'''
|
'''Called automatically when the application has launched'''
|
||||||
# print("Speaking word {} in sentence {}".format(word,text))
|
print("Speaking word {} in sentence {}".format(word,text))
|
||||||
self.wordWillSpeak()
|
|
||||||
|
|
||||||
def speechSynthesizer_willSpeakPhoneme_(self,sender,phoneme):
|
def speechSynthesizer_willSpeakPhoneme_(self,sender,phoneme):
|
||||||
phon_ch = apple_phonemes[phoneme]
|
phon_ch = apple_phonemes[phoneme]
|
||||||
self.phonemeWillSpeak(phon_ch)
|
# print('first',speech_phoneme_data)
|
||||||
|
# prev_time = speech_phoneme_data[-1][1]
|
||||||
|
# print('prev_time',prev_time)
|
||||||
|
speech_phoneme_data.append((phon_ch,time()))
|
||||||
|
print("phoneme boundary for {} time {}".format(phon_ch,time()))
|
||||||
|
# NSApp().terminate_(self)
|
||||||
|
|
||||||
def speechSynthesizer_didFinishSpeaking_(self,synth,didFinishSpeaking):
|
def speechSynthesizer_didFinishSpeaking_(self,synth,didFinishSpeaking):
|
||||||
if didFinishSpeaking:
|
speech_phoneme_data.append(('%',time()))
|
||||||
self.completeCB()
|
print("finished speaking time {}".format(time()))
|
||||||
|
diff_time = []
|
||||||
def setC_W_Ph_(self, completed, word, phoneme):
|
for i in range(len(speech_phoneme_data)-1):
|
||||||
self.completeCB = completed
|
dur = speech_phoneme_data[i+1][1] - speech_phoneme_data[i][1]
|
||||||
self.wordWillSpeak = word
|
diff_time.append((speech_phoneme_data[i][0],dur))
|
||||||
self.phonemeWillSpeak = phoneme
|
print(diff_time)
|
||||||
|
|
||||||
# del SpeechDelegate
|
# del SpeechDelegate
|
||||||
|
|
||||||
|
|
||||||
class Delegate (NSObject):
|
class Delegate (NSObject):
|
||||||
def applicationDidFinishLaunching_(self, aNotification):
|
def applicationDidFinishLaunching_(self, aNotification):
|
||||||
'''Called automatically when the application has launched'''
|
'''Called automatically when the application has launched'''
|
||||||
print("App Launched!")
|
print("Window, World!")
|
||||||
generate_audio()
|
|
||||||
|
|
||||||
|
def windowWillClose_(self, aNotification):
|
||||||
class PhonemeTiming(object):
|
'''Called automatically when the window is closed'''
|
||||||
"""docstring for PhonemeTiming."""
|
print("Window has been closed")
|
||||||
|
# Terminate the application
|
||||||
def __init__(self, phon, start):
|
NSApp().terminate_(self)
|
||||||
super(PhonemeTiming, self).__init__()
|
|
||||||
self.phoneme = phon
|
|
||||||
self.start = start
|
|
||||||
self.fraction = 0
|
|
||||||
self.duration = None
|
|
||||||
self.end = None
|
|
||||||
|
|
||||||
def is_audible(self):
|
|
||||||
return self.phoneme not in ['%', '~']
|
|
||||||
|
|
||||||
def tune(self):
|
|
||||||
if self.is_audible():
|
|
||||||
dur_ms = int(self.duration * 1000)
|
|
||||||
return '{} {{D {}}}'.format(self.phoneme, dur_ms)
|
|
||||||
else:
|
|
||||||
return '~'
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return '[{}]({:0.4f})'.format(self.phoneme, self.fraction)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def to_tune(phone_ts):
|
|
||||||
tune_list = ['[[inpt TUNE]]']
|
|
||||||
for ph in phone_ts:
|
|
||||||
tune_list.append(ph.tune())
|
|
||||||
tune_list.append('[[inpt TEXT]]')
|
|
||||||
return '\n'.join(tune_list)
|
|
||||||
|
|
||||||
|
|
||||||
class SegData(object):
|
|
||||||
"""docstring for SegData."""
|
|
||||||
|
|
||||||
def __init__(self, text, filename):
|
|
||||||
super(SegData, self).__init__()
|
|
||||||
self.text = text
|
|
||||||
self.tune = ''
|
|
||||||
self.filename = filename
|
|
||||||
self.segments = []
|
|
||||||
|
|
||||||
def csv_rows(self):
|
|
||||||
result = []
|
|
||||||
s_tim = self.segments[0].start
|
|
||||||
for i in range(len(self.segments) - 1):
|
|
||||||
cs = self.segments[i]
|
|
||||||
# if cs.is_audible():
|
|
||||||
ns = self.segments[i + 1]
|
|
||||||
row = [self.text, self.filename, cs.phoneme, ns.phoneme,
|
|
||||||
(cs.start - s_tim) * 1000, (cs.end - s_tim) * 1000]
|
|
||||||
result.append(row)
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
class SynthesizerQueue(object):
|
|
||||||
"""docstring for SynthesizerQueue."""
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
super(SynthesizerQueue, self).__init__()
|
|
||||||
self.synth = NSSpeechSynthesizer.alloc().init()
|
|
||||||
self.didComplete = None
|
|
||||||
q_delg = SpeechDelegate.alloc().init()
|
|
||||||
self.synth.setDelegate_(q_delg)
|
|
||||||
|
|
||||||
def synth_complete():
|
|
||||||
end_time = time()
|
|
||||||
for i in range(len(self.phoneme_timing)):
|
|
||||||
if i == len(self.phoneme_timing) - 1:
|
|
||||||
self.phoneme_timing[i].duration = end_time - \
|
|
||||||
self.phoneme_timing[i].start
|
|
||||||
self.phoneme_timing[i].end = end_time
|
|
||||||
else:
|
|
||||||
self.phoneme_timing[i].duration = self.phoneme_timing[i +
|
|
||||||
1].start - self.phoneme_timing[i].start
|
|
||||||
self.phoneme_timing[i].end = self.phoneme_timing[i + 1].start
|
|
||||||
|
|
||||||
total_time = sum(
|
|
||||||
[i.duration for i in self.phoneme_timing if i.is_audible()])
|
|
||||||
for ph in self.phoneme_timing:
|
|
||||||
if ph.is_audible():
|
|
||||||
ph.fraction = ph.duration / total_time
|
|
||||||
if self.didComplete:
|
|
||||||
self.data.segments = self.phoneme_timing
|
|
||||||
self.data.tune = PhonemeTiming.to_tune(self.phoneme_timing)
|
|
||||||
self.didComplete(self.data)
|
|
||||||
|
|
||||||
def will_speak_phoneme(phon):
|
|
||||||
phtm = PhonemeTiming(phon, time())
|
|
||||||
self.phoneme_timing.append(phtm)
|
|
||||||
|
|
||||||
def will_speak_word():
|
|
||||||
pass
|
|
||||||
# coz it comes after the first phoneme of the word is started
|
|
||||||
# phtm = PhonemeTiming('~', time())
|
|
||||||
# self.phoneme_timing.append(phtm)
|
|
||||||
|
|
||||||
q_delg.setC_W_Ph_(synth_complete, will_speak_word, will_speak_phoneme)
|
|
||||||
|
|
||||||
def queueTask(self, text):
|
|
||||||
rand_no = str(random.randint(0, 10000))
|
|
||||||
fname = '{}-{}.aiff'.format(text, rand_no)
|
|
||||||
sanitized = format_filename(fname)
|
|
||||||
dest_file = dest_dir + sanitized
|
|
||||||
cli_gen_audio(text, dest_file)
|
|
||||||
self.phoneme_timing = []
|
|
||||||
self.data = SegData(text, sanitized)
|
|
||||||
self.synth.startSpeakingString_(text)
|
|
||||||
|
|
||||||
|
|
||||||
def story_texts():
|
|
||||||
# story_file = './inputs/all_stories_hs.json'
|
|
||||||
story_file = './inputs/all_stories.json'
|
|
||||||
stories_data = json.load(open(story_file))
|
|
||||||
# text_list_dup = [t[0] for i in stories_data.values() for t in i]
|
|
||||||
text_list_dup = [t for i in stories_data.values() for t in i]
|
|
||||||
text_list = sorted(list(set(text_list_dup)))
|
|
||||||
return text_list
|
|
||||||
|
|
||||||
|
|
||||||
def generate_audio():
|
|
||||||
synthQ = SynthesizerQueue()
|
|
||||||
phrases = random.sample(story_texts(), 5) # story_texts()
|
|
||||||
f = open(csv_dest_file, 'w')
|
|
||||||
s_csv_w = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
|
|
||||||
i = 0
|
|
||||||
p = tqdm(total=len(phrases))
|
|
||||||
|
|
||||||
def nextTask(seg_data=None):
|
|
||||||
nonlocal i
|
|
||||||
if i < len(phrases):
|
|
||||||
p.set_postfix(phrase=phrases[i])
|
|
||||||
p.update()
|
|
||||||
synthQ.queueTask(phrases[i])
|
|
||||||
i += 1
|
|
||||||
else:
|
|
||||||
p.close()
|
|
||||||
f.close()
|
|
||||||
dg = NSApplication.sharedApplication().delegate
|
|
||||||
print('App terminated.')
|
|
||||||
NSApp().terminate_(dg)
|
|
||||||
if seg_data:
|
|
||||||
s_csv_w.writerows(seg_data.csv_rows())
|
|
||||||
synthQ.didComplete = nextTask
|
|
||||||
nextTask()
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
speech_delg = SpeechDelegate.alloc().init()
|
||||||
|
speech_delg.speechSynthesizer_didFinishSpeaking_('t',True)
|
||||||
|
voices = NSSpeechSynthesizer.availableVoices()
|
||||||
|
identifier = voices[2]
|
||||||
|
time()
|
||||||
|
alex_voice = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
|
||||||
|
alex_voice.setDelegate_(speech_delg)
|
||||||
|
alex_voice.startSpeakingString_("This is a test for speech synthesis generation")
|
||||||
# Create a new application instance ...
|
# Create a new application instance ...
|
||||||
a=NSApplication.sharedApplication()
|
a=NSApplication.sharedApplication()
|
||||||
# ... and create its delgate. Note the use of the
|
# ... and create its delgate. Note the use of the
|
||||||
|
|
@ -222,6 +69,5 @@ def main():
|
||||||
|
|
||||||
AppHelper.runEventLoop()
|
AppHelper.runEventLoop()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main()
|
main()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue