Implemented TTS segmentation generation code

2017-11-28 12:16:57 +05:30
parent 3d7542271d
commit 0345cc46ae
1 changed files with 187 additions and 33 deletions
--- a/speech_tts_queue.py
+++ b/speech_tts_queue.py
@@ -3,61 +3,214 @@ from AppKit import *
 from Foundation import NSURL
 from PyObjCTools import AppHelper
 from time import time
 import os
 import sys
 import random
 import json
 import csv
 import subprocess
 from tqdm import tqdm
 from speech_samplegen import SynthVariant, format_filename
 from speech_tools import create_dir
 # Phoneme symbols used by this script. The list is indexed with the integer
 # `phoneme` value that speechSynthesizer_willSpeakPhoneme_ receives, so the
 # order of the entries matters.
 apple_phonemes = [
    '%', '@', 'AE', 'EY', 'AO', 'AX', 'IY', 'EH', 'IH', 'AY', 'IX', 'AA', 'UW',
    'UH', 'UX', 'OW', 'AW', 'OY', 'b', 'C', 'd', 'D', 'f', 'g', 'h', 'J', 'k',
    'l', 'm', 'n', 'N', 'p', 'r', 's', 'S', 't', 'T', 'v', 'w', 'y', 'z', 'Z'
 ]
 # NOTE(review): bare expression whose result is discarded; it has no effect.
 len(apple_phonemes)
-speech_phoneme_data = []
+OUTPUT_NAME = 'test'
 # Output locations for this run, both rooted at ./outputs/ under the current
 # working directory and named after OUTPUT_NAME: a directory for the audio
 # files and a sibling .csv file for the segmentation rows.
 dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
 csv_dest_file = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '.csv'
 # Executed at import time. create_dir comes from speech_tools; presumably it
 # creates the directory when it is missing (confirm against speech_tools).
 create_dir(dest_dir)
 def cli_gen_audio(speech_cmd, out_path):
    """Render ``speech_cmd`` to an audio file using the macOS ``say`` tool.

    Args:
        speech_cmd: The text to be spoken.
        out_path: Destination path handed to ``say -o``.

    The call blocks until ``say`` exits; its exit status is not inspected.
    """
    # The arguments are passed as a list, so no shell is involved and the
    # text needs no quoting. Wrapping it in "'" would hand literal quote
    # characters to `say`, making the rendered audio differ from the text
    # that the in-process synthesizer is asked to speak.
    subprocess.call(['say', '-o', out_path, speech_cmd])
 class SpeechDelegate (NSObject):
    def speechSynthesizer_willSpeakWord_ofString_(self, sender, word, text):
        '''Called automatically when the application has launched'''
-        print("Speaking word {} in sentence {}".format(word,text))
+        # print("Speaking word {} in sentence {}".format(word,text))
        self.wordWillSpeak()
    def speechSynthesizer_willSpeakPhoneme_(self, sender, phoneme):
        phon_ch = apple_phonemes[phoneme]
-        # print('first',speech_phoneme_data)
+        self.phonemeWillSpeak(phon_ch)
        # prev_time = speech_phoneme_data[-1][1]
        # print('prev_time',prev_time)
        speech_phoneme_data.append((phon_ch,time()))
        print("phoneme boundary for {} time {}".format(phon_ch,time()))
        # NSApp().terminate_(self)
    def speechSynthesizer_didFinishSpeaking_(self, synth, didFinishSpeaking):
-        speech_phoneme_data.append(('%',time()))
+        if didFinishSpeaking:
-        print("finished speaking time {}".format(time()))
+            self.completeCB()
-        diff_time = []
+
-        for i in range(len(speech_phoneme_data)-1):
+    def setC_W_Ph_(self, completed, word, phoneme):
-            dur = speech_phoneme_data[i+1][1] - speech_phoneme_data[i][1]
+        self.completeCB = completed
-            diff_time.append((speech_phoneme_data[i][0],dur))
+        self.wordWillSpeak = word
-        print(diff_time)
+        self.phonemeWillSpeak = phoneme
 # del SpeechDelegate
 class Delegate (NSObject):
    def applicationDidFinishLaunching_(self, aNotification):
        '''Called automatically when the application has launched'''
-        print("Window, World!")
+        print("App Launched!")
        generate_audio()
-    def windowWillClose_(self, aNotification):
+
-        '''Called automatically when the window is closed'''
+class PhonemeTiming(object):
-        print("Window has been closed")
+    """docstring for PhonemeTiming."""
-        # Terminate the application
+
-        NSApp().terminate_(self)
+    def __init__(self, phon, start):
        super(PhonemeTiming, self).__init__()
        self.phoneme = phon
        self.start = start
        self.fraction = 0
        self.duration = None
        self.end = None
    def is_audible(self):
        return self.phoneme not in ['%', '~']
    def tune(self):
        if self.is_audible():
            dur_ms = int(self.duration * 1000)
            return '{} {{D {}}}'.format(self.phoneme, dur_ms)
        else:
            return '~'
    def __repr__(self):
        return '[{}]({:0.4f})'.format(self.phoneme, self.fraction)
    @staticmethod
    def to_tune(phone_ts):
        tune_list = ['[[inpt TUNE]]']
        for ph in phone_ts:
            tune_list.append(ph.tune())
        tune_list.append('[[inpt TEXT]]')
        return '\n'.join(tune_list)
 class SegData(object):
    """Segmentation record for one synthesized phrase.

    Holds the phrase text, the name of its audio file, a TUNE script and
    the list of per-phoneme timing entries. ``tune`` and ``segments`` start
    out empty and are filled in by whoever drives the synthesis.
    """

    def __init__(self, text, filename):
        super(SegData, self).__init__()
        # The phrase that was spoken.
        self.text = text
        # TUNE-format script; empty until assigned from outside.
        self.tune = ''
        # Name of the audio file that belongs to this phrase.
        self.filename = filename
        # Timing entries exposing .phoneme, .start and .end attributes.
        self.segments = []

    def csv_rows(self):
        """Return one CSV row per pair of consecutive segments.

        Each row is ``[text, filename, phoneme, next_phoneme, start_ms,
        end_ms]``. The two times belong to the first segment of the pair
        and are milliseconds relative to the start of the first segment.
        The final segment has no successor, so it only appears as a
        ``next_phoneme``.

        Returns:
            A list of rows; empty when there are fewer than two segments.
        """
        # Guard the empty case: indexing segments[0] would raise IndexError.
        if not self.segments:
            return []
        origin = self.segments[0].start
        return [
            [self.text, self.filename, current.phoneme, following.phoneme,
             (current.start - origin) * 1000, (current.end - origin) * 1000]
            for current, following in zip(self.segments, self.segments[1:])
        ]
 class SynthesizerQueue(object):
    """Speak one phrase at a time and record when each phoneme starts.

    A SpeechDelegate is attached to an NSSpeechSynthesizer and wired to
    three closures: one records a PhonemeTiming per phoneme callback, one
    ignores word callbacks, and one finalises the timings when speaking
    completes and hands the finished SegData to ``didComplete``.
    """

    def __init__(self):
        super(SynthesizerQueue, self).__init__()
        self.synth = NSSpeechSynthesizer.alloc().init()
        # Callable taking the finished SegData; assigned by the caller.
        self.didComplete = None
        # Created here (not only in queueTask) so that a delegate callback
        # arriving before the first queueTask() finds the attributes.
        self.phoneme_timing = []
        self.data = None
        # NOTE(review): Cocoa delegate properties are normally not retained
        # by the object they serve, so the delegate is kept alive here
        # rather than in a local - confirm against the AppKit docs.
        self._delegate = SpeechDelegate.alloc().init()
        self.synth.setDelegate_(self._delegate)

        def synth_complete():
            end_time = time()
            timings = self.phoneme_timing
            # Every phoneme ends where the next one starts; the last one
            # ends at the moment completion was reported.
            boundaries = [t.start for t in timings[1:]] + [end_time]
            for entry, boundary in zip(timings, boundaries):
                entry.duration = boundary - entry.start
                entry.end = boundary
            total_time = sum(
                entry.duration for entry in timings if entry.is_audible())
            # Skip the normalisation when the total is zero to avoid a
            # ZeroDivisionError; fractions then keep their initial value.
            if total_time:
                for entry in timings:
                    if entry.is_audible():
                        entry.fraction = entry.duration / total_time
            if self.didComplete and self.data is not None:
                self.data.segments = timings
                self.data.tune = PhonemeTiming.to_tune(timings)
                self.didComplete(self.data)

        def will_speak_phoneme(phon):
            self.phoneme_timing.append(PhonemeTiming(phon, time()))

        def will_speak_word():
            # Deliberately a no-op: the word callback arrives after the
            # first phoneme of the word has already started.
            pass

        self._delegate.setC_W_Ph_(
            synth_complete, will_speak_word, will_speak_phoneme)

    def queueTask(self, text):
        """Render ``text`` to an audio file, then start speaking it.

        The file name is built from the text plus a random number and
        passed through format_filename; the file is written into dest_dir
        by cli_gen_audio before speaking starts. Timing state is reset for
        the new phrase just before the synthesizer is started.
        """
        rand_no = str(random.randint(0, 10000))
        fname = '{}-{}.aiff'.format(text, rand_no)
        sanitized = format_filename(fname)
        dest_file = dest_dir + sanitized
        cli_gen_audio(text, dest_file)
        self.phoneme_timing = []
        self.data = SegData(text, sanitized)
        self.synth.startSpeakingString_(text)
 def story_texts():
    """Load the distinct story texts from ``./inputs/all_stories.json``.

    The JSON document is expected to be an object whose values are lists
    of texts. All lists are flattened, duplicates are dropped, and the
    result is returned in sorted order.

    Returns:
        A sorted list of the unique texts.
    """
    story_file = './inputs/all_stories.json'
    # A context manager closes the file; the bare open() leaked the handle.
    with open(story_file) as story_fh:
        stories_data = json.load(story_fh)
    unique_texts = {
        text for texts in stories_data.values() for text in texts}
    return sorted(unique_texts)
 def generate_audio(sample_size=5):
    """Synthesize a random sample of story texts and log their segments.

    Phrases are handed to a SynthesizerQueue one at a time. Each completed
    phrase reports its SegData back, whose rows are appended to
    csv_dest_file. When every phrase is done, the progress bar and the CSV
    file are closed and the application is asked to terminate.

    Args:
        sample_size: Upper bound on the number of phrases to synthesize.
    """
    synthQ = SynthesizerQueue()
    texts = story_texts()
    # random.sample raises ValueError when asked for more items than exist.
    phrases = random.sample(texts, min(sample_size, len(texts)))
    # newline='' is what the csv module asks for on files it writes to.
    f = open(csv_dest_file, 'w', newline='')
    s_csv_w = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
    i = 0
    p = tqdm(total=len(phrases))

    def nextTask(seg_data=None):
        nonlocal i
        # Persist the finished phrase first. Writing after the branch below
        # meant the last phrase's rows were written after f.close() and
        # terminate_, so they were lost or raised on a closed file.
        if seg_data:
            s_csv_w.writerows(seg_data.csv_rows())
        if i < len(phrases):
            p.set_postfix(phrase=phrases[i])
            p.update()
            synthQ.queueTask(phrases[i])
            i += 1
        else:
            p.close()
            f.close()
            # delegate is a method under PyObjC and has to be called.
            dg = NSApplication.sharedApplication().delegate()
            print('App terminated.')
            NSApp().terminate_(dg)

    synthQ.didComplete = nextTask
    nextTask()
 def main():
-    speech_delg = SpeechDelegate.alloc().init()
+
    speech_delg.speechSynthesizer_didFinishSpeaking_('t',True)
    voices = NSSpeechSynthesizer.availableVoices()
    identifier = voices[2]
    time()
    alex_voice = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
    alex_voice.setDelegate_(speech_delg)
    alex_voice.startSpeakingString_("This is a test for speech synthesis generation")
    # Create a new application instance ...
    a = NSApplication.sharedApplication()
    # ... and create its delgate.  Note the use of the
@@ -69,5 +222,6 @@ def main():
    AppHelper.runEventLoop()
 if __name__ == '__main__':
    main()