From 3d7542271d94b86e58994f5e4b6f51f0ae180506 Mon Sep 17 00:00:00 2001
From: Malar Kannan <malar@avaz.in>
Date: Thu, 23 Nov 2017 17:50:11 +0530
Subject: [PATCH] implemented tts segmentation data generation

---
 speech_similar.py   | 141 ++++++++++++++++++++++++++++++++++++++++++++
 speech_tts_queue.py |  73 +++++++++++++++++++++++
 2 files changed, 214 insertions(+)
 create mode 100644 speech_similar.py
 create mode 100644 speech_tts_queue.py

diff --git a/speech_similar.py b/speech_similar.py
new file mode 100644
index 0000000..e2d2772
--- /dev/null
+++ b/speech_similar.py
@@ -0,0 +1,141 @@
+import pandas as pd
+import pronouncing
+import re
+import numpy as np
+import random
+
+# mapping = {
+#     s.split()[0]: s.split()[1]
+#     for s in """
+# AA AA
+# AE AE
+# AH UX
+# AO AO
+# AW AW
+# AY AY
+# B  b
+# CH C
+# D  d
+# DH D
+# EH EH
+# ER UXr
+# EY EY
+# F  f
+# G  g
+# HH h
+# IH IH
+# IY IY
+# JH J
+# K  k
+# L  l
+# M  m
+# N  n
+# NG N
+# OW OW
+# OY OY
+# P  p
+# R  r
+# S  s
+# SH S
+# T  t
+# TH T
+# UH UH
+# UW UW
+# V  v
+# W  w
+# Y  y
+# Z  z
+# ZH Z
+# """.strip().split('\n')
+# }
+
+# sim_mat = pd.read_csv('./similarity.csv', header=0, index_col=0)
+#
+#
+# def convert_ph(ph):
+#     stress_level = re.search("(\w+)([0-9])", ph)
+#     if stress_level:
+#         return stress_level.group(2) + mapping[stress_level.group(1)]
+#     else:
+#         return mapping[ph]
+#
+#
+# def sim_mat_to_apple_table(smt):
+#     colnames = [convert_ph(ph) for ph in smt.index.tolist()]
+#     smt = pd.DataFrame(np.nan_to_num(smt.values))
+#     fsmt = (smt.T + smt)
+#     np.fill_diagonal(fsmt.values, 100.0)
+#     asmt = pd.DataFrame.copy(fsmt)
+#     asmt.columns = colnames
+#     asmt.index = colnames
+#     apple_sim_table = asmt.stack().reset_index()
+#     apple_sim_table.columns = ['q', 'r', 's']
+#     return apple_sim_table
+#
+#
+# apple_sim_table = sim_mat_to_apple_table(sim_mat)
+#
+#
+# def top_match(ph):
+#     selected = apple_sim_table[(apple_sim_table.q == ph)
+#                                & (apple_sim_table.s < 100) &
+#                                (apple_sim_table.s >= 70)]
+#     tm = ph
+#     if len(selected) > 0:
+#         tm = pd.DataFrame.sort_values(selected, 's', ascending=False).iloc[0].r
+#     return tm
+
+
+apple_phonemes = [  # symbols that parse_apple_phonemes accepts as one phoneme
+    '%', '@', 'AE', 'EY', 'AO', 'AX', 'IY', 'EH', 'IH', 'AY', 'IX', 'AA', 'UW',
+    'UH', 'UX', 'OW', 'AW', 'OY', 'b', 'C', 'd', 'D', 'f', 'g', 'h', 'J', 'k',
+    'l', 'm', 'n', 'N', 'p', 'r', 's', 'S', 't', 'T', 'v', 'w', 'y', 'z', 'Z'
+]
+
+class ApplePhoneme(object):
+    """An Apple TTS phoneme symbol with its stress level and a vowel flag."""
+
+    def __init__(self, phone, stress, vowel=False):
+        super(ApplePhoneme, self).__init__()
+        self.phone, self.stress, self.vowel = phone, stress, vowel
+
+    def __str__(self):  # the stress digit is shown only for vowels with stress > 0
+        shown = self.vowel and self.stress > 0
+        return (str(self.stress) if shown else '') + self.phone
+
+    def __repr__(self):  # the str() form wrapped in single quotes
+        return "'{}'".format(self)
+
+    def adjust_stress(self):
+        """Replace the stress with a random different level out of 0, 1, 2."""
+        others = [level for level in (0, 1, 2) if level != self.stress]
+        self.stress = random.choice(others)
+
+def parse_apple_phonemes(ph_str):
+    """Split ph_str into ApplePhoneme objects; returns [] once nothing matches."""
+    for i in range(len(ph_str)):  # try prefixes of growing length
+        pref, rest = ph_str[:i + 1], ph_str[i + 1:]
+        if pref in apple_phonemes:  # bare symbol: stress 0
+            vowel = pref[0] in 'AEIOU'  # vowel flag is taken from the first letter
+            return [ApplePhoneme(pref, 0, vowel)] + parse_apple_phonemes(rest)
+        elif pref[0].isdigit() and pref[1:] in apple_phonemes:  # digit + symbol
+            return [ApplePhoneme(pref[1:], int(pref[0]) , True)] + parse_apple_phonemes(rest)
+        elif not pref.isalnum():  # non-alphanumeric prefix: kept as a non-vowel
+            return [ApplePhoneme(pref, 0, False)] + parse_apple_phonemes(rest)
+    return []  # empty input, or no prefix matched (the remainder is dropped)
+
+def similar_phoneme_word(ph_str):  # re-stress one random vowel of a phoneme word
+    phons = parse_apple_phonemes(ph_str)
+    if any(p.vowel for p in phons):  # no vowel: nothing to re-stress, no IndexError
+        random.choice([p for p in phons if p.vowel]).adjust_stress()
+    return ''.join(str(p) for p in phons)
+
+def similar_phoneme_phrase(ph_str):  # similar_phoneme_word on each word, re-joined
+    return ' '.join(map(similar_phoneme_word, ph_str.split()))
+
+def similar_word(word_str):  # a random rhyme of word_str, or word_str if none
+    rhymes = pronouncing.rhymes(word_str)
+    return random.choice(rhymes) if rhymes else word_str
+
+def similar_phrase(ph_str):  # similar_word on each whitespace-separated word
+    return ' '.join(map(similar_word, ph_str.split()))
diff --git a/speech_tts_queue.py b/speech_tts_queue.py
new file mode 100644
index 0000000..ecb8a8e
--- /dev/null
+++ b/speech_tts_queue.py
@@ -0,0 +1,73 @@
+import objc
+from AppKit import *
+from Foundation import NSURL
+from PyObjCTools import AppHelper
+from time import time
+
+apple_phonemes = [
+    '%', '@', 'AE', 'EY', 'AO', 'AX', 'IY', 'EH', 'IH', 'AY', 'IX', 'AA', 'UW',
+    'UH', 'UX', 'OW', 'AW', 'OY', 'b', 'C', 'd', 'D', 'f', 'g', 'h', 'J', 'k',
+    'l', 'm', 'n', 'N', 'p', 'r', 's', 'S', 't', 'T', 'v', 'w', 'y', 'z', 'Z'
+]
+# speechSynthesizer_willSpeakPhoneme_ indexes this list with its phoneme opcode.
+
+speech_phoneme_data = []  # (phoneme symbol, time()) tuples, in callback order
+
+class SpeechDelegate (NSObject):
+    def speechSynthesizer_willSpeakWord_ofString_(self, sender, word, text):
+        '''Log the word the synthesizer is about to speak'''
+        print("Speaking word {} in sentence {}".format(word,text))
+
+    def speechSynthesizer_willSpeakPhoneme_(self,sender,phoneme):
+        '''Record the symbol for the phoneme opcode together with the callback time'''
+        # NOTE(review): assumes 0 <= phoneme < len(apple_phonemes) - confirm
+        now = time()  # sampled once so the stored and the logged time agree
+        phon_ch = apple_phonemes[phoneme]
+        speech_phoneme_data.append((phon_ch,now))
+        print("phoneme boundary for {} time {}".format(phon_ch,now))
+
+    def speechSynthesizer_didFinishSpeaking_(self,synth,didFinishSpeaking):
+        '''Append a closing "%" entry and print the duration of every earlier entry'''
+        now = time()  # sampled once so the stored and the logged time agree
+        speech_phoneme_data.append(('%',now))
+        print("finished speaking time {}".format(now))
+        # duration of an entry = timestamp of the next entry - its own timestamp
+        diff_time = [(sym, nxt - start) for (sym, start), (_, nxt)
+                     in zip(speech_phoneme_data, speech_phoneme_data[1:])]
+        print(diff_time)
+
+# Application delegate; nothing in this file attaches it to a window.
+class Delegate (NSObject):
+    def applicationDidFinishLaunching_(self, aNotification):
+        '''Called automatically when the application has launched'''
+        print("Window, World!")
+
+    def windowWillClose_(self, aNotification):
+        '''Called automatically when the window is closed'''
+        print("Window has been closed")
+        # Terminate the application
+        NSApp().terminate_(self)
+
+
+def main():
+    speech_delg = SpeechDelegate.alloc().init()
+    # NOTE(review): direct call before any speech; it appends a '%' entry
+    # stamped with the current time to speech_phoneme_data - confirm intent
+    speech_delg.speechSynthesizer_didFinishSpeaking_('t',True)
+    voices = NSSpeechSynthesizer.availableVoices()
+    # None selects the default system voice when fewer than three are installed
+    identifier = voices[2] if len(voices) > 2 else None
+    alex_voice = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
+    alex_voice.setDelegate_(speech_delg)
+    alex_voice.startSpeakingString_("This is a test for speech synthesis generation")
+    # Create the shared application instance and its delegate. Objective-C
+    # constructors are used because Delegate is a subclass of NSObject.
+    app = NSApplication.sharedApplication()
+    delegate = Delegate.alloc().init()
+    # Tell the application which delegate object to use.
+    app.setDelegate_(delegate)
+
+    AppHelper.runEventLoop()
+
+if __name__ == '__main__':  # start the demo only when run as a script
+    main()