From 3d7542271d94b86e58994f5e4b6f51f0ae180506 Mon Sep 17 00:00:00 2001 From: Malar Kannan Date: Thu, 23 Nov 2017 17:50:11 +0530 Subject: [PATCH] implemented tts segmentation data generation --- speech_similar.py | 141 ++++++++++++++++++++++++++++++++++++++++++++ speech_tts_queue.py | 73 +++++++++++++++++++++++ 2 files changed, 214 insertions(+) create mode 100644 speech_similar.py create mode 100644 speech_tts_queue.py diff --git a/speech_similar.py b/speech_similar.py new file mode 100644 index 0000000..e2d2772 --- /dev/null +++ b/speech_similar.py @@ -0,0 +1,141 @@ +import pandas as pd +import pronouncing +import re +import numpy as np +import random + +# mapping = { +# s.split()[0]: s.split()[1] +# for s in """ +# AA AA +# AE AE +# AH UX +# AO AO +# AW AW +# AY AY +# B b +# CH C +# D d +# DH D +# EH EH +# ER UXr +# EY EY +# F f +# G g +# HH h +# IH IH +# IY IY +# JH J +# K k +# L l +# M m +# N n +# NG N +# OW OW +# OY OY +# P p +# R r +# S s +# SH S +# T t +# TH T +# UH UH +# UW UW +# V v +# W w +# Y y +# Z z +# ZH Z +# """.strip().split('\n') +# } + +# sim_mat = pd.read_csv('./similarity.csv', header=0, index_col=0) +# +# +# def convert_ph(ph): +# stress_level = re.search("(\w+)([0-9])", ph) +# if stress_level: +# return stress_level.group(2) + mapping[stress_level.group(1)] +# else: +# return mapping[ph] +# +# +# def sim_mat_to_apple_table(smt): +# colnames = [convert_ph(ph) for ph in smt.index.tolist()] +# smt = pd.DataFrame(np.nan_to_num(smt.values)) +# fsmt = (smt.T + smt) +# np.fill_diagonal(fsmt.values, 100.0) +# asmt = pd.DataFrame.copy(fsmt) +# asmt.columns = colnames +# asmt.index = colnames +# apple_sim_table = asmt.stack().reset_index() +# apple_sim_table.columns = ['q', 'r', 's'] +# return apple_sim_table +# +# +# apple_sim_table = sim_mat_to_apple_table(sim_mat) +# +# +# def top_match(ph): +# selected = apple_sim_table[(apple_sim_table.q == ph) +# & (apple_sim_table.s < 100) & +# (apple_sim_table.s >= 70)] +# tm = ph +# if len(selected) > 0: +# tm 
#         = pd.DataFrame.sort_values(selected, 's', ascending=False).iloc[0].r
#     return tm


# Phoneme symbols recognised by the parser below (Apple TTS phoneme notation).
apple_phonemes = [
    '%', '@', 'AE', 'EY', 'AO', 'AX', 'IY', 'EH', 'IH', 'AY', 'IX', 'AA', 'UW',
    'UH', 'UX', 'OW', 'AW', 'OY', 'b', 'C', 'd', 'D', 'f', 'g', 'h', 'J', 'k',
    'l', 'm', 'n', 'N', 'p', 'r', 's', 'S', 't', 'T', 'v', 'w', 'y', 'z', 'Z'
]

# Set view of the inventory so membership tests while parsing are O(1).
_APPLE_PHONEME_SET = frozenset(apple_phonemes)


class ApplePhoneme(object):
    """One parsed phoneme token.

    Attributes:
        phone: the phoneme symbol (or a raw non-alphanumeric chunk).
        stress: integer stress level; 0 means "no stress digit is rendered".
        vowel: True when the token is treated as a vowel, i.e. it may carry
            a leading stress digit when rendered.
    """

    def __init__(self, phone, stress, vowel=False):
        super(ApplePhoneme, self).__init__()
        self.phone = phone
        self.stress = stress
        self.vowel = vowel

    def __str__(self):
        # Only vowels with a non-zero stress are rendered with their digit.
        if self.vowel and self.stress > 0:
            return str(self.stress) + self.phone
        return self.phone

    def __repr__(self):
        return "'{}'".format(str(self))

    def adjust_stress(self):
        """Replace the stress with a random level from 0..2 that differs
        from the current one."""
        self.stress = random.choice(
            [level for level in range(3) if level != self.stress])


def _next_phoneme(ph_str, start):
    """Return ``(ApplePhoneme, end)`` for the shortest prefix of
    ``ph_str[start:]`` that forms a token, or ``None`` if no prefix does."""
    for end in range(start + 1, len(ph_str) + 1):
        pref = ph_str[start:end]
        if pref in _APPLE_PHONEME_SET:
            # Every vowel symbol in the inventory starts with an upper-case
            # A/E/I/O/U; no consonant symbol does.
            return ApplePhoneme(pref, 0, pref[0] in 'AEIOU'), end
        if pref[0].isdigit() and pref[1:] in _APPLE_PHONEME_SET:
            # Leading digit is a stress marker attached to the phoneme.
            return ApplePhoneme(pref[1:], int(pref[0]), True), end
        if not pref.isalnum():
            # Punctuation (possibly with unmatched leading characters) is
            # passed through verbatim as a non-vowel token.
            return ApplePhoneme(pref, 0, False), end
    return None


def parse_apple_phonemes(ph_str):
    """Split a phoneme string into a list of ``ApplePhoneme`` tokens.

    Tokens are matched left to right, always taking the shortest prefix that
    is a known phoneme, a digit followed by a known phoneme, or a chunk
    containing a non-alphanumeric character.  Parsing stops at the first
    position where nothing matches; the tokens found so far are returned and
    the unparseable tail is dropped.  An empty string yields ``[]``.

    Implemented iteratively so long inputs neither hit the recursion limit
    nor pay for repeated list concatenation.
    """
    phons = []
    pos = 0
    while pos < len(ph_str):
        found = _next_phoneme(ph_str, pos)
        if found is None:
            break
        phon, pos = found
        phons.append(phon)
    return phons


def similar_phoneme_word(ph_str):
    """Return ``ph_str`` re-rendered with the stress of one randomly chosen
    vowel changed.

    Words without any vowel token (punctuation, consonant-only words) are
    re-rendered unchanged instead of raising ``IndexError`` from
    ``random.choice`` on an empty list.
    """
    phons = parse_apple_phonemes(ph_str)
    vowels = [phon for phon in phons if phon.vowel]
    if vowels:
        random.choice(vowels).adjust_stress()
    return ''.join(str(phon) for phon in phons)


def similar_phoneme_phrase(ph_str):
    """Apply ``similar_phoneme_word`` to every whitespace-separated word and
    re-join the results with single spaces."""
    return ' '.join(similar_phoneme_word(word) for word in ph_str.split())


def similar_word(word_str):
    """Return a random rhyme of ``word_str`` from ``pronouncing.rhymes``, or
    ``word_str`` itself when no rhyme is found."""
    similar = pronouncing.rhymes(word_str)
    return random.choice(similar) if similar else word_str


def similar_phrase(ph_str):
    """Apply ``similar_word`` to every whitespace-separated word and re-join
    the results with single spaces."""
    return ' '.join(similar_word(word) for word in ph_str.split())


# Patch metadata that followed this file in the original diff:
# diff --git a/speech_tts_queue.py b/speech_tts_queue.py
# new file mode 100644
# index 0000000..ecb8a8e
# --- /dev/null
# +++ b/speech_tts_queue.py
# speech_tts_queue.py -- new file added by this patch (hunk @@ -0,0 +1,73 @@)
import objc
from AppKit import *
from Foundation import NSURL
from PyObjCTools import AppHelper
from time import time

# Phoneme symbols; the phoneme callback below indexes this list with the
# opcode it receives.
# NOTE(review): assumes the synthesizer's phoneme opcode is a position in
# this list -- confirm against the voice's phoneme table.
apple_phonemes = [
    '%', '@', 'AE', 'EY', 'AO', 'AX', 'IY', 'EH', 'IH', 'AY', 'IX', 'AA', 'UW',
    'UH', 'UX', 'OW', 'AW', 'OY', 'b', 'C', 'd', 'D', 'f', 'g', 'h', 'J', 'k',
    'l', 'm', 'n', 'N', 'p', 'r', 's', 'S', 't', 'T', 'v', 'w', 'y', 'z', 'Z'
]

# Timeline of (phoneme symbol, time() timestamp) pairs, appended to by the
# SpeechDelegate callbacks in the order they fire.
speech_phoneme_data = []


def _phoneme_durations(events):
    """Return ``[(symbol, seconds_until_next_event), ...]`` for every event
    except the last one in ``events``."""
    return [(symbol, next_stamp - stamp)
            for (symbol, stamp), (_, next_stamp) in zip(events, events[1:])]


class SpeechDelegate (NSObject):
    """Speech-synthesizer delegate that records phoneme boundary times into
    the module-level ``speech_phoneme_data`` list."""

    def speechSynthesizer_willSpeakWord_ofString_(self, sender, word, text):
        '''Print the word argument and the text it belongs to.'''
        print("Speaking word {} in sentence {}".format(word,text))

    def speechSynthesizer_willSpeakPhoneme_(self,sender,phoneme):
        '''Record the phoneme about to be spoken together with the current
        time.'''
        # Take one timestamp so the stored and the printed value agree.
        now = time()
        if 0 <= phoneme < len(apple_phonemes):
            phon_ch = apple_phonemes[phoneme]
        else:
            # A negative opcode would silently wrap around and an oversized
            # one would raise inside the callback; record a visible
            # placeholder instead.
            phon_ch = '?{}'.format(phoneme)
        speech_phoneme_data.append((phon_ch,now))
        print("phoneme boundary for {} time {}".format(phon_ch,now))

    def speechSynthesizer_didFinishSpeaking_(self,synth,didFinishSpeaking):
        '''Append a closing '%' marker and print the time elapsed between
        consecutive recorded events.

        Nothing here stops the application; the event loop keeps running
        after the durations are printed.
        '''
        speech_phoneme_data.append(('%',time()))
        print("finished speaking time {}".format(time()))
        print(_phoneme_durations(speech_phoneme_data))


class Delegate (NSObject):
    """Application delegate: logs launch and terminates the application when
    a window closes."""

    def applicationDidFinishLaunching_(self, aNotification):
        '''Called automatically when the application has launched'''
        print("Window, World!")

    def windowWillClose_(self, aNotification):
        '''Called automatically when the window is closed'''
        print("Window has been closed")
        # Terminate the application
        NSApp().terminate_(self)


def main(text="This is a test for speech synthesis generation", voice_index=2):
    """Speak ``text`` with one of the installed voices and log phoneme timing.

    Args:
        text: the string handed to the synthesizer.
        voice_index: position in ``NSSpeechSynthesizer.availableVoices()`` of
            the voice to use.  An index past the end of that list raises
            ``IndexError``.

    Blocks in ``AppHelper.runEventLoop()``.
    """
    speech_delg = SpeechDelegate.alloc().init()
    # Calling the finish callback by hand appends an initial ('%', time())
    # entry, so the first phoneme has a preceding timestamp to be measured
    # against; it also prints an empty duration list.
    # NOTE(review): presumably intended as a start marker -- it is taken
    # before the synthesizer is created, so the first duration includes
    # that setup time.
    speech_delg.speechSynthesizer_didFinishSpeaking_('t',True)
    voices = NSSpeechSynthesizer.availableVoices()
    # NOTE(review): the original code named the resulting synthesizer
    # "alex_voice"; which voice sits at a given index presumably depends on
    # the voices installed -- confirm.
    identifier = voices[voice_index]
    synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
    synth.setDelegate_(speech_delg)
    synth.startSpeakingString_(text)
    # Create a new application instance ...
    app = NSApplication.sharedApplication()
    # ... and create its delegate. Note the use of the
    # Objective C constructors below, because Delegate
    # is a subclass of an Objective C class, NSObject
    delegate = Delegate.alloc().init()
    # Tell the application which delegate object to use.
    app.setDelegate_(delegate)

    AppHelper.runEventLoop()


if __name__ == '__main__':
    main()