Compare commits
No commits in common. "ae46578aec400befb2b06437c148646bd5678740" and "1f60183ab8e653a3fda109e2e778c0ed8bc3f2d1" have entirely different histories.
ae46578aec
...
1f60183ab8
|
|
@ -41,7 +41,6 @@ partd==0.3.8
|
||||||
pexpect==4.2.1
|
pexpect==4.2.1
|
||||||
pickleshare==0.7.4
|
pickleshare==0.7.4
|
||||||
pkg-resources==0.0.0
|
pkg-resources==0.0.0
|
||||||
praat-parselmouth==0.2.0
|
|
||||||
progressbar2==3.34.3
|
progressbar2==3.34.3
|
||||||
prompt-toolkit==1.0.15
|
prompt-toolkit==1.0.15
|
||||||
protobuf==3.4.0
|
protobuf==3.4.0
|
||||||
|
|
|
||||||
|
|
@ -254,7 +254,7 @@ if __name__ == '__main__':
|
||||||
# create_spectrogram_tfrecords('story_all',sample_count=25)
|
# create_spectrogram_tfrecords('story_all',sample_count=25)
|
||||||
# fix_csv('story_words_test')
|
# fix_csv('story_words_test')
|
||||||
#fix_csv('story_phrases')
|
#fix_csv('story_phrases')
|
||||||
create_spectrogram_tfrecords('story_phrases',sample_count=500,train_test_ratio=0.1)
|
create_spectrogram_tfrecords('story_phrases',sample_count=100,train_test_ratio=0.1)
|
||||||
# create_spectrogram_tfrecords('audio',sample_count=50)
|
# create_spectrogram_tfrecords('audio',sample_count=50)
|
||||||
# read_siamese_tfrecords_generator('audio')
|
# read_siamese_tfrecords_generator('audio')
|
||||||
# padd_zeros_siamese_tfrecords('audio')
|
# padd_zeros_siamese_tfrecords('audio')
|
||||||
|
|
|
||||||
|
|
@ -18,9 +18,9 @@ def create_base_rnn_network(input_dim):
|
||||||
inp = Input(shape=input_dim)
|
inp = Input(shape=input_dim)
|
||||||
# ls0 = LSTM(512, return_sequences=True)(inp)
|
# ls0 = LSTM(512, return_sequences=True)(inp)
|
||||||
ls1 = Bidirectional(LSTM(128, return_sequences=True))(inp)
|
ls1 = Bidirectional(LSTM(128, return_sequences=True))(inp)
|
||||||
#ls2 = LSTM(128, return_sequences=True)(ls1)
|
ls2 = LSTM(128, return_sequences=True)(ls1)
|
||||||
# ls3 = LSTM(32, return_sequences=True)(ls2)
|
# ls3 = LSTM(32, return_sequences=True)(ls2)
|
||||||
ls4 = LSTM(64)(ls1)
|
ls4 = LSTM(64)(ls2)
|
||||||
# d1 = Dense(128, activation='relu')(ls4)
|
# d1 = Dense(128, activation='relu')(ls4)
|
||||||
#d2 = Dense(64, activation='relu')(ls2)
|
#d2 = Dense(64, activation='relu')(ls2)
|
||||||
return Model(inp, ls4)
|
return Model(inp, ls4)
|
||||||
|
|
@ -75,7 +75,7 @@ def train_siamese(audio_group = 'audio'):
|
||||||
log_dir = './logs/'+audio_group
|
log_dir = './logs/'+audio_group
|
||||||
create_dir(log_dir)
|
create_dir(log_dir)
|
||||||
tr_gen_fn,te_pairs,te_y,copy_read_consts = read_siamese_tfrecords_generator(audio_group,batch_size=batch_size,test_size=batch_size)
|
tr_gen_fn,te_pairs,te_y,copy_read_consts = read_siamese_tfrecords_generator(audio_group,batch_size=batch_size,test_size=batch_size)
|
||||||
n_step,n_features,n_records = copy_read_consts(model_dir)
|
n_step,n_features,n_records = copy_read_consts()
|
||||||
tr_gen = tr_gen_fn()
|
tr_gen = tr_gen_fn()
|
||||||
input_dim = (n_step, n_features)
|
input_dim = (n_step, n_features)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,141 +0,0 @@
|
||||||
import pandas as pd
|
|
||||||
import pronouncing
|
|
||||||
import re
|
|
||||||
import numpy as np
|
|
||||||
import random
|
|
||||||
|
|
||||||
# mapping = {
|
|
||||||
# s.split()[0]: s.split()[1]
|
|
||||||
# for s in """
|
|
||||||
# AA AA
|
|
||||||
# AE AE
|
|
||||||
# AH UX
|
|
||||||
# AO AO
|
|
||||||
# AW AW
|
|
||||||
# AY AY
|
|
||||||
# B b
|
|
||||||
# CH C
|
|
||||||
# D d
|
|
||||||
# DH D
|
|
||||||
# EH EH
|
|
||||||
# ER UXr
|
|
||||||
# EY EY
|
|
||||||
# F f
|
|
||||||
# G g
|
|
||||||
# HH h
|
|
||||||
# IH IH
|
|
||||||
# IY IY
|
|
||||||
# JH J
|
|
||||||
# K k
|
|
||||||
# L l
|
|
||||||
# M m
|
|
||||||
# N n
|
|
||||||
# NG N
|
|
||||||
# OW OW
|
|
||||||
# OY OY
|
|
||||||
# P p
|
|
||||||
# R r
|
|
||||||
# S s
|
|
||||||
# SH S
|
|
||||||
# T t
|
|
||||||
# TH T
|
|
||||||
# UH UH
|
|
||||||
# UW UW
|
|
||||||
# V v
|
|
||||||
# W w
|
|
||||||
# Y y
|
|
||||||
# Z z
|
|
||||||
# ZH Z
|
|
||||||
# """.strip().split('\n')
|
|
||||||
# }
|
|
||||||
|
|
||||||
# sim_mat = pd.read_csv('./similarity.csv', header=0, index_col=0)
|
|
||||||
#
|
|
||||||
#
|
|
||||||
# def convert_ph(ph):
|
|
||||||
# stress_level = re.search("(\w+)([0-9])", ph)
|
|
||||||
# if stress_level:
|
|
||||||
# return stress_level.group(2) + mapping[stress_level.group(1)]
|
|
||||||
# else:
|
|
||||||
# return mapping[ph]
|
|
||||||
#
|
|
||||||
#
|
|
||||||
# def sim_mat_to_apple_table(smt):
|
|
||||||
# colnames = [convert_ph(ph) for ph in smt.index.tolist()]
|
|
||||||
# smt = pd.DataFrame(np.nan_to_num(smt.values))
|
|
||||||
# fsmt = (smt.T + smt)
|
|
||||||
# np.fill_diagonal(fsmt.values, 100.0)
|
|
||||||
# asmt = pd.DataFrame.copy(fsmt)
|
|
||||||
# asmt.columns = colnames
|
|
||||||
# asmt.index = colnames
|
|
||||||
# apple_sim_table = asmt.stack().reset_index()
|
|
||||||
# apple_sim_table.columns = ['q', 'r', 's']
|
|
||||||
# return apple_sim_table
|
|
||||||
#
|
|
||||||
#
|
|
||||||
# apple_sim_table = sim_mat_to_apple_table(sim_mat)
|
|
||||||
#
|
|
||||||
#
|
|
||||||
# def top_match(ph):
|
|
||||||
# selected = apple_sim_table[(apple_sim_table.q == ph)
|
|
||||||
# & (apple_sim_table.s < 100) &
|
|
||||||
# (apple_sim_table.s >= 70)]
|
|
||||||
# tm = ph
|
|
||||||
# if len(selected) > 0:
|
|
||||||
# tm = pd.DataFrame.sort_values(selected, 's', ascending=False).iloc[0].r
|
|
||||||
# return tm
|
|
||||||
|
|
||||||
|
|
||||||
apple_phonemes = [
|
|
||||||
'%', '@', 'AE', 'EY', 'AO', 'AX', 'IY', 'EH', 'IH', 'AY', 'IX', 'AA', 'UW',
|
|
||||||
'UH', 'UX', 'OW', 'AW', 'OY', 'b', 'C', 'd', 'D', 'f', 'g', 'h', 'J', 'k',
|
|
||||||
'l', 'm', 'n', 'N', 'p', 'r', 's', 'S', 't', 'T', 'v', 'w', 'y', 'z', 'Z'
|
|
||||||
]
|
|
||||||
|
|
||||||
class ApplePhoneme(object):
|
|
||||||
"""docstring for ApplePhoneme."""
|
|
||||||
|
|
||||||
def __init__(self, phone, stress, vowel=False):
|
|
||||||
super(ApplePhoneme, self).__init__()
|
|
||||||
self.phone = phone
|
|
||||||
self.stress = stress
|
|
||||||
self.vowel = vowel
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
return (str(self.stress) if (self.vowel and self.stress>0) else '') + self.phone
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return "'{}'".format(str(self))
|
|
||||||
|
|
||||||
def adjust_stress(self):
|
|
||||||
self.stress = random.choice([i for i in range(3) if i != self.stress])
|
|
||||||
|
|
||||||
|
|
||||||
def parse_apple_phonemes(ph_str):
|
|
||||||
for i in range(len(ph_str)):
|
|
||||||
pref, rest = ph_str[:i + 1], ph_str[i + 1:]
|
|
||||||
if pref in apple_phonemes:
|
|
||||||
vowel = pref[0] in 'AEIOU'
|
|
||||||
return [ApplePhoneme(pref, 0, vowel)] + parse_apple_phonemes(rest)
|
|
||||||
elif pref[0].isdigit() and pref[1:] in apple_phonemes:
|
|
||||||
return [ApplePhoneme(pref[1:], int(pref[0]) , True)] + parse_apple_phonemes(rest)
|
|
||||||
elif not pref.isalnum():
|
|
||||||
return [ApplePhoneme(pref, 0, False)] + parse_apple_phonemes(rest)
|
|
||||||
return []
|
|
||||||
|
|
||||||
|
|
||||||
def similar_phoneme_word(ph_str):
|
|
||||||
phons = parse_apple_phonemes(ph_str)
|
|
||||||
vowels = [i for i in phons if i.vowel]
|
|
||||||
random.choice(vowels).adjust_stress()
|
|
||||||
return ''.join([str(i) for i in phons])
|
|
||||||
|
|
||||||
def similar_phoneme_phrase(ph_str):
|
|
||||||
return ' '.join([similar_phoneme_word(w) for w in ph_str.split()])
|
|
||||||
|
|
||||||
def similar_word(word_str):
|
|
||||||
similar = pronouncing.rhymes(word_str)
|
|
||||||
return random.choice(similar) if len(similar) > 0 else word_str
|
|
||||||
|
|
||||||
def similar_phrase(ph_str):
|
|
||||||
return ' '.join([similar_word(w) for w in ph_str.split()])
|
|
||||||
|
|
@ -1,73 +0,0 @@
|
||||||
import objc
|
|
||||||
from AppKit import *
|
|
||||||
from Foundation import NSURL
|
|
||||||
from PyObjCTools import AppHelper
|
|
||||||
from time import time
|
|
||||||
|
|
||||||
apple_phonemes = [
|
|
||||||
'%', '@', 'AE', 'EY', 'AO', 'AX', 'IY', 'EH', 'IH', 'AY', 'IX', 'AA', 'UW',
|
|
||||||
'UH', 'UX', 'OW', 'AW', 'OY', 'b', 'C', 'd', 'D', 'f', 'g', 'h', 'J', 'k',
|
|
||||||
'l', 'm', 'n', 'N', 'p', 'r', 's', 'S', 't', 'T', 'v', 'w', 'y', 'z', 'Z'
|
|
||||||
]
|
|
||||||
len(apple_phonemes)
|
|
||||||
|
|
||||||
speech_phoneme_data = []
|
|
||||||
|
|
||||||
class SpeechDelegate (NSObject):
|
|
||||||
def speechSynthesizer_willSpeakWord_ofString_(self, sender, word, text):
|
|
||||||
'''Called automatically when the application has launched'''
|
|
||||||
print("Speaking word {} in sentence {}".format(word,text))
|
|
||||||
|
|
||||||
def speechSynthesizer_willSpeakPhoneme_(self,sender,phoneme):
|
|
||||||
phon_ch = apple_phonemes[phoneme]
|
|
||||||
# print('first',speech_phoneme_data)
|
|
||||||
# prev_time = speech_phoneme_data[-1][1]
|
|
||||||
# print('prev_time',prev_time)
|
|
||||||
speech_phoneme_data.append((phon_ch,time()))
|
|
||||||
print("phoneme boundary for {} time {}".format(phon_ch,time()))
|
|
||||||
# NSApp().terminate_(self)
|
|
||||||
|
|
||||||
def speechSynthesizer_didFinishSpeaking_(self,synth,didFinishSpeaking):
|
|
||||||
speech_phoneme_data.append(('%',time()))
|
|
||||||
print("finished speaking time {}".format(time()))
|
|
||||||
diff_time = []
|
|
||||||
for i in range(len(speech_phoneme_data)-1):
|
|
||||||
dur = speech_phoneme_data[i+1][1] - speech_phoneme_data[i][1]
|
|
||||||
diff_time.append((speech_phoneme_data[i][0],dur))
|
|
||||||
print(diff_time)
|
|
||||||
|
|
||||||
# del SpeechDelegate
|
|
||||||
class Delegate (NSObject):
|
|
||||||
def applicationDidFinishLaunching_(self, aNotification):
|
|
||||||
'''Called automatically when the application has launched'''
|
|
||||||
print("Window, World!")
|
|
||||||
|
|
||||||
def windowWillClose_(self, aNotification):
|
|
||||||
'''Called automatically when the window is closed'''
|
|
||||||
print("Window has been closed")
|
|
||||||
# Terminate the application
|
|
||||||
NSApp().terminate_(self)
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
speech_delg = SpeechDelegate.alloc().init()
|
|
||||||
speech_delg.speechSynthesizer_didFinishSpeaking_('t',True)
|
|
||||||
voices = NSSpeechSynthesizer.availableVoices()
|
|
||||||
identifier = voices[2]
|
|
||||||
time()
|
|
||||||
alex_voice = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
|
|
||||||
alex_voice.setDelegate_(speech_delg)
|
|
||||||
alex_voice.startSpeakingString_("This is a test for speech synthesis generation")
|
|
||||||
# Create a new application instance ...
|
|
||||||
a=NSApplication.sharedApplication()
|
|
||||||
# ... and create its delgate. Note the use of the
|
|
||||||
# Objective C constructors below, because Delegate
|
|
||||||
# is a subcalss of an Objective C class, NSObject
|
|
||||||
delegate = Delegate.alloc().init()
|
|
||||||
# Tell the application which delegate object to use.
|
|
||||||
a.setDelegate_(delegate)
|
|
||||||
|
|
||||||
AppHelper.runEventLoop()
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
||||||
Loading…
Reference in New Issue