Compare commits

...

124 Commits

Author SHA1 Message Date
Malar Kannan
225a720f18 updated README to include testing 2017-12-29 16:21:38 +05:30
Malar Kannan
b267b89a44 Merge branch 'master' of /home/ilml/Public/Repos/speech_scoring 2017-12-29 13:15:51 +05:30
Malar Kannan
eb10b577ae Added README.md describing the workflow 2017-12-29 13:14:37 +05:30
Malar Kannan
ee2eb63f66 Merge branch 'master' of ssh://invnuc/~/Public/Repos/speech_scoring 2017-12-28 20:02:44 +05:30
Malar Kannan
2ae269d939 generating test for phone seg model 2017-12-28 20:01:44 +05:30
Malar Kannan
40d7933870 saving model on better 'acc' 2017-12-28 20:00:19 +05:30
Malar Kannan
4dd4bb5963 implemented phoneme segmented training on samples 2017-12-28 18:53:54 +05:30
Malar Kannan
0600482fe5 generating segmentation for words 2017-12-28 13:37:27 +05:30
Malar Kannan
507da49cfa added voicerss tts support for test data generation 2017-12-26 14:32:56 +05:30
Malar Kannan
f44665e9b2 1. fixed softmax output and overfit the model for small sample
2. updated to run on complete data
2017-12-12 12:18:27 +05:30
Malar Kannan
cc4fbe45b9 trying to overfit 2 samples with model -> doesn't seem to converge 2017-12-11 15:03:14 +05:30
Malar Kannan
8d550c58cc fixed batch normalization layer before activation 2017-12-11 14:33:56 +05:30
Malar Kannan
240ecb3f27 removed bn output layer 2017-12-11 14:12:23 +05:30
Malar Kannan
05242d5991 added batch normalization 2017-12-11 14:09:04 +05:30
Malar Kannan
fea9184aec using the full data and fixed typo in model layer name 2017-12-11 13:47:30 +05:30
Malar Kannan
a6543491f8 fixed empty phoneme boundary case 2017-12-11 13:05:46 +05:30
Malar Kannan
d387922f7d added dense-relu/softmax layers to segment output 2017-12-11 12:30:08 +05:30
Malar Kannan
52bbb69c65 resuming segment training 2017-12-10 21:58:55 +05:30
Malar Kannan
03edd935ea fixed input_dim 2017-12-07 17:16:05 +05:30
Malar Kannan
a7f1451a7f fixed exception in data generation 2017-12-07 16:49:34 +05:30
Malar Kannan
91fde710f3 completed the segmentation model 2017-12-07 15:17:59 +05:30
Malar Kannan
c8a07b3d7b Merge branch 'master' of ssh://invnuc/~/Public/Repos/speech_scoring 2017-12-07 12:00:59 +05:30
Malar Kannan
8785522196 Merge branch 'master' of /home/ilml/Public/Repos/speech_scoring 2017-12-07 12:00:44 +05:30
Malar Kannan
435c4a4aa6 added a resume parameter for training 2017-12-07 12:00:42 +05:30
Malar Kannan
c1801b5aa3 implented segment tfrecords batch data-generator 2017-12-07 11:48:19 +05:30
Malar Kannan
c0369d7a66 Merge branch 'master' of ssh://gpuaws/~/repos/speech_scoring 2017-12-06 17:33:27 +05:30
Malar Kannan
8e14db2437 Merge branch 'master' of ssh://invmac/~/Public/repos/speech-scoring 2017-12-06 17:32:46 +05:30
Malar Kannan
bcf1041bde created segment sample tfrecord writer 2017-12-06 17:32:26 +05:30
Malar Kannan
b50edb980d implemented segment-generation for random words for testing 2017-12-06 14:41:25 +05:30
Malar Kannan
3f76207f0d using pitch contour instead of spectrogram 2017-12-04 19:15:17 +05:30
Malar Kannan
6ef4e86f41 implemented segmentation visualization 2017-11-30 14:49:55 +05:30
Malar Kannan
0b1152b5c3 implemented the model, todo implement ctc and training queueing logic 2017-11-28 19:10:19 +05:30
Malar Kannan
1928fce4e8 Merge branch 'master' of ssh://invnuc/~/Public/Repos/speech_scoring 2017-11-28 17:05:35 +05:30
Malar Kannan
ec7303223c merged 2017-11-28 17:05:20 +05:30
Malar Kannan
f12da988d3 segmentation model wip 2017-11-28 15:46:39 +05:30
Malar Kannan
705cf3d172 finding exact duration of sound sample 2017-11-28 12:52:20 +05:30
Malar Kannan
8f79316893 Merge branch 'master' of /Users/malarkannan/Public/repos/speech-scoring 2017-11-28 12:32:50 +05:30
Malar Kannan
0345cc46ae implemented tts sementation generation code 2017-11-28 12:32:45 +05:30
Malar Kannan
20b2d7a958 updated model data 2017-11-27 14:08:01 +05:30
Malar Kannan
43d5b75db9 removing spec_n counter 2017-11-24 11:06:42 +00:00
Malar Kannan
ec08cc7d62 Merge branch 'master' of ssh://gpuaws/~/repos/speech_scoring 2017-11-24 14:32:43 +05:30
Malar Kannan
2268ad8bb0 implemented pitch plotting 2017-11-24 14:32:13 +05:30
Malar Kannan
ec317b6628 Merge branch 'master' of /home/ilml/Public/Repos/speech_scoring 2017-11-24 14:26:40 +05:30
Malar Kannan
235300691e find spec_n from tfrecords 2017-11-24 14:26:36 +05:30
Malar Kannan
ae46578aec Merge branch 'master' of ssh://invmac/~/Public/repos/speech-scoring 2017-11-23 17:50:47 +05:30
Malar Kannan
3d7542271d implemented tts segmentation data generation 2017-11-23 17:50:11 +05:30
Malar Kannan
54f38ca775 removed a layer using lstm 2017-11-22 15:46:42 +05:30
Malar Kannan
6355db4af7 adding missing model-dir for training constants copying 2017-11-22 15:04:02 +05:30
Malar Kannan
1f60183ab8 Merge branch 'master' of ssh://invnuc/~/Public/Repos/speech_scoring 2017-11-22 14:45:35 +05:30
Malar Kannan
e7fc607578 trying mfcc instead of spectrogram 2017-11-22 14:45:08 +05:30
Malar Kannan
d2a075422c copying constantws to models 2017-11-20 15:15:27 +05:30
Malar Kannan
a5d4ede35d finding number of record by streaming-onepass 2017-11-20 12:07:13 +05:30
Malar Kannan
3ae8dc50a2 implemented pair data inspection 2017-11-17 17:29:48 +05:30
Ubuntu
c81a7b4468 decreasing first layer node count to avoid gpu memory overflow 2017-11-17 10:31:36 +00:00
Malar Kannan
c682962c8f using a Bi-LSTM layer as the first layer 2017-11-17 14:17:12 +05:30
Malar Kannan
6ff052be9b fixed randomize pair picking 2017-11-17 11:57:38 +05:30
Malar Kannan
7fc89c0853 1. fixed pairing and data duplicates
2. clean-up
2017-11-16 23:41:38 +05:30
Malar Kannan
3d297f176f perfect score on new test words - TODO evaluate on real voice 2017-11-16 14:19:25 +05:30
Malar Kannan
7d94ddc2ae all phrases 2017-11-15 18:30:49 +05:30
Malar Kannan
77c7adbdb5 Merge branch 'master' of ssh://invmac/~/Public/repos/speech-scoring 2017-11-15 18:29:23 +05:30
Malar Kannan
a67ce148d6 fixed dupliate words 2017-11-15 18:28:47 +05:30
Malar Kannan
c75ff4d109 failure visualization wip 2017-11-15 15:17:37 +05:30
Malar Kannan
a9b244a50c the pair generation order is randomized 2017-11-15 14:43:39 +05:30
Malar Kannan
1b0ba26a6e Merge branch 'master' of ssh://invnuc/~/Public/Repos/speech_scoring 2017-11-15 14:17:15 +05:30
Malar Kannan
e9f54c7f6f 1. tuned batchsize
2. fixed last batch carry-over
2017-11-15 14:14:17 +05:30
Malar Kannan
7684ab3a74 ported to tqdm 2017-11-14 22:59:51 +05:30
Malar Kannan
036667d1c7 Merge branch 'master' of https://code.whiteblossom.net/malar/speech-scoring 2017-11-14 21:59:15 +05:30
Malar Kannan
10b024866e implemented evaluation of test data with model by overfitting on smaller dataset 2017-11-14 17:54:44 +05:30
Malar Kannan
e4b8b4e0a7 visualizing and playing sound files where prediction fails 2017-11-13 19:22:30 +05:30
Malar Kannan
988f66c2c2 avoiding same voice similar variants 2017-11-13 17:33:37 +05:30
Malar Kannan
d978272bdb saving model and tensorboard
checkpointing model
2017-11-10 18:09:14 +05:30
Malar Kannan
bb72c4045e trying to overfit the model to identify false-negative types 2017-11-10 17:52:21 +05:30
Malar Kannan
1190312def removed tfrecord tensor code and remnants 2017-11-10 14:15:12 +05:30
Malar Kannan
e9b18921ee implemented train/test split at word-level and generator returns one-shot validation data 2017-11-10 14:07:31 +05:30
Malar Kannan
ab452494b3 implemented streaming tfreccords 2017-11-09 20:31:29 +05:30
Malar Kannan
0a4d4fadeb implemented random sampling of data for oneshot loading 2017-11-09 15:00:17 +05:30
Malar Kannan
b3a6aa2f6a clean-up 2017-11-08 11:08:19 +05:30
Malar Kannan
7cbfebbf1a 1. fixed missing wrong pairs
2.using different progress bakend
2017-11-07 17:27:09 +05:30
Malar Kannan
b8a9f87031 implemented padding and pipeline is complete 2017-11-07 15:18:04 +05:30
Malar Kannan
41b3f1a9fe dropping invalid csv entries 2017-11-07 12:43:17 +05:30
Malar Kannan
55e2de2f04 using csv writer instead as comma in phrases are mis-aligning columns 2017-11-07 11:56:09 +05:30
Malar Kannan
33c6bcc3c1 implemeted test data sample generation 2017-11-07 10:23:31 +05:30
Malar Kannan
15f29895d4 implemented tfrecord reader and model refactor wip 2017-11-07 00:10:23 +05:30
Malar Kannan
5b682c78b8 Merge branch 'master' of ssh://invmac/~/Public/repos/speech-scoring 2017-11-06 15:50:06 +05:30
Malar Kannan
046343680e implemented siamese pair tfrecord writer 2017-11-06 15:48:38 +05:30
Malar Kannan
c187fbe1ca implemented tfrecord writer for spectrograms 2017-11-06 14:12:09 +05:30
Malar Kannan
fabd882664 tfrecords wip 2017-11-06 12:36:20 +05:30
Malar Kannan
5ff437b095 computing spectrogram for existing files 2017-11-06 12:15:12 +05:30
Malar Kannan
4194e05b4c removing - from phrases before synthesizing audio 2017-11-03 15:30:13 +05:30
Malar Kannan
22d353f101 skipping missing files 2017-11-03 15:20:31 +05:30
Malar Kannan
1f19463b65 computing phoneme/word variant for each word in a phrase 2017-11-03 14:48:55 +05:30
Malar Kannan
b4ceeb4eed generating spectrogram parallelly 2017-11-03 14:19:19 +05:30
Malar Kannan
6ab84b4dc2 Merge branch 'master' of ssh://invmac/~/Public/repos/speech-scoring 2017-11-02 13:16:04 +05:30
Malar Kannan
d4454b6434 looping record test code 2017-11-02 13:14:59 +05:30
Malar Kannan
45977a819d generating randome samples 2017-11-02 13:14:08 +05:30
Malar Kannan
4188585488 updated test code 2017-10-31 17:41:02 +05:30
Malar Kannan
6fbf06814c updated model to use dense classifier 2017-10-31 13:31:31 +05:30
Malar Kannan
2d9b12af95 fixed out of range exception 2017-10-31 10:29:24 +05:30
Malar Kannan
80c0ce403e generating all words for a every voice first 2017-10-27 19:04:09 +05:30
Malar Kannan
cbf15ff662 type in fn name 2017-10-27 18:57:26 +05:30
Malar Kannan
307b4ce1c2 removed legacy similarity 2017-10-27 18:56:37 +05:30
Malar Kannan
938a9cf0a8 implemented tts gen variants 2017-10-27 18:53:22 +05:30
Malar Kannan
eb3ce8b7e5 wip high variant phoneme 2017-10-26 18:06:14 +05:30
Malar Kannan
e57576d6fa discarding phoneme incapable synthesizers 2017-10-26 16:51:32 +05:30
Malar Kannan
a953fa3355 fixed progress 2017-10-26 16:18:17 +05:30
Malar Kannan
7a520b79f4 writing to csv proactively 2017-10-26 15:58:25 +05:30
Malar Kannan
05f36daf7e refactored sample generation code 2017-10-26 15:27:22 +05:30
Malar Kannan
49e6a46efd code cleanup 2017-10-26 12:48:31 +05:30
Malar Kannan
5824158af2 1. fixed neg values in spectrogram
2. refactored get word spectrogram code
2017-10-25 16:52:45 +05:30
Malar Kannan
f1e82a2539 added code to record and generate spectrogram, wip test model 2017-10-25 15:38:03 +05:30
Malar Kannan
a8f17ef764 refactored spectrogram and implemented record and generate spectrogram 2017-10-25 13:37:17 +05:30
Malar Kannan
82d0398d2c formatted 2017-10-25 13:36:41 +05:30
Malar Kannan
e6f0c8b21b 1. clean-up code
2. implemented checkpoint model saving
2017-10-24 16:41:35 +05:30
Malar Kannan
77821093cb Merge remote-tracking branch 'locnuc/master' 2017-10-24 11:55:57 +05:30
Malar Kannan
47991cb7ab added audio recording snippet 2017-10-24 11:55:27 +05:30
Malar Kannan
71c49320c1 added changes 2017-10-24 11:55:04 +05:30
Malar Kannan
8be8fa2595 saving models and logs 2017-10-24 11:49:47 +05:30
Malar Kannan
03d49d83e7 updated learning rate 2017-10-23 20:21:44 +05:30
Malar Kannan
6f3bca61cf 1. fixed dimension issue in data
2. experimenting with different base network
2017-10-23 19:00:27 +05:30
Malar Kannan
e865f17a0d pickling intermediate data to save memory usage 2017-10-20 12:52:11 +05:30
Malar Kannan
b3755ad80e updated tested pickling 2017-10-17 19:17:44 +05:30
Malar Kannan
88edcdd239 seprated spectrogram generation code 2017-10-17 19:11:04 +05:30
Malar Kannan
51a6d6e804 added who data method 2017-10-17 19:04:07 +05:30
Malar Kannan
8ae5104201 added spectrogram to model data code and implemented simple rnn model 2017-10-17 18:56:42 +05:30
27 changed files with 2700 additions and 622 deletions

8
.gitignore vendored
View File

@@ -138,4 +138,10 @@ Temporary Items
# End of https://www.gitignore.io/api/macos # End of https://www.gitignore.io/api/macos
outputs/* outputs/*
inputs/mnist inputs/*
inputs/audio*
logs/*
models/*
*.pkl
temp/*
trained/*

2
CLI.md Normal file
View File

@@ -0,0 +1,2 @@
### Convert audio files
$ `for f in *.mp3; do ffmpeg -i "$f" "${f%.mp3}.aiff"; done`

23
README.md Normal file
View File

@@ -0,0 +1,23 @@
### Setup
`. env/bin/activate` to activate the virtualenv.
### Data Generation
* update `OUTPUT_NAME` in *speech_samplegen.py* to create the dataset folder with the name
* `python speech_samplegen.py` generates variants of audio samples
### Data Preprocessing
* `python speech_data.py` creates the training-testing data from the generated samples.
* run `fix_csv(OUTPUT_NAME)` once to create the fixed index of the dataset generated
* run `generate_sppas_trans(OUTPUT_NAME)` once to create the SPPAS transcription(wav+txt) data
* run `$ (SPPAS_DIR)/bin/annotation.py -l eng -e csv --ipus --tok --phon --align -w ./outputs/OUTPUT_NAME/` once to create the phoneme alignment csv files for all variants.
* `create_seg_phonpair_tfrecords(OUTPUT_NAME)` creates the tfrecords files
with the phoneme level pairs of right/wrong stresses
### Training
* `python speech_model.py` trains the model with the training data generated.
* `train_siamese(OUTPUT_NAME)` trains the siamese model with the generated dataset.
### Testing
* `python speech_test.py` tests the trained model with the test dataset
* `evaluate_siamese(TEST_RECORD_FILE,audio_group=OUTPUT_NAME,weights = WEIGHTS_FILE_NAME)`
the TEST_RECORD_FILE will be under outputs directory and WEIGHTS_FILE_NAME will be under the models directory, pick the most recent weights file.

View File

@@ -2,3 +2,10 @@
1. create spectrograms of 150ms windows with 50ms overlap for each word. 1. create spectrograms of 150ms windows with 50ms overlap for each word.
2. train a rnn to output a vector using the spectrograms 2. train a rnn to output a vector using the spectrograms
3. train a nn to output True/False based on the acceptability of the rnn output. -> Siamese network(implementation detail) 3. train a nn to output True/False based on the acceptability of the rnn output. -> Siamese network(implementation detail)
4. validate with real world samples
The same word spoken by multiple people, etc., will have a low distance. Two words which are very different (you can use the similarity measure given in the speech_recognition repo) will have a high distance.
The one with a wrong pronunciation will have a medium distance from the one with the right pronunciation.
I also had good experience with getting non-English voices to speak out the English words to get a "wrong" pronunciation - so that will be subtly different too.

View File

@@ -1,55 +0,0 @@
#!/usr/bin/env python3
"""
Convert ARPABET <http://www.speech.cs.cmu.edu/cgi-bin/cmudict>
to Apple's codes <https://developer.apple.com/library/content/documentation/UserExperience/Conceptual/SpeechSynthesisProgrammingGuide/Phonemes/Phonemes.html>
"""
import sys
mapping = {s.split()[0]: s.split()[1] for s in """
AA AA
AE AE
AH UX
AO AO
AW AW
AY AY
B b
CH C
D d
DH D
EH EH
ER UXr
EY EY
F f
G g
HH h
IH IH
IY IY
JH J
K k
L l
M m
N n
NG N
OW OW
OY OY
P p
R r
S s
SH S
T t
TH T
UH UH
UW UW
V v
W w
Y y
Z z
ZH Z
""".strip().split('\n')}
arpabet_phonemes = sys.stdin.read().split()
apple_phonemes = [mapping[p.upper()] for p in arpabet_phonemes]
print('[[inpt PHON]] ' + ''.join(apple_phonemes))

View File

@@ -1,10 +0,0 @@
import pandas as pd
audio_file = pd.read_csv('./outputs/audio.csv',names=['word','voice','rate','type','filename'])
word_goups = audio_file.groupby('word')
# audio
lst = [1, 2, 3, 1, 2, 3]
s = pd.Series([1, 2, 3, 10, 20, 30], lst)
df3 = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})
s.groupby(level=0).sum()

View File

@@ -1,77 +1,141 @@
import pandas as pd import pandas as pd
import pronouncing
import re import re
mapping = {s.split()[0]: s.split()[1] for s in """ import numpy as np
AA AA import random
AE AE
AH UX
AO AO
AW AW
AY AY
B b
CH C
D d
DH D
EH EH
ER UXr
EY EY
F f
G g
HH h
IH IH
IY IY
JH J
K k
L l
M m
N n
NG N
OW OW
OY OY
P p
R r
S s
SH S
T t
TH T
UH UH
UW UW
V v
W w
Y y
X x
Z z
ZH Z
""".strip().split('\n')}
sim_mat = pd.read_csv('./similarity.csv',header=0,index_col=0) # mapping = {
# s.split()[0]: s.split()[1]
# for s in """
# AA AA
# AE AE
# AH UX
# AO AO
# AW AW
# AY AY
# B b
# CH C
# D d
# DH D
# EH EH
# ER UXr
# EY EY
# F f
# G g
# HH h
# IH IH
# IY IY
# JH J
# K k
# L l
# M m
# N n
# NG N
# OW OW
# OY OY
# P p
# R r
# S s
# SH S
# T t
# TH T
# UH UH
# UW UW
# V v
# W w
# Y y
# Z z
# ZH Z
# """.strip().split('\n')
# }
def convert_ph(ph): # sim_mat = pd.read_csv('./similarity.csv', header=0, index_col=0)
stress_level = re.search("(\w+)([0-9])",ph) #
if stress_level: #
return stress_level.group(2)+mapping[stress_level.group(1)] # def convert_ph(ph):
else: # stress_level = re.search("(\w+)([0-9])", ph)
return mapping[ph] # if stress_level:
# return stress_level.group(2) + mapping[stress_level.group(1)]
# else:
# return mapping[ph]
#
#
# def sim_mat_to_apple_table(smt):
# colnames = [convert_ph(ph) for ph in smt.index.tolist()]
# smt = pd.DataFrame(np.nan_to_num(smt.values))
# fsmt = (smt.T + smt)
# np.fill_diagonal(fsmt.values, 100.0)
# asmt = pd.DataFrame.copy(fsmt)
# asmt.columns = colnames
# asmt.index = colnames
# apple_sim_table = asmt.stack().reset_index()
# apple_sim_table.columns = ['q', 'r', 's']
# return apple_sim_table
#
#
# apple_sim_table = sim_mat_to_apple_table(sim_mat)
#
#
# def top_match(ph):
# selected = apple_sim_table[(apple_sim_table.q == ph)
# & (apple_sim_table.s < 100) &
# (apple_sim_table.s >= 70)]
# tm = ph
# if len(selected) > 0:
# tm = pd.DataFrame.sort_values(selected, 's', ascending=False).iloc[0].r
# return tm
def sim_mat_to_apple_table(smt):
colnames = [convert_ph(ph) for ph in smt.index.tolist()]
smt = pd.DataFrame(np.nan_to_num(smt.values))
fsmt = (smt.T+smt)
np.fill_diagonal(fsmt.values,100.0)
asmt = pd.DataFrame.copy(fsmt)
asmt.columns = colnames
asmt.index = colnames
apple_sim_lookup = asmt.stack().reset_index()
apple_sim_lookup.columns = ['q','r','s']
return apple_sim_lookup
apple_sim_lookup = sim_mat_to_apple_table(sim_mat) apple_phonemes = [
'%', '@', 'AE', 'EY', 'AO', 'AX', 'IY', 'EH', 'IH', 'AY', 'IX', 'AA', 'UW',
'UH', 'UX', 'OW', 'AW', 'OY', 'b', 'C', 'd', 'D', 'f', 'g', 'h', 'J', 'k',
'l', 'm', 'n', 'N', 'p', 'r', 's', 'S', 't', 'T', 'v', 'w', 'y', 'z', 'Z'
]
def top_match(ph): class ApplePhoneme(object):
selected = apple_sim_lookup[(apple_sim_lookup.q == ph) & (apple_sim_lookup.s < 100) & (apple_sim_lookup.s >= 70)] """docstring for ApplePhoneme."""
tm = ph
if len(selected) > 0:
tm = pd.DataFrame.sort_values(selected,'s',ascending=False).iloc[0].r
return tm
def similar_phoneme(ph_str): def __init__(self, phone, stress, vowel=False):
return ph_str super(ApplePhoneme, self).__init__()
self.phone = phone
self.stress = stress
self.vowel = vowel
def __str__(self):
return (str(self.stress) if (self.vowel and self.stress>0) else '') + self.phone
def __repr__(self):
return "'{}'".format(str(self))
def adjust_stress(self):
self.stress = random.choice([i for i in range(3) if i != self.stress])
def parse_apple_phonemes(ph_str):
    """Greedily split an Apple phoneme string into ApplePhoneme objects.

    Progressively longer prefixes are tried until one matches a known
    phoneme; a leading digit marks vowel stress (e.g. '1AA'), and any
    non-alphanumeric prefix is kept as a literal separator token.
    """
    for split in range(1, len(ph_str) + 1):
        head, tail = ph_str[:split], ph_str[split:]
        if head in apple_phonemes:
            is_vowel = head[0] in 'AEIOU'
            return [ApplePhoneme(head, 0, is_vowel)] + parse_apple_phonemes(tail)
        if head[0].isdigit() and head[1:] in apple_phonemes:
            return [ApplePhoneme(head[1:], int(head[0]), True)] + parse_apple_phonemes(tail)
        if not head.isalnum():
            return [ApplePhoneme(head, 0, False)] + parse_apple_phonemes(tail)
    return []
def similar_phoneme_word(ph_str):
    """Return *ph_str* with the stress level of one random vowel altered.

    Parses the Apple phoneme string, picks one vowel phoneme at random,
    mutates its stress in place, and re-joins the phoneme sequence.

    BUG FIX: the original called random.choice on an empty list when the
    token contained no vowel phonemes (e.g. punctuation-only tokens),
    raising IndexError; such tokens are now returned unchanged.
    """
    phons = parse_apple_phonemes(ph_str)
    vowels = [p for p in phons if p.vowel]
    if not vowels:
        return ph_str
    random.choice(vowels).adjust_stress()
    return ''.join(str(p) for p in phons)
def similar_phoneme_phrase(ph_str):
    """Apply similar_phoneme_word to every whitespace-separated token."""
    words = ph_str.split()
    return ' '.join(similar_phoneme_word(word) for word in words)
def similar_word(word_str):
    """Return a random rhyme of *word_str* from the pronouncing dictionary,
    or the word itself when no rhymes are available."""
    rhymes = pronouncing.rhymes(word_str)
    if rhymes:
        return random.choice(rhymes)
    return word_str
def similar_phrase(ph_str):
    """Replace each word of the phrase with a similar-sounding word."""
    return ' '.join(map(similar_word, ph_str.split()))

View File

@@ -1,140 +0,0 @@
'''Train a Siamese MLP on pairs of digits from the MNIST dataset.
It follows Hadsell-et-al.'06 [1] by computing the Euclidean distance on the
output of the shared network and by optimizing the contrastive loss (see paper
for mode details).
[1] "Dimensionality Reduction by Learning an Invariant Mapping"
http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
Gets to 97.2% test accuracy after 20 epochs.
2 seconds per epoch on a Titan X Maxwell GPU
'''
from __future__ import absolute_import
from __future__ import print_function
import numpy as np
import random
from keras.datasets import mnist
from keras.models import Model
from keras.layers import Dense, Dropout, Input, Lambda, Recurrent
from keras.optimizers import RMSprop
from keras import backend as K
num_classes = 10
def euclidean_distance(vects):
x, y = vects
return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))
def eucl_dist_output_shape(shapes):
shape1, shape2 = shapes
return (shape1[0], 1)
def contrastive_loss(y_true, y_pred):
'''Contrastive loss from Hadsell-et-al.'06
http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
'''
margin = 1
return K.mean(y_true * K.square(y_pred) +
(1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))
def create_pairs(x, digit_indices):
'''Positive and negative pair creation.
Alternates between positive and negative pairs.
'''
pairs = []
labels = []
n = min([len(digit_indices[d]) for d in range(num_classes)]) - 1
for d in range(num_classes):
for i in range(n):
z1, z2 = digit_indices[d][i], digit_indices[d][i + 1]
pairs += [[x[z1], x[z2]]]
inc = random.randrange(1, num_classes)
dn = (d + inc) % num_classes
z1, z2 = digit_indices[d][i], digit_indices[dn][i]
pairs += [[x[z1], x[z2]]]
labels += [1, 0]
return np.array(pairs), np.array(labels)
def create_base_network(input_dim):
'''Base network to be shared (eq. to feature extraction).
'''
input = Input(shape=(input_dim,))
x = Dense(128, activation='relu')(input)
x = Dropout(0.1)(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.1)(x)
x = Dense(128, activation='relu')(x)
return Model(input, x)
def compute_accuracy(y_true, y_pred):
'''Compute classification accuracy with a fixed threshold on distances.
'''
pred = y_pred.ravel() < 0.5
return np.mean(pred == y_true)
def accuracy(y_true, y_pred):
'''Compute classification accuracy with a fixed threshold on distances.
'''
return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))
# the data, shuffled and split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.reshape(60000, 784)
x_test = x_test.reshape(10000, 784)
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
input_dim = 784
epochs = 20
# create training+test positive and negative pairs
digit_indices = [np.where(y_train == i)[0] for i in range(num_classes)]
tr_pairs, tr_y = create_pairs(x_train, digit_indices)
digit_indices = [np.where(y_test == i)[0] for i in range(num_classes)]
te_pairs, te_y = create_pairs(x_test, digit_indices)
# network definition
base_network = create_base_network(input_dim)
input_a = Input(shape=(input_dim,))
input_b = Input(shape=(input_dim,))
# because we re-use the same instance `base_network`,
# the weights of the network
# will be shared across the two branches
processed_a = base_network(input_a)
processed_b = base_network(input_b)
distance = Lambda(euclidean_distance,
output_shape=eucl_dist_output_shape)([processed_a, processed_b])
model = Model([input_a, input_b], distance)
# train
rms = RMSprop()
model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])
model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
batch_size=128,
epochs=epochs,
validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y))
# compute final accuracy on training and test sets
y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
tr_acc = compute_accuracy(tr_y, y_pred)
y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
te_acc = compute_accuracy(te_y, y_pred)
print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))

80
requirements-linux.txt Normal file
View File

@@ -0,0 +1,80 @@
bleach==1.5.0
click==6.7
cloudpickle==0.4.1
cycler==0.10.0
dask==0.15.4
decorator==4.1.2
distributed==1.19.3
entrypoints==0.2.3
enum34==1.1.6
futures==3.1.1
graphviz==0.8.1
h5py==2.7.1
HeapDict==1.0.0
html5lib==0.9999999
ipykernel==4.6.1
ipython==6.2.1
ipython-genutils==0.2.0
ipywidgets==7.0.3
jedi==0.11.0
Jinja2==2.9.6
jsonschema==2.6.0
jupyter==1.0.0
jupyter-client==5.1.0
jupyter-console==5.2.0
jupyter-core==4.3.0
Keras==2.0.8
locket==0.2.0
Markdown==2.6.9
MarkupSafe==1.0
matplotlib==2.1.0
mistune==0.7.4
msgpack-python==0.4.8
nbconvert==5.3.1
nbformat==4.4.0
notebook==5.2.0
numexpr==2.6.4
numpy==1.13.3
pandas==0.20.3
pandocfilters==1.4.2
parso==0.1.0
partd==0.3.8
pexpect==4.2.1
pickleshare==0.7.4
praat-parselmouth==0.2.0
progressbar2==3.34.3
prompt-toolkit==1.0.15
protobuf==3.5.0
psutil==5.4.0
ptyprocess==0.5.2
PyAudio==0.2.11
pydot==1.2.3
Pygments==2.2.0
pyparsing==2.2.0
pysndfile==1.0.0
python-dateutil==2.6.1
python-utils==2.2.0
pytz==2017.2
PyYAML==3.12
pyzmq==16.0.2
qtconsole==4.3.1
scikit-learn==0.19.0
scipy==0.19.1
seaborn==0.8.1
simplegeneric==0.8.1
six==1.11.0
sortedcontainers==1.5.7
tables==3.4.2
tblib==1.3.2
tensorflow==1.3.0
tensorflow-tensorboard==0.4.0rc3
terminado==0.6
testpath==0.3.1
toolz==0.8.2
tornado==4.5.2
tqdm==4.19.4
traitlets==4.3.2
wcwidth==0.1.7
Werkzeug==0.12.2
widgetsnbextension==3.0.6
zict==0.1.3

265
segment_data.py Normal file
View File

@@ -0,0 +1,265 @@
import random
import math
import pickle
from functools import reduce
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import tensorflow as tf
import shutil
from speech_pitch import *
from speech_tools import reservoir_sample,padd_zeros
# import importlib
# import speech_tools
# importlib.reload(speech_tools)
# %matplotlib inline
SPEC_MAX_FREQUENCY = 8000
SPEC_WINDOW_SIZE = 0.03
def fix_csv(collection_name='test'):
    """Rewrite the raw segment index of *collection_name* with named columns.

    Reads ./outputs/segments/<collection_name>/index.csv (header-less) and
    saves it next to the original as index.fixed.csv with a column header
    row and a numeric index, which the rest of the pipeline expects.
    """
    base = './outputs/segments/' + collection_name
    columns = ['phrase', 'filename', 'start_phoneme', 'end_phoneme',
               'start_time', 'end_time']
    frame = pd.read_csv(base + '/index.csv', names=columns)
    frame.to_csv(base + '/index.fixed.csv')
def pick_random_phrases(collection_name='test'):
    """Sample 10 distinct phrases from the fixed index of *collection_name*.

    Reads ./outputs/<collection_name>.fixed.csv, draws 10 phrase groups
    without replacement, and writes the chosen phrases to
    ./outputs/<collection_name>.random.csv (single 'phrase' column).

    Raises ValueError (from random.sample) when the collection has fewer
    than 10 distinct phrases.

    BUG FIX: the original immediately re-assigned collection_name = 'test',
    so the parameter was silently ignored and only the 'test' collection
    could ever be sampled.
    """
    seg_data = pd.read_csv('./outputs/' + collection_name + '.fixed.csv', index_col=0)
    phrase_groups = random.sample([i for i in seg_data.groupby(['phrase'])], 10)
    # Keep only the group keys (the phrase strings); the group frames are unused.
    result = [ph for ph, _group in phrase_groups]
    pd.DataFrame(result, columns=['phrase']).to_csv('./outputs/' + collection_name + '.random.csv')
# pick_random_phrases()
def plot_random_phrases(collection_name = 'test'):
    """Visually compare pitch contours of randomly picked phrases against
    self-recorded counterparts.

    Reads the random phrase list (./outputs/<collection_name>.random.csv)
    and the fixed segment index (./outputs/<collection_name>.fixed.csv),
    then for each picked phrase plots the synthesized sample's pitch curve
    annotated with phoneme end-time markers, followed by the matching
    recording from ./inputs/self.

    NOTE(review): pairing phrase groups with `self_files` relies on both
    lists being in the same (alphabetical) order — confirm before reuse.
    """
    # collection_name = 'test'
    rand_words = pd.read_csv('./outputs/'+collection_name+'.random.csv',index_col=0)
    rand_w_list = rand_words['phrase'].tolist()
    seg_data = pd.read_csv('./outputs/'+collection_name+'.fixed.csv',index_col=0)
    # Boolean mask selecting only index rows whose phrase was randomly picked.
    result = (seg_data['phrase'] == rand_w_list[0])
    for i in rand_w_list[1:]:
        result |= (seg_data['phrase'] == i)
    phrase_groups = [i for i in seg_data[result].groupby(['phrase'])]
    # Hand-recorded real-voice counterparts of the picked phrases.
    self_files = ['a_wrong_turn-low1.aiff','great_pin-low1.aiff'
        ,'he_set_off_at_once_to_find_the_beast-low1.aiff'
        ,'hound-low1.aiff','noises-low1.aiff','po_burped-low1.aiff'
        ,'she_loves_the_roses-low1.aiff','the_busy_spider-low1.aiff'
        ,'the_rain_helped-low1.aiff','to_go_to_the_doctor-low1.aiff']
    co_files = map(lambda x: './inputs/self/'+x,self_files)
    for ((ph,g),s_f) in zip(phrase_groups,co_files):
        # ph,g = phrase_groups[0]
        # NOTE(review): path hard-codes 'test' rather than using
        # collection_name — looks unintentional; confirm.
        file_path = './outputs/test/'+g.iloc[0]['filename']
        phrase_sample = pm_snd(file_path)
        self_sample = pm_snd(s_f)
        player,closer = play_sound()
        # rows = [i for i in g.iterrows()]
        # random.shuffle(rows)
        print(ph)
        # Collect (end_time_seconds, phoneme) markers; the index stores
        # times in milliseconds, hence the /1000.
        phon_stops = []
        for (i,phon) in g.iterrows():
            end_t = phon['end_time']/1000
            phon_ch = phon['start_phoneme']
            phon_stops.append((end_t,phon_ch))
        plot_sample_pitch(phrase_sample,phons = phon_stops)
        plot_sample_pitch(self_sample)
        # player(phrase_sample)
        # input()
        # for (i,phon) in g.iterrows():
        #     # phon = g.iloc[1]
        #     start_t = phon['start_time']/1000
        #     end_t = phon['end_time']/1000
        #     phon_ch = phon['start_phoneme']
        #     phon_sample = phrase_sample.extract_part(from_time=start_t,to_time=end_t)
        #     if phon_sample.n_samples*phon_sample.sampling_period < 6.4/100:
        #         continue
        #     # if phon_ch[0] not in 'AEIOU':
        #     #     continue
        #     # phon_sample
        #     # player(phon_sample)
        #     # plot_sample_intensity(phon_sample)
        #     print(phon_ch)
        #     plot_sample_pitch(phon_sample)
        # closer()
def plot_segments(collection_name = 'story_test_segments'):
    """Build the frame-level phoneme-boundary target for every phrase of
    *collection_name*.

    For each phrase group in the fixed index CSV, the end time of each
    phoneme (stored in ms) is converted to a spectrogram frame index via
    spec_frame and marked with 1 in a zero vector — the same target
    encoding used by create_segments_tfrecords.

    NOTE(review): despite its name this function currently plots nothing
    and discards `result` each iteration; it looks like work-in-progress
    inspection code.

    BUG FIX: the original immediately re-assigned
    collection_name = 'story_test_segments', so the parameter was ignored.
    """
    seg_data = pd.read_csv('./outputs/'+collection_name+'.fixed.csv',index_col=0)
    phrase_groups = [i for i in seg_data.groupby(['phrase'])]
    for (ph,g) in phrase_groups:
        file_path = './outputs/'+collection_name+'/'+g.iloc[0]['filename']
        phrase_sample = pm_snd(file_path)
        print(ph)
        # (end_time_seconds, phoneme) markers; index stores milliseconds.
        phon_stops = []
        for (i,phon) in g.iterrows():
            end_t = phon['end_time']/1000
            phon_ch = phon['start_phoneme']
            phon_stops.append((end_t,phon_ch))
        phrase_spec = phrase_sample.to_spectrogram(window_length=0.03, maximum_frequency=8000)
        sg_db = 10 * np.log10(phrase_spec.values)
        # One slot per spectrogram row; boundary frames get a 1.
        result = np.zeros(sg_db.shape[0],dtype=np.int64)
        # Skip the first stop: the start of the phrase is not a boundary.
        ph_bounds = [t[0] for t in phon_stops[1:]]
        b_frames = np.asarray([spec_frame(phrase_spec,b) for b in ph_bounds])
        result[b_frames] = 1
def generate_spec(aiff_file):
    """Load *aiff_file* and return (sg_db, spectrogram).

    sg_db is the dB-scaled spectrogram matrix with negative values clipped
    to zero; spectrogram is the underlying object (needed by spec_frame for
    frame/time conversions).
    """
    sound = pm_snd(aiff_file)
    spectrogram = sound.to_spectrogram(window_length=SPEC_WINDOW_SIZE,
                                       maximum_frequency=SPEC_MAX_FREQUENCY)
    # eps keeps log10 away from zero-valued power bins.
    power = np.abs(spectrogram.values + np.finfo(spectrogram.values.dtype).eps)
    db = 10 * np.log10(power)
    db[db < 0] = 0  # clip negative dB to zero
    return db, spectrogram
def spec_frame(spec,b):
    """Map boundary value *b* through the spectrogram's frame/time axis and
    round to the nearest integer.

    NOTE(review): frame_number_to_time converts frame -> time, yet callers
    pass a time boundary here and use the result as a frame index — confirm
    the intended direction (time_to_frame_number?) against the library docs.
    """
    mapped = spec.frame_number_to_time(b)
    return int(round(mapped))
def _float_feature(value):
    """Wrap a float sequence in a tf.train.Feature (TFRecord helper)."""
    float_list = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=float_list)
def _int64_feature(value):
    """Wrap an int sequence in a tf.train.Feature (TFRecord helper)."""
    int64_list = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=int64_list)
def _bytes_feature(value):
    """Wrap a bytes sequence in a tf.train.Feature (TFRecord helper)."""
    bytes_list = tf.train.BytesList(value=value)
    return tf.train.Feature(bytes_list=bytes_list)
def create_segments_tfrecords(collection_name='story_test_segments',sample_count=0,train_test_ratio=0.1):
    """Write train/test TFRecord files of (spectrogram, boundary-vector) pairs.

    For every phrase in the collection's fixed index, the audio sample's
    dB spectrogram is flattened into a 'spec' feature and a binary 'output'
    vector marks the spectrogram frames at phoneme end times.

    Parameters:
        collection_name: dataset folder under ./outputs/segments/.
        sample_count: if > 0, reservoir-sample this many phrase groups
            before splitting; otherwise use all phrases.
        train_test_ratio: fraction of phrases routed to test.tfrecords.

    Side effects: writes train.tfrecords, test.tfrecords and constants.pkl
    (holding (n_spec, n_features, n_records)) into the collection folder.

    NOTE(review): n_records/n_spec/n_features accumulate across BOTH the
    train and test writes, so constants.pkl describes the combined set.
    """
    audio_samples = pd.read_csv( './outputs/segments/' + collection_name + '/index.fixed.csv',index_col=0)
    # Resolve each filename to its on-disk sample path once, up front.
    audio_samples['file_path'] = audio_samples.loc[:, 'filename'].apply(lambda x: 'outputs/segments/' + collection_name + '/samples/' + x)
    # Running maxima/counters shared with write_samples via nonlocal.
    n_records,n_spec,n_features = 0,0,0
    def write_samples(wg,sample_name):
        # Serialize one list of (phrase, group) pairs into <sample_name>.tfrecords.
        phrase_groups = tqdm(wg,desc='Computing segmentation')
        record_file = './outputs/segments/{}/{}.tfrecords'.format(collection_name,sample_name)
        writer = tf.python_io.TFRecordWriter(record_file)
        for (ph,g) in phrase_groups:
            fname = g.iloc[0]['filename']
            sg_db,phrase_spec = generate_spec(g.iloc[0]['file_path'])
            phon_stops = []
            phrase_groups.set_postfix(phrase=ph)
            # spec_n: number of frames (time steps); spec_w: features per frame.
            spec_n,spec_w = sg_db.shape
            spec = sg_db.reshape(-1)
            for (i,phon) in g.iterrows():
                # Index stores times in milliseconds.
                end_t = phon['end_time']/1000
                phon_ch = phon['start_phoneme']
                phon_stops.append((end_t,phon_ch))
            # Binary target: 1 at frames where a phoneme boundary falls.
            result = np.zeros(spec_n,dtype=np.int64)
            ph_bounds = [t[0] for t in phon_stops]
            f_bounds = [spec_frame(phrase_spec,b) for b in ph_bounds]
            # Drop boundaries that map outside the spectrogram's frame range.
            valid_bounds = [i for i in f_bounds if 0 < i < spec_n]
            b_frames = np.asarray(valid_bounds)
            if len(b_frames) > 0:
                result[b_frames] = 1
            nonlocal n_records,n_spec,n_features
            # Track the longest spectrogram seen (padding length for readers).
            n_spec = max([n_spec,spec_n])
            n_features = spec_w
            n_records+=1
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'phrase': _bytes_feature([ph.encode('utf-8')]),
                    'file': _bytes_feature([fname.encode('utf-8')]),
                    'spec':_float_feature(spec),
                    'spec_n':_int64_feature([spec_n]),
                    'spec_w':_int64_feature([spec_w]),
                    'output':_int64_feature(result)
                }
            ))
            writer.write(example.SerializeToString())
        phrase_groups.close()
        writer.close()
    word_groups = [i for i in audio_samples.groupby('phrase')]
    # Optional down-sampling of phrase groups before the train/test split.
    wg_sampled = reservoir_sample(word_groups,sample_count) if sample_count > 0 else word_groups
    # write_samples(word_groups,'all')
    tr_audio_samples,te_audio_samples = train_test_split(wg_sampled,test_size=train_test_ratio)
    write_samples(tr_audio_samples,'train')
    write_samples(te_audio_samples,'test')
    # Persist shapes/counts for the reader side (read_segments_tfrecords_generator).
    const_file = './outputs/segments/'+collection_name+'/constants.pkl'
    pickle.dump((n_spec,n_features,n_records),open(const_file,'wb'))
def record_generator_count(records_file):
    """Return a fresh iterator over *records_file* together with its record count.

    The file is scanned once to count the records, then a brand-new iterator
    is opened so the caller starts reading from the beginning.
    """
    count = sum(1 for _ in tf.python_io.tf_record_iterator(path=records_file))
    fresh_iterator = tf.python_io.tf_record_iterator(path=records_file)
    return fresh_iterator, count
def read_segments_tfrecords_generator(collection_name='audio',batch_size=32,test_size=0):
    """Build a training-batch generator plus a one-shot test set from the
    collection's tfrecords.

    Returns (record_generator, test_inputs, test_outputs, copy_read_consts):
    record_generator yields (input_arr, output_arr) batches forever;
    test_inputs/test_outputs are padded numpy arrays sampled from the test file;
    copy_read_consts copies constants.pkl into a destination dir and returns
    (n_spec, n_features, n_records).
    """
    # collection_name = 'story_test'
    records_file = './outputs/segments/'+collection_name+'/train.tfrecords'
    const_file = './outputs/segments/'+collection_name+'/constants.pkl'
    # padding sizes / record count written by create_segments_tfrecords
    (n_spec,n_features,n_records) = pickle.load(open(const_file,'rb'))
    def copy_read_consts(dest_dir):
        # copy the constants next to the model so inference can reuse them
        shutil.copy2(const_file,dest_dir+'/constants.pkl')
        return (n_spec,n_features,n_records)
    # @threadsafe_iter
    def record_generator():
        # infinite generator suitable for keras fit_generator
        print('reading tfrecords({}-train)...'.format(collection_name))
        input_data = []
        output_data = []
        while True:
            # reopen the file each epoch
            record_iterator,records_count = record_generator_count(records_file)
            for (i,string_record) in enumerate(record_iterator):
                # (i,string_record) = next(enumerate(record_iterator))
                example = tf.train.Example()
                example.ParseFromString(string_record)
                spec_n = example.features.feature['spec_n'].int64_list.value[0]
                spec_w = example.features.feature['spec_w'].int64_list.value[0]
                spec = np.array(example.features.feature['spec'].float_list.value).reshape(spec_n,spec_w)
                # pad time axis to the collection-wide max (n_spec)
                p_spec = padd_zeros(spec,n_spec)
                input_data.append(p_spec)
                output = np.asarray(example.features.feature['output'].int64_list.value)
                p_output = np.pad(output,(0,n_spec-output.shape[0]),'constant')
                output_data.append(p_output)
                # flush a batch when full, or at the final record (partial batch)
                if len(input_data) == batch_size or i == n_records-1:
                    input_arr = np.asarray(input_data)
                    output_arr = np.asarray(output_data)
                    input_arr.shape,output_arr.shape  # no-op; debugging leftover
                    yield (input_arr,output_arr)
                    input_data = []
                    output_data = []
    # Read test in one-shot
    print('reading tfrecords({}-test)...'.format(collection_name))
    te_records_file = './outputs/segments/'+collection_name+'/test.tfrecords'
    te_re_iterator,te_n_records = record_generator_count(te_records_file)
    # test_size = 10
    # 0 (or anything larger than the file) means "use every test record"
    test_size = min([test_size,te_n_records]) if test_size > 0 else te_n_records
    input_data = np.zeros((test_size,n_spec,n_features))
    output_data = np.zeros((test_size,n_spec))
    random_samples = enumerate(reservoir_sample(te_re_iterator,test_size))
    for (i,string_record) in tqdm(random_samples,total=test_size):
        # (i,string_record) = next(random_samples)
        example = tf.train.Example()
        example.ParseFromString(string_record)
        spec_n = example.features.feature['spec_n'].int64_list.value[0]
        spec_w = example.features.feature['spec_w'].int64_list.value[0]
        spec = np.array(example.features.feature['spec'].float_list.value).reshape(spec_n,spec_w)
        p_spec = padd_zeros(spec,n_spec)
        input_data[i] = p_spec
        output = np.asarray(example.features.feature['output'].int64_list.value)
        p_output = np.pad(output,(0,n_spec-output.shape[0]),'constant')
        output_data[i] = p_output
    return record_generator,input_data,output_data,copy_read_consts
if __name__ == '__main__':
    # Script entry point: regenerate the segmentation tfrecords for the
    # 'story_words.30' collection (36 sampled phrase groups, 10% held out).
    # plot_random_phrases()
    # fix_csv('story_test_segments')
    # plot_segments('story_test_segments')
    # fix_csv('story_words')
    # pass
    create_segments_tfrecords('story_words.30', sample_count=36,train_test_ratio=0.1)
    # record_generator,input_data,output_data,copy_read_consts = read_segments_tfrecords_generator('story_test')
    # tr_gen = record_generator()
    # for i in tr_gen:
    #     print(i[0].shape,i[1].shape)

144
segment_model.py Normal file
View File

@@ -0,0 +1,144 @@
from __future__ import absolute_import
from __future__ import print_function
import numpy as np
from keras.models import Model,load_model,model_from_yaml
from keras.layers import Input,Concatenate,Lambda, Reshape, Dropout
from keras.layers import Dense,Conv2D, LSTM, Bidirectional, GRU
from keras.layers import BatchNormalization,Activation
from keras.losses import categorical_crossentropy
from keras.utils import to_categorical
from keras.optimizers import RMSprop,Adadelta,Adagrad,Adam,Nadam
from keras.callbacks import TensorBoard, ModelCheckpoint
from keras import backend as K
from keras.utils import plot_model
from speech_tools import create_dir,step_count
from segment_data import read_segments_tfrecords_generator
# import importlib
# import segment_data
# import speech_tools
# importlib.reload(segment_data)
# importlib.reload(speech_tools)
# TODO implement ctc losses
# https://github.com/fchollet/keras/blob/master/examples/image_ocr.py
def accuracy(y_true, y_pred):
    '''Compute classification accuracy with a fixed threshold on distances.
    '''
    predictions = K.cast(y_pred > 0.5, y_true.dtype)
    return K.mean(K.equal(y_true, predictions))
def ctc_lambda_func(args):
    """Keras Lambda-layer wrapper computing the CTC batch cost."""
    y_pred, labels, input_length, label_length = args
    # the 2 is critical here since the first couple outputs of the RNN
    # tend to be garbage:
    trimmed_pred = y_pred[:, 2:, :]
    return K.ctc_batch_cost(labels, trimmed_pred, input_length, label_length)
def segment_model(input_dim):
    """Conv2D front-end + three summed bidirectional GRUs + softmax head.

    input_dim is the per-sample input shape; output is a 2-way softmax per step.
    """
    spec_in = Input(shape=input_dim)
    conv_a = Conv2D(filters=32, kernel_size=(5,9))(spec_in)
    conv_b = Conv2D(filters=1, kernel_size=(5,9))(conv_a)
    dropped = Dropout(rate=0.95)(conv_b)
    # collapse the singleton channel dim so the GRUs see (time, features)
    rnn_shape = (dropped.shape[1].value, dropped.shape[2].value)
    rnn_in = Reshape(target_shape=rnn_shape)(dropped)
    hidden = rnn_in
    for _ in range(3):
        hidden = Bidirectional(GRU(512, return_sequences=True),merge_mode='sum')(hidden)
    out = Dense(2, activation='softmax')(hidden)
    return Model(spec_in, out)
def simple_segment_model(input_dim):
    """Small stacked-LSTM boundary detector.

    input_dim = (n_steps, n_features); the output is reshaped to a flat
    (n_steps,) vector, one value per time step.
    NOTE(review): the last LSTM has width 1 with a softmax activation over
    that single unit — confirm this produces the intended per-step output.
    """
    spec_in = Input(shape=input_dim)
    hidden = Bidirectional(LSTM(32, return_sequences=True))(spec_in)
    hidden = Bidirectional(LSTM(16, return_sequences=True),merge_mode='sum')(hidden)
    hidden = LSTM(1, return_sequences=True,activation='softmax')(hidden)
    out = Reshape(target_shape=(input_dim[0],))(hidden)
    return Model(spec_in, out)
def write_model_arch(mod,mod_file):
    """Serialize a Keras model's architecture to *mod_file* as YAML.

    Args:
        mod: object exposing ``to_yaml()`` (a Keras model).
        mod_file: destination path for the YAML text.

    Uses a context manager so the handle is closed even if ``to_yaml`` or
    the write raises (the previous version leaked the handle on error).
    """
    with open(mod_file,'w') as model_f:
        model_f.write(mod.to_yaml())
def load_model_arch(mod_file):
    """Load a Keras model architecture from a YAML file written by write_model_arch.

    Args:
        mod_file: path to the YAML architecture file.
    Returns:
        The model rebuilt via ``model_from_yaml`` (weights not included).

    Uses a context manager so the file handle is always closed (the previous
    version leaked it if ``model_from_yaml`` raised).
    """
    with open(mod_file,'r') as model_f:
        return model_from_yaml(model_f.read())
def train_segment(collection_name = 'test',resume_weights='',initial_epoch=0):
    """Train the simple segment model on a collection's tfrecords.

    Args:
        collection_name: name of the prepared segments collection.
        resume_weights: optional .h5 weights file to resume from.
        initial_epoch: epoch number to resume counting from.
    Side effects: writes model plots/architecture/checkpoints under
    ./models/segment/<collection_name> and TensorBoard logs under
    ./logs/segment/<collection_name>.
    """
    # collection_name = 'story_test'
    batch_size = 128
    # batch_size = 4
    model_dir = './models/segment/'+collection_name
    create_dir(model_dir)
    log_dir = './logs/segment/'+collection_name
    create_dir(log_dir)
    # generator for training batches; (te_x, te_y) is a fixed validation set
    tr_gen_fn,te_x,te_y,copy_read_consts = read_segments_tfrecords_generator(collection_name,batch_size,2*batch_size)
    tr_gen = tr_gen_fn()
    # also copies constants.pkl into model_dir for later inference
    n_step,n_features,n_records = copy_read_consts(model_dir)
    input_dim = (n_step, n_features)
    model = simple_segment_model(input_dim)
    # model.output_shape,model.input_shape
    plot_model(model,show_shapes=True, to_file=model_dir+'/model.png')
    # loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred, labels, input_length, label_length])
    tb_cb = TensorBoard(
        log_dir=log_dir,
        histogram_freq=1,
        batch_size=32,
        write_graph=True,
        write_grads=True,
        write_images=True,
        embeddings_freq=0,
        embeddings_layer_names=None,
        embeddings_metadata=None)
    # checkpoint filename embeds epoch and val_loss (backslash continues the string literal)
    cp_file_fmt = model_dir+'/speech_segment_model-{epoch:02d}-epoch-{val_loss:0.2f}\
-acc.h5'
    cp_cb = ModelCheckpoint(
        cp_file_fmt,
        monitor='val_loss',
        verbose=0,
        save_best_only=False,
        save_weights_only=True,
        mode='auto',
        period=1)
    # train
    opt = RMSprop()
    model.compile(loss=categorical_crossentropy, optimizer=opt, metrics=[accuracy])
    write_model_arch(model,model_dir+'/speech_segment_model_arch.yaml')
    epoch_n_steps = step_count(n_records,batch_size)
    if resume_weights != '':
        model.load_weights(resume_weights)
    model.fit_generator(tr_gen
        , epochs=10000
        , steps_per_epoch=epoch_n_steps
        , validation_data=(te_x, te_y)
        , max_queue_size=32
        , callbacks=[tb_cb, cp_cb],initial_epoch=initial_epoch)
    model.save(model_dir+'/speech_segment_model-final.h5')
    # y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
    # te_acc = compute_accuracy(te_y, y_pred)
    # print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
if __name__ == '__main__':
    # Script entry point: train from scratch on the 'story_words' collection.
    # pass
    train_segment('story_words')#,'./models/segment/story_phrases.1000/speech_segment_model-final.h5',1001)

View File

@@ -1,90 +0,0 @@
import tensorflow as tf
import numpy as np
class SiameseLSTM(object):
    """
    A LSTM based deep Siamese network for text similarity.
    Uses an character embedding layer, followed by a biLSTM and Energy Loss layer.
    """
    def BiRNN(self, x, dropout, scope, embedding_size, sequence_length):
        # Build a 3-layer stacked bidirectional LSTM over the embedded sequence
        # and return the output of the final time step.
        n_input=embedding_size
        n_steps=sequence_length
        n_hidden=n_steps
        n_layers=3
        # Prepare data shape to match `bidirectional_rnn` function requirements
        # Current data input shape: (batch_size, n_steps, n_input) (?, seq_len, embedding_size)
        # Required shape: 'n_steps' tensors list of shape (batch_size, n_input)
        # Permuting batch_size and n_steps
        x = tf.transpose(x, [1, 0, 2])
        # Reshape to (n_steps*batch_size, n_input)
        x = tf.reshape(x, [-1, n_input])
        print(x)
        # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
        x = tf.split(x, n_steps, 0)
        print(x)
        # Define lstm cells with tensorflow
        # Forward direction cell
        with tf.name_scope("fw"+scope),tf.variable_scope("fw"+scope):
            stacked_rnn_fw = []
            for _ in range(n_layers):
                fw_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True)
                lstm_fw_cell = tf.contrib.rnn.DropoutWrapper(fw_cell,output_keep_prob=dropout)
                stacked_rnn_fw.append(lstm_fw_cell)
            lstm_fw_cell_m = tf.nn.rnn_cell.MultiRNNCell(cells=stacked_rnn_fw, state_is_tuple=True)
        with tf.name_scope("bw"+scope),tf.variable_scope("bw"+scope):
            stacked_rnn_bw = []
            for _ in range(n_layers):
                bw_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True)
                lstm_bw_cell = tf.contrib.rnn.DropoutWrapper(bw_cell,output_keep_prob=dropout)
                stacked_rnn_bw.append(lstm_bw_cell)
            lstm_bw_cell_m = tf.nn.rnn_cell.MultiRNNCell(cells=stacked_rnn_bw, state_is_tuple=True)
        # Get lstm cell output
        # NOTE(review): this re-opens the same "bw"+scope used above rather than a
        # fresh scope for the combined rnn — looks like a copy/paste; confirm intent.
        with tf.name_scope("bw"+scope),tf.variable_scope("bw"+scope):
            outputs, _, _ = tf.nn.static_bidirectional_rnn(lstm_fw_cell_m, lstm_bw_cell_m, x, dtype=tf.float32)
        return outputs[-1]
    def contrastive_loss(self, y,d,batch_size):
        # Contrastive loss: y==1 penalizes distance d (pulls pairs together),
        # y==0 penalizes distances inside the unit margin (pushes pairs apart).
        tmp= y *tf.square(d)
        #tmp= tf.mul(y,tf.square(d))
        tmp2 = (1-y) *tf.square(tf.maximum((1 - d),0))
        return tf.reduce_sum(tmp +tmp2)/batch_size/2
    def __init__(
        self, sequence_length, vocab_size, embedding_size, hidden_units, l2_reg_lambda, batch_size):
        # Placeholders for input, output and dropout
        self.input_x1 = tf.placeholder(tf.int32, [None, sequence_length], name="input_x1")
        self.input_x2 = tf.placeholder(tf.int32, [None, sequence_length], name="input_x2")
        self.input_y = tf.placeholder(tf.float32, [None], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        # Keeping track of l2 regularization loss (optional)
        l2_loss = tf.constant(0.0, name="l2_loss")
        # Embedding layer (shared by both branches of the siamese pair)
        with tf.name_scope("embedding"):
            self.W = tf.Variable(
                tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
                trainable=True,name="W")
            self.embedded_chars1 = tf.nn.embedding_lookup(self.W, self.input_x1)
            #self.embedded_chars_expanded1 = tf.expand_dims(self.embedded_chars1, -1)
            self.embedded_chars2 = tf.nn.embedding_lookup(self.W, self.input_x2)
            #self.embedded_chars_expanded2 = tf.expand_dims(self.embedded_chars2, -1)
        # Create a convolution + maxpool layer for each filter size
        with tf.name_scope("output"):
            self.out1=self.BiRNN(self.embedded_chars1, self.dropout_keep_prob, "side1", embedding_size, sequence_length)
            self.out2=self.BiRNN(self.embedded_chars2, self.dropout_keep_prob, "side2", embedding_size, sequence_length)
            # normalized euclidean distance between the two branch outputs
            self.distance = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(self.out1,self.out2)),1,keep_dims=True))
            self.distance = tf.div(self.distance, tf.add(tf.sqrt(tf.reduce_sum(tf.square(self.out1),1,keep_dims=True)),tf.sqrt(tf.reduce_sum(tf.square(self.out2),1,keep_dims=True))))
            self.distance = tf.reshape(self.distance, [-1], name="distance")
        with tf.name_scope("loss"):
            self.loss = self.contrastive_loss(self.input_y,self.distance, batch_size)
        #### Accuracy computation is outside of this class.
        with tf.name_scope("accuracy"):
            self.temp_sim = tf.subtract(tf.ones_like(self.distance),tf.rint(self.distance), name="temp_sim") #auto threshold 0.5
            correct_predictions = tf.equal(self.temp_sim, self.input_y)
            self.accuracy=tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

View File

@@ -1,12 +0,0 @@
# import scipy.signal as sg
# import pysndfile.sndio as snd
#
# snd_data,samples,_ = snd.read('./outputs/sunflowers-Alex-150-normal-589.aiff')
# samples_per_seg = 3*int(samples*150/(3*1000))
# # samples/(len(snd_data)*1000.0)
# len(snd_data)
# samples_per_seg/2
#
# len(sg.spectrogram(snd_data,nperseg=samples_per_seg,noverlap=samples_per_seg/3)[2])
#
# from spectro_gen import generate_aiff_spectrogram

View File

@@ -1,110 +0,0 @@
#!/usr/bin/env python
#coding: utf-8
""" This work is licensed under a Creative Commons Attribution 3.0 Unported License.
Frank Zalkow, 2012-2013
http://www.frank-zalkow.de/en/code-snippets/create-audio-spectrograms-with-python.html?i=1
"""
# %matplotlib inline
import numpy as np
from matplotlib import pyplot as plt
from pysndfile import sndio as snd
from numpy.lib import stride_tricks
""" short time fourier transform of audio signal """
def stft(sig, frameSize, overlapFac=0.5, window=np.hanning):
    """Short-time Fourier transform of an audio signal.

    Args:
        sig: 1-D sample array.
        frameSize: window length in samples; floats are accepted and
            truncated to int (callers pass samplerate*150/1000, which is a
            float under Python 3 true division and previously broke
            np.zeros/np.hanning).
        overlapFac: fraction of the frame shared between adjacent frames.
        window: window function applied to each frame.
    Returns:
        2-D complex array of shape (n_frames, frameSize//2 + 1).
    """
    frameSize = int(frameSize)
    win = window(frameSize)
    hopSize = int(frameSize - np.floor(overlapFac * frameSize))
    # zeros at beginning (thus center of 1st window should be for sample nr. 0)
    count = int(np.floor(frameSize/2.0))
    samples = np.append(np.zeros(count), sig)
    # cols for windowing
    cols = int(np.ceil( (len(samples) - frameSize) / float(hopSize)) + 1)
    # zeros at end (thus samples can be fully covered by frames)
    samples = np.append(samples, np.zeros(frameSize))
    # overlapping frame view -> copy, then window each frame in place
    frames = stride_tricks.as_strided(samples, shape=(cols, frameSize), strides=(samples.strides[0]*hopSize, samples.strides[0])).copy()
    frames *= win
    return np.fft.rfft(frames)
""" scale frequency axis logarithmically """
def logscale_spec(spec, sr=44100, factor=20.):
    """Rescale the frequency axis of *spec* logarithmically.

    Frequency bins are merged according to a power-law mapping controlled by
    *factor*. Returns (newspec, freqs): the rebinned complex spectrogram and
    the mean center frequency of each new bin.
    """
    n_time, n_freq = np.shape(spec)
    # power-law bin mapping, normalized to cover [0, n_freq-1], deduplicated
    scale = np.linspace(0, 1, n_freq) ** factor
    scale *= (n_freq-1)/max(scale)
    scale = np.unique(np.round(scale)).astype(np.uint32)
    n_bins = len(scale)
    last = n_bins - 1
    # merge original bins into the new (coarser) bins by summation
    newspec = np.complex128(np.zeros([n_time, n_bins]))
    for i in range(n_bins):
        if i == last:
            newspec[:,i] = np.sum(spec[:,scale[i]:], axis=1)
        else:
            newspec[:,i] = np.sum(spec[:,scale[i]:scale[i+1]], axis=1)
    # center frequency of each new bin = mean of the merged bins' frequencies
    allfreqs = np.abs(np.fft.fftfreq(n_freq*2, 1./sr)[:n_freq+1])
    freqs = [
        np.mean(allfreqs[scale[i]:]) if i == last else np.mean(allfreqs[scale[i]:scale[i+1]])
        for i in range(n_bins)
    ]
    return newspec, freqs
""" generate spectrogram for aiff audio with 150ms windows and 50ms overlap"""
def generate_aiff_spectrogram(audiopath):
    """Generate a log-frequency dB spectrogram for an aiff file.

    Uses 150 ms analysis windows with 1/3 overlap. The frame size is
    truncated to int: ``samplerate*150/1000`` is a float under Python 3
    true division, which previously broke the window/buffer allocation
    inside stft.
    """
    samples,samplerate,_ = snd.read(audiopath)
    # samplerate, samples = wav.read(audiopath)
    frame_size = int(samplerate*150/1000)
    s = stft(samples, frame_size,1.0/3)
    sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
    # amplitude -> dB; NOTE(review): 10e-6 == 1e-5 — confirm the intended reference wasn't 1e-6
    ims = 20.*np.log10(np.abs(sshow)/10e-6)
    return ims
""" plot spectrogram"""
def plotstft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):
    """Render (show or save) the log-frequency dB spectrogram of an audio file.

    Args:
        audiopath: path to the audio file readable by pysndfile.
        binsize: only used for the x-axis tick formula below, NOT for the
            actual STFT frame size — see NOTE.
        plotpath: if given, save the figure there instead of showing it.
        colormap: matplotlib colormap name.
    """
    samples,samplerate,_ = snd.read(audiopath)
    # samplerate, samples = wav.read(audiopath)
    # s = stft(samples, binsize)
    # NOTE(review): samplerate*150/1000 is a float under Python 3 true
    # division — stft's internal allocations may reject it; confirm.
    s = stft(samples, samplerate*150/1000,1.0/3)
    sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
    ims = 20.*np.log10(np.abs(sshow)/10e-6) # amplitude to decibel
    timebins, freqbins = np.shape(ims)
    # import pdb;pdb.set_trace()
    plt.figure(figsize=(15, 7.5))
    plt.imshow(np.transpose(ims), origin="lower", aspect="auto", cmap=colormap, interpolation="none")
    plt.colorbar()
    plt.xlabel("time (s)")
    plt.ylabel("frequency (hz)")
    plt.xlim([0, timebins-1])
    plt.ylim([0, freqbins])
    xlocs = np.float32(np.linspace(0, timebins-1, 5))
    # NOTE(review): tick formula offsets by 0.5*binsize although the STFT above
    # used a different frame size — time labels may be slightly off; confirm.
    plt.xticks(xlocs, ["%.02f" % l for l in ((xlocs*len(samples)/timebins)+(0.5*binsize))/samplerate])
    ylocs = np.int16(np.round(np.linspace(0, freqbins-1, 10)))
    plt.yticks(ylocs, ["%.02f" % freq[i] for i in ylocs])
    if plotpath:
        plt.savefig(plotpath, bbox_inches="tight")
    else:
        plt.show()
    plt.clf()
if __name__ == '__main__':
    # Demo: render spectrograms for a few sample recordings.
    plotstft('./outputs/sunflowers-Alex-150-normal-589.aiff')
    plotstft('./outputs/sunflowers-Alex-180-normal-4763.aiff')
    plotstft('./outputs/sunflowers-Victoria-180-normal-870.aiff')
    plotstft('./outputs/sunflowers-Fred-180-phoneme-9733.aiff')
    plotstft('./outputs/sunflowers-Fred-180-normal-6515.aiff')

394
speech_data.py Normal file
View File

@@ -0,0 +1,394 @@
import pandas as pd
from speech_tools import *
from speech_pitch import *
# import dask as dd
# import dask.dataframe as ddf
import tensorflow as tf
from tensorflow.python.ops import data_flow_ops
import numpy as np
from speech_spectrum import generate_aiff_spectrogram,generate_sample_spectrogram
from speech_similar import segmentable_phoneme
from sklearn.model_selection import train_test_split
import os,shutil
import random
import csv
import gc
import pickle
import itertools
from tqdm import tqdm
def siamese_pairs(rightGroup, wrongGroup):
    """Build sample pairs for siamese training from two DataFrames of samples.

    Returns (same_pairs, diff_pairs): up to 32 shuffled (right, right)
    pairs and up to 32 shuffled (right, wrong) pairs. Cross-variant pairs
    whose phoneme strings are identical are discarded.
    """
    rights = [row for (_, row) in rightGroup.iterrows()]
    wrongs = [row for (_, row) in wrongGroup.iterrows()]

    def _keep(s1, s2):
        # drop pairs that differ in variant yet share the same phonemes
        if s1['variant'] != s2['variant'] and s1['phonemes'] == s2['phonemes']:
            return False
        return True

    right_wrong = [(r, w) for w in wrongs for r in rights if _keep(r, w)]
    right_right = [pair for pair in itertools.combinations(rights, 2) if _keep(*pair)]
    random.shuffle(right_wrong)
    random.shuffle(right_right)
    return right_right[:32], right_wrong[:32]
def seg_siamese_pairs(rightGroup, wrongGroup):
    """Like siamese_pairs, but pairs per-phoneme segment spectrograms.

    For each valid sample pair whose segment counts match their phoneme
    counts, extracts one spectrogram per phoneme segment and pairs them
    position-by-position; pairs with identical phonemes go to the
    "same" list, others to the "different" list. Returns up to 32 of each.
    """
    group1 = [r for (i, r) in rightGroup.iterrows()]
    group2 = [r for (i, r) in wrongGroup.iterrows()]
    rightWrongPairs = [(g1, g2) for g2 in group2 for g1 in group1]#+[(g2, g1) for g2 in group2 for g1 in group1]
    rightRightPairs = [i for i in itertools.combinations(group1, 2)]#+[i for i in itertools.combinations(group2, 2)]
    def filter_criteria(s1,s2):
        # drop cross-variant pairs that share the same phoneme string
        same = s1['variant'] == s2['variant']
        phon_same = s1['phonemes'] == s2['phonemes']
        voice_diff = s1['voice'] != s2['voice']
        if not same and phon_same:
            return False
        # if same and not voice_diff:
        #     return False
        return True
    validRWPairs = [i for i in rightWrongPairs if filter_criteria(*i)]
    validRRPairs = [i for i in rightRightPairs if filter_criteria(*i)]
    random.shuffle(validRWPairs)
    random.shuffle(validRRPairs)
    rrPhonePairs = []
    rwPhonePairs = []
    def compute_seg_spec(s1,s2):
        # only use sample pairs whose segment counts match the phoneme count
        phon_count = len(s1['parsed_phoneme'])
        seg1_count = len(s1['segments'].index)
        seg2_count = len(s2['segments'].index)
        if phon_count == seg1_count and seg2_count == phon_count:
            s1nd,s2nd = pm_snd(s1['file_path']),pm_snd(s2['file_path'])
            segs1 = [tuple(x) for x in s1['segments'][['start','end']].values]
            segs2 = [tuple(x) for x in s2['segments'][['start','end']].values]
            s1_cp = pd.Series(s1)
            s2_cp = pd.Series(s2)
            # walk both samples' phonemes and segment boundaries in lockstep
            pp12 = zip(s1['parsed_phoneme'],s2['parsed_phoneme'],segs1,segs2)
            for (p1,p2,(s1s,s1e),(s2s,s2e)) in pp12:
                spc1 = generate_sample_spectrogram(s1nd.extract_part(s1s,s1e).values)
                spc2 = generate_sample_spectrogram(s2nd.extract_part(s2s,s2e).values)
                s1_cp['spectrogram'] = spc1
                s2_cp['spectrogram'] = spc2
                # import pdb; pdb.set_trace()
                if repr(p1) == repr(p2):
                    rrPhonePairs.append((s1_cp,s2_cp))
                else:
                    rwPhonePairs.append((s1_cp,s2_cp))
    for (s1,s2) in validRRPairs:
        compute_seg_spec(s1,s2)
    for (s1,s2) in validRWPairs:
        compute_seg_spec(s1,s2)
    return rrPhonePairs[:32],rwPhonePairs[:32]
    # return rightRightPairs[:10],rightWrongPairs[:10]
    # return
    # validRRPairs[:8],validRWPairs[:8]
def _float_feature(value):
    # Wrap a flat sequence of floats as a tf.train.Feature (FloatList).
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))
def _int64_feature(value):
    # Wrap a flat sequence of ints as a tf.train.Feature (Int64List).
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
def _bytes_feature(value):
    # Wrap a sequence of byte strings as a tf.train.Feature (BytesList).
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))
def create_spectrogram_tfrecords(audio_group='audio',sample_count=0,train_test_ratio=0.1):
    '''
    Build train/test tfrecord files of siamese spectrogram pairs for a group.
    http://warmspringwinds.github.io/tensorflow/tf-slim/2016/12/21/tfrecords-guide/
    http://www.machinelearninguru.com/deep_learning/tensorflow/basics/tfrecord/tfrecord.html
    '''
    audio_samples = pd.read_csv( './outputs/' + audio_group + '.fixed.csv',index_col=0)
    audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
    # counters shared (via nonlocal) with write_samples below
    n_records,n_spec,n_features = 0,0,0
    def write_samples(wg,sample_name):
        # serialize spectrogram pairs for one split ('train' or 'test')
        word_group_prog = tqdm(wg,desc='Computing spectrogram')
        record_file = './outputs/{}.{}.tfrecords'.format(audio_group,sample_name)
        writer = tf.python_io.TFRecordWriter(record_file)
        for (w, word_group) in word_group_prog:
            word_group_prog.set_postfix(word=w,sample_name=sample_name)
            g = word_group.reset_index()
            # g['spectrogram'] = apply_by_multiprocessing(g['file_path'],pitch_array)
            g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram)
            # g['spectrogram'] = apply_by_multiprocessing(g['file_path'],compute_mfcc)
            # 'low' variant counts as right pronunciation, 'medium' as wrong
            sample_right = g.loc[g['variant'] == 'low']
            sample_wrong = g.loc[g['variant'] == 'medium']
            same, diff = siamese_pairs(sample_right, sample_wrong)
            # one-hot labels: [0,1] = same pair, [1,0] = different pair
            groups = [([0,1],same),([1,0],diff)]
            for (output,group) in groups:
                group_prog = tqdm(group,desc='Writing Spectrogram')
                for sample1,sample2 in group_prog:
                    group_prog.set_postfix(output=output
                        ,var1=sample1['variant']
                        ,var2=sample2['variant'])
                    spectro1,spectro2 = sample1['spectrogram'],sample2['spectrogram']
                    spec_n1,spec_n2 = spectro1.shape[0],spectro2.shape[0]
                    spec_w1,spec_w2 = spectro1.shape[1],spectro2.shape[1]
                    spec1,spec2 = spectro1.reshape(-1),spectro2.reshape(-1)
                    nonlocal n_spec,n_records,n_features
                    n_spec = max([n_spec,spec_n1,spec_n2])  # max frames seen, for padding
                    n_features = spec_w1
                    n_records+=1
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'word': _bytes_feature([w.encode('utf-8')]),
                            'phoneme1': _bytes_feature([sample1['phonemes'].encode('utf-8')]),
                            'phoneme2': _bytes_feature([sample2['phonemes'].encode('utf-8')]),
                            'voice1': _bytes_feature([sample1['voice'].encode('utf-8')]),
                            'voice2': _bytes_feature([sample2['voice'].encode('utf-8')]),
                            'language': _bytes_feature([sample1['language'].encode('utf-8')]),
                            'rate1':_int64_feature([sample1['rate']]),
                            'rate2':_int64_feature([sample2['rate']]),
                            'variant1': _bytes_feature([sample1['variant'].encode('utf-8')]),
                            'variant2': _bytes_feature([sample2['variant'].encode('utf-8')]),
                            'file1': _bytes_feature([sample1['file'].encode('utf-8')]),
                            'file2': _bytes_feature([sample2['file'].encode('utf-8')]),
                            'spec1':_float_feature(spec1),
                            'spec2':_float_feature(spec2),
                            'spec_n1':_int64_feature([spec_n1]),
                            'spec_w1':_int64_feature([spec_w1]),
                            'spec_n2':_int64_feature([spec_n2]),
                            'spec_w2':_int64_feature([spec_w2]),
                            'output':_int64_feature(output)
                        }
                    ))
                    writer.write(example.SerializeToString())
                group_prog.close()
        word_group_prog.close()
        writer.close()
    word_groups = [i for i in audio_samples.groupby('word')]
    # sample_count > 0 limits how many word groups are used
    wg_sampled = reservoir_sample(word_groups,sample_count) if sample_count > 0 else word_groups
    tr_audio_samples,te_audio_samples = train_test_split(wg_sampled,test_size=train_test_ratio)
    write_samples(tr_audio_samples,'train')
    write_samples(te_audio_samples,'test')
    # persist padding sizes / record count for the reader side
    const_file = os.path.join('./outputs',audio_group+'.constants')
    pickle.dump((n_spec,n_features,n_records),open(const_file,'wb'))
def read_siamese_tfrecords_generator(audio_group='audio',batch_size=32,test_size=0):
    """Build a siamese-pair training generator plus a one-shot test set.

    Returns (record_generator, test_inputs, test_outputs, copy_read_consts);
    record_generator yields ([left_batch, right_batch], labels) forever.
    """
    records_file = os.path.join('./outputs',audio_group+'.train.tfrecords')
    input_pairs = []
    output_class = []
    const_file = os.path.join('./outputs',audio_group+'.constants')
    # padding sizes / record count written by create_spectrogram_tfrecords
    (n_spec,n_features,n_records) = pickle.load(open(const_file,'rb'))
    def copy_read_consts(dest_dir):
        # copy the constants next to the model so inference can reuse them
        shutil.copy2(const_file,dest_dir+'/constants.pkl')
        return (n_spec,n_features,n_records)
    # @threadsafe_iter
    def record_generator():
        # infinite generator suitable for keras fit_generator
        print('reading tfrecords({}-train)...'.format(audio_group))
        input_data = []
        output_data = []
        while True:
            record_iterator,records_count = record_generator_count(records_file)
            #tqdm(enumerate(record_iterator),total=records_count)
            #enumerate(record_iterator)
            for (i,string_record) in enumerate(record_iterator):
                example = tf.train.Example()
                example.ParseFromString(string_record)
                spec_n1 = example.features.feature['spec_n1'].int64_list.value[0]
                spec_n2 = example.features.feature['spec_n2'].int64_list.value[0]
                spec_w1 = example.features.feature['spec_w1'].int64_list.value[0]
                spec_w2 = example.features.feature['spec_w2'].int64_list.value[0]
                spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1)
                spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2)
                # pad both sides of the pair to the collection-wide max frames
                p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec)
                input_data.append(np.asarray([p_spec1,p_spec2]))
                output = example.features.feature['output'].int64_list.value
                output_data.append(np.asarray(output))
                # flush a batch when full, or at the final record (partial batch)
                if len(input_data) == batch_size or i == n_records-1:
                    input_arr = np.asarray(input_data)
                    output_arr = np.asarray(output_data)
                    yield ([input_arr[:, 0], input_arr[:, 1]],output_arr)
                    input_data = []
                    output_data = []
    # Read test in one-shot
    print('reading tfrecords({}-test)...'.format(audio_group))
    te_records_file = os.path.join('./outputs',audio_group+'.test.tfrecords')
    te_re_iterator,te_n_records = record_generator_count(te_records_file)
    # 0 (or anything larger than the file) means "use every test record"
    test_size = min([test_size,te_n_records]) if test_size > 0 else te_n_records
    input_data = np.zeros((test_size,2,n_spec,n_features))
    output_data = np.zeros((test_size,2))
    random_samples = enumerate(reservoir_sample(te_re_iterator,test_size))
    for (i,string_record) in tqdm(random_samples,total=test_size):
        example = tf.train.Example()
        example.ParseFromString(string_record)
        spec_n1 = example.features.feature['spec_n1'].int64_list.value[0]
        spec_n2 = example.features.feature['spec_n2'].int64_list.value[0]
        spec_w1 = example.features.feature['spec_w1'].int64_list.value[0]
        spec_w2 = example.features.feature['spec_w2'].int64_list.value[0]
        spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1)
        spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2)
        p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec)
        input_data[i] = np.asarray([p_spec1,p_spec2])
        output = example.features.feature['output'].int64_list.value
        output_data[i] = np.asarray(output)
    return record_generator,input_data,output_data,copy_read_consts
def audio_samples_word_count(audio_group='audio'):
    """Number of distinct words in the audio group's index CSV."""
    samples = pd.read_csv( './outputs/' + audio_group + '.csv')
    word_groups = samples.groupby(samples['word'])
    return len(word_groups)
def record_generator_count(records_file):
    """Return a fresh iterator over *records_file* together with its record count.

    The file is scanned once to count the records, then a brand-new iterator
    is opened so the caller reads from the beginning.
    """
    count = sum(1 for _ in tf.python_io.tf_record_iterator(path=records_file))
    fresh_iterator = tf.python_io.tf_record_iterator(path=records_file)
    return fresh_iterator, count
def fix_csv(audio_group='audio'):
    """Repair the group's samples CSV and drop rows whose audio file is missing.

    Keeps only rows with the expected 7 columns, rewrites them to
    <audio_group>.fixed.csv, then filters that file down to rows whose
    audio file exists on disk. Uses context managers so the CSV handles
    are always closed (the previous version leaked both).
    """
    with open('./outputs/' + audio_group + '.csv','r') as raw_csv:
        audio_csv_data = [line.strip().split(',') for line in raw_csv]
    # keep only well-formed rows (7 comma-separated fields)
    proper_rows = [row for row in audio_csv_data if len(row) == 7]
    with open('./outputs/' + audio_group + '.fixed.csv','w') as fixed_csv:
        fixed_csv_w = csv.writer(fixed_csv, quoting=csv.QUOTE_MINIMAL)
        fixed_csv_w.writerows(proper_rows)
    audio_samples = pd.read_csv( './outputs/' + audio_group + '.fixed.csv'
        , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file'])
    audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
    # existence check fanned out across processes
    audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists)
    audio_samples = audio_samples[audio_samples['file_exists'] == True]
    audio_samples = audio_samples.drop(['file_path','file_exists'],axis=1).reset_index(drop=True)
    audio_samples.to_csv('./outputs/' + audio_group + '.fixed.csv')
def convert_old_audio():
    """Migrate the legacy ./outputs/audio.csv.old index to the new column layout,
    writing ./outputs/audio_new.csv (no header, no index)."""
    audio_samples = pd.read_csv( './outputs/audio.csv.old'
        , names=['word', 'voice', 'rate', 'variant', 'file'])
    # columns missing from the old format get placeholder values
    audio_samples['phonemes'] = 'unknown'
    audio_samples['language'] = 'en-US'
    # rename legacy variants: 'normal' -> 'low', 'phoneme' -> 'medium'
    audio_samples.loc[audio_samples['variant'] == 'normal','variant'] = 'low'
    audio_samples.loc[audio_samples['variant'] == 'phoneme','variant'] = 'medium'
    # reorder columns to match the new schema
    audio_samples = audio_samples[['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file']]
    audio_samples.to_csv('./outputs/audio_new.csv',index=False,header=False)
def generate_sppas_trans(audio_group='story_words.all'):
    """Write a word transcription for every audio sample in the group via
    transribe_audio_text (name suggests SPPAS alignment input — confirm)."""
    # audio_group='story_words.all'
    audio_samples = pd.read_csv( './outputs/' + audio_group + '.fixed.csv',index_col=0)
    audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
    # audio_samples = audio_samples.head(5)
    rows = tqdm(audio_samples.iterrows(),total = len(audio_samples.index)
        , desc='Transcribing Words ')
    for (i,row) in rows:
        # len(audio_samples.iterrows())
        # (i,row) = next(audio_samples.iterrows())
        rows.set_postfix(word=row['word'])
        transribe_audio_text(row['file_path'],row['word'])
    rows.close()
def create_seg_phonpair_tfrecords(audio_group='story_words.all',sample_count=0,train_test_ratio=0.1):
    """Write train/test TFRecord files of same/different phoneme-pair examples.

    Reads the fixed CSV index for ``audio_group``, keeps only the 'low'
    (correct) and 'medium' (phoneme-perturbed) variants, pairs samples per
    word via ``seg_siamese_pairs``, and serializes each pair's spectrograms
    plus metadata. Finally pickles (n_spec, n_features, n_records) so the
    training script can size its inputs.

    audio_group: basename of the CSV/output files under ./outputs/.
    sample_count: if > 0, reservoir-sample this many whole word groups.
    train_test_ratio: fraction of word groups routed to the test file.
    """
    audio_samples = pd.read_csv( './outputs/' + audio_group + '.fixed.csv',index_col=0)
    audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
    # Only the correct ('low') and perturbed ('medium') variants pair up.
    audio_samples = audio_samples[(audio_samples['variant'] == 'low') | (audio_samples['variant'] == 'medium')]
    audio_samples['parsed_phoneme'] = apply_by_multiprocessing(audio_samples['phonemes'],segmentable_phoneme)
    # audio_samples['sound'] = apply_by_multiprocessing(audio_samples['file_path'],pm_snd)
    # read_seg_file(audio_samples.iloc[0]['file_path'])
    audio_samples['segments'] = apply_by_multiprocessing(audio_samples['file_path'],read_seg_file)
    # Counters shared with write_samples via `nonlocal`; pickled at the end.
    n_records,n_spec,n_features = 0,0,0
    def write_samples(wg,sample_name):
        # wg: iterable of (word, DataFrame) groups; sample_name: 'train'/'test'.
        word_group_prog = tqdm(wg,desc='Computing PhonPair spectrogram')
        record_file = './outputs/{}.{}.tfrecords'.format(audio_group,sample_name)
        writer = tf.python_io.TFRecordWriter(record_file)
        for (w, word_group) in word_group_prog:
            word_group_prog.set_postfix(word=w,sample_name=sample_name)
            g = word_group.reset_index()
            # g['spectrogram'] = apply_by_multiprocessing(g['file_path'],pitch_array)
            # g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram)
            # g['spectrogram'] = apply_by_multiprocessing(g['file_path'],compute_mfcc)
            # NOTE(review): every 'spectrogram' computation above is commented
            # out, yet sample1['spectrogram'] is read below — confirm this
            # does not raise KeyError at runtime.
            sample_right = g.loc[g['variant'] == 'low']
            sample_wrong = g.loc[g['variant'] == 'medium']
            same, diff = seg_siamese_pairs(sample_right, sample_wrong)
            # One-hot labels: [0,1] for matching pairs, [1,0] for mismatched.
            groups = [([0,1],same),([1,0],diff)]
            for (output,group) in groups:
                group_prog = tqdm(group,desc='Writing Spectrogram')
                for sample1,sample2 in group_prog:
                    group_prog.set_postfix(output=output
                                           ,var1=sample1['variant']
                                           ,var2=sample2['variant'])
                    spectro1,spectro2 = sample1['spectrogram'],sample2['spectrogram']
                    spec_n1,spec_n2 = spectro1.shape[0],spectro2.shape[0]
                    spec_w1,spec_w2 = spectro1.shape[1],spectro2.shape[1]
                    # Flatten; shapes are stored alongside for reconstruction.
                    spec1,spec2 = spectro1.reshape(-1),spectro2.reshape(-1)
                    nonlocal n_spec,n_records,n_features
                    n_spec = max([n_spec,spec_n1,spec_n2])
                    n_features = spec_w1
                    n_records+=1
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'word': _bytes_feature([w.encode('utf-8')]),
                            'phoneme1': _bytes_feature([sample1['phonemes'].encode('utf-8')]),
                            'phoneme2': _bytes_feature([sample2['phonemes'].encode('utf-8')]),
                            'voice1': _bytes_feature([sample1['voice'].encode('utf-8')]),
                            'voice2': _bytes_feature([sample2['voice'].encode('utf-8')]),
                            'language': _bytes_feature([sample1['language'].encode('utf-8')]),
                            'rate1':_int64_feature([sample1['rate']]),
                            'rate2':_int64_feature([sample2['rate']]),
                            'variant1': _bytes_feature([sample1['variant'].encode('utf-8')]),
                            'variant2': _bytes_feature([sample2['variant'].encode('utf-8')]),
                            'file1': _bytes_feature([sample1['file'].encode('utf-8')]),
                            'file2': _bytes_feature([sample2['file'].encode('utf-8')]),
                            'spec1':_float_feature(spec1),
                            'spec2':_float_feature(spec2),
                            'spec_n1':_int64_feature([spec_n1]),
                            'spec_w1':_int64_feature([spec_w1]),
                            'spec_n2':_int64_feature([spec_n2]),
                            'spec_w2':_int64_feature([spec_w2]),
                            'output':_int64_feature(output)
                        }
                    ))
                    writer.write(example.SerializeToString())
                group_prog.close()
        word_group_prog.close()
        writer.close()
    word_groups = [i for i in audio_samples.groupby('word')]
    # Optional down-sampling of whole word groups, then train/test split.
    wg_sampled = reservoir_sample(word_groups,sample_count) if sample_count > 0 else word_groups
    tr_audio_samples,te_audio_samples = train_test_split(wg_sampled,test_size=train_test_ratio)
    write_samples(tr_audio_samples,'train')
    write_samples(te_audio_samples,'test')
    const_file = os.path.join('./outputs',audio_group+'.constants')
    pickle.dump((n_spec,n_features,n_records),open(const_file,'wb'))
if __name__ == '__main__':
    # Scratchpad of previous data-generation runs; only the phoneme-pair
    # TFRecord generation for the small 5-word test set is currently active.
    # sunflower_pairs_data()
    # create_spectrogram_data()
    # create_spectrogram_data('story_words')
    # create_spectrogram_tfrecords('story_words')
    # create_spectrogram_tfrecords('story_words_test')
    # read_siamese_tfrecords('story_all')
    # read_siamese_tfrecords('story_words_test')
    # padd_zeros_siamese_tfrecords('story_words')
    # fix_csv('story_words')
    # pickle_constants('story_words')
    # create_spectrogram_tfrecords('audio',sample_count=100)
    # create_spectrogram_tfrecords('story_all',sample_count=25)
    # fix_csv('story_words_test')
    # fix_csv('test_5_words')
    # generate_sppas_trans('test_5_words')
    create_seg_phonpair_tfrecords('test_5_words')
    # create_spectrogram_tfrecords('story_words.all',sample_count=0,train_test_ratio=0.1)
    #record_generator_count()
    # create_spectrogram_tfrecords('audio',sample_count=50)
    # read_siamese_tfrecords_generator('audio')
    # padd_zeros_siamese_tfrecords('audio')
    # create_padded_spectrogram()
    # create_speech_pairs_data()
    # print(speech_model_data())

134
speech_model.py Normal file
View File

@@ -0,0 +1,134 @@
from __future__ import absolute_import
from __future__ import print_function
import numpy as np
from speech_data import read_siamese_tfrecords_generator
from keras.models import Model,load_model,model_from_yaml
from keras.layers import Input,Concatenate,Lambda, BatchNormalization, Dropout
from keras.layers import Dense, LSTM, Bidirectional, GRU
from keras.losses import categorical_crossentropy
from keras.utils import to_categorical
from keras.optimizers import RMSprop
from keras.callbacks import TensorBoard, ModelCheckpoint
from keras import backend as K
from keras.utils import plot_model
from speech_tools import create_dir,step_count
def create_base_rnn_network(input_dim):
    """Build the shared feature-extraction tower of the siamese network.

    A stack of three LSTMs (128 -> 64 -> 32 units) with batch
    normalization between the recurrent layers; only the final LSTM
    collapses the time axis into a fixed-size embedding.

    input_dim: (timesteps, features) shape tuple for one spectrogram.
    Returns a keras ``Model`` mapping the input to a 32-dim embedding.
    """
    sequence_in = Input(shape=input_dim)
    hidden = sequence_in
    # Every layer but the last keeps the sequence dimension.
    for units in (128, 64):
        hidden = LSTM(units, return_sequences=True)(hidden)
        hidden = BatchNormalization(momentum=0.98)(hidden)
    embedding = LSTM(32)(hidden)
    return Model(sequence_in, embedding)
def compute_accuracy(y_true, y_pred):
    """Fraction of predictions that match ``y_true`` after thresholding.

    ``y_pred`` is flattened and binarized at 0.5 before comparison, so raw
    sigmoid/softmax scores of any shape are accepted.
    """
    binarized = np.ravel(y_pred) > 0.5
    return np.mean(np.equal(binarized, y_true))
def accuracy(y_true, y_pred):
    """Keras metric: accuracy of ``y_pred`` thresholded at 0.5."""
    thresholded = K.cast(y_pred > 0.5, y_true.dtype)
    return K.mean(K.equal(y_true, thresholded))
def dense_classifier(processed):
    """Head that fuses the two tower embeddings into a 2-way softmax.

    processed: list of embedding tensors, one per siamese branch.
    Returns the softmax output tensor ([different, same] probabilities).
    """
    merged = Concatenate()(processed)
    hidden = Dense(64, activation='relu')(merged)
    hidden = BatchNormalization(momentum=0.98)(hidden)
    hidden = Dense(8, activation='relu')(hidden)
    hidden = BatchNormalization(momentum=0.98)(hidden)
    return Dense(2, activation='softmax')(hidden)
def siamese_model(input_dim):
    """Assemble the full siamese network.

    Two inputs share one base tower; their embeddings feed the dense
    classifier head. Returns ``(model, base_network)`` so the shared
    tower can be inspected or saved separately.
    """
    shared_tower = create_base_rnn_network(input_dim)
    left = Input(shape=input_dim)
    right = Input(shape=input_dim)
    final_output = dense_classifier([shared_tower(left), shared_tower(right)])
    return Model([left, right], final_output), shared_tower
def write_model_arch(mod, mod_file):
    """Serialize a keras model's architecture (not weights) to ``mod_file``.

    mod: model exposing ``to_yaml()``.
    mod_file: destination path for the YAML text.
    """
    # `with` guarantees the handle is closed even if to_yaml() raises,
    # fixing the file-handle leak of the open/write/close original.
    with open(mod_file, 'w') as model_f:
        model_f.write(mod.to_yaml())
def load_model_arch(mod_file):
    """Load a keras model architecture from a YAML file written by
    ``write_model_arch``. Returns an uncompiled, weightless model.
    """
    # Context manager closes the handle even if model_from_yaml raises,
    # fixing the leak in the open/read/close original.
    with open(mod_file, 'r') as model_f:
        return model_from_yaml(model_f.read())
def train_siamese(audio_group = 'audio',resume_weights='',initial_epoch=0):
    """Train the siamese speech model on TFRecords for ``audio_group``.

    audio_group: basename of the TFRecord/constants files under ./outputs.
    resume_weights: optional .h5 weights file to resume training from.
    initial_epoch: epoch number to resume counting at.
    Side effects: writes model plots, the YAML architecture, checkpoints
    and TensorBoard logs under ./models/<group> and ./logs/<group>.
    """
    batch_size = 128
    model_dir = './models/'+audio_group
    create_dir(model_dir)
    log_dir = './logs/'+audio_group
    create_dir(log_dir)
    # Training-batch generator plus one fixed in-memory validation batch.
    tr_gen_fn,te_pairs,te_y,copy_read_consts = read_siamese_tfrecords_generator(audio_group,batch_size=batch_size,test_size=batch_size)
    n_step,n_features,n_records = copy_read_consts(model_dir)
    tr_gen = tr_gen_fn()
    input_dim = (n_step, n_features)
    model,base_model = siamese_model(input_dim)
    plot_model(model,show_shapes=True, to_file=model_dir+'/model.png')
    plot_model(base_model,show_shapes=True, to_file=model_dir+'/base_model.png')
    tb_cb = TensorBoard(
        log_dir=log_dir,
        histogram_freq=1,
        batch_size=32,
        write_graph=True,
        write_grads=True,
        write_images=True,
        embeddings_freq=0,
        embeddings_layer_names=None,
        embeddings_metadata=None)
    # NOTE(review): the backslash continues the string literal, so the
    # pattern is '...{val_loss:0.2f}-acc.h5'; the filename embeds val_loss
    # although the checkpoint monitors 'acc' — confirm which metric is
    # intended.
    cp_file_fmt = model_dir+'/siamese_speech_model-{epoch:02d}-epoch-{val_loss:0.2f}\
-acc.h5'
    cp_cb = ModelCheckpoint(
        cp_file_fmt,
        monitor='acc',
        verbose=0,
        save_best_only=True,
        save_weights_only=True,
        mode='auto',
        period=1)
    # train
    rms = RMSprop()
    model.compile(loss=categorical_crossentropy, optimizer=rms, metrics=[accuracy])
    write_model_arch(model,model_dir+'/siamese_speech_model_arch.yaml')
    epoch_n_steps = step_count(n_records,batch_size)
    if resume_weights != '':
        model.load_weights(resume_weights)
    model.fit_generator(tr_gen
        , epochs=10000
        , steps_per_epoch=epoch_n_steps
        , validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y)
        , max_queue_size=8
        , callbacks=[tb_cb, cp_cb],initial_epoch=initial_epoch)
    model.save(model_dir+'/siamese_speech_model-final.h5')
    # y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
    # te_acc = compute_accuracy(te_y, y_pred)
    # print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
if __name__ == '__main__':
    # Train on the small 5-word sample set produced by speech_samplegen.
    train_siamese('test_5_words')

158
speech_pitch.py Normal file
View File

@@ -0,0 +1,158 @@
import parselmouth as pm
from pysndfile import sndio as snd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pyaudio as pa
sns.set() # Use seaborn's default style to make graphs more pretty
def pm_snd(sample_file):
    """Load an audio file and wrap it as a parselmouth Sound object."""
    # sample_file = 'inputs/self-apple/apple-low1.aiff'
    data, rate, _ = snd.read(sample_file)
    return pm.Sound(values=data, sampling_frequency=rate)
def pitch_array(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
    """Pitch track of the given audio file as a numpy matrix."""
    sound = pm_snd(sample_file)
    return sound.to_pitch().to_matrix().as_array()
def intensity_array(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
    """Feature matrix of the audio file as a numpy array.

    BUG FIX: the original returned ``sample_pitch.to_matrix().as_array()``
    where ``sample_pitch`` was never defined (copy-paste from
    ``pitch_array``), so every call raised NameError. Return the computed
    object's array instead.
    NOTE(review): despite the name, the feature extracted is ``to_mfcc``
    — confirm whether ``to_intensity`` was intended.
    """
    sample_sound = pm_snd(sample_file)
    sample_intensity = sample_sound.to_mfcc()
    return sample_intensity.as_array()
def compute_mfcc(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
    """MFCC matrix for the given audio file."""
    return pm_snd(sample_file).to_mfcc().to_array()
def compute_formants(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
    """Formant x-bins (Burg method) for the given audio file."""
    formants = pm_snd(sample_file).to_formant_burg()
    return formants.x_bins()
def draw_spectrogram(spectrogram, dynamic_range=70):
    """Plot a spectrogram in dB onto the current matplotlib axes.

    dynamic_range: dB span below the peak that remains visible.
    """
    grid_x, grid_y = spectrogram.x_grid(), spectrogram.y_grid()
    power_db = 10 * np.log10(spectrogram.values.T)
    plt.pcolormesh(grid_x, grid_y, power_db,
                   vmin=power_db.max() - dynamic_range, cmap='afmhot')
    plt.ylim([spectrogram.ymin, spectrogram.ymax])
    plt.xlabel("time [s]")
    plt.ylabel("frequency [Hz]")
def draw_intensity(intensity):
    """Overlay an intensity curve on the current plot."""
    xs, ys = intensity.xs(), intensity.values
    plt.plot(xs, ys, linewidth=3, color='w')  # white halo for contrast
    plt.plot(xs, ys, linewidth=1)
    plt.grid(False)
    plt.ylim(0)
    plt.ylabel("intensity [dB]")
def draw_pitch(pitch):
    """Overlay the pitch contour, hiding unvoiced (zero-Hz) samples."""
    contour = pitch.to_matrix().values
    contour[contour == 0] = np.nan  # NaN gaps instead of zero-valued lines
    xs = pitch.xs()
    plt.plot(xs, contour, linewidth=3, color='w')  # white halo for contrast
    plt.plot(xs, contour, linewidth=1)
    plt.grid(False)
    plt.ylim(0, pitch.ceiling)
    plt.ylabel("pitch [Hz]")
def draw_formants(formant):
    """Overlay a formant contour, hiding zero-valued samples.

    BUG FIX: the original computed ``formant_values`` but then plotted
    ``pitch_values`` / ``pitch`` — names undefined in this scope (copy-
    paste from ``draw_pitch``), so every call raised NameError.
    NOTE(review): the y-limit is left open-ended since a Formant object's
    ``ceiling`` attribute is not established here — confirm the intended
    upper bound.
    """
    formant_values = formant.to_matrix().values
    formant_values[formant_values == 0] = np.nan
    plt.plot(formant.xs(), formant_values, linewidth=3, color='w')
    plt.plot(formant.xs(), formant_values, linewidth=1)
    plt.grid(False)
    plt.ylim(0)
    plt.ylabel("Formants [val]")
def plot_sample_raw(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
    """Plot the raw waveform (amplitude vs. time) of an audio file."""
    sound = pm_snd(sample_file)
    plt.figure()
    plt.plot(sound.xs(), sound.values)
    plt.xlim([sound.xmin, sound.xmax])
    plt.xlabel("time [s]")
    plt.ylabel("amplitude")
    plt.show()
def plot_file_intensity(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
    """Load a file and show its spectrogram with the intensity overlay."""
    plot_sample_intensity(pm_snd(sample_file))
def plot_sample_intensity(snd_d):
    """Show a spectrogram of ``snd_d`` with its intensity curve overlaid."""
    spectrogram = snd_d.to_spectrogram()
    intensity = snd_d.to_intensity()
    plt.figure()
    draw_spectrogram(spectrogram)
    plt.twinx()  # second y-axis for the dB scale
    draw_intensity(intensity)
    plt.xlim([snd_d.xmin, snd_d.xmax])
    plt.show()
def plot_file_pitch(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
    """Load a file and show its spectrogram with the pitch overlay."""
    plot_sample_pitch(pm_snd(sample_file))
def plot_sample_pitch(snd_d, phons=[]):
    """Show spectrogram + pitch contour for ``snd_d``.

    phons: optional (time, label) pairs; each gets a vertical marker line
    with a text label at its time position.
    """
    pitch = snd_d.to_pitch()
    spectrogram = snd_d.to_spectrogram(window_length=0.03, maximum_frequency=8000)
    plt.figure()
    draw_spectrogram(spectrogram)
    plt.twinx()  # second y-axis for the Hz scale
    draw_pitch(pitch)
    for boundary, label in phons:
        plt.axvline(x=boundary)
        plt.text(boundary, -1, label)
    plt.xlim([snd_d.xmin, snd_d.xmax])
    plt.show()
def play_sound(samplerate=22050):
    """Open a stereo PyAudio output stream and return playback closures.

    Returns (sample_player, close_player): ``sample_player(snd_sample)``
    writes a parselmouth Sound's first channel (duplicated into both
    output channels) to the stream; ``close_player()`` releases the
    stream and terminates PyAudio.
    NOTE(review): the __main__ demo below passes a Sound object where
    this sample-rate parameter is expected and discards the returned
    closures — confirm the intended call pattern.
    """
    #snd_sample = pm_snd('outputs/test/a_warm_smile_and_a_good_heart-1917.aiff')
    p_oup = pa.PyAudio()
    stream = p_oup.open(
        format=pa.paFloat32,
        channels=2,
        rate=samplerate,
        output=True)
    def sample_player(snd_sample=None):
        # Duplicate the first channel into L/R and write raw float32 bytes.
        samples = snd_sample.as_array()[:,0]
        one_channel = np.asarray([samples, samples]).T.reshape(-1)
        audio_data = one_channel.astype(np.float32).tobytes()
        stream.write(audio_data)
    def close_player():
        stream.close()
        p_oup.terminate()
    return sample_player,close_player
# snd_part = snd_d.extract_part(from_time=0.9, preserve_times=True)
# plt.figure()
# plt.plot(snd_part.xs(), snd_part.values, linewidth=0.5)
# plt.xlim([snd_part.xmin, snd_part.xmax])
# plt.xlabel("time [s]")
# plt.ylabel("amplitude")
# plt.show()
if __name__ == '__main__':
    # Demo: visualize pitch for generated and self-recorded samples.
    # NOTE(review): play_sound() expects a sample *rate* and returns
    # (player, closer) closures; these calls pass a Sound object and drop
    # the return value, so nothing is actually played — confirm intent.
    plot_file_pitch('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
    plot_file_pitch('outputs/test/a_warm_smile_and_a_good_heart-1917.aiff')
    play_sound(pm_snd('outputs/test/a_warm_smile_and_a_good_heart-1917.aiff'))
    plot_file_pitch('outputs/test/a_wrong_turn-3763.aiff')
    play_sound(pm_snd('outputs/test/a_wrong_turn-3763.aiff'))
    plot_file_pitch('inputs/self/a_wrong_turn-low1.aiff')
    play_sound(pm_snd('inputs/self/a_wrong_turn-low1.aiff'))
    plot_file_pitch('inputs/self/a_wrong_turn-low2.aiff')
    play_sound(pm_snd('inputs/self/a_wrong_turn-low2.aiff'))
    plot_file_pitch('inputs/self/apple-low1.aiff')
    plot_file_pitch('inputs/self/apple-low2.aiff')
    plot_file_pitch('inputs/self/apple-medium1.aiff')

252
speech_samplegen.py Normal file
View File

@@ -0,0 +1,252 @@
import objc
from AppKit import NSSpeechSynthesizer, NSSpeechInputModeProperty
from AppKit import NSSpeechModePhoneme
from Foundation import NSURL
import json
import csv
import random
import os
import re
import subprocess
import time
from tqdm import tqdm
from generate_similar import similar_phoneme_phrase,similar_phrase
from speech_tools import hms_string,create_dir,format_filename,reservoir_sample
# Name of the sample batch being generated; controls both the audio output
# directory and the CSV index file derived below.
OUTPUT_NAME = 'test_5_words'
dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
dest_file = './outputs/' + OUTPUT_NAME + '.csv'
def dest_filename(w, v, r, t):
    """Sanitized .aiff filename for word ``w``, voice ``v``, rate ``r``
    and variant ``t``, with a random numeric suffix for uniqueness."""
    suffix = str(random.randint(0, 10000))
    raw_name = '{}-{}-{}-{}-{}.aiff'.format(w, v, r, t, suffix)
    return format_filename(raw_name)
def dest_path(v, r, n):
    """Return (absolute, relative) paths for file ``n`` under voice ``v``
    and rate ``r`` inside the batch output directory."""
    rel = '{}/{}/{}'.format(v, r, n)
    return (dest_dir + rel), rel
def cli_gen_audio(speech_cmd, rate, voice, out_path):
    # Shell out to the macOS `say` command to synthesize speech_cmd into
    # an audio file at out_path using the given voice and speaking rate.
    # NOTE(review): with a list argv (shell=False) the added single quotes
    # are passed literally to `say` and become part of the spoken text —
    # confirm whether they are intentional.
    subprocess.call(
        ['say', '-v', voice, '-r',
         str(rate), '-o', out_path, "'"+speech_cmd+"'"])
class SynthFile(object):
    """Metadata record for one synthesized audio file."""

    def __init__(self, word, phon, filename, voice, voice_lang, rate, operation):
        super(SynthFile, self).__init__()
        self.word = word
        self.phoneme = phon
        self.filename = filename
        self.voice = voice
        self.voice_lang = voice_lang
        self.rate = rate
        # Stored as `variant`; `operation` is only the constructor arg name.
        self.variant = operation

    def get_json(self):
        """Return a JSON-serializable summary of this record.

        BUG FIX: the original read ``self.operation``, an attribute never
        set (the constructor stores it as ``self.variant``), so this
        method always raised AttributeError.
        """
        return {
            'filename': self.filename,
            'voice': self.voice,
            'rate': self.rate,
            'operation': self.variant
        }

    def get_csv(self):
        """Record as a single CSV line (trailing newline included)."""
        cols = [self.word, self.phoneme, self.voice,
                self.voice_lang, self.rate, self.variant,
                self.filename]
        return ','.join([str(c) for c in cols]) + '\n'

    def get_values(self):
        """Record as a list of strings in CSV column order."""
        cols = [self.word, self.phoneme, self.voice,
                self.voice_lang, self.rate, self.variant,
                self.filename]
        return [str(c) for c in cols]
class SynthVariant(object):
    """One (voice, rate) configuration of the macOS speech synthesizer.

    Holds a plain-text synthesizer and a phoneme-input synthesizer for the
    same voice, and generates low/medium/high audio variants of a text.
    """
    def __init__(self, identifier, voice, lang, rate):
        super(SynthVariant, self).__init__()
        # Text-mode synthesizer, used to derive phonemes from text.
        self.synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
        self.synth.setVolume_(100)
        self.synth.setRate_(rate)
        # Second synthesizer switched into phoneme-input mode.
        self.phone_synth = NSSpeechSynthesizer.alloc().initWithVoice_(
            identifier)
        self.phone_synth.setVolume_(100)
        self.phone_synth.setRate_(rate)
        self.phone_synth.setObject_forProperty_error_(
            NSSpeechModePhoneme, NSSpeechInputModeProperty, None)
        self.identifier = identifier
        self.rate = rate
        self.name = voice
        self.lang = lang
        self.phoneme_capable = self.is_phoneme_capable()
    def __repr__(self):
        return 'Synthesizer[{} - {}]'.format(self.name, self.rate)
    def is_phoneme_capable(self):
        # Probe with a known word: voices that cannot produce phonemes
        # return an empty string.
        orig_phon = self.synth.phonemesFromText_('water')
        return orig_phon != ''
    def generate_audio(self, text, variant):
        """Synthesize ``text`` as the given variant and return a SynthFile.

        'low' speaks the correct phonemes, 'medium' perturbs one vowel's
        stress per word, 'high' replaces each word with a rhyme.
        """
        orig_phon, phoneme, phon_cmd = self.synth.phonemesFromText_(
            text), '', text
        if variant == 'low':
            # self.synth.startSpeakingString_toURL_(word,d_url)
            phoneme = orig_phon
        elif variant == 'medium':
            phoneme = similar_phoneme_phrase(orig_phon)
            phon_cmd = '[[inpt PHON]] ' + phoneme
        elif variant == 'high':
            phoneme = similar_phrase(text)
            phon_cmd = phoneme
        # elif variant == 'long':
        #     if phon != '':
        #         self.phone_synth.startSpeakingString_toURL_(phon,d_url)
        #     else:
        #         self.synth.startSpeakingString_toURL_(word,d_url)
        fname = dest_filename(text, self.name, self.rate, variant)
        d_path, r_path = dest_path(self.name, self.rate, fname)
        # d_url = NSURL.fileURLWithPath_(d_path)
        cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
        return SynthFile(text, phoneme, r_path, self.name, self.lang, self.rate, variant)
    def create_synth_dirs(self):
        # Output folders are only needed for voices that will be used.
        if self.phoneme_capable:
            create_dir(dest_dir + self.name + '/' + str(self.rate))
    @staticmethod
    def voices_for_lang(lang):
        """(identifier, name, language) triples of installed non-neuter
        voices for the given language code."""
        voices_installed = NSSpeechSynthesizer.availableVoices()
        voice_attrs = [
            NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed
        ]
        # sk = [k for k in voice_attrs[0].keys() if k not in [
        #     'VoiceIndividuallySpokenCharacters', 'VoiceSupportedCharacters']]
        # s_attrs = [[v[i] for i in sk] for v in voice_attrs if 'VoiceShowInFullListOnly' in v
        #            and 'VoiceRelativeDesirability' in v]
        return [
            (v['VoiceIdentifier'],
             v['VoiceName'],
             v['VoiceLanguage']) for v in voice_attrs
            if v['VoiceLanguage'] == lang
            and v['VoiceGender'] != 'VoiceGenderNeuter'
        ]
    @classmethod
    def synth_with(cls,voice_params,rate=180):
        # Alternate constructor taking a voices_for_lang() triple.
        identifier,voice,lang = voice_params
        return cls(identifier,voice,lang,rate)
def synth_generator():
    """Build all phoneme-capable synthesizers and return a batch function.

    Instantiates a SynthVariant for every installed en-US voice at each
    rate, keeps the phoneme-capable ones, and returns ``synth_for_words``
    which synthesizes every variant of a word list through them.
    """
    us_voices_ids = SynthVariant.voices_for_lang('en-US')
    voice_rates = [150, 180, 210]  # , 250]
    voice_synths = []
    create_dir(dest_dir)
    for vp in us_voices_ids:
        for r in voice_rates:
            s = SynthVariant.synth_with(vp, r)
            if s.phoneme_capable:
                print('Adding ', s)
                voice_synths.append(s)
            else:
                print('Discarding phoneme incapable ', s)

    def synth_for_words(words, writer):
        """Synthesize low/medium/high variants of every word with every
        synthesizer, passing each resulting SynthFile to ``writer``."""
        start_time = time.time()
        for s in voice_synths:
            s.create_synth_dirs()
            for v in ['low', 'medium', 'high']:
                prog = tqdm(words)
                prog.set_postfix(variant=v, voice=s.name, rate=s.rate)
                # BUG FIX: the original iterated a second, fresh
                # `tqdm(words)` here, so the configured `prog` bar never
                # advanced and two bars were drawn; iterate `prog` itself.
                for w in prog:
                    prog.set_description('Synthesizing text:"{}"'.format(w))
                    synthed = s.generate_audio(w, v)
                    writer(synthed)
                prog.close()
        end_time = time.time()
        time_str = hms_string(end_time - start_time)
        print("It took {} to synthsize all variants.".format(time_str))
    return synth_for_words
def synth_logger(fname, csv_mode=False):
    """Open ``fname`` and return ``(writer, closer)`` callables.

    In CSV mode each ``writer(record)`` call appends one row immediately;
    in JSON mode records are buffered and dumped as a single list when
    the closer runs. Records must expose get_values()/get_json().
    """
    handle = open(fname, 'w')
    row_writer = csv.writer(handle, quoting=csv.QUOTE_MINIMAL)
    buffered = []

    def csv_writer(record):
        row_writer.writerow(record.get_values())

    def json_writer(record):
        buffered.append(record)

    def close_file():
        if not csv_mode:
            json.dump([r.get_json() for r in buffered], handle)
        handle.close()

    return (csv_writer, close_file) if csv_mode else (json_writer, close_file)
def generate_audio_for_text_list(text_list):
    """Synthesize every variant of every text in ``text_list``.

    Results are appended to the batch CSV as they are produced; synthesis
    errors are printed and swallowed so the CSV of completed work is
    still closed properly.
    """
    writer, closer = synth_logger(dest_file, csv_mode=True)
    synth_for_texts = synth_generator()
    try:
        synth_for_texts(text_list, writer)
    except:  # best-effort batch job — report and keep the partial CSV
        import traceback
        import sys
        traceback.print_exc(file=sys.stdout)
    closer()
def generate_audio_for_stories():
    '''
    Generates the audio sample variants for the list of words in the stories
    (every unique text across all stories, processed in sorted order).
    '''
    # story_file = './inputs/all_stories_hs.json'
    stories_data = json.load(open('./inputs/all_stories.json'))
    unique_texts = {t for texts in stories_data.values() for t in texts}
    generate_audio_for_text_list(sorted(unique_texts))
def generate_test_audio_for_stories(sample_count=0):
    '''
    Picks a list of words from the wordlist that are not in story words
    and generates the variants.

    sample_count: if > 0, reservoir-sample that many candidate words;
    otherwise use every candidate.
    '''
    stories_data = json.load(open('./inputs/all_stories_hs.json'))
    story_words = {t[0] for texts in stories_data.values() for t in texts}
    with open('./inputs/wordlist.txt', 'r') as wl:
        word_list = [line.strip('\n_') for line in wl.readlines()]
    # Keep only words the stories never use, long enough to be interesting.
    candidates = [w for w in word_list if w not in story_words and len(w) > 4]
    test_words = reservoir_sample(candidates, sample_count) if sample_count > 0 else candidates
    generate_audio_for_text_list(test_words)
if __name__ == '__main__':
    # Generate variants for 5 random non-story test words.
    generate_test_audio_for_stories(5)
    # generate_audio_for_text_list(['I want to go home','education'])
    # generate_audio_for_stories()

237
speech_segmentgen.py Normal file
View File

@@ -0,0 +1,237 @@
import objc
from AppKit import *
from Foundation import NSURL
from PyObjCTools import AppHelper
from time import time
import os
import sys
import random
import json
import csv
import subprocess
from tqdm import tqdm
from speech_tools import create_dir,format_filename
# The Apple speech synthesizer's phoneme inventory, indexed by the integer
# opcode delivered to speechSynthesizer_willSpeakPhoneme_.
apple_phonemes = [
    '%', '@', 'AE', 'EY', 'AO', 'AX', 'IY', 'EH', 'IH', 'AY', 'IX', 'AA', 'UW',
    'UH', 'UX', 'OW', 'AW', 'OY', 'b', 'C', 'd', 'D', 'f', 'g', 'h', 'J', 'k',
    'l', 'm', 'n', 'N', 'p', 'r', 's', 'S', 't', 'T', 'v', 'w', 'y', 'z', 'Z'
]
# Segment batch name; controls the audio output directory and CSV path.
OUTPUT_NAME = 'story_test_segments'
dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
csv_dest_file = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '.csv'
create_dir(dest_dir)
def cli_gen_audio(speech_cmd, out_path):
    # Shell out to macOS `say` to synthesize speech_cmd into out_path.
    # NOTE(review): with a list argv (shell=False) the added single quotes
    # are passed literally to `say` — confirm they are intentional.
    subprocess.call(
        ['say', '-o', out_path, "'" + speech_cmd + "'"])
class SpeechDelegate (NSObject):
    """NSSpeechSynthesizer delegate forwarding events to Python callbacks."""
    def speechSynthesizer_willSpeakWord_ofString_(self, sender, word, text):
        '''Delegate callback fired just before each word is spoken.'''
        # print("Speaking word {} in sentence {}".format(word,text))
        self.wordWillSpeak()
    def speechSynthesizer_willSpeakPhoneme_(self, sender, phoneme):
        # `phoneme` is an integer opcode; translate it to its symbol.
        phon_ch = apple_phonemes[phoneme]
        self.phonemeWillSpeak(phon_ch)
    def speechSynthesizer_didFinishSpeaking_(self, synth, didFinishSpeaking):
        # Only signal completion when synthesis actually finished.
        if didFinishSpeaking:
            self.completeCB()
    def setC_W_Ph_(self, completed, word, phoneme):
        # Register the three Python callbacks used by the methods above.
        self.completeCB = completed
        self.wordWillSpeak = word
        self.phonemeWillSpeak = phoneme
# del SpeechDelegate
class Delegate (NSObject):
    """NSApplication delegate: kicks off audio generation once the app runs."""
    def applicationDidFinishLaunching_(self, aNotification):
        '''Called automatically when the application has launched'''
        print("App Launched!")
        # phrases = story_texts()#random.sample(story_texts(), 100) #
        # phrases = test_texts(30)
        phrases = story_words()
        # print(phrases)
        generate_audio(phrases)
class PhonemeTiming(object):
    """Timing record for a single phoneme observed during synthesis."""

    def __init__(self, phon, start):
        super(PhonemeTiming, self).__init__()
        self.phoneme = phon   # Apple phoneme symbol (e.g. 'AE', '%')
        self.start = start    # wall-clock start time in seconds
        self.fraction = 0     # share of the total audible duration
        self.duration = None  # filled in once the next phoneme starts
        self.end = None

    def is_audible(self):
        """Silence ('%') and word gaps ('~') carry no audible sound."""
        return self.phoneme not in ('%', '~')

    def tune(self):
        """Render this phoneme as an Apple TUNE directive fragment."""
        if not self.is_audible():
            return '~'
        dur_ms = int(self.duration * 1000)
        return '{} {{D {}}}'.format(self.phoneme, dur_ms)

    def __repr__(self):
        return '[{}]({:0.4f})'.format(self.phoneme, self.fraction)

    @staticmethod
    def to_tune(phone_ts):
        """Join a phoneme sequence into a complete [[inpt TUNE]] block."""
        body = [ph.tune() for ph in phone_ts]
        return '\n'.join(['[[inpt TUNE]]'] + body + ['[[inpt TEXT]]'])
class SegData(object):
    """Text, audio filename and per-phoneme timing segments for one utterance."""

    def __init__(self, text, filename):
        super(SegData, self).__init__()
        self.text = text
        self.tune = ''        # TUNE directive string, filled in after synthesis
        self.filename = filename
        self.segments = []    # list of PhonemeTiming, in speaking order

    def csv_rows(self):
        """Rows of [text, file, phoneme, next-phoneme, start-ms, end-ms].

        Times are re-based so the first segment starts at 0 and expressed
        in milliseconds. The last segment has no successor and is not
        emitted as a row.
        BUG FIX: guard the empty-segments case, which previously raised
        IndexError on ``self.segments[0]``.
        """
        if not self.segments:
            return []
        result = []
        origin = self.segments[0].start
        for cur, nxt in zip(self.segments, self.segments[1:]):
            # if cur.is_audible():
            result.append([self.text, self.filename, cur.phoneme, nxt.phoneme,
                           (cur.start - origin) * 1000, (cur.end - origin) * 1000])
        return result
class SynthesizerQueue(object):
    """Speaks texts one at a time, recording per-phoneme wall-clock timings.

    Wraps NSSpeechSynthesizer with a SpeechDelegate; while a text is
    spoken, each phoneme's start time is collected, then converted to
    durations and audible-time fractions on completion, and the filled
    SegData is handed to ``didComplete``.
    """
    def __init__(self):
        super(SynthesizerQueue, self).__init__()
        self.synth = NSSpeechSynthesizer.alloc().init()
        self.didComplete = None
        q_delg = SpeechDelegate.alloc().init()
        self.synth.setDelegate_(q_delg)
        def synth_complete():
            # Close out the final phoneme, then derive durations (each
            # phoneme runs until the next one starts) and audible fractions.
            end_time = time()
            for i in range(len(self.phoneme_timing)):
                if i == len(self.phoneme_timing) - 1:
                    self.phoneme_timing[i].duration = end_time - \
                        self.phoneme_timing[i].start
                    self.phoneme_timing[i].end = end_time
                else:
                    self.phoneme_timing[i].duration = self.phoneme_timing[i +
                        1].start - self.phoneme_timing[i].start
                    self.phoneme_timing[i].end = self.phoneme_timing[i + 1].start
            total_time = sum(
                [i.duration for i in self.phoneme_timing if i.is_audible()])
            for ph in self.phoneme_timing:
                if ph.is_audible():
                    ph.fraction = ph.duration / total_time
            if self.didComplete:
                self.data.segments = self.phoneme_timing
                self.data.tune = PhonemeTiming.to_tune(self.phoneme_timing)
                self.didComplete(self.data)
        def will_speak_phoneme(phon):
            # Timestamp the start of each phoneme as the delegate reports it.
            phtm = PhonemeTiming(phon, time())
            self.phoneme_timing.append(phtm)
        def will_speak_word():
            pass
            # coz it comes after the first phoneme of the word is started
            # phtm = PhonemeTiming('~', time())
            # self.phoneme_timing.append(phtm)
        q_delg.setC_W_Ph_(synth_complete, will_speak_word, will_speak_phoneme)
    def queueTask(self, text):
        # Synthesize `text` into a uniquely named .aiff via `say`, then
        # speak it through the instrumented synthesizer to capture timings.
        rand_no = str(random.randint(0, 10000))
        fname = '{}-{}.aiff'.format(text, rand_no)
        sanitized = format_filename(fname)
        dest_file = dest_dir + sanitized
        cli_gen_audio(text, dest_file)
        self.phoneme_timing = []
        self.data = SegData(text, sanitized)
        self.synth.startSpeakingString_(text)
def story_texts():
    """All unique phrase texts across every story, sorted."""
    stories_data = json.load(open('./inputs/all_stories.json'))
    unique_texts = {t for phrases in stories_data.values() for t in phrases}
    return sorted(unique_texts)
def story_words():
    """All unique first words of story entries (hs variant), sorted."""
    stories_data = json.load(open('./inputs/all_stories_hs.json'))
    unique_words = {t[0] for entries in stories_data.values() for t in entries}
    return sorted(unique_words)
def test_texts(count=10):
    """Random sample of ``count`` unique wordlist entries, sorted."""
    with open('./inputs/wordlist.txt', 'r') as fh:
        words = [line.strip('\n_') for line in fh.readlines()]
    return sorted(random.sample(list(set(words)), count))
def generate_audio(phrases):
    """Synthesize every phrase in sequence, writing segment CSV rows.

    Drives SynthesizerQueue through its completion callback: each finished
    utterance writes its rows and triggers the next; when the list is
    exhausted the CSV is closed and the Cocoa application terminated.
    """
    synthQ = SynthesizerQueue()
    f = open(csv_dest_file, 'w')
    s_csv_w = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
    i = 0
    p = tqdm(total=len(phrases))
    def nextTask(seg_data=None):
        # Completion callback; also starts the first task (seg_data None).
        nonlocal i
        if i < len(phrases):
            p.set_postfix(phrase=phrases[i])
            p.update()
            synthQ.queueTask(phrases[i])
            i += 1
        else:
            p.close()
            f.close()
            dg = NSApplication.sharedApplication().delegate
            print('App terminated.')
            NSApp().terminate_(dg)
        if seg_data:
            # Rows from the utterance that just finished.
            s_csv_w.writerows(seg_data.csv_rows())
    synthQ.didComplete = nextTask
    nextTask()
def main():
    """Start the Cocoa run loop; Delegate triggers generation on launch."""
    # Create a new application instance ...
    a = NSApplication.sharedApplication()
    # ... and create its delegate. Note the use of the
    # Objective C constructors below, because Delegate
    # is a subclass of an Objective C class, NSObject
    delegate = Delegate.alloc().init()
    # Tell the application which delegate object to use.
    a.setDelegate_(delegate)
    AppHelper.runEventLoop()
if __name__ == '__main__':
    main()

143
speech_similar.py Normal file
View File

@@ -0,0 +1,143 @@
import pandas as pd
import pronouncing
import re
import numpy as np
import random
# mapping = {
# s.split()[0]: s.split()[1]
# for s in """
# AA AA
# AE AE
# AH UX
# AO AO
# AW AW
# AY AY
# B b
# CH C
# D d
# DH D
# EH EH
# ER UXr
# EY EY
# F f
# G g
# HH h
# IH IH
# IY IY
# JH J
# K k
# L l
# M m
# N n
# NG N
# OW OW
# OY OY
# P p
# R r
# S s
# SH S
# T t
# TH T
# UH UH
# UW UW
# V v
# W w
# Y y
# Z z
# ZH Z
# """.strip().split('\n')
# }
# sim_mat = pd.read_csv('./similarity.csv', header=0, index_col=0)
#
#
# def convert_ph(ph):
# stress_level = re.search("(\w+)([0-9])", ph)
# if stress_level:
# return stress_level.group(2) + mapping[stress_level.group(1)]
# else:
# return mapping[ph]
#
#
# def sim_mat_to_apple_table(smt):
# colnames = [convert_ph(ph) for ph in smt.index.tolist()]
# smt = pd.DataFrame(np.nan_to_num(smt.values))
# fsmt = (smt.T + smt)
# np.fill_diagonal(fsmt.values, 100.0)
# asmt = pd.DataFrame.copy(fsmt)
# asmt.columns = colnames
# asmt.index = colnames
# apple_sim_table = asmt.stack().reset_index()
# apple_sim_table.columns = ['q', 'r', 's']
# return apple_sim_table
#
#
# apple_sim_table = sim_mat_to_apple_table(sim_mat)
#
#
# def top_match(ph):
# selected = apple_sim_table[(apple_sim_table.q == ph)
# & (apple_sim_table.s < 100) &
# (apple_sim_table.s >= 70)]
# tm = ph
# if len(selected) > 0:
# tm = pd.DataFrame.sort_values(selected, 's', ascending=False).iloc[0].r
# return tm
# The Apple speech synthesizer's phoneme symbol inventory; used to parse
# phoneme strings produced by NSSpeechSynthesizer.phonemesFromText_.
apple_phonemes = [
    '%', '@', 'AE', 'EY', 'AO', 'AX', 'IY', 'EH', 'IH', 'AY', 'IX', 'AA', 'UW',
    'UH', 'UX', 'OW', 'AW', 'OY', 'b', 'C', 'd', 'D', 'f', 'g', 'h', 'J', 'k',
    'l', 'm', 'n', 'N', 'p', 'r', 's', 'S', 't', 'T', 'v', 'w', 'y', 'z', 'Z'
]
class ApplePhoneme(object):
    """One parsed Apple phoneme symbol with an optional stress marker."""

    def __init__(self, phone, stress, vowel=False):
        super(ApplePhoneme, self).__init__()
        self.phone = phone    # phoneme symbol, e.g. 'AE' or 'b'
        self.stress = stress  # 0-2 stress level; -1 for separators
        self.vowel = vowel    # only vowels carry a printable stress digit

    def __str__(self):
        prefix = str(self.stress) if (self.vowel and self.stress > 0) else ''
        return prefix + self.phone

    def __repr__(self):
        return "'{}'".format(str(self))

    def adjust_stress(self):
        """Randomly move the stress to a different level in 0..2."""
        options = [lvl for lvl in range(3) if lvl != self.stress]
        self.stress = random.choice(options)
def parse_apple_phonemes(ph_str):
    """Recursively split a phoneme string into ApplePhoneme objects.

    Takes the shortest leading prefix that is either a known phoneme, a
    digit-prefixed phoneme (stress marker on a vowel), or a
    non-alphanumeric separator, then recurses on the remainder.
    Returns [] when nothing matches (including the empty string).
    """
    for split_at in range(1, len(ph_str) + 1):
        head, tail = ph_str[:split_at], ph_str[split_at:]
        if head in apple_phonemes:
            is_vowel = head[0] in 'AEIOU'
            return [ApplePhoneme(head, 0, is_vowel)] + parse_apple_phonemes(tail)
        if head[0].isdigit() and head[1:] in apple_phonemes:
            return [ApplePhoneme(head[1:], int(head[0]), True)] + parse_apple_phonemes(tail)
        if not head.isalnum():
            return [ApplePhoneme(head, -1, False)] + parse_apple_phonemes(tail)
    return []
def segmentable_phoneme(ph_str):
    """Parsed phonemes with separators (stress == -1) filtered out."""
    parsed = parse_apple_phonemes(ph_str)
    return [ph for ph in parsed if ph.stress >= 0]
def similar_phoneme_word(ph_str):
    """Return ``ph_str`` with the stress of one random vowel changed."""
    parsed = parse_apple_phonemes(ph_str)
    vowels = [p for p in parsed if p.vowel]
    random.choice(vowels).adjust_stress()
    return ''.join(str(p) for p in parsed)
def similar_phoneme_phrase(ph_str):
    """Apply similar_phoneme_word to every whitespace-separated word."""
    return ' '.join(similar_phoneme_word(word) for word in ph_str.split())
def similar_word(word_str):
    """A random rhyme of ``word_str``; the word itself if none exist."""
    rhymes = pronouncing.rhymes(word_str)
    return random.choice(rhymes) if rhymes else word_str
def similar_phrase(ph_str):
    """Replace every word of the phrase with a random rhyme."""
    return ' '.join(similar_word(word) for word in ph_str.split())

161
speech_spectrum.py Normal file
View File

@@ -0,0 +1,161 @@
#!/usr/bin/env python
""" This work is licensed under a Creative Commons Attribution 3.0 Unported
License.
Frank Zalkow, 2012-2013
http://www.frank-zalkow.de/en/code-snippets/create-audio-spectrograms-with-python.html?i=1
"""
# %matplotlib inline
import numpy as np
import pyaudio
from matplotlib import pyplot as plt
from pysndfile import sndio as snd
from numpy.lib import stride_tricks
""" short time fourier transform of audio signal """
# STFT analysis window length (ms) and fractional overlap between windows.
STFT_WINDOWS_MSEC = 20
STFT_WINDOW_OVERLAP = 1.0 / 3
def stft(sig, frameSize, overlapFac=0.5, window=np.hanning):
    """Short-time Fourier transform of ``sig``.

    sig: 1-D array of signal samples.
    frameSize: analysis window length in samples.
    overlapFac: fractional overlap between consecutive windows.
    window: callable producing the window coefficients for a frame.
    Returns a complex array of shape (n_frames, frameSize // 2 + 1).
    """
    win = window(frameSize)
    hop = int(frameSize - np.floor(overlapFac * frameSize))
    # Pad half a frame of zeros in front so frame 0 is centered on sample 0.
    half = int(np.floor(frameSize / 2.0))
    padded = np.append(np.zeros(half), sig)
    n_frames = int(np.ceil((len(padded) - frameSize) / float(hop)) + 1)
    # Pad the tail so the last frame is fully covered.
    padded = np.append(padded, np.zeros(frameSize))
    stride = padded.strides[0]
    frames = stride_tricks.as_strided(
        padded,
        shape=(n_frames, frameSize),
        strides=(stride * hop, stride)).copy()
    frames *= win
    return np.fft.rfft(frames)
""" scale frequency axis logarithmically """
def logscale_spec(spec, sr=44100, factor=20.):
timebins, freqbins = np.shape(spec)
scale = np.linspace(0, 1, freqbins)**factor
scale *= (freqbins - 1) / max(scale)
scale = np.unique(np.round(scale)).astype(np.uint32)
# create spectrogram with new freq bins
newspec = np.complex128(np.zeros([timebins, len(scale)]))
for i in range(0, len(scale)):
if i == len(scale) - 1:
newspec[:, i] = np.sum(spec[:, scale[i]:], axis=1)
else:
newspec[:, i] = np.sum(spec[:, scale[i]:scale[i + 1]], axis=1)
# list center freq of bins
allfreqs = np.abs(np.fft.fftfreq(freqbins * 2, 1. / sr)[:freqbins + 1])
freqs = []
for i in range(0, len(scale)):
if i == len(scale) - 1:
freqs += [np.mean(allfreqs[scale[i]:])]
else:
freqs += [np.mean(allfreqs[scale[i]:scale[i + 1]])]
return newspec, freqs
""" generate spectrogram for aiff audio with 150ms windows and 50ms overlap"""
def generate_spec_frec(samples, samplerate):
# samplerate, samples = wav.read(audiopath)
# s = stft(samples, binsize)
s = stft(samples, samplerate * STFT_WINDOWS_MSEC // 1000,
STFT_WINDOW_OVERLAP)
sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
# add epison so that log10 doesn't break
sshow_abs = np.abs(sshow + np.finfo(sshow.dtype).eps)
ims = 20. * np.log10(sshow_abs / 10e-6)
ims[ims < 0] = 0 #np.finfo(sshow.dtype).eps
return ims, freq
def generate_sample_spectrogram(samples):
    """Spectrogram for a buffer captured at the fixed 22050 Hz sample rate."""
    spectrogram, _freqs = generate_spec_frec(samples, 22050)
    return spectrogram
def generate_aiff_spectrogram(audiopath):
    """Read an AIFF file and return its dB spectrogram."""
    samples, samplerate, _fmt = snd.read(audiopath)
    spectrogram, _freqs = generate_spec_frec(samples, samplerate)
    return spectrogram
def plot_stft(samples,
              samplerate,
              binsize=2**10,
              plotpath=None,
              colormap="jet"):
    """Render the spectrogram of *samples*; save to *plotpath* if given, else show."""
    ims, freq = generate_spec_frec(samples, samplerate)
    timebins, freqbins = np.shape(ims)
    plt.figure(figsize=(15, 7.5))
    plt.imshow(
        np.transpose(ims),
        origin="lower",
        aspect="auto",
        cmap=colormap,
        interpolation="none")
    plt.colorbar()
    plt.xlabel("time (s)")
    plt.ylabel("frequency (hz)")
    plt.xlim([0, timebins - 1])
    plt.ylim([0, freqbins])
    # X ticks: frame index converted back to seconds.
    xlocs = np.float32(np.linspace(0, timebins - 1, 5))
    xlabels = [
        "%.02f" % l
        for l in (
            (xlocs * len(samples) / timebins) + (0.5 * binsize)) / samplerate
    ]
    plt.xticks(xlocs, xlabels)
    # Y ticks: bin index mapped to its centre frequency.
    ylocs = np.int16(np.round(np.linspace(0, freqbins - 1, 10)))
    plt.yticks(ylocs, ["%.02f" % freq[i] for i in ylocs])
    if plotpath:
        plt.savefig(plotpath, bbox_inches="tight")
    else:
        plt.show()
    plt.clf()
def plot_aiff_stft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):
    """Plot the spectrogram of an AIFF file.

    Bug fix: binsize/plotpath/colormap were previously accepted but silently
    ignored; they are now forwarded to plot_stft.
    """
    samples, samplerate, _ = snd.read(audiopath)
    plot_stft(samples, samplerate, binsize=binsize, plotpath=plotpath,
              colormap=colormap)
def play_sunflower():
    """Play a fixed demo AIFF on the default audio output, then plot it."""
    sample_r = snd.get_info(
        './outputs/audio/sunflowers-Alex-150-normal-589.aiff')[0]
    data_f64 = snd.read(
        './outputs/audio/sunflowers-Alex-150-normal-589.aiff')[0]
    data_f32 = data_f64.astype(np.float32)
    print(data_f32.shape)
    player = pyaudio.PyAudio()
    out_stream = player.open(
        format=pyaudio.paFloat32, channels=1, rate=sample_r, output=True)
    out_stream.write(data_f32.tobytes())
    out_stream.close()
    player.terminate()
    plot_stft(data_f32, sample_r)
# Manual demo entry point: renders spectrograms for two sample recordings.
if __name__ == '__main__':
    # play_sunflower()
    plot_aiff_stft(
        './outputs/story_words/Agnes/150/chicken-Agnes-150-low-1077.aiff')
    plot_aiff_stft(
        './outputs/story_words/Agnes/150/chicken-Agnes-150-medium-1762.aiff')
    # spec = generate_aiff_spectrogram('./outputs/story_words/Agnes/150/chicken-Agnes-150-low-1077.aiff')
    # print(spec.shape)
    # plot_aiff_stft('./outputs/sunflowers-Alex-180-normal-4763.aiff')
    # plot_aiff_stft('./outputs/sunflowers-Victoria-180-normal-870.aiff')
    # plot_aiff_stft('./outputs/sunflowers-Fred-180-phoneme-9733.aiff')
    # plot_aiff_stft('./outputs/sunflowers-Fred-180-normal-6515.aiff')

202
speech_test.py Normal file
View File

@@ -0,0 +1,202 @@
from speech_model import load_model_arch
from speech_tools import record_spectrogram, file_player, padd_zeros, pair_for_word
from speech_data import record_generator_count
# from importlib import reload
# import speech_data
# reload(speech_data)
import numpy as np
import pandas as pd
import os
import pickle
import tensorflow as tf
import csv
from tqdm import tqdm
from speech_data import padd_zeros
import seaborn as sns
def predict_recording_with(m, sample_size=15):
    """Record two 1.4 s microphone samples and score the pair with model *m*."""
    # NOTE(review): create_test_pair is not defined or imported in this
    # module — confirm where it is expected to come from.
    first = record_spectrogram(n_sec=1.4)
    second = record_spectrogram(n_sec=1.4)
    pair = create_test_pair(first, second, sample_size)
    return m.predict([pair[:, 0], pair[:, 1]])
def predict_tts_sample(sample_word='able', audio_group='story_words',
                       weights='siamese_speech_model-153-epoch-0.55-acc.h5'):
    """Classify the good/test TTS pair for *sample_word* as 'same' or 'diff'.

    Loads the trained architecture and weights for *audio_group*, pads both
    spectrograms to the trained input length and runs the siamese model.
    """
    const_file = './models/' + audio_group + '/constants.pkl'
    arch_file = './models/' + audio_group + '/siamese_speech_model_arch.yaml'
    weight_file = './models/' + audio_group + '/' + weights
    # Fix: close the constants file instead of leaking the handle.
    with open(const_file, 'rb') as f:
        (sample_size, n_features, n_records) = pickle.load(f)
    model = load_model_arch(arch_file)
    model.load_weights(weight_file)
    spec1, spec2 = pair_for_word(sample_word)
    p_spec1 = padd_zeros(spec1, sample_size)
    p_spec2 = padd_zeros(spec2, sample_size)
    inp = np.array([[p_spec1, p_spec2]])
    result = model.predict([inp[:, 0], inp[:, 1]])[0]
    # Index 1 is the "same" score; higher than index 0 means a match.
    return 'same' if result[0] < result[1] else 'diff'
def test_with(audio_group):
    # NOTE(review): this looks like stale REPL code — `speech_data` is used
    # here as a callable although this module imports names *from* it, and
    # `model` is not defined in this scope. Confirm intent before using.
    X,Y = speech_data(audio_group)
    print(np.argmax(model.predict([X[:, 0], X[:, 1]]),axis=1))
    print(Y.astype(np.int8))
def evaluate_siamese(records_file, audio_group='audio',
                     weights='siamese_speech_model-final.h5'):
    """Evaluate the siamese model on a TFRecord file of spectrogram pairs.

    Prints success/failure counters split by same/different variant and
    writes per-pair results to './outputs/<audio_group>.results.csv'.

    Fixes: the constants pickle file is now closed (was leaked), and the
    accuracy printing no longer raises ZeroDivisionError when a bucket
    (same/diff) is empty.
    """
    const_file = os.path.join('./models/' + audio_group + '/', 'constants.pkl')
    arch_file = './models/' + audio_group + '/siamese_speech_model_arch.yaml'
    weight_file = './models/' + audio_group + '/' + weights
    with open(const_file, 'rb') as f:
        (n_spec, n_features, n_records) = pickle.load(f)
    print('evaluating {}...'.format(records_file))
    model = load_model_arch(arch_file)
    model.load_weights(weight_file)
    record_iterator, records_count = record_generator_count(records_file)
    total, same_success, diff_success, skipped, same_failed, diff_failed = 0, 0, 0, 0, 0, 0
    all_results = []
    for (i, string_record) in tqdm(enumerate(record_iterator), total=records_count):
        total += 1
        example = tf.train.Example()
        example.ParseFromString(string_record)
        feature = example.features.feature
        spec_n1 = feature['spec_n1'].int64_list.value[0]
        spec_n2 = feature['spec_n2'].int64_list.value[0]
        # Records longer than the trained input length cannot be padded; skip.
        if n_spec < spec_n1 or n_spec < spec_n2:
            skipped += 1
            continue
        spec_w1 = feature['spec_w1'].int64_list.value[0]
        spec_w2 = feature['spec_w2'].int64_list.value[0]
        spec1 = np.array(feature['spec1'].float_list.value).reshape(spec_n1, spec_w1)
        spec2 = np.array(feature['spec2'].float_list.value).reshape(spec_n2, spec_w2)
        phoneme1 = feature['phoneme1'].bytes_list.value[0].decode()
        phoneme2 = feature['phoneme2'].bytes_list.value[0].decode()
        voice1 = feature['voice1'].bytes_list.value[0].decode()
        voice2 = feature['voice2'].bytes_list.value[0].decode()
        rate1 = feature['rate1'].int64_list.value[0]
        rate2 = feature['rate2'].int64_list.value[0]
        variant1 = feature['variant1'].bytes_list.value[0].decode()
        variant2 = feature['variant2'].bytes_list.value[0].decode()
        file1 = feature['file1'].bytes_list.value[0].decode()
        file2 = feature['file2'].bytes_list.value[0].decode()
        p_spec1, p_spec2 = padd_zeros(spec1, n_spec), padd_zeros(spec2, n_spec)
        input_arr = np.asarray([[p_spec1, p_spec2]])
        output_arr = np.asarray([feature['output'].int64_list.value])
        y_pred = model.predict([input_arr[:, 0], input_arr[:, 1]])
        # Threshold the sigmoid outputs and compare against the label vector.
        predicted = np.asarray(y_pred[0] > 0.5).astype(output_arr.dtype)
        expected = output_arr[0]
        status = np.all(predicted == expected)
        all_results.append({
            "phoneme1": phoneme1, "phoneme2": phoneme2, "voice1": voice1,
            "voice2": voice2, "rate1": rate1, "rate2": rate2,
            "variant1": variant1, "variant2": variant2, "file1": file1,
            "file2": file2, "expected": expected[0],
            "predicted": y_pred[0][0], "success": status})
        if status:
            if variant1 == variant2:
                same_success += 1
            else:
                diff_success += 1
        else:
            if variant1 == variant2:
                same_failed += 1
            else:
                diff_failed += 1
    print('total-{},same_success-{},diff_success-{},skipped-{},same_failed-{},diff_failed-{}'.format(
        total, same_success, diff_success, skipped, same_failed, diff_failed))
    success = same_success + diff_success
    failure = same_failed + diff_failed

    def _acc(ok, bad):
        # Percentage accuracy; NaN rather than ZeroDivisionError on empty buckets.
        return ok * 100 / (ok + bad) if (ok + bad) else float('nan')

    print('accuracy-{:.3f}'.format(_acc(success, failure)))
    print('same_accuracy-{:.3f}'.format(_acc(same_success, same_failed)))
    print('diff_accuracy-{:.3f}'.format(_acc(diff_success, diff_failed)))
    result_data = pd.DataFrame(all_results, columns=[
        "phoneme1", "phoneme2", "voice1", "voice2", "rate1", "rate2",
        "variant1", "variant2", "file1", "file2",
        "expected", "predicted", "success"])
    result_data.to_csv('./outputs/' + audio_group + '.results.csv')
def inspect_tfrecord(records_file, audio_group='audio'):
    """Dump metadata for every pair in a TFRecord file to a review CSV."""
    record_iterator, records_count = record_generator_count(records_file)
    rows = []
    for (i, string_record) in tqdm(enumerate(record_iterator), total=records_count):
        example = tf.train.Example()
        example.ParseFromString(string_record)
        feature = example.features.feature

        def text(name):
            # First UTF-8 string stored under *name*.
            return feature[name].bytes_list.value[0].decode()

        def number(name):
            # First integer stored under *name*.
            return feature[name].int64_list.value[0]

        expected = np.asarray([feature['output'].int64_list.value])[0]
        rows.append({
            "phoneme1": text('phoneme1'), "phoneme2": text('phoneme2'),
            "voice1": text('voice1'), "voice2": text('voice2'),
            "rate1": number('rate1'), "rate2": number('rate2'),
            "spec_n1": number('spec_n1'), "spec_n2": number('spec_n2'),
            "variant1": text('variant1'), "variant2": text('variant2'),
            "file1": text('file1'), "file2": text('file2'),
            "expected": expected[0]})
    result_data = pd.DataFrame(rows, columns=[
        "phoneme1", "phoneme2", "voice1", "voice2", "rate1", "rate2",
        "spec_n1", "spec_n2", "variant1", "variant2", "file1", "file2",
        "expected"])
    result_data.to_csv('./outputs/' + audio_group + '.pairs.csv')
def play_results(audio_group='audio'):
    """Interactively replay the audio pairs listed in a results CSV.

    For each row: prints its metadata, plays both files, then prompts —
    'r' replays the pair, 'q' quits, Enter advances to the next row.

    Fix: the loop flag was named `quit`, shadowing the builtin; renamed.
    """
    result_data = pd.read_csv('./outputs/' + audio_group + '.results.csv')
    play_file, close_player = file_player()
    should_quit = False
    for (i, r) in result_data.iterrows():
        if should_quit:
            break
        keys = ["phoneme1", "phoneme2", "voice1", "voice2", "rate1", "rate2",
                "variant1", "variant2"]
        row_vals = [str(r[k]) for k in keys]
        h_str = '\t'.join(keys)
        row_str = '\t'.join(row_vals)
        while True:
            print(h_str)
            print(row_str)
            play_file('./outputs/' + audio_group + '/' + r['file1'], True)
            play_file('./outputs/' + audio_group + '/' + r['file2'], True)
            a = input("press 'r/q/[Enter]' to replay/quit/continue:\t")
            if a == 'r':
                continue
            if a == 'q':
                should_quit = True
                break
            else:
                break
    close_player()
def visualize_results(audio_group='audio'):
    """Load the pairs/results CSVs for *audio_group* for exploratory analysis.

    Bug fix: the parameter was previously clobbered by a hard-coded
    `audio_group = 'story_phrases'` left over from interactive debugging,
    so the argument had no effect. The bare expressions below are REPL-style
    inspection residue; they are kept but have no effect when run as a script.
    """
    # %matplotlib inline
    source = pd.read_csv('./outputs/'+audio_group+'.pairs.csv',index_col=0)
    source.groupby(['voice1','voice2']).size()
    result = pd.read_csv('./outputs/' + audio_group + '.results.csv',index_col=0)
    # result.groupby('success').size().plot(kind='bar')
    result.describe(include=['object'])
    failed = result[result['success'] == False]
    same_failed = failed[failed['variant1'] == failed['variant2']]
    diff_failed = failed[failed['variant1'] != failed['variant2']]
    result.groupby(['voice1','voice2']).size()
# Script entry: run the siamese evaluation against the held-out test set.
if __name__ == '__main__':
    # evaluate_siamese('./outputs/story_words_test.train.tfrecords',audio_group='story_words.gpu',weights ='siamese_speech_model-58-epoch-0.00-acc.h5')
    # evaluate_siamese('./outputs/story_words.test.tfrecords',audio_group='story_words',weights ='siamese_speech_model-675-epoch-0.00-acc.h5')
    evaluate_siamese('./outputs/story_words.test.tfrecords',audio_group='story_words',weights ='siamese_speech_model-153-epoch-0.55-acc.h5')
    # play_results('story_words')
    #inspect_tfrecord('./outputs/story_phrases.test.tfrecords',audio_group='story_phrases')
    # visualize_results('story_words.gpu')
    # test_with('rand_edu')
    # sunflower_data,sunflower_result = get_word_pairs_data('sweater',15)
    # print(np.argmax(model.predict([sunflower_data[:, 0], sunflower_data[:, 1]]),axis=1))
    # print(sunflower_result)

50
speech_testgen.py Normal file
View File

@@ -0,0 +1,50 @@
import voicerss_tts
import json
from speech_tools import format_filename
def generate_voice(phrase):
    """Synthesize *phrase* via the VoiceRSS API; return MP3 bytes or None.

    Bug fix: voicerss_tts.__request populates its result dict under BYTES
    keys (b'error' / b'response') although it initialises str keys, so the
    old `voice['error']` check always saw None. Both key spellings are now
    checked so this works with either behaviour.
    """
    # SECURITY NOTE(review): the API key is hard-coded in source; it should
    # be moved to an environment variable or local config.
    voice = voicerss_tts.speech({
        'key': '0ae89d82aa78460691c99a4ac8c0f9ec',
        'hl': 'en-us',
        'src': phrase,
        'r': '0',
        'c': 'mp3',
        'f': '22khz_16bit_mono',
        'ssml': 'false',
        'b64': 'false'
    })
    error = voice.get(b'error') or voice.get('error')
    if not error:
        return voice.get(b'response') or voice.get('response')
    return None
def generate_test_audio_for_stories():
    """Generate VoiceRSS MP3s for the first 10 distinct story texts.

    Fix: both the story JSON and each output MP3 are now opened with
    context managers so the handles are always closed.
    """
    story_file = './inputs/all_stories_hs.json'
    # story_file = './inputs/all_stories.json'
    with open(story_file) as f:
        stories_data = json.load(f)
    text_list_dup = [t[0] for i in stories_data.values() for t in i]
    # De-duplicate and keep a small, deterministic sample of 10 texts.
    text_list = sorted(set(text_list_dup))[:10]
    for t in text_list:
        v = generate_voice(t)
        if v:
            f_name = format_filename(t)
            with open('inputs/voicerss/' + f_name + '.mp3', 'wb') as tf:
                tf.write(v)
# def generate_test_audio_for(records_file,audio_group='audio'):
# # audio_group='audio';model_file = 'siamese_speech_model-305-epoch-0.20-acc.h5'
# # records_file = os.path.join('./outputs',eval_group+'.train.tfrecords')
# const_file = os.path.join('./models/'+audio_group+'/','constants.pkl')
# (n_spec,n_features,n_records) = pickle.load(open(const_file,'rb'))
# print('evaluating {}...'.format(records_file))
# record_iterator,records_count = record_generator_count(records_file)
# all_results = []
# for (i,string_record) in tqdm(enumerate(record_iterator),total=records_count):
# total+=1
# example = tf.train.Example()
# example.ParseFromString(string_record)
# word = example.features.feature['word'].bytes_list.value[0].decode()
# audio = generate_voice('hello world')
# audio

204
speech_tools.py Normal file
View File

@@ -0,0 +1,204 @@
import os
import math
import string
import threading
import itertools
import random
import multiprocessing
import subprocess
import pandas as pd
import numpy as np
import pyaudio
from pysndfile import sndio as snd
# from matplotlib import pyplot as plt
from speech_spectrum import plot_stft, generate_spec_frec,generate_aiff_spectrogram
# Audio constants shared by the recording/playback helpers below.
SAMPLE_RATE = 22050  # Hz; matches the TTS output rate used elsewhere
N_CHANNELS = 2
# Shared sink for silencing subprocess output (e.g. ffmpeg in
# transribe_audio_text). Opened once at import time and never closed.
devnull = open(os.devnull, 'w')
def step_count(n_records, batch_size):
    """Number of batches needed to cover *n_records* (ceiling division)."""
    return int(math.ceil(n_records / float(batch_size)))
def file_player():
    """Return (play_file, close_player) closures over one PyAudio instance."""
    backend = pyaudio.PyAudio()

    def play_file(audiopath, plot=False):
        # Play *audiopath* on the default output; optionally plot its STFT.
        print('playing', audiopath)
        samples, samplerate, form = snd.read(audiopath)
        out_stream = backend.open(
            format=pyaudio.paFloat32,
            channels=2,
            rate=samplerate,
            output=True)
        # Duplicate the mono signal into an interleaved two-channel buffer.
        interleaved = np.asarray([samples, samples]).T.reshape(-1)
        out_stream.write(interleaved.astype(np.float32).tobytes())
        out_stream.close()
        if plot:
            plot_stft(samples, SAMPLE_RATE)

    def close_player():
        backend.terminate()

    return play_file, close_player
def reservoir_sample(iterable, k):
    """Uniformly sample up to *k* items from *iterable* in a single pass.

    Classic reservoir sampling: fill a buffer with the first k items,
    shuffle it, then replace entries with gradually decreasing probability.
    """
    it = iter(iterable)
    if not (k > 0):
        raise ValueError("sample size must be positive")
    reservoir = list(itertools.islice(it, k))
    # Shuffle so a short input (fewer than k items) still comes back in
    # random order.
    random.shuffle(reservoir)
    for seen, item in enumerate(it, start=k + 1):
        slot = random.randrange(seen)  # uniform in [0, seen)
        if slot < k:
            reservoir[slot] = item
    return reservoir
def padd_zeros(spgr, max_samples):
    """Pad spectrogram *spgr* with trailing zero rows up to *max_samples* rows."""
    missing = max_samples - spgr.shape[0]
    return np.pad(spgr, ((0, missing), (0, 0)), 'constant')
def read_seg_file(aiff_name):
    """Load the phoneme-alignment CSV that sits next to *aiff_name*.

    Keeps only 'PhonAlign' rows for real phonemes (drops '#' silences).
    """
    base_name = aiff_name.rsplit('.aiff', 1)[0]
    seg_data = pd.read_csv(base_name + '-palign.csv',
                           names=['action', 'start', 'end', 'phoneme'])
    keep = (seg_data['action'] == 'PhonAlign') & (seg_data['phoneme'] != '#')
    return seg_data[keep]
def record_spectrogram(n_sec, plot=False, playback=False):
    """Record *n_sec* seconds from the default microphone and return its
    dB spectrogram (via generate_spec_frec).

    plot: also render the recording with plot_stft.
    playback: replay the captured audio on the default output.
    """
    # show_record_prompt()
    N_SEC = n_sec
    # Single blocking read of the whole clip; divided by N_CHANNELS because
    # the stream interleaves both channels into one buffer.
    CHUNKSIZE = int(SAMPLE_RATE * N_SEC / N_CHANNELS)  # fixed chunk size
    input('Press [Enter] to start recording sample... ')
    p_inp = pyaudio.PyAudio()
    stream = p_inp.open(
        format=pyaudio.paFloat32,
        channels=N_CHANNELS,
        rate=SAMPLE_RATE,
        input=True,
        frames_per_buffer=CHUNKSIZE)
    data = stream.read(CHUNKSIZE)
    numpydata = np.frombuffer(data, dtype=np.float32)
    # Fold the two channels into one magnitude track, then mirror it into a
    # +/- pair so the playback branch below has a stereo-shaped buffer.
    multi_channel = np.abs(np.reshape(numpydata, (-1, 2))).mean(axis=1)
    one_channel = np.asarray([multi_channel, -1 * multi_channel]).T.reshape(-1)
    mean_channel_data = one_channel.tobytes()
    stream.stop_stream()
    stream.close()
    p_inp.terminate()
    if plot:
        plot_stft(one_channel, SAMPLE_RATE)
    if playback:
        # Replay the processed buffer through a fresh output stream.
        p_oup = pyaudio.PyAudio()
        stream = p_oup.open(
            format=pyaudio.paFloat32,
            channels=2,
            rate=SAMPLE_RATE,
            output=True)
        stream.write(mean_channel_data)
        stream.close()
        p_oup.terminate()
    ims, _ = generate_spec_frec(one_channel, SAMPLE_RATE)
    return ims
def pair_for_word(phrase='able'):
    """Spectrograms of the reference ('good') and candidate ('test') AIFFs."""
    reference = generate_aiff_spectrogram('./inputs/pairs/good/' + phrase + '.aiff')
    candidate = generate_aiff_spectrogram('./inputs/pairs/test/' + phrase + '.aiff')
    return reference, candidate
def transribe_audio_text(aiff_name, phrase):
    """Convert *aiff_name* to WAV via ffmpeg and write *phrase* to a sibling
    .txt transcript file.

    NOTE(review): the (misspelled) public name is kept so existing callers
    continue to work.
    Fix: the transcript file is now closed via a context manager.
    """
    base_name = aiff_name.rsplit('.aiff', 1)[0]
    wav_name = base_name + '.wav'
    txt_name = base_name + '.txt'
    params = ['ffmpeg', '-y', '-i', aiff_name, wav_name]
    subprocess.call(params, stdout=devnull, stderr=devnull)
    with open(txt_name, 'w') as trcr_f:
        trcr_f.write(phrase)
def _apply_df(args):
    """Pool worker: unpack (df, func, order, kwargs) and return (order, df.apply(func))."""
    frame, func, order, kwargs = args
    return order, frame.apply(func, **kwargs)
def apply_by_multiprocessing(df, func, **kwargs):
    """df.apply(func, **kwargs) parallelised across worker processes.

    The optional 'workers' kwarg (default: CPU count) sets the pool size;
    the frame is split into that many chunks and the partial results are
    re-joined in their original order.
    """
    workers = kwargs.pop('workers', multiprocessing.cpu_count())
    pool = multiprocessing.Pool(processes=workers)
    chunks = [(chunk, func, order, kwargs)
              for order, chunk in enumerate(np.array_split(df, workers))]
    pieces = pool.map(_apply_df, chunks)
    pool.close()
    pieces.sort(key=lambda item: item[0])
    return pd.concat([part for _, part in pieces])
def square(x):
    """Return x squared.

    Bug fix: previously computed x ** x (x to the power of itself), which is
    not a square — e.g. square(3) returned 27 instead of 9. Only referenced
    by the commented-out demo below.
    """
    return x ** 2
# if __name__ == '__main__':
# df = pd.DataFrame({'a':range(10), 'b':range(10)})
# apply_by_multiprocessing(df, square, axis=1, workers=4)
def hms_string(sec_elapsed):
    """Format elapsed seconds as 'H:MM:SS.ss'."""
    hours, rem = divmod(sec_elapsed, 60 * 60)
    minutes = int(rem // 60)
    seconds = sec_elapsed % 60.
    return "{}:{:>02}:{:>05.2f}".format(int(hours), minutes, seconds)
def rm_rf(d):
    """Recursively delete directory *d* and everything inside it."""
    for entry in os.listdir(d):
        path = os.path.join(d, entry)
        if os.path.isdir(path):
            rm_rf(path)
        else:
            os.unlink(path)
    os.rmdir(d)
def create_dir(direc):
    """Ensure *direc* exists and is empty: wipe it if present, then recreate."""
    if os.path.exists(direc):
        rm_rf(direc)
        create_dir(direc)
    else:
        os.makedirs(direc)
def format_filename(s):
    """Build a safe filename from string *s*.

    Whitelist approach: keeps letters, digits and '-_.() '; drops every
    other character, then replaces spaces with underscores.
    Note: may still produce degenerate names such as '', '.' or '..', so
    callers should prepend a prefix (e.g. a timestamp) and append an
    extension.
    """
    allowed = set("-_.() " + string.ascii_letters + string.digits)
    cleaned = ''.join(ch for ch in s if ch in allowed)
    return cleaned.replace(' ', '_')
#################### Now make the data generator threadsafe ####################
class threadsafe_iter:
    """Wrap an iterator/generator so concurrent next() calls are serialised
    by a lock (one thread advances the underlying iterator at a time)."""

    def __init__(self, it):
        self.it = it
        self.lock = threading.Lock()

    def __iter__(self):
        return self

    def __next__(self):
        # Python 3 iterator protocol.
        with self.lock:
            return next(self.it)

    def next(self):
        # Python 2 iterator protocol, kept for backward compatibility.
        with self.lock:
            return self.it.next()
def threadsafe_generator(f):
    """Decorator: make the generator produced by *f* safe for concurrent iteration."""
    def wrapped(*args, **kwargs):
        return threadsafe_iter(f(*args, **kwargs))
    return wrapped

View File

@@ -1,135 +0,0 @@
import objc
from AppKit import NSSpeechSynthesizer,NSSpeechInputModeProperty,NSSpeechModePhoneme
from Foundation import NSURL,NSError,NSObject
import json
import random
import os
import re
import subprocess
OUTPUT_NAME = 'audio'
dest_dir = os.path.abspath('.')+'/outputs/'+OUTPUT_NAME+'/'
dest_file = './outputs/'+OUTPUT_NAME+'.csv'
def create_dir(direc):
    """Create *direc* if it does not already exist (single level only)."""
    if os.path.exists(direc):
        return
    os.mkdir(direc)
# Helpers for naming/locating generated AIFF files:
#   dest_filename: '<a>-<b>-<c>-<d>-<random>.aiff' (callers pass word,
#     phoneme, voice name, rate — presumably in that order; confirm at call
#     sites before relying on the parameter names here)
#   dest_path: nests a filename under '<voice>/<rate>/' inside dest_dir
#   dest_url: wraps an absolute path in an NSURL for the Cocoa speech APIs
dest_filename = lambda n,v,r,t: '{}-{}-{}-{}-'.format(n,v,r,t)+str(random.randint(0,10000))+'.aiff'
dest_path = lambda v,r,n: dest_dir+v+'/'+r+'/'+n
dest_url = lambda p: NSURL.fileURLWithPath_(p)
def cli_gen_audio(speech_cmd, rate, voice, out_path):
    """Render *speech_cmd* to an AIFF at *out_path* via the macOS `say` CLI."""
    args = ['say', '-v', voice, '-r', str(rate), '-o', out_path, speech_cmd]
    subprocess.call(args)
class SynthFile(object):
    """Metadata record for one synthesized audio file."""
    def __init__(self, word, phon, filename, voice, rate, operation):
        super(SynthFile, self).__init__()
        self.word = word
        self.phoneme = phon
        self.filename = filename
        self.voice = voice
        self.rate = rate
        # Stored as 'variant'; the JSON export exposes it as 'operation'.
        self.variant = operation
    def get_json(self):
        """Dict for JSON export.

        Bug fix: previously read self.operation, which __init__ never sets
        (the value is stored as self.variant), raising AttributeError.
        """
        return {'filename': self.filename, 'voice': self.voice,
                'rate': self.rate, 'operation': self.variant}
    def get_csv(self):
        """CSV row: word,phoneme,voice,rate,variant,filename.

        Bug fix: the old format string had five placeholders for six
        values, so the filename column was silently dropped.
        """
        return '{},{},{},{},{},{}\n'.format(
            self.word, self.phoneme, self.voice, self.rate,
            self.variant, self.filename)
class SynthVariant(object):
    """One (voice identifier, speech rate) synthesizer configuration."""
    def __init__(self, identifier, rate):
        super(SynthVariant, self).__init__()
        self.synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
        self.synth.setVolume_(100)
        self.synth.setRate_(rate)
        # Second synthesizer configured for raw phoneme input.
        self.phone_synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
        self.phone_synth.setVolume_(100)
        self.phone_synth.setRate_(rate)
        self.phone_synth.setObject_forProperty_error_(NSSpeechModePhoneme,NSSpeechInputModeProperty,None)
        self.identifier = identifier
        self.rate = rate
        # Short voice name, e.g. 'Alex' from 'com.apple...voice.Alex'.
        self.name = identifier.split('.')[-1]
    def __repr__(self):
        # Bug fix: the old format string had three placeholders but only
        # two arguments, so calling repr() raised IndexError.
        return 'Synthesizer[{} - {}]'.format(self.name, self.rate)
    def generate_audio(self, word, variant):
        """Synthesize *word* at the given variant and return its SynthFile.

        'low'/'high' speak the word normally (the phoneme string is only
        recorded in metadata); 'medium' speaks the phoneme string with the
        stress digits stripped.
        """
        orig_phon, phoneme, phon_cmd = self.synth.phonemesFromText_(word), '', word
        if variant == 'low':
            # self.synth.startSpeakingString_toURL_(word,d_url)
            phoneme = orig_phon
        elif variant == 'medium':
            phoneme = re.sub('[0-9]', '', orig_phon)
            phon_cmd = '[[inpt PHON]] ' + phoneme
        elif variant == 'high':
            phoneme = orig_phon
            phon_cmd = word
        fname = dest_filename(word, phoneme, self.name, self.rate)
        # Bug fix: dest_path concatenates strings, so the int rate must be
        # stringified (the old call raised TypeError: str + int).
        d_path = dest_path(self.name, str(self.rate), fname)
        # Audio is produced via the `say` CLI; the NSURL-based API
        # (dest_url) is no longer used here.
        cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
        return SynthFile(word, phoneme, fname, self.name, self.rate, variant)
def synth_generator():
    """Build a synthesizer per installed US voice x rate; return a closure.

    The returned synth_for_words(words) renders every word with every
    synthesizer at the 'low'/'medium'/'high' variants.
    """
    voices_installed = NSSpeechSynthesizer.availableVoices()
    voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed]
    # Keep en-US voices whose short name is capitalised (skips novelty voices).
    us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs
                     if v['VoiceLanguage'] == 'en-US'
                     and v['VoiceIdentifier'].split('.')[-1][0].isupper()]
    # us_voices_ids = ['com.apple.speech.synthesis.voice.Fred','com.apple.speech.synthesis.voice.Alex',
    # 'com.apple.speech.synthesis.voice.Victoria']
    # voice_rates = list(range(150,221,(220-180)//4))
    voice_rates = [150, 180, 210, 250]
    voice_synths = []
    create_dir(dest_dir)
    for v in us_voices_ids:
        # Bug fixes: the int rate must be stringified before path
        # concatenation (the old str + int raised TypeError), and the
        # directory must use the voice's SHORT name so it matches where
        # SynthVariant.generate_audio writes files via dest_path.
        short_name = v.split('.')[-1]
        for r in voice_rates:
            create_dir(dest_dir + short_name + '/' + str(r))
            voice_synths.append(SynthVariant(v, r))
    def synth_for_words(words):
        all_synths = []
        for w in words:
            for s in voice_synths:
                for variant in ['low', 'medium', 'high']:
                    all_synths.append(s.generate_audio(w, variant))
        return all_synths
    return synth_for_words
def write_synths(synth_list, fname, csv=False):
    """Write SynthFile metadata to *fname* as CSV rows or one JSON array.

    Fix: the output file is opened with a context manager so it is closed
    even if a get_csv()/get_json() call raises.
    """
    with open(fname, 'w') as f:
        if csv:
            for s in synth_list:
                f.write(s.get_csv())
        else:
            json.dump([s.get_json() for s in synth_list], f)
def generate_audio_for_stories():
    """Synthesize every story text with every voice/rate/variant combination.

    Fix: the stories JSON file is now closed instead of leaking the handle.
    """
    with open('./inputs/all_stories_hs.json') as f:
        stories_data = json.load(f)
    word_list = [t[0] for i in stories_data.values() for t in i]
    words_audio_synth = synth_generator()
    return words_audio_synth(word_list)
# words_audio_synth = synth_generator()
# synth = NSSpeechSynthesizer.alloc().init()
# voices_installed = NSSpeechSynthesizer.availableVoices()
# voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed]
# us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs if v['VoiceLanguage'] == 'en-US' and v['VoiceIdentifier'].split('.')[-1][0].isupper()]
# synth.setVoice_(us_voices_ids[2])
# synth.startSpeakingString_('your')
# fname = dest_filename(word,self.name,self.rate,self.operation)
# d_path = dest_path(fname)
# d_url = dest_url(d_path)
# Script body: synthesize the literal word 'audio' (OUTPUT_NAME) with every
# voice/rate/variant combination and record the metadata CSV. Note this runs
# at import time — there is no __main__ guard.
synths = synth_generator()([OUTPUT_NAME])
# synths = generate_audio_for_stories()
write_synths(synths,dest_file,True)
# write_synths(synths,'./outputs/synths.json')

52
voicerss_tts.py Normal file
View File

@@ -0,0 +1,52 @@
import http.client, urllib.request, urllib.parse, urllib.error
def speech(settings):
    """Validate *settings*, then return the VoiceRSS API response dict."""
    __validate(settings)
    response = __request(settings)
    return response
def __validate(settings):
    """Raise RuntimeError if a required TTS setting is missing or empty."""
    if not settings:
        raise RuntimeError('The settings are undefined')
    if not settings.get('key'):
        raise RuntimeError('The API key is undefined')
    if not settings.get('src'):
        raise RuntimeError('The text is undefined')
    if not settings.get('hl'):
        raise RuntimeError('The language is undefined')
def __request(settings):
    """POST the TTS request to api.voicerss.org and return a result dict.

    NOTE(review): results are stored under BYTES keys (b'error',
    b'response') although the dict is initialised with str keys — callers
    such as generate_voice read the bytes keys, so this asymmetry is
    preserved here; confirm before normalising it.
    """
    result = {'error': None, 'response': None}
    headers = {'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'}
    params = urllib.parse.urlencode(__buildRequest(settings))
    use_ssl = 'ssl' in settings and settings['ssl']
    if use_ssl:
        conn = http.client.HTTPSConnection('api.voicerss.org:443')
    else:
        conn = http.client.HTTPConnection('api.voicerss.org:80')
    conn.request('POST', '/', params, headers)
    response = conn.getresponse()
    content = response.read()
    if response.status != 200:
        result[b'error'] = response.reason
    elif content.startswith(b'ERROR'):
        # The API signals failure by prefixing the body with 'ERROR'.
        result[b'error'] = content
    else:
        result[b'response'] = content
    conn.close()
    return result
def __buildRequest(settings):
    """Map the supported settings onto the API's form parameters ('' default)."""
    names = ('key', 'src', 'hl', 'r', 'c', 'f', 'ssml', 'b64')
    return {name: settings.get(name, '') for name in names}

52
voicerss_tts.py.bak Normal file
View File

@@ -0,0 +1,52 @@
import httplib, urllib
# Python 2 backup of voicerss_tts.py (uses httplib/urllib); kept verbatim.
def speech(settings):
    """Validate *settings*, then perform the VoiceRSS TTS request."""
    __validate(settings)
    return __request(settings)
def __validate(settings):
    """Raise RuntimeError when a required setting is missing or empty (py2 backup)."""
    if not settings: raise RuntimeError('The settings are undefined')
    if 'key' not in settings or not settings['key']: raise RuntimeError('The API key is undefined')
    if 'src' not in settings or not settings['src']: raise RuntimeError('The text is undefined')
    if 'hl' not in settings or not settings['hl']: raise RuntimeError('The language is undefined')
def __request(settings):
    """POST the TTS request; return {'error': ..., 'response': ...}.

    Python 2 original. Note: unlike the py3 port, this version stores
    results under the same str keys it initialises with.
    """
    result = {'error': None, 'response': None}
    headers = {'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'}
    params = urllib.urlencode(__buildRequest(settings))
    if 'ssl' in settings and settings['ssl']:
        conn = httplib.HTTPSConnection('api.voicerss.org:443')
    else:
        conn = httplib.HTTPConnection('api.voicerss.org:80')
    conn.request('POST', '/', params, headers)
    response = conn.getresponse()
    content = response.read()
    if response.status != 200:
        result['error'] = response.reason
    elif content.find('ERROR') == 0:
        # The API signals failure by prefixing the body with 'ERROR'.
        result['error'] = content
    else:
        result['response'] = content
    conn.close()
    return result
def __buildRequest(settings):
    """Map supported settings onto the API's form parameters (py2 backup)."""
    params = {'key': '', 'src': '', 'hl': '', 'r': '', 'c': '', 'f': '', 'ssml': '', 'b64': ''}
    if 'key' in settings: params['key'] = settings['key']
    if 'src' in settings: params['src'] = settings['src']
    if 'hl' in settings: params['hl'] = settings['hl']
    if 'r' in settings: params['r'] = settings['r']
    if 'c' in settings: params['c'] = settings['c']
    if 'f' in settings: params['f'] = settings['f']
    if 'ssml' in settings: params['ssml'] = settings['ssml']
    if 'b64' in settings: params['b64'] = settings['b64']
    return params