formatted

master
Malar Kannan 2017-10-25 13:36:41 +05:30
parent e6f0c8b21b
commit 82d0398d2c
8 changed files with 275 additions and 252 deletions

View File

@@ -1,55 +0,0 @@
#!/usr/bin/env python3
"""
Convert ARPABET <http://www.speech.cs.cmu.edu/cgi-bin/cmudict>
to Apple's codes <https://developer.apple.com/library/content/documentation/UserExperience/Conceptual/SpeechSynthesisProgrammingGuide/Phonemes/Phonemes.html>
"""
import sys
mapping = {s.split()[0]: s.split()[1] for s in """
AA AA
AE AE
AH UX
AO AO
AW AW
AY AY
B b
CH C
D d
DH D
EH EH
ER UXr
EY EY
F f
G g
HH h
IH IH
IY IY
JH J
K k
L l
M m
N n
NG N
OW OW
OY OY
P p
R r
S s
SH S
T t
TH T
UH UH
UW UW
V v
W w
Y y
Z z
ZH Z
""".strip().split('\n')}
arpabet_phonemes = sys.stdin.read().split()
apple_phonemes = [mapping[p.upper()] for p in arpabet_phonemes]
print('[[inpt PHON]] ' + ''.join(apple_phonemes))
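
A quick check of the table above, using the same mapping dict in a REPL (the ARPABET string is the CMU pronunciation of "sunflowers" with stress digits removed, since the table has no stressed variants):

>>> ''.join(mapping[p] for p in 'S AH N F L AW ER Z'.split())
'sUXnflAWUXrz'

so for that input the script prints: [[inpt PHON]] sUXnflAWUXrz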

View File

@@ -1,10 +0,0 @@
import pandas as pd
audio_file = pd.read_csv('./outputs/audio.csv',names=['word','voice','rate','type','filename'])
word_groups = audio_file.groupby('word')
# audio
lst = [1, 2, 3, 1, 2, 3]
s = pd.Series([1, 2, 3, 10, 20, 30], lst)
df3 = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})
s.groupby(level=0).sum()
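
For reference, the toy series above has index [1, 2, 3, 1, 2, 3], so grouping by index level 0 sums pairwise:

>>> s.groupby(level=0).sum()
1    11
2    22
3    33
dtype: int64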

View File

@@ -2,14 +2,24 @@ import pyaudio
 import numpy as np
 from matplotlib import pyplot as plt
-CHUNKSIZE = 1024  # fixed chunk size
+CHUNKSIZE = 44100 * 10  # fixed chunk size
 # initialize portaudio
-p = pyaudio.PyAudio()
-stream = p.open(format=pyaudio.paInt16, channels=1, rate=44100, input=True, frames_per_buffer=CHUNKSIZE)
+p_inp = pyaudio.PyAudio()
+# dev_n = p.get_device_count()
+# dev_infos = [p.get_device_info_by_index(index) for index in range(dev_n)]
+# [i for i in dev_infos]  # if i['name'] == 'record']
+stream = p_inp.open(
+    format=pyaudio.paInt24,
+    channels=2,
+    rate=44100,
+    input=True,
+    frames_per_buffer=CHUNKSIZE)
 # do this as long as you want fresh samples
 data = stream.read(CHUNKSIZE)
+len(data)
+CHUNKSIZE*10
 numpydata = np.fromstring(data, dtype=np.int16)
 # plot data
@@ -19,4 +29,27 @@ plt.show()
 # close stream
 stream.stop_stream()
 stream.close()
-p.terminate()
+p_inp.terminate()
+# open the file for reading.
+# wf = wave.open(sys.argv[1], 'rb')
+# create an audio object
+# p = pyaudio.PyAudio()
+# open stream based on the wave object which has been input.
+p_oup = pyaudio.PyAudio()
+stream = p_oup.open(
+    format=pyaudio.paInt24, channels=2, rate=44100, output=True)
+# read data (based on the chunk size)
+# data = wf.readframes(CHUNKSIZE)
+# play stream (looping from beginning of file to the end)
+# while data != '':
+# writing to the stream is what *actually* plays the sound.
+stream.write(data)
+# data = wf.readframes(chunk)
+# cleanup stuff.
+stream.close()
+p_oup.terminate()
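
A size sanity check on the capture above (a sketch, not part of the commit): paInt24 packs 3 bytes per sample, so one read should return CHUNKSIZE frames * 2 channels * 3 bytes.

# expected length of one stream.read(CHUNKSIZE) at paInt24, stereo
bytes_expected = CHUNKSIZE * 2 * 3  # 44100 * 10 * 6 = 2,646,000 bytes
# caveat: np.fromstring(data, dtype=np.int16) assumes 2-byte samples and
# will not decode 24-bit frames correctly; paInt16 capture would match it.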

View File

@@ -1,12 +0,0 @@
# import scipy.signal as sg
# import pysndfile.sndio as snd
#
# snd_data,samples,_ = snd.read('./outputs/sunflowers-Alex-150-normal-589.aiff')
# samples_per_seg = 3*int(samples*150/(3*1000))
# # samples/(len(snd_data)*1000.0)
# len(snd_data)
# samples_per_seg/2
#
# len(sg.spectrogram(snd_data,nperseg=samples_per_seg,noverlap=samples_per_seg/3)[2])
#
# from spectro_gen import generate_aiff_spectrogram

View File

@@ -1,6 +1,7 @@
 #!/usr/bin/env python
+#coding: utf-8
-""" This work is licensed under a Creative Commons Attribution 3.0 Unported License.
+""" This work is licensed under a Creative Commons Attribution 3.0 Unported
+License.
 Frank Zalkow, 2012-2013
 http://www.frank-zalkow.de/en/code-snippets/create-audio-spectrograms-with-python.html?i=1
 """
@@ -9,8 +10,9 @@ import numpy as np
 from matplotlib import pyplot as plt
 from pysndfile import sndio as snd
 from numpy.lib import stride_tricks
 """ short time fourier transform of audio signal """
+
 def stft(sig, frameSize, overlapFac=0.5, window=np.hanning):
     win = window(frameSize)
     hopSize = int(frameSize - np.floor(overlapFac * frameSize))
@@ -26,12 +28,18 @@ def stft(sig, frameSize, overlapFac=0.5, window=np.hanning):
     # zeros at end (thus samples can be fully covered by frames)
     samples = np.append(samples, np.zeros(frameSize))
-    frames = stride_tricks.as_strided(samples, shape=(cols, frameSize), strides=(samples.strides[0]*hopSize, samples.strides[0])).copy()
+    frames = stride_tricks.as_strided(
+        samples,
+        shape=(cols, frameSize),
+        strides=(samples.strides[0] * hopSize, samples.strides[0])).copy()
     frames *= win
     return np.fft.rfft(frames)
+
+
 """ scale frequency axis logarithmically """
+
 def logscale_spec(spec, sr=44100, factor=20.):
     timebins, freqbins = np.shape(spec)
@@ -58,7 +66,10 @@ def logscale_spec(spec, sr=44100, factor=20.):
     return newspec, freqs
+
+
 """ generate spectrogram for aiff audio with 150ms windows and 50ms overlap"""
+
 def generate_aiff_spectrogram(audiopath):
     samples, samplerate, _ = snd.read(audiopath)
     # samplerate, samples = wav.read(audiopath)
@@ -69,7 +80,10 @@ def generate_aiff_spectrogram(audiopath):
     ims = 20. * np.log10(np.abs(sshow) / 10e-6)
     return ims
+
+
 """ plot spectrogram"""
+
 def plotstft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):
     samples, samplerate, _ = snd.read(audiopath)
     # samplerate, samples = wav.read(audiopath)
@@ -83,7 +97,12 @@ def plotstft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):
     timebins, freqbins = np.shape(ims)
     # import pdb;pdb.set_trace()
     plt.figure(figsize=(15, 7.5))
-    plt.imshow(np.transpose(ims), origin="lower", aspect="auto", cmap=colormap, interpolation="none")
+    plt.imshow(
+        np.transpose(ims),
+        origin="lower",
+        aspect="auto",
+        cmap=colormap,
+        interpolation="none")
     plt.colorbar()
     plt.xlabel("time (s)")
@@ -92,7 +111,11 @@ def plotstft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):
     plt.ylim([0, freqbins])
     xlocs = np.float32(np.linspace(0, timebins - 1, 5))
-    plt.xticks(xlocs, ["%.02f" % l for l in ((xlocs*len(samples)/timebins)+(0.5*binsize))/samplerate])
+    plt.xticks(xlocs, [
+        "%.02f" % l
+        for l in (
+            (xlocs * len(samples) / timebins) + (0.5 * binsize)) / samplerate
+    ])
     ylocs = np.int16(np.round(np.linspace(0, freqbins - 1, 10)))
     plt.yticks(ylocs, ["%.02f" % freq[i] for i in ylocs])
@@ -103,6 +126,13 @@ def plotstft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):
     plt.clf()
+
+snd.get_info('./outputs/sunflowers-Alex-150-normal-589.aiff')
+snd_data_arr = snd.read('./outputs/sunflowers-Alex-150-normal-589.aiff')[0]
+snd_data = snd_data_arr.tobytes()
+snd_data_arr.dtype
+len(snd_data)
+
 if __name__ == '__main__':
     plotstft('./outputs/sunflowers-Alex-150-normal-589.aiff')
     plotstft('./outputs/sunflowers-Alex-180-normal-4763.aiff')
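
Taken together, the file's pipeline is stft -> logscale_spec -> dB scaling. A minimal usage sketch (hypothetical audio path; assumes pysndfile can read it):

samples, samplerate, _ = snd.read('./outputs/example.aiff')  # hypothetical
spec = stft(samples, frameSize=2**10)             # complex STFT, 50% overlap
sshow, freq = logscale_spec(spec, sr=samplerate)  # log-spaced frequency bins
ims = 20. * np.log10(np.abs(sshow) / 10e-6)       # amplitude -> dB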

View File

@@ -3,37 +3,28 @@ import numpy as np
 from spectro_gen import generate_aiff_spectrogram
 from sklearn.model_selection import train_test_split
 import itertools
-import pickle,gc
+import gc
-def sunflower_data():
-    audio_samples = pd.read_csv('./outputs/audio.csv',names=['word','voice','rate','variant','file'])
-    sunflowers = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True)
-    sunflowers.loc[:,'file'] = sunflowers.loc[:,'file'].apply(lambda x:'outputs/'+x).apply(generate_aiff_spectrogram)
-    y_data = sunflowers['variant'].apply(lambda x:x=='normal').values
-    max_samples = sunflowers['file'].apply(lambda x:x.shape[0]).max()
-    sample_size = sunflowers['file'][0].shape[1]
-    sample_count = sunflowers['file'].shape[0]
-    sunflowers['file'][0].shape[0]
-    def append_zeros(spgr):
-        orig = spgr.shape[0]
-        return np.lib.pad(spgr,[(0, max_samples-orig), (0,0)],'median')
-    pad_sun = sunflowers['file'].apply(append_zeros).values
-    x_data = np.vstack(pad_sun).reshape((sample_count,max_samples,sample_size,))
-    return train_test_split(x_data,y_data,test_size=0.33)
 def get_siamese_pairs(groupF1, groupF2):
     group1 = [r for (i, r) in groupF1.iterrows()]
     group2 = [r for (i, r) in groupF2.iterrows()]
     f = [(g1, g2) for g2 in group2 for g1 in group1]
-    t = [i for i in itertools.combinations(group1,2)]+[i for i in itertools.combinations(group2,2)]
+    t = [i for i in itertools.combinations(group1, 2)
+         ] + [i for i in itertools.combinations(group2, 2)]
     return (t, f)
+
 def sunflower_pairs_data():
-    audio_samples = pd.read_csv('./outputs/audio.csv',names=['word','voice','rate','variant','file'])
-    audio_samples = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True)
-    audio_samples.loc[:,'spectrogram'] = audio_samples.loc[:,'file'].apply(lambda x:'outputs/audio/'+x).apply(generate_aiff_spectrogram)
-    max_samples = audio_samples['spectrogram'].apply(lambda x:x.shape[0]).max()
-    sample_size = audio_samples['spectrogram'][0].shape[1]
+    audio_samples = pd.read_csv(
+        './outputs/audio.csv',
+        names=['word', 'voice', 'rate', 'variant', 'file'])
+    audio_samples = audio_samples.loc[audio_samples['word'] ==
+                                      'sunflowers'].reset_index(drop=True)
+    audio_samples.loc[:, 'spectrogram'] = audio_samples.loc[:, 'file'].apply(
+        lambda x: 'outputs/audio/' + x).apply(generate_aiff_spectrogram)
+    max_samples = audio_samples['spectrogram'].apply(
+        lambda x: x.shape[0]).max()
     same_data, diff_data = [], []
     for (w, g) in audio_samples.groupby(audio_samples['word']):
         sample_norm = g.loc[audio_samples['variant'] == 'normal']
@@ -43,34 +34,49 @@ def sunflower_pairs_data():
         diff_data.extend(diff)
     Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))])
     X_sample_pairs = same_data + diff_data
+
     def append_zeros(spgr):
-        sample = np.lib.pad(spgr,[(0, max_samples-spgr.shape[0]), (0,0)],'median')
+        sample = np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
+                            'median')
        return np.expand_dims(sample, axis=0)
+
     def create_X(sp):
         # sample_count = sp[0]['file'].shape[0]
         l_sample = append_zeros(sp[0]['spectrogram'])
-        r_sample = append_zeros(sp[1]['spectrogram'])#.apply(append_zeros).values
-        # x_data = np.vstack(pad_sun).reshape((sample_count,max_samples,sample_size))
+        r_sample = append_zeros(
+            sp[1]['spectrogram'])
         return np.expand_dims(np.vstack([l_sample, r_sample]), axis=0)
+
     X_list = (create_X(sp) for sp in X_sample_pairs)
     X = np.vstack(X_list)
     tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1)
     return train_test_split(X, Y, test_size=0.1)
+
 def create_spectrogram_data(audio_group='audio'):
-    audio_samples = pd.read_csv('./outputs/'+audio_group+'.csv',names=['word','voice','rate','variant','file'])
-    # audio_samples = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True)
-    audio_samples.loc[:,'spectrogram'] = audio_samples.loc[:,'file'].apply(lambda x:'outputs/'+audio_group+'/'+x).apply(generate_aiff_spectrogram)
+    audio_samples = pd.read_csv(
+        './outputs/' + audio_group + '.csv',
+        names=['word', 'voice', 'rate', 'variant', 'file'])
+    # audio_samples = audio_samples.loc[audio_samples['word'] ==
+    # 'sunflowers'].reset_index(drop=True)
+    audio_samples.loc[:, 'spectrogram'] = audio_samples.loc[:, 'file'].apply(
+        lambda x: 'outputs/' + audio_group + '/' + x).apply(
+            generate_aiff_spectrogram)
     audio_samples.to_pickle('outputs/spectrogram.pkl')
+
 def create_speech_pairs_data(audio_group='audio'):
     audio_samples = pd.read_pickle('outputs/spectrogram.pkl')
-    max_samples = audio_samples['spectrogram'].apply(lambda x:x.shape[0]).max()
-    sample_size = audio_samples['spectrogram'][0].shape[1]
+    max_samples = audio_samples['spectrogram'].apply(
+        lambda x: x.shape[0]).max()
+    # sample_size = audio_samples['spectrogram'][0].shape[1]
+
     def append_zeros(spgr):
-        sample = np.lib.pad(spgr,[(0, max_samples-spgr.shape[0]), (0,0)],'median')
+        sample = np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
+                            'median')
         return sample
+
     def create_X(sp):
         l_sample = append_zeros(sp[0]['spectrogram'])
         r_sample = append_zeros(sp[1]['spectrogram'])
@@ -79,8 +85,8 @@ def create_speech_pairs_data(audio_group='audio'):
     print('generating siamese speech pairs')
     same_data, diff_data = [], []
     for (w, g) in audio_samples.groupby(audio_samples['word']):
-        sample_norm = g.loc[audio_samples['variant'] == 'normal']#.reset_index(drop=True)
-        sample_phon = g.loc[audio_samples['variant'] == 'phoneme']#.reset_index(drop=True)
+        sample_norm = g.loc[audio_samples['variant'] == 'normal']
+        sample_phon = g.loc[audio_samples['variant'] == 'phoneme']
         same, diff = get_siamese_pairs(sample_norm, sample_phon)
         same_data.extend([create_X(s) for s in same[:10]])
         diff_data.extend([create_X(d) for d in diff[:10]])
@@ -91,7 +97,8 @@ def create_speech_pairs_data(audio_group='audio'):
     print('pickling X/Y')
     np.save('outputs/X.npy', X)
     np.save('outputs/Y.npy', Y)
-    del X
+    del same_data
+    del diff_data
     gc.collect()
     print('train/test splitting speech pairs')
     tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1)
@@ -101,25 +108,6 @@ def create_speech_pairs_data(audio_group='audio'):
     np.save('outputs/tr_y.npy', tr_y)
     np.save('outputs/te_y.npy', te_y)
-# def create_speech_model_data():
-#     (max_samples,sample_size) = pickle.load(open('./spectrogram_vars.pkl','rb'))
-#     x_data_pos = np.load('outputs/x_data_pos.npy')
-#     x_data_neg = np.load('outputs/x_data_neg.npy')
-#     x_pos_train, x_pos_test, x_neg_train, x_neg_test =train_test_split(x_data_pos,x_data_neg,test_size=0.1)
-#     del x_data_pos
-#     del x_data_neg
-#     gc.collect()
-#     print('split train and test')
-#     tr_y = np.array(x_pos_train.shape[0]*[1])
-#     te_y = np.array(x_pos_test.shape[0]*[[1,0]])
-#     tr_pairs = np.array([x_pos_train,x_neg_train]).reshape(x_pos_train.shape[0],2,max_samples,sample_size)
-#     te_pairs = np.array([x_pos_test,x_neg_test]).reshape(x_pos_test.shape[0],2,max_samples,sample_size)
-#     print('reshaped to input dim')
-#     np.save('outputs/tr_pairs.npy',tr_pairs)
-#     np.save('outputs/te_pairs.npy',te_pairs)
-#     np.save('outputs/tr_y.npy',tr_y)
-#     np.save('outputs/te_y.npy',te_y)
-#     print('pickled speech model data')
 def speech_model_data():
     tr_pairs = np.load('outputs/tr_pairs.npy') / 255.0
@@ -130,6 +118,7 @@ def speech_model_data():
     te_y = np.load('outputs/te_y.npy')
     return tr_pairs, te_pairs, tr_y, te_y
+
 if __name__ == '__main__':
     # sunflower_pairs_data()
     # create_spectrogram_data()
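
For intuition, get_siamese_pairs builds all within-group ("same") and cross-group ("different") row pairs. A toy sketch with hypothetical data (pandas is already imported as pd in this module):

g1 = pd.DataFrame({'file': ['a1', 'a2', 'a3']})  # e.g. 'normal' variants
g2 = pd.DataFrame({'file': ['b1', 'b2']})        # e.g. 'phoneme' variants
t, f = get_siamese_pairs(g1, g2)
len(t)  # 4 = C(3,2) + C(2,2) same-group pairs
len(f)  # 6 = 3 * 2 cross-group pairs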

View File

@@ -1,4 +1,3 @@
 from __future__ import absolute_import
 from __future__ import print_function
-
 import numpy as np
@@ -12,8 +11,8 @@ from keras import backend as K
 def euclidean_distance(vects):
     x, y = vects
-    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True),
-                  K.epsilon()))
+    return K.sqrt(
+        K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))
 def eucl_dist_output_shape(shapes):
@@ -79,27 +78,40 @@ input_b = Input(shape=input_dim)
 processed_a = base_network(input_a)
 processed_b = base_network(input_b)
-distance = Lambda(euclidean_distance,
-                  output_shape=eucl_dist_output_shape)(
-    [processed_a, processed_b]
-)
+distance = Lambda(
+    euclidean_distance,
+    output_shape=eucl_dist_output_shape)([processed_a, processed_b])
 model = Model([input_a, input_b], distance)
-tb_cb = TensorBoard(log_dir='./logs/siamese_logs', histogram_freq=1,
-                    batch_size=32, write_graph=True, write_grads=True,
-                    write_images=True, embeddings_freq=0,
-                    embeddings_layer_names=None, embeddings_metadata=None)
+tb_cb = TensorBoard(
+    log_dir='./logs/siamese_logs',
+    histogram_freq=1,
+    batch_size=32,
+    write_graph=True,
+    write_grads=True,
+    write_images=True,
+    embeddings_freq=0,
+    embeddings_layer_names=None,
+    embeddings_metadata=None)
 cp_file_fmt = './models/siamese_speech_model-{epoch:02d}-epoch-{val_acc:0.2f}\
 -acc.h5'
-cp_cb = ModelCheckpoint(cp_file_fmt, monitor='val_acc', verbose=0,
-                        save_best_only=False, save_weights_only=False,
-                        mode='auto', period=1)
+
+cp_cb = ModelCheckpoint(
+    cp_file_fmt,
+    monitor='val_acc',
+    verbose=0,
+    save_best_only=False,
+    save_weights_only=False,
+    mode='auto',
+    period=1)
 # train
 rms = RMSprop(lr=0.001)
 sgd = SGD(lr=0.001)
 model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])
-model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
-          batch_size=128,
-          epochs=50,
-          validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),
+model.fit(
+    [tr_pairs[:, 0], tr_pairs[:, 1]],
+    tr_y,
+    batch_size=128,
+    epochs=50,
+    validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),
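
The contrastive_loss compiled above is not shown in this hunk; for context, the standard definition from the Keras siamese example it follows would be (margin of 1 assumed):

def contrastive_loss(y_true, y_pred):
    # y_pred is the euclidean distance emitted by the Lambda layer above
    margin = 1
    return K.mean(y_true * K.square(y_pred) +
                  (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))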

View File

@@ -1,28 +1,41 @@
 import objc
-from AppKit import NSSpeechSynthesizer,NSSpeechInputModeProperty,NSSpeechModePhoneme
-from Foundation import NSURL,NSError,NSObject
+from AppKit import NSSpeechSynthesizer, NSSpeechInputModeProperty
+from AppKit import NSSpeechModePhoneme
+from Foundation import NSURL
 import json
 import random
 import os
 import re
 import subprocess
 OUTPUT_NAME = 'audio'
 dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
 dest_file = './outputs/' + OUTPUT_NAME + '.csv'
 def create_dir(direc):
     if not os.path.exists(direc):
         os.mkdir(direc)
-dest_filename = lambda n,v,r,t: '{}-{}-{}-{}-'.format(n,v,r,t)+str(random.randint(0,10000))+'.aiff'
-dest_path = lambda v,r,n: dest_dir+v+'/'+r+'/'+n
-dest_url = lambda p: NSURL.fileURLWithPath_(p)
+
+
+def dest_filename(n, v, r, t):
+    return '{}-{}-{}-{}-'.format(n, v, r,
+                                 t) + str(random.randint(0, 10000)) + '.aiff'
+
+
+def dest_path(v, r, n):
+    return dest_dir + v + '/' + r + '/' + n
+
+
 def cli_gen_audio(speech_cmd, rate, voice, out_path):
-    subprocess.call(['say','-v',voice,'-r',str(rate),'-o',out_path,speech_cmd])
+    subprocess.call(
+        ['say', '-v', voice, '-r',
+         str(rate), '-o', out_path, speech_cmd])
+
+
 class SynthFile(object):
     """docstring for SynthFile."""
     def __init__(self, word, phon, filename, voice, rate, operation):
         super(SynthFile, self).__init__()
         self.word = word
@@ -33,23 +46,33 @@ class SynthFile(object):
         self.variant = operation
     def get_json(self):
-        return {'filename':self.filename,'voice':self.voice,
-                'rate':self.rate,'operation':self.operation}
+        return {
+            'filename': self.filename,
+            'voice': self.voice,
+            'rate': self.rate,
+            'operation': self.operation
+        }
     def get_csv(self):
-        return '{},{},{},{},{}\n'.format(self.word,self.phoneme,self.voice,self.rate,self.variant,self.filename)
+        return '{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,
+                                         self.rate, self.variant,
                                          self.filename)
+
+
 class SynthVariant(object):
     """docstring for SynthVariant."""
     def __init__(self, identifier, rate):
         super(SynthVariant, self).__init__()
         self.synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
         self.synth.setVolume_(100)
         self.synth.setRate_(rate)
-        self.phone_synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
+        self.phone_synth = NSSpeechSynthesizer.alloc().initWithVoice_(
+            identifier)
         self.phone_synth.setVolume_(100)
         self.phone_synth.setRate_(rate)
-        self.phone_synth.setObject_forProperty_error_(NSSpeechModePhoneme,NSSpeechInputModeProperty,None)
+        self.phone_synth.setObject_forProperty_error_(
+            NSSpeechModePhoneme, NSSpeechInputModeProperty, None)
         self.identifier = identifier
         self.rate = rate
         self.name = identifier.split('.')[-1]
@@ -58,7 +81,8 @@ class SynthVariant(object):
         return 'Synthesizer[{} - {}]({})'.format(self.name, self.rate)
     def generate_audio(self, word, variant):
-        orig_phon,phoneme,phon_cmd = self.synth.phonemesFromText_(word),'',word
+        orig_phon, phoneme, phon_cmd = self.synth.phonemesFromText_(
+            word), '', word
         if variant == 'low':
             # self.synth.startSpeakingString_toURL_(word,d_url)
             phoneme = orig_phon
@@ -75,16 +99,23 @@ class SynthVariant(object):
         # self.synth.startSpeakingString_toURL_(word,d_url)
         fname = dest_filename(word, phoneme, self.name, self.rate)
         d_path = dest_path(self.name, self.rate, fname)
-        d_url = dest_url(d_path)
+        # d_url = NSURL.fileURLWithPath_(d_path)
         cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
         return SynthFile(word, phoneme, fname, self.name, self.rate, variant)
+
+
 def synth_generator():
     voices_installed = NSSpeechSynthesizer.availableVoices()
-    voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed]
-    us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs if v['VoiceLanguage'] == 'en-US' and v['VoiceIdentifier'].split('.')[-1][0].isupper()]
-    # us_voices_ids = ['com.apple.speech.synthesis.voice.Fred','com.apple.speech.synthesis.voice.Alex',
+    voice_attrs = [
+        NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed
+    ]
+    us_voices_ids = [
+        v['VoiceIdentifier'] for v in voice_attrs
+        if v['VoiceLanguage'] == 'en-US'
+        and v['VoiceIdentifier'].split('.')[-1][0].isupper()
+    ]
+    # us_voices_ids = ['com.apple.speech.synthesis.voice.Fred',
+    # 'com.apple.speech.synthesis.voice.Alex',
     # 'com.apple.speech.synthesis.voice.Victoria']
     # voice_rates = list(range(150,221,(220-180)//4))
     voice_rates = [150, 180, 210, 250]
@@ -94,6 +125,7 @@ def synth_generator():
         for r in voice_rates:
             create_dir(dest_dir + v + '/' + r)
             voice_synths.append(SynthVariant(v, r))
+
     def synth_for_words(words):
         all_synths = []
         for w in words:
@@ -101,8 +133,10 @@ def synth_generator():
             for v in ['low', 'medium', 'high']:
                 all_synths.append(s.generate_audio(w, v))
         return all_synths
+
     return synth_for_words
+
+
 def write_synths(synth_list, fname, csv=False):
     f = open(fname, 'w')
     if csv:
@@ -112,12 +146,14 @@ def write_synths(synth_list,fname,csv=False):
         json.dump([s.get_json() for s in synth_list], f)
     f.close()
+
+
 def generate_audio_for_stories():
     stories_data = json.load(open('./inputs/all_stories_hs.json'))
     word_list = [t[0] for i in stories_data.values() for t in i]
     words_audio_synth = synth_generator()
     return words_audio_synth(word_list)
+
 # words_audio_synth = synth_generator()
 # synth = NSSpeechSynthesizer.alloc().init()
 # voices_installed = NSSpeechSynthesizer.availableVoices()
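
For context, cli_gen_audio above shells out to macOS's say utility; a hypothetical call for one phoneme variant (values invented for illustration) would be:

cli_gen_audio('[[inpt PHON]] sUXnflAWUXrz', 180, 'Alex',
              './outputs/audio/Alex/180/sunflowers-test.aiff')
# roughly: say -v Alex -r 180 -o <out_path> "[[inpt PHON]] sUXnflAWUXrz"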