parent e6f0c8b21b  commit 82d0398d2c
@@ -1,55 +0,0 @@
#!/usr/bin/env python3

"""
Convert ARPABET <http://www.speech.cs.cmu.edu/cgi-bin/cmudict>
to Apple's codes <https://developer.apple.com/library/content/documentation/UserExperience/Conceptual/SpeechSynthesisProgrammingGuide/Phonemes/Phonemes.html>
"""

import sys


mapping = {s.split()[0]: s.split()[1] for s in """
AA AA
AE AE
AH UX
AO AO
AW AW
AY AY
B b
CH C
D d
DH D
EH EH
ER UXr
EY EY
F f
G g
HH h
IH IH
IY IY
JH J
K k
L l
M m
N n
NG N
OW OW
OY OY
P p
R r
S s
SH S
T t
TH T
UH UH
UW UW
V v
W w
Y y
Z z
ZH Z
""".strip().split('\n')}

arpabet_phonemes = sys.stdin.read().split()
apple_phonemes = [mapping[p.upper()] for p in arpabet_phonemes]
print('[[inpt PHON]] ' + ''.join(apple_phonemes))
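A hedged sanity check of the table (not part of the diff): the script assumes stress digits such as AH1 are already stripped, since the dict has no stressed keys. A minimal subset is enough to spell "sunflowers":

# Subset of the mapping above; the script's full dict covers all 39 phonemes.
mapping = {'S': 's', 'AH': 'UX', 'N': 'n', 'F': 'f',
           'L': 'l', 'AW': 'AW', 'ER': 'UXr', 'Z': 'z'}
phones = 'S AH N F L AW ER Z'.split()  # "sunflowers", stress digits removed
print('[[inpt PHON]] ' + ''.join(mapping[p] for p in phones))
# -> [[inpt PHON]] sUXnflAWUXrz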
@@ -1,10 +0,0 @@
import pandas as pd

audio_file = pd.read_csv('./outputs/audio.csv',names=['word','voice','rate','type','filename'])
word_groups = audio_file.groupby('word')
# audio
lst = [1, 2, 3, 1, 2, 3]
s = pd.Series([1, 2, 3, 10, 20, 30], lst)
df3 = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})

s.groupby(level=0).sum()
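# Worked example (not in the diff): for the series above, values
# [1, 2, 3, 10, 20, 30] on index [1, 2, 3, 1, 2, 3], grouping on the
# index level sums the duplicate labels:
#   1    11
#   2    22
#   3    33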
@@ -2,14 +2,24 @@ import pyaudio
import numpy as np
from matplotlib import pyplot as plt

CHUNKSIZE = 1024 # fixed chunk size
CHUNKSIZE = 44100 * 10  # fixed chunk size

# initialize portaudio
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=1, rate=44100, input=True, frames_per_buffer=CHUNKSIZE)
p_inp = pyaudio.PyAudio()
# dev_n = p.get_device_count()
# dev_infos = [p.get_device_info_by_index(index) for index in range(dev_n)]
# [i for i in dev_infos]  # if i['name'] == 'record']
stream = p_inp.open(
    format=pyaudio.paInt24,
    channels=2,
    rate=44100,
    input=True,
    frames_per_buffer=CHUNKSIZE)

# do this as long as you want fresh samples
data = stream.read(CHUNKSIZE)
len(data)
CHUNKSIZE*10
numpydata = np.fromstring(data, dtype=np.int16)
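One caveat (a hedged sketch, not part of the diff): the stream above was opened as 2-channel paInt24, but the line above decodes int16. NumPy has no int24 dtype, so a correct decode has to assemble the 3-byte little-endian samples by hand:

import numpy as np

raw = np.frombuffer(data, dtype=np.uint8).reshape(-1, 3)  # one row per sample
samples = (raw[:, 0].astype(np.int32)           # little-endian byte order
           | (raw[:, 1].astype(np.int32) << 8)
           | (raw[:, 2].astype(np.int32) << 16))
samples -= (samples & 0x800000) << 1            # sign-extend the 24-bit values
stereo = samples.reshape(-1, 2)                 # columns: left, right channel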

# plot data
@@ -19,4 +29,27 @@ plt.show()
# close stream
stream.stop_stream()
stream.close()
p.terminate()
p_inp.terminate()
# open the file for reading.
# wf = wave.open(sys.argv[1], 'rb')

# create an audio object
# p = pyaudio.PyAudio()

# open stream based on the wave object which has been input.
p_oup = pyaudio.PyAudio()
stream = p_oup.open(
    format=pyaudio.paInt24, channels=2, rate=44100, output=True)

# read data (based on the chunk size)
# data = wf.readframes(CHUNKSIZE)

# play stream (looping from beginning of file to the end)
# while data != '':
# writing to the stream is what *actually* plays the sound.
stream.write(data)
# data = wf.readframes(chunk)

# cleanup stuff.
stream.close()
p_oup.terminate()
snippets.py
@@ -1,12 +0,0 @@
# import scipy.signal as sg
# import pysndfile.sndio as snd
#
# snd_data,samples,_ = snd.read('./outputs/sunflowers-Alex-150-normal-589.aiff')
# samples_per_seg = 3*int(samples*150/(3*1000))
# # samples/(len(snd_data)*1000.0)
# len(snd_data)
# samples_per_seg/2
#
# len(sg.spectrogram(snd_data,nperseg=samples_per_seg,noverlap=samples_per_seg/3)[2])
#
# from spectro_gen import generate_aiff_spectrogram
@@ -1,6 +1,7 @@
#!/usr/bin/env python
#coding: utf-8
""" This work is licensed under a Creative Commons Attribution 3.0 Unported License.

""" This work is licensed under a Creative Commons Attribution 3.0 Unported
License.
Frank Zalkow, 2012-2013
http://www.frank-zalkow.de/en/code-snippets/create-audio-spectrograms-with-python.html?i=1
"""
@@ -9,8 +10,9 @@ import numpy as np
from matplotlib import pyplot as plt
from pysndfile import sndio as snd
from numpy.lib import stride_tricks

""" short time fourier transform of audio signal """


def stft(sig, frameSize, overlapFac=0.5, window=np.hanning):
    win = window(frameSize)
    hopSize = int(frameSize - np.floor(overlapFac * frameSize))
@@ -18,82 +20,103 @@ def stft(sig, frameSize, overlapFac=0.5, window=np.hanning):
    # zeros at beginning (thus center of 1st window should be for sample nr. 0)
    # sig = (sig*255).astype(np.uint8)
    # import pdb;pdb.set_trace()
    count = int(np.floor(frameSize/2.0))
    count = int(np.floor(frameSize / 2.0))
    # import pdb;pdb.set_trace()
    samples = np.append(np.zeros(count), sig)
    # cols for windowing
    cols = int(np.ceil( (len(samples) - frameSize) / float(hopSize)) + 1)
    cols = int(np.ceil((len(samples) - frameSize) / float(hopSize)) + 1)
    # zeros at end (thus samples can be fully covered by frames)
    samples = np.append(samples, np.zeros(frameSize))

    frames = stride_tricks.as_strided(samples, shape=(cols, frameSize), strides=(samples.strides[0]*hopSize, samples.strides[0])).copy()
    frames = stride_tricks.as_strided(
        samples,
        shape=(cols, frameSize),
        strides=(samples.strides[0] * hopSize, samples.strides[0])).copy()
    frames *= win

    return np.fft.rfft(frames)
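A hedged usage sketch (not part of the diff), assuming the stft above is in scope; it mirrors the 150 ms frame and one-third overlap the callers below pass:

import numpy as np

sr = 44100
tone = np.sin(2 * np.pi * 440 * np.arange(sr) / sr)  # one second of A440
spec = stft(tone, frameSize=sr * 150 // 1000, overlapFac=1.0 / 3)
print(spec.shape)  # (number of frames, frameSize // 2 + 1) complex bins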


""" scale frequency axis logarithmically """


def logscale_spec(spec, sr=44100, factor=20.):
    timebins, freqbins = np.shape(spec)

    scale = np.linspace(0, 1, freqbins) ** factor
    scale *= (freqbins-1)/max(scale)
    scale = np.linspace(0, 1, freqbins)**factor
    scale *= (freqbins - 1) / max(scale)
    scale = np.unique(np.round(scale)).astype(np.uint32)
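    # note: with factor=1.0, the value both callers below pass, this scale
    # is linear and the re-binning below is effectively an identity pass;
    # the function then mainly attaches center frequencies to the bins.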
    # import pdb;pdb.set_trace()
    # create spectrogram with new freq bins
    newspec = np.complex128(np.zeros([timebins, len(scale)]))
    for i in range(0, len(scale)):
        if i == len(scale)-1:
            newspec[:,i] = np.sum(spec[:,scale[i]:], axis=1)
        if i == len(scale) - 1:
            newspec[:, i] = np.sum(spec[:, scale[i]:], axis=1)
        else:
            newspec[:,i] = np.sum(spec[:,scale[i]:scale[i+1]], axis=1)
            newspec[:, i] = np.sum(spec[:, scale[i]:scale[i + 1]], axis=1)

    # list center freq of bins
    allfreqs = np.abs(np.fft.fftfreq(freqbins*2, 1./sr)[:freqbins+1])
    allfreqs = np.abs(np.fft.fftfreq(freqbins * 2, 1. / sr)[:freqbins + 1])
    freqs = []
    for i in range(0, len(scale)):
        if i == len(scale)-1:
        if i == len(scale) - 1:
            freqs += [np.mean(allfreqs[scale[i]:])]
        else:
            freqs += [np.mean(allfreqs[scale[i]:scale[i+1]])]
            freqs += [np.mean(allfreqs[scale[i]:scale[i + 1]])]

    return newspec, freqs


""" generate spectrogram for aiff audio with 150ms windows and 50ms overlap"""


def generate_aiff_spectrogram(audiopath):
    samples,samplerate,_ = snd.read(audiopath)
    samples, samplerate, _ = snd.read(audiopath)
    # samplerate, samples = wav.read(audiopath)
    # s = stft(samples, binsize)
    s = stft(samples, samplerate*150//1000,1.0/3)
    s = stft(samples, samplerate * 150 // 1000, 1.0 / 3)

    sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
    ims = 20.*np.log10(np.abs(sshow)/10e-6)
    ims = 20. * np.log10(np.abs(sshow) / 10e-6)
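    # note: 10e-6 (i.e. 1e-5) acts as the dB reference amplitude, so
    # quieter bins come out negative; speech_data.speech_model_data later
    # clips those negatives to zero.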
    return ims


""" plot spectrogram"""


def plotstft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):
    samples,samplerate,_ = snd.read(audiopath)
    samples, samplerate, _ = snd.read(audiopath)
    # samplerate, samples = wav.read(audiopath)
    # s = stft(samples, binsize)
    # print(samplerate*150//1000)
    s = stft(samples, samplerate*150//1000,1.0/3)
    s = stft(samples, samplerate * 150 // 1000, 1.0 / 3)

    sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
    ims = 20.*np.log10(np.abs(sshow)/10e-6)  # amplitude to decibel
    ims = 20. * np.log10(np.abs(sshow) / 10e-6)  # amplitude to decibel

    timebins, freqbins = np.shape(ims)
    # import pdb;pdb.set_trace()
    plt.figure(figsize=(15, 7.5))
    plt.imshow(np.transpose(ims), origin="lower", aspect="auto", cmap=colormap, interpolation="none")
    plt.imshow(
        np.transpose(ims),
        origin="lower",
        aspect="auto",
        cmap=colormap,
        interpolation="none")
    plt.colorbar()

    plt.xlabel("time (s)")
    plt.ylabel("frequency (hz)")
    plt.xlim([0, timebins-1])
    plt.xlim([0, timebins - 1])
    plt.ylim([0, freqbins])

    xlocs = np.float32(np.linspace(0, timebins-1, 5))
    plt.xticks(xlocs, ["%.02f" % l for l in ((xlocs*len(samples)/timebins)+(0.5*binsize))/samplerate])
    ylocs = np.int16(np.round(np.linspace(0, freqbins-1, 10)))
    xlocs = np.float32(np.linspace(0, timebins - 1, 5))
    plt.xticks(xlocs, [
        "%.02f" % l
        for l in (
            (xlocs * len(samples) / timebins) + (0.5 * binsize)) / samplerate
    ])
    ylocs = np.int16(np.round(np.linspace(0, freqbins - 1, 10)))
    plt.yticks(ylocs, ["%.02f" % freq[i] for i in ylocs])

    if plotpath:
@@ -103,6 +126,13 @@ def plotstft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):

    plt.clf()


snd.get_info('./outputs/sunflowers-Alex-150-normal-589.aiff')
snd_data_arr = snd.read('./outputs/sunflowers-Alex-150-normal-589.aiff')[0]
snd_data = snd_data_arr.tobytes()
snd_data_arr.dtype
len(snd_data)

if __name__ == '__main__':
    plotstft('./outputs/sunflowers-Alex-150-normal-589.aiff')
    plotstft('./outputs/sunflowers-Alex-180-normal-4763.aiff')
speech_data.py
@@ -3,135 +3,124 @@ import numpy as np
from spectro_gen import generate_aiff_spectrogram
from sklearn.model_selection import train_test_split
import itertools
import pickle,gc
import gc

def sunflower_data():
    audio_samples = pd.read_csv('./outputs/audio.csv',names=['word','voice','rate','variant','file'])
    sunflowers = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True)
    sunflowers.loc[:,'file'] = sunflowers.loc[:,'file'].apply(lambda x:'outputs/'+x).apply(generate_aiff_spectrogram)
    y_data = sunflowers['variant'].apply(lambda x:x=='normal').values
    max_samples = sunflowers['file'].apply(lambda x:x.shape[0]).max()
    sample_size = sunflowers['file'][0].shape[1]
    sample_count = sunflowers['file'].shape[0]
    sunflowers['file'][0].shape[0]
    def append_zeros(spgr):
        orig = spgr.shape[0]
        return np.lib.pad(spgr,[(0, max_samples-orig), (0,0)],'median')
    pad_sun = sunflowers['file'].apply(append_zeros).values
    x_data = np.vstack(pad_sun).reshape((sample_count,max_samples,sample_size,))
    return train_test_split(x_data,y_data,test_size=0.33)

def get_siamese_pairs(groupF1,groupF2):
    group1 = [r for (i,r) in groupF1.iterrows()]
    group2 = [r for (i,r) in groupF2.iterrows()]
    f = [(g1,g2) for g2 in group2 for g1 in group1]
    t = [i for i in itertools.combinations(group1,2)]+[i for i in itertools.combinations(group2,2)]
    return (t,f)
def get_siamese_pairs(groupF1, groupF2):
    group1 = [r for (i, r) in groupF1.iterrows()]
    group2 = [r for (i, r) in groupF2.iterrows()]
    f = [(g1, g2) for g2 in group2 for g1 in group1]
    t = [i for i in itertools.combinations(group1, 2)
         ] + [i for i in itertools.combinations(group2, 2)]
    return (t, f)
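# Worked example (not in the diff): for two groups of four rows each,
# t holds C(4,2) + C(4,2) = 12 same-variant pairs and f holds
# 4 * 4 = 16 cross-variant pairs.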


def sunflower_pairs_data():
    audio_samples = pd.read_csv('./outputs/audio.csv',names=['word','voice','rate','variant','file'])
    audio_samples = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True)
    audio_samples.loc[:,'spectrogram'] = audio_samples.loc[:,'file'].apply(lambda x:'outputs/audio/'+x).apply(generate_aiff_spectrogram)
    max_samples = audio_samples['spectrogram'].apply(lambda x:x.shape[0]).max()
    sample_size = audio_samples['spectrogram'][0].shape[1]
    same_data,diff_data = [],[]
    for (w,g) in audio_samples.groupby(audio_samples['word']):
    audio_samples = pd.read_csv(
        './outputs/audio.csv',
        names=['word', 'voice', 'rate', 'variant', 'file'])
    audio_samples = audio_samples.loc[audio_samples['word'] ==
                                      'sunflowers'].reset_index(drop=True)
    audio_samples.loc[:, 'spectrogram'] = audio_samples.loc[:, 'file'].apply(
        lambda x: 'outputs/audio/' + x).apply(generate_aiff_spectrogram)
    max_samples = audio_samples['spectrogram'].apply(
        lambda x: x.shape[0]).max()
    same_data, diff_data = [], []
    for (w, g) in audio_samples.groupby(audio_samples['word']):
        sample_norm = g.loc[audio_samples['variant'] == 'normal']
        sample_phon = g.loc[audio_samples['variant'] == 'phoneme']
        same , diff = get_siamese_pairs(sample_norm,sample_phon)
        same, diff = get_siamese_pairs(sample_norm, sample_phon)
        same_data.extend(same)
        diff_data.extend(diff)
    Y = np.hstack([np.ones(len(same_data)),np.zeros(len(diff_data))])
    X_sample_pairs = same_data+diff_data
    Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))])
    X_sample_pairs = same_data + diff_data

    def append_zeros(spgr):
        sample = np.lib.pad(spgr,[(0, max_samples-spgr.shape[0]), (0,0)],'median')
        return np.expand_dims(sample,axis=0)
        sample = np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
                            'median')
        return np.expand_dims(sample, axis=0)

    def create_X(sp):
        # sample_count = sp[0]['file'].shape[0]
        l_sample = append_zeros(sp[0]['spectrogram'])
        r_sample = append_zeros(sp[1]['spectrogram'])#.apply(append_zeros).values
        # x_data = np.vstack(pad_sun).reshape((sample_count,max_samples,sample_size))
        return np.expand_dims(np.vstack([l_sample,r_sample]),axis=0)
        r_sample = append_zeros(
            sp[1]['spectrogram'])
        return np.expand_dims(np.vstack([l_sample, r_sample]), axis=0)

    X_list = (create_X(sp) for sp in X_sample_pairs)
    X = np.vstack(X_list)
    tr_pairs,te_pairs,tr_y,te_y = train_test_split(X,Y,test_size=0.1)
    return train_test_split(X,Y,test_size=0.1)
    tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1)
    return train_test_split(X, Y, test_size=0.1)


def create_spectrogram_data(audio_group='audio'):
    audio_samples = pd.read_csv('./outputs/'+audio_group+'.csv',names=['word','voice','rate','variant','file'])
    # audio_samples = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True)
    audio_samples.loc[:,'spectrogram'] = audio_samples.loc[:,'file'].apply(lambda x:'outputs/'+audio_group+'/'+x).apply(generate_aiff_spectrogram)
    audio_samples = pd.read_csv(
        './outputs/' + audio_group + '.csv',
        names=['word', 'voice', 'rate', 'variant', 'file'])
    # audio_samples = audio_samples.loc[audio_samples['word'] ==
    # 'sunflowers'].reset_index(drop=True)
    audio_samples.loc[:, 'spectrogram'] = audio_samples.loc[:, 'file'].apply(
        lambda x: 'outputs/' + audio_group + '/' + x).apply(
            generate_aiff_spectrogram)
    audio_samples.to_pickle('outputs/spectrogram.pkl')


def create_speech_pairs_data(audio_group='audio'):
    audio_samples = pd.read_pickle('outputs/spectrogram.pkl')
    max_samples = audio_samples['spectrogram'].apply(lambda x:x.shape[0]).max()
    sample_size = audio_samples['spectrogram'][0].shape[1]
    max_samples = audio_samples['spectrogram'].apply(
        lambda x: x.shape[0]).max()

    # sample_size = audio_samples['spectrogram'][0].shape[1]

    def append_zeros(spgr):
        sample = np.lib.pad(spgr,[(0, max_samples-spgr.shape[0]), (0,0)],'median')
        sample = np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
                            'median')
        return sample
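        # (note: pad mode 'median' fills the added rows with median values
        # taken from the existing data, growing every spectrogram to a
        # common max_samples height without injecting hard zeros.)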

    def create_X(sp):
        l_sample = append_zeros(sp[0]['spectrogram'])
        r_sample = append_zeros(sp[1]['spectrogram'])
        return np.asarray([l_sample,r_sample])
        return np.asarray([l_sample, r_sample])

    print('generating siamese speech pairs')
    same_data,diff_data = [],[]
    for (w,g) in audio_samples.groupby(audio_samples['word']):
        sample_norm = g.loc[audio_samples['variant'] == 'normal']#.reset_index(drop=True)
        sample_phon = g.loc[audio_samples['variant'] == 'phoneme']#.reset_index(drop=True)
        same , diff = get_siamese_pairs(sample_norm,sample_phon)
    same_data, diff_data = [], []
    for (w, g) in audio_samples.groupby(audio_samples['word']):
        sample_norm = g.loc[audio_samples['variant'] == 'normal']
        sample_phon = g.loc[audio_samples['variant'] == 'phoneme']
        same, diff = get_siamese_pairs(sample_norm, sample_phon)
        same_data.extend([create_X(s) for s in same[:10]])
        diff_data.extend([create_X(d) for d in diff[:10]])
    print('creating all speech pairs')
    Y = np.hstack([np.ones(len(same_data)),np.zeros(len(diff_data))])
    Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))])
    print('casting as array speech pairs')
    X = np.asarray(same_data+diff_data)
    X = np.asarray(same_data + diff_data)
    print('pickling X/Y')
    np.save('outputs/X.npy',X)
    np.save('outputs/Y.npy',Y)
    del X
    np.save('outputs/X.npy', X)
    np.save('outputs/Y.npy', Y)
    del same_data
    del diff_data
    gc.collect()
    print('train/test splitting speech pairs')
    tr_pairs,te_pairs,tr_y,te_y = train_test_split(X,Y,test_size=0.1)
    tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1)
    print('pickling train/test')
    np.save('outputs/tr_pairs.npy',tr_pairs)
    np.save('outputs/te_pairs.npy',te_pairs)
    np.save('outputs/tr_y.npy',tr_y)
    np.save('outputs/te_y.npy',te_y)
    np.save('outputs/tr_pairs.npy', tr_pairs)
    np.save('outputs/te_pairs.npy', te_pairs)
    np.save('outputs/tr_y.npy', tr_y)
    np.save('outputs/te_y.npy', te_y)

# def create_speech_model_data():
#     (max_samples,sample_size) = pickle.load(open('./spectrogram_vars.pkl','rb'))
#     x_data_pos = np.load('outputs/x_data_pos.npy')
#     x_data_neg = np.load('outputs/x_data_neg.npy')
#     x_pos_train, x_pos_test, x_neg_train, x_neg_test =train_test_split(x_data_pos,x_data_neg,test_size=0.1)
#     del x_data_pos
#     del x_data_neg
#     gc.collect()
#     print('split train and test')
#     tr_y = np.array(x_pos_train.shape[0]*[1])
#     te_y = np.array(x_pos_test.shape[0]*[[1,0]])
#     tr_pairs = np.array([x_pos_train,x_neg_train]).reshape(x_pos_train.shape[0],2,max_samples,sample_size)
#     te_pairs = np.array([x_pos_test,x_neg_test]).reshape(x_pos_test.shape[0],2,max_samples,sample_size)
#     print('reshaped to input dim')
#     np.save('outputs/tr_pairs.npy',tr_pairs)
#     np.save('outputs/te_pairs.npy',te_pairs)
#     np.save('outputs/tr_y.npy',tr_y)
#     np.save('outputs/te_y.npy',te_y)
#     print('pickled speech model data')

def speech_model_data():
    tr_pairs = np.load('outputs/tr_pairs.npy')/255.0
    te_pairs = np.load('outputs/te_pairs.npy')/255.0
    tr_pairs = np.load('outputs/tr_pairs.npy') / 255.0
    te_pairs = np.load('outputs/te_pairs.npy') / 255.0
    tr_pairs[tr_pairs < 0] = 0
    te_pairs[te_pairs < 0] = 0
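    # (dividing the dB spectrograms by 255 and zeroing negatives is a crude
    # squash into a small non-negative range before feeding the network.)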
    tr_y = np.load('outputs/tr_y.npy')
    te_y = np.load('outputs/te_y.npy')
    return tr_pairs,te_pairs,tr_y,te_y
    return tr_pairs, te_pairs, tr_y, te_y


if __name__ == '__main__':
    # sunflower_pairs_data()
    #create_spectrogram_data()
    # create_spectrogram_data()
    create_speech_pairs_data()
    # print(speech_model_data())
@@ -1,4 +1,3 @@

from __future__ import absolute_import
from __future__ import print_function
import numpy as np
@@ -12,8 +11,8 @@ from keras import backend as K

def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True),
                  K.epsilon()))
    return K.sqrt(
        K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))
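    # (K.epsilon() keeps the sqrt argument strictly positive, so the
    # gradient stays finite when the two embeddings coincide.)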


def eucl_dist_output_shape(shapes):
@@ -79,31 +78,44 @@ input_b = Input(shape=input_dim)
processed_a = base_network(input_a)
processed_b = base_network(input_b)

distance = Lambda(euclidean_distance,
                  output_shape=eucl_dist_output_shape)(
                      [processed_a, processed_b]
                  )
distance = Lambda(
    euclidean_distance,
    output_shape=eucl_dist_output_shape)([processed_a, processed_b])

model = Model([input_a, input_b], distance)

tb_cb = TensorBoard(log_dir='./logs/siamese_logs', histogram_freq=1,
                    batch_size=32, write_graph=True, write_grads=True,
                    write_images=True, embeddings_freq=0,
                    embeddings_layer_names=None, embeddings_metadata=None)
tb_cb = TensorBoard(
    log_dir='./logs/siamese_logs',
    histogram_freq=1,
    batch_size=32,
    write_graph=True,
    write_grads=True,
    write_images=True,
    embeddings_freq=0,
    embeddings_layer_names=None,
    embeddings_metadata=None)
cp_file_fmt = './models/siamese_speech_model-{epoch:02d}-epoch-{val_acc:0.2f}\
-acc.h5'
cp_cb = ModelCheckpoint(cp_file_fmt, monitor='val_acc', verbose=0,
                        save_best_only=False, save_weights_only=False,
                        mode='auto', period=1)

cp_cb = ModelCheckpoint(
    cp_file_fmt,
    monitor='val_acc',
    verbose=0,
    save_best_only=False,
    save_weights_only=False,
    mode='auto',
    period=1)
# train
rms = RMSprop(lr=0.001)
sgd = SGD(lr=0.001)
model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])
model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
          batch_size=128,
          epochs=50,
          validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),
          callbacks=[tb_cb, cp_cb])
model.fit(
    [tr_pairs[:, 0], tr_pairs[:, 1]],
    tr_y,
    batch_size=128,
    epochs=50,
    validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),
    callbacks=[tb_cb, cp_cb])

model.save('./models/siamese_speech_model-final.h5')
# compute final accuracy on training and test sets
tts_samplegen.py
@@ -1,29 +1,42 @@
import objc
from AppKit import NSSpeechSynthesizer,NSSpeechInputModeProperty,NSSpeechModePhoneme
from Foundation import NSURL,NSError,NSObject
from AppKit import NSSpeechSynthesizer, NSSpeechInputModeProperty
from AppKit import NSSpeechModePhoneme
from Foundation import NSURL
import json
import random
import os
import re
import subprocess


OUTPUT_NAME = 'audio'
dest_dir = os.path.abspath('.')+'/outputs/'+OUTPUT_NAME+'/'
dest_file = './outputs/'+OUTPUT_NAME+'.csv'
dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
dest_file = './outputs/' + OUTPUT_NAME + '.csv'


def create_dir(direc):
    if not os.path.exists(direc):
        os.mkdir(direc)
dest_filename = lambda n,v,r,t: '{}-{}-{}-{}-'.format(n,v,r,t)+str(random.randint(0,10000))+'.aiff'
dest_path = lambda v,r,n: dest_dir+v+'/'+r+'/'+n
dest_url = lambda p: NSURL.fileURLWithPath_(p)

def cli_gen_audio(speech_cmd,rate,voice,out_path):
    subprocess.call(['say','-v',voice,'-r',str(rate),'-o',out_path,speech_cmd])

def dest_filename(n, v, r, t):
    return '{}-{}-{}-{}-'.format(n, v, r,
                                 t) + str(random.randint(0, 10000)) + '.aiff'


def dest_path(v, r, n):
    # the rate arrives as an int, so cast it before joining path segments
    return dest_dir + v + '/' + str(r) + '/' + n


def cli_gen_audio(speech_cmd, rate, voice, out_path):
    subprocess.call(
        ['say', '-v', voice, '-r',
         str(rate), '-o', out_path, speech_cmd])
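# Equivalent macOS shell invocation, e.g. for a phoneme-mode command:
#   say -v Alex -r 150 -o out.aiff '[[inpt PHON]] sUXnflAWUXrz'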


class SynthFile(object):
    """docstring for SynthFile."""
    def __init__(self,word,phon, filename,voice,rate,operation):

    def __init__(self, word, phon, filename, voice, rate, operation):
        super(SynthFile, self).__init__()
        self.word = word
        self.phoneme = phon
@@ -33,91 +46,114 @@ class SynthFile(object):
        self.variant = operation

    def get_json(self):
        return {'filename':self.filename,'voice':self.voice,
                'rate':self.rate,'operation':self.operation}
        return {
            'filename': self.filename,
            'voice': self.voice,
            'rate': self.rate,
            'operation': self.operation
        }

    def get_csv(self):
        return '{},{},{},{},{}\n'.format(self.word,self.phoneme,self.voice,self.rate,self.variant,self.filename)
        # the CSV readers expect (word, voice, rate, variant, file); a
        # five-placeholder format string silently drops extra arguments
        return '{},{},{},{},{}\n'.format(self.word, self.voice, self.rate,
                                         self.variant, self.filename)


class SynthVariant(object):
    """docstring for SynthVariant."""
    def __init__(self,identifier,rate):

    def __init__(self, identifier, rate):
        super(SynthVariant, self).__init__()
        self.synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
        self.synth.setVolume_(100)
        self.synth.setRate_(rate)
        self.phone_synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
        self.phone_synth = NSSpeechSynthesizer.alloc().initWithVoice_(
            identifier)
        self.phone_synth.setVolume_(100)
        self.phone_synth.setRate_(rate)
        self.phone_synth.setObject_forProperty_error_(NSSpeechModePhoneme,NSSpeechInputModeProperty,None)
        self.phone_synth.setObject_forProperty_error_(
            NSSpeechModePhoneme, NSSpeechInputModeProperty, None)
        self.identifier = identifier
        self.rate = rate
        self.name = identifier.split('.')[-1]

    def __repr__(self):
        return 'Synthesizer[{} - {}]({})'.format(self.name,self.rate)
        # two placeholders to match the two format arguments
        return 'Synthesizer[{} - {}]'.format(self.name, self.rate)

    def generate_audio(self,word,variant):
        orig_phon,phoneme,phon_cmd = self.synth.phonemesFromText_(word),'',word
    def generate_audio(self, word, variant):
        orig_phon, phoneme, phon_cmd = self.synth.phonemesFromText_(
            word), '', word
        if variant == 'low':
            # self.synth.startSpeakingString_toURL_(word,d_url)
            phoneme = orig_phon
        elif variant == 'medium':
            phoneme = re.sub('[0-9]','',orig_phon)
            phon_cmd = '[[inpt PHON]] '+phoneme
            phoneme = re.sub('[0-9]', '', orig_phon)
            phon_cmd = '[[inpt PHON]] ' + phoneme
        elif variant == 'high':
            phoneme = orig_phon
            phon_cmd = word
        # elif variant == 'long':
        #     if phon != '':
        #         self.phone_synth.startSpeakingString_toURL_(phon,d_url)
        #     else:
        #         self.synth.startSpeakingString_toURL_(word,d_url)
        fname = dest_filename(word,phoneme,self.name,self.rate)
        d_path = dest_path(self.name,self.rate,fname)
        d_url = dest_url(d_path)
        cli_gen_audio(phon_cmd,self.rate,self.name,d_path)
        return SynthFile(word,phoneme,fname,self.name,self.rate,variant)
        # if phon != '':
        #     self.phone_synth.startSpeakingString_toURL_(phon,d_url)
        # else:
        #     self.synth.startSpeakingString_toURL_(word,d_url)
        fname = dest_filename(word, phoneme, self.name, self.rate)
        d_path = dest_path(self.name, self.rate, fname)
        # d_url = NSURL.fileURLWithPath_(d_path)
        cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
        return SynthFile(word, phoneme, fname, self.name, self.rate, variant)


def synth_generator():
    voices_installed = NSSpeechSynthesizer.availableVoices()
    voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed]
    us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs if v['VoiceLanguage'] == 'en-US' and v['VoiceIdentifier'].split('.')[-1][0].isupper()]
    # us_voices_ids = ['com.apple.speech.synthesis.voice.Fred','com.apple.speech.synthesis.voice.Alex',
    voice_attrs = [
        NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed
    ]
    us_voices_ids = [
        v['VoiceIdentifier'] for v in voice_attrs
        if v['VoiceLanguage'] == 'en-US'
        and v['VoiceIdentifier'].split('.')[-1][0].isupper()
    ]
    # us_voices_ids = ['com.apple.speech.synthesis.voice.Fred',
    #                  'com.apple.speech.synthesis.voice.Alex',
    #                  'com.apple.speech.synthesis.voice.Victoria']
    # voice_rates = list(range(150,221,(220-180)//4))
    voice_rates = [150,180,210,250]
    voice_rates = [150, 180, 210, 250]
    voice_synths = []
    create_dir(dest_dir)
    for v in us_voices_ids:
        for r in voice_rates:
            create_dir(dest_dir+v+'/'+r)
            voice_synths.append(SynthVariant(v,r))
            # the rate is an int, so cast it before joining the path
            create_dir(dest_dir + v + '/' + str(r))
            voice_synths.append(SynthVariant(v, r))

    def synth_for_words(words):
        all_synths = []
        for w in words:
            for s in voice_synths:
                for v in ['low','medium','high']:
                    all_synths.append(s.generate_audio(w,v))
                for v in ['low', 'medium', 'high']:
                    all_synths.append(s.generate_audio(w, v))
        return all_synths

    return synth_for_words

def write_synths(synth_list,fname,csv=False):
    f = open(fname,'w')

def write_synths(synth_list, fname, csv=False):
    f = open(fname, 'w')
    if csv:
        for s in synth_list:
            f.write(s.get_csv())
    else:
        json.dump([s.get_json() for s in synth_list],f)
        json.dump([s.get_json() for s in synth_list], f)
    f.close()


def generate_audio_for_stories():
    stories_data = json.load(open('./inputs/all_stories_hs.json'))
    word_list = [t[0] for i in stories_data.values() for t in i]
    words_audio_synth = synth_generator()
    return words_audio_synth(word_list)


# words_audio_synth = synth_generator()
# synth = NSSpeechSynthesizer.alloc().init()
# voices_installed = NSSpeechSynthesizer.availableVoices()
@@ -131,5 +167,5 @@ def generate_audio_for_stories():

synths = synth_generator()([OUTPUT_NAME])
# synths = generate_audio_for_stories()
write_synths(synths,dest_file,True)
write_synths(synths, dest_file, True)
# write_synths(synths,'./outputs/synths.json')