# speech-scoring/speech_pitch.py

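"""Pitch, intensity, MFCC and formant extraction helpers built on parselmouth
(Praat), together with spectrogram/pitch/intensity plotting utilities and a
small PyAudio playback helper."""
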
import parselmouth as pm
from pysndfile import sndio as snd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pyaudio as pa

sns.set()  # Use seaborn's default style to make the plots look nicer.


def pm_snd(sample_file):
    # sample_file = 'inputs/self-apple/apple-low1.aiff'
    samples, samplerate, _ = snd.read(sample_file)
    return pm.Sound(values=samples, sampling_frequency=samplerate)


def pitch_array(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
    sample_sound = pm_snd(sample_file)
    sample_pitch = sample_sound.to_pitch()
    return sample_pitch.to_matrix().as_array()


def intensity_array(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
    sample_sound = pm_snd(sample_file)
    sample_intensity = sample_sound.to_intensity()
    return sample_intensity.values


def compute_mfcc(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
    sample_sound = pm_snd(sample_file)
    sample_mfcc = sample_sound.to_mfcc()
    return sample_mfcc.to_array()


def compute_formants(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
    sample_sound = pm_snd(sample_file)
    sample_formant = sample_sound.to_formant_burg()
    return sample_formant.x_bins()


def draw_spectrogram(spectrogram, dynamic_range=70):
    X, Y = spectrogram.x_grid(), spectrogram.y_grid()
    sg_db = 10 * np.log10(spectrogram.values)
    plt.pcolormesh(X, Y, sg_db, vmin=sg_db.max() - dynamic_range, cmap='afmhot')
    plt.ylim([spectrogram.ymin, spectrogram.ymax])
    plt.xlabel("time [s]")
    plt.ylabel("frequency [Hz]")


def draw_intensity(intensity):
    plt.plot(intensity.xs(), intensity.values.T, linewidth=3, color='w')
    plt.plot(intensity.xs(), intensity.values.T, linewidth=1)
    plt.grid(False)
    plt.ylim(0)
    plt.ylabel("intensity [dB]")


def draw_pitch(pitch):
    # Extract the selected pitch contour and replace unvoiced samples
    # with NaN so they are not plotted.
    pitch_values = pitch.selected_array['frequency']
    pitch_values[pitch_values == 0] = np.nan
    plt.plot(pitch.xs(), pitch_values, linewidth=3, color='w')
    plt.plot(pitch.xs(), pitch_values, linewidth=1)
    plt.grid(False)
    plt.ylim(0, pitch.ceiling)
    plt.ylabel("pitch [Hz]")


def draw_formants(formant):
    # Plot the first three formant tracks; frames where a formant is undefined
    # come back as NaN and are simply skipped by matplotlib.
    times = formant.xs()
    for formant_number in range(1, 4):
        values = np.array([formant.get_value_at_time(formant_number, t) for t in times])
        plt.plot(times, values, linewidth=3, color='w')
        plt.plot(times, values, linewidth=1)
    plt.grid(False)
    plt.ylim(0)
    plt.ylabel("formant frequency [Hz]")


def plot_sample_raw(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
    snd_d = pm_snd(sample_file)
    plt.figure()
    plt.plot(snd_d.xs(), snd_d.values.T)
    plt.xlim([snd_d.xmin, snd_d.xmax])
    plt.xlabel("time [s]")
    plt.ylabel("amplitude")
    plt.show()


def plot_file_intensity(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
    snd_d = pm_snd(sample_file)
    plot_sample_intensity(snd_d)


def plot_sample_intensity(snd_d):
    intensity = snd_d.to_intensity()
    spectrogram = snd_d.to_spectrogram()
    plt.figure()
    draw_spectrogram(spectrogram)
    plt.twinx()
    draw_intensity(intensity)
    plt.xlim([snd_d.xmin, snd_d.xmax])
    plt.show()


def plot_file_pitch(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
    snd_d = pm_snd(sample_file)
    plot_sample_pitch(snd_d)


def plot_sample_pitch(snd_d, phons=()):
    # phons: iterable of (time, label) pairs, drawn as labelled vertical lines.
    pitch = snd_d.to_pitch()
    spectrogram = snd_d.to_spectrogram(window_length=0.03, maximum_frequency=8000)
    plt.figure()
    draw_spectrogram(spectrogram)
    plt.twinx()
    draw_pitch(pitch)
    for (p, c) in phons:
        plt.axvline(x=p)
        plt.text(p, -1, c)
    plt.xlim([snd_d.xmin, snd_d.xmax])
    plt.show()


def play_sound(samplerate=22050):
    """Open a stereo PyAudio output stream and return (sample_player, close_player)."""
    p_oup = pa.PyAudio()
    stream = p_oup.open(
        format=pa.paFloat32,
        channels=2,
        rate=samplerate,
        output=True)

    def sample_player(snd_sample):
        # Duplicate the mono channel into interleaved stereo float32 frames.
        # Assumes the Sound's sampling frequency matches `samplerate`.
        samples = snd_sample.values[0]
        interleaved = np.asarray([samples, samples]).T.reshape(-1)
        audio_data = interleaved.astype(np.float32).tobytes()
        stream.write(audio_data)

    def close_player():
        stream.stop_stream()
        stream.close()
        p_oup.terminate()

    return sample_player, close_player
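# Typical pattern (also used in __main__ below): open one stream, play several
# clips through it, then release the audio device.
#     sample_player, close_player = play_sound()
#     sample_player(pm_snd('outputs/test/a_wrong_turn-3763.aiff'))
#     close_player()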
# snd_part = snd_d.extract_part(from_time=0.9, preserve_times=True)
# plt.figure()
# plt.plot(snd_part.xs(), snd_part.values, linewidth=0.5)
# plt.xlim([snd_part.xmin, snd_part.xmax])
# plt.xlabel("time [s]")
# plt.ylabel("amplitude")
# plt.show()


if __name__ == '__main__':
    # One shared output stream for all playback below.
    sample_player, close_player = play_sound()
    plot_file_pitch('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
    plot_file_pitch('outputs/test/a_warm_smile_and_a_good_heart-1917.aiff')
    sample_player(pm_snd('outputs/test/a_warm_smile_and_a_good_heart-1917.aiff'))
    plot_file_pitch('outputs/test/a_wrong_turn-3763.aiff')
    sample_player(pm_snd('outputs/test/a_wrong_turn-3763.aiff'))
    plot_file_pitch('inputs/self/a_wrong_turn-low1.aiff')
    sample_player(pm_snd('inputs/self/a_wrong_turn-low1.aiff'))
    plot_file_pitch('inputs/self/a_wrong_turn-low2.aiff')
    sample_player(pm_snd('inputs/self/a_wrong_turn-low2.aiff'))
    plot_file_pitch('inputs/self/apple-low1.aiff')
    plot_file_pitch('inputs/self/apple-low2.aiff')
    plot_file_pitch('inputs/self/apple-medium1.aiff')
    close_player()