# speech-scoring/speech_samplegen.py

import objc
from AppKit import NSSpeechSynthesizer, NSSpeechInputModeProperty
from AppKit import NSSpeechModePhoneme
from Foundation import NSURL
import json
import csv
import random
import string
import os
import re
import subprocess
import time
from tqdm import tqdm

from generate_similar import similar_phoneme_phrase,similar_phrase

# Base name shared by the audio output directory and the manifest file.
OUTPUT_NAME = 'story_phrases'
# Absolute directory (with trailing slash) under which audio files are written.
dest_dir = '{}/outputs/{}/'.format(os.path.abspath('.'), OUTPUT_NAME)
# Manifest path, relative to the current working directory.
dest_file = './outputs/{}.csv'.format(OUTPUT_NAME)

def hms_string(sec_elapsed):
    """Format a duration in seconds as ``H:MM:SS.ss``.

    Hours are not zero-padded; minutes are padded to two digits and seconds
    to two integer digits plus two decimals.
    """
    secs_per_hour = 60 * 60
    hours = int(sec_elapsed / secs_per_hour)
    minutes = int((sec_elapsed % secs_per_hour) / 60)
    # The float modulus keeps the fractional part of the seconds.
    seconds = sec_elapsed % 60.
    return "{}:{:>02}:{:>05.2f}".format(hours, minutes, seconds)

def create_dir(direc):
    """Create ``direc`` (including missing parents) unless the path exists."""
    if os.path.exists(direc):
        # Something is already at this path; nothing to do.
        return
    os.makedirs(direc)

def format_filename(s):
    """
    Build a filesystem-friendly name from ``s``.

    Only ASCII letters, digits and the characters ``-_.()`` plus space are
    kept; every other character is dropped. Remaining spaces are then
    turned into underscores.

    Note: the result can still be an unusable filename such as ``, `.` or
    `..`; callers are expected to add a prefix and/or an extension around
    the sanitized text.
    """
    allowed = "-_.() " + string.ascii_letters + string.digits
    kept = []
    for ch in s:
        if ch in allowed:
            kept.append(ch)
    # Spaces survive the whitelist above and are replaced here.
    return ''.join(kept).replace(' ', '_')

def dest_filename(w, v, r, t):
    """Return a sanitized ``.aiff`` filename for one synthesized sample.

    The name joins the text ``w``, voice ``v``, rate ``r`` and variant ``t``
    with dashes, followed by a random integer in [0, 10000], and is then
    passed through format_filename().
    """
    suffix = str(random.randint(0, 10000))
    raw_name = '{}-{}-{}-{}-{}.aiff'.format(w, v, r, t, suffix)
    return format_filename(raw_name)


def dest_path(v, r, n):
    """Return ``(absolute_path, relative_path)`` for an output file.

    The relative path is ``<voice>/<rate>/<filename>``; the absolute path is
    that same string appended to the module-level ``dest_dir``.
    """
    relative = '/'.join((v, str(r), n))
    absolute = dest_dir + relative
    return absolute, relative


def cli_gen_audio(speech_cmd, rate, voice, out_path):
    """Render ``speech_cmd`` to ``out_path`` by invoking the ``say`` command.

    ``voice`` is passed with ``-v``, ``rate`` with ``-r`` and ``out_path``
    with ``-o``. The exit status of ``say`` is not inspected.
    """
    # NOTE(review): the argument list is handed to subprocess without a
    # shell, so these single quotes reach `say` as literal characters of the
    # text to speak -- confirm that this is intended.
    quoted_text = "'" + speech_cmd + "'"
    argv = ['say', '-v', voice, '-r', str(rate), '-o', out_path, quoted_text]
    subprocess.call(argv)


class SynthFile(object):
    """Record describing one synthesized audio sample.

    Holds the spoken text, the phoneme string recorded for it, the output
    filename, and the voice, voice language, rate and variant that
    produced it. The record can be serialized as a JSON-ready dict, as a
    list of strings, or as a single comma-joined line.
    """

    def __init__(self, word, phon, filename, voice, voice_lang, rate, operation):
        super(SynthFile, self).__init__()
        self.word = word
        self.phoneme = phon
        self.filename = filename
        self.voice = voice
        self.voice_lang = voice_lang
        self.rate = rate
        # The constructor argument is called ``operation`` but the value is
        # stored as ``variant``; get_json() exposes it as 'operation'.
        self.variant = operation

    def get_json(self):
        """Return a dict with the filename, voice, rate and operation."""
        return {
            'filename': self.filename,
            'voice': self.voice,
            'rate': self.rate,
            # Read from ``variant``: no ``operation`` attribute is ever set,
            # so reading ``self.operation`` here raised AttributeError.
            'operation': self.variant
        }

    def get_csv(self):
        """Return the record as one comma-joined line ending in a newline.

        Values are joined verbatim, without any quoting or escaping, so a
        value that itself contains a comma or newline is not protected.
        """
        return ','.join(self.get_values())+'\n'

    def get_values(self):
        """Return the record's columns as a list of strings.

        Column order: word, phoneme, voice, voice language, rate, variant,
        filename.
        """
        cols = [self.word, self.phoneme, self.voice,
                self.voice_lang, self.rate, self.variant,
                self.filename]
        return [str(c) for c in cols]

class SynthVariant(object):
    """One installed voice at one speaking rate.

    Wraps two NSSpeechSynthesizer instances for the same voice identifier
    (one left in its default input mode, one switched to phoneme input)
    and renders phrase variants to audio files through cli_gen_audio().
    """

    def __init__(self, identifier, voice, lang, rate):
        super(SynthVariant, self).__init__()
        self.identifier = identifier
        self.name = voice
        self.lang = lang
        self.rate = rate

        # Default-mode synthesizer; also used for text -> phoneme lookup.
        text_synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
        text_synth.setVolume_(100)
        text_synth.setRate_(rate)
        self.synth = text_synth

        # Same voice, configured for phoneme input. The other methods of
        # this class do not reference it.
        phoneme_synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
        phoneme_synth.setVolume_(100)
        phoneme_synth.setRate_(rate)
        phoneme_synth.setObject_forProperty_error_(
            NSSpeechModePhoneme, NSSpeechInputModeProperty, None)
        self.phone_synth = phoneme_synth

        # Needs self.synth, so it is evaluated last.
        self.phoneme_capable = self.is_phoneme_capable()

    def __repr__(self):
        return 'Synthesizer[{} - {}]'.format(self.name, self.rate)

    def is_phoneme_capable(self):
        """Return True when the voice yields a non-empty phoneme string for 'water'."""
        return self.synth.phonemesFromText_('water') != ''

    def generate_audio(self, text, variant):
        """Render one variant of ``text`` to an audio file.

        Variants:
          * 'low'    -- speak ``text`` as is; record its phonemes.
          * 'medium' -- speak the result of similar_phoneme_phrase() on the
                        phonemes of ``text``, prefixed with '[[inpt PHON]] '.
          * 'high'   -- speak the result of similar_phrase(text).
          * other    -- speak ``text`` as is; record an empty phoneme string.

        Returns a SynthFile whose filename is the path relative to dest_dir.
        """
        orig_phon = self.synth.phonemesFromText_(text)
        if variant == 'low':
            phoneme, phon_cmd = orig_phon, text
        elif variant == 'medium':
            phoneme = similar_phoneme_phrase(orig_phon)
            phon_cmd = '[[inpt PHON]] ' + phoneme
        elif variant == 'high':
            phoneme = similar_phrase(text)
            phon_cmd = phoneme
        else:
            phoneme, phon_cmd = '', text

        out_name = dest_filename(text, self.name, self.rate, variant)
        abs_path, rel_path = dest_path(self.name, self.rate, out_name)
        # Audio is produced by the `say` command line tool rather than by
        # the NSSpeechSynthesizer objects held on this instance.
        cli_gen_audio(phon_cmd, self.rate, self.name, abs_path)
        return SynthFile(text, phoneme, rel_path, self.name, self.lang,
                         self.rate, variant)

    def create_synth_dirs(self):
        """Create ``<dest_dir><name>/<rate>`` when the voice is phoneme capable."""
        if not self.phoneme_capable:
            return
        target = dest_dir + self.name + '/' + str(self.rate)
        create_dir(target)

    @staticmethod
    def voices_for_lang(lang):
        """List installed voices for ``lang`` whose gender is not neuter.

        Returns a list of ``(identifier, name, language)`` tuples.
        """
        matches = []
        for installed in NSSpeechSynthesizer.availableVoices():
            attrs = NSSpeechSynthesizer.attributesForVoice_(installed)
            if attrs['VoiceLanguage'] != lang:
                continue
            if attrs['VoiceGender'] == 'VoiceGenderNeuter':
                continue
            matches.append((attrs['VoiceIdentifier'],
                            attrs['VoiceName'],
                            attrs['VoiceLanguage']))
        return matches

    @classmethod
    def synth_with(cls, voice_params, rate=180):
        """Build an instance from an ``(identifier, name, language)`` tuple."""
        voice_id, voice_name, voice_lang = voice_params
        return cls(voice_id, voice_name, voice_lang, rate)


def synth_generator():
    """Prepare synthesizers for every en-US voice and return a batch runner.

    One SynthVariant is built per (voice, rate) pair; those that report
    ``phoneme_capable`` are kept, the rest are discarded with a message.
    The output root directory ``dest_dir`` is created as a side effect.

    Returns:
        A function ``synth_for_words(words, writer)`` that renders the
        'low', 'medium' and 'high' variant of every item in ``words`` with
        every kept synthesizer and passes each resulting record to
        ``writer``.
    """
    us_voices_ids = SynthVariant.voices_for_lang('en-US')
    voice_rates = [150, 180, 210]  # 250 is currently disabled
    voice_synths = []
    create_dir(dest_dir)
    for vp in us_voices_ids:
        for r in voice_rates:
            s = SynthVariant.synth_with(vp, r)
            if s.phoneme_capable:
                print('Adding ', s)
                voice_synths.append(s)
            else:
                print('Discarding phoneme incapable ', s)

    def synth_for_words(words, writer):
        """Synthesize all variants of ``words`` and report the elapsed time."""
        start_time = time.time()
        for s in voice_synths:
            s.create_synth_dirs()
            for v in ['low', 'medium', 'high']:
                # Iterate the same bar that receives the postfix updates;
                # previously a second, separate bar drove the loop while the
                # updates went to one that never advanced.
                with tqdm(words) as prog:
                    for w in prog:
                        # set_postfix replaces the whole postfix, so every
                        # field is passed on each update.
                        prog.set_postfix(variant=v, voice=s.name,
                                         rate=s.rate, word=w)
                        synthed = s.generate_audio(w, v)
                        writer(synthed)
        end_time = time.time()
        time_str = hms_string(end_time - start_time)
        print("It took {} to synthsize all variants.".format(time_str))
    return synth_for_words

def synth_logger(fname, csv_mode=False):
    """Open ``fname`` for writing and return ``(writer, closer)`` callables.

    In CSV mode the writer emits one row per record immediately, using the
    record's ``get_values()``. Otherwise the writer only collects records,
    and the closer dumps their ``get_json()`` dicts as one JSON list.
    In both modes the closer closes the file.
    """
    out = open(fname, 'w')
    row_writer = csv.writer(out, quoting=csv.QUOTE_MINIMAL)
    pending = []

    def write_csv_row(s):
        row_writer.writerow(s.get_values())

    def collect_for_json(s):
        pending.append(s)

    def close_file():
        if not csv_mode:
            # JSON output is written in one go at close time.
            json.dump([item.get_json() for item in pending], out)
        out.close()

    sink = write_csv_row if csv_mode else collect_for_json
    return sink, close_file

def generate_audio_for_text_list(text_list):
    """Synthesize every item of ``text_list`` and log the results as CSV.

    Records are written to the module-level ``dest_file``. An error raised
    while synthesizing is printed to stdout as a traceback and does not
    propagate, so rows written before the failure are kept. The manifest
    file is closed in every case, including when setup fails or the run is
    interrupted.
    """
    import sys
    import traceback

    (writer, closer) = synth_logger(dest_file, csv_mode=True)
    try:
        # A failure while building the synthesizers still propagates, but
        # the manifest file is no longer left open.
        synth_for_texts = synth_generator()
        try:
            synth_for_texts(text_list, writer)
        except Exception:
            # Best effort: report the failure and keep what was written.
            # KeyboardInterrupt/SystemExit are not swallowed.
            traceback.print_exc(file=sys.stdout)
    finally:
        closer()

def generate_audio_for_stories():
    """Synthesize audio for every entry found in ``./inputs/all_stories.json``.

    The values of the loaded JSON object are flattened into a single list,
    which is handed to generate_audio_for_text_list().
    """
    # './inputs/all_stories_hs.json' is an alternative input; it is read by
    # generate_test_audio_for_stories(), which takes element [0] of each entry.
    story_file = './inputs/all_stories.json'
    # Close the input file deterministically instead of leaking the handle.
    with open(story_file) as story_fp:
        stories_data = json.load(story_fp)
    # presumably each value is a list of phrase strings -- verify against
    # the input file.
    text_list = [i for g in stories_data.values() for i in g]
    generate_audio_for_text_list(text_list)

def generate_test_audio_for_stories():
    """Synthesize audio for words that do not occur among the story texts.

    Story texts are element ``[0]`` of every entry in
    ``./inputs/all_stories_hs.json``. Candidates come from
    ``./inputs/wordlist.txt`` (one per line, with surrounding newlines and
    underscores stripped); those already among the story texts or not
    longer than four characters are dropped. The first
    ``int(len(story_texts) / 5 + 1)`` remaining words are synthesized.
    """
    story_file = './inputs/all_stories_hs.json'
    # Both input files are closed deterministically via ``with``.
    with open(story_file) as story_fp:
        stories_data = json.load(story_fp)
    text_list = [t[0] for i in stories_data.values() for t in i]
    with open('./inputs/wordlist.txt', 'r') as words_fp:
        word_list = [line.strip('\n_') for line in words_fp]
    # Set membership keeps the exclusion test cheap for long word lists.
    text_set = set(text_list)
    new_word_list = [i for i in word_list if i not in text_set and len(i) > 4]
    test_words = new_word_list[:int(len(text_list)/5+1)]
    generate_audio_for_text_list(test_words)


if __name__ == '__main__':
    # Script entry point. Alternative runs, currently disabled:
    # generate_test_audio_for_stories()
    # generate_audio_for_text_list(['I want to go home','education'])
    # Active run: synthesize everything loaded by generate_audio_for_stories().
    generate_audio_for_stories()