# jasper-asr/jasper/data_utils/utils.py

import numpy as np
import wave
import io
import json
from pathlib import Path
from num2words import num2words


def manifest_str(path, dur, text):
    """Serialize one manifest entry as a single JSON line.

    Args:
        path: Value stored under the ``audio_filepath`` key.
        dur: Duration value; rounded to one decimal place before storing
            under the ``duration`` key.
        text: Value stored under the ``text`` key.

    Returns:
        The JSON-encoded entry terminated by a newline.
    """
    entry = {
        "audio_filepath": path,
        "duration": round(dur, 1),
        "text": text,
    }
    return f"{json.dumps(entry)}\n"


def wav_bytes(audio_bytes, frame_rate=24000):
    """Wrap raw audio frames in an in-memory WAV container.

    The header is written for a single channel with a sample width of
    2 bytes at ``frame_rate`` frames per second.

    Args:
        audio_bytes: Raw frame data to place in the WAV data chunk.
        frame_rate: Frame rate recorded in the WAV header.

    Returns:
        The complete WAV file contents as ``bytes``.
    """
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as writer:
        writer.setsampwidth(2)
        writer.setnchannels(1)
        writer.setframerate(frame_rate)
        # The header's frame count is patched when the writer is closed.
        writer.writeframesraw(audio_bytes)
    return buffer.getvalue()


def random_pnr_generator(count=10000, length=3):
    """Generate random codes mixing uppercase letters and digits.

    Every code holds ``length`` uppercase ASCII letters and ``length``
    decimal digits, i.e. ``2 * length`` characters in total. Characters are
    drawn with replacement, so codes are not guaranteed to be unique.

    Args:
        count: Number of codes to generate.
        length: Number of letters, and also number of digits, per code.
            Defaults to 3, giving six-character codes.

    Returns:
        A list of ``count`` strings.
    """
    # Only uppercase letters are used (no mixed-case alphabet).
    alphabet = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    numeric = list("0123456789")
    np_alphabet = np.array(alphabet, dtype="|S1")
    np_numeric = np.array(numeric, dtype="|S1")
    np_alpha_codes = np.random.choice(np_alphabet, [count, length])
    np_num_codes = np.random.choice(np_numeric, [count, length])
    # Transpose so that each row is one character position across all codes.
    np_code_seed = np.concatenate((np_alpha_codes, np_num_codes), axis=1).T
    # shuffle() permutes along the first axis only, so one shared permutation
    # of character positions is applied to the whole batch: every code
    # returned by a single call has letters and digits in the same slots.
    np.random.shuffle(np_code_seed)
    np_codes = np_code_seed.T
    return [b"".join(row).decode("utf-8") for row in np_codes]


def alnum_to_asr_tokens(text):
    """Spell out a code character by character as a lowercase transcript.

    The characters of ``text`` are separated by single spaces, each ASCII
    digit is replaced by the result of ``num2words`` for that digit, and
    the whole string is lowercased.

    Args:
        text: The code to convert.

    Returns:
        The space-separated, lowercased token string.
    """
    spaced = " ".join(list(text))
    pieces = []
    for ch in spaced:
        if "0" <= ch <= "9":
            pieces.append(num2words(ch))
        else:
            pieces.append(ch)
    return "".join(pieces).lower()


def asr_data_writer(output_dir, dataset_name, asr_data_source):
    """Write WAV files and a manifest for a dataset.

    Creates ``<output_dir>/<dataset_name>/wav/`` and writes one
    ``<code>.wav`` file per item, plus ``manifest.json`` in the dataset
    directory with one JSON line per item. Manifest audio paths are
    relative to the dataset directory and the transcript text is derived
    from the code via ``alnum_to_asr_tokens``.

    Args:
        output_dir: Parent directory for the dataset.
        dataset_name: Name of the dataset sub-directory.
        asr_data_source: Iterable of ``(code, duration, wav_data)`` tuples,
            where ``wav_data`` is the bytes written to the audio file.
    """
    root = output_dir / Path(dataset_name)
    wav_dir = root / Path("wav")
    wav_dir.mkdir(parents=True, exist_ok=True)
    manifest_path = root / Path("manifest.json")
    with manifest_path.open("w") as manifest_file:
        for code, duration, audio in asr_data_source:
            audio_path = wav_dir / Path(code).with_suffix(".wav")
            audio_path.write_bytes(audio)
            # Store the path relative to the dataset directory.
            relative_audio = str(audio_path.relative_to(root))
            transcript = alnum_to_asr_tokens(code)
            manifest_file.write(manifest_str(relative_audio, duration, transcript))


def asr_manifest_reader(data_manifest_path: Path):
    """Yield the entries of a JSON-lines manifest as dictionaries.

    The whole file is read and parsed before the first entry is yielded.
    Blank or whitespace-only lines are skipped so that a stray empty line
    (for example at the end of a hand-edited file) does not abort parsing.

    Each yielded dict is the decoded JSON object with two extra keys:

    * ``audio_path``: the entry's ``audio_filepath`` joined onto the
      manifest's parent directory.
    * ``chars``: the stem (file name without suffix) of ``audio_filepath``.

    Args:
        data_manifest_path: Path of the manifest file to read.

    Yields:
        One dict per non-blank manifest line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an entry has no ``audio_filepath`` key.
    """
    print(f'reading manifest from {data_manifest_path}')
    with data_manifest_path.open("r") as pf:
        pnr_jsonl = pf.readlines()
    # json.loads() raises on empty input, so drop blank lines up front.
    pnr_data = [json.loads(v) for v in pnr_jsonl if v.strip()]
    for p in pnr_data:
        p['audio_path'] = data_manifest_path.parent / Path(p['audio_filepath'])
        p['chars'] = Path(p['audio_filepath']).stem
        yield p


def asr_manifest_writer(asr_manifest_path: Path, manifest_str_source):
    """Write manifest entries to a JSON-lines file.

    The target file is opened for writing (truncating any existing
    content) and one line is written per entry, formatted by
    ``manifest_str``.

    Args:
        asr_manifest_path: Path of the manifest file to write.
        manifest_str_source: Iterable of dicts, each providing the
            ``audio_filepath``, ``duration`` and ``text`` keys.
    """
    with asr_manifest_path.open("w") as manifest_file:
        print(f'opening {asr_manifest_path} for writing manifest')
        manifest_file.writelines(
            manifest_str(entry['audio_filepath'], entry['duration'], entry['text'])
            for entry in manifest_str_source
        )


def main():
    """Print a batch of randomly generated codes, one per line."""
    generated = random_pnr_generator()
    for code in generated:
        print(code)


# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()