jasper-asr/jasper/data_utils/utils.py

92 lines
2.9 KiB
Python

import numpy as np
import wave
import io
import json
from pathlib import Path
from num2words import num2words
def manifest_str(path, dur, text):
    """Serialize one manifest entry as a newline-terminated JSON line.

    The entry carries the keys ``audio_filepath``, ``duration`` and ``text``;
    the duration is rounded to one decimal place before serialization.
    """
    entry = {
        "audio_filepath": path,
        "duration": round(dur, 1),
        "text": text,
    }
    return f"{json.dumps(entry)}\n"
def wav_bytes(audio_bytes, frame_rate=24000):
    """Wrap raw audio frames in an in-memory WAV container and return its bytes.

    The container is written as a single channel with a 2-byte sample width
    at ``frame_rate`` frames per second.
    """
    buffer = io.BytesIO()
    with wave.open(buffer, mode="w") as writer:
        writer.setnchannels(1)
        writer.setsampwidth(2)
        writer.setframerate(frame_rate)
        writer.writeframesraw(audio_bytes)
    # Read the buffer only after the writer has been closed by the `with`.
    return buffer.getvalue()
def random_pnr_generator(count=10000):
    """Return ``count`` random six-character codes as a list of strings.

    Every code is built from three uppercase ASCII letters and three digits.
    The six character positions are shuffled once per call, so all codes
    returned by one call share the same letter/digit layout.
    """
    per_kind = 3
    # Uppercase letters only; each symbol is stored as a single byte.
    letter_pool = np.array(list("ABCDEFGHIJKLMNOPQRSTUVWXYZ"), dtype="|S1")
    digit_pool = np.array(list("0123456789"), dtype="|S1")
    letter_part = np.random.choice(letter_pool, [count, per_kind])
    digit_part = np.random.choice(digit_pool, [count, per_kind])
    # After the transpose each row is one character position, so the in-place
    # shuffle along the first axis permutes positions, not individual codes.
    by_position = np.concatenate((letter_part, digit_part), axis=1).T
    np.random.shuffle(by_position)
    return [b"".join(row).decode("utf-8") for row in by_position.T]
def alnum_to_asr_tokens(text):
    """Convert an alphanumeric code to a lowercase, space-separated transcript.

    Characters are separated by single spaces; each ASCII digit is replaced
    by the output of ``num2words`` for that digit, other characters are kept.
    """
    spoken = (
        num2words(ch) if "0" <= ch <= "9" else ch
        for ch in " ".join(list(text))
    )
    return "".join(spoken).lower()
def asr_data_writer(output_dir, dataset_name, asr_data_source):
    """Write a dataset's WAV files and its JSON-lines manifest to disk.

    Creates ``<output_dir>/<dataset_name>/wav`` if needed, then for every
    ``(code, duration, wav_data)`` item in ``asr_data_source`` writes
    ``wav/<code>.wav`` and appends one manifest line to ``manifest.json``.
    The manifest stores the WAV path relative to the dataset directory and
    the transcript produced by ``alnum_to_asr_tokens`` for the code.
    """
    dataset_root = output_dir / Path(dataset_name)
    wav_dir = dataset_root / Path("wav")
    wav_dir.mkdir(parents=True, exist_ok=True)
    manifest_path = dataset_root / Path("manifest.json")
    with manifest_path.open("w") as manifest_file:
        for code, duration, wav_payload in asr_data_source:
            wav_path = wav_dir / Path(code).with_suffix(".wav")
            wav_path.write_bytes(wav_payload)
            relative_wav = wav_path.relative_to(dataset_root)
            transcript = alnum_to_asr_tokens(code)
            manifest_file.write(
                manifest_str(str(relative_wav), duration, transcript)
            )
def asr_manifest_reader(data_manifest_path: Path):
    """Yield the entries of a JSON-lines manifest, one dict per line.

    Each yielded dict is the parsed JSON object with two keys added:

    * ``audio_path`` -- the manifest's parent directory joined with the
      entry's ``audio_filepath``.
    * ``chars`` -- the stem (file name without suffix) of ``audio_filepath``.

    Whitespace-only lines (for example an extra trailing newline) are skipped
    instead of being passed to the JSON parser.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if an entry has no ``audio_filepath`` key.
    """
    print(f'reading manifest from {data_manifest_path}')
    with data_manifest_path.open("r") as pf:
        # Parse everything up front so the file is closed before any entry
        # is handed to the caller.
        pnr_data = [json.loads(line) for line in pf if line.strip()]
    for entry in pnr_data:
        rel_audio = Path(entry['audio_filepath'])
        entry['audio_path'] = data_manifest_path.parent / rel_audio
        entry['chars'] = rel_audio.stem
        yield entry
def asr_manifest_writer(asr_manifest_path: Path, manifest_str_source):
    """Write manifest entries to ``asr_manifest_path`` as JSON lines.

    ``manifest_str_source`` is an iterable of dicts providing the keys
    ``audio_filepath``, ``duration`` and ``text``; each one is formatted with
    ``manifest_str`` and written as one line. The target file is truncated.
    """
    with asr_manifest_path.open("w") as manifest_file:
        print(f'opening {asr_manifest_path} for writing manifest')
        manifest_file.writelines(
            manifest_str(
                entry['audio_filepath'], entry['duration'], entry['text']
            )
            for entry in manifest_str_source
        )
def main():
    """Print a freshly generated batch of random codes, one per line."""
    for code in random_pnr_generator():
        print(code)


# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    main()