diff --git a/jasper/data_utils/data_server.py b/jasper/data_utils/data_server.py
index 3a912a8..eecd848 100644
--- a/jasper/data_utils/data_server.py
+++ b/jasper/data_utils/data_server.py
@@ -1,15 +1,17 @@
+import os
+# from pathlib import Path
+
 import typer
 import rpyc
-import os
-from pathlib import Path
 from rpyc.utils.server import ThreadedServer
+import nemo.collections.asr as nemo_asr
 
 app = typer.Typer()
 
 
 class ASRDataService(rpyc.Service):
-    def get_data_loader(self, data_manifest: Path):
-        return "hello"
+    def get_data_loader(self):
+        return nemo_asr.AudioToTextDataLayer
 
 
 @app.command()
diff --git a/jasper/data_utils/process.py b/jasper/data_utils/process.py
index 5d184f7..65e66bf 100644
--- a/jasper/data_utils/process.py
+++ b/jasper/data_utils/process.py
@@ -1,34 +1,12 @@
 import json
 from pathlib import Path
 from sklearn.model_selection import train_test_split
-from .utils import alnum_to_asr_tokens, asr_manifest_reader, asr_manifest_writer
+from .utils import asr_manifest_reader, asr_manifest_writer
 import typer
 
 app = typer.Typer()
 
 
-@app.command()
-def separate_space_convert_digit_setpath():
-    with Path("/home/malar/work/asr-data-utils/asr_data/pnr_data.json").open("r") as pf:
-        pnr_jsonl = pf.readlines()
-
-    pnr_data = [json.loads(i) for i in pnr_jsonl]
-
-    new_pnr_data = []
-    for i in pnr_data:
-        i["text"] = alnum_to_asr_tokens(i["text"])
-        i["audio_filepath"] = i["audio_filepath"].replace(
-            "pnr_data/", "/dataset/asr_data/pnr_data/wav/"
-        )
-        new_pnr_data.append(i)
-
-    new_pnr_jsonl = [json.dumps(i) for i in new_pnr_data]
-
-    with Path("/dataset/asr_data/pnr_data/pnr_data.json").open("w") as pf:
-        new_pnr_data = "\n".join(new_pnr_jsonl)  # + "\n"
-        pf.write(new_pnr_data)
-
-
 @app.command()
 def split_data(dataset_path: Path, test_size: float = 0.1):
     manifest_path = dataset_path / Path("abs_manifest.json")
@@ -50,19 +28,6 @@ def fixate_data(dataset_path: Path):
 
     asr_manifest_writer(real_manifest_path, fix_path())
 
-    # with manifest_path.open("r") as pf:
-    #     pnr_jsonl = pf.readlines()
-    #     pnr_data = [json.loads(i) for i in pnr_jsonl]
-    #     new_pnr_data = []
-    #     for i in pnr_data:
-    #         i["audio_filepath"] = str(dataset_path / Path(i["audio_filepath"]))
-    #         new_pnr_data.append(i)
-    #     new_pnr_jsonl = [json.dumps(i) for i in new_pnr_data]
-    #     real_manifest_path = dataset_path / Path("abs_manifest.json")
-    #     with real_manifest_path.open("w") as pf:
-    #         new_pnr_data = "\n".join(new_pnr_jsonl)  # + "\n"
-    #         pf.write(new_pnr_data)
-
 
 @app.command()
 def augment_an4():
@@ -77,9 +42,6 @@ def augment_an4():
         pf.write(an4_test + pnr_test)
 
 
-# augment_an4()
-
-
 @app.command()
 def validate_data(data_file: Path):
     with Path(data_file).open("r") as pf:
@@ -101,23 +63,3 @@ def main():
 
 if __name__ == "__main__":
     main()
-
-# def convert_digits(data_file="/dataset/asr_data/an4_pnr/test_manifest.json"):
-#     with Path(data_file).open("r") as pf:
-#         pnr_jsonl = pf.readlines()
-#
-#     pnr_data = [json.loads(i) for i in pnr_jsonl]
-#     new_pnr_data = []
-#     for i in pnr_data:
-#         num_tokens = [num2words(c) for c in i["text"] if "0" <= c <= "9"]
-#         i["text"] = "".join(num_tokens)
-#         new_pnr_data.append(i)
-#
-#     new_pnr_jsonl = [json.dumps(i) for i in new_pnr_data]
-#
-#     with Path(data_file).open("w") as pf:
-#         new_pnr_data = "\n".join(new_pnr_jsonl)  # + "\n"
-#         pf.write(new_pnr_data)
-#
-#
-# convert_digits(data_file="/dataset/asr_data/an4_pnr/train_manifest.json")
diff --git a/jasper/data_utils/validation/ui.py b/jasper/data_utils/validation/ui.py
index 6d495cf..7467b0e 100644
--- a/jasper/data_utils/validation/ui.py
+++ b/jasper/data_utils/validation/ui.py
@@ -63,12 +63,14 @@ def main(manifest: Path):
     asr_data = ui_config["data"]
     use_domain_asr = ui_config["use_domain_asr"]
     sample_no = st.get_current_cursor()
+    if len(asr_data) - 1 < sample_no or sample_no < 0:
+        print("Invalid sample_no, resetting to 0")
+        st.update_cursor(0)
+        sample_no = 0
     sample = asr_data[sample_no]
-    title_type = 'Speller ' if use_domain_asr else ''
+    title_type = "Speller " if use_domain_asr else ""
     st.title(f"ASR {title_type}Validation")
-    addl_text = (
-        f"spelled *{sample['spoken']}*" if use_domain_asr else ""
-    )
+    addl_text = f"spelled *{sample['spoken']}*" if use_domain_asr else ""
     st.markdown(f"{sample_no+1} of {len(asr_data)} : **{sample['text']}**" + addl_text)
     new_sample = st.number_input(
         "Go To Sample:", value=sample_no + 1, min_value=1, max_value=len(asr_data)