1. clean up unused data-processing code

2. fix invalid sample number from mongo
3. data loader service returns remote netref
2026-03-08 10:32:35 +00:00 · 2020-05-13 14:02:46 +05:30
parent fdccea6b23
commit d4aef4088d
3 changed files with 12 additions and 67 deletions
--- a/jasper/data_utils/data_server.py
+++ b/jasper/data_utils/data_server.py
@@ -1,15 +1,17 @@
+import os
+# from pathlib import Path
+
 import typer
 import rpyc
-import os
-from pathlib import Path
 from rpyc.utils.server import ThreadedServer
+import nemo.collections.asr as nemo_asr

 app = typer.Typer()


 class ASRDataService(rpyc.Service):
-    def get_data_loader(self, data_manifest: Path):
-        return "hello"
+    def get_data_loader(self):
+        return nemo_asr.AudioToTextDataLayer


@app.command()
--- a/jasper/data_utils/process.py
+++ b/jasper/data_utils/process.py
@@ -1,34 +1,12 @@
 import json
 from pathlib import Path
 from sklearn.model_selection import train_test_split
-from .utils import alnum_to_asr_tokens, asr_manifest_reader, asr_manifest_writer
+from .utils import asr_manifest_reader, asr_manifest_writer
 import typer

 app = typer.Typer()


-@app.command()
-def separate_space_convert_digit_setpath():
-    with Path("/home/malar/work/asr-data-utils/asr_data/pnr_data.json").open("r") as pf:
-        pnr_jsonl = pf.readlines()
-
-    pnr_data = [json.loads(i) for i in pnr_jsonl]
-
-    new_pnr_data = []
-    for i in pnr_data:
-        i["text"] = alnum_to_asr_tokens(i["text"])
-        i["audio_filepath"] = i["audio_filepath"].replace(
-            "pnr_data/", "/dataset/asr_data/pnr_data/wav/"
-        )
-        new_pnr_data.append(i)
-
-    new_pnr_jsonl = [json.dumps(i) for i in new_pnr_data]
-
-    with Path("/dataset/asr_data/pnr_data/pnr_data.json").open("w") as pf:
-        new_pnr_data = "\n".join(new_pnr_jsonl)  # + "\n"
-        pf.write(new_pnr_data)
-
-
@app.command()
 def split_data(dataset_path: Path, test_size: float = 0.1):
    manifest_path = dataset_path / Path("abs_manifest.json")
@@ -50,19 +28,6 @@ def fixate_data(dataset_path: Path):

    asr_manifest_writer(real_manifest_path, fix_path())

-    # with manifest_path.open("r") as pf:
-    #     pnr_jsonl = pf.readlines()
-    #     pnr_data = [json.loads(i) for i in pnr_jsonl]
-    #     new_pnr_data = []
-    #     for i in pnr_data:
-    #         i["audio_filepath"] = str(dataset_path / Path(i["audio_filepath"]))
-    #         new_pnr_data.append(i)
-    #     new_pnr_jsonl = [json.dumps(i) for i in new_pnr_data]
-    #     real_manifest_path = dataset_path / Path("abs_manifest.json")
-    #     with real_manifest_path.open("w") as pf:
-    #         new_pnr_data = "\n".join(new_pnr_jsonl)  # + "\n"
-    #         pf.write(new_pnr_data)
-

@app.command()
 def augment_an4():
@@ -77,9 +42,6 @@ def augment_an4():
        pf.write(an4_test + pnr_test)


-# augment_an4()
-
-
@app.command()
 def validate_data(data_file: Path):
    with Path(data_file).open("r") as pf:
@@ -101,23 +63,3 @@ def main():

 if __name__ == "__main__":
    main()
-
-# def convert_digits(data_file="/dataset/asr_data/an4_pnr/test_manifest.json"):
-#     with Path(data_file).open("r") as pf:
-#         pnr_jsonl = pf.readlines()
-#
-#     pnr_data = [json.loads(i) for i in pnr_jsonl]
-#     new_pnr_data = []
-#     for i in pnr_data:
-#         num_tokens = [num2words(c) for c in i["text"] if "0" <= c <= "9"]
-#         i["text"] = "".join(num_tokens)
-#         new_pnr_data.append(i)
-#
-#     new_pnr_jsonl = [json.dumps(i) for i in new_pnr_data]
-#
-#     with Path(data_file).open("w") as pf:
-#         new_pnr_data = "\n".join(new_pnr_jsonl)  # + "\n"
-#         pf.write(new_pnr_data)
-#
-#
-# convert_digits(data_file="/dataset/asr_data/an4_pnr/train_manifest.json")
--- a/jasper/data_utils/validation/ui.py
+++ b/jasper/data_utils/validation/ui.py
@@ -63,12 +63,13 @@ def main(manifest: Path):
    asr_data = ui_config["data"]
    use_domain_asr = ui_config["use_domain_asr"]
    sample_no = st.get_current_cursor()
+    if len(asr_data) - 1 < sample_no or sample_no < 0:
+        print("Invalid samplno resetting to 0")
+        st.update_cursor(0)
    sample = asr_data[sample_no]
-    title_type = 'Speller ' if use_domain_asr else ''
+    title_type = "Speller " if use_domain_asr else ""
    st.title(f"ASR {title_type}Validation")
-    addl_text = (
-        f"spelled *{sample['spoken']}*" if use_domain_asr else ""
-    )
+    addl_text = f"spelled *{sample['spoken']}*" if use_domain_asr else ""
    st.markdown(f"{sample_no+1} of {len(asr_data)} : **{sample['text']}**" + addl_text)
    new_sample = st.number_input(
        "Go To Sample:", value=sample_no + 1, min_value=1, max_value=len(asr_data)