1. clean up unused data-processing code
2. fix invalid sample number read from mongo
3. data loader service returns a remote netref
parent fdccea6b23
commit d4aef4088d
@@ -1,15 +1,17 @@
-import os
-# from pathlib import Path
-
 import typer
 import rpyc
+import os
+
+from pathlib import Path
 
 from rpyc.utils.server import ThreadedServer
 
+import nemo.collections.asr as nemo_asr
 
 
 app = typer.Typer()
 
 
 class ASRDataService(rpyc.Service):
-    def get_data_loader(self, data_manifest: Path):
-        return "hello"
+    def get_data_loader(self):
+        return nemo_asr.AudioToTextDataLayer
 
 
 @app.command()
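For reference, a minimal client-side sketch of how the new get_data_loader could be consumed over rpyc. The host, port, and server start-up are assumptions not shown in this diff, and by default rpyc only exposes methods prefixed with exposed_, so the ThreadedServer would need to be started with protocol_config={"allow_public_attrs": True} (or the method renamed exposed_get_data_loader) for this call to go through:

import rpyc

# Hypothetical client; the address and port are assumptions, not part of this commit.
conn = rpyc.connect("localhost", 18861)

# get_data_loader() returns the nemo_asr.AudioToTextDataLayer class as a netref,
# i.e. a proxy to the class object living in the server process, not a local copy.
RemoteDataLayer = conn.root.get_data_loader()
print(RemoteDataLayer)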
@@ -1,34 +1,12 @@
 import json
 from pathlib import Path
 from sklearn.model_selection import train_test_split
-from .utils import alnum_to_asr_tokens, asr_manifest_reader, asr_manifest_writer
+from .utils import asr_manifest_reader, asr_manifest_writer
 import typer
 
 app = typer.Typer()
 
 
-@app.command()
-def separate_space_convert_digit_setpath():
-    with Path("/home/malar/work/asr-data-utils/asr_data/pnr_data.json").open("r") as pf:
-        pnr_jsonl = pf.readlines()
-
-    pnr_data = [json.loads(i) for i in pnr_jsonl]
-
-    new_pnr_data = []
-    for i in pnr_data:
-        i["text"] = alnum_to_asr_tokens(i["text"])
-        i["audio_filepath"] = i["audio_filepath"].replace(
-            "pnr_data/", "/dataset/asr_data/pnr_data/wav/"
-        )
-        new_pnr_data.append(i)
-
-    new_pnr_jsonl = [json.dumps(i) for i in new_pnr_data]
-
-    with Path("/dataset/asr_data/pnr_data/pnr_data.json").open("w") as pf:
-        new_pnr_data = "\n".join(new_pnr_jsonl)  # + "\n"
-        pf.write(new_pnr_data)
-
-
 @app.command()
 def split_data(dataset_path: Path, test_size: float = 0.1):
     manifest_path = dataset_path / Path("abs_manifest.json")
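The deleted separate_space_convert_digit_setpath command rewrote a JSON-lines manifest in place, mapping each sample's text through alnum_to_asr_tokens and re-rooting its audio_filepath. A rough standalone sketch of that pattern, with the hard-coded paths turned into parameters and the project-specific alnum_to_asr_tokens step omitted since its implementation is not shown here:

import json
from pathlib import Path


def rewrite_manifest(src: Path, dest: Path, old_prefix: str, new_prefix: str) -> None:
    # Each manifest line is a JSON object with "audio_filepath" and "text" keys.
    with src.open("r") as pf:
        samples = [json.loads(line) for line in pf if line.strip()]

    for sample in samples:
        # The removed command also ran sample["text"] through alnum_to_asr_tokens here.
        sample["audio_filepath"] = sample["audio_filepath"].replace(old_prefix, new_prefix)

    # Write the result back as newline-delimited JSON, matching the manifest layout.
    with dest.open("w") as pf:
        pf.write("\n".join(json.dumps(sample) for sample in samples))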
@@ -50,19 +28,6 @@ def fixate_data(dataset_path: Path):
     asr_manifest_writer(real_manifest_path, fix_path())
 
-    # with manifest_path.open("r") as pf:
-    #     pnr_jsonl = pf.readlines()
-    # pnr_data = [json.loads(i) for i in pnr_jsonl]
-    # new_pnr_data = []
-    # for i in pnr_data:
-    #     i["audio_filepath"] = str(dataset_path / Path(i["audio_filepath"]))
-    #     new_pnr_data.append(i)
-    # new_pnr_jsonl = [json.dumps(i) for i in new_pnr_data]
-    # real_manifest_path = dataset_path / Path("abs_manifest.json")
-    # with real_manifest_path.open("w") as pf:
-    #     new_pnr_data = "\n".join(new_pnr_jsonl)  # + "\n"
-    #     pf.write(new_pnr_data)
-
 
 
 @app.command()
 def augment_an4():
@@ -77,9 +42,6 @@ def augment_an4():
         pf.write(an4_test + pnr_test)
 
 
-# augment_an4()
-
-
 @app.command()
 def validate_data(data_file: Path):
     with Path(data_file).open("r") as pf:
@@ -101,23 +63,3 @@ def main():
 
 if __name__ == "__main__":
     main()
-
-# def convert_digits(data_file="/dataset/asr_data/an4_pnr/test_manifest.json"):
-#     with Path(data_file).open("r") as pf:
-#         pnr_jsonl = pf.readlines()
-#
-#     pnr_data = [json.loads(i) for i in pnr_jsonl]
-#     new_pnr_data = []
-#     for i in pnr_data:
-#         num_tokens = [num2words(c) for c in i["text"] if "0" <= c <= "9"]
-#         i["text"] = "".join(num_tokens)
-#         new_pnr_data.append(i)
-#
-#     new_pnr_jsonl = [json.dumps(i) for i in new_pnr_data]
-#
-#     with Path(data_file).open("w") as pf:
-#         new_pnr_data = "\n".join(new_pnr_jsonl)  # + "\n"
-#         pf.write(new_pnr_data)
-#
-#
-# convert_digits(data_file="/dataset/asr_data/an4_pnr/train_manifest.json")
@@ -63,12 +63,13 @@ def main(manifest: Path):
     asr_data = ui_config["data"]
     use_domain_asr = ui_config["use_domain_asr"]
     sample_no = st.get_current_cursor()
+    if len(asr_data) - 1 < sample_no or sample_no < 0:
+        print("Invalid samplno resetting to 0")
+        st.update_cursor(0)
     sample = asr_data[sample_no]
-    title_type = 'Speller ' if use_domain_asr else ''
+    title_type = "Speller " if use_domain_asr else ""
     st.title(f"ASR {title_type}Validation")
-    addl_text = (
-        f"spelled *{sample['spoken']}*" if use_domain_asr else ""
-    )
+    addl_text = f"spelled *{sample['spoken']}*" if use_domain_asr else ""
     st.markdown(f"{sample_no+1} of {len(asr_data)} : **{sample['text']}**" + addl_text)
     new_sample = st.number_input(
         "Go To Sample:", value=sample_no + 1, min_value=1, max_value=len(asr_data)
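The guard added above resets the persisted cursor when the sample number read from mongo is out of range (st.get_current_cursor / st.update_cursor are this project's own helpers, not stock Streamlit), but within this hunk the local sample_no is not reassigned before asr_data[sample_no], unless the update triggers a rerun elsewhere. A small sketch of the same check as a pure helper that also clamps the value used for indexing; the function name is hypothetical:

def clamp_cursor(sample_no: int, num_samples: int) -> int:
    # Fall back to the first sample when the stored cursor points outside the dataset.
    if sample_no < 0 or sample_no > num_samples - 1:
        print(f"Invalid sample_no {sample_no}, resetting to 0")
        return 0
    return sample_no


# Usage mirroring the diff above:
#     sample_no = clamp_cursor(st.get_current_cursor(), len(asr_data))
#     sample = asr_data[sample_no]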