1
0
mirror of https://github.com/malarinv/jasper-asr.git synced 2026-03-09 19:02:35 +00:00

1. implemented dataset augmentation and validation in process

2. added option to skip 'incorrect' annotations in validation data
3. added confirmation on clearing mongo collection
4. added an option to navigate to a given text in the validation ui
5. added dataset and remote options to the trainer to load a dataset from a directory or from a remote rpyc service
This commit is contained in:
2020-05-20 11:16:22 +05:30
parent 83db445a6f
commit 8e79bbb571
4 changed files with 85 additions and 58 deletions

View File

@@ -143,8 +143,8 @@ def fill_unannotated(
@app.command()
def update_corrections(
data_manifest_path: Path = Path("./data/asr_data/call_alphanum/manifest.json"),
processed_data_path: Path = Path("./data/valiation_data/ui_dump.json"),
corrections_path: Path = Path("./data/valiation_data/corrections.json"),
skip_incorrect: bool = True,
):
def correct_manifest(manifest_data_gen, corrections_path):
corrections = json.load(corrections_path.open())
@@ -170,15 +170,18 @@ def update_corrections(
}
elif d["chars"] in correction_map:
correct_text = correction_map[d["chars"]]
renamed_set.add(correct_text)
new_name = str(Path(correct_text).with_suffix(".wav"))
d["audio_path"].replace(d["audio_path"].with_name(new_name))
new_filepath = str(Path(d["audio_filepath"]).with_name(new_name))
yield {
"audio_filepath": new_filepath,
"duration": d["duration"],
"text": alnum_to_asr_tokens(correct_text),
}
if skip_incorrect:
print(f'skipping incorrect {d["audio_path"]} corrected to {correct_text}')
else:
renamed_set.add(correct_text)
new_name = str(Path(correct_text).with_suffix(".wav"))
d["audio_path"].replace(d["audio_path"].with_name(new_name))
new_filepath = str(Path(d["audio_filepath"]).with_name(new_name))
yield {
"audio_filepath": new_filepath,
"duration": d["duration"],
"text": alnum_to_asr_tokens(correct_text),
}
else:
# don't delete if another correction points to an old file
if d["chars"] not in renamed_set:
@@ -202,8 +205,12 @@ def update_corrections(
@app.command()
def clear_mongo_corrections():
col = get_mongo_conn().test.asr_validation
col.delete_many({"type": "correction"})
delete = typer.confirm("are you sure you want to clear mongo collection it?")
if delete:
col = get_mongo_conn().test.asr_validation
col.delete_many({"type": "correction"})
typer.echo("deleted mongo collection.")
typer.echo("Aborted")
def main():

View File

@@ -53,7 +53,7 @@ if not hasattr(st, "mongo_connected"):
@st.cache()
def load_ui_data(validation_ui_data_path: Path):
typer.echo(f"Using validation ui data from :{validation_ui_data_path}")
typer.echo(f"Using validation ui data from {validation_ui_data_path}")
return ExtendedPath(validation_ui_data_path).read_json()
@@ -117,6 +117,11 @@ def main(manifest: Path):
# pass
# if st.button("Next Untagged"):
# pass
text_sample = st.text_input("Go to Text:", value='')
if text_sample != '':
candidates = [i for (i, p) in enumerate(asr_data) if p["text"] == text_sample or p["spoken"] == text_sample]
if len(candidates) > 0:
st.update_cursor(candidates[0])
real_idx = st.number_input(
"Go to real-index",
value=sample["real_idx"],