plume-asr/plume/cli/data/__init__.py

import json
import shutil
from itertools import chain
from pathlib import Path
from typing import List

import soundfile
import typer

from plume.utils import (
    asr_manifest_reader,
    asr_manifest_writer,
    ExtendedPath,
    duration_str,
    generate_filter_map,
    get_mongo_conn,
    tscript_uuid_fname,
    lazy_callable,
)

from ...models.wav2vec2.data import app as wav2vec2_app
from .generate import app as generate_app

# defer the heavy sklearn import until train_test_split is first called
train_test_split = lazy_callable("sklearn.model_selection.train_test_split")

app = typer.Typer()
app.add_typer(generate_app, name="generate")
app.add_typer(wav2vec2_app, name="wav2vec2")
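
# Usage sketch (an assumption: this Typer app is mounted as the `data`
# sub-command of a top-level `plume` CLI; typer renders the function names
# below with hyphens, e.g. fix_path -> fix-path):
#   plume data fix-path /datasets/my_set
#   plume data split /datasets/my_set --test-size 0.05
#   plume data info /datasets/my_set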

@app.command()
def fix_path(dataset_path: Path, force: bool = False):
    """Create whichever of manifest.json / abs_manifest.json is missing from the other."""
    manifest_path = dataset_path / Path("manifest.json")
    real_manifest_path = dataset_path / Path("abs_manifest.json")

    def fix_real_path():
        for i in asr_manifest_reader(manifest_path):
            i["audio_filepath"] = str(
                (dataset_path / Path(i["audio_filepath"])).absolute()
            )
            yield i

    def fix_rel_path():
        for i in asr_manifest_reader(real_manifest_path):
            i["audio_filepath"] = str(
                Path(i["audio_filepath"]).relative_to(dataset_path)
            )
            yield i

    if not manifest_path.exists() and not real_manifest_path.exists():
        typer.echo("Invalid dataset directory: no manifest.json or abs_manifest.json found")
        raise typer.Exit(code=1)
    if not real_manifest_path.exists() or force:
        asr_manifest_writer(real_manifest_path, fix_real_path())
    if not manifest_path.exists():
        asr_manifest_writer(manifest_path, fix_rel_path())
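
# Manifest entries, as read and written above, are JSONL records of the form
# (illustrative file name):
#   {"audio_filepath": "wavs/<name>.wav", "duration": 2.4, "text": "hello world"}
# manifest.json keeps paths relative to the dataset root; abs_manifest.json
# keeps the same records with absolute paths.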

@app.command()
def augment(src_dataset_paths: List[Path], dest_dataset_path: Path):
    """Merge the abs_manifest.json of several source datasets into one destination manifest."""
    abs_manifest_path = Path("abs_manifest.json")
    reader_list = []
    for dataset_path in src_dataset_paths:
        manifest_path = dataset_path / abs_manifest_path
        reader_list.append(asr_manifest_reader(manifest_path))
    dest_dataset_path.mkdir(parents=True, exist_ok=True)
    dest_manifest_path = dest_dataset_path / abs_manifest_path
    asr_manifest_writer(dest_manifest_path, chain(*reader_list))

@app.command()
def split(dataset_path: Path, test_size: float = 0.03):
    manifest_path = dataset_path / Path("abs_manifest.json")
    if not manifest_path.exists():
        fix_path(dataset_path)
    asr_data = list(asr_manifest_reader(manifest_path))
    train_pnr, test_pnr = train_test_split(asr_data, test_size=test_size)
    asr_manifest_writer(manifest_path.with_name("train_manifest.json"), train_pnr)
    asr_manifest_writer(manifest_path.with_name("test_manifest.json"), test_pnr)

@app.command()
def validate(dataset_path: Path):
    """Check that every manifest entry parses and points to an existing audio file."""
    from datetime import timedelta

    from natural.date import compress

    for mf_type in ["train_manifest.json", "test_manifest.json"]:
        data_file = dataset_path / Path(mf_type)
        print(f"validating {data_file}.")
        with Path(data_file).open("r") as pf:
            pnr_jsonl = pf.readlines()
        duration = 0
        error_count = 0
        for i, s in enumerate(pnr_jsonl):
            try:
                d = json.loads(s)
                duration += d["duration"]
                audio_file = data_file.parent / Path(d["audio_filepath"])
                if not audio_file.exists():
                    raise OSError(f"File {audio_file} not found")
            except Exception as e:
                error_count += 1
                print(f'failed on {i} with "{e}"')
        dur_text = compress(timedelta(seconds=duration), pad=" ")
        if error_count == 0:
            print(
                f"no errors found. seems like a valid {mf_type}. contains {dur_text} of audio"
            )
        else:
            print(f"found {error_count} errors in {mf_type}. contains {dur_text} of audio")

@app.command()
def filter(src_dataset_path: Path, dest_dataset_path: Path, kind: str = "skip_dur"):
    dest_manifest = dest_dataset_path / Path("manifest.json")
    data_file = src_dataset_path / Path("manifest.json")
    dest_wav_dir = dest_dataset_path / Path("wavs")
    dest_wav_dir.mkdir(exist_ok=True, parents=True)
    filter_kind_map = generate_filter_map(
        src_dataset_path, dest_dataset_path, data_file
    )
    selected_filter = filter_kind_map.get(kind)
    if selected_filter:
        asr_manifest_writer(dest_manifest, selected_filter())
    else:
        typer.echo(f"filter kind - {kind} not implemented")
        typer.echo(f"select one of {', '.join(filter_kind_map.keys())}")

@app.command()
def info(dataset_path: Path):
    """Print duration statistics for every manifest variant present in the dataset."""
    for k in ["", "abs_", "train_", "test_"]:
        mf_wav_duration = real_duration = max_duration = empty_duration = 0
        empty_count = total_count = 0
        data_file = dataset_path / Path(f"{k}manifest.json")
        if not data_file.exists():
            continue
        print(f"stats on {data_file}")
        for s in ExtendedPath(data_file).read_jsonl():
            total_count += 1
            mf_wav_duration += s["duration"]
            if s["text"] == "":
                empty_count += 1
                empty_duration += s["duration"]
            wav_path = str(dataset_path / Path(s["audio_filepath"]))
            # read the audio header once per file instead of three times
            wav_duration = soundfile.info(wav_path).duration
            max_duration = max(max_duration, wav_duration)
            real_duration += wav_duration
        print(f"max audio duration : {duration_str(max_duration)}")
        print(f"total audio duration : {duration_str(mf_wav_duration)}")
        print(f"total real audio duration : {duration_str(real_duration)}")
        print(
            f"total content duration : {duration_str(mf_wav_duration - empty_duration)}"
        )
        print(f"total empty duration : {duration_str(empty_duration)}")
        if total_count:
            print(
                f"total empty samples : {empty_count}/{total_count} ({empty_count * 100 / total_count:.2f}%)"
            )

@app.command()
def audio_duration(dataset_path: Path):
    wav_duration = 0
    for audio_fname in dataset_path.absolute().glob("**/*.wav"):
        wav_duration += soundfile.info(str(audio_fname)).duration
    typer.echo(f"duration of wav files @ {dataset_path}: {duration_str(wav_duration)}")

@app.command()
def migrate(src_path: Path, dest_path: Path):
    """Copy a dataset to a new location, gathering all audio under dest_path/wavs."""
    shutil.copytree(str(src_path), str(dest_path))
    wav_dir = dest_path / Path("wavs")
    wav_dir.mkdir(exist_ok=True, parents=True)
    abs_manifest_path = ExtendedPath(dest_path / Path("abs_manifest.json"))
    backup_abs_manifest_path = abs_manifest_path.with_suffix(".json.orig")
    shutil.copy(abs_manifest_path, backup_abs_manifest_path)
    manifest_data = list(abs_manifest_path.read_jsonl())
    for md in manifest_data:
        orig_path = Path(md["audio_filepath"])
        new_path = wav_dir / Path(orig_path.name)
        shutil.copy(orig_path, new_path)
        md["audio_filepath"] = str(new_path)
    abs_manifest_path.write_jsonl(manifest_data)
    fix_path(dest_path)
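
# After migrate, every audio file lives under dest_path/wavs/ and
# abs_manifest.json points at those copies; the pre-rewrite manifest is kept
# as abs_manifest.json.orig, and fix_path fills in the relative manifest.json
# if it is absent.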

@app.command()
def task_split(
    data_dir: Path,
    dump_file: Path = Path("ui_dump.json"),
    task_count: int = typer.Option(2, show_default=True),
    task_file: str = "task_dump",
    sort: bool = True,
):
    """
    split ui_dump.json into `task_count` task files
    """
    import numpy as np
    import pandas as pd

    processed_data_path = data_dir / dump_file
    processed_data = ExtendedPath(processed_data_path).read_json()
    # shuffle all samples, then cut them into task_count roughly equal chunks
    df = pd.DataFrame(processed_data["data"]).sample(frac=1).reset_index(drop=True)
    for t_idx, task_f in enumerate(np.array_split(df, task_count)):
        task_f = task_f.reset_index(drop=True)
        task_f["real_idx"] = task_f.index
        task_data = task_f.to_dict("records")
        if sort:
            # surface the worst ASR hypotheses (highest WER) first
            task_data = sorted(task_data, key=lambda x: x["asr_wer"], reverse=True)
        processed_data["data"] = task_data
        task_path = data_dir / Path(task_file + f"-{t_idx}.json")
        ExtendedPath(task_path).write_json(processed_data)
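
# e.g. task_count=2 writes task_dump-0.json and task_dump-1.json, each a copy
# of the ui_dump structure whose "data" holds one shuffled subset.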

def get_corrections(task_uid):
    """Fetch correction records for a task, matching the full task_id or its uid suffix."""
    col = get_mongo_conn(col="asr_validation")
    matches = [
        c
        for c in col.distinct("task_id")
        if c.rsplit("-", 1)[-1] == task_uid or c == task_uid
    ]
    if not matches:
        raise ValueError(f"no task_id matching {task_uid} found")
    task_id = matches[0]
    cursor_obj = col.find(
        {"type": "correction", "task_id": task_id}, projection={"_id": False}
    )
    return list(cursor_obj)
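
# A correction record, as consumed by update_corrections below, looks like:
#   {"type": "correction", "task_id": "task-<uid>", "code": "<utterance_id>",
#    "value": {"status": "Correct" | "Incorrect", "correction": "<fixed text>"}}
# Any status other than Correct/Incorrect is treated as inaudible/invalid.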

@app.command()
def dump_task_corrections(data_dir: Path, task_uid: str):
    dump_fname: Path = Path(f"corrections-{task_uid}.json")
    dump_path = data_dir / dump_fname
    corrections = get_corrections(task_uid)
    ExtendedPath(dump_path).write_json(corrections)

@app.command()
def dump_all_corrections(data_dir: Path):
    for task_lck in data_dir.glob("task-*.lck"):
        task_uid = task_lck.stem.replace("task-", "")
        dump_task_corrections(data_dir, task_uid)

@app.command()
def update_corrections(
    data_dir: Path,
    skip_incorrect: bool = typer.Option(
        False,
        show_default=True,
        help="drop samples marked Incorrect instead of applying their corrections",
    ),
    skip_inaudible: bool = typer.Option(
        False,
        show_default=True,
        help="drop inaudible/invalid samples instead of keeping them with a blank target",
    ),
):
    """
    apply the corrections-*.json files to the dataset,
    backing up the original dataset first
    """
    manifest_file: Path = Path("manifest.json")
    renames_file: Path = Path("rename_map.json")
    ui_dump_file: Path = Path("ui_dump.json")
    data_manifest_path = data_dir / manifest_file
    renames_path = data_dir / renames_file

    def correct_ui_dump(data_dir, rename_result):
        ui_dump_path = data_dir / ui_dump_file
        corrections = [
            t
            for p in data_dir.glob("corrections-*.json")
            for t in ExtendedPath(p).read_json()
        ]
        ui_data = ExtendedPath(ui_dump_path).read_json()["data"]
        correct_set = {
            c["code"] for c in corrections if c["value"]["status"] == "Correct"
        }
        correction_map = {
            c["code"]: c["value"]["correction"]
            for c in corrections
            if c["value"]["status"] == "Incorrect"
        }
        for d in ui_data:
            orig_audio_path = (data_dir / Path(d["audio_path"])).absolute()
            if d["utterance_id"] in correct_set:
                # transcript confirmed correct: keep as-is
                d["corrected_from"] = d["text"]
                yield d
            elif d["utterance_id"] in correction_map:
                correct_text = correction_map[d["utterance_id"]]
                if skip_incorrect:
                    ap = d["audio_path"]
                    print(f"skipping incorrect {ap} corrected to {correct_text}")
                    orig_audio_path.unlink()
                else:
                    # rename the audio file to match the corrected transcript
                    new_fname = tscript_uuid_fname(correct_text)
                    rename_result[new_fname] = {
                        "orig_text": d["text"],
                        "correct_text": correct_text,
                        "orig_id": d["utterance_id"],
                    }
                    new_name = str(Path(new_fname).with_suffix(".wav"))
                    new_audio_path = orig_audio_path.with_name(new_name)
                    orig_audio_path.replace(new_audio_path)
                    new_filepath = str(Path(d["audio_path"]).with_name(new_name))
                    d["corrected_from"] = d["text"]
                    d["text"] = correct_text
                    d["audio_path"] = new_filepath
                    yield d
            else:
                # no verdict: treat the sample as inaudible/invalid
                if skip_inaudible:
                    orig_audio_path.unlink()
                else:
                    d["corrected_from"] = d["text"]
                    d["text"] = ""
                    yield d

    dataset_dir = data_manifest_path.parent
    dataset_name = dataset_dir.name
    backup_dir = dataset_dir.with_name(dataset_name + ".bkp")
    if not backup_dir.exists():
        typer.echo(f"backing up to {backup_dir}")
        shutil.copytree(str(dataset_dir), str(backup_dir))
    renames = {}
    corrected_ui_dump = list(correct_ui_dump(data_dir, renames))
    ExtendedPath(data_dir / ui_dump_file).write_json({"data": corrected_ui_dump})
    corrected_manifest = (
        {
            "audio_filepath": d["audio_path"],
            "duration": d["duration"],
            "text": d["text"],
        }
        for d in corrected_ui_dump
    )
    asr_manifest_writer(data_manifest_path, corrected_manifest)
    ExtendedPath(renames_path).write_json(renames)
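
# Typical review loop with these commands (a sketch, not a prescribed
# pipeline): task-split the ui dump, collect annotator verdicts in mongo,
# dump-all-corrections, then update-corrections to rewrite the dataset
# (a .bkp copy of the original is kept).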

def main():
    app()


if __name__ == "__main__":
    main()