1. include additional ui dependencies
2. set sample width to 1 for wav2vec2 training data export from jasper
3. add 'audio_seg' key to asr_manifest_reader
4. add alpha rules
5. bugfixes and tests
parent 076b0d11e3
commit db51553320
setup.py | 2 ++
@@ -82,6 +82,8 @@ extra_requirements = {
        "google-cloud-speech~=1.3.1",
    ],
    "ui": [
        "pyspellchecker~=0.6.2",
        "google-cloud-texttospeech~=1.0.1",
        "rangehttpserver~=1.2.0",
    ],
    "crypto": ["cryptography~=3.4.7"],
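Per the hunk header, two of these lines are new entries in the optional "ui" dependency group (the "additional ui dependencies" from the commit message). Assuming extra_requirements is what the setup() call exposes as extras_require, which this hunk does not show, the group can be pulled into a source checkout with: pip install -e ".[ui]"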
@@ -42,6 +42,7 @@ def export_jasper(src_dataset_path: Path, dest_dataset_path: Path, unlink: bool
            pydub.AudioSegment.from_wav(wav_path)
            .set_frame_rate(16000)
            .set_channels(1)
            .set_sample_width(1)
        )
        dest_path = dest_dataset_path / Path("wavs") / Path(wav_path.name)
        audio_seg.export(dest_path, format="wav")
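The chained pydub calls re-encode every training wav as 16 kHz mono PCM, and set_sample_width takes a width in bytes, so the new .set_sample_width(1) call gives 8-bit samples, which is the "set sample width to 1" change named in the commit message. A standalone sketch of the same conversion, with hypothetical input and output paths (the real export walks a Jasper-style dataset directory):

from pathlib import Path

import pydub

# Hypothetical paths for illustration only.
wav_path = Path("data/jasper/wavs/utt_0001.wav")
dest_path = Path("data/wav2vec2/wavs") / wav_path.name
dest_path.parent.mkdir(parents=True, exist_ok=True)

audio_seg = (
    pydub.AudioSegment.from_wav(str(wav_path))
    .set_frame_rate(16000)  # resample to 16 kHz
    .set_channels(1)        # downmix to mono
    .set_sample_width(1)    # 1 byte per sample, i.e. 8-bit
)
audio_seg.export(str(dest_path), format="wav")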
@@ -24,7 +24,7 @@ def rpyc(
):
    for p in [w2v_path, ctc_path, target_dict_path]:
        if not p.exists():
            logging.info(f"{p} doesn't exists")
            typer.echo(f"{p} doesn't exists")
            return
    w2vasr = Wav2Vec2ASR(str(ctc_path), str(w2v_path), str(target_dict_path))
    service = ASRService(w2vasr)
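The guard above logs and echoes a message for every missing model file before the Wav2Vec2ASR and ASRService objects are built. A generic sketch of that pattern with a hypothetical check_paths helper; the repo's own classes are not shown in this diff:

import logging
from pathlib import Path
from typing import Iterable

import typer


def check_paths(paths: Iterable[Path]) -> bool:
    # Report every missing file via both the log and the CLI, as the
    # rpyc command does, then let the caller bail out early.
    ok = True
    for p in paths:
        if not p.exists():
            logging.info(f"{p} doesn't exist")
            typer.echo(f"{p} doesn't exist")
            ok = False
    return ok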
@@ -2,6 +2,7 @@ from pathlib import Path

# from tqdm import tqdm
import json
from .audio import audio_wav_bytes_to_seg

# from .extended_path import ExtendedPath
# from .parallel import parallel_apply
@@ -21,6 +22,10 @@ def asr_manifest_reader(data_manifest_path: Path):
    for p in data_data:
        p["audio_path"] = data_manifest_path.parent / Path(p["audio_filepath"])
        p["text"] = p["text"].strip()
        # import pdb; pdb.set_trace()
        p["audio_seg"] = audio_wav_bytes_to_seg(
            (data_manifest_path.parent / p["audio_filepath"]).read_bytes()
        )
        yield p
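With the new "audio_seg" key, each yielded manifest entry carries a decoded audio segment alongside the resolved path and the stripped transcript, so callers no longer have to reopen the wav file themselves. A simplified, self-contained sketch of the reader plus a caller, assuming a JSON-lines manifest with "audio_filepath" and "text" fields and assuming audio_wav_bytes_to_seg simply decodes in-memory wav bytes with pydub (the repo's real helpers may differ):

import io
import json
from pathlib import Path

import pydub


def audio_wav_bytes_to_seg(wav_bytes: bytes) -> pydub.AudioSegment:
    # Assumed behaviour of the .audio helper: decode in-memory wav bytes.
    return pydub.AudioSegment.from_wav(io.BytesIO(wav_bytes))


def asr_manifest_reader(data_manifest_path: Path):
    # Simplified stand-in for the repo's reader: one JSON object per line,
    # with "audio_filepath" and "text" keys relative to the manifest.
    with data_manifest_path.open() as f:
        data_data = [json.loads(line) for line in f if line.strip()]
    for p in data_data:
        p["audio_path"] = data_manifest_path.parent / Path(p["audio_filepath"])
        p["text"] = p["text"].strip()
        p["audio_seg"] = audio_wav_bytes_to_seg(p["audio_path"].read_bytes())
        yield p


if __name__ == "__main__":
    for entry in asr_manifest_reader(Path("data/train_manifest.json")):
        # len() of an AudioSegment is its duration in milliseconds.
        print(entry["audio_path"].name, len(entry["audio_seg"]), "ms")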
@@ -147,6 +147,43 @@ def do_tri_verbose_list():
    ] + ["hundred"]


def default_alpha_rules(oh_is_zero, i_oh_limit):
    o_i_vars = r"(\[?(?:A|Oh|O|I)\]?)"
    i_oh_limit_rules = [
        (r"\b([b-hj-np-z])\b", "\\1"),
        # (
        #     r"\b((?:"
        #     + al_num_regex
        #     + r"|^)\b\s*)(I|O)(\s*\b)(?="
        #     + al_num_regex
        #     + r"\s+|$)\b",
        #     "\\1[\\2]\\3",
        # ),
        # (
        #     r"\b" + o_i_vars + r"(\s+)" + o_i_vars + r"\b",
        #     "[\\1]\\2[\\3]",
        # ),
        (
            r"(\s+|^)" + o_i_vars + r"(\s+)\[?" + o_i_vars + r"\]?(\s+|$)",
            "\\1[\\2]\\3[\\4]\\5",
        ),
        (
            r"(\s+|^)\[?" + o_i_vars + r"\]?(\s+)" + o_i_vars + r"(\s+|$)",
            "\\1[\\2]\\3[\\4]\\5",
        ),
    ]
    entity_rules = (
        [(r"\boh\b", "o")]
        + [
            (r"\bdouble(?: |-)(\w+|\d+)\b", "\\1 \\1"),
            (r"\btriple(?: |-)(\w+|\d+)\b", "\\1 \\1 \\1"),
            # (r"\b([a-zA-Z])\b", "\\1"),
        ]
        + (i_oh_limit_rules if i_oh_limit else [(r"\b([a-zA-Z])\b", "\\1")])
    )
    return entity_rules


def default_alnum_rules(num_range, oh_is_zero, i_oh_limit):
    oh_is_zero_rules = [
        (r"\boh\b", "0"),
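Every entry produced by default_alpha_rules is a (pattern, replacement) pair that the keeper machinery applies in order via regex substitution; entity_replacer_keeper itself is not part of this diff. A small sketch using only the double/triple expansion rules copied from the hunk above:

import re

# Two rules from default_alpha_rules: expand spoken
# "double X" / "triple X" into repeated tokens.
rules = [
    (r"\bdouble(?: |-)(\w+|\d+)\b", "\\1 \\1"),
    (r"\btriple(?: |-)(\w+|\d+)\b", "\\1 \\1 \\1"),
]

text = "my code is double seven triple 4 oh nine"
for pattern, replacement in rules:
    text = re.sub(pattern, replacement, text)
print(text)
# -> "my code is seven seven 4 4 4 oh nine"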
@@ -285,6 +322,34 @@ def alnum_keeper(num_range=100, oh_is_zero=False):
    return keeper


def alpha_keeper(oh_is_zero=False):
    entity_rules = default_alpha_rules(oh_is_zero, i_oh_limit=True)

    # def strip_space(match_obj):
    #     # char_elem = match_obj.group(1)
    #     return match_obj.group(1).strip() + match_obj.group(2).strip()

    pre_rules = [
        (r"[ ;,.]", " "),
        (r"[']", ""),
        # (
        #     r"((?:(?<=\w{2,2})|^)\s*)(?:\bI\b|\bi\b|\bOh\b|\boh\b)(\s*(?:\w{2,}|$))",
        #     strip_space,
        # ),
    ]

    post_rules = [
        # (
        #     r"((?:(?<=\w{2,2})|^)\s*)(?:\bI\b|\bi\b|\bOh\b|\boh\b)(\s*(?:\w{2,}|$))",
        #     strip_space,
        # )
    ]
    replacer, keeper = entity_replacer_keeper(
        pre_rules=pre_rules, entity_rules=entity_rules, post_rules=post_rules
    )
    return keeper


def num_keeper_orig(num_range=10, extra_rules=[]):
    num_int_map_ty = [
        (
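The pre_rules in alpha_keeper only normalize the input before the entity rules run: spaces, semicolons, commas and periods each become a space and apostrophes are dropped, which is why contractions like "I'll" and "I'm" in the tests below reach the alpha rules without a quote character. A quick standalone check of just that normalization step (the full pipeline goes through entity_replacer_keeper):

import re

pre_rules = [
    (r"[ ;,.]", " "),  # each space, semicolon, comma or period becomes a space
    (r"[']", ""),      # drop apostrophes, so "I'll" becomes "Ill"
]

text = "I'll phone number, fifty. A B"
for pattern, replacement in pre_rules:
    text = re.sub(pattern, replacement, text)
print(repr(text))
# -> 'Ill phone number  fifty  A B'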
@@ -377,9 +442,7 @@ def vocab_corrector_gen(vocab, distance=1, method="spell"):
            # return " ".join(
            #     [spell.correction(tok) for tok in spell.split_words(inp)]
            # )
            return " ".join(
                [spell.correction(tok) for tok in inp.split()]
            )
            return " ".join([spell.correction(tok) for tok in inp.split()])

    elif method == "edit":
        # editdistance.eval("banana", "bahama")
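The simplified return above feeds each whitespace token through pyspellchecker, the ~=0.6.2 dependency pinned in setup.py. A rough sketch of the "spell" branch with the checker restricted to a small hypothetical vocabulary; how vocab_corrector_gen actually constructs the checker is outside this hunk:

from spellchecker import SpellChecker

# Start from an empty dictionary and load only the target vocabulary,
# so corrections can never leave the allowed word list.
vocab = ["phone", "number", "fifty", "hundred", "eight"]
spell = SpellChecker(language=None, distance=1)
spell.word_frequency.load_words(vocab)


def correct(inp: str) -> str:
    # Correct each whitespace token independently; newer pyspellchecker
    # releases return None when no candidate exists, hence the fallback.
    return " ".join(spell.correction(tok) or tok for tok in inp.split())


print(correct("phonee numbr fifty eigth"))
# -> "phone number fifty eight"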
@@ -87,6 +87,30 @@ def test_alnum_keeper():
    )


def test_alpha_keeper():
    keeper = alnum_keeper()
    assert keeper("I One hundred n fifty-eight not 5 oh o fifty A B more") == (
        "I One hundred n fifty-eight 5 oh o fifty A B",
        11,
    )
    assert keeper(
        "I'll phone number One hundred n fifty-eight not 5 oh o fifty A B more"
    ) == ("One hundred n fifty-eight 5 oh o fifty A B", 10)
    assert keeper(
        "I'm One hundred n fifty-eight not 5 oh o fifty A B more"
    ) == (
        "One hundred n fifty-eight 5 oh o fifty A B",
        10,
    )

    assert keeper(
        "I am One hundred n fifty-eight not 5 oh o fifty A B more"
    ) == (
        "One hundred n fifty-eight 5 oh o fifty A B",
        10,
    )


@pytest.fixture
def random():
    rand.seed(0)