1. add pyaudio dependency
2. fix merge / add eject command with unlink option
3. WIP: MarbleNet VAD
4. add slu_infer UI util
5. fix filter command with max/min duration support
6. logging changes and fixes
tegra
Malar 2021-07-19 15:20:50 +05:30
parent 4bca2097e1
commit 076b0d11e3
10 changed files with 452 additions and 34 deletions

View File

@@ -66,6 +66,7 @@ extra_requirements = {
"pyspellchecker~=0.6.2",
"num2words~=0.5.10",
"pydub~=0.24.0",
"pyaudio~=0.2.11"
],
"infer_min": [
"pyspellchecker~=0.6.2",

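A tiny, hedged sanity check for the new dependency (assuming it is installed through the extra shown above); it only verifies that the PortAudio bindings load and reports how many devices are visible:

import pyaudio  # provided by the pyaudio~=0.2.11 pin above

pa = pyaudio.PyAudio()
print(pa.get_device_count(), "audio devices visible")
pa.terminate()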
View File

@@ -73,7 +73,12 @@ def fix_path(dataset_path: Path, force: bool = False):
@app.command()
def merge(src_dataset_paths: List[Path], dest_dataset_path: Path):
def merge(
src_dataset_paths: List[Path],
dest_dataset_path: Path,
unlink: bool = False,
verbose: bool = True,
):
reader_list = []
abs_manifest_path = Path("abs_manifest.json")
for dataset_path in src_dataset_paths:
@@ -81,7 +86,29 @@ def merge(src_dataset_paths: List[Path], dest_dataset_path: Path):
reader_list.append(asr_manifest_reader(manifest_path))
dest_dataset_path.mkdir(parents=True, exist_ok=True)
dest_manifest_path = dest_dataset_path / abs_manifest_path
asr_manifest_writer(dest_manifest_path, chain(*reader_list))
asr_manifest_writer(
dest_manifest_path, chain(*reader_list), verbose=verbose
)
if unlink:
eject(dest_dataset_path, verbose=verbose)
def eject(dest_dataset_path: Path, verbose: bool = False):
wav_dir = dest_dataset_path / Path("wavs")
wav_dir.mkdir(exist_ok=True, parents=True)
abs_manifest_path = ExtendedPath(
dest_dataset_path / Path("abs_manifest.json")
)
backup_abs_manifest_path = abs_manifest_path.with_suffix(".json.orig")
shutil.copy(abs_manifest_path, backup_abs_manifest_path)
manifest_data = list(abs_manifest_path.read_jsonl())
for md in tqdm(manifest_data) if verbose else manifest_data:
orig_path = Path(md["audio_filepath"])
new_path = wav_dir / Path(orig_path.name)
shutil.copy(orig_path, new_path)
md["audio_filepath"] = str(new_path)
abs_manifest_path.write_jsonl(manifest_data)
fix_path(dest_dataset_path)
@app.command()
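As a rough, standalone sketch of what the new unlink/eject step does (ExtendedPath, tqdm and the typer wiring are left out; the manifest field name follows the code above):

import json
import shutil
from pathlib import Path


def eject_sketch(dest_dataset_path: Path) -> None:
    # copy every referenced wav under <dataset>/wavs and point the manifest at the copies
    wav_dir = dest_dataset_path / "wavs"
    wav_dir.mkdir(parents=True, exist_ok=True)
    manifest_path = dest_dataset_path / "abs_manifest.json"
    shutil.copy(manifest_path, manifest_path.with_suffix(".json.orig"))  # keep a backup
    entries = [
        json.loads(line)
        for line in manifest_path.read_text().splitlines()
        if line.strip()
    ]
    for md in entries:
        src = Path(md["audio_filepath"])
        dst = wav_dir / src.name
        shutil.copy(src, dst)
        md["audio_filepath"] = str(dst)
    manifest_path.write_text("".join(json.dumps(md) + "\n" for md in entries))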
@@ -275,7 +302,7 @@ def encrypt(
src_dataset_path: Path,
dest_dataset_path: Path,
encryption_key: str = typer.Option(..., prompt=True, hide_input=True),
verbose: bool = False,
verbose: bool = True,
):
dest_manifest = dest_dataset_path / Path("manifest.json")
src_manifest = src_dataset_path / Path("manifest.json")
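The key is prompted and hidden at the CLI. Judging from the cryptography.fernet import further down and the gAAAA... token in the filter hunk, the encrypted text fields are Fernet tokens; a minimal round-trip sketch (key handling here is purely illustrative):

from cryptography.fernet import Fernet

key = Fernet.generate_key()           # the CLI prompts for a key instead of generating one
f = Fernet(key)
token = f.encrypt(b"two three four")  # yields an opaque gAAAA... token like the one below
print(f.decrypt(token).decode())      # -> "two three four"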

View File

@@ -1,22 +1,357 @@
import numpy as np
import os
import time
import copy
from omegaconf import OmegaConf
# import time
import copy
import wave
import wget
# from omegaconf import OmegaConf
import matplotlib.pyplot as plt
import librosa.display
import IPython.display as ipd
# import pyaudio as pa
import librosa
import nemo
# import nemo
import nemo.collections.asr as nemo_asr
from nemo.core.classes import IterableDataset
from nemo.core.neural_types import NeuralType, AudioSignal, LengthsType
import torch
from torch.utils.data import DataLoader
# sample rate, Hz
SAMPLE_RATE = 16000
# import pdb; pdb.set_trace()
vad_model = nemo_asr.models.EncDecClassificationModel.from_pretrained(
"vad_marblenet"
# vad_model = nemo_asr.models.EncDecClassificationModel.from_pretrained(
# "vad_marblenet"
# )
# vad_model = nemo_asr.models.EncDecClassificationModel.from_pretrained(
# model_name="MarbleNet-3x2x64-Telephony"
# )
vad_model = nemo_asr.models.EncDecClassificationModel.restore_from(
"/home/malar/work/test/vad_telephony_marblenet.nemo"
)
# vad_model = nemo_asr.models.EncDecClassificationModel.from_pretrained(
# model_name="vad_telephony_marblenet"
# )
# Preserve a copy of the full config
cfg = copy.deepcopy(vad_model._cfg)
# print(OmegaConf.to_yaml(cfg))
vad_model.preprocessor = vad_model.from_config_dict(cfg.preprocessor)
# Set model to inference mode
vad_model.eval()
vad_model = vad_model.to(vad_model.device)
# import pdb; pdb.set_trace()
# simple data layer to pass audio signal
class AudioDataLayer(IterableDataset):
@property
def output_types(self):
return {
"audio_signal": NeuralType(
("B", "T"), AudioSignal(freq=self._sample_rate)
),
"a_sig_length": NeuralType(tuple("B"), LengthsType()),
}
def __init__(self, sample_rate):
super().__init__()
self._sample_rate = sample_rate
self.output = True
def __iter__(self):
return self
def __next__(self):
if not self.output:
raise StopIteration
self.output = False
return (
torch.as_tensor(self.signal, dtype=torch.float32),
torch.as_tensor(self.signal_shape, dtype=torch.int64),
)
def set_signal(self, signal):
self.signal = signal.astype(np.float32) / 32768.0
self.signal_shape = self.signal.size
self.output = True
def __len__(self):
return 1
data_layer = AudioDataLayer(sample_rate=cfg.train_ds.sample_rate)
data_loader = DataLoader(
data_layer, batch_size=1, collate_fn=data_layer.collate_fn
)
# inference method for audio signal (single instance)
def infer_signal(model, signal):
data_layer.set_signal(signal)
batch = next(iter(data_loader))
audio_signal, audio_signal_len = batch
audio_signal, audio_signal_len = (
audio_signal.to(vad_model.device),
audio_signal_len.to(vad_model.device),
)
logits = model.forward(
input_signal=audio_signal, input_signal_length=audio_signal_len
)
return logits
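# logits has shape [batch=1, num_classes]; _greedy_decoder below treats index 1
# as the speech class and index 0 as background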
# class for streaming frame-based VAD
# 1) use reset() method to reset FrameVAD's state
# 2) call transcribe(frame) to do VAD on
# contiguous signal's frames
class FrameVAD:
def __init__(
self,
model_definition,
threshold=0.5,
frame_len=2,
frame_overlap=2.5,
offset=10,
):
"""
Args:
threshold: If prob of speech is larger than threshold, classify the segment to be speech.
frame_len: frame's duration, seconds
frame_overlap: duration of overlaps before and after current frame, seconds
offset: number of symbols to drop for smooth streaming
"""
self.vocab = list(model_definition["labels"])
self.vocab.append("_")
self.sr = model_definition["sample_rate"]
self.threshold = threshold
self.frame_len = frame_len
self.n_frame_len = int(frame_len * self.sr)
self.frame_overlap = frame_overlap
self.n_frame_overlap = int(frame_overlap * self.sr)
timestep_duration = model_definition["AudioToMFCCPreprocessor"][
"window_stride"
]
for block in model_definition["JasperEncoder"]["jasper"]:
timestep_duration *= block["stride"][0] ** block["repeat"]
self.buffer = np.zeros(
shape=2 * self.n_frame_overlap + self.n_frame_len, dtype=np.float32
)
self.offset = offset
self.reset()
def _decode(self, frame, offset=0):
assert len(frame) == self.n_frame_len
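# slide the rolling buffer left by one frame and write the new frame at the end,
# so the model always sees [overlap | current frame | overlap] worth of audio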
self.buffer[: -self.n_frame_len] = self.buffer[self.n_frame_len :]
self.buffer[-self.n_frame_len :] = frame
logits = infer_signal(vad_model, self.buffer).cpu().numpy()[0]
decoded = self._greedy_decoder(self.threshold, logits, self.vocab)
return decoded
@torch.no_grad()
def transcribe(self, frame=None):
if frame is None:
frame = np.zeros(shape=self.n_frame_len, dtype=np.float32)
if len(frame) < self.n_frame_len:
frame = np.pad(
frame, [0, self.n_frame_len - len(frame)], "constant"
)
unmerged = self._decode(frame, self.offset)
return unmerged
def reset(self):
"""
Reset frame_history and decoder's state
"""
self.buffer = np.zeros(shape=self.buffer.shape, dtype=np.float32)
self.prev_char = ""
@staticmethod
def _greedy_decoder(threshold, logits, vocab):
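# returns [pred, label, p(background), p(speech), raw logits] for the current
# window; the frame is classified as speech when p(speech) >= threshold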
s = []
if logits.shape[0]:
probs = torch.softmax(torch.as_tensor(logits), dim=-1)
probas, _ = torch.max(probs, dim=-1)
probas_s = probs[1].item()
preds = 1 if probas_s >= threshold else 0
s = [
preds,
str(vocab[preds]),
probs[0].item(),
probs[1].item(),
str(logits),
]
return s
# WINDOW_SIZE_RANGE = [0.10, 0.15, 0.20, 0.25, 0.30, 0.5, 0.8]
# # STEP_RANGE = [0.01, 0.02, 0.03]
# # WINDOW_SIZE_RANGE = [0.15, 0.20]
# STEP_RANGE = [0.01, 0.02, 0.03]
WINDOW_SIZE_RANGE = [0.15, 0.20, 0.25]
# STEP_RANGE = [0.01, 0.02, 0.03]
# WINDOW_SIZE_RANGE = [0.15, 0.20]
STEP_RANGE = [0.03, 0.05, 0.07, 0.1]
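# repeat each step once per window size so that zip(STEP_LIST, WINDOW_SIZE_LIST)
# below enumerates the full step x window grid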
STEP_LIST = [t for t in STEP_RANGE for _ in WINDOW_SIZE_RANGE]
# STEP_LIST
# STEP_LIST = (
# [0.01] * len(WINDOW_SIZE_RANGE)
# + [0.02] * len(WINDOW_SIZE_RANGE)
# + [0.03] * len(WINDOW_SIZE_RANGE)
# )
WINDOW_SIZE_LIST = WINDOW_SIZE_RANGE * len(STEP_RANGE)
def offline_inference(wave_file, STEP=0.025, WINDOW_SIZE=0.5, threshold=0.5):
FRAME_LEN = STEP # infer every STEP seconds
CHANNELS = 1 # number of audio channels (expect mono signal)
RATE = 16000 # sample rate, Hz
CHUNK_SIZE = int(FRAME_LEN * RATE)
vad = FrameVAD(
model_definition={
"sample_rate": SAMPLE_RATE,
"AudioToMFCCPreprocessor": cfg.preprocessor,
"JasperEncoder": cfg.encoder,
"labels": cfg.labels,
},
threshold=threshold,
frame_len=FRAME_LEN,
frame_overlap=(WINDOW_SIZE - FRAME_LEN) / 2,
offset=0,
)
wf = wave.open(wave_file, "rb")
# p = pa.PyAudio()
empty_counter = 0
preds = []
proba_b = []
proba_s = []
# stream = p.open(
# format=p.get_format_from_width(wf.getsampwidth()),
# channels=CHANNELS,
# rate=RATE,
# output=True,
# )
data = wf.readframes(CHUNK_SIZE)
while len(data) > 0:
data = wf.readframes(CHUNK_SIZE)
signal = np.frombuffer(data, dtype=np.int16)
result = vad.transcribe(signal)
preds.append(result[0])
proba_b.append(result[2])
proba_s.append(result[3])
if len(result):
# print(result, end="\n")
empty_counter = 3
elif empty_counter > 0:
empty_counter -= 1
# if empty_counter == 0:
# print(" ", end="")
# p.terminate()
vad.reset()
return preds, proba_b, proba_s
# demo_wave = "VAD_demo.wav"
# if not os.path.exists(demo_wave):
# wget.download(
# "https://dldata-public.s3.us-east-2.amazonaws.com/VAD_demo.wav",
# demo_wave,
# )
demo_wave = "WAL-1201-cust.wav"
wave_file = demo_wave
CHANNELS = 1
RATE = 16000
audio, sample_rate = librosa.load(wave_file, sr=RATE)
dur = librosa.get_duration(y=audio, sr=sample_rate)  # pass sr; librosa's 22050 Hz default would misreport the duration of 16 kHz audio
print(dur)
threshold = 0.5
results = []
for STEP, WINDOW_SIZE in zip(
STEP_LIST,
WINDOW_SIZE_LIST,
):
print(f"====== STEP is {STEP}s, WINDOW_SIZE is {WINDOW_SIZE}s ====== ")
preds, proba_b, proba_s = offline_inference(
wave_file, STEP, WINDOW_SIZE, threshold
)
results.append([STEP, WINDOW_SIZE, preds, proba_b, proba_s])
plt.figure(figsize=[20, 3*len(STEP_LIST)])
num = len(results)
for i in range(num):
len_pred = len(results[i][2])
FRAME_LEN = results[i][0]
ax1 = plt.subplot(num + 1, 1, i + 1)
ax1.plot(np.arange(audio.size) / sample_rate, audio, "b")
ax1.set_xlim([-0.01, int(dur) + 1])
ax1.tick_params(axis="y", labelcolor="b")
ax1.set_ylabel("Signal")
ax1.set_ylim([-1, 1])
proba_s = results[i][4]
pred = [1 if p > threshold else 0 for p in proba_s]
ax2 = ax1.twinx()
ax2.plot(
np.arange(len_pred) / (1 / results[i][0]),
np.array(pred),
"r",
label="pred",
)
ax2.plot(
np.arange(len_pred) / (1 / results[i][0]),
np.array(proba_s),
"g--",
label="speech prob",
)
ax2.tick_params(axis="y", labelcolor="r")
legend = ax2.legend(loc="lower right", shadow=True)
ax1.set_ylabel("prediction")
ax2.set_title(f"step {results[i][0]}s, buffer size {results[i][1]}s")
ax2.set_ylabel("Preds and Probas")
ax = plt.subplot(num + 1, 1, i + 2)
S = librosa.feature.melspectrogram(
y=audio, sr=sample_rate, n_mels=64, fmax=8000
)
S_dB = librosa.power_to_db(S, ref=np.max)
librosa.display.specshow(
S_dB, x_axis="time", y_axis="mel", sr=sample_rate, fmax=8000
)
ax.set_title("Mel-frequency spectrogram")
ax.grid()
plt.show()
ipd.Audio(data=audio, rate=sample_rate)
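A small follow-up sketch, assuming the preds list returned by offline_inference above: frame i covers roughly [i*STEP, (i+1)*STEP) seconds, so contiguous speech frames can be merged into segments (this helper is illustrative, not part of the commit):

def preds_to_segments(preds, step):
    """Merge consecutive speech frames (pred == 1) into (start_s, end_s) segments."""
    segments, start = [], None
    for i, p in enumerate(preds):
        if p == 1 and start is None:
            start = i * step
        elif p == 0 and start is not None:
            segments.append((start, i * step))
            start = None
    if start is not None:
        segments.append((start, len(preds) * step))
    return segments


# e.g. segments for the first (STEP, WINDOW_SIZE) run:
# preds_to_segments(results[0][2], results[0][0])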

View File

@@ -55,10 +55,8 @@ def export_jasper(src_dataset_path: Path, dest_dataset_path: Path, unlink: bool
out_tsv = dest_dataset_path / Path(o_tsv)
out_ltr = dest_dataset_path / Path(o_ltr)
with out_tsv.open("w") as tsv_f, out_ltr.open("w") as ltr_f:
if unlink:
tsv_f.write(f"{dest_dataset_path}\n")
else:
tsv_f.write(f"{src_dataset_path}\n")
dest_path = dest_dataset_path if unlink else src_dataset_path
tsv_f.write(f"{dest_path}\n")
for md in manifest_data:
audio_fname = md["audio_filepath"]
pipe_toks = replace_redundant_spaces_with(md["text"], "|").upper()

View File

@@ -18,7 +18,9 @@ def ui():
@app.command()
def annotation(data_dir: Path, dump_fname: Path = "ui_dump.json", task_id: str = ""):
def annotation(
data_dir: Path, dump_fname: Path = "ui_dump.json", task_id: str = ""
):
annotation_lit_path = Path(__file__).parent / Path("annotation.py")
if task_id:
sys.argv = [
@@ -83,6 +85,13 @@ def audio(audio_dir: Path):
sys.exit(stcli.main())
@app.command()
def slu_infer():
lit_path = Path(__file__).parent / Path("slu_infer.py")
sys.argv = ["streamlit", "run", str(lit_path)]
sys.exit(stcli.main())
@app.command()
def collection(data_dir: Path, task_id: str = ""):
# TODO: Implement web ui for data collection

src/plume/ui/slu_infer.py · Normal file · 32 additions
View File

@@ -0,0 +1,32 @@
# from pathlib import Path
import streamlit as st
import typer
from plume.utils.transcribe import triton_transcribe_grpc_gen
from plume.utils.audio import audio_wav_bytes_to_seg
app = typer.Typer()
transcriber, prep = triton_transcribe_grpc_gen(
asr_model="slu_num_wav2vec2", method="whole", append_raw=True
)
@app.command()
def main():
st.title("SLU Inference")
audio_file = st.file_uploader("Upload File", type=["wav", "mp3"])
if audio_file:
audio_bytes = audio_file.read()
seg = audio_wav_bytes_to_seg(audio_bytes)
st.audio(audio_bytes)
tscript = transcriber(prep(seg))
st.write(tscript)
if __name__ == "__main__":
try:
app()
except SystemExit:
pass
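# note: this page is meant to be launched through the new `slu_infer` ui command
# above (which effectively runs `streamlit run .../slu_infer.py`); invoking the
# module with plain python only exercises the typer wrapper, and Streamlit
# widgets render properly only under `streamlit run`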

View File

@@ -32,7 +32,11 @@ import six
# from .transcribe import triton_transcribe_grpc_gen
# from .eval import app as eval_app
from .manifest import asr_manifest_writer, manifest_str
from .manifest import (
asr_manifest_writer,
asr_manifest_reader,
manifest_str,
) # noqa
from .lazy_import import lazy_callable, lazy_module
from .parallel import parallel_apply
from .extended_path import ExtendedPath
@@ -430,17 +434,6 @@ def ui_dump_manifest_writer(dataset_dir, asr_data_source, verbose=False):
return num_datapoints
def asr_manifest_reader(data_manifest_path: Path):
print(f"reading manifest from {data_manifest_path}")
with data_manifest_path.open("r") as pf:
data_jsonl = pf.readlines()
data_data = [json.loads(v) for v in data_jsonl]
for p in data_data:
p["audio_path"] = data_manifest_path.parent / Path(p["audio_filepath"])
p["text"] = p["text"].strip()
yield p
def asr_test_writer(out_file_path: Path, source):
def dd_str(dd, idx):
path = dd["audio_filepath"]
@@ -558,11 +551,19 @@ def generate_filter_map(src_dataset_path, dest_dataset_path, data_file):
blank_count += 1
typer.echo(f"filtered {blank_count} of {total_count} blank samples")
def filtered_max_sample_dur():
def filtered_maxmin_sample_dur():
import soundfile
max_dur_count = 0
for s in src_data_enum:
wav_duration = s["duration"]
if wav_duration <= max_sample_dur:
wav_real_duration = soundfile.info(
src_dataset_path / Path(s["audio_filepath"])
).duration
wav_duration = min(wav_real_duration, s["duration"])
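# use the smaller of the manifest duration and the real duration reported by
# soundfile when applying the max/min bounds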
if (
wav_duration <= max_sample_dur
and wav_duration > min_sample_dur
):
shutil.copy(
src_dataset_path / Path(s["audio_filepath"]),
dest_dataset_path / Path(s["audio_filepath"]),
@@ -571,7 +572,7 @@ def generate_filter_map(src_dataset_path, dest_dataset_path, data_file):
else:
max_dur_count += 1
typer.echo(
f"filtered {max_dur_count} samples longer thans {max_sample_dur}s"
f"filtered {max_dur_count} samples longer thans {max_sample_dur}s and shorter than {min_sample_dur}s"
)
def filtered_transform_digits():
@@ -641,7 +642,9 @@ def generate_filter_map(src_dataset_path, dest_dataset_path, data_file):
wav_duration = 0
for s in src_data_enum:
# nums = re.sub(" ", "", s["text"])
s["text"] = "gAAAAABgq2FR6ajbhMsDmWRQBzX6gIzyAG5sMwFihGeV7E_6eVJqqF78yzmtTJPsJAOJEEXhJ9Z45MrYNgE1sq7VUdsBVGh2cw=="
s[
"text"
] = "gAAAAABgq2FR6ajbhMsDmWRQBzX6gIzyAG5sMwFihGeV7E_6eVJqqF78yzmtTJPsJAOJEEXhJ9Z45MrYNgE1sq7VUdsBVGh2cw=="
if (
s["duration"] >= min_sample_dur
and s["duration"] <= max_sample_dur
@@ -663,7 +666,7 @@ def generate_filter_map(src_dataset_path, dest_dataset_path, data_file):
"transform_digits": filtered_transform_digits,
"extract_chars": filtered_extract_chars,
"resample_ulaw24kmono": filtered_resample,
"max_sample_dur": filtered_max_sample_dur,
"maxmin_sample_dur": filtered_maxmin_sample_dur,
"msec_to_sec": filtered_msec_to_sec,
"blank_3hr_max_dur": filtered_blank_hr_max_dur,
}

View File

@@ -13,7 +13,7 @@ from .audio import audio_seg_to_wav_bytes, audio_wav_bytes_to_seg
from .parallel import parallel_apply
from .lazy_import import lazy_module
cryptography = lazy_module("cryptography")
cryptography = lazy_module("cryptography.fernet", level='base')
# cryptography.fernet = lazy_module("cryptography.fernet")
pydub = lazy_module("pydub")

View File

@@ -13,6 +13,17 @@ def manifest_str(path, dur, text):
return json.dumps(k) + "\n"
def asr_manifest_reader(data_manifest_path: Path):
print(f"reading manifest from {data_manifest_path}")
with data_manifest_path.open("r") as pf:
data_jsonl = pf.readlines()
data_data = [json.loads(v) for v in data_jsonl]
for p in data_data:
p["audio_path"] = data_manifest_path.parent / Path(p["audio_filepath"])
p["text"] = p["text"].strip()
yield p
def asr_manifest_writer(
asr_manifest_path: Path, manifest_str_source, verbose=False
):
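For reference, a rough sketch of a single manifest line these helpers exchange (field names follow asr_manifest_reader above; the values are made up):

import json

line = json.dumps(
    {"audio_filepath": "wavs/utt_0001.wav", "duration": 2.31, "text": "two three four"}
)
print(line)
# asr_manifest_reader later resolves "audio_filepath" relative to the manifest's
# directory (as "audio_path") and whitespace-strips "text"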

View File

@@ -104,6 +104,8 @@ def triton_transcribe_grpc_gen(
if len(outputs) > 1 and append_raw:
transcript = transcript + "|" + outputs[1].decode("utf-8")
except InferenceServerException:
import traceback
traceback.print_exc()
transcript = "[server error]"
return transcript
@@ -146,7 +148,7 @@ def triton_transcribe_grpc_gen(
@app.command()
def file(
def audio_file(
audio_file: Path,
write_file: bool = False,
chunked: bool = False,