1. add pyaudio dependency
2. fix merge / add eject command with unlink option
3. wip - marblenet vad
4. add slu_infer ui util
5. fix filter command with maxmin support
6. some logging changes and fixes
parent
4bca2097e1
commit
076b0d11e3
setup.py

@@ -66,6 +66,7 @@ extra_requirements = {
         "pyspellchecker~=0.6.2",
         "num2words~=0.5.10",
         "pydub~=0.24.0",
+        "pyaudio~=0.2.11"
     ],
     "infer_min": [
         "pyspellchecker~=0.6.2",
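For context, the extras table this hunk edits ends up shaped roughly as below; the name of the first dependency group is not visible in the hunk, so "infer" here is an assumption. Note that pyaudio builds against the system PortAudio library, so its headers must be present at install time.

    # sketch of the setup.py extras after this change; "infer" is an assumed group name
    extra_requirements = {
        "infer": [
            "pyspellchecker~=0.6.2",
            "num2words~=0.5.10",
            "pydub~=0.24.0",
            "pyaudio~=0.2.11",  # new: requires the system PortAudio library
        ],
        "infer_min": [
            "pyspellchecker~=0.6.2",
            ...,  # remaining entries elided
        ],
    }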
@@ -73,7 +73,12 @@ def fix_path(dataset_path: Path, force: bool = False):
 
 
 @app.command()
-def merge(src_dataset_paths: List[Path], dest_dataset_path: Path):
+def merge(
+    src_dataset_paths: List[Path],
+    dest_dataset_path: Path,
+    unlink: bool = False,
+    verbose: bool = True,
+):
     reader_list = []
     abs_manifest_path = Path("abs_manifest.json")
     for dataset_path in src_dataset_paths:
@@ -81,7 +86,29 @@ def merge(src_dataset_paths: List[Path], dest_dataset_path: Path):
         reader_list.append(asr_manifest_reader(manifest_path))
     dest_dataset_path.mkdir(parents=True, exist_ok=True)
     dest_manifest_path = dest_dataset_path / abs_manifest_path
-    asr_manifest_writer(dest_manifest_path, chain(*reader_list))
+    asr_manifest_writer(
+        dest_manifest_path, chain(*reader_list), verbose=verbose
+    )
+    if unlink:
+        eject(dest_dataset_path, verbose=verbose)
+
+
+def eject(dest_dataset_path: Path, verbose: bool = False):
+    wav_dir = dest_dataset_path / Path("wavs")
+    wav_dir.mkdir(exist_ok=True, parents=True)
+    abs_manifest_path = ExtendedPath(
+        dest_dataset_path / Path("abs_manifest.json")
+    )
+    backup_abs_manifest_path = abs_manifest_path.with_suffix(".json.orig")
+    shutil.copy(abs_manifest_path, backup_abs_manifest_path)
+    manifest_data = list(abs_manifest_path.read_jsonl())
+    for md in tqdm(manifest_data) if verbose else manifest_data:
+        orig_path = Path(md["audio_filepath"])
+        new_path = wav_dir / Path(orig_path.name)
+        shutil.copy(orig_path, new_path)
+        md["audio_filepath"] = str(new_path)
+    abs_manifest_path.write_jsonl(manifest_data)
+    fix_path(dest_dataset_path)
 
 
 @app.command()
@@ -275,7 +302,7 @@ def encrypt(
     src_dataset_path: Path,
     dest_dataset_path: Path,
     encryption_key: str = typer.Option(..., prompt=True, hide_input=True),
-    verbose: bool = False,
+    verbose: bool = True,
 ):
     dest_manifest = dest_dataset_path / Path("manifest.json")
     src_manifest = src_dataset_path / Path("manifest.json")
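The unlink flow reads end to end as: merge concatenates the source manifests, then eject copies every referenced wav under <dest>/wavs, rewrites abs_manifest.json in place (keeping a .json.orig backup), and re-runs fix_path. A minimal sketch of driving it, assuming the module above is importable; the dataset paths are hypothetical:

    from pathlib import Path

    merge(
        src_dataset_paths=[Path("data/part_a"), Path("data/part_b")],  # hypothetical
        dest_dataset_path=Path("data/combined"),
        unlink=True,   # localize wavs via eject() and relink the manifest
        verbose=True,  # tqdm progress and verbose manifest writing
    )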
@@ -1,22 +1,357 @@
 import numpy as np
 import os
-import time
+# import time
 import copy
+import wave
+import wget
 
-from omegaconf import OmegaConf
+# from omegaconf import OmegaConf
 
 import matplotlib.pyplot as plt
+import librosa.display
 
 import IPython.display as ipd
 
 # import pyaudio as pa
 import librosa
-import nemo
+# import nemo
 import nemo.collections.asr as nemo_asr
+from nemo.core.classes import IterableDataset
+from nemo.core.neural_types import NeuralType, AudioSignal, LengthsType
+import torch
+from torch.utils.data import DataLoader
 
 # sample rate, Hz
 SAMPLE_RATE = 16000
+# import pdb; pdb.set_trace()
 
-vad_model = nemo_asr.models.EncDecClassificationModel.from_pretrained(
-    "vad_marblenet"
+# vad_model = nemo_asr.models.EncDecClassificationModel.from_pretrained(
+#     "vad_marblenet"
+# )
+# vad_model = nemo_asr.models.EncDecClassificationModel.from_pretrained(
+#     model_name="MarbleNet-3x2x64-Telephony"
+# )
+vad_model = nemo_asr.models.EncDecClassificationModel.restore_from(
+    "/home/malar/work/test/vad_telephony_marblenet.nemo"
 )
+# vad_model = nemo_asr.models.EncDecClassificationModel.from_pretrained(
+#     model_name="vad_telephony_marblenet"
+# )
 # Preserve a copy of the full config
 cfg = copy.deepcopy(vad_model._cfg)
 # print(OmegaConf.to_yaml(cfg))
+
+vad_model.preprocessor = vad_model.from_config_dict(cfg.preprocessor)
+
+# Set model to inference mode
+vad_model.eval()
+vad_model = vad_model.to(vad_model.device)
+# import pdb; pdb.set_trace()
+
+
+# simple data layer to pass audio signal
+class AudioDataLayer(IterableDataset):
+    @property
+    def output_types(self):
+        return {
+            "audio_signal": NeuralType(
+                ("B", "T"), AudioSignal(freq=self._sample_rate)
+            ),
+            "a_sig_length": NeuralType(tuple("B"), LengthsType()),
+        }
+
+    def __init__(self, sample_rate):
+        super().__init__()
+        self._sample_rate = sample_rate
+        self.output = True
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        if not self.output:
+            raise StopIteration
+        self.output = False
+        return (
+            torch.as_tensor(self.signal, dtype=torch.float32),
+            torch.as_tensor(self.signal_shape, dtype=torch.int64),
+        )
+
+    def set_signal(self, signal):
+        self.signal = signal.astype(np.float32) / 32768.0
+        self.signal_shape = self.signal.size
+        self.output = True
+
+    def __len__(self):
+        return 1
+
+
+data_layer = AudioDataLayer(sample_rate=cfg.train_ds.sample_rate)
+data_loader = DataLoader(
+    data_layer, batch_size=1, collate_fn=data_layer.collate_fn
+)
+
+
+# inference method for audio signal (single instance)
+def infer_signal(model, signal):
+    data_layer.set_signal(signal)
+    batch = next(iter(data_loader))
+    audio_signal, audio_signal_len = batch
+    audio_signal, audio_signal_len = (
+        audio_signal.to(vad_model.device),
+        audio_signal_len.to(vad_model.device),
+    )
+    logits = model.forward(
+        input_signal=audio_signal, input_signal_length=audio_signal_len
+    )
+    return logits
+
+
+# class for streaming frame-based VAD
+# 1) use reset() method to reset FrameVAD's state
+# 2) call transcribe(frame) to do VAD on
+#    contiguous signal's frames
+class FrameVAD:
+    def __init__(
+        self,
+        model_definition,
+        threshold=0.5,
+        frame_len=2,
+        frame_overlap=2.5,
+        offset=10,
+    ):
+        """
+        Args:
+          threshold: If prob of speech is larger than threshold, classify the segment to be speech.
+          frame_len: frame's duration, seconds
+          frame_overlap: duration of overlaps before and after current frame, seconds
+          offset: number of symbols to drop for smooth streaming
+        """
+        self.vocab = list(model_definition["labels"])
+        self.vocab.append("_")
+
+        self.sr = model_definition["sample_rate"]
+        self.threshold = threshold
+        self.frame_len = frame_len
+        self.n_frame_len = int(frame_len * self.sr)
+        self.frame_overlap = frame_overlap
+        self.n_frame_overlap = int(frame_overlap * self.sr)
+        timestep_duration = model_definition["AudioToMFCCPreprocessor"][
+            "window_stride"
+        ]
+        for block in model_definition["JasperEncoder"]["jasper"]:
+            timestep_duration *= block["stride"][0] ** block["repeat"]
+        self.buffer = np.zeros(
+            shape=2 * self.n_frame_overlap + self.n_frame_len, dtype=np.float32
+        )
+        self.offset = offset
+        self.reset()
+
+    def _decode(self, frame, offset=0):
+        assert len(frame) == self.n_frame_len
+        self.buffer[: -self.n_frame_len] = self.buffer[self.n_frame_len :]
+        self.buffer[-self.n_frame_len :] = frame
+        logits = infer_signal(vad_model, self.buffer).cpu().numpy()[0]
+        decoded = self._greedy_decoder(self.threshold, logits, self.vocab)
+        return decoded
+
+    @torch.no_grad()
+    def transcribe(self, frame=None):
+        if frame is None:
+            frame = np.zeros(shape=self.n_frame_len, dtype=np.float32)
+        if len(frame) < self.n_frame_len:
+            frame = np.pad(
+                frame, [0, self.n_frame_len - len(frame)], "constant"
+            )
+        unmerged = self._decode(frame, self.offset)
+        return unmerged
+
+    def reset(self):
+        """
+        Reset frame_history and decoder's state
+        """
+        self.buffer = np.zeros(shape=self.buffer.shape, dtype=np.float32)
+        self.prev_char = ""
+
+    @staticmethod
+    def _greedy_decoder(threshold, logits, vocab):
+        s = []
+        if logits.shape[0]:
+            probs = torch.softmax(torch.as_tensor(logits), dim=-1)
+            probas, _ = torch.max(probs, dim=-1)
+            probas_s = probs[1].item()
+            preds = 1 if probas_s >= threshold else 0
+            s = [
+                preds,
+                str(vocab[preds]),
+                probs[0].item(),
+                probs[1].item(),
+                str(logits),
+            ]
+        return s
+
+
+# WINDOW_SIZE_RANGE = [0.10, 0.15, 0.20, 0.25, 0.30, 0.5, 0.8]
+# # STEP_RANGE = [0.01, 0.02, 0.03]
+# # WINDOW_SIZE_RANGE = [0.15, 0.20]
+# STEP_RANGE = [0.01, 0.02, 0.03]
+WINDOW_SIZE_RANGE = [0.15, 0.20, 0.25]
+# STEP_RANGE = [0.01, 0.02, 0.03]
+# WINDOW_SIZE_RANGE = [0.15, 0.20]
+STEP_RANGE = [0.03, 0.05, 0.07, 0.1]
+STEP_LIST = [r for t in STEP_RANGE for r in [t] * len(WINDOW_SIZE_RANGE)]
+# STEP_LIST
+# STEP_LIST = (
+#     [0.01] * len(WINDOW_SIZE_RANGE)
+#     + [0.02] * len(WINDOW_SIZE_RANGE)
+#     + [0.03] * len(WINDOW_SIZE_RANGE)
+# )
+WINDOW_SIZE_LIST = WINDOW_SIZE_RANGE * len(STEP_RANGE)
+
+
+def offline_inference(wave_file, STEP=0.025, WINDOW_SIZE=0.5, threshold=0.5):
+
+    FRAME_LEN = STEP  # infer every STEP seconds
+    CHANNELS = 1  # number of audio channels (expect mono signal)
+    RATE = 16000  # sample rate, Hz
+
+    CHUNK_SIZE = int(FRAME_LEN * RATE)
+
+    vad = FrameVAD(
+        model_definition={
+            "sample_rate": SAMPLE_RATE,
+            "AudioToMFCCPreprocessor": cfg.preprocessor,
+            "JasperEncoder": cfg.encoder,
+            "labels": cfg.labels,
+        },
+        threshold=threshold,
+        frame_len=FRAME_LEN,
+        frame_overlap=(WINDOW_SIZE - FRAME_LEN) / 2,
+        offset=0,
+    )
+
+    wf = wave.open(wave_file, "rb")
+    # p = pa.PyAudio()
+
+    empty_counter = 0
+
+    preds = []
+    proba_b = []
+    proba_s = []
+
+    # stream = p.open(
+    #     format=p.get_format_from_width(wf.getsampwidth()),
+    #     channels=CHANNELS,
+    #     rate=RATE,
+    #     output=True,
+    # )
+
+    data = wf.readframes(CHUNK_SIZE)
+
+    while len(data) > 0:
+
+        data = wf.readframes(CHUNK_SIZE)
+        signal = np.frombuffer(data, dtype=np.int16)
+        result = vad.transcribe(signal)
+
+        preds.append(result[0])
+        proba_b.append(result[2])
+        proba_s.append(result[3])
+
+        if len(result):
+            # print(result, end="\n")
+            empty_counter = 3
+        elif empty_counter > 0:
+            empty_counter -= 1
+            # if empty_counter == 0:
+            #     print(" ", end="")
+
+    # p.terminate()
+    vad.reset()
+
+    return preds, proba_b, proba_s
+
+
+# demo_wave = "VAD_demo.wav"
+# if not os.path.exists(demo_wave):
+#     wget.download(
+#         "https://dldata-public.s3.us-east-2.amazonaws.com/VAD_demo.wav",
+#         demo_wave,
+#     )
+
+demo_wave = "WAL-1201-cust.wav"
+
+
+wave_file = demo_wave
+
+CHANNELS = 1
+RATE = 16000
+audio, sample_rate = librosa.load(wave_file, sr=RATE)
+dur = librosa.get_duration(audio)
+print(dur)
+
+
+threshold = 0.5
+
+results = []
+for STEP, WINDOW_SIZE in zip(
+    STEP_LIST,
+    WINDOW_SIZE_LIST,
+):
+    print(f"====== STEP is {STEP}s, WINDOW_SIZE is {WINDOW_SIZE}s ====== ")
+    preds, proba_b, proba_s = offline_inference(
+        wave_file, STEP, WINDOW_SIZE, threshold
+    )
+    results.append([STEP, WINDOW_SIZE, preds, proba_b, proba_s])
+
+
+plt.figure(figsize=[20, 3 * len(STEP_LIST)])
+
+num = len(results)
+for i in range(num):
+    len_pred = len(results[i][2])
+    FRAME_LEN = results[i][0]
+    ax1 = plt.subplot(num + 1, 1, i + 1)
+
+    ax1.plot(np.arange(audio.size) / sample_rate, audio, "b")
+    ax1.set_xlim([-0.01, int(dur) + 1])
+    ax1.tick_params(axis="y", labelcolor="b")
+    ax1.set_ylabel("Signal")
+    ax1.set_ylim([-1, 1])
+
+    proba_s = results[i][4]
+    pred = [1 if p > threshold else 0 for p in proba_s]
+    ax2 = ax1.twinx()
+    ax2.plot(
+        np.arange(len_pred) / (1 / results[i][0]),
+        np.array(pred),
+        "r",
+        label="pred",
+    )
+    ax2.plot(
+        np.arange(len_pred) / (1 / results[i][0]),
+        np.array(proba_s),
+        "g--",
+        label="speech prob",
+    )
+    ax2.tick_params(axis="y", labelcolor="r")
+    legend = ax2.legend(loc="lower right", shadow=True)
+    ax1.set_ylabel("prediction")
+
+    ax2.set_title(f"step {results[i][0]}s, buffer size {results[i][1]}s")
+    ax2.set_ylabel("Preds and Probas")
+
+    ax = plt.subplot(num + 1, 1, i + 2)
+    S = librosa.feature.melspectrogram(
+        y=audio, sr=sample_rate, n_mels=64, fmax=8000
+    )
+    S_dB = librosa.power_to_db(S, ref=np.max)
+    librosa.display.specshow(
+        S_dB, x_axis="time", y_axis="mel", sr=sample_rate, fmax=8000
+    )
+    ax.set_title("Mel-frequency spectrogram")
+    ax.grid()
+plt.show()
+ipd.Audio(data=audio, rate=sample_rate)
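To make the streaming interface concrete, here is a minimal sketch of driving FrameVAD one frame at a time, assuming the script above is importable as a module; the one-second noise signal stands in for real audio:

    import numpy as np

    step, window = 0.05, 0.20  # seconds per inference / total context window
    n_frame = int(step * SAMPLE_RATE)

    vad = FrameVAD(
        model_definition={
            "sample_rate": SAMPLE_RATE,
            "AudioToMFCCPreprocessor": cfg.preprocessor,
            "JasperEncoder": cfg.encoder,
            "labels": cfg.labels,
        },
        threshold=0.5,
        frame_len=step,
        frame_overlap=(window - step) / 2,
        offset=0,
    )

    signal = (np.random.randn(SAMPLE_RATE) * 1000).astype(np.int16)  # 1 s of noise
    for start in range(0, len(signal), n_frame):
        # transcribe() pads the last short frame to n_frame_len internally
        result = vad.transcribe(signal[start : start + n_frame])
        print(result[:4])  # [pred, label, background prob, speech prob]
    vad.reset()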
@@ -55,10 +55,8 @@ def export_jasper(src_dataset_path: Path, dest_dataset_path: Path, unlink: bool
     out_tsv = dest_dataset_path / Path(o_tsv)
     out_ltr = dest_dataset_path / Path(o_ltr)
     with out_tsv.open("w") as tsv_f, out_ltr.open("w") as ltr_f:
-        if unlink:
-            tsv_f.write(f"{dest_dataset_path}\n")
-        else:
-            tsv_f.write(f"{src_dataset_path}\n")
+        dest_path = dest_dataset_path if unlink else src_dataset_path
+        tsv_f.write(f"{dest_path}\n")
         for md in manifest_data:
             audio_fname = md["audio_filepath"]
             pipe_toks = replace_redundant_spaces_with(md["text"], "|").upper()
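The collapsed conditional only changes which root path lands on the first line of the .tsv manifest. The letter transcript written alongside it goes through replace_redundant_spaces_with; its definition is not shown in this diff, so the stand-in below only assumes the behavior its name and call site suggest:

    def replace_redundant_spaces_with(text: str, sep: str) -> str:
        # assumed behavior: collapse whitespace runs and join words with `sep`
        return sep.join(text.split())

    print(replace_redundant_spaces_with("twenty  five dollars", "|").upper())
    # TWENTY|FIVE|DOLLARS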
@@ -18,7 +18,9 @@ def ui():
 
 
 @app.command()
-def annotation(data_dir: Path, dump_fname: Path = "ui_dump.json", task_id: str = ""):
+def annotation(
+    data_dir: Path, dump_fname: Path = "ui_dump.json", task_id: str = ""
+):
     annotation_lit_path = Path(__file__).parent / Path("annotation.py")
     if task_id:
         sys.argv = [
@@ -83,6 +85,13 @@ def audio(audio_dir: Path):
     sys.exit(stcli.main())
 
 
+@app.command()
+def slu_infer():
+    lit_path = Path(__file__).parent / Path("slu_infer.py")
+    sys.argv = ["streamlit", "run", str(lit_path)]
+    sys.exit(stcli.main())
+
+
 @app.command()
 def collection(data_dir: Path, task_id: str = ""):
     # TODO: Implement web ui for data collection
@@ -0,0 +1,32 @@
+# from pathlib import Path
+
+import streamlit as st
+import typer
+
+from plume.utils.transcribe import triton_transcribe_grpc_gen
+from plume.utils.audio import audio_wav_bytes_to_seg
+
+app = typer.Typer()
+
+transcriber, prep = triton_transcribe_grpc_gen(
+    asr_model="slu_num_wav2vec2", method="whole", append_raw=True
+)
+
+
+@app.command()
+def main():
+    st.title("SLU Inference")
+    audio_file = st.file_uploader("Upload File", type=["wav", "mp3"])
+    if audio_file:
+        audio_bytes = audio_file.read()
+        seg = audio_wav_bytes_to_seg(audio_bytes)
+        st.audio(audio_bytes)
+        tscript = transcriber(prep(seg))
+        st.write(tscript)
+
+
+if __name__ == "__main__":
+    try:
+        app()
+    except SystemExit:
+        pass
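Outside streamlit, the generated transcriber pair can be exercised directly; a sketch assuming a reachable Triton server with the slu_num_wav2vec2 model deployed and a hypothetical local sample.wav:

    from pathlib import Path

    from plume.utils.transcribe import triton_transcribe_grpc_gen
    from plume.utils.audio import audio_wav_bytes_to_seg

    transcriber, prep = triton_transcribe_grpc_gen(
        asr_model="slu_num_wav2vec2", method="whole", append_raw=True
    )
    seg = audio_wav_bytes_to_seg(Path("sample.wav").read_bytes())  # hypothetical file
    # with append_raw=True, the raw decode is appended after a "|" separator
    print(transcriber(prep(seg)))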
@@ -32,7 +32,11 @@ import six
 
 # from .transcribe import triton_transcribe_grpc_gen
 # from .eval import app as eval_app
-from .manifest import asr_manifest_writer, manifest_str
+from .manifest import (
+    asr_manifest_writer,
+    asr_manifest_reader,
+    manifest_str,
+)  # noqa
 from .lazy_import import lazy_callable, lazy_module
 from .parallel import parallel_apply
 from .extended_path import ExtendedPath
@@ -430,17 +434,6 @@ def ui_dump_manifest_writer(dataset_dir, asr_data_source, verbose=False):
     return num_datapoints
 
 
-def asr_manifest_reader(data_manifest_path: Path):
-    print(f"reading manifest from {data_manifest_path}")
-    with data_manifest_path.open("r") as pf:
-        data_jsonl = pf.readlines()
-    data_data = [json.loads(v) for v in data_jsonl]
-    for p in data_data:
-        p["audio_path"] = data_manifest_path.parent / Path(p["audio_filepath"])
-        p["text"] = p["text"].strip()
-        yield p
-
-
 def asr_test_writer(out_file_path: Path, source):
     def dd_str(dd, idx):
         path = dd["audio_filepath"]
@@ -558,11 +551,19 @@ def generate_filter_map(src_dataset_path, dest_dataset_path, data_file):
             blank_count += 1
         typer.echo(f"filtered {blank_count} of {total_count} blank samples")
 
-    def filtered_max_sample_dur():
+    def filtered_maxmin_sample_dur():
+        import soundfile
+
         max_dur_count = 0
         for s in src_data_enum:
-            wav_duration = s["duration"]
-            if wav_duration <= max_sample_dur:
+            wav_real_duration = soundfile.info(
+                src_dataset_path / Path(s["audio_filepath"])
+            ).duration
+            wav_duration = min(wav_real_duration, s["duration"])
+            if (
+                wav_duration <= max_sample_dur
+                and wav_duration > min_sample_dur
+            ):
                 shutil.copy(
                     src_dataset_path / Path(s["audio_filepath"]),
                     dest_dataset_path / Path(s["audio_filepath"]),
@@ -571,7 +572,7 @@ def generate_filter_map(src_dataset_path, dest_dataset_path, data_file):
             else:
                 max_dur_count += 1
         typer.echo(
-            f"filtered {max_dur_count} samples longer thans {max_sample_dur}s"
+            f"filtered {max_dur_count} samples longer than {max_sample_dur}s or shorter than {min_sample_dur}s"
         )
 
     def filtered_transform_digits():
@@ -641,7 +642,9 @@ def generate_filter_map(src_dataset_path, dest_dataset_path, data_file):
         wav_duration = 0
         for s in src_data_enum:
             # nums = re.sub(" ", "", s["text"])
-            s["text"] = "gAAAAABgq2FR6ajbhMsDmWRQBzX6gIzyAG5sMwFihGeV7E_6eVJqqF78yzmtTJPsJAOJEEXhJ9Z45MrYNgE1sq7VUdsBVGh2cw=="
+            s[
+                "text"
+            ] = "gAAAAABgq2FR6ajbhMsDmWRQBzX6gIzyAG5sMwFihGeV7E_6eVJqqF78yzmtTJPsJAOJEEXhJ9Z45MrYNgE1sq7VUdsBVGh2cw=="
             if (
                 s["duration"] >= min_sample_dur
                 and s["duration"] <= max_sample_dur
@@ -663,7 +666,7 @@ def generate_filter_map(src_dataset_path, dest_dataset_path, data_file):
         "transform_digits": filtered_transform_digits,
         "extract_chars": filtered_extract_chars,
         "resample_ulaw24kmono": filtered_resample,
-        "max_sample_dur": filtered_max_sample_dur,
+        "maxmin_sample_dur": filtered_maxmin_sample_dur,
         "msec_to_sec": filtered_msec_to_sec,
        "blank_3hr_max_dur": filtered_blank_hr_max_dur,
     }
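The renamed filter now cross-checks the manifest's claimed duration against the audio actually on disk before applying both bounds; in isolation the guard works like this (the entry and bounds below are made up):

    import soundfile

    s = {"audio_filepath": "wavs/utt_001.wav", "duration": 4.2}  # hypothetical entry
    min_sample_dur, max_sample_dur = 0.5, 15.0                   # hypothetical bounds

    # trust whichever duration is smaller: manifest claim vs on-disk audio
    wav_real_duration = soundfile.info(s["audio_filepath"]).duration
    wav_duration = min(wav_real_duration, s["duration"])
    keep = min_sample_dur < wav_duration <= max_sample_dur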
@@ -13,7 +13,7 @@ from .audio import audio_seg_to_wav_bytes, audio_wav_bytes_to_seg
 from .parallel import parallel_apply
 from .lazy_import import lazy_module
 
-cryptography = lazy_module("cryptography")
+cryptography = lazy_module("cryptography.fernet", level='base')
 # cryptography.fernet = lazy_module("cryptography.fernet")
 pydub = lazy_module("pydub")
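With lazy_import, passing the dotted submodule path with level='base' returns the top-level package while also registering the submodule for deferred loading, so cryptography.fernet resolves on first attribute access rather than at import time. A sketch of the intended effect, using the upstream lazy_import package directly:

    from lazy_import import lazy_module

    cryptography = lazy_module("cryptography.fernet", level="base")
    # nothing has actually been imported yet; the first real use triggers it:
    key = cryptography.fernet.Fernet.generate_key()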
@@ -13,6 +13,17 @@ def manifest_str(path, dur, text):
     return json.dumps(k) + "\n"
 
 
+def asr_manifest_reader(data_manifest_path: Path):
+    print(f"reading manifest from {data_manifest_path}")
+    with data_manifest_path.open("r") as pf:
+        data_jsonl = pf.readlines()
+    data_data = [json.loads(v) for v in data_jsonl]
+    for p in data_data:
+        p["audio_path"] = data_manifest_path.parent / Path(p["audio_filepath"])
+        p["text"] = p["text"].strip()
+        yield p
+
+
 def asr_manifest_writer(
     asr_manifest_path: Path, manifest_str_source, verbose=False
 ):
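With the reader relocated next to the writer, a manifest round trip mirrors what the merge command does; a minimal sketch with an assumed module path and hypothetical file layout:

    from pathlib import Path

    from plume.utils.manifest import asr_manifest_reader, asr_manifest_writer

    src = Path("dataset_a/manifest.json")   # hypothetical
    dst = Path("merged/abs_manifest.json")  # hypothetical
    dst.parent.mkdir(parents=True, exist_ok=True)
    asr_manifest_writer(dst, asr_manifest_reader(src), verbose=True)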
@@ -104,6 +104,8 @@ def triton_transcribe_grpc_gen(
         if len(outputs) > 1 and append_raw:
             transcript = transcript + "|" + outputs[1].decode("utf-8")
     except InferenceServerException:
+        import traceback
+        traceback.print_exc()
         transcript = "[server error]"
     return transcript
@@ -146,7 +148,7 @@ def triton_transcribe_grpc_gen(
 
 
 @app.command()
-def file(
+def audio_file(
     audio_file: Path,
     write_file: bool = False,
     chunked: bool = False,