diff --git a/README.md b/README.md
index 9aa7754..d95808d 100644
--- a/README.md
+++ b/README.md
@@ -7,10 +7,16 @@
 
 # Table of Contents
 
+* [Prerequisites](#prerequisites)
 * [Features](#features)
 * [Installation](#installation)
 * [Usage](#usage)
 
+# Prerequisites
+```bash
+# apt install libsndfile-dev ffmpeg
+```
+
 # Features
 
 * ASR using Jasper (from [NemoToolkit](https://github.com/NVIDIA/NeMo) )
diff --git a/jasper/data/rastrik_recycler.py b/jasper/data/rastrik_recycler.py
index e051551..6093b18 100644
--- a/jasper/data/rastrik_recycler.py
+++ b/jasper/data/rastrik_recycler.py
@@ -1,8 +1,7 @@
-from rastrik.proto.callrecord_pb2 import CallRecordEvent, CallRecord
+from rastrik.proto.callrecord_pb2 import CallRecord
 import gzip
 from pydub import AudioSegment
-import json
-from .utils import ExtendedPath, asr_data_writer, strip_silence
+from .utils import ui_dump_manifest_writer, strip_silence
 import typer
 from itertools import chain
 
@@ -11,127 +10,84 @@ from pathlib import Path
 
 app = typer.Typer()
 
+
 @app.command()
 def extract_manifest(
-    call_audio_dir: Path = Path("./data/call_audio"),
-    call_meta_dir: Path = Path("./data/call_metadata"),
+    call_log_dir: Path = Path("./data/call_audio"),
     output_dir: Path = Path("./data"),
     dataset_name: str = "grassroot_pizzahut_v1",
+    caller_name: str = "grassroot",
     verbose: bool = False,
 ):
-
-
     call_asr_data: Path = output_dir / Path("asr_data")
     call_asr_data.mkdir(exist_ok=True, parents=True)
 
-    """
-    def read_event_old(log_file,audio_file):
-        call_wav = AudioSegment.from_wav(audio_file)
-        call_wav_0, call_wav_1 = call_wav.split_to_mono()
-        with gzip.open(log_file, "rb") as log_h:
-            record_data = log_h.read()
-        cr = CallRecord()
-        cr.ParseFromString(record_data)
-        import pdb
-        first_audio_event_timestamp = next ((i
-                for i in cr.events
-                if i.WhichOneof("event_type") == "call_event"
-                and i.call_event.WhichOneof("event_type") == "call_audio"
-                )).timestamp.ToDatetime()
-
-        speech_events = [ i
-                for i in cr.events
-                if i.WhichOneof("event_type") == "asr_result"
-            ]
-        previous_event_timestamp = first_audio_event_timestamp - first_audio_event_timestamp
-        for index,each_speech_events in enumerate(speech_events):
-            asr_final = each_speech_events.asr_result.text
-            speech_timestamp = each_speech_events.timestamp.ToDatetime()
-            actual_timestamp = speech_timestamp - first_audio_event_timestamp
-            print(previous_event_timestamp.total_seconds(),actual_timestamp.total_seconds(),asr_final)
-            start_time = previous_event_timestamp.total_seconds()*1000
-            end_time = actual_timestamp.total_seconds() * 1000
-            audio_segment = strip_silence(call_wav_1[start_time:end_time])
-            audio_segment.export(output_folder+str(index) + '.wav' ,format='wav')
-            previous_event_timestamp = actual_timestamp
-    """
-
-    def wav_pb2_generator(call_audio_dir):
-        for wav_path in call_audio_dir.glob("**/*.wav"):
+    def wav_pb2_generator(log_dir):
+        for wav_path in log_dir.glob("**/*.wav"):
             if verbose:
                 typer.echo(f"loading events for file {wav_path}")
             call_wav = AudioSegment.from_file_using_temporary_files(wav_path)
-            rel_meta_path = wav_path.with_suffix(".pb2.gz").relative_to(call_audio_dir)
-            meta_path = call_meta_dir / rel_meta_path
-            #events = ExtendedPath(meta_path).read_json()
-            yield call_wav,wav_path, meta_path
+            meta_path = wav_path.with_suffix(".pb2.gz")
+            yield call_wav, wav_path, meta_path
 
-    def read_event(call_wav,log_file):
-        #call_wav = AudioSegment.from_wav(audio_file)
+    def read_event(call_wav, log_file):
         call_wav_0, call_wav_1 = call_wav.split_to_mono()
         with gzip.open(log_file, "rb") as log_h:
             record_data = log_h.read()
         cr = CallRecord()
         cr.ParseFromString(record_data)
-        import pdb
-        first_audio_event_timestamp = next ((i
-                for i in cr.events
+        first_audio_event_timestamp = next(
+            (
+                i
+                for i in cr.events
                 if i.WhichOneof("event_type") == "call_event"
                 and i.call_event.WhichOneof("event_type") == "call_audio"
-                )).timestamp.ToDatetime()
+            )
+        ).timestamp.ToDatetime()
 
-        speech_events = [ i
+        speech_events = [
+            i
             for i in cr.events
             if i.WhichOneof("event_type") == "speech_event"
            and i.speech_event.WhichOneof("event_type") == "asr_final"
         ]
-        previous_event_timestamp = first_audio_event_timestamp - first_audio_event_timestamp
-        for index,each_speech_events in enumerate(speech_events):
+        previous_event_timestamp = (
+            first_audio_event_timestamp - first_audio_event_timestamp
+        )
+        for index, each_speech_events in enumerate(speech_events):
             asr_final = each_speech_events.speech_event.asr_final
             speech_timestamp = each_speech_events.timestamp.ToDatetime()
             actual_timestamp = speech_timestamp - first_audio_event_timestamp
-            print(previous_event_timestamp.total_seconds(),actual_timestamp.total_seconds(),asr_final)
-            start_time = previous_event_timestamp.total_seconds()*1000
+            start_time = previous_event_timestamp.total_seconds() * 1000
             end_time = actual_timestamp.total_seconds() * 1000
             audio_segment = strip_silence(call_wav_1[start_time:end_time])
             code_fb = BytesIO()
             audio_segment.export(code_fb, format="wav")
             wav_data = code_fb.getvalue()
-
-            #output_audio_path = output_folder + audio_file.replace('.wav','') + '_' + str(index)
-            #audio_segment.export( output_audio_path+ '.wav' ,format='wav')
-            #manifest_file.write(json.dumps({"audio_filepath":output_audio_path , "duration": (end_time-start_time) / 1000 , "text":asr_final }) + '\n')
             previous_event_timestamp = actual_timestamp
-            duration = (end_time-start_time) / 1000
-            yield asr_final,duration,wav_data
-
+            duration = (end_time - start_time) / 1000
+            yield asr_final, duration, wav_data, "grassroot", audio_segment
 
     def generate_call_asr_data():
-        full_asr_data = []
+        full_data = []
         total_duration = 0
-        for wav,wav_path, pb2_path in wav_pb2_generator(call_audio_dir):
-            asr_data = read_event(wav,pb2_path)
+        for wav, wav_path, pb2_path in wav_pb2_generator(call_log_dir):
+            asr_data = read_event(wav, pb2_path)
             total_duration += wav.duration_seconds
-            full_asr_data.append(asr_data)
-
-        typer.echo(f"loaded {len(full_asr_data)} calls of duration {total_duration}s")
-        n_dps = asr_data_writer(call_asr_data, dataset_name, chain(*full_asr_data))
+            full_data.append(asr_data)
+        n_calls = len(full_data)
+        typer.echo(f"loaded {n_calls} calls of duration {total_duration}s")
+        n_dps = ui_dump_manifest_writer(call_asr_data, dataset_name, chain(*full_data))
         typer.echo(f"written {n_dps} data points")
 
-
     generate_call_asr_data()
 
+
 def main():
     app()
 
 
 if __name__ == "__main__":
     main()
-
-
-
-
-
-
diff --git a/jasper/data/validation/ui.py b/jasper/data/validation/ui.py
index 3915aeb..00f2e5c 100644
--- a/jasper/data/validation/ui.py
+++ b/jasper/data/validation/ui.py
@@ -42,7 +42,9 @@ if not hasattr(st, "mongo_connected"):
             upsert=True,
         )
 
-    def set_task_fn(mf_path):
+    def set_task_fn(mf_path, task_id):
+        if task_id:
+            st.task_id = task_id
         task_path = mf_path.parent / Path(f"task-{st.task_id}.lck")
         if not task_path.exists():
             print(f"creating task lock at {task_path}")
@@ -66,8 +68,8 @@ def load_ui_data(validation_ui_data_path: Path):
 
 
 @app.command()
-def main(manifest: Path):
-    st.set_task(manifest)
+def main(manifest: Path, task_id: str = ""):
+    st.set_task(manifest, task_id)
     ui_config = load_ui_data(manifest)
     asr_data = ui_config["data"]
     use_domain_asr = ui_config.get("use_domain_asr", True)
diff --git a/setup.py b/setup.py
index 5b70dcc..eb23848 100644
--- a/setup.py
+++ b/setup.py
@@ -19,13 +19,15 @@ extra_requirements = {
         "ruamel.yaml==0.16.10",
         "pymongo==3.10.1",
         "librosa==0.7.2",
+        "numba==0.48",
         "matplotlib==3.2.1",
         "pandas==1.0.3",
         "tabulate==0.8.7",
         "natural==0.2.0",
         "num2words==0.5.10",
-        "typer[all]==0.1.1",
+        "typer[all]==0.3.1",
         "python-slugify==4.0.0",
+        "rpyc~=4.1.4",
         "lenses @ git+https://github.com/ingolemo/python-lenses.git@b2a2a9aa5b61540992d70b2cf36008d0121e8948#egg=lenses",
     ],
     "validation": [