1. Fix dependency issues
2. Add a task-id option to the validation UI to respawn a previous task
3. Clean up rastrik-recycler
pull/2/head
Malar Kannan 2020-08-06 22:40:14 +05:30
parent e77943b2f2
commit 42647196fe
4 changed files with 47 additions and 81 deletions

View File

@@ -7,10 +7,16 @@
 # Table of Contents
+* [Prerequisites](#prerequisites)
 * [Features](#features)
 * [Installation](#installation)
 * [Usage](#usage)
+
+# Prerequisites
+```bash
+# apt install libsndfile-dev ffmpeg
+```
 # Features
 * ASR using Jasper (from [NemoToolkit](https://github.com/NVIDIA/NeMo) )
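The prerequisites above back the audio stack used in this commit: pydub shells out to ffmpeg for decoding and transcoding call recordings, and librosa depends on soundfile, which links against libsndfile. A quick way to confirm both are visible from Python (a sanity check, not part of the repo):

```python
# Optional sanity check for the system prerequisites above (not part of the repo):
# pydub locates ffmpeg on PATH, and soundfile reports the libsndfile it linked against.
from pydub.utils import which
import soundfile

print("ffmpeg:", which("ffmpeg"))                       # a path, or None if missing
print("libsndfile:", soundfile.__libsndfile_version__)  # e.g. "1.0.28"
```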

View File

@@ -1,8 +1,7 @@
-from rastrik.proto.callrecord_pb2 import CallRecordEvent, CallRecord
+from rastrik.proto.callrecord_pb2 import CallRecord
 import gzip
 from pydub import AudioSegment
-import json
-from .utils import ExtendedPath, asr_data_writer, strip_silence
+from .utils import ui_dump_manifest_writer, strip_silence
 import typer
 from itertools import chain
@@ -11,87 +10,55 @@ from pathlib import Path
 app = typer.Typer()
 @app.command()
 def extract_manifest(
-    call_audio_dir: Path = Path("./data/call_audio"),
-    call_meta_dir: Path = Path("./data/call_metadata"),
+    call_log_dir: Path = Path("./data/call_audio"),
     output_dir: Path = Path("./data"),
     dataset_name: str = "grassroot_pizzahut_v1",
-    caller_name: str = "grassroot",
     verbose: bool = False,
 ):
     call_asr_data: Path = output_dir / Path("asr_data")
     call_asr_data.mkdir(exist_ok=True, parents=True)
-    """
-    def read_event_old(log_file,audio_file):
-        call_wav = AudioSegment.from_wav(audio_file)
-        call_wav_0, call_wav_1 = call_wav.split_to_mono()
-        with gzip.open(log_file, "rb") as log_h:
-            record_data = log_h.read()
-            cr = CallRecord()
-            cr.ParseFromString(record_data)
-            import pdb
-            first_audio_event_timestamp = next ((i
-                for i in cr.events
-                if i.WhichOneof("event_type") == "call_event"
-                and i.call_event.WhichOneof("event_type") == "call_audio"
-            )).timestamp.ToDatetime()
-            speech_events = [ i
-                for i in cr.events
-                if i.WhichOneof("event_type") == "asr_result"
-            ]
-            previous_event_timestamp = first_audio_event_timestamp - first_audio_event_timestamp
-            for index,each_speech_events in enumerate(speech_events):
-                asr_final = each_speech_events.asr_result.text
-                speech_timestamp = each_speech_events.timestamp.ToDatetime()
-                actual_timestamp = speech_timestamp - first_audio_event_timestamp
-                print(previous_event_timestamp.total_seconds(),actual_timestamp.total_seconds(),asr_final)
-                start_time = previous_event_timestamp.total_seconds()*1000
-                end_time = actual_timestamp.total_seconds() * 1000
-                audio_segment = strip_silence(call_wav_1[start_time:end_time])
-                audio_segment.export(output_folder+str(index) + '.wav' ,format='wav')
-                previous_event_timestamp = actual_timestamp
-    """
-    def wav_pb2_generator(call_audio_dir):
-        for wav_path in call_audio_dir.glob("**/*.wav"):
+    def wav_pb2_generator(log_dir):
+        for wav_path in log_dir.glob("**/*.wav"):
             if verbose:
                 typer.echo(f"loading events for file {wav_path}")
             call_wav = AudioSegment.from_file_using_temporary_files(wav_path)
-            rel_meta_path = wav_path.with_suffix(".pb2.gz").relative_to(call_audio_dir)
-            meta_path = call_meta_dir / rel_meta_path
-            #events = ExtendedPath(meta_path).read_json()
+            meta_path = wav_path.with_suffix(".pb2.gz")
             yield call_wav, wav_path, meta_path
     def read_event(call_wav, log_file):
-        #call_wav = AudioSegment.from_wav(audio_file)
         call_wav_0, call_wav_1 = call_wav.split_to_mono()
         with gzip.open(log_file, "rb") as log_h:
             record_data = log_h.read()
             cr = CallRecord()
             cr.ParseFromString(record_data)
-            import pdb
-            first_audio_event_timestamp = next ((i
+            first_audio_event_timestamp = next(
+                (
+                    i
                     for i in cr.events
                     if i.WhichOneof("event_type") == "call_event"
                     and i.call_event.WhichOneof("event_type") == "call_audio"
-            )).timestamp.ToDatetime()
-            speech_events = [ i
+                )
+            ).timestamp.ToDatetime()
+            speech_events = [
+                i
                 for i in cr.events
                 if i.WhichOneof("event_type") == "speech_event"
                 and i.speech_event.WhichOneof("event_type") == "asr_final"
             ]
-            previous_event_timestamp = first_audio_event_timestamp - first_audio_event_timestamp
+            previous_event_timestamp = (
+                first_audio_event_timestamp - first_audio_event_timestamp
+            )
             for index, each_speech_events in enumerate(speech_events):
                 asr_final = each_speech_events.speech_event.asr_final
                 speech_timestamp = each_speech_events.timestamp.ToDatetime()
                 actual_timestamp = speech_timestamp - first_audio_event_timestamp
-                print(previous_event_timestamp.total_seconds(),actual_timestamp.total_seconds(),asr_final)
                 start_time = previous_event_timestamp.total_seconds() * 1000
                 end_time = actual_timestamp.total_seconds() * 1000
                 audio_segment = strip_silence(call_wav_1[start_time:end_time])
@@ -99,39 +66,28 @@ def extract_manifest(
                 code_fb = BytesIO()
                 audio_segment.export(code_fb, format="wav")
                 wav_data = code_fb.getvalue()
-                #output_audio_path = output_folder + audio_file.replace('.wav','') + '_' + str(index)
-                #audio_segment.export( output_audio_path+ '.wav' ,format='wav')
-                #manifest_file.write(json.dumps({"audio_filepath":output_audio_path , "duration": (end_time-start_time) / 1000 , "text":asr_final }) + '\n')
                 previous_event_timestamp = actual_timestamp
                 duration = (end_time - start_time) / 1000
-                yield asr_final,duration,wav_data
+                yield asr_final, duration, wav_data, "grassroot", audio_segment
     def generate_call_asr_data():
-        full_asr_data = []
+        full_data = []
         total_duration = 0
-        for wav,wav_path, pb2_path in wav_pb2_generator(call_audio_dir):
+        for wav, wav_path, pb2_path in wav_pb2_generator(call_log_dir):
             asr_data = read_event(wav, pb2_path)
             total_duration += wav.duration_seconds
-            full_asr_data.append(asr_data)
-        typer.echo(f"loaded {len(full_asr_data)} calls of duration {total_duration}s")
-        n_dps = asr_data_writer(call_asr_data, dataset_name, chain(*full_asr_data))
+            full_data.append(asr_data)
+        n_calls = len(full_data)
+        typer.echo(f"loaded {n_calls} calls of duration {total_duration}s")
+        n_dps = ui_dump_manifest_writer(call_asr_data, dataset_name, chain(*full_data))
         typer.echo(f"written {n_dps} data points")
     generate_call_asr_data()
 def main():
     app()
 if __name__ == "__main__":
     main()
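`ui_dump_manifest_writer` comes from `.utils` and is not part of this diff. Judging from the commented-out writer code removed above (JSON lines carrying `audio_filepath`, `duration`, and `text`), a minimal sketch of such a writer for the `(text, duration, wav_bytes, caller, segment)` tuples yielded by `read_event` might look as follows; the helper name and on-disk layout are assumptions, not the repo's actual implementation:

```python
# Hypothetical sketch, not the repo's ui_dump_manifest_writer: writes each yielded
# (text, duration, wav_bytes, caller, segment) tuple as a wav file plus a
# NeMo-style JSON-lines manifest entry, and returns the number of data points.
import json
from pathlib import Path


def write_asr_manifest(output_dir: Path, dataset_name: str, data_points) -> int:
    dataset_dir = output_dir / dataset_name
    wav_dir = dataset_dir / "wavs"
    wav_dir.mkdir(exist_ok=True, parents=True)
    n_dps = 0
    with open(dataset_dir / "manifest.json", "w") as manifest_f:
        for idx, (text, duration, wav_bytes, caller, _segment) in enumerate(data_points):
            wav_path = wav_dir / f"{caller}_{idx}.wav"
            wav_path.write_bytes(wav_bytes)
            entry = {"audio_filepath": str(wav_path), "duration": duration, "text": text}
            manifest_f.write(json.dumps(entry) + "\n")
            n_dps += 1
    return n_dps
```

A caller would use it the same way `generate_call_asr_data` calls the real writer, e.g. `n_dps = write_asr_manifest(call_asr_data, dataset_name, chain(*full_data))`.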

View File

@@ -42,7 +42,9 @@ if not hasattr(st, "mongo_connected"):
         upsert=True,
     )
-    def set_task_fn(mf_path):
+    def set_task_fn(mf_path, task_id):
+        if task_id:
+            st.task_id = task_id
         task_path = mf_path.parent / Path(f"task-{st.task_id}.lck")
         if not task_path.exists():
             print(f"creating task lock at {task_path}")
@@ -66,8 +68,8 @@ def load_ui_data(validation_ui_data_path: Path):
 @app.command()
-def main(manifest: Path):
-    st.set_task(manifest)
+def main(manifest: Path, task_id: str = ""):
+    st.set_task(manifest, task_id)
     ui_config = load_ui_data(manifest)
     asr_data = ui_config["data"]
     use_domain_asr = ui_config.get("use_domain_asr", True)
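With these two hunks, passing a task id makes `set_task_fn` reuse an existing `task-<id>.lck` lock next to the manifest instead of starting a fresh task. Assuming the validation UI is a Streamlit script whose Typer command reads arguments after `--` (the launch command is not shown in this commit), respawning a previous task would look roughly like:

```bash
# hypothetical invocation; the script path and manifest file are placeholders
streamlit run validation_ui.py -- <validation_ui_data.json> --task-id <previous-task-id>
```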

View File

@@ -19,13 +19,15 @@ extra_requirements = {
         "ruamel.yaml==0.16.10",
         "pymongo==3.10.1",
         "librosa==0.7.2",
+        "numba==0.48",
         "matplotlib==3.2.1",
         "pandas==1.0.3",
         "tabulate==0.8.7",
         "natural==0.2.0",
         "num2words==0.5.10",
-        "typer[all]==0.1.1",
+        "typer[all]==0.3.1",
         "python-slugify==4.0.0",
+        "rpyc~=4.1.4",
         "lenses @ git+https://github.com/ingolemo/python-lenses.git@b2a2a9aa5b61540992d70b2cf36008d0121e8948#egg=lenses",
     ],
     "validation": [