adding support for asr data generator

2026-03-07 20:02:34 +00:00 · 2020-08-06 00:08:46 +05:30
parent e24a8cf9d0
commit 14d31a51c3
2 changed files with 138 additions and 4 deletions
--- a/jasper/data/rastrik_recycler.py
+++ b/jasper/data/rastrik_recycler.py
@@ -0,0 +1,137 @@
+from rastrik.proto.callrecord_pb2 import CallRecordEvent, CallRecord
+import gzip
+from pydub import AudioSegment
+import json
+from .utils import ExtendedPath, asr_data_writer, strip_silence
+
+import typer
+from itertools import chain
+from io import BytesIO
+from pathlib import Path
+
+app = typer.Typer()
+
+@app.command()
+def extract_manifest(
+    call_audio_dir: Path = Path("./data/call_audio"),
+    call_meta_dir: Path = Path("./data/call_metadata"),
+    output_dir: Path = Path("./data"),
+    dataset_name: str = "grassroot_pizzahut_v1",
+    verbose: bool = False,
+):
+
+
+    call_asr_data: Path = output_dir / Path("asr_data")
+    call_asr_data.mkdir(exist_ok=True, parents=True)
+    """
+    def read_event_old(log_file,audio_file):
+        call_wav = AudioSegment.from_wav(audio_file)
+        call_wav_0, call_wav_1 = call_wav.split_to_mono()
+        with gzip.open(log_file, "rb") as log_h:
+            record_data = log_h.read()
+        cr = CallRecord()
+        cr.ParseFromString(record_data)
+
+        import pdb
+        first_audio_event_timestamp = next ((i
+            for i in cr.events
+                if i.WhichOneof("event_type") == "call_event"
+                and i.call_event.WhichOneof("event_type") == "call_audio"
+        )).timestamp.ToDatetime()
+
+        speech_events = [ i
+            for i in cr.events
+            if i.WhichOneof("event_type") == "asr_result"
+        ]
+        previous_event_timestamp = first_audio_event_timestamp - first_audio_event_timestamp
+        for index,each_speech_events in enumerate(speech_events):
+            asr_final = each_speech_events.asr_result.text
+            speech_timestamp = each_speech_events.timestamp.ToDatetime()
+            actual_timestamp = speech_timestamp - first_audio_event_timestamp
+            print(previous_event_timestamp.total_seconds(),actual_timestamp.total_seconds(),asr_final)
+            start_time = previous_event_timestamp.total_seconds()*1000
+            end_time = actual_timestamp.total_seconds() * 1000
+            audio_segment = strip_silence(call_wav_1[start_time:end_time])
+            audio_segment.export(output_folder+str(index) + '.wav' ,format='wav')
+            previous_event_timestamp = actual_timestamp
+    """
+
+    def wav_pb2_generator(call_audio_dir):
+        for wav_path in call_audio_dir.glob("**/*.wav"):
+            if verbose:
+                typer.echo(f"loading events for file {wav_path}")
+            call_wav = AudioSegment.from_file_using_temporary_files(wav_path)
+            rel_meta_path = wav_path.with_suffix(".pb2.gz").relative_to(call_audio_dir)
+            meta_path = call_meta_dir / rel_meta_path
+            #events = ExtendedPath(meta_path).read_json()
+            yield  call_wav,wav_path, meta_path
+
+    def read_event(call_wav,log_file):
+        #call_wav = AudioSegment.from_wav(audio_file)
+        call_wav_0, call_wav_1 = call_wav.split_to_mono()
+        with gzip.open(log_file, "rb") as log_h:
+            record_data = log_h.read()
+        cr = CallRecord()
+        cr.ParseFromString(record_data)
+
+        import pdb
+        first_audio_event_timestamp = next ((i
+            for i in cr.events
+                if i.WhichOneof("event_type") == "call_event"
+                and i.call_event.WhichOneof("event_type") == "call_audio"
+        )).timestamp.ToDatetime()
+
+        speech_events = [ i
+            for i in cr.events
+            if i.WhichOneof("event_type") == "speech_event"
+            and i.speech_event.WhichOneof("event_type") == "asr_final"
+        ]
+        previous_event_timestamp = first_audio_event_timestamp - first_audio_event_timestamp
+        for index,each_speech_events in enumerate(speech_events):
+            asr_final = each_speech_events.speech_event.asr_final
+            speech_timestamp = each_speech_events.timestamp.ToDatetime()
+            actual_timestamp = speech_timestamp - first_audio_event_timestamp
+            print(previous_event_timestamp.total_seconds(),actual_timestamp.total_seconds(),asr_final)
+            start_time = previous_event_timestamp.total_seconds()*1000
+            end_time = actual_timestamp.total_seconds() * 1000
+            audio_segment = strip_silence(call_wav_1[start_time:end_time])
+
+            code_fb = BytesIO()
+            audio_segment.export(code_fb, format="wav")
+            wav_data = code_fb.getvalue()
+
+            #output_audio_path = output_folder + audio_file.replace('.wav','') + '_' + str(index) 
+            #audio_segment.export( output_audio_path+ '.wav' ,format='wav')
+            #manifest_file.write(json.dumps({"audio_filepath":output_audio_path , "duration": (end_time-start_time) / 1000 , "text":asr_final }) + '\n')
+            previous_event_timestamp = actual_timestamp
+            duration = (end_time-start_time) / 1000
+            yield asr_final,duration,wav_data
+
+
+    def generate_call_asr_data():
+        full_asr_data = []
+        total_duration = 0
+        for wav,wav_path, pb2_path in wav_pb2_generator(call_audio_dir):
+            asr_data = read_event(wav,pb2_path)
+            total_duration += wav.duration_seconds
+            full_asr_data.append(asr_data)
+
+        typer.echo(f"loaded {len(full_asr_data)} calls of duration {total_duration}s")
+        n_dps = asr_data_writer(call_asr_data, dataset_name, chain(*full_asr_data))
+        typer.echo(f"written {n_dps} data points")
+
+
+    generate_call_asr_data()
+
+def main():
+    app()
+
+
+if __name__ == "__main__":
+    main()
+    
+
+
+
+
+
--- a/setup.py
+++ b/setup.py
@@ -68,10 +68,7 @@ setup(
            "jasper_data_tts_generate = jasper.data.tts_generator:main",
            "jasper_data_conv_generate = jasper.data.conv_generator:main",
            "jasper_data_nlu_generate = jasper.data.nlu_generator:main",
-            "jasper_data_test_generate = jasper.data.test_generator:main",
-            "jasper_data_call_recycle = jasper.data.call_recycler:main",
-            "jasper_data_asr_recycle = jasper.data.asr_recycler:main",
-            "jasper_data_rev_recycle = jasper.data.rev_recycler:main",
+            "jasper_data_rastrik_recycle = jasper.data.rastrik_recycler:main",
            "jasper_data_server = jasper.data.server:main",
            "jasper_data_validation = jasper.data.validation.process:main",
            "jasper_data_preprocess = jasper.data.process:main",