1. added deepgram support

2. compute asr sample accuracy
Malar Kannan 2020-08-07 12:02:01 +05:30
parent fa89775f86
commit f5c49338d9
2 changed files with 269 additions and 10 deletions

View File

@ -1,13 +1,16 @@
import io import io
import os import os
import re
import json import json
import base64
import wave import wave
from pathlib import Path from pathlib import Path
from itertools import product from itertools import product
from functools import partial from functools import partial
from math import floor from math import floor
from uuid import uuid4 from uuid import uuid4
from urllib.parse import urlsplit from urllib.parse import urlsplit, urlencode
from urllib.request import Request, urlopen
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
import numpy as np import numpy as np
@ -100,6 +103,9 @@ def ui_dump_manifest_writer(output_dir, dataset_name, asr_data_source, verbose=F
"data": [], "data": [],
} }
data_funcs = [] data_funcs = []
deepgram_transcriber = deepgram_transcribe_gen()
# t2n = Text2Num()
transcriber_gcp = gcp_transcribe_gen() transcriber_gcp = gcp_transcribe_gen()
transcriber_pretrained = transcribe_gen(asr_port=8044) transcriber_pretrained = transcribe_gen(asr_port=8044)
with asr_manifest.open("w") as mf: with asr_manifest.open("w") as mf:
@ -119,6 +125,10 @@ def ui_dump_manifest_writer(output_dir, dataset_name, asr_data_source, verbose=F
pretrained_result = transcriber_pretrained(aud_seg.raw_data) pretrained_result = transcriber_pretrained(aud_seg.raw_data)
gcp_seg = aud_seg.set_frame_rate(16000) gcp_seg = aud_seg.set_frame_rate(16000)
gcp_result = transcriber_gcp(gcp_seg.raw_data) gcp_result = transcriber_gcp(gcp_seg.raw_data)
aud_data = audio_path.read_bytes()
dgram_result = deepgram_transcriber(aud_data)
# gtruth = dp['text']
# dgram_result = t2n.convert(dgram_script)
pretrained_wer = word_error_rate([transcript], [pretrained_result]) pretrained_wer = word_error_rate([transcript], [pretrained_result])
wav_plot_path = ( wav_plot_path = (
dataset_dir / Path("wav_plots") / Path(fname).with_suffix(".png") dataset_dir / Path("wav_plots") / Path(fname).with_suffix(".png")
@ -135,6 +145,7 @@ def ui_dump_manifest_writer(output_dir, dataset_name, asr_data_source, verbose=F
"caller": caller_name, "caller": caller_name,
"utterance_id": fname, "utterance_id": fname,
"gcp_asr": gcp_result, "gcp_asr": gcp_result,
"deepgram_asr": dgram_result,
"pretrained_asr": pretrained_result, "pretrained_asr": pretrained_result,
"pretrained_wer": pretrained_wer, "pretrained_wer": pretrained_wer,
"plot_path": str(wav_plot_path), "plot_path": str(wav_plot_path),
@ -471,6 +482,203 @@ def gcp_transcribe_gen():
return sample_recognize return sample_recognize
def deepgram_transcribe_gen():
    """Build a transcriber closure for Deepgram's batch (pre-recorded) API.

    Returns:
        Callable[[bytes], str]: takes raw linear16 / 8 kHz audio bytes,
        POSTs them to the Deepgram listen endpoint and returns the first
        channel's best-alternative transcript string.
    """
    DEEPGRAM_URL = "https://brain.deepgram.com/v2/listen"
    MODEL = "agara"
    encoding = "linear16"
    sample_rate = "8000"
    # diarize = "false"
    q_params = {
        "model": MODEL,
        "encoding": encoding,
        "sample_rate": sample_rate,
        "language": "en-US",
        "multichannel": "false",
        "punctuate": "true",
    }
    url = "{}?{}".format(DEEPGRAM_URL, urlencode(q_params))
    # SECURITY: credentials were hard-coded in source. Prefer environment
    # variables; fall back to the legacy values so existing setups keep
    # working. Rotate these credentials — they have been committed.
    creds = (
        os.environ.get("DEEPGRAM_USER", "arjun@agaralabs.com"),
        os.environ.get("DEEPGRAM_PASS", "PoX1Y@x4h%oS"),
    )

    def deepgram_offline(audio_data):
        """POST *audio_data* (bytes) and return the transcript string."""
        auth_token = base64.b64encode(
            "{}:{}".format(*creds).encode("utf-8")
        ).decode("utf-8")
        request = Request(
            url,
            method="POST",
            headers={"Authorization": "Basic {}".format(auth_token)},
            data=audio_data,
        )
        with urlopen(request) as response:
            msg = json.loads(response.read())
        data = msg["results"]["channels"][0]["alternatives"][0]
        return data["transcript"]

    return deepgram_offline
class Text2Num(object):
    """Convert spelled-out numbers and ordinals in text to integers.

    ``convert`` rewrites a sentence token-by-token ("press two" ->
    "press 2"); ``text2int`` evaluates a whole spelled-out number phrase
    ("twenty one" -> 21).
    """

    def __init__(self):
        # Map word -> (scale, increment); a phrase is evaluated as
        # value = value * scale + increment, token by token.
        numwords = {"and": (1, 0)}
        units = [
            "zero", "one", "two", "three", "four", "five", "six",
            "seven", "eight", "nine", "ten", "eleven", "twelve",
            "thirteen", "fourteen", "fifteen", "sixteen", "seventeen",
            "eighteen", "nineteen",
        ]
        tens = [
            "", "", "twenty", "thirty", "forty", "fifty", "sixty",
            "seventy", "eighty", "ninety",
        ]
        scales = ["hundred", "thousand", "million", "billion", "trillion"]
        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales):
            # hundred -> 10**2, thousand -> 10**3, million -> 10**6, ...
            numwords[word] = (10 ** (idx * 3 or 2), 0)
        self.numwords = numwords

    def is_num(self, word):
        """Return True if *word* is a known number word."""
        return word in self.numwords

    def parseOrdinal(self, utterance, **kwargs):
        """Return the integer for the first ordinal/number word found in
        *utterance*, or "" when none is present.

        "second" and "one" are excluded from the main pattern (presumably
        because they are ambiguous in ASR output — TODO confirm) and are
        only matched as a fallback when nothing else hits.
        """
        lookup_dict = {
            "first": 1, "second": 2, "third": 3, "fourth": 4,
            "fifth": 5, "sixth": 6, "seventh": 7, "eighth": 8,
            "ninth": 9, "tenth": 10,
            "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
            "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10,
            "1": 1, "2": 2, "3": 3, "4": 4, "5": 5,
            "6": 6, "7": 7, "8": 8, "9": 9, "10": 10,
            "last": -1,
        }
        pattern = re.compile(
            r"(\s|^)(?P<num>(first)|(third)|(fourth)|(fifth)|(sixth)|(seventh)|(eighth)|(ninth)|(tenth)|(two)|(three)|(four)|(five)|(six)|(seven)|(eight)|(nine)|(ten)|(1)|(2)|(3)|(4)|(5)|(6)|(7)|(8)|(9)|(10)|(last))(\s|$)",
            re.IGNORECASE,
        )
        ordinal = ""
        match = pattern.search(utterance)
        if match:
            # BUGFIX: the pattern matches case-insensitively but the
            # lookup table is lowercase; normalize so "Third" -> 3
            # instead of silently returning "".
            ordinal = match.groupdict()["num"].strip().lower()
        # BUGFIX: the fallback searches previously lacked re.IGNORECASE,
        # inconsistent with the main pattern above.
        elif re.search(r"(\s|^)(?P<num>(second))(\s|$)", utterance, re.IGNORECASE):
            ordinal = "second"
        elif re.search(r"(\s|^)(?P<num>(one))(\s|$)", utterance, re.IGNORECASE):
            ordinal = "one"
        return lookup_dict.get(ordinal, "")

    def convert(self, sent):
        """Rewrite *sent* replacing each recognized number token with its
        digit form; unrecognized tokens pass through unchanged."""
        out = []
        for token in sent.split():
            # Call parseOrdinal once per token (was called twice before).
            parsed = self.parseOrdinal(token)
            out.append(str(parsed) if parsed != "" else token)
        return " ".join(out)

    def text2int(self, textnum):
        """Evaluate a spelled-out number phrase, e.g. "twenty one" -> 21.

        Raises:
            Exception: if a token is not a known number word.
        """
        current = result = 0
        for word in textnum.split():
            if word not in self.numwords:
                raise Exception("Illegal word: " + word)
            scale, increment = self.numwords[word]
            current = current * scale + increment
            # Flush completed groups at thousand/million/... boundaries.
            if scale > 100:
                result += current
                current = 0
        return result + current
def is_sub_sequence(str1, str2):
    """Return True if *str1* is a subsequence of *str2*.

    Iterative two-pointer scan. The previous recursive implementation
    recursed once per character of *str2* and could exceed Python's
    recursion limit on long transcripts; this version is O(len(str2))
    time and O(1) space with identical results (empty *str1* -> True).
    """
    target = len(str1)
    idx = 0
    for ch in str2:
        if idx == target:
            break
        if ch == str1[idx]:
            idx += 1
    return idx == target
def parallel_apply(fn, iterable, workers=8): def parallel_apply(fn, iterable, workers=8):
with ThreadPoolExecutor(max_workers=workers) as exe: with ThreadPoolExecutor(max_workers=workers) as exe:
print(f"parallelly applying {fn}") print(f"parallelly applying {fn}")

View File

@ -156,6 +156,47 @@ def sample_ui(
ExtendedPath(sample_path).write_json(processed_data) ExtendedPath(sample_path).write_json(processed_data)
@app.command()
def sample_asr_accuracy(
    data_name: str = typer.Option(
        "png_06_2020_week1_numbers_window_customer", show_default=True
    ),
    dump_dir: Path = Path("./data/asr_data"),
    sample_file: Path = Path("sample_dump.json"),
    asr_service: str = "deepgram",
):
    """Score Deepgram ASR against the stored ground-truth text.

    Reads the sample dump for *data_name*, normalizes each Deepgram
    transcript to digits, and counts samples whose ground-truth text
    appears as a subsequence of the normalized transcript.
    """
    from ..utils import is_sub_sequence, Text2Num

    t2n = Text2Num()
    sample_path = dump_dir / Path(data_name) / sample_file
    processed_data = ExtendedPath(sample_path).read_json()
    samples = processed_data["data"]
    total_samples = len(samples)
    match_count = 0
    for dp in tqdm(samples):
        gcp_num = dp["text"]
        dgm_num = t2n.convert(dp["deepgram_asr"].lower())
        if is_sub_sequence(gcp_num, dgm_num):
            match_count += 1
            print(f"MATCH GCP:{gcp_num}\tDGM:{dgm_num}")
        else:
            print(f"FAIL GCP:{gcp_num}\tDGM:{dgm_num}")
    typer.echo(
        f"{match_count} from deepgram matches with {total_samples} gcp transcripts."
    )
@app.command() @app.command()
def task_ui( def task_ui(
data_name: str = typer.Option("call_upwork_train_cnd", show_default=True), data_name: str = typer.Option("call_upwork_train_cnd", show_default=True),
@ -190,7 +231,9 @@ def dump_corrections(
col = get_mongo_conn(col="asr_validation") col = get_mongo_conn(col="asr_validation")
task_id = [c for c in col.distinct("task_id") if c.rsplit("-", 1)[1] == task_uid][0] task_id = [c for c in col.distinct("task_id") if c.rsplit("-", 1)[1] == task_uid][0]
corrections = list(col.find({"type": "correction"}, projection={"_id": False})) corrections = list(col.find({"type": "correction"}, projection={"_id": False}))
cursor_obj = col.find({"type": "correction", "task_id": task_id}, projection={"_id": False}) cursor_obj = col.find(
{"type": "correction", "task_id": task_id}, projection={"_id": False}
)
corrections = [c for c in cursor_obj] corrections = [c for c in cursor_obj]
ExtendedPath(dump_path).write_json(corrections) ExtendedPath(dump_path).write_json(corrections)
@ -271,7 +314,9 @@ def split_extract(
dump_file: Path = Path("ui_dump.json"), dump_file: Path = Path("ui_dump.json"),
manifest_file: Path = Path("manifest.json"), manifest_file: Path = Path("manifest.json"),
corrections_file: str = typer.Option("corrections.json", show_default=True), corrections_file: str = typer.Option("corrections.json", show_default=True),
conv_data_path: Path = typer.Option(Path("./data/conv_data.json"), show_default=True), conv_data_path: Path = typer.Option(
Path("./data/conv_data.json"), show_default=True
),
extraction_type: ExtractionType = ExtractionType.all, extraction_type: ExtractionType = ExtractionType.all,
): ):
import shutil import shutil
@ -293,7 +338,9 @@ def split_extract(
def extract_manifest(mg): def extract_manifest(mg):
for m in mg: for m in mg:
if m["text"] in extraction_vals: if m["text"] in extraction_vals:
shutil.copy(m["audio_path"], dest_data_dir / Path(m["audio_filepath"])) shutil.copy(
m["audio_path"], dest_data_dir / Path(m["audio_filepath"])
)
yield m yield m
asr_manifest_writer(dest_manifest_path, extract_manifest(manifest_gen)) asr_manifest_writer(dest_manifest_path, extract_manifest(manifest_gen))
@ -302,12 +349,14 @@ def split_extract(
orig_ui_data = ExtendedPath(ui_data_path).read_json() orig_ui_data = ExtendedPath(ui_data_path).read_json()
ui_data = orig_ui_data["data"] ui_data = orig_ui_data["data"]
file_ui_map = {Path(u["audio_filepath"]).stem: u for u in ui_data} file_ui_map = {Path(u["audio_filepath"]).stem: u for u in ui_data}
extracted_ui_data = list(filter(lambda u: u["text"] in extraction_vals, ui_data)) extracted_ui_data = list(
filter(lambda u: u["text"] in extraction_vals, ui_data)
)
final_data = [] final_data = []
for i, d in enumerate(extracted_ui_data): for i, d in enumerate(extracted_ui_data):
d['real_idx'] = i d["real_idx"] = i
final_data.append(d) final_data.append(d)
orig_ui_data['data'] = final_data orig_ui_data["data"] = final_data
ExtendedPath(dest_ui_path).write_json(orig_ui_data) ExtendedPath(dest_ui_path).write_json(orig_ui_data)
if corrections_file: if corrections_file:
@ -323,7 +372,7 @@ def split_extract(
) )
ExtendedPath(dest_correction_path).write_json(extracted_corrections) ExtendedPath(dest_correction_path).write_json(extracted_corrections)
if extraction_type.value == 'all': if extraction_type.value == "all":
for ext_key in conv_data.keys(): for ext_key in conv_data.keys():
extract_data_of_type(ext_key) extract_data_of_type(ext_key)
else: else:
@ -345,7 +394,7 @@ def update_corrections(
def correct_manifest(ui_dump_path, corrections_path): def correct_manifest(ui_dump_path, corrections_path):
corrections = ExtendedPath(corrections_path).read_json() corrections = ExtendedPath(corrections_path).read_json()
ui_data = ExtendedPath(ui_dump_path).read_json()['data'] ui_data = ExtendedPath(ui_dump_path).read_json()["data"]
correct_set = { correct_set = {
c["code"] for c in corrections if c["value"]["status"] == "Correct" c["code"] for c in corrections if c["value"]["status"] == "Correct"
} }
@ -374,7 +423,9 @@ def update_corrections(
) )
else: else:
orig_audio_path = Path(d["audio_path"]) orig_audio_path = Path(d["audio_path"])
new_name = str(Path(tscript_uuid_fname(correct_text)).with_suffix(".wav")) new_name = str(
Path(tscript_uuid_fname(correct_text)).with_suffix(".wav")
)
new_audio_path = orig_audio_path.with_name(new_name) new_audio_path = orig_audio_path.with_name(new_name)
orig_audio_path.replace(new_audio_path) orig_audio_path.replace(new_audio_path)
new_filepath = str(Path(d["audio_filepath"]).with_name(new_name)) new_filepath = str(Path(d["audio_filepath"]).with_name(new_name))