1. Self contained typers

2. ASR force-aligner visualization 3. Streamlit state-management abstraction 4. New utils / reorganization 5. Added verbose flags 6. Added TTS voice lookup by name
2026-03-07 20:02:34 +00:00 · 2021-03-23 13:27:35 +05:30
parent f72c6bbe5b
commit c474aa5f5a
22 changed files with 1097 additions and 146 deletions
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -0,0 +1 @@
 graft plume/utils/gentle_preview
--- a/plume/cli/init.py
+++ b/plume/cli/init.py
@@ -7,12 +7,12 @@ from .eval import app as eval_app
 from .serve import app as serve_app
 app = typer.Typer()
-app.add_typer(data_app, name="data")
+app.add_typer(data_app)
-app.add_typer(ui_app, name="ui")
+app.add_typer(ui_app)
-app.add_typer(train_app, name="train")
+app.add_typer(train_app)
-app.add_typer(eval_app, name="eval")
+app.add_typer(eval_app)
-app.add_typer(serve_app, name="serve")
+app.add_typer(serve_app)
-app.add_typer(utils_app, name='utils')
+app.add_typer(utils_app)
 def main():
--- a/plume/cli/data/init.py
+++ b/plume/cli/data/init.py
@@ -27,6 +27,13 @@ app.add_typer(generate_app, name="generate")
 app.add_typer(wav2vec2_app, name="wav2vec2")
@app.callback()
 def data():
    """
    data sub commands
    """
@app.command()
 def fix_path(dataset_path: Path, force: bool = False):
    manifest_path = dataset_path / Path("manifest.json")
--- a/plume/cli/eval.py
+++ b/plume/cli/eval.py
@@ -3,3 +3,10 @@ from ..models.wav2vec2.eval import app as wav2vec2_app
 app = typer.Typer()
 app.add_typer(wav2vec2_app, name="wav2vec2")
@app.callback()
 def eval():
    """
    eval sub commands
    """
--- a/plume/cli/serve.py
+++ b/plume/cli/serve.py
@@ -5,3 +5,10 @@ from ..models.jasper.serve import app as jasper_app
 app = typer.Typer()
 app.add_typer(wav2vec2_app, name="wav2vec2")
 app.add_typer(jasper_app, name="jasper")
@app.callback()
 def serve():
    """
    serve sub commands
    """
--- a/plume/cli/train.py
+++ b/plume/cli/train.py
@@ -1,5 +1,12 @@
 import typer
-from ..models.wav2vec2.train import app as train_app
+from ..models.wav2vec2.train import app as wav2vec2_app
 app = typer.Typer()
-app.add_typer(train_app, name="wav2vec2")
+app.add_typer(wav2vec2_app, name="wav2vec2")
@app.callback()
 def train():
    """
    train sub commands
    """
--- a/plume/ui/init.py
+++ b/plume/ui/init.py
@@ -3,12 +3,20 @@ import sys
 from pathlib import Path
 from plume.utils import lazy_module
 # from streamlit import cli as stcli
-stcli = lazy_module('streamlit.cli')
+stcli = lazy_module("streamlit.cli")
 app = typer.Typer()
@app.callback()
 def ui():
    """
    ui sub commands
    """
@app.command()
 def annotation(data_dir: Path, dump_fname: Path = "ui_dump.json", task_id: str = ""):
    annotation_lit_path = Path(__file__).parent / Path("annotation.py")
@@ -40,13 +48,7 @@ def annotation(data_dir: Path, dump_fname: Path = "ui_dump.json", task_id: str =
@app.command()
 def preview(manifest_path: Path):
    annotation_lit_path = Path(__file__).parent / Path("preview.py")
-    sys.argv = [
+    sys.argv = ["streamlit", "run", str(annotation_lit_path), "--", str(manifest_path)]
        "streamlit",
        "run",
        str(annotation_lit_path),
        "--",
        str(manifest_path)
    ]
    sys.exit(stcli.main())
@@ -56,6 +58,18 @@ def collection(data_dir: Path, task_id: str = ""):
    pass
@app.command()
def alignment(preview_dir: Path, port: int = 8010):
    """Serve PREVIEW_DIR over HTTP on PORT until the process is interrupted.

    Files are served by ``RangeRequestHandler`` bound to ``preview_dir``; the
    server listens on all interfaces (empty host string).
    """
    from functools import partial
    from http.server import HTTPServer

    from RangeHTTPServer import RangeRequestHandler

    server_address = ("", port)
    handler_class = partial(RangeRequestHandler, directory=str(preview_dir))
    # The context manager calls server_close() on the way out, so the listening
    # socket is released even when serve_forever() is interrupted.
    with HTTPServer(server_address, handler_class) as httpd:
        httpd.serve_forever()
 def main():
    app()
--- a/plume/ui/annotation.py
+++ b/plume/ui/annotation.py
@@ -1,66 +1,14 @@
 # import sys
 from pathlib import Path
 from uuid import uuid4
 import streamlit as st
 import typer
-
+from plume.utils import ExtendedPath
-from plume.utils import ExtendedPath, get_mongo_conn
+from plume.utils.ui_persist import setup_mongo_asr_validation_state
 from plume.preview.st_rerun import rerun
 app = typer.Typer()
-
+setup_mongo_asr_validation_state(st)
 if not hasattr(st, "mongo_connected"):
    st.mongoclient = get_mongo_conn(col="asr_validation")
    mongo_conn = st.mongoclient
    st.task_id = str(uuid4())
    def current_cursor_fn():
        # mongo_conn = st.mongoclient
        cursor_obj = mongo_conn.find_one(
            {"type": "current_cursor", "task_id": st.task_id}
        )
        cursor_val = cursor_obj["cursor"]
        return cursor_val
    def update_cursor_fn(val=0):
        mongo_conn.find_one_and_update(
            {"type": "current_cursor", "task_id": st.task_id},
            {"$set": {"type": "current_cursor", "task_id": st.task_id, "cursor": val}},
            upsert=True,
        )
        rerun()
    def get_correction_entry_fn(code):
        return mongo_conn.find_one(
            {"type": "correction", "code": code}, projection={"_id": False}
        )
    def update_entry_fn(code, value):
        mongo_conn.find_one_and_update(
            {"type": "correction", "code": code},
            {"$set": {"value": value, "task_id": st.task_id}},
            upsert=True,
        )
    def set_task_fn(data_path, task_id):
        if task_id:
            st.task_id = task_id
        task_path = data_path / Path(f"task-{st.task_id}.lck")
        if not task_path.exists():
            print(f"creating task lock at {task_path}")
            task_path.touch()
    st.get_current_cursor = current_cursor_fn
    st.update_cursor = update_cursor_fn
    st.get_correction_entry = get_correction_entry_fn
    st.update_entry = update_entry_fn
    st.set_task = set_task_fn
    st.mongo_connected = True
    cursor_obj = mongo_conn.find_one({"type": "current_cursor", "task_id": st.task_id})
    if not cursor_obj:
        update_cursor_fn(0)
@st.cache()
--- a/plume/ui/preview.py
+++ b/plume/ui/preview.py
@@ -3,27 +3,11 @@ from pathlib import Path
 import streamlit as st
 import typer
 from plume.utils import ExtendedPath
-from plume.preview.st_rerun import rerun
+from plume.utils.ui_persist import setup_file_state
 app = typer.Typer()
-if not hasattr(st, "state_lock"):
+setup_file_state(st)
    # st.task_id = str(uuid4())
    task_path = ExtendedPath("preview.lck")
    def current_cursor_fn():
        return task_path.read_json()["current_cursor"]
    def update_cursor_fn(val=0):
        task_path.write_json({"current_cursor": val})
        rerun()
    st.get_current_cursor = current_cursor_fn
    st.update_cursor = update_cursor_fn
    st.state_lock = True
    # cursor_obj = mongo_conn.find_one({"type": "current_cursor", "task_id": st.task_id})
    # if not cursor_obj:
    update_cursor_fn(0)
@st.cache()
@@ -40,7 +24,7 @@ def main(manifest: Path):
        print("Invalid samplno resetting to 0")
        st.update_cursor(0)
    sample = asr_data[sample_no]
-    st.title(f"ASR Manifest Preview")
+    st.title("ASR Manifest Preview")
    st.markdown(f"{sample_no+1} of {len(asr_data)} : **{sample['text']}**")
    new_sample = st.number_input(
        "Go To Sample:", value=sample_no + 1, min_value=1, max_value=len(asr_data)
--- a/plume/utils/.gitignore
+++ b/plume/utils/.gitignore
@@ -0,0 +1,151 @@
 /data/
 /model/
 /train/
 .env*
 *.yaml
 *.yml
 *.json
 # Created by https://www.gitignore.io/api/python
 # Edit at https://www.gitignore.io/?templates=python
 ### Python ###
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 # C extensions
 *.so
 # Distribution / packaging
 .Python
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 pip-wheel-metadata/
 share/python-wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 MANIFEST
 # PyInstaller
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
 # Unit test / coverage reports
 htmlcov/
 .tox/
 .nox/
 .coverage
 .coverage.*
 .cache
 nosetests.xml
 coverage.xml
 *.cover
 .hypothesis/
 .pytest_cache/
 # Translations
 *.mo
 *.pot
 # Scrapy stuff:
 .scrapy
 # Sphinx documentation
 docs/_build/
 # PyBuilder
 target/
 # pyenv
 .python-version
 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 #   install all needed dependencies.
 #Pipfile.lock
 # celery beat schedule file
 celerybeat-schedule
 # SageMath parsed files
 *.sage.py
 # Spyder project settings
 .spyderproject
 .spyproject
 # Rope project settings
 .ropeproject
 # Mr Developer
 .mr.developer.cfg
 .project
 .pydevproject
 # mkdocs documentation
 /site
 # mypy
 .mypy_cache/
 .dmypy.json
 dmypy.json
 # Pyre type checker
 .pyre/
 # End of https://www.gitignore.io/api/python
 # Created by https://www.gitignore.io/api/macos
 # Edit at https://www.gitignore.io/?templates=macos
 ### macOS ###
 # General
 .DS_Store
 .AppleDouble
 .LSOverride
 # Icon must end with two \r
 Icon
 # Thumbnails
 ._*
 # Files that might appear in the root of a volume
 .DocumentRevisions-V100
 .fseventsd
 .Spotlight-V100
 .TemporaryItems
 .Trashes
 .VolumeIcon.icns
 .com.apple.timemachine.donotpresent
 # Directories potentially created on remote AFP share
 .AppleDB
 .AppleDesktop
 Network Trash Folder
 Temporary Items
 .apdisk
 # End of https://www.gitignore.io/api/macos
--- a/plume/utils/init.py
+++ b/plume/utils/init.py
@@ -11,12 +11,14 @@ from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
 import subprocess
 import shutil
 from urllib.parse import urlsplit
 # from .lazy_loader import LazyLoader
 from .lazy_import import lazy_callable, lazy_module
 # from ruamel.yaml import YAML
 # import boto3
 import typer
 # import pymongo
 # from slugify import slugify
 # import pydub
@@ -34,16 +36,16 @@ from .tts import app as tts_app
 from .transcribe import app as transcribe_app
 from .align import app as align_app
-boto3 = lazy_module('boto3')
+boto3 = lazy_module("boto3")
-pymongo = lazy_module('pymongo')
+pymongo = lazy_module("pymongo")
-pydub = lazy_module('pydub')
+pydub = lazy_module("pydub")
-audio_display = lazy_module('librosa.display')
+audio_display = lazy_module("librosa.display")
-plt = lazy_module('matplotlib.pyplot')
+plt = lazy_module("matplotlib.pyplot")
-librosa = lazy_module('librosa')
+librosa = lazy_module("librosa")
-YAML = lazy_callable('ruamel.yaml.YAML')
+YAML = lazy_callable("ruamel.yaml.YAML")
-num2words = lazy_callable('num2words.num2words')
+num2words = lazy_callable("num2words.num2words")
-slugify = lazy_callable('slugify.slugify')
+slugify = lazy_callable("slugify.slugify")
-compress = lazy_callable('natural.date.compress')
+compress = lazy_callable("natural.date.compress")
 app = typer.Typer()
 app.add_typer(tts_app, name="tts")
@@ -51,6 +53,13 @@ app.add_typer(align_app, name="align")
 app.add_typer(transcribe_app, name="transcribe")
@app.callback()
 def utils():
    """
    utils sub commands
    """
 logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 )
@@ -125,6 +134,10 @@ def upload_s3(dataset_path, s3_path):
    run_shell(f"aws s3 sync {dataset_path} {s3_path}")
 def copy_s3(dataset_path, s3_path):
    """Copy ``dataset_path`` to ``s3_path`` by running ``aws s3 cp`` through ``run_shell``."""
    # NOTE(review): both arguments are interpolated into the command string
    # unquoted; presumably run_shell hands the string to a shell, so a path
    # containing spaces would be split -- confirm against run_shell.
    run_shell(f"aws s3 cp {dataset_path} {s3_path}")
 def get_download_path(s3_uri, output_path):
    s3_uri_p = urlsplit(s3_uri)
    download_path = output_path / Path(s3_uri_p.path[1:])
@@ -135,11 +148,12 @@ def get_download_path(s3_uri, output_path):
 def s3_downloader():
    s3 = boto3.client("s3")
-    def download_s3(s3_uri, download_path):
+    def download_s3(s3_uri, download_path, verbose=False):
        s3_uri_p = urlsplit(s3_uri)
        download_path.parent.mkdir(exist_ok=True, parents=True)
        if not download_path.exists():
-            print(f"downloading {s3_uri} to {download_path}")
+            if verbose:
                print(f"downloading {s3_uri} to {download_path}")
            s3.download_file(s3_uri_p.netloc, s3_uri_p.path[1:], str(download_path))
    return download_s3
@@ -186,6 +200,7 @@ def ui_data_generator(dataset_dir, asr_data_source, verbose=False):
            plot_seg(wav_plot_path.absolute(), audio_file)
        return {
            "audio_path": str(rel_data_path),
            "audio_filepath": str(rel_data_path),
            "duration": round(audio_dur, 1),
            "text": transcript,
            "real_idx": num_datapoints,
@@ -229,17 +244,17 @@ def ui_dump_manifest_writer(dataset_dir, asr_data_source, verbose=False):
    )
    asr_manifest = dataset_dir / Path("manifest.json")
-    with asr_manifest.open("w") as mf:
+    asr_manifest_writer(asr_manifest, dump_data, verbose=verbose)
-        print(f"writing manifest to {asr_manifest}")
+    # with asr_manifest.open("w") as mf:
-        for d in dump_data:
+    #     print(f"writing manifest to {asr_manifest}")
-            rel_data_path = d["audio_path"]
+    #     for d in dump_data:
-            audio_dur = d["duration"]
+    #         rel_data_path = d["audio_path"]
-            transcript = d["text"]
+    #         audio_dur = d["duration"]
-            manifest = manifest_str(str(rel_data_path), audio_dur, transcript)
+    #         transcript = d["text"]
-            mf.write(manifest)
+    #         manifest = manifest_str(str(rel_data_path), audio_dur, transcript)
-
+    #         mf.write(manifest)
    ui_dump_file = dataset_dir / Path("ui_dump.json")
-    ExtendedPath(ui_dump_file).write_json({"data": dump_data})
+    ExtendedPath(ui_dump_file).write_json({"data": dump_data}, verbose=verbose)
    return num_datapoints
@@ -254,9 +269,10 @@ def asr_manifest_reader(data_manifest_path: Path):
        yield p
-def asr_manifest_writer(asr_manifest_path: Path, manifest_str_source):
+def asr_manifest_writer(asr_manifest_path: Path, manifest_str_source, verbose=False):
    with asr_manifest_path.open("w") as mf:
-        print(f"opening {asr_manifest_path} for writing manifest")
+        if verbose:
            print(f"writing asr manifest to {asr_manifest_path}")
        for mani_dict in manifest_str_source:
            manifest = manifest_str(
                mani_dict["audio_filepath"], mani_dict["duration"], mani_dict["text"]
@@ -293,37 +309,43 @@ def batch(iterable, n=1):
 class ExtendedPath(type(Path())):
    """docstring for ExtendedPath."""
-    def read_json(self):
+    def read_json(self, verbose=False):
-        print(f"reading json from {self}")
+        if verbose:
            print(f"reading json from {self}")
        with self.open("r") as jf:
            return json.load(jf)
-    def read_yaml(self):
+    def read_yaml(self, verbose=False):
        yaml = YAML(typ="safe", pure=True)
-        print(f"reading yaml from {self}")
+        if verbose:
            print(f"reading yaml from {self}")
        with self.open("r") as yf:
            return yaml.load(yf)
-    def read_jsonl(self):
+    def read_jsonl(self, verbose=False):
-        print(f"reading jsonl from {self}")
+        if verbose:
            print(f"reading jsonl from {self}")
        with self.open("r") as jf:
-            for l in jf.readlines():
+            for ln in jf.readlines():
-                yield json.loads(l)
+                yield json.loads(ln)
-    def write_json(self, data):
+    def write_json(self, data, verbose=False):
-        print(f"writing json to {self}")
+        if verbose:
            print(f"writing json to {self}")
        self.parent.mkdir(parents=True, exist_ok=True)
        with self.open("w") as jf:
            json.dump(data, jf, indent=2)
-    def write_yaml(self, data):
+    def write_yaml(self, data, verbose=False):
        yaml = YAML()
-        print(f"writing yaml to {self}")
+        if verbose:
            print(f"writing yaml to {self}")
        with self.open("w") as yf:
            yaml.dump(data, yf)
-    def write_jsonl(self, data):
+    def write_jsonl(self, data, verbose=False):
-        print(f"writing jsonl to {self}")
+        if verbose:
            print(f"writing jsonl to {self}")
        self.parent.mkdir(parents=True, exist_ok=True)
        with self.open("w") as jf:
            for d in data:
--- a/plume/utils/align.py
+++ b/plume/utils/align.py
@@ -1,12 +1,14 @@
 from pathlib import Path
 from .tts import GoogleTTS
 # from IPython import display
 import requests
 import io
-import typer
+import shutil
 import typer
 from plume.utils import lazy_module
 from .tts import GoogleTTS
 display = lazy_module('IPython.display')
 pydub = lazy_module('pydub')
@@ -63,16 +65,19 @@ def gentle_preview(
    audio_path: Path,
    transcript_path: Path,
    service_uri="http://101.53.142.218:8765/transcriptions",
-    gent_preview_dir="../gentle_preview",
+    gent_preview_dir="./gentle_preview",
 ):
    from . import ExtendedPath
-    ab = audio_path.read_bytes()
+    pkg_gentle_dir = Path(__file__).parent / 'gentle_preview'
-    tt = transcript_path.read_text()
+
-    audio, alignment = gentle_aligner(service_uri, ab, tt)
+    shutil.copytree(str(pkg_gentle_dir), str(gent_preview_dir))
-    audio.export(gent_preview_dir / Path("a.wav"), format="wav")
+    # ab = audio_path.read_bytes()
-    alignment["status"] = "OK"
+    # tt = transcript_path.read_text()
-    ExtendedPath(gent_preview_dir / Path("status.json")).write_json(alignment)
+    # audio, alignment = gentle_aligner(service_uri, ab, tt)
    # audio.export(gent_preview_dir / Path("a.wav"), format="wav")
    # alignment["status"] = "OK"
    # ExtendedPath(gent_preview_dir / Path("status.json")).write_json(alignment)
 def main():
--- a/plume/utils/gentle_preview/README.md
+++ b/plume/utils/gentle_preview/README.md
@@ -0,0 +1,5 @@
 Serve this directory with https://github.com/danvk/RangeHTTPServer
 (a CORS-enabled alternative is `https://github.com/claysciences/CORSRangeHTTPServer`):
 `python -m RangeHTTPServer`
 or with the standard-library server:
 `python -m http.server`
--- a/plume/utils/gentle_preview/align.html
+++ b/plume/utils/gentle_preview/align.html
@@ -0,0 +1,80 @@
 <html>
  <head>
    <meta charset="utf-8" />
    <style>
      body {font-family: sans-serif; padding-top: 70px; }
      textarea { width: 500px; height: 20em; }
      input, textarea { margin: 1em 0; }
      #header {
          position: fixed;
          top: 0;
          left: 0;
          height: 50px;
          line-height: 50px;
          width: 100%;
          background-color: #999;
          box-shadow: 0px 0px 5px 0px rgba(0,0,0,0.5);
          font-family: Helvetica, sans-serif;
      }
      #header, #header a {
          color: white;
      }
      .home {
          margin: 0;
          font-size: 125%;
          font-weight: lighter;
          text-transform: lowercase;
      }
      .home a {
          margin: 0;
          background: #666;
          padding-left: 25px;
          padding-right: 30px;
          margin-right: 20px;
          float: left;
          text-decoration: none;
      }
      .home:hover a {
          background: #555;
      }
      #align-button {
        background: #CCC;
        border: 0;
        font-size: 18px;
        padding: 10px 30px;
        cursor: pointer;
      }
      #alignment-flags {
        background: #CCC;
        border: 0;
        font-size: 18px;
        padding: 10px 30px;
      }
      #footer {
        margin-top: 100px;
        border-top: 1px dotted black;
        font-size: 8pt;
      font-style: italic;
      padding: 10px;
      }
    </style>
  </head>
  <body>
    <div id="header">
      <h1 class="home"><a href="/">Gentle</a></h1>
    </div>
    <form action="/transcriptions" method="POST" enctype="multipart/form-data">
      Audio:<br>
      <input type=file name=audio><br>
      <br>
      Transcript:<br>
      <textarea name="transcript"></textarea><br>
      <input id=alignment-flags name=conservative type=checkbox> Conservative<br>
      <input id=alignment-flags name=disfluency type=checkbox> Include disfluencies<br>
      <input id="align-button" type=submit value=Align>
    </form>
    <div id="footer">
      <a href="https://lowerquality.com/gentle">Gentle</a> is free software released under the <a href="https://opensource.org/licenses/MIT">MIT license</a>. <a href="https://lowerquality.com/gentle">Homepage</a> | <a href="https://github.com/lowerquality/gentle">Source code</a>.
    </div>
  </body>
 </html>
--- a/plume/utils/gentle_preview/index.html
+++ b/plume/utils/gentle_preview/index.html
@@ -0,0 +1,408 @@
 <html>
  <head>
    <meta charset="utf-8" />
    <style>
 html, body {
    margin: 0;
    padding: 0;
 }
 #header {
    position: fixed;
    top: 0;
    left: 0;
    height: 50px;
    line-height: 50px;
    width: 100%;
    background-color: #999;
    box-shadow: 0px 0px 5px 0px rgba(0,0,0,0.5);
    font-family: Helvetica, sans-serif;
 }
 #header, #header a {
    color: white;
 }
 #downloads {
    float: right;
    background: #999;
 }
 .download {
    float: right;
    background: #999;
    padding: 0 5px;
 }
 .home {
  margin: 0;
  font-size: 125%;
  font-weight: lighter;
  text-transform: lowercase;
 }
 .home a {
  margin: 0;
  background: #666;
  padding-left: 25px;
  padding-right: 30px;
  margin-right: 20px;
  float: left;
  text-decoration: none;
 }
 .home:hover a {
  background: #555;
 }
 #audio {
    margin-top: 9px;
    width: 50%;
    display: inline-block;
 }
 #transcript {
    margin: 0 15px;
    margin-top: 70px;
    margin-bottom: 5em;
    white-space: pre-wrap;
    line-height: 2em;
    max-width: 600px;
    color: #999;
 }
 #transcript.status {
    background-color: #333;
    color: #fff;
    font-family: Courier, mono;
    line-height: 1em;
    font-size: 10pt;
    max-width: 100%;
 }
 #transcript.status h2 {
    padding: 10px;
 }
 #transcript.status .entry {
    margin-bottom: 10px;
    padding: 10px;
 }
 #transcript.status progress {
    width: 100%;
    height: 30px;
    margin-bottom: 20px;
 }
 .success {
    color: black;
 }
 .success:hover {
    text-decoration: underline;
 }
 .active {
    color: magenta;
 }
 #preloader {
    visibility: hidden;
 }
 .phactive {
    text-decoration: underline;
 }
 .phones {
    position: absolute;
    color: #333;
 }
 .phones .phone {
    margin-right: 5px;
    font-family: Helvetica, sans-serif;
    text-transform: uppercase;
    font-size: 50%;
 }
 .phones .phone:last-child {
    margin-right: 0;
 }
 #footer {
  margin-top: 100px;
  border-top: 1px dotted black;
  font-size: 8pt;
  font-style: italic;
  font-family: Helvetica, sans-serif;
  padding: 10px;
 }
    </style>
  </head>
  <body>
    <div id="header">
      <!-- <h1 class="home"><a href="/">Gentle</a></h1> -->
      <audio id="audio" src="a.wav" controls="true" preload="auto"></audio>
      <img src="/preloader.gif" id="preloader" alt="loading...">
      <span id="downloads"> </span>
    </div>
    <div id="transcript"></div>
    <!-- <div id="footer">
      <a href="https://lowerquality.com/gentle">Gentle</a> is free software released under the <a href="https://opensource.org/licenses/MIT">MIT license</a>. <a href="https://lowerquality.com/gentle">Homepage</a> | <a href="https://github.com/lowerquality/gentle">Source code</a>.
    </div> -->
    <script>
 // Issue an asynchronous GET for `url` and hand the raw response text to `cb`.
 function get(url, cb) {
    var request = new XMLHttpRequest();
    request.onload = function() {
        // `this` is the XMLHttpRequest whose load just completed.
        cb(this.responseText);
    };
    request.open("GET", url, true);
    request.send();
 }
 // Fetch `url` through get() and pass the JSON-decoded response body to `cb`.
 function get_json(url, cb) {
    var on_text = function(text) {
        var parsed = JSON.parse(text);
        cb(parsed);
    };
    get(url, on_text);
 }
 // The <audio id="audio"> player that drives playback and highlighting.
 var $a = document.getElementById("audio");
 // Space bar (keyCode 32): suppress the browser's default action and pause playback.
 window.onkeydown = function(ev) {
    if(ev.keyCode == 32) {
        ev.preventDefault();
        $a.pause();
    }
 }
 // Container the transcript (or status/progress output) is rendered into.
 var $trans = document.getElementById("transcript");
 // Loading indicator image; update() toggles its visibility.
 var $preloader = document.getElementById('preloader');
 // Word entries of the current alignment; assigned by render().
 var wds = [];
 // Word currently highlighted; maintained by highlight_word().
 var cur_wd;
 // Floating strip that shows the phonemes of the active word (filled by render_phones()).
 var $phones = document.createElement("div");
 $phones.className = "phones";
 document.body.appendChild($phones);
 var cur_phones$ = [];           // List of phoneme $divs
 // Phoneme element currently carrying the "phactive" class, if any.
 var $active_phone;
 // Rebuild the floating phoneme strip for word `wd` and position it under the
 // word's element (wd.$div), horizontally centred on it.
 // Resets cur_phones$ and $active_phone as a side effect.
 function render_phones(wd) {
    cur_phones$ = [];
    $phones.innerHTML = "";
    $active_phone = null;
    // CSS lengths need a unit: a bare number is not a valid length outside
    // quirks mode, so always append "px".
    $phones.style.top = (wd.$div.offsetTop + 18) + "px";
    // Provisional position; the strip is centred once its width is known.
    $phones.style.left = wd.$div.offsetLeft + "px";
    // Tolerate words that carry no phoneme list.
    (wd.phones || [])
        .forEach(function(ph){
            var $p = document.createElement("span");
            $p.className = "phone";
            // Show only the part of the phone label before the first "_".
            $p.textContent = ph.phone.split("_")[0];
            $phones.appendChild($p);
            cur_phones$.push($p);
        });
    var offsetToCenter = (wd.$div.offsetWidth - $phones.offsetWidth) / 2;
    $phones.style.left = (wd.$div.offsetLeft + offsetToCenter) + "px";
 }
 // Underline the phoneme of the current word (cur_wd) that is sounding at
 // playback time `t`. Clears the phoneme strip when no word is active.
 function highlight_phone(t) {
    if(!cur_wd) {
        $phones.innerHTML = "";
        return;
    }
    var hit;
    // Phonemes only carry durations, so walk them from the word's start time.
    var cur_t = cur_wd.start;
    (cur_wd.phones || []).forEach(function(ph, idx) {
        if(cur_t <= t && cur_t + ph.duration >= t) {
            hit = idx;
        }
        cur_t += ph.duration;
    });
    // Index 0 is a valid hit: compare against undefined instead of relying on
    // truthiness, otherwise the first phoneme of a word is never highlighted.
    if(hit === undefined) {
        return;
    }
    var $ph = cur_phones$[hit];
    if($ph != $active_phone) {
        if($active_phone) {
            $active_phone.classList.remove("phactive");
        }
        if($ph) {
            $ph.classList.add("phactive");
        }
    }
    $active_phone = $ph;
 }
 // Per-frame tick: mark the word under the audio playhead as "active", refresh
 // its phoneme strip, then re-schedule itself via requestAnimationFrame.
 function highlight_word() {
    var now = $a.currentTime;
    // XXX: O(N); use binary search
    var matches = wds.filter(function(w) {
        // Require a 10 ms margin inside the word on both sides.
        return (now - w.start) > 0.01 && (w.end - now) > 0.01;
    });
    // When several words match, the last one wins.
    var next_wd = matches[matches.length - 1];
    if(next_wd != cur_wd) {
        var lit = document.querySelectorAll('.active');
        for(var k = 0; k < lit.length; k++) {
            lit[k].classList.remove('active');
        }
        if(next_wd && next_wd.$div) {
            next_wd.$div.classList.add('active');
            render_phones(next_wd);
        }
    }
    cur_wd = next_wd;
    highlight_phone(now);
    window.requestAnimationFrame(highlight_word);
 }
 window.requestAnimationFrame(highlight_word);
 $trans.innerHTML = "Loading...";
 // Render the alignment result `ret` into $trans: every word found in the
 // transcript becomes a <span> (clickable to seek the audio when it has a start
 // time); the text between words is copied through as plain text nodes.
 // Stores the word list in the global `wds` and each word's element in wd.$div.
 function render(ret) {
    wds = ret['words'] || [];
    // Keep the transcript local (the bare assignment created an implicit
    // global) and fall back to '' so slice() cannot throw when it is missing.
    var transcript = ret['transcript'] || '';
    $trans.innerHTML = '';
    // Index into `transcript` up to which text has already been emitted.
    var currentOffset = 0;
    wds.forEach(function(wd) {
        if(wd.case == 'not-found-in-transcript') {
            // TODO: show phonemes somewhere
            // No transcript position for this word: emit it as plain text.
            $trans.appendChild(document.createTextNode(' ' + wd.word));
            return;
        }
        // Add non-linked text
        if(wd.startOffset > currentOffset) {
            var gap = transcript.slice(currentOffset, wd.startOffset);
            $trans.appendChild(document.createTextNode(gap));
            currentOffset = wd.startOffset;
        }
        var $wd = document.createElement('span');
        var txt = transcript.slice(wd.startOffset, wd.endOffset);
        $wd.appendChild(document.createTextNode(txt));
        wd.$div = $wd;
        // Only words with a start time are styled as aligned.
        if(wd.start !== undefined) {
            $wd.className = 'success';
        }
        $wd.onclick = function() {
            if(wd.start !== undefined) {
                console.log(wd.start);
                $a.currentTime = wd.start;
                $a.play();
            }
        };
        $trans.appendChild($wd);
        currentOffset = wd.endOffset;
    });
    // Whatever follows the last word.
    $trans.appendChild(document.createTextNode(transcript.slice(currentOffset)));
 }
 // Fill the #downloads element with links to the alignment exports.
 function show_downloads() {
    var $box = document.getElementById("downloads");
    $box.textContent = "Download as: ";
    var uid = window.location.pathname.split("/")[2];
    var on_file_url = (window.location.protocol == "file:");
    // Each row: label, href, tooltip, and an optional flag that hides the
    // link when the page is opened from a file:/// URL.
    var formats = [
        ["CSV", "align.csv", "Word alignment CSV"],
        ["JSON", "align.json", "JSON word/phoneme alignment data"],
        ["Zip", "/zip/" + uid + ".zip", "Standalone zipfile", true]
    ];
    for(var i = 0; i < formats.length; i++) {
        var row = formats[i];
        if(row[3] && on_file_url) {
            continue;
        }
        var $link = document.createElement("a");
        $link.className = "download";
        $link.textContent = row[0];
        $link.href = row[1];
        $link.title = row[2];
        $box.appendChild($link);
    }
 }
 // True once render_status() has replaced the transcript view with the progress UI.
 var status_init = false;
 var status_log  = [];		// status objects already shown as log entries
 // The <progress> element created on the first render_status() call.
 var $status_pro;
 // Show a progress report `ret` (status / percent / message) in $trans.
 // The first call swaps the transcript view for a heading and a progress bar;
 // later calls update the bar and, while transcribing, add log entries.
 function render_status(ret) {
    if(!status_init) {
 	// Clobber the $trans div and use it for status updates
 	$trans.innerHTML = "<h2>transcription in progress</h2>";
 	$trans.className = "status";
 	$status_pro = document.createElement("progress");
 	$status_pro.setAttribute("min", "0");
 	$status_pro.setAttribute("max", "100");
 	$status_pro.value = 0;
 	$trans.appendChild($status_pro);
 	status_init = true;
    }
    // Any status other than TRANSCRIBING only moves the progress bar.
    if(ret.status !== "TRANSCRIBING") {
 	if(ret.percent) {
 	    // ret.percent is scaled by 100 to fit the bar's 0-100 range.
 	    $status_pro.value = (100*ret.percent);
 	}
    }
    // While transcribing, log a new entry only when percent advanced past the last logged one.
    else if(ret.percent && (status_log.length == 0 || status_log[status_log.length-1].percent+0.0001 < ret.percent)) {
 	// New entry
 	var $entry = document.createElement("div");
 	$entry.className = "entry";
 	$entry.textContent = ret.message;
 	ret.$div = $entry;
 	if(ret.percent) {
 	    $status_pro.value = (100*ret.percent);
 	}
 	// Insert before the previous entry so the newest message is listed first.
 	if(status_log.length > 0) {
 	    $trans.insertBefore($entry, status_log[status_log.length-1].$div);
 	}
 	else {
 	    $trans.appendChild($entry);
 	}
 	status_log.push(ret);
    }
 }
 // Entry point: render inlined alignment data when INLINE_JSON is set, otherwise
 // fetch status.json and either render the result, show progress, or poll again.
 function update() {
    if(INLINE_JSON) {
        // We want this to work from file:/// domains, so we provide a
        // mechanism for inlining the alignment data.
        render(INLINE_JSON);
        // show_downloads();
    }
    else  {
 	// Show the status
        get_json('status.json', function(ret) {
 	          // Hide the player until the status turns out to be OK.
 	          $a.style.visibility = 'hidden';
            if (ret.status == 'ERROR') {
                $preloader.style.visibility = 'hidden';
                // NOTE(review): ret.error is inserted as HTML without escaping.
                $trans.innerHTML = '<b>' + ret.status + ': ' + ret.error + '</b>';
            } else if (ret.status == 'TRANSCRIBING' || ret.status == 'ALIGNING') {
                $preloader.style.visibility = 'visible';
                render_status(ret);
                // Poll again in 2 s while work is in progress.
                setTimeout(update, 2000);
            } else if (ret.status == 'OK') {
                // show_downloads();
                $preloader.style.visibility = 'hidden';
 		// XXX: should we fetch the align.json?
 		// window.location.reload();
                $a.style.visibility = 'visible';
                render(ret);
            } else if (ret.status == 'ENCODING' || ret.status == 'STARTED') {
                $preloader.style.visibility = 'visible';
                $trans.innerHTML = 'Encoding, please wait...';
                setTimeout(update, 2000);
            } else {
 		// Unrecognised status: log it, show it, and retry in 5 s.
 		console.log("unknown status", ret);
                $preloader.style.visibility = 'hidden';
                $trans.innerHTML = ret.status + '...';
                setTimeout(update, 5000);
            }
        });
    }
 }
 var INLINE_JSON;
 update();
 </script></body></html>
--- a/plume/utils/gentle_preview/preloader.gif
+++ b/plume/utils/gentle_preview/preloader.gif
--- a/plume/utils/st_rerun.py
+++ b/plume/utils/st_rerun.py
--- a/plume/utils/transcribe.py
+++ b/plume/utils/transcribe.py
@@ -8,12 +8,11 @@ import typer
 # import rpyc
 # from tqdm import tqdm
 # from pydub import AudioSegment
 # from pydub.silence import split_on_silence
 from plume.utils import lazy_module, lazy_callable
 rpyc = lazy_module('rpyc')
-AudioSegment = lazy_callable('pydub.AudioSegment')
+pydub = lazy_module('pydub')
 split_on_silence = lazy_callable('pydub.silence.split_on_silence')
 app = typer.Typer()
@@ -106,7 +105,7 @@ def triton_transcribe_grpc_gen(
        #     ]
        #     pass
        transcript_list = []
-        sil_pad = AudioSegment.silent(duration=sil_msec)
+        sil_pad = pydub.AudioSegment.silent(duration=sil_msec)
        for seg in chunks:
            t_seg = sil_pad + seg + sil_pad
            c_transcript = transcriber(t_seg)
@@ -124,9 +123,7 @@ def triton_transcribe_grpc_gen(
@app.command()
 def file(audio_file: Path, write_file: bool = False, chunked=True):
-    from pydub import AudioSegment
+    aseg = pydub.AudioSegment.from_file(audio_file)
    aseg = AudioSegment.from_file(audio_file)
    transcriber, prep = triton_transcribe_grpc_gen()
    transcription = transcriber(prep(aseg))
@@ -139,10 +136,8 @@ def file(audio_file: Path, write_file: bool = False, chunked=True):
@app.command()
 def benchmark(audio_file: Path):
    from pydub import AudioSegment
    transcriber, audio_prep = transcribe_rpyc_gen()
-    file_seg = AudioSegment.from_file(audio_file)
+    file_seg = pydub.AudioSegment.from_file(audio_file)
    aud_seg = audio_prep(file_seg)
    def timeinfo():
--- a/plume/utils/tts.py
+++ b/plume/utils/tts.py
@@ -27,6 +27,10 @@ class GoogleTTS(object):
            audio_encoding=texttospeech.enums.AudioEncoding.LINEAR16,
            sample_rate_hertz=params["sample_rate"],
        )
        if 'speaking_rate' in params:
            audio_config.speaking_rate = params['speaking_rate']
        if 'pitch' in params:
            audio_config.pitch = params['pitch']
        response = self.client.synthesize_speech(tts_input, voice, audio_config)
        audio_content = response.audio_content
        return audio_content
@@ -74,6 +78,19 @@ class GoogleTTS(object):
            )
        return results
    @classmethod
    def voice_by_name(cls, name):
        """Return the voice entry whose ``'name'`` field equals ``name``.

        The entries produced by ``cls.voice_list()`` are scanned in order and
        the first match is returned.

        Raises:
            ValueError: if no entry carries the requested name.
        """
        no_match = object()
        hit = next(
            (entry for entry in cls.voice_list() if entry['name'] == name),
            no_match,
        )
        if hit is no_match:
            raise ValueError(f'{name} not a valid voice')
        return hit
@app.command()
 def generate_audio_file(text, dest_path: Path = "./tts_audio.wav", voice="en-US-Wavenet-D"):
--- a/plume/utils/ui_persist.py
+++ b/plume/utils/ui_persist.py
@@ -0,0 +1,85 @@
 from plume.utils import ExtendedPath, get_mongo_conn
 from plume.utils.st_rerun import rerun
 from uuid import uuid4
 from pathlib import Path
 def setup_file_state(st):
    """Attach file-backed cursor helpers to ``st`` the first time it is called.

    ``st`` is presumably the streamlit module (verify against callers). When
    ``st`` has no ``state_lock`` attribute yet, this installs:

    * ``st.get_current_cursor()`` - reads ``"current_cursor"`` from the JSON
      stored in ``preview.lck``.
    * ``st.update_cursor(val=0)`` - writes ``{"current_cursor": val}`` to the
      same file and then calls ``rerun()``.

    It then marks ``st.state_lock`` and stores an initial cursor of 0.
    Subsequent calls are no-ops.
    """
    if hasattr(st, "state_lock"):
        return

    lock_file = ExtendedPath("preview.lck")

    def read_cursor():
        state = lock_file.read_json()
        return state["current_cursor"]

    def write_cursor(val=0):
        lock_file.write_json({"current_cursor": val})
        rerun()

    # Everything is attached before the first write because write_cursor ends
    # in rerun(), and the original ordering is preserved here.
    st.get_current_cursor = read_cursor
    st.update_cursor = write_cursor
    st.state_lock = True
    write_cursor(0)
 def setup_mongo_asr_validation_state(st):
    """Attach MongoDB-backed validation-state helpers to ``st`` (first call only).

    ``st`` is presumably the streamlit module (verify against callers). When
    ``st`` has no ``mongo_connected`` attribute, this obtains a connection via
    ``get_mongo_conn(col="asr_validation")`` (assumed to behave like a pymongo
    collection), assigns a fresh UUID to ``st.task_id`` and installs:

    * ``st.get_current_cursor()`` - cursor stored for the current task, or 0
      when no cursor document exists for that task yet.
    * ``st.update_cursor(val=0)`` - upserts the cursor and calls ``rerun()``.
    * ``st.get_correction_entry(code)`` - correction document for ``code``
      without its ``_id`` field.
    * ``st.update_entry(code, value)`` - upserts a correction for ``code``,
      tagging it with the current task id.
    * ``st.set_task(data_path, task_id)`` - optionally switches ``st.task_id``
      and ensures ``task-<id>.lck`` exists under ``data_path``.

    Finally the cursor for the new task is initialised to 0 if it is missing.
    """
    if not hasattr(st, "mongo_connected"):
        st.mongoclient = get_mongo_conn(col="asr_validation")
        mongo_conn = st.mongoclient
        st.task_id = str(uuid4())

        def current_cursor_fn():
            cursor_obj = mongo_conn.find_one(
                {"type": "current_cursor", "task_id": st.task_id}
            )
            # set_task can switch st.task_id to a task that has no cursor
            # document yet; subscripting the missing result would raise
            # TypeError. Fall back to 0, the same default used at setup.
            if not cursor_obj:
                return 0
            return cursor_obj["cursor"]

        def update_cursor_fn(val=0):
            mongo_conn.find_one_and_update(
                {"type": "current_cursor", "task_id": st.task_id},
                {
                    "$set": {
                        "type": "current_cursor",
                        "task_id": st.task_id,
                        "cursor": val,
                    }
                },
                upsert=True,
            )
            rerun()

        def get_correction_entry_fn(code):
            return mongo_conn.find_one(
                {"type": "correction", "code": code}, projection={"_id": False}
            )

        def update_entry_fn(code, value):
            mongo_conn.find_one_and_update(
                {"type": "correction", "code": code},
                {"$set": {"value": value, "task_id": st.task_id}},
                upsert=True,
            )

        def set_task_fn(data_path, task_id):
            # An empty/falsy task_id keeps the UUID generated at setup time.
            if task_id:
                st.task_id = task_id
            task_path = data_path / Path(f"task-{st.task_id}.lck")
            if not task_path.exists():
                print(f"creating task lock at {task_path}")
                task_path.touch()

        st.get_current_cursor = current_cursor_fn
        st.update_cursor = update_cursor_fn
        st.get_correction_entry = get_correction_entry_fn
        st.update_entry = update_entry_fn
        st.set_task = set_task_fn
        st.mongo_connected = True
        # Helpers are attached before this point because update_cursor_fn
        # finishes by calling rerun().
        cursor_obj = mongo_conn.find_one(
            {"type": "current_cursor", "task_id": st.task_id}
        )
        if not cursor_obj:
            update_cursor_fn(0)
--- a/plume/utils/vad.py
+++ b/plume/utils/vad.py
@@ -0,0 +1,205 @@
 import logging
 import asyncio
 import argparse
 from pathlib import Path
 import webrtcvad
 import pydub
 from pydub.playback import play
 from pydub.utils import make_chunks
 DEFAULT_CHUNK_DUR = 20
 logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 )
 logger = logging.getLogger(__name__)
 def is_frame_voice(vad, seg, chunk_dur):
    """Tell whether ``seg`` is a full-length frame that ``vad`` marks as speech.

    ``chunk_dur`` is the expected frame length in milliseconds. A frame whose
    ``duration_seconds`` differs from it is reported as non-voice without
    consulting ``vad`` at all.
    """
    expected_secs = chunk_dur / 1000
    if seg.duration_seconds != expected_secs:
        return False
    return bool(vad.is_speech(seg.raw_data, seg.frame_rate))
 class VADFilterAudio(object):
    """Strip non-speech frames from an audio segment with a WebRTC VAD."""

    def __init__(self, chunk_dur=DEFAULT_CHUNK_DUR):
        """Create the filter.

        Args:
            chunk_dur: frame length in milliseconds used when slicing audio.
        """
        super().__init__()
        self.chunk_dur = chunk_dur
        self.vad = webrtcvad.Vad()

    def filter_segment(self, wav_seg):
        """Return a new segment containing only the voiced frames of ``wav_seg``.

        The input is cut into ``chunk_dur``-millisecond frames. Frames that
        ``is_frame_voice`` accepts are concatenated; the result keeps the
        input's frame rate, channel count and sample width.
        """
        # No need to drop the last chunk by hand: is_frame_voice already
        # rejects a short trailing frame, and slicing it off unconditionally
        # discarded a valid full-length final frame.
        voiced_frames = [
            chunk.raw_data
            for chunk in make_chunks(wav_seg, self.chunk_dur)
            if is_frame_voice(self.vad, chunk, self.chunk_dur)
        ]
        return pydub.AudioSegment(
            # Single join instead of repeated bytes concatenation.
            data=b"".join(voiced_frames),
            frame_rate=wav_seg.frame_rate,
            channels=wav_seg.channels,
            sample_width=wav_seg.sample_width,
        )
 class VADUtterance(object):
    """Group a stream of audio chunks into utterances using a WebRTC VAD.

    Every incoming chunk is classified with ``is_frame_voice``. Voiced chunks
    are appended to a voice buffer and unvoiced chunks to a silence buffer;
    only the voice buffer is ever yielded, so emitted audio contains voiced
    chunks only.
    """

    def __init__(
        self,
        max_silence=500,
        min_utterance=280,
        max_utterance=20000,
        chunk_dur=DEFAULT_CHUNK_DUR,
        start_cycles=3,
    ):
        """Store the segmentation thresholds.

        Args:
            max_silence: accumulated silence (ms) that ends an utterance.
            min_utterance: minimum voiced duration (ms) for a buffer to be
                yielded when silence ends it; shorter buffers are discarded.
            max_utterance: voiced duration (ms) at which the buffer is yielded
                even without a pause.
            chunk_dur: expected duration (ms) of each incoming chunk.
            start_cycles: number of chunk durations of voice after which
                ``stream_events`` reports that speaking has started.
        """
        super(VADUtterance, self).__init__()
        self.vad = webrtcvad.Vad()
        self.chunk_dur = chunk_dur
        # duration in millisecs
        self.max_sil = max_silence
        self.min_utt = min_utterance
        self.max_utt = max_utterance
        # voiced duration (ms) needed before "started speaking" is signalled
        self.speech_start = start_cycles * chunk_dur

    def __repr__(self):
        """Return a short description listing the three duration thresholds."""
        return f"VAD(max_silence={self.max_sil},min_utterance:{self.min_utt},max_utterance:{self.max_utt})"

    async def stream_utterance(self, audio_stream):
        """Yield voice buffers segmented out of ``audio_stream``.

        ``audio_stream`` is an async iterable of chunks. A buffer is yielded
        when the voiced duration reaches ``max_utt``, when silence reaches
        ``max_sil`` and the voiced duration is at least ``min_utt``, and once
        more at the end of the stream if the voice buffer is still truthy.
        """
        silence_buffer = pydub.AudioSegment.empty()
        voice_buffer = pydub.AudioSegment.empty()
        # NOTE(review): in this method the flag is only written, never used to
        # gate a yield; it mirrors the flag used in stream_events.
        silence_threshold = False
        async for c in audio_stream:
            voice_frame = is_frame_voice(self.vad, c, self.chunk_dur)
            logger.debug(f"is audio stream voice? {voice_frame}")
            if voice_frame:
                # any voiced chunk restarts the silence measurement
                silence_threshold = False
                voice_buffer += c
                silence_buffer = pydub.AudioSegment.empty()
            else:
                silence_buffer += c
            # Both durations (ms) are computed once per chunk, before either
            # threshold check below.
            voc_dur = voice_buffer.duration_seconds * 1000
            sil_dur = silence_buffer.duration_seconds * 1000
            if voc_dur >= self.max_utt:
                logger.info(
                    f"detected voice overflow: voice duration {voice_buffer.duration_seconds}"
                )
                yield voice_buffer
                voice_buffer = pydub.AudioSegment.empty()
            if sil_dur >= self.max_sil:
                if voc_dur >= self.min_utt:
                    logger.info(
                        f"detected silence: voice duration {voice_buffer.duration_seconds}"
                    )
                    yield voice_buffer
                # The voice buffer is cleared whenever the silence threshold is
                # reached, so voice shorter than min_utt is dropped here
                # (indent this statement to keep it instead).
                voice_buffer = pydub.AudioSegment.empty()
                if not silence_threshold:
                    silence_threshold = True
        # flush whatever voice remains when the stream ends
        if voice_buffer:
            yield voice_buffer

    async def stream_events(self, audio_stream):
        """Yield ``(event_code, payload)`` tuples describing speech activity.

        yields 0, voice_buffer for SpeechBuffer
        yields 1, None for StartedSpeaking
        yields 2, None for StoppedSpeaking

        A pass-through event ``(4, chunk)`` for every incoming chunk exists
        only as a commented-out line and is currently not emitted.
        """
        silence_buffer = pydub.AudioSegment.empty()
        voice_buffer = pydub.AudioSegment.empty()
        silence_threshold, started_speaking = False, False
        async for c in audio_stream:
            # yield (4, c)
            voice_frame = is_frame_voice(self.vad, c, self.chunk_dur)
            logger.debug(f"is audio stream voice? {voice_frame}")
            if voice_frame:
                # any voiced chunk restarts the silence measurement
                silence_threshold = False
                voice_buffer += c
                silence_buffer = pydub.AudioSegment.empty()
            else:
                silence_buffer += c
            # Durations (ms) are computed once per chunk, before the checks.
            voc_dur = voice_buffer.duration_seconds * 1000
            sil_dur = silence_buffer.duration_seconds * 1000
            # signal the start of speech once per utterance
            if voc_dur >= self.speech_start and not started_speaking:
                started_speaking = True
                yield (1, None)
            if voc_dur >= self.max_utt:
                logger.info(
                    f"detected voice overflow: voice duration {voice_buffer.duration_seconds}"
                )
                yield (0, voice_buffer)
                voice_buffer = pydub.AudioSegment.empty()
                started_speaking = False
            if sil_dur >= self.max_sil:
                if voc_dur >= self.min_utt:
                    logger.info(
                        f"detected silence: voice duration {voice_buffer.duration_seconds}"
                    )
                    yield (0, voice_buffer)
                # The voice buffer is cleared whenever the silence threshold is
                # reached, so voice shorter than min_utt is dropped here
                # (indent this statement to keep it instead).
                voice_buffer = pydub.AudioSegment.empty()
                started_speaking = False
                # StoppedSpeaking is emitted only the first time the threshold
                # is crossed after the last voiced chunk.
                if not silence_threshold:
                    silence_threshold = True
                    yield (2, None)
        # flush whatever voice remains when the stream ends
        if voice_buffer:
            yield (0, voice_buffer)

    @classmethod
    async def stream_utterance_file(cls, audio_file):
        """Segment ``audio_file`` into utterances and play each one.

        The file is loaded with pydub, converted to a 32000 frame rate and cut
        into ``DEFAULT_CHUNK_DUR`` chunks, which are fed to
        ``stream_utterance`` on a default-constructed instance. Each yielded
        buffer is played, followed by a one second pause.
        """
        async def stream_gen():
            # wrap the pre-cut chunks in an async generator for stream_utterance
            audio_seg = pydub.AudioSegment.from_file(audio_file).set_frame_rate(32000)
            chunks = make_chunks(audio_seg, DEFAULT_CHUNK_DUR)
            for c in chunks:
                yield c

        va_ut = cls()
        buffer_src = va_ut.stream_utterance(stream_gen())
        async for buf in buffer_src:
            play(buf)
            await asyncio.sleep(1)
 class VADStreamGen(object):
    """Minimal holder that keeps its constructor argument on ``self.arg``."""

    def __init__(self, arg):
        """Remember ``arg`` unchanged."""
        super().__init__()
        self.arg = arg
 def main():
    """Command-line entry point: split an audio file into utterances and play them.

    Parses ``--audio_file`` (opened in binary mode by argparse, defaulting to
    ``./test_utter2.wav``) and runs ``VADUtterance.stream_utterance_file`` on
    it until the stream is exhausted.
    """
    prog = Path(__file__).stem
    parser = argparse.ArgumentParser(prog=prog, description="transcribes audio file")
    parser.add_argument(
        "--audio_file",
        type=argparse.FileType("rb"),
        help="audio file to transcribe",
        default="./test_utter2.wav",
    )
    args = parser.parse_args()
    # argparse already opened the file; close it deterministically when done.
    with args.audio_file:
        # asyncio.run creates a fresh loop and closes it afterwards, replacing
        # the deprecated get_event_loop()/run_until_complete pairing that
        # never closed the loop.
        asyncio.run(VADUtterance.stream_utterance_file(args.audio_file))
 if __name__ == "__main__":
    main()
--- a/setup.py
+++ b/setup.py
@@ -58,6 +58,9 @@ extra_requirements = {
        "stringcase~=1.2.0",
        "google-cloud-speech~=1.3.1",
    ],
    "ui": [
        "rangehttpserver~=1.2.0",
    ],
    "train": ["torchaudio~=0.6.0", "torch-stft~=0.1.4"],
 }