1. Self-contained typers
2. ASR force-aligner visualization
3. Streamlit state-management abstraction
4. New utils / reorganization
5. Added verbose flags
6. Added TTS voice lookup by name
tegra
Malar 2021-03-23 13:27:35 +05:30
parent f72c6bbe5b
commit c474aa5f5a
22 changed files with 1097 additions and 146 deletions
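Change 1 makes each sub-app self-contained: every sub-typer now declares its own `@app.callback()`, so typer infers the group name from the callback function and the root CLI can call `add_typer` without `name=` (see the first diffs below). A minimal sketch of the pattern, not part of this commit; the `stats` command is hypothetical:

```python
import typer

data_app = typer.Typer()


@data_app.callback()
def data():
    """data sub commands"""


@data_app.command()
def stats():
    # hypothetical command, only to make the group non-empty
    typer.echo("ok")


app = typer.Typer()
# no name= needed: typer derives "data" from the callback above,
# so the group is invoked as `<cli> data stats`
app.add_typer(data_app)

if __name__ == "__main__":
    app()
```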

MANIFEST.in (new file, +1)

@@ -0,0 +1 @@
graft plume/utils/gentle_preview

View File

@@ -7,12 +7,12 @@ from .eval import app as eval_app
 from .serve import app as serve_app
 app = typer.Typer()
-app.add_typer(data_app, name="data")
-app.add_typer(ui_app, name="ui")
-app.add_typer(train_app, name="train")
-app.add_typer(eval_app, name="eval")
-app.add_typer(serve_app, name="serve")
-app.add_typer(utils_app, name='utils')
+app.add_typer(data_app)
+app.add_typer(ui_app)
+app.add_typer(train_app)
+app.add_typer(eval_app)
+app.add_typer(serve_app)
+app.add_typer(utils_app)
 def main():

View File

@@ -27,6 +27,13 @@ app.add_typer(generate_app, name="generate")
 app.add_typer(wav2vec2_app, name="wav2vec2")
+@app.callback()
+def data():
+    """
+    data sub commands
+    """
 @app.command()
 def fix_path(dataset_path: Path, force: bool = False):
     manifest_path = dataset_path / Path("manifest.json")

View File

@@ -3,3 +3,10 @@ from ..models.wav2vec2.eval import app as wav2vec2_app
 app = typer.Typer()
 app.add_typer(wav2vec2_app, name="wav2vec2")
+@app.callback()
+def eval():
+    """
+    eval sub commands
+    """

View File

@@ -5,3 +5,10 @@ from ..models.jasper.serve import app as jasper_app
 app = typer.Typer()
 app.add_typer(wav2vec2_app, name="wav2vec2")
 app.add_typer(jasper_app, name="jasper")
+@app.callback()
+def serve():
+    """
+    serve sub commands
+    """

View File

@@ -1,5 +1,12 @@
 import typer
-from ..models.wav2vec2.train import app as train_app
+from ..models.wav2vec2.train import app as wav2vec2_app
 app = typer.Typer()
-app.add_typer(train_app, name="wav2vec2")
+app.add_typer(wav2vec2_app, name="wav2vec2")
+@app.callback()
+def train():
+    """
+    train sub commands
+    """

View File

@@ -3,12 +3,20 @@ import sys
 from pathlib import Path
 from plume.utils import lazy_module
 # from streamlit import cli as stcli
-stcli = lazy_module('streamlit.cli')
+stcli = lazy_module("streamlit.cli")
 app = typer.Typer()
+@app.callback()
+def ui():
+    """
+    ui sub commands
+    """
 @app.command()
 def annotation(data_dir: Path, dump_fname: Path = "ui_dump.json", task_id: str = ""):
     annotation_lit_path = Path(__file__).parent / Path("annotation.py")
@@ -40,13 +48,7 @@ def annotation(data_dir: Path, dump_fname: Path = "ui_dump.json", task_id: str =
 @app.command()
 def preview(manifest_path: Path):
     annotation_lit_path = Path(__file__).parent / Path("preview.py")
-    sys.argv = [
-        "streamlit",
-        "run",
-        str(annotation_lit_path),
-        "--",
-        str(manifest_path)
-    ]
+    sys.argv = ["streamlit", "run", str(annotation_lit_path), "--", str(manifest_path)]
     sys.exit(stcli.main())
@@ -56,6 +58,18 @@ def collection(data_dir: Path, task_id: str = ""):
     pass
+@app.command()
+def alignment(preview_dir: Path, port: int = 8010):
+    from RangeHTTPServer import RangeRequestHandler
+    from functools import partial
+    from http.server import HTTPServer
+    server_address = ("", port)
+    handler_class = partial(RangeRequestHandler, directory=str(preview_dir))
+    httpd = HTTPServer(server_address, handler_class)
+    httpd.serve_forever()
 def main():
     app()

View File

@ -1,66 +1,14 @@
# import sys # import sys
from pathlib import Path from pathlib import Path
from uuid import uuid4
import streamlit as st import streamlit as st
import typer import typer
from plume.utils import ExtendedPath
from plume.utils import ExtendedPath, get_mongo_conn from plume.utils.ui_persist import setup_mongo_asr_validation_state
from plume.preview.st_rerun import rerun
app = typer.Typer() app = typer.Typer()
setup_mongo_asr_validation_state(st)
if not hasattr(st, "mongo_connected"):
st.mongoclient = get_mongo_conn(col="asr_validation")
mongo_conn = st.mongoclient
st.task_id = str(uuid4())
def current_cursor_fn():
# mongo_conn = st.mongoclient
cursor_obj = mongo_conn.find_one(
{"type": "current_cursor", "task_id": st.task_id}
)
cursor_val = cursor_obj["cursor"]
return cursor_val
def update_cursor_fn(val=0):
mongo_conn.find_one_and_update(
{"type": "current_cursor", "task_id": st.task_id},
{"$set": {"type": "current_cursor", "task_id": st.task_id, "cursor": val}},
upsert=True,
)
rerun()
def get_correction_entry_fn(code):
return mongo_conn.find_one(
{"type": "correction", "code": code}, projection={"_id": False}
)
def update_entry_fn(code, value):
mongo_conn.find_one_and_update(
{"type": "correction", "code": code},
{"$set": {"value": value, "task_id": st.task_id}},
upsert=True,
)
def set_task_fn(data_path, task_id):
if task_id:
st.task_id = task_id
task_path = data_path / Path(f"task-{st.task_id}.lck")
if not task_path.exists():
print(f"creating task lock at {task_path}")
task_path.touch()
st.get_current_cursor = current_cursor_fn
st.update_cursor = update_cursor_fn
st.get_correction_entry = get_correction_entry_fn
st.update_entry = update_entry_fn
st.set_task = set_task_fn
st.mongo_connected = True
cursor_obj = mongo_conn.find_one({"type": "current_cursor", "task_id": st.task_id})
if not cursor_obj:
update_cursor_fn(0)
@st.cache() @st.cache()

View File

@@ -3,27 +3,11 @@ from pathlib import Path
 import streamlit as st
 import typer
 from plume.utils import ExtendedPath
-from plume.preview.st_rerun import rerun
+from plume.utils.ui_persist import setup_file_state
 app = typer.Typer()
-if not hasattr(st, "state_lock"):
-    # st.task_id = str(uuid4())
-    task_path = ExtendedPath("preview.lck")
-    def current_cursor_fn():
-        return task_path.read_json()["current_cursor"]
-    def update_cursor_fn(val=0):
-        task_path.write_json({"current_cursor": val})
-        rerun()
-    st.get_current_cursor = current_cursor_fn
-    st.update_cursor = update_cursor_fn
-    st.state_lock = True
-    # cursor_obj = mongo_conn.find_one({"type": "current_cursor", "task_id": st.task_id})
-    # if not cursor_obj:
-    update_cursor_fn(0)
+setup_file_state(st)
 @st.cache()
@@ -40,7 +24,7 @@ def main(manifest: Path):
         print("Invalid samplno resetting to 0")
         st.update_cursor(0)
     sample = asr_data[sample_no]
-    st.title(f"ASR Manifest Preview")
+    st.title("ASR Manifest Preview")
    st.markdown(f"{sample_no+1} of {len(asr_data)} : **{sample['text']}**")
     new_sample = st.number_input(
         "Go To Sample:", value=sample_no + 1, min_value=1, max_value=len(asr_data)

plume/utils/.gitignore (vendored, new file, +151)

@@ -0,0 +1,151 @@
/data/
/model/
/train/
.env*
*.yaml
*.yml
*.json
# Created by https://www.gitignore.io/api/python
# Edit at https://www.gitignore.io/?templates=python
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# Mr Developer
.mr.developer.cfg
.project
.pydevproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# End of https://www.gitignore.io/api/python
# Created by https://www.gitignore.io/api/macos
# Edit at https://www.gitignore.io/?templates=macos
### macOS ###
# General
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
# End of https://www.gitignore.io/api/macos

View File

@@ -11,12 +11,14 @@ from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
 import subprocess
 import shutil
 from urllib.parse import urlsplit
 # from .lazy_loader import LazyLoader
 from .lazy_import import lazy_callable, lazy_module
 # from ruamel.yaml import YAML
 # import boto3
 import typer
 # import pymongo
 # from slugify import slugify
 # import pydub
@@ -34,16 +36,16 @@ from .tts import app as tts_app
 from .transcribe import app as transcribe_app
 from .align import app as align_app
-boto3 = lazy_module('boto3')
-pymongo = lazy_module('pymongo')
-pydub = lazy_module('pydub')
-audio_display = lazy_module('librosa.display')
-plt = lazy_module('matplotlib.pyplot')
-librosa = lazy_module('librosa')
-YAML = lazy_callable('ruamel.yaml.YAML')
-num2words = lazy_callable('num2words.num2words')
-slugify = lazy_callable('slugify.slugify')
-compress = lazy_callable('natural.date.compress')
+boto3 = lazy_module("boto3")
+pymongo = lazy_module("pymongo")
+pydub = lazy_module("pydub")
+audio_display = lazy_module("librosa.display")
+plt = lazy_module("matplotlib.pyplot")
+librosa = lazy_module("librosa")
+YAML = lazy_callable("ruamel.yaml.YAML")
+num2words = lazy_callable("num2words.num2words")
+slugify = lazy_callable("slugify.slugify")
+compress = lazy_callable("natural.date.compress")
 app = typer.Typer()
 app.add_typer(tts_app, name="tts")
@@ -51,6 +53,13 @@ app.add_typer(align_app, name="align")
 app.add_typer(transcribe_app, name="transcribe")
+@app.callback()
+def utils():
+    """
+    utils sub commands
+    """
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 )
@@ -125,6 +134,10 @@ def upload_s3(dataset_path, s3_path):
     run_shell(f"aws s3 sync {dataset_path} {s3_path}")
+def copy_s3(dataset_path, s3_path):
+    run_shell(f"aws s3 cp {dataset_path} {s3_path}")
 def get_download_path(s3_uri, output_path):
     s3_uri_p = urlsplit(s3_uri)
     download_path = output_path / Path(s3_uri_p.path[1:])
@@ -135,10 +148,11 @@ def get_download_path(s3_uri, output_path):
 def s3_downloader():
     s3 = boto3.client("s3")
-    def download_s3(s3_uri, download_path):
+    def download_s3(s3_uri, download_path, verbose=False):
         s3_uri_p = urlsplit(s3_uri)
         download_path.parent.mkdir(exist_ok=True, parents=True)
         if not download_path.exists():
-            print(f"downloading {s3_uri} to {download_path}")
+            if verbose:
+                print(f"downloading {s3_uri} to {download_path}")
             s3.download_file(s3_uri_p.netloc, s3_uri_p.path[1:], str(download_path))
@@ -186,6 +200,7 @@ def ui_data_generator(dataset_dir, asr_data_source, verbose=False):
         plot_seg(wav_plot_path.absolute(), audio_file)
         return {
             "audio_path": str(rel_data_path),
+            "audio_filepath": str(rel_data_path),
             "duration": round(audio_dur, 1),
             "text": transcript,
             "real_idx": num_datapoints,
@@ -229,17 +244,17 @@ def ui_dump_manifest_writer(dataset_dir, asr_data_source, verbose=False):
     )
     asr_manifest = dataset_dir / Path("manifest.json")
-    with asr_manifest.open("w") as mf:
-        print(f"writing manifest to {asr_manifest}")
-        for d in dump_data:
-            rel_data_path = d["audio_path"]
-            audio_dur = d["duration"]
-            transcript = d["text"]
-            manifest = manifest_str(str(rel_data_path), audio_dur, transcript)
-            mf.write(manifest)
+    asr_manifest_writer(asr_manifest, dump_data, verbose=verbose)
+    # with asr_manifest.open("w") as mf:
+    #     print(f"writing manifest to {asr_manifest}")
+    #     for d in dump_data:
+    #         rel_data_path = d["audio_path"]
+    #         audio_dur = d["duration"]
+    #         transcript = d["text"]
+    #         manifest = manifest_str(str(rel_data_path), audio_dur, transcript)
+    #         mf.write(manifest)
     ui_dump_file = dataset_dir / Path("ui_dump.json")
-    ExtendedPath(ui_dump_file).write_json({"data": dump_data})
+    ExtendedPath(ui_dump_file).write_json({"data": dump_data}, verbose=verbose)
     return num_datapoints
@@ -254,9 +269,10 @@ def asr_manifest_reader(data_manifest_path: Path):
         yield p
-def asr_manifest_writer(asr_manifest_path: Path, manifest_str_source):
+def asr_manifest_writer(asr_manifest_path: Path, manifest_str_source, verbose=False):
     with asr_manifest_path.open("w") as mf:
-        print(f"opening {asr_manifest_path} for writing manifest")
+        if verbose:
+            print(f"writing asr manifest to {asr_manifest_path}")
         for mani_dict in manifest_str_source:
             manifest = manifest_str(
                 mani_dict["audio_filepath"], mani_dict["duration"], mani_dict["text"]
@@ -293,36 +309,42 @@ def batch(iterable, n=1):
 class ExtendedPath(type(Path())):
     """docstring for ExtendedPath."""
-    def read_json(self):
-        print(f"reading json from {self}")
+    def read_json(self, verbose=False):
+        if verbose:
+            print(f"reading json from {self}")
         with self.open("r") as jf:
             return json.load(jf)
-    def read_yaml(self):
+    def read_yaml(self, verbose=False):
         yaml = YAML(typ="safe", pure=True)
-        print(f"reading yaml from {self}")
+        if verbose:
+            print(f"reading yaml from {self}")
         with self.open("r") as yf:
             return yaml.load(yf)
-    def read_jsonl(self):
-        print(f"reading jsonl from {self}")
+    def read_jsonl(self, verbose=False):
+        if verbose:
+            print(f"reading jsonl from {self}")
         with self.open("r") as jf:
-            for l in jf.readlines():
-                yield json.loads(l)
+            for ln in jf.readlines():
+                yield json.loads(ln)
-    def write_json(self, data):
-        print(f"writing json to {self}")
+    def write_json(self, data, verbose=False):
+        if verbose:
+            print(f"writing json to {self}")
         self.parent.mkdir(parents=True, exist_ok=True)
         with self.open("w") as jf:
             json.dump(data, jf, indent=2)
-    def write_yaml(self, data):
+    def write_yaml(self, data, verbose=False):
         yaml = YAML()
-        print(f"writing yaml to {self}")
+        if verbose:
+            print(f"writing yaml to {self}")
         with self.open("w") as yf:
             yaml.dump(data, yf)
-    def write_jsonl(self, data):
-        print(f"writing jsonl to {self}")
+    def write_jsonl(self, data, verbose=False):
+        if verbose:
+            print(f"writing jsonl to {self}")
         self.parent.mkdir(parents=True, exist_ok=True)
         with self.open("w") as jf:
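The new verbose flags on ExtendedPath default to False, so existing call sites fall silent unless they opt in. A small usage sketch (file path illustrative):

```python
from plume.utils import ExtendedPath

p = ExtendedPath("out/sample.json")
p.write_json({"text": "hello"}, verbose=True)  # prints "writing json to out/sample.json"
data = p.read_json()  # silent by default
```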

View File

@@ -1,12 +1,14 @@
 from pathlib import Path
-from .tts import GoogleTTS
 # from IPython import display
 import requests
 import io
-import typer
+import shutil
+import typer
 from plume.utils import lazy_module
+from .tts import GoogleTTS
 display = lazy_module('IPython.display')
 pydub = lazy_module('pydub')
@@ -63,16 +65,19 @@ def gentle_preview(
     audio_path: Path,
     transcript_path: Path,
     service_uri="http://101.53.142.218:8765/transcriptions",
-    gent_preview_dir="../gentle_preview",
+    gent_preview_dir="./gentle_preview",
 ):
     from . import ExtendedPath
-    ab = audio_path.read_bytes()
-    tt = transcript_path.read_text()
-    audio, alignment = gentle_aligner(service_uri, ab, tt)
-    audio.export(gent_preview_dir / Path("a.wav"), format="wav")
-    alignment["status"] = "OK"
-    ExtendedPath(gent_preview_dir / Path("status.json")).write_json(alignment)
+    pkg_gentle_dir = Path(__file__).parent / 'gentle_preview'
+    shutil.copytree(str(pkg_gentle_dir), str(gent_preview_dir))
+    # ab = audio_path.read_bytes()
+    # tt = transcript_path.read_text()
+    # audio, alignment = gentle_aligner(service_uri, ab, tt)
+    # audio.export(gent_preview_dir / Path("a.wav"), format="wav")
+    # alignment["status"] = "OK"
+    # ExtendedPath(gent_preview_dir / Path("status.json")).write_json(alignment)
 def main():

View File

@@ -0,0 +1,5 @@
Serve with https://github.com/danvk/RangeHTTPServer
`https://github.com/claysciences/CORSRangeHTTPServer`
`python -m RangeHTTPServer`
`python -m http.server`
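The preview page seeks within a.wav via HTTP Range requests; RangeHTTPServer honors them, while the stock `python -m http.server` does not. A quick sanity check against a locally running server (host and port are assumptions; `requests` is already used elsewhere in this repo):

```python
import requests

# expect 206 Partial Content from RangeHTTPServer; a plain http.server returns 200
resp = requests.get(
    "http://localhost:8000/a.wav", headers={"Range": "bytes=0-1023"}
)
print(resp.status_code, resp.headers.get("Content-Range"))
```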

View File

@@ -0,0 +1,80 @@
<html>
<head>
<meta charset="utf-8" />
<style>
body {font-family: sans-serif; padding-top: 70px; }
textarea { width: 500px; height: 20em; }
input, textarea { margin: 1em 0; }
#header {
position: fixed;
top: 0;
left: 0;
height: 50px;
line-height: 50px;
width: 100%;
background-color: #999;
box-shadow: 0px 0px 5px 0px rgba(0,0,0,0.5);
font-family: Helvetica, sans-serif;
}
#header, #header a {
color: white;
}
.home {
margin: 0;
font-size: 125%;
font-weight: lighter;
text-transform: lowercase;
}
.home a {
margin: 0;
background: #666;
padding-left: 25px;
padding-right: 30px;
margin-right: 20px;
float: left;
text-decoration: none;
}
.home:hover a {
background: #555;
}
#align-button {
background: #CCC;
border: 0;
font-size: 18px;
padding: 10px 30px;
cursor: pointer;
}
#alignment-flags {
background: #CCC;
border: 0;
font-size: 18px;
padding: 10px 30px;
}
#footer {
margin-top: 100px;
border-top: 1px dotted black;
font-size: 8pt;
font-style: italic;
padding: 10px;
}
</style>
</head>
<body>
<div id="header">
<h1 class="home"><a href="/">Gentle</a></h1>
</div>
<form action="/transcriptions" method="POST" enctype="multipart/form-data">
Audio:<br>
<input type=file name=audio><br>
<br>
Transcript:<br>
<textarea name="transcript"></textarea><br>
<input id=alignment-flags name=conservative type=checkbox> Conservative<br>
<input id=alignment-flags name=disfluency type=checkbox> Include disfluencies<br>
<input id="align-button" type=submit value=Align>
</form>
<div id="footer">
<a href="https://lowerquality.com/gentle">Gentle</a> is free software released under the <a href="https://opensource.org/licenses/MIT">MIT license</a>. <a href="https://lowerquality.com/gentle">Homepage</a> | <a href="https://github.com/lowerquality/gentle">Source code</a>.
</div>
</body>
</html>

View File

@@ -0,0 +1,408 @@
<html>
<head>
<meta charset="utf-8" />
<style>
html, body {
margin: 0;
padding: 0;
}
#header {
position: fixed;
top: 0;
left: 0;
height: 50px;
line-height: 50px;
width: 100%;
background-color: #999;
box-shadow: 0px 0px 5px 0px rgba(0,0,0,0.5);
font-family: Helvetica, sans-serif;
}
#header, #header a {
color: white;
}
#downloads {
float: right;
background: #999;
}
.download {
float: right;
background: #999;
padding: 0 5px;
}
.home {
margin: 0;
font-size: 125%;
font-weight: lighter;
text-transform: lowercase;
}
.home a {
margin: 0;
background: #666;
padding-left: 25px;
padding-right: 30px;
margin-right: 20px;
float: left;
text-decoration: none;
}
.home:hover a {
background: #555;
}
#audio {
margin-top: 9px;
width: 50%;
display: inline-block;
}
#transcript {
margin: 0 15px;
margin-top: 70px;
margin-bottom: 5em;
white-space: pre-wrap;
line-height: 2em;
max-width: 600px;
color: #999;
}
#transcript.status {
background-color: #333;
color: #fff;
font-family: Courier, mono;
line-height: 1em;
font-size: 10pt;
max-width: 100%;
}
#transcript.status h2 {
padding: 10px;
}
#transcript.status .entry {
margin-bottom: 10px;
padding: 10px;
}
#transcript.status progress {
width: 100%;
height: 30px;
margin-bottom: 20px;
}
.success {
color: black;
}
.success:hover {
text-decoration: underline;
}
.active {
color: magenta;
}
#preloader {
visibility: hidden;
}
.phactive {
text-decoration: underline;
}
.phones {
position: absolute;
color: #333;
}
.phones .phone {
margin-right: 5px;
font-family: Helvetica, sans-serif;
text-transform: uppercase;
font-size: 50%;
}
.phones .phone:last-child {
margin-right: 0;
}
#footer {
margin-top: 100px;
border-top: 1px dotted black;
font-size: 8pt;
font-style: italic;
font-family: Helvetica, sans-serif;
padding: 10px;
}
</style>
</head>
<body>
<div id="header">
<!-- <h1 class="home"><a href="/">Gentle</a></h1> -->
<audio id="audio" src="a.wav" controls="true" preload="auto"></audio>
<img src="/preloader.gif" id="preloader" alt="loading...">
<span id="downloads"> </span>
</div>
<div id="transcript"></div>
<!-- <div id="footer">
<a href="https://lowerquality.com/gentle">Gentle</a> is free software released under the <a href="https://opensource.org/licenses/MIT">MIT license</a>. <a href="https://lowerquality.com/gentle">Homepage</a> | <a href="https://github.com/lowerquality/gentle">Source code</a>.
</div> -->
<script>
function get(url, cb) {
var xhr = new XMLHttpRequest();
xhr.open("GET", url, true);
xhr.onload = function() {
cb(this.responseText);
}
xhr.send();
}
function get_json(url, cb) {
get(url, function(x) {
cb(JSON.parse(x));
});
}
var $a = document.getElementById("audio");
window.onkeydown = function(ev) {
if(ev.keyCode == 32) {
ev.preventDefault();
$a.pause();
}
}
var $trans = document.getElementById("transcript");
var $preloader = document.getElementById('preloader');
var wds = [];
var cur_wd;
var $phones = document.createElement("div");
$phones.className = "phones";
document.body.appendChild($phones);
var cur_phones$ = []; // List of phoneme $divs
var $active_phone;
function render_phones(wd) {
cur_phones$ = [];
$phones.innerHTML = "";
$active_phone = null;
$phones.style.top = wd.$div.offsetTop + 18;
$phones.style.left = wd.$div.offsetLeft;
var dur = wd.end - wd.start;
var start_x = wd.$div.offsetLeft;
wd.phones
.forEach(function(ph){
var $p = document.createElement("span");
$p.className = "phone";
$p.textContent = ph.phone.split("_")[0];
$phones.appendChild($p);
cur_phones$.push($p);
});
var offsetToCenter = (wd.$div.offsetWidth - $phones.offsetWidth) / 2;
$phones.style.left = wd.$div.offsetLeft + offsetToCenter;
}
function highlight_phone(t) {
if(!cur_wd) {
$phones.innerHTML = "";
return;
}
var hit;
var cur_t = cur_wd.start;
cur_wd.phones.forEach(function(ph, idx) {
if(cur_t <= t && cur_t + ph.duration >= t) {
hit = idx;
}
cur_t += ph.duration;
});
if(hit) {
var $ph = cur_phones$[hit];
if($ph != $active_phone) {
if($active_phone) {
$active_phone.classList.remove("phactive");
}
if($ph) {
$ph.classList.add("phactive");
}
}
$active_phone = $ph;
}
}
function highlight_word() {
var t = $a.currentTime;
// XXX: O(N); use binary search
var hits = wds.filter(function(x) {
return (t - x.start) > 0.01 && (x.end - t) > 0.01;
}, wds);
var next_wd = hits[hits.length - 1];
if(cur_wd != next_wd) {
var active = document.querySelectorAll('.active');
for(var i = 0; i < active.length; i++) {
active[i].classList.remove('active');
}
if(next_wd && next_wd.$div) {
next_wd.$div.classList.add('active');
render_phones(next_wd);
}
}
cur_wd = next_wd;
highlight_phone(t);
window.requestAnimationFrame(highlight_word);
}
window.requestAnimationFrame(highlight_word);
$trans.innerHTML = "Loading...";
function render(ret) {
wds = ret['words'] || [];
transcript = ret['transcript'];
$trans.innerHTML = '';
var currentOffset = 0;
wds.forEach(function(wd) {
if(wd.case == 'not-found-in-transcript') {
// TODO: show phonemes somewhere
var txt = ' ' + wd.word;
var $plaintext = document.createTextNode(txt);
$trans.appendChild($plaintext);
return;
}
// Add non-linked text
if(wd.startOffset > currentOffset) {
var txt = transcript.slice(currentOffset, wd.startOffset);
var $plaintext = document.createTextNode(txt);
$trans.appendChild($plaintext);
currentOffset = wd.startOffset;
}
var $wd = document.createElement('span');
var txt = transcript.slice(wd.startOffset, wd.endOffset);
var $wdText = document.createTextNode(txt);
$wd.appendChild($wdText);
wd.$div = $wd;
if(wd.start !== undefined) {
$wd.className = 'success';
}
$wd.onclick = function() {
if(wd.start !== undefined) {
console.log(wd.start);
$a.currentTime = wd.start;
$a.play();
}
};
$trans.appendChild($wd);
currentOffset = wd.endOffset;
});
var txt = transcript.slice(currentOffset, transcript.length);
var $plaintext = document.createTextNode(txt);
$trans.appendChild($plaintext);
currentOffset = transcript.length;
}
function show_downloads() {
var $d = document.getElementById("downloads");
$d.textContent = "Download as: ";
var uid = window.location.pathname.split("/")[2];
// Name, path, title, inhibit-on-file:///
[["CSV", "align.csv", "Word alignment CSV"],
["JSON", "align.json", "JSON word/phoneme alignment data"],
["Zip", "/zip/" + uid + ".zip", "Standalone zipfile", true]]
.forEach(function(x) {
var $a = document.createElement("a");
$a.className = "download";
$a.textContent = x[0];
$a.href = x[1];
$a.title = x[2];
if(!x[3] || window.location.protocol != "file:") {
$d.appendChild($a);
}
});
}
var status_init = false;
var status_log = []; // [ status ]
var $status_pro;
function render_status(ret) {
if(!status_init) {
// Clobber the $trans div and use it for status updates
$trans.innerHTML = "<h2>transcription in progress</h2>";
$trans.className = "status";
$status_pro = document.createElement("progress");
$status_pro.setAttribute("min", "0");
$status_pro.setAttribute("max", "100");
$status_pro.value = 0;
$trans.appendChild($status_pro);
status_init = true;
}
if(ret.status !== "TRANSCRIBING") {
if(ret.percent) {
$status_pro.value = (100*ret.percent);
}
}
else if(ret.percent && (status_log.length == 0 || status_log[status_log.length-1].percent+0.0001 < ret.percent)) {
// New entry
var $entry = document.createElement("div");
$entry.className = "entry";
$entry.textContent = ret.message;
ret.$div = $entry;
if(ret.percent) {
$status_pro.value = (100*ret.percent);
}
if(status_log.length > 0) {
$trans.insertBefore($entry, status_log[status_log.length-1].$div);
}
else {
$trans.appendChild($entry);
}
status_log.push(ret);
}
}
function update() {
if(INLINE_JSON) {
// We want this to work from file:/// domains, so we provide a
// mechanism for inlining the alignment data.
render(INLINE_JSON);
// show_downloads();
}
else {
// Show the status
get_json('status.json', function(ret) {
$a.style.visibility = 'hidden';
if (ret.status == 'ERROR') {
$preloader.style.visibility = 'hidden';
$trans.innerHTML = '<b>' + ret.status + ': ' + ret.error + '</b>';
} else if (ret.status == 'TRANSCRIBING' || ret.status == 'ALIGNING') {
$preloader.style.visibility = 'visible';
render_status(ret);
setTimeout(update, 2000);
} else if (ret.status == 'OK') {
// show_downloads();
$preloader.style.visibility = 'hidden';
// XXX: should we fetch the align.json?
// window.location.reload();
$a.style.visibility = 'visible';
render(ret);
} else if (ret.status == 'ENCODING' || ret.status == 'STARTED') {
$preloader.style.visibility = 'visible';
$trans.innerHTML = 'Encoding, please wait...';
setTimeout(update, 2000);
} else {
console.log("unknown status", ret);
$preloader.style.visibility = 'hidden';
$trans.innerHTML = ret.status + '...';
setTimeout(update, 5000);
}
});
}
}
var INLINE_JSON;
update();
</script></body></html>

Binary file not shown (new image, 2.7 KiB).

View File

@@ -8,12 +8,11 @@ import typer
 # import rpyc
 # from tqdm import tqdm
-# from pydub import AudioSegment
 # from pydub.silence import split_on_silence
 from plume.utils import lazy_module, lazy_callable
 rpyc = lazy_module('rpyc')
-AudioSegment = lazy_callable('pydub.AudioSegment')
+pydub = lazy_module('pydub')
 split_on_silence = lazy_callable('pydub.silence.split_on_silence')
 app = typer.Typer()
@@ -106,7 +105,7 @@ def triton_transcribe_grpc_gen(
     # ]
     # pass
     transcript_list = []
-    sil_pad = AudioSegment.silent(duration=sil_msec)
+    sil_pad = pydub.AudioSegment.silent(duration=sil_msec)
     for seg in chunks:
         t_seg = sil_pad + seg + sil_pad
         c_transcript = transcriber(t_seg)
@@ -124,9 +123,7 @@
 @app.command()
 def file(audio_file: Path, write_file: bool = False, chunked=True):
-    from pydub import AudioSegment
-    aseg = AudioSegment.from_file(audio_file)
+    aseg = pydub.AudioSegment.from_file(audio_file)
     transcriber, prep = triton_transcribe_grpc_gen()
     transcription = transcriber(prep(aseg))
@@ -139,10 +136,8 @@
 @app.command()
 def benchmark(audio_file: Path):
-    from pydub import AudioSegment
     transcriber, audio_prep = transcribe_rpyc_gen()
-    file_seg = AudioSegment.from_file(audio_file)
+    file_seg = pydub.AudioSegment.from_file(audio_file)
     aud_seg = audio_prep(file_seg)
     def timeinfo():

View File

@@ -27,6 +27,10 @@ class GoogleTTS(object):
             audio_encoding=texttospeech.enums.AudioEncoding.LINEAR16,
             sample_rate_hertz=params["sample_rate"],
         )
+        if 'speaking_rate' in params:
+            audio_config.speaking_rate = params['speaking_rate']
+        if 'pitch' in params:
+            audio_config.pitch = params['pitch']
         response = self.client.synthesize_speech(tts_input, voice, audio_config)
         audio_content = response.audio_content
         return audio_content
@@ -74,6 +78,19 @@ class GoogleTTS(object):
         )
         return results
+    @classmethod
+    def voice_by_name(cls, name):
+        """Lists the available voices."""
+        # client = cls().client
+        # Performs the list voices request
+        results = cls.voice_list()
+        for voice in results:
+            if voice['name'] == name:
+                return voice
+        raise ValueError(f'{name} not a valid voice')
 @app.command()
 def generate_audio_file(text, dest_path: Path = "./tts_audio.wav", voice="en-US-Wavenet-D"):
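A sketch of how the two tts.py additions combine: `voice_by_name` validates a voice name against the live voice list (so it needs Google Cloud credentials at runtime), and the optional `speaking_rate`/`pitch` keys are applied to the audio config only when present. The params dict shape follows this diff; the synthesis call itself lives elsewhere in the class:

```python
from plume.utils.tts import GoogleTTS

voice = GoogleTTS.voice_by_name("en-US-Wavenet-D")  # raises ValueError for unknown names
params = {
    "sample_rate": 16000,
    "speaking_rate": 0.9,  # optional, applied only if present
    "pitch": -2.0,         # optional, applied only if present
}
```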

plume/utils/ui_persist.py (new file, +85)

@@ -0,0 +1,85 @@
from plume.utils import ExtendedPath, get_mongo_conn
from plume.utils.st_rerun import rerun
from uuid import uuid4
from pathlib import Path


def setup_file_state(st):
    if not hasattr(st, "state_lock"):
        # st.task_id = str(uuid4())
        task_path = ExtendedPath("preview.lck")

        def current_cursor_fn():
            return task_path.read_json()["current_cursor"]

        def update_cursor_fn(val=0):
            task_path.write_json({"current_cursor": val})
            rerun()

        st.get_current_cursor = current_cursor_fn
        st.update_cursor = update_cursor_fn
        st.state_lock = True
        # cursor_obj = mongo_conn.find_one({"type": "current_cursor", "task_id": st.task_id})
        # if not cursor_obj:
        update_cursor_fn(0)


def setup_mongo_asr_validation_state(st):
    if not hasattr(st, "mongo_connected"):
        st.mongoclient = get_mongo_conn(col="asr_validation")
        mongo_conn = st.mongoclient
        st.task_id = str(uuid4())

        def current_cursor_fn():
            # mongo_conn = st.mongoclient
            cursor_obj = mongo_conn.find_one(
                {"type": "current_cursor", "task_id": st.task_id}
            )
            cursor_val = cursor_obj["cursor"]
            return cursor_val

        def update_cursor_fn(val=0):
            mongo_conn.find_one_and_update(
                {"type": "current_cursor", "task_id": st.task_id},
                {
                    "$set": {
                        "type": "current_cursor",
                        "task_id": st.task_id,
                        "cursor": val,
                    }
                },
                upsert=True,
            )
            rerun()

        def get_correction_entry_fn(code):
            return mongo_conn.find_one(
                {"type": "correction", "code": code}, projection={"_id": False}
            )

        def update_entry_fn(code, value):
            mongo_conn.find_one_and_update(
                {"type": "correction", "code": code},
                {"$set": {"value": value, "task_id": st.task_id}},
                upsert=True,
            )

        def set_task_fn(data_path, task_id):
            if task_id:
                st.task_id = task_id
            task_path = data_path / Path(f"task-{st.task_id}.lck")
            if not task_path.exists():
                print(f"creating task lock at {task_path}")
                task_path.touch()

        st.get_current_cursor = current_cursor_fn
        st.update_cursor = update_cursor_fn
        st.get_correction_entry = get_correction_entry_fn
        st.update_entry = update_entry_fn
        st.set_task = set_task_fn
        st.mongo_connected = True
        cursor_obj = mongo_conn.find_one(
            {"type": "current_cursor", "task_id": st.task_id}
        )
        if not cursor_obj:
            update_cursor_fn(0)
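For reference, the annotation.py and preview.py diffs above consume these helpers like this: the `setup_*` function is called once at module import of the streamlit script and attaches the cursor API onto the `st` module. A sketch:

```python
import streamlit as st
from plume.utils.ui_persist import setup_file_state

setup_file_state(st)  # attaches get_current_cursor/update_cursor, backed by preview.lck

cursor = st.get_current_cursor()
st.update_cursor(cursor + 1)  # persists the new position and triggers a rerun
```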

plume/utils/vad.py (new file, +205)

@@ -0,0 +1,205 @@
import logging
import asyncio
import argparse
from pathlib import Path

import webrtcvad
import pydub
from pydub.playback import play
from pydub.utils import make_chunks

DEFAULT_CHUNK_DUR = 20

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


def is_frame_voice(vad, seg, chunk_dur):
    return (
        True
        if (
            seg.duration_seconds == chunk_dur / 1000
            and vad.is_speech(seg.raw_data, seg.frame_rate)
        )
        else False
    )


class VADFilterAudio(object):
    """docstring for VADFilterAudio."""

    def __init__(self, chunk_dur=DEFAULT_CHUNK_DUR):
        super(VADFilterAudio, self).__init__()
        self.chunk_dur = chunk_dur
        self.vad = webrtcvad.Vad()

    def filter_segment(self, wav_seg):
        chunks = make_chunks(wav_seg, self.chunk_dur)
        speech_buffer = b""
        for i, c in enumerate(chunks[:-1]):
            voice_frame = is_frame_voice(self.vad, c, self.chunk_dur)
            if voice_frame:
                speech_buffer += c.raw_data
        filtered_seg = pydub.AudioSegment(
            data=speech_buffer,
            frame_rate=wav_seg.frame_rate,
            channels=wav_seg.channels,
            sample_width=wav_seg.sample_width,
        )
        return filtered_seg


class VADUtterance(object):
    """docstring for VADUtterance."""

    def __init__(
        self,
        max_silence=500,
        min_utterance=280,
        max_utterance=20000,
        chunk_dur=DEFAULT_CHUNK_DUR,
        start_cycles=3,
    ):
        super(VADUtterance, self).__init__()
        self.vad = webrtcvad.Vad()
        self.chunk_dur = chunk_dur
        # duration in millisecs
        self.max_sil = max_silence
        self.min_utt = min_utterance
        self.max_utt = max_utterance
        self.speech_start = start_cycles * chunk_dur

    def __repr__(self):
        return f"VAD(max_silence={self.max_sil},min_utterance:{self.min_utt},max_utterance:{self.max_utt})"

    async def stream_utterance(self, audio_stream):
        silence_buffer = pydub.AudioSegment.empty()
        voice_buffer = pydub.AudioSegment.empty()
        silence_threshold = False
        async for c in audio_stream:
            voice_frame = is_frame_voice(self.vad, c, self.chunk_dur)
            logger.debug(f"is audio stream voice? {voice_frame}")
            if voice_frame:
                silence_threshold = False
                voice_buffer += c
                silence_buffer = pydub.AudioSegment.empty()
            else:
                silence_buffer += c
            voc_dur = voice_buffer.duration_seconds * 1000
            sil_dur = silence_buffer.duration_seconds * 1000
            if voc_dur >= self.max_utt:
                logger.info(
                    f"detected voice overflow: voice duration {voice_buffer.duration_seconds}"
                )
                yield voice_buffer
                voice_buffer = pydub.AudioSegment.empty()
            if sil_dur >= self.max_sil:
                if voc_dur >= self.min_utt:
                    logger.info(
                        f"detected silence: voice duration {voice_buffer.duration_seconds}"
                    )
                    yield voice_buffer
                    voice_buffer = pydub.AudioSegment.empty()
                # ignore/clear voice if silence reached threshold or indent the statement
                if not silence_threshold:
                    silence_threshold = True
        if voice_buffer:
            yield voice_buffer

    async def stream_events(self, audio_stream):
        """
        yields 0, voice_buffer for SpeechBuffer
        yields 1, None for StartedSpeaking
        yields 2, None for StoppedSpeaking
        yields 4, audio_stream
        """
        silence_buffer = pydub.AudioSegment.empty()
        voice_buffer = pydub.AudioSegment.empty()
        silence_threshold, started_speaking = False, False
        async for c in audio_stream:
            # yield (4, c)
            voice_frame = is_frame_voice(self.vad, c, self.chunk_dur)
            logger.debug(f"is audio stream voice? {voice_frame}")
            if voice_frame:
                silence_threshold = False
                voice_buffer += c
                silence_buffer = pydub.AudioSegment.empty()
            else:
                silence_buffer += c
            voc_dur = voice_buffer.duration_seconds * 1000
            sil_dur = silence_buffer.duration_seconds * 1000
            if voc_dur >= self.speech_start and not started_speaking:
                started_speaking = True
                yield (1, None)
            if voc_dur >= self.max_utt:
                logger.info(
                    f"detected voice overflow: voice duration {voice_buffer.duration_seconds}"
                )
                yield (0, voice_buffer)
                voice_buffer = pydub.AudioSegment.empty()
                started_speaking = False
            if sil_dur >= self.max_sil:
                if voc_dur >= self.min_utt:
                    logger.info(
                        f"detected silence: voice duration {voice_buffer.duration_seconds}"
                    )
                    yield (0, voice_buffer)
                    voice_buffer = pydub.AudioSegment.empty()
                    started_speaking = False
                # ignore/clear voice if silence reached threshold or indent the statement
                if not silence_threshold:
                    silence_threshold = True
                    yield (2, None)
        if voice_buffer:
            yield (0, voice_buffer)

    @classmethod
    async def stream_utterance_file(cls, audio_file):
        async def stream_gen():
            audio_seg = pydub.AudioSegment.from_file(audio_file).set_frame_rate(32000)
            chunks = make_chunks(audio_seg, DEFAULT_CHUNK_DUR)
            for c in chunks:
                yield c

        va_ut = cls()
        buffer_src = va_ut.stream_utterance(stream_gen())
        async for buf in buffer_src:
            play(buf)
            await asyncio.sleep(1)


class VADStreamGen(object):
    """docstring for VADStreamGen."""

    def __init__(self, arg):
        super(VADStreamGen, self).__init__()
        self.arg = arg


def main():
    prog = Path(__file__).stem
    parser = argparse.ArgumentParser(prog=prog, description="transcribes audio file")
    parser.add_argument(
        "--audio_file",
        type=argparse.FileType("rb"),
        help="audio file to transcribe",
        default="./test_utter2.wav",
    )
    args = parser.parse_args()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(VADUtterance.stream_utterance_file(args.audio_file))


if __name__ == "__main__":
    main()
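A sketch of consuming `VADUtterance.stream_events`, following the event codes in its docstring (0 = speech buffer, 1 = started speaking, 2 = stopped speaking); the audio path is illustrative:

```python
import asyncio

import pydub
from pydub.utils import make_chunks

from plume.utils.vad import DEFAULT_CHUNK_DUR, VADUtterance


async def demo(path):
    async def chunk_gen():
        # feed fixed-duration chunks, matching what the VAD expects
        seg = pydub.AudioSegment.from_file(path).set_frame_rate(32000)
        for c in make_chunks(seg, DEFAULT_CHUNK_DUR):
            yield c

    vad = VADUtterance()
    async for code, payload in vad.stream_events(chunk_gen()):
        if code == 0:
            print(f"utterance of {payload.duration_seconds:.2f}s")
        elif code == 1:
            print("started speaking")
        elif code == 2:
            print("stopped speaking")


asyncio.run(demo("./test_utter2.wav"))
```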

View File

@@ -58,6 +58,9 @@ extra_requirements = {
         "stringcase~=1.2.0",
         "google-cloud-speech~=1.3.1",
     ],
+    "ui": [
+        "rangehttpserver~=1.2.0",
+    ],
     "train": ["torchaudio~=0.6.0", "torch-stft~=0.1.4"],
 }