1. Self-contained typers
2. ASR force-aligner visualization
3. Streamlit state management abstraction
4. New utils / reorganize
5. Added verbose flags
6. Add TTS by name
tegra
parent
f72c6bbe5b
commit
c474aa5f5a
|
|
@ -0,0 +1 @@
|
||||||
|
graft plume/utils/gentle_preview
|
||||||
|
|
@ -7,12 +7,12 @@ from .eval import app as eval_app
|
||||||
from .serve import app as serve_app
|
from .serve import app as serve_app
|
||||||
|
|
||||||
app = typer.Typer()
|
app = typer.Typer()
|
||||||
app.add_typer(data_app, name="data")
|
app.add_typer(data_app)
|
||||||
app.add_typer(ui_app, name="ui")
|
app.add_typer(ui_app)
|
||||||
app.add_typer(train_app, name="train")
|
app.add_typer(train_app)
|
||||||
app.add_typer(eval_app, name="eval")
|
app.add_typer(eval_app)
|
||||||
app.add_typer(serve_app, name="serve")
|
app.add_typer(serve_app)
|
||||||
app.add_typer(utils_app, name='utils')
|
app.add_typer(utils_app)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
|
||||||
|
|
@ -27,6 +27,13 @@ app.add_typer(generate_app, name="generate")
|
||||||
app.add_typer(wav2vec2_app, name="wav2vec2")
|
app.add_typer(wav2vec2_app, name="wav2vec2")
|
||||||
|
|
||||||
|
|
||||||
|
@app.callback()
|
||||||
|
def data():
|
||||||
|
"""
|
||||||
|
data sub commands
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
@app.command()
|
@app.command()
|
||||||
def fix_path(dataset_path: Path, force: bool = False):
|
def fix_path(dataset_path: Path, force: bool = False):
|
||||||
manifest_path = dataset_path / Path("manifest.json")
|
manifest_path = dataset_path / Path("manifest.json")
|
||||||
|
|
|
||||||
|
|
@ -3,3 +3,10 @@ from ..models.wav2vec2.eval import app as wav2vec2_app
|
||||||
|
|
||||||
app = typer.Typer()
|
app = typer.Typer()
|
||||||
app.add_typer(wav2vec2_app, name="wav2vec2")
|
app.add_typer(wav2vec2_app, name="wav2vec2")
|
||||||
|
|
||||||
|
|
||||||
|
@app.callback()
|
||||||
|
def eval():
|
||||||
|
"""
|
||||||
|
eval sub commands
|
||||||
|
"""
|
||||||
|
|
|
||||||
|
|
@ -5,3 +5,10 @@ from ..models.jasper.serve import app as jasper_app
|
||||||
app = typer.Typer()
|
app = typer.Typer()
|
||||||
app.add_typer(wav2vec2_app, name="wav2vec2")
|
app.add_typer(wav2vec2_app, name="wav2vec2")
|
||||||
app.add_typer(jasper_app, name="jasper")
|
app.add_typer(jasper_app, name="jasper")
|
||||||
|
|
||||||
|
|
||||||
|
@app.callback()
|
||||||
|
def serve():
|
||||||
|
"""
|
||||||
|
serve sub commands
|
||||||
|
"""
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,12 @@
|
||||||
import typer
|
import typer
|
||||||
from ..models.wav2vec2.train import app as train_app
|
from ..models.wav2vec2.train import app as wav2vec2_app
|
||||||
|
|
||||||
app = typer.Typer()
|
app = typer.Typer()
|
||||||
app.add_typer(train_app, name="wav2vec2")
|
app.add_typer(wav2vec2_app, name="wav2vec2")
|
||||||
|
|
||||||
|
|
||||||
|
@app.callback()
|
||||||
|
def train():
|
||||||
|
"""
|
||||||
|
train sub commands
|
||||||
|
"""
|
||||||
|
|
|
||||||
|
|
@ -3,12 +3,20 @@ import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from plume.utils import lazy_module
|
from plume.utils import lazy_module
|
||||||
|
|
||||||
# from streamlit import cli as stcli
|
# from streamlit import cli as stcli
|
||||||
|
|
||||||
stcli = lazy_module('streamlit.cli')
|
stcli = lazy_module("streamlit.cli")
|
||||||
app = typer.Typer()
|
app = typer.Typer()
|
||||||
|
|
||||||
|
|
||||||
|
@app.callback()
|
||||||
|
def ui():
|
||||||
|
"""
|
||||||
|
ui sub commands
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
@app.command()
|
@app.command()
|
||||||
def annotation(data_dir: Path, dump_fname: Path = "ui_dump.json", task_id: str = ""):
|
def annotation(data_dir: Path, dump_fname: Path = "ui_dump.json", task_id: str = ""):
|
||||||
annotation_lit_path = Path(__file__).parent / Path("annotation.py")
|
annotation_lit_path = Path(__file__).parent / Path("annotation.py")
|
||||||
|
|
@ -40,13 +48,7 @@ def annotation(data_dir: Path, dump_fname: Path = "ui_dump.json", task_id: str =
|
||||||
@app.command()
|
@app.command()
|
||||||
def preview(manifest_path: Path):
|
def preview(manifest_path: Path):
|
||||||
annotation_lit_path = Path(__file__).parent / Path("preview.py")
|
annotation_lit_path = Path(__file__).parent / Path("preview.py")
|
||||||
sys.argv = [
|
sys.argv = ["streamlit", "run", str(annotation_lit_path), "--", str(manifest_path)]
|
||||||
"streamlit",
|
|
||||||
"run",
|
|
||||||
str(annotation_lit_path),
|
|
||||||
"--",
|
|
||||||
str(manifest_path)
|
|
||||||
]
|
|
||||||
sys.exit(stcli.main())
|
sys.exit(stcli.main())
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -56,6 +58,18 @@ def collection(data_dir: Path, task_id: str = ""):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
@app.command()
|
||||||
|
def alignment(preview_dir: Path, port: int = 8010):
|
||||||
|
from RangeHTTPServer import RangeRequestHandler
|
||||||
|
from functools import partial
|
||||||
|
from http.server import HTTPServer
|
||||||
|
|
||||||
|
server_address = ("", port)
|
||||||
|
handler_class = partial(RangeRequestHandler, directory=str(preview_dir))
|
||||||
|
httpd = HTTPServer(server_address, handler_class)
|
||||||
|
httpd.serve_forever()
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
app()
|
app()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,66 +1,14 @@
|
||||||
# import sys
|
# import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from uuid import uuid4
|
|
||||||
|
|
||||||
import streamlit as st
|
import streamlit as st
|
||||||
import typer
|
import typer
|
||||||
|
from plume.utils import ExtendedPath
|
||||||
from plume.utils import ExtendedPath, get_mongo_conn
|
from plume.utils.ui_persist import setup_mongo_asr_validation_state
|
||||||
from plume.preview.st_rerun import rerun
|
|
||||||
|
|
||||||
app = typer.Typer()
|
app = typer.Typer()
|
||||||
|
|
||||||
|
setup_mongo_asr_validation_state(st)
|
||||||
if not hasattr(st, "mongo_connected"):
|
|
||||||
st.mongoclient = get_mongo_conn(col="asr_validation")
|
|
||||||
mongo_conn = st.mongoclient
|
|
||||||
st.task_id = str(uuid4())
|
|
||||||
|
|
||||||
def current_cursor_fn():
|
|
||||||
# mongo_conn = st.mongoclient
|
|
||||||
cursor_obj = mongo_conn.find_one(
|
|
||||||
{"type": "current_cursor", "task_id": st.task_id}
|
|
||||||
)
|
|
||||||
cursor_val = cursor_obj["cursor"]
|
|
||||||
return cursor_val
|
|
||||||
|
|
||||||
def update_cursor_fn(val=0):
|
|
||||||
mongo_conn.find_one_and_update(
|
|
||||||
{"type": "current_cursor", "task_id": st.task_id},
|
|
||||||
{"$set": {"type": "current_cursor", "task_id": st.task_id, "cursor": val}},
|
|
||||||
upsert=True,
|
|
||||||
)
|
|
||||||
rerun()
|
|
||||||
|
|
||||||
def get_correction_entry_fn(code):
|
|
||||||
return mongo_conn.find_one(
|
|
||||||
{"type": "correction", "code": code}, projection={"_id": False}
|
|
||||||
)
|
|
||||||
|
|
||||||
def update_entry_fn(code, value):
|
|
||||||
mongo_conn.find_one_and_update(
|
|
||||||
{"type": "correction", "code": code},
|
|
||||||
{"$set": {"value": value, "task_id": st.task_id}},
|
|
||||||
upsert=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
def set_task_fn(data_path, task_id):
|
|
||||||
if task_id:
|
|
||||||
st.task_id = task_id
|
|
||||||
task_path = data_path / Path(f"task-{st.task_id}.lck")
|
|
||||||
if not task_path.exists():
|
|
||||||
print(f"creating task lock at {task_path}")
|
|
||||||
task_path.touch()
|
|
||||||
|
|
||||||
st.get_current_cursor = current_cursor_fn
|
|
||||||
st.update_cursor = update_cursor_fn
|
|
||||||
st.get_correction_entry = get_correction_entry_fn
|
|
||||||
st.update_entry = update_entry_fn
|
|
||||||
st.set_task = set_task_fn
|
|
||||||
st.mongo_connected = True
|
|
||||||
cursor_obj = mongo_conn.find_one({"type": "current_cursor", "task_id": st.task_id})
|
|
||||||
if not cursor_obj:
|
|
||||||
update_cursor_fn(0)
|
|
||||||
|
|
||||||
|
|
||||||
@st.cache()
|
@st.cache()
|
||||||
|
|
|
||||||
|
|
@ -3,27 +3,11 @@ from pathlib import Path
|
||||||
import streamlit as st
|
import streamlit as st
|
||||||
import typer
|
import typer
|
||||||
from plume.utils import ExtendedPath
|
from plume.utils import ExtendedPath
|
||||||
from plume.preview.st_rerun import rerun
|
from plume.utils.ui_persist import setup_file_state
|
||||||
|
|
||||||
app = typer.Typer()
|
app = typer.Typer()
|
||||||
|
|
||||||
if not hasattr(st, "state_lock"):
|
setup_file_state(st)
|
||||||
# st.task_id = str(uuid4())
|
|
||||||
task_path = ExtendedPath("preview.lck")
|
|
||||||
|
|
||||||
def current_cursor_fn():
|
|
||||||
return task_path.read_json()["current_cursor"]
|
|
||||||
|
|
||||||
def update_cursor_fn(val=0):
|
|
||||||
task_path.write_json({"current_cursor": val})
|
|
||||||
rerun()
|
|
||||||
|
|
||||||
st.get_current_cursor = current_cursor_fn
|
|
||||||
st.update_cursor = update_cursor_fn
|
|
||||||
st.state_lock = True
|
|
||||||
# cursor_obj = mongo_conn.find_one({"type": "current_cursor", "task_id": st.task_id})
|
|
||||||
# if not cursor_obj:
|
|
||||||
update_cursor_fn(0)
|
|
||||||
|
|
||||||
|
|
||||||
@st.cache()
|
@st.cache()
|
||||||
|
|
@ -40,7 +24,7 @@ def main(manifest: Path):
|
||||||
print("Invalid samplno resetting to 0")
|
print("Invalid samplno resetting to 0")
|
||||||
st.update_cursor(0)
|
st.update_cursor(0)
|
||||||
sample = asr_data[sample_no]
|
sample = asr_data[sample_no]
|
||||||
st.title(f"ASR Manifest Preview")
|
st.title("ASR Manifest Preview")
|
||||||
st.markdown(f"{sample_no+1} of {len(asr_data)} : **{sample['text']}**")
|
st.markdown(f"{sample_no+1} of {len(asr_data)} : **{sample['text']}**")
|
||||||
new_sample = st.number_input(
|
new_sample = st.number_input(
|
||||||
"Go To Sample:", value=sample_no + 1, min_value=1, max_value=len(asr_data)
|
"Go To Sample:", value=sample_no + 1, min_value=1, max_value=len(asr_data)
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,151 @@
|
||||||
|
/data/
|
||||||
|
/model/
|
||||||
|
/train/
|
||||||
|
.env*
|
||||||
|
*.yaml
|
||||||
|
*.yml
|
||||||
|
*.json
|
||||||
|
|
||||||
|
|
||||||
|
# Created by https://www.gitignore.io/api/python
|
||||||
|
# Edit at https://www.gitignore.io/?templates=python
|
||||||
|
|
||||||
|
### Python ###
|
||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
pip-wheel-metadata/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
target/
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
.python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||||
|
# install all needed dependencies.
|
||||||
|
#Pipfile.lock
|
||||||
|
|
||||||
|
# celery beat schedule file
|
||||||
|
celerybeat-schedule
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# Mr Developer
|
||||||
|
.mr.developer.cfg
|
||||||
|
.project
|
||||||
|
.pydevproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
||||||
|
# End of https://www.gitignore.io/api/python
|
||||||
|
|
||||||
|
# Created by https://www.gitignore.io/api/macos
|
||||||
|
# Edit at https://www.gitignore.io/?templates=macos
|
||||||
|
|
||||||
|
### macOS ###
|
||||||
|
# General
|
||||||
|
.DS_Store
|
||||||
|
.AppleDouble
|
||||||
|
.LSOverride
|
||||||
|
|
||||||
|
# Icon must end with two \r
|
||||||
|
Icon
|
||||||
|
|
||||||
|
# Thumbnails
|
||||||
|
._*
|
||||||
|
|
||||||
|
# Files that might appear in the root of a volume
|
||||||
|
.DocumentRevisions-V100
|
||||||
|
.fseventsd
|
||||||
|
.Spotlight-V100
|
||||||
|
.TemporaryItems
|
||||||
|
.Trashes
|
||||||
|
.VolumeIcon.icns
|
||||||
|
.com.apple.timemachine.donotpresent
|
||||||
|
|
||||||
|
# Directories potentially created on remote AFP share
|
||||||
|
.AppleDB
|
||||||
|
.AppleDesktop
|
||||||
|
Network Trash Folder
|
||||||
|
Temporary Items
|
||||||
|
.apdisk
|
||||||
|
|
||||||
|
# End of https://www.gitignore.io/api/macos
|
||||||
|
|
@ -11,12 +11,14 @@ from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
|
||||||
import subprocess
|
import subprocess
|
||||||
import shutil
|
import shutil
|
||||||
from urllib.parse import urlsplit
|
from urllib.parse import urlsplit
|
||||||
|
|
||||||
# from .lazy_loader import LazyLoader
|
# from .lazy_loader import LazyLoader
|
||||||
from .lazy_import import lazy_callable, lazy_module
|
from .lazy_import import lazy_callable, lazy_module
|
||||||
|
|
||||||
# from ruamel.yaml import YAML
|
# from ruamel.yaml import YAML
|
||||||
# import boto3
|
# import boto3
|
||||||
import typer
|
import typer
|
||||||
|
|
||||||
# import pymongo
|
# import pymongo
|
||||||
# from slugify import slugify
|
# from slugify import slugify
|
||||||
# import pydub
|
# import pydub
|
||||||
|
|
@ -34,16 +36,16 @@ from .tts import app as tts_app
|
||||||
from .transcribe import app as transcribe_app
|
from .transcribe import app as transcribe_app
|
||||||
from .align import app as align_app
|
from .align import app as align_app
|
||||||
|
|
||||||
boto3 = lazy_module('boto3')
|
boto3 = lazy_module("boto3")
|
||||||
pymongo = lazy_module('pymongo')
|
pymongo = lazy_module("pymongo")
|
||||||
pydub = lazy_module('pydub')
|
pydub = lazy_module("pydub")
|
||||||
audio_display = lazy_module('librosa.display')
|
audio_display = lazy_module("librosa.display")
|
||||||
plt = lazy_module('matplotlib.pyplot')
|
plt = lazy_module("matplotlib.pyplot")
|
||||||
librosa = lazy_module('librosa')
|
librosa = lazy_module("librosa")
|
||||||
YAML = lazy_callable('ruamel.yaml.YAML')
|
YAML = lazy_callable("ruamel.yaml.YAML")
|
||||||
num2words = lazy_callable('num2words.num2words')
|
num2words = lazy_callable("num2words.num2words")
|
||||||
slugify = lazy_callable('slugify.slugify')
|
slugify = lazy_callable("slugify.slugify")
|
||||||
compress = lazy_callable('natural.date.compress')
|
compress = lazy_callable("natural.date.compress")
|
||||||
|
|
||||||
app = typer.Typer()
|
app = typer.Typer()
|
||||||
app.add_typer(tts_app, name="tts")
|
app.add_typer(tts_app, name="tts")
|
||||||
|
|
@ -51,6 +53,13 @@ app.add_typer(align_app, name="align")
|
||||||
app.add_typer(transcribe_app, name="transcribe")
|
app.add_typer(transcribe_app, name="transcribe")
|
||||||
|
|
||||||
|
|
||||||
|
@app.callback()
|
||||||
|
def utils():
|
||||||
|
"""
|
||||||
|
utils sub commands
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||||
)
|
)
|
||||||
|
|
@ -125,6 +134,10 @@ def upload_s3(dataset_path, s3_path):
|
||||||
run_shell(f"aws s3 sync {dataset_path} {s3_path}")
|
run_shell(f"aws s3 sync {dataset_path} {s3_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def copy_s3(dataset_path, s3_path):
|
||||||
|
run_shell(f"aws s3 cp {dataset_path} {s3_path}")
|
||||||
|
|
||||||
|
|
||||||
def get_download_path(s3_uri, output_path):
|
def get_download_path(s3_uri, output_path):
|
||||||
s3_uri_p = urlsplit(s3_uri)
|
s3_uri_p = urlsplit(s3_uri)
|
||||||
download_path = output_path / Path(s3_uri_p.path[1:])
|
download_path = output_path / Path(s3_uri_p.path[1:])
|
||||||
|
|
@ -135,11 +148,12 @@ def get_download_path(s3_uri, output_path):
|
||||||
def s3_downloader():
|
def s3_downloader():
|
||||||
s3 = boto3.client("s3")
|
s3 = boto3.client("s3")
|
||||||
|
|
||||||
def download_s3(s3_uri, download_path):
|
def download_s3(s3_uri, download_path, verbose=False):
|
||||||
s3_uri_p = urlsplit(s3_uri)
|
s3_uri_p = urlsplit(s3_uri)
|
||||||
download_path.parent.mkdir(exist_ok=True, parents=True)
|
download_path.parent.mkdir(exist_ok=True, parents=True)
|
||||||
if not download_path.exists():
|
if not download_path.exists():
|
||||||
print(f"downloading {s3_uri} to {download_path}")
|
if verbose:
|
||||||
|
print(f"downloading {s3_uri} to {download_path}")
|
||||||
s3.download_file(s3_uri_p.netloc, s3_uri_p.path[1:], str(download_path))
|
s3.download_file(s3_uri_p.netloc, s3_uri_p.path[1:], str(download_path))
|
||||||
|
|
||||||
return download_s3
|
return download_s3
|
||||||
|
|
@ -186,6 +200,7 @@ def ui_data_generator(dataset_dir, asr_data_source, verbose=False):
|
||||||
plot_seg(wav_plot_path.absolute(), audio_file)
|
plot_seg(wav_plot_path.absolute(), audio_file)
|
||||||
return {
|
return {
|
||||||
"audio_path": str(rel_data_path),
|
"audio_path": str(rel_data_path),
|
||||||
|
"audio_filepath": str(rel_data_path),
|
||||||
"duration": round(audio_dur, 1),
|
"duration": round(audio_dur, 1),
|
||||||
"text": transcript,
|
"text": transcript,
|
||||||
"real_idx": num_datapoints,
|
"real_idx": num_datapoints,
|
||||||
|
|
@ -229,17 +244,17 @@ def ui_dump_manifest_writer(dataset_dir, asr_data_source, verbose=False):
|
||||||
)
|
)
|
||||||
|
|
||||||
asr_manifest = dataset_dir / Path("manifest.json")
|
asr_manifest = dataset_dir / Path("manifest.json")
|
||||||
with asr_manifest.open("w") as mf:
|
asr_manifest_writer(asr_manifest, dump_data, verbose=verbose)
|
||||||
print(f"writing manifest to {asr_manifest}")
|
# with asr_manifest.open("w") as mf:
|
||||||
for d in dump_data:
|
# print(f"writing manifest to {asr_manifest}")
|
||||||
rel_data_path = d["audio_path"]
|
# for d in dump_data:
|
||||||
audio_dur = d["duration"]
|
# rel_data_path = d["audio_path"]
|
||||||
transcript = d["text"]
|
# audio_dur = d["duration"]
|
||||||
manifest = manifest_str(str(rel_data_path), audio_dur, transcript)
|
# transcript = d["text"]
|
||||||
mf.write(manifest)
|
# manifest = manifest_str(str(rel_data_path), audio_dur, transcript)
|
||||||
|
# mf.write(manifest)
|
||||||
ui_dump_file = dataset_dir / Path("ui_dump.json")
|
ui_dump_file = dataset_dir / Path("ui_dump.json")
|
||||||
ExtendedPath(ui_dump_file).write_json({"data": dump_data})
|
ExtendedPath(ui_dump_file).write_json({"data": dump_data}, verbose=verbose)
|
||||||
return num_datapoints
|
return num_datapoints
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -254,9 +269,10 @@ def asr_manifest_reader(data_manifest_path: Path):
|
||||||
yield p
|
yield p
|
||||||
|
|
||||||
|
|
||||||
def asr_manifest_writer(asr_manifest_path: Path, manifest_str_source):
|
def asr_manifest_writer(asr_manifest_path: Path, manifest_str_source, verbose=False):
|
||||||
with asr_manifest_path.open("w") as mf:
|
with asr_manifest_path.open("w") as mf:
|
||||||
print(f"opening {asr_manifest_path} for writing manifest")
|
if verbose:
|
||||||
|
print(f"writing asr manifest to {asr_manifest_path}")
|
||||||
for mani_dict in manifest_str_source:
|
for mani_dict in manifest_str_source:
|
||||||
manifest = manifest_str(
|
manifest = manifest_str(
|
||||||
mani_dict["audio_filepath"], mani_dict["duration"], mani_dict["text"]
|
mani_dict["audio_filepath"], mani_dict["duration"], mani_dict["text"]
|
||||||
|
|
@ -293,37 +309,43 @@ def batch(iterable, n=1):
|
||||||
class ExtendedPath(type(Path())):
|
class ExtendedPath(type(Path())):
|
||||||
"""docstring for ExtendedPath."""
|
"""docstring for ExtendedPath."""
|
||||||
|
|
||||||
def read_json(self):
|
def read_json(self, verbose=False):
|
||||||
print(f"reading json from {self}")
|
if verbose:
|
||||||
|
print(f"reading json from {self}")
|
||||||
with self.open("r") as jf:
|
with self.open("r") as jf:
|
||||||
return json.load(jf)
|
return json.load(jf)
|
||||||
|
|
||||||
def read_yaml(self):
|
def read_yaml(self, verbose=False):
|
||||||
yaml = YAML(typ="safe", pure=True)
|
yaml = YAML(typ="safe", pure=True)
|
||||||
print(f"reading yaml from {self}")
|
if verbose:
|
||||||
|
print(f"reading yaml from {self}")
|
||||||
with self.open("r") as yf:
|
with self.open("r") as yf:
|
||||||
return yaml.load(yf)
|
return yaml.load(yf)
|
||||||
|
|
||||||
def read_jsonl(self):
|
def read_jsonl(self, verbose=False):
|
||||||
print(f"reading jsonl from {self}")
|
if verbose:
|
||||||
|
print(f"reading jsonl from {self}")
|
||||||
with self.open("r") as jf:
|
with self.open("r") as jf:
|
||||||
for l in jf.readlines():
|
for ln in jf.readlines():
|
||||||
yield json.loads(l)
|
yield json.loads(ln)
|
||||||
|
|
||||||
def write_json(self, data):
|
def write_json(self, data, verbose=False):
|
||||||
print(f"writing json to {self}")
|
if verbose:
|
||||||
|
print(f"writing json to {self}")
|
||||||
self.parent.mkdir(parents=True, exist_ok=True)
|
self.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with self.open("w") as jf:
|
with self.open("w") as jf:
|
||||||
json.dump(data, jf, indent=2)
|
json.dump(data, jf, indent=2)
|
||||||
|
|
||||||
def write_yaml(self, data):
|
def write_yaml(self, data, verbose=False):
|
||||||
yaml = YAML()
|
yaml = YAML()
|
||||||
print(f"writing yaml to {self}")
|
if verbose:
|
||||||
|
print(f"writing yaml to {self}")
|
||||||
with self.open("w") as yf:
|
with self.open("w") as yf:
|
||||||
yaml.dump(data, yf)
|
yaml.dump(data, yf)
|
||||||
|
|
||||||
def write_jsonl(self, data):
|
def write_jsonl(self, data, verbose=False):
|
||||||
print(f"writing jsonl to {self}")
|
if verbose:
|
||||||
|
print(f"writing jsonl to {self}")
|
||||||
self.parent.mkdir(parents=True, exist_ok=True)
|
self.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with self.open("w") as jf:
|
with self.open("w") as jf:
|
||||||
for d in data:
|
for d in data:
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,14 @@
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from .tts import GoogleTTS
|
|
||||||
# from IPython import display
|
# from IPython import display
|
||||||
import requests
|
import requests
|
||||||
import io
|
import io
|
||||||
import typer
|
import shutil
|
||||||
|
|
||||||
|
import typer
|
||||||
from plume.utils import lazy_module
|
from plume.utils import lazy_module
|
||||||
|
|
||||||
|
from .tts import GoogleTTS
|
||||||
|
|
||||||
display = lazy_module('IPython.display')
|
display = lazy_module('IPython.display')
|
||||||
pydub = lazy_module('pydub')
|
pydub = lazy_module('pydub')
|
||||||
|
|
||||||
|
|
@ -63,16 +65,19 @@ def gentle_preview(
|
||||||
audio_path: Path,
|
audio_path: Path,
|
||||||
transcript_path: Path,
|
transcript_path: Path,
|
||||||
service_uri="http://101.53.142.218:8765/transcriptions",
|
service_uri="http://101.53.142.218:8765/transcriptions",
|
||||||
gent_preview_dir="../gentle_preview",
|
gent_preview_dir="./gentle_preview",
|
||||||
):
|
):
|
||||||
from . import ExtendedPath
|
from . import ExtendedPath
|
||||||
|
|
||||||
ab = audio_path.read_bytes()
|
pkg_gentle_dir = Path(__file__).parent / 'gentle_preview'
|
||||||
tt = transcript_path.read_text()
|
|
||||||
audio, alignment = gentle_aligner(service_uri, ab, tt)
|
shutil.copytree(str(pkg_gentle_dir), str(gent_preview_dir))
|
||||||
audio.export(gent_preview_dir / Path("a.wav"), format="wav")
|
# ab = audio_path.read_bytes()
|
||||||
alignment["status"] = "OK"
|
# tt = transcript_path.read_text()
|
||||||
ExtendedPath(gent_preview_dir / Path("status.json")).write_json(alignment)
|
# audio, alignment = gentle_aligner(service_uri, ab, tt)
|
||||||
|
# audio.export(gent_preview_dir / Path("a.wav"), format="wav")
|
||||||
|
# alignment["status"] = "OK"
|
||||||
|
# ExtendedPath(gent_preview_dir / Path("status.json")).write_json(alignment)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,5 @@
|
||||||
|
Serve with https://github.com/danvk/RangeHTTPServer
|
||||||
|
`https://github.com/claysciences/CORSRangeHTTPServer`
|
||||||
|
|
||||||
|
`python -m RangeHTTPServer`
|
||||||
|
`python -m http.server`
|
||||||
|
|
@ -0,0 +1,80 @@
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8" />
|
||||||
|
<style>
|
||||||
|
body {font-family: sans-serif; padding-top: 70px; }
|
||||||
|
textarea { width: 500px; height: 20em; }
|
||||||
|
input, textarea { margin: 1em 0; }
|
||||||
|
#header {
|
||||||
|
position: fixed;
|
||||||
|
top: 0;
|
||||||
|
left: 0;
|
||||||
|
height: 50px;
|
||||||
|
line-height: 50px;
|
||||||
|
width: 100%;
|
||||||
|
background-color: #999;
|
||||||
|
box-shadow: 0px 0px 5px 0px rgba(0,0,0,0.5);
|
||||||
|
font-family: Helvetica, sans-serif;
|
||||||
|
}
|
||||||
|
#header, #header a {
|
||||||
|
color: white;
|
||||||
|
}
|
||||||
|
.home {
|
||||||
|
margin: 0;
|
||||||
|
font-size: 125%;
|
||||||
|
font-weight: lighter;
|
||||||
|
text-transform: lowercase;
|
||||||
|
}
|
||||||
|
.home a {
|
||||||
|
margin: 0;
|
||||||
|
background: #666;
|
||||||
|
padding-left: 25px;
|
||||||
|
padding-right: 30px;
|
||||||
|
margin-right: 20px;
|
||||||
|
float: left;
|
||||||
|
text-decoration: none;
|
||||||
|
}
|
||||||
|
.home:hover a {
|
||||||
|
background: #555;
|
||||||
|
}
|
||||||
|
#align-button {
|
||||||
|
background: #CCC;
|
||||||
|
border: 0;
|
||||||
|
font-size: 18px;
|
||||||
|
padding: 10px 30px;
|
||||||
|
cursor: pointer;
|
||||||
|
}
|
||||||
|
#alignment-flags {
|
||||||
|
background: #CCC;
|
||||||
|
border: 0;
|
||||||
|
font-size: 18px;
|
||||||
|
padding: 10px 30px;
|
||||||
|
}
|
||||||
|
#footer {
|
||||||
|
margin-top: 100px;
|
||||||
|
border-top: 1px dotted black;
|
||||||
|
font-size: 8pt;
|
||||||
|
font-style: italic;
|
||||||
|
padding: 10px;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div id="header">
|
||||||
|
<h1 class="home"><a href="/">Gentle</a></h1>
|
||||||
|
</div>
|
||||||
|
<form action="/transcriptions" method="POST" enctype="multipart/form-data">
|
||||||
|
Audio:<br>
|
||||||
|
<input type=file name=audio><br>
|
||||||
|
<br>
|
||||||
|
Transcript:<br>
|
||||||
|
<textarea name="transcript"></textarea><br>
|
||||||
|
<input id=alignment-flags name=conservative type=checkbox> Conservative<br>
|
||||||
|
<input id=alignment-flags name=disfluency type=checkbox> Include disfluencies<br>
|
||||||
|
<input id="align-button" type=submit value=Align>
|
||||||
|
</form>
|
||||||
|
<div id="footer">
|
||||||
|
<a href="https://lowerquality.com/gentle">Gentle</a> is free software released under the <a href="https://opensource.org/licenses/MIT">MIT license</a>. <a href="https://lowerquality.com/gentle">Homepage</a> | <a href="https://github.com/lowerquality/gentle">Source code</a>.
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
|
@ -0,0 +1,408 @@
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8" />
|
||||||
|
<style>
|
||||||
|
html, body {
|
||||||
|
margin: 0;
|
||||||
|
padding: 0;
|
||||||
|
}
|
||||||
|
#header {
|
||||||
|
position: fixed;
|
||||||
|
top: 0;
|
||||||
|
left: 0;
|
||||||
|
height: 50px;
|
||||||
|
line-height: 50px;
|
||||||
|
width: 100%;
|
||||||
|
background-color: #999;
|
||||||
|
box-shadow: 0px 0px 5px 0px rgba(0,0,0,0.5);
|
||||||
|
font-family: Helvetica, sans-serif;
|
||||||
|
}
|
||||||
|
#header, #header a {
|
||||||
|
color: white;
|
||||||
|
}
|
||||||
|
#downloads {
|
||||||
|
float: right;
|
||||||
|
background: #999;
|
||||||
|
}
|
||||||
|
.download {
|
||||||
|
float: right;
|
||||||
|
background: #999;
|
||||||
|
padding: 0 5px;
|
||||||
|
}
|
||||||
|
.home {
|
||||||
|
margin: 0;
|
||||||
|
font-size: 125%;
|
||||||
|
font-weight: lighter;
|
||||||
|
text-transform: lowercase;
|
||||||
|
}
|
||||||
|
.home a {
|
||||||
|
margin: 0;
|
||||||
|
background: #666;
|
||||||
|
padding-left: 25px;
|
||||||
|
padding-right: 30px;
|
||||||
|
margin-right: 20px;
|
||||||
|
float: left;
|
||||||
|
text-decoration: none;
|
||||||
|
}
|
||||||
|
.home:hover a {
|
||||||
|
background: #555;
|
||||||
|
}
|
||||||
|
#audio {
|
||||||
|
margin-top: 9px;
|
||||||
|
width: 50%;
|
||||||
|
display: inline-block;
|
||||||
|
}
|
||||||
|
#transcript {
|
||||||
|
margin: 0 15px;
|
||||||
|
margin-top: 70px;
|
||||||
|
margin-bottom: 5em;
|
||||||
|
white-space: pre-wrap;
|
||||||
|
line-height: 2em;
|
||||||
|
max-width: 600px;
|
||||||
|
color: #999;
|
||||||
|
}
|
||||||
|
#transcript.status {
|
||||||
|
background-color: #333;
|
||||||
|
color: #fff;
|
||||||
|
font-family: Courier, mono;
|
||||||
|
line-height: 1em;
|
||||||
|
font-size: 10pt;
|
||||||
|
max-width: 100%;
|
||||||
|
}
|
||||||
|
#transcript.status h2 {
|
||||||
|
padding: 10px;
|
||||||
|
}
|
||||||
|
#transcript.status .entry {
|
||||||
|
margin-bottom: 10px;
|
||||||
|
padding: 10px;
|
||||||
|
}
|
||||||
|
#transcript.status progress {
|
||||||
|
width: 100%;
|
||||||
|
height: 30px;
|
||||||
|
margin-bottom: 20px;
|
||||||
|
}
|
||||||
|
.success {
|
||||||
|
color: black;
|
||||||
|
}
|
||||||
|
.success:hover {
|
||||||
|
text-decoration: underline;
|
||||||
|
}
|
||||||
|
.active {
|
||||||
|
color: magenta;
|
||||||
|
}
|
||||||
|
#preloader {
|
||||||
|
visibility: hidden;
|
||||||
|
}
|
||||||
|
.phactive {
|
||||||
|
text-decoration: underline;
|
||||||
|
}
|
||||||
|
.phones {
|
||||||
|
position: absolute;
|
||||||
|
color: #333;
|
||||||
|
}
|
||||||
|
.phones .phone {
|
||||||
|
margin-right: 5px;
|
||||||
|
font-family: Helvetica, sans-serif;
|
||||||
|
text-transform: uppercase;
|
||||||
|
font-size: 50%;
|
||||||
|
}
|
||||||
|
.phones .phone:last-child {
|
||||||
|
margin-right: 0;
|
||||||
|
}
|
||||||
|
#footer {
|
||||||
|
margin-top: 100px;
|
||||||
|
border-top: 1px dotted black;
|
||||||
|
font-size: 8pt;
|
||||||
|
font-style: italic;
|
||||||
|
font-family: Helvetica, sans-serif;
|
||||||
|
padding: 10px;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div id="header">
|
||||||
|
<!-- <h1 class="home"><a href="/">Gentle</a></h1> -->
|
||||||
|
<audio id="audio" src="a.wav" controls="true" preload="auto"></audio>
|
||||||
|
<img src="/preloader.gif" id="preloader" alt="loading...">
|
||||||
|
<span id="downloads"> </div>
|
||||||
|
</div>
|
||||||
|
<div id="transcript"></div>
|
||||||
|
<!-- <div id="footer">
|
||||||
|
<a href="https://lowerquality.com/gentle">Gentle</a> is free software released under the <a href="https://opensource.org/licenses/MIT">MIT license</a>. <a href="https://lowerquality.com/gentle">Homepage</a> | <a href="https://github.com/lowerquality/gentle">Source code</a>.
|
||||||
|
</div> -->
|
||||||
|
|
||||||
|
<script>
|
||||||
|
|
||||||
|
// Issue an asynchronous GET request for `url` and hand the raw response
// text to `cb` once the request has loaded. No error callback is installed.
function get(url, cb) {
    var request = new XMLHttpRequest();
    request.open("GET", url, true);
    request.onload = function() {
        cb(request.responseText);
    };
    request.send();
}
|
||||||
|
// Fetch `url` via get() and call `cb` with the response parsed as JSON.
// A malformed body makes JSON.parse throw inside the load handler.
function get_json(url, cb) {
    get(url, function(body) {
        var parsed = JSON.parse(body);
        cb(parsed);
    });
}
|
||||||
|
|
||||||
|
// The <audio> element that drives playback and highlighting.
var $a = document.getElementById("audio");

// Space bar (keyCode 32): suppress the default action and pause playback.
window.onkeydown = function(ev) {
    if(ev.keyCode != 32) {
        return;
    }
    ev.preventDefault();
    $a.pause();
}

// Container the transcript (or status output) is rendered into.
var $trans = document.getElementById("transcript");
// Spinner image shown while the server is still working.
var $preloader = document.getElementById('preloader');

// Word records from the alignment JSON, and the word currently highlighted.
var wds = [];
var cur_wd;

// Floating strip that shows the phones of the current word.
var $phones = document.createElement("div");
$phones.className = "phones";
document.body.appendChild($phones);

var cur_phones$ = []; // List of phoneme $divs
var $active_phone;
|
||||||
|
|
||||||
|
// Rebuild the floating phone strip for word `wd` and centre it under the
// word's element (`wd.$div`).
//
// Resets cur_phones$ / $active_phone, creates one <span class="phone"> per
// entry of wd.phones (a word without phones yields an empty strip), and
// positions $phones 18px below the top of the word.
function render_phones(wd) {
    cur_phones$ = [];
    $phones.innerHTML = "";
    $active_phone = null;

    // CSS lengths need an explicit unit: a bare number is dropped by
    // browsers in standards mode, leaving the strip unpositioned.
    $phones.style.top = (wd.$div.offsetTop + 18) + "px";
    $phones.style.left = wd.$div.offsetLeft + "px";

    (wd.phones || []).forEach(function(ph) {
        var $p = document.createElement("span");
        $p.className = "phone";
        // Show only the part of the phone label before the first "_".
        $p.textContent = ph.phone.split("_")[0];

        $phones.appendChild($p);
        cur_phones$.push($p);
    });

    // Now that the strip has its final width, centre it over the word.
    var offsetToCenter = (wd.$div.offsetWidth - $phones.offsetWidth) / 2;
    $phones.style.left = (wd.$div.offsetLeft + offsetToCenter) + "px";
}
|
||||||
|
// Mark the phone of the current word that is being spoken at time `t`
// (seconds) with the "phactive" class.
//
// Phone start times are not stored; they are reconstructed by accumulating
// each phone's `duration` from the word's `start`. When no word is current
// the phone strip is simply emptied.
function highlight_phone(t) {
    if(!cur_wd) {
        $phones.innerHTML = "";
        return;
    }

    var hit;
    var cur_t = cur_wd.start;

    (cur_wd.phones || []).forEach(function(ph, idx) {
        // Both bounds are inclusive, so on an exact boundary the later
        // phone wins because it overwrites `hit`.
        if(cur_t <= t && cur_t + ph.duration >= t) {
            hit = idx;
        }
        cur_t += ph.duration;
    });

    // Compare against undefined rather than testing truthiness: index 0 is
    // a valid hit and must highlight the first phone of the word.
    if(hit !== undefined) {
        var $ph = cur_phones$[hit];
        if($ph != $active_phone) {
            if($active_phone) {
                $active_phone.classList.remove("phactive");
            }
            if($ph) {
                $ph.classList.add("phactive");
            }
        }
        $active_phone = $ph;
    }
}
|
||||||
|
|
||||||
|
// Animation-frame callback: find the word under the audio playhead, move
// the "active" class to it, refresh the phone strip, then re-schedule
// itself for the next frame.
function highlight_word() {
    var now = $a.currentTime;

    // XXX: O(N); use binary search
    // A word matches when the playhead is more than 10ms inside both of its
    // bounds; when several match, the last one in `wds` is used.
    var next_wd;
    for(var idx = 0; idx < wds.length; idx++) {
        var candidate = wds[idx];
        if((now - candidate.start) > 0.01 && (candidate.end - now) > 0.01) {
            next_wd = candidate;
        }
    }

    if(cur_wd != next_wd) {
        var stale = document.querySelectorAll('.active');
        Array.prototype.forEach.call(stale, function($el) {
            $el.classList.remove('active');
        });
        if(next_wd && next_wd.$div) {
            next_wd.$div.classList.add('active');
            render_phones(next_wd);
        }
    }

    cur_wd = next_wd;
    highlight_phone(now);

    window.requestAnimationFrame(highlight_word);
}
|
||||||
|
window.requestAnimationFrame(highlight_word);
|
||||||
|
|
||||||
|
$trans.innerHTML = "Loading...";
|
||||||
|
|
||||||
|
// Render an alignment result into $trans.
//
// `ret.words` replaces the global `wds`; `ret.transcript` is the full text
// that each word's startOffset/endOffset index into. Every word present in
// the transcript becomes a <span> (stored on the word as `$div`); words
// with a `start` time get the "success" class and seek-and-play on click.
// Text between words is copied through as plain text nodes.
function render(ret) {
    wds = ret['words'] || [];
    // Keep the transcript local: an undeclared assignment would create a
    // global that shadows the #transcript element's window property.
    // Default to '' so a result without a transcript does not throw.
    var transcript = ret['transcript'] || '';

    $trans.innerHTML = '';

    var currentOffset = 0;

    wds.forEach(function(wd) {
        if(wd.case == 'not-found-in-transcript') {
            // TODO: show phonemes somewhere
            // The word has no offsets into the transcript, so emit its own
            // text and leave currentOffset untouched.
            $trans.appendChild(document.createTextNode(' ' + wd.word));
            return;
        }

        // Add non-linked text
        if(wd.startOffset > currentOffset) {
            var gap = transcript.slice(currentOffset, wd.startOffset);
            $trans.appendChild(document.createTextNode(gap));
            currentOffset = wd.startOffset;
        }

        var $wd = document.createElement('span');
        var wordText = transcript.slice(wd.startOffset, wd.endOffset);
        $wd.appendChild(document.createTextNode(wordText));
        wd.$div = $wd;
        if(wd.start !== undefined) {
            $wd.className = 'success';
        }
        $wd.onclick = function() {
            if(wd.start !== undefined) {
                console.log(wd.start);
                $a.currentTime = wd.start;
                $a.play();
            }
        };
        $trans.appendChild($wd);
        currentOffset = wd.endOffset;
    });

    // Whatever follows the last word.
    var tail = transcript.slice(currentOffset, transcript.length);
    $trans.appendChild(document.createTextNode(tail));
}
|
||||||
|
|
||||||
|
// Fill the #downloads element with one link per export format.
// The third path segment of the page URL is used as the job id for the
// zip link; that link is left out when the page is opened from file://.
function show_downloads() {
    var $d = document.getElementById("downloads");
    $d.textContent = "Download as: ";
    var uid = window.location.pathname.split("/")[2];
    var from_file = window.location.protocol == "file:";

    // Name, path, title, inhibit-on-file:///
    var formats = [
        ["CSV", "align.csv", "Word alignment CSV"],
        ["JSON", "align.json", "JSON word/phoneme alignment data"],
        ["Zip", "/zip/" + uid + ".zip", "Standalone zipfile", true]
    ];

    for(var i = 0; i < formats.length; i++) {
        var fmt = formats[i];
        if(fmt[3] && from_file) {
            continue;
        }
        var $link = document.createElement("a");
        $link.className = "download";
        $link.textContent = fmt[0];
        $link.href = fmt[1];
        $link.title = fmt[2];
        $d.appendChild($link);
    }
}
|
||||||
|
|
||||||
|
var status_init = false;
|
||||||
|
var status_log = []; // [ status ]
|
||||||
|
var $status_pro;
|
||||||
|
|
||||||
|
// Show progress for an in-flight job described by status object `ret`.
//
// On first use the transcript container is repurposed as a status panel
// with a <progress> bar. Non-TRANSCRIBING statuses only move the bar;
// TRANSCRIBING statuses also log `ret.message` as a new entry whenever
// `ret.percent` has advanced past the last logged entry.
function render_status(ret) {
    if(!status_init) {
        // Clobber the $trans div and use it for status updates
        $trans.innerHTML = "<h2>transcription in progress</h2>";
        $trans.className = "status";

        $status_pro = document.createElement("progress");
        $status_pro.setAttribute("min", "0");
        $status_pro.setAttribute("max", "100");
        $status_pro.value = 0;
        $trans.appendChild($status_pro);

        status_init = true;
    }

    if(ret.status !== "TRANSCRIBING") {
        if(ret.percent) {
            $status_pro.value = (100*ret.percent);
        }
        return;
    }

    var previous = status_log.length > 0 ? status_log[status_log.length-1] : null;
    var advanced = !previous || previous.percent+0.0001 < ret.percent;
    if(!ret.percent || !advanced) {
        return;
    }

    // New entry
    var $entry = document.createElement("div");
    $entry.className = "entry";
    $entry.textContent = ret.message;
    ret.$div = $entry;

    $status_pro.value = (100*ret.percent);

    // Newest entry goes above the previous one.
    if(previous) {
        $trans.insertBefore($entry, previous.$div);
    }
    else {
        $trans.appendChild($entry);
    }
    status_log.push(ret);
}
|
||||||
|
|
||||||
|
// Drive the page: render inlined alignment data when present, otherwise
// fetch status.json and either render the result, show progress, or report
// an error. Unfinished states re-schedule update() (2s, or 5s for an
// unrecognised status).
function update() {
    if(INLINE_JSON) {
        // We want this to work from file:/// domains, so we provide a
        // mechanism for inlining the alignment data.
        render(INLINE_JSON);
        // show_downloads();
        return;
    }

    // Show the status
    get_json('status.json', function(ret) {
        $a.style.visibility = 'hidden';
        if (ret.status == 'ERROR') {
            $preloader.style.visibility = 'hidden';
            // Build the message as text, not markup: the status and error
            // strings come from fetched data and must not be parsed as HTML.
            var $err = document.createElement('b');
            $err.textContent = ret.status + ': ' + ret.error;
            $trans.innerHTML = '';
            $trans.appendChild($err);
        } else if (ret.status == 'TRANSCRIBING' || ret.status == 'ALIGNING') {
            $preloader.style.visibility = 'visible';
            render_status(ret);
            setTimeout(update, 2000);
        } else if (ret.status == 'OK') {
            // show_downloads();
            $preloader.style.visibility = 'hidden';
            // XXX: should we fetch the align.json?
            // window.location.reload();
            $a.style.visibility = 'visible';
            render(ret);
        } else if (ret.status == 'ENCODING' || ret.status == 'STARTED') {
            $preloader.style.visibility = 'visible';
            $trans.innerHTML = 'Encoding, please wait...';
            setTimeout(update, 2000);
        } else {
            console.log("unknown status", ret);
            $preloader.style.visibility = 'hidden';
            // Same reasoning as the ERROR branch: treat the value as text.
            $trans.textContent = ret.status + '...';
            setTimeout(update, 5000);
        }
    });
}
|
||||||
|
|
||||||
|
var INLINE_JSON;
|
||||||
|
|
||||||
|
update();
|
||||||
|
|
||||||
|
</script></body></html>
|
||||||
Binary file not shown.
|
After Width: | Height: | Size: 2.7 KiB |
|
|
@ -8,12 +8,11 @@ import typer
|
||||||
# import rpyc
|
# import rpyc
|
||||||
|
|
||||||
# from tqdm import tqdm
|
# from tqdm import tqdm
|
||||||
# from pydub import AudioSegment
|
|
||||||
# from pydub.silence import split_on_silence
|
# from pydub.silence import split_on_silence
|
||||||
from plume.utils import lazy_module, lazy_callable
|
from plume.utils import lazy_module, lazy_callable
|
||||||
|
|
||||||
rpyc = lazy_module('rpyc')
|
rpyc = lazy_module('rpyc')
|
||||||
AudioSegment = lazy_callable('pydub.AudioSegment')
|
pydub = lazy_module('pydub')
|
||||||
split_on_silence = lazy_callable('pydub.silence.split_on_silence')
|
split_on_silence = lazy_callable('pydub.silence.split_on_silence')
|
||||||
|
|
||||||
app = typer.Typer()
|
app = typer.Typer()
|
||||||
|
|
@ -106,7 +105,7 @@ def triton_transcribe_grpc_gen(
|
||||||
# ]
|
# ]
|
||||||
# pass
|
# pass
|
||||||
transcript_list = []
|
transcript_list = []
|
||||||
sil_pad = AudioSegment.silent(duration=sil_msec)
|
sil_pad = pydub.AudioSegment.silent(duration=sil_msec)
|
||||||
for seg in chunks:
|
for seg in chunks:
|
||||||
t_seg = sil_pad + seg + sil_pad
|
t_seg = sil_pad + seg + sil_pad
|
||||||
c_transcript = transcriber(t_seg)
|
c_transcript = transcriber(t_seg)
|
||||||
|
|
@ -124,9 +123,7 @@ def triton_transcribe_grpc_gen(
|
||||||
|
|
||||||
@app.command()
|
@app.command()
|
||||||
def file(audio_file: Path, write_file: bool = False, chunked=True):
|
def file(audio_file: Path, write_file: bool = False, chunked=True):
|
||||||
from pydub import AudioSegment
|
aseg = pydub.AudioSegment.from_file(audio_file)
|
||||||
|
|
||||||
aseg = AudioSegment.from_file(audio_file)
|
|
||||||
transcriber, prep = triton_transcribe_grpc_gen()
|
transcriber, prep = triton_transcribe_grpc_gen()
|
||||||
transcription = transcriber(prep(aseg))
|
transcription = transcriber(prep(aseg))
|
||||||
|
|
||||||
|
|
@ -139,10 +136,8 @@ def file(audio_file: Path, write_file: bool = False, chunked=True):
|
||||||
|
|
||||||
@app.command()
|
@app.command()
|
||||||
def benchmark(audio_file: Path):
|
def benchmark(audio_file: Path):
|
||||||
from pydub import AudioSegment
|
|
||||||
|
|
||||||
transcriber, audio_prep = transcribe_rpyc_gen()
|
transcriber, audio_prep = transcribe_rpyc_gen()
|
||||||
file_seg = AudioSegment.from_file(audio_file)
|
file_seg = pydub.AudioSegment.from_file(audio_file)
|
||||||
aud_seg = audio_prep(file_seg)
|
aud_seg = audio_prep(file_seg)
|
||||||
|
|
||||||
def timeinfo():
|
def timeinfo():
|
||||||
|
|
|
||||||
|
|
@ -27,6 +27,10 @@ class GoogleTTS(object):
|
||||||
audio_encoding=texttospeech.enums.AudioEncoding.LINEAR16,
|
audio_encoding=texttospeech.enums.AudioEncoding.LINEAR16,
|
||||||
sample_rate_hertz=params["sample_rate"],
|
sample_rate_hertz=params["sample_rate"],
|
||||||
)
|
)
|
||||||
|
if 'speaking_rate' in params:
|
||||||
|
audio_config.speaking_rate = params['speaking_rate']
|
||||||
|
if 'pitch' in params:
|
||||||
|
audio_config.pitch = params['pitch']
|
||||||
response = self.client.synthesize_speech(tts_input, voice, audio_config)
|
response = self.client.synthesize_speech(tts_input, voice, audio_config)
|
||||||
audio_content = response.audio_content
|
audio_content = response.audio_content
|
||||||
return audio_content
|
return audio_content
|
||||||
|
|
@ -74,6 +78,19 @@ class GoogleTTS(object):
|
||||||
)
|
)
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
@classmethod
def voice_by_name(cls, name):
    """Return the entry of ``cls.voice_list()`` whose ``'name'`` is *name*.

    Args:
        name: value compared for equality against each entry's ``'name'``.

    Returns:
        The first matching entry, exactly as produced by ``voice_list``.

    Raises:
        ValueError: if no entry carries that name.
    """
    for candidate in cls.voice_list():
        if candidate['name'] != name:
            continue
        return candidate
    raise ValueError(f'{name} not a valid voice')
|
||||||
|
|
||||||
|
|
||||||
@app.command()
|
@app.command()
|
||||||
def generate_audio_file(text, dest_path: Path = "./tts_audio.wav", voice="en-US-Wavenet-D"):
|
def generate_audio_file(text, dest_path: Path = "./tts_audio.wav", voice="en-US-Wavenet-D"):
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,85 @@
|
||||||
|
from plume.utils import ExtendedPath, get_mongo_conn
|
||||||
|
from plume.utils.st_rerun import rerun
|
||||||
|
from uuid import uuid4
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def setup_file_state(st):
    """Attach cursor accessors backed by the ``preview.lck`` JSON file to *st*.

    Does nothing when *st* already has a ``state_lock`` attribute. Otherwise
    it installs ``st.get_current_cursor`` and ``st.update_cursor``, sets
    ``st.state_lock`` and writes an initial cursor of 0.
    """
    if hasattr(st, "state_lock"):
        return

    # st.task_id = str(uuid4())
    lock_file = ExtendedPath("preview.lck")

    def current_cursor_fn():
        # Read the persisted cursor back from the lock file.
        state = lock_file.read_json()
        return state["current_cursor"]

    def update_cursor_fn(val=0):
        # Persist the new cursor, then call rerun() (presumably restarts the
        # Streamlit script so the new value is picked up - confirm).
        lock_file.write_json({"current_cursor": val})
        rerun()

    st.get_current_cursor = current_cursor_fn
    st.update_cursor = update_cursor_fn
    st.state_lock = True
    # cursor_obj = mongo_conn.find_one({"type": "current_cursor", "task_id": st.task_id})
    # if not cursor_obj:
    update_cursor_fn(0)
|
||||||
|
|
||||||
|
|
||||||
|
def setup_mongo_asr_validation_state(st):
    """Attach MongoDB-backed validation-state helpers to *st*.

    Does nothing when *st* already has a ``mongo_connected`` attribute.
    Otherwise it opens the ``asr_validation`` collection, assigns a fresh
    UUID ``st.task_id`` and installs ``get_current_cursor``,
    ``update_cursor``, ``get_correction_entry``, ``update_entry`` and
    ``set_task`` on *st*. A cursor document is created (value 0) when none
    exists yet for the task.
    """
    if hasattr(st, "mongo_connected"):
        return

    st.mongoclient = get_mongo_conn(col="asr_validation")
    mongo_conn = st.mongoclient
    st.task_id = str(uuid4())

    def current_cursor_fn():
        # mongo_conn = st.mongoclient
        # st.task_id is read at call time, so set_task() changes take effect.
        query = {"type": "current_cursor", "task_id": st.task_id}
        cursor_doc = mongo_conn.find_one(query)
        return cursor_doc["cursor"]

    def update_cursor_fn(val=0):
        # Upsert the cursor document for the current task, then call rerun().
        selector = {"type": "current_cursor", "task_id": st.task_id}
        change = {"$set": {**selector, "cursor": val}}
        mongo_conn.find_one_and_update(selector, change, upsert=True)
        rerun()

    def get_correction_entry_fn(code):
        # Returns the correction document for `code` without its _id field.
        query = {"type": "correction", "code": code}
        return mongo_conn.find_one(query, projection={"_id": False})

    def update_entry_fn(code, value):
        # Upsert the correction for `code`, tagging it with the current task.
        selector = {"type": "correction", "code": code}
        change = {"$set": {"value": value, "task_id": st.task_id}}
        mongo_conn.find_one_and_update(selector, change, upsert=True)

    def set_task_fn(data_path, task_id):
        # NOTE(review): indentation was lost in the reviewed source; the lock
        # file is assumed to be created whether or not task_id was supplied -
        # confirm against the repository.
        if task_id:
            st.task_id = task_id
        lock_path = data_path / Path(f"task-{st.task_id}.lck")
        if lock_path.exists():
            return
        print(f"creating task lock at {lock_path}")
        lock_path.touch()

    st.get_current_cursor = current_cursor_fn
    st.update_cursor = update_cursor_fn
    st.get_correction_entry = get_correction_entry_fn
    st.update_entry = update_entry_fn
    st.set_task = set_task_fn
    st.mongo_connected = True

    existing = mongo_conn.find_one({"type": "current_cursor", "task_id": st.task_id})
    if not existing:
        update_cursor_fn(0)
|
||||||
|
|
@ -0,0 +1,205 @@
|
||||||
|
import logging
|
||||||
|
import asyncio
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import webrtcvad
|
||||||
|
import pydub
|
||||||
|
from pydub.playback import play
|
||||||
|
from pydub.utils import make_chunks
|
||||||
|
|
||||||
|
|
||||||
|
DEFAULT_CHUNK_DUR = 20
|
||||||
|
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||||
|
)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def is_frame_voice(vad, seg, chunk_dur):
    """Return True when *seg* is a full-length chunk that *vad* calls speech.

    Args:
        vad: object exposing ``is_speech(raw_bytes, frame_rate)``.
        seg: chunk exposing ``duration_seconds``, ``raw_data``, ``frame_rate``.
        chunk_dur: expected chunk length in milliseconds.

    A chunk whose duration is not exactly ``chunk_dur / 1000`` seconds is
    rejected without consulting the detector.
    """
    if seg.duration_seconds != chunk_dur / 1000:
        return False
    return bool(vad.is_speech(seg.raw_data, seg.frame_rate))
|
||||||
|
|
||||||
|
|
||||||
|
class VADFilterAudio(object):
    """Keep only the chunks of an audio segment that the VAD flags as voice."""

    def __init__(self, chunk_dur=DEFAULT_CHUNK_DUR):
        """Create the detector.

        Args:
            chunk_dur: chunk length handed to ``make_chunks`` and compared,
                as milliseconds, against each chunk's duration.
        """
        super().__init__()
        self.chunk_dur = chunk_dur
        self.vad = webrtcvad.Vad()

    def filter_segment(self, wav_seg):
        """Return a new segment holding only the voiced chunks of *wav_seg*.

        The result reuses the frame rate, channel count and sample width of
        *wav_seg*. The final chunk produced by ``make_chunks`` is never
        considered.
        """
        chunks = make_chunks(wav_seg, self.chunk_dur)
        # Collect the voiced frames and join once; growing a bytes object
        # with += inside the loop would copy the buffer on every chunk.
        voiced = [
            chunk.raw_data
            for chunk in chunks[:-1]
            if is_frame_voice(self.vad, chunk, self.chunk_dur)
        ]
        return pydub.AudioSegment(
            data=b"".join(voiced),
            frame_rate=wav_seg.frame_rate,
            channels=wav_seg.channels,
            sample_width=wav_seg.sample_width,
        )
|
||||||
|
|
||||||
|
|
||||||
|
class VADUtterance(object):
    """Split a stream of fixed-length audio chunks into voiced utterances.

    NOTE(review): indentation was lost in the reviewed source. In the
    silence branch of both stream methods the voice buffer is assumed to be
    cleared whenever the silence limit is reached (not only after a yield) -
    confirm against the repository.
    """

    def __init__(
        self,
        max_silence=500,
        min_utterance=280,
        max_utterance=20000,
        chunk_dur=DEFAULT_CHUNK_DUR,
        start_cycles=3,
    ):
        """Configure the thresholds; all durations are in milliseconds.

        Args:
            max_silence: trailing silence that ends an utterance.
            min_utterance: voiced audio shorter than this is discarded.
            max_utterance: voiced audio is flushed once it reaches this.
            chunk_dur: expected length of every incoming chunk.
            start_cycles: number of voiced chunks before "started speaking"
                is reported by ``stream_events``.
        """
        super().__init__()
        self.vad = webrtcvad.Vad()
        self.chunk_dur = chunk_dur
        # duration in millisecs
        self.max_sil = max_silence
        self.min_utt = min_utterance
        self.max_utt = max_utterance
        self.speech_start = start_cycles * chunk_dur

    def __repr__(self):
        return f"VAD(max_silence={self.max_sil},min_utterance:{self.min_utt},max_utterance:{self.max_utt})"

    async def stream_utterance(self, audio_stream):
        """Yield one audio segment per detected utterance.

        Args:
            audio_stream: async iterable of chunks accepted by
                ``is_frame_voice``.

        Voiced chunks accumulate in a buffer. The buffer is yielded when it
        reaches ``max_utt``, or when ``max_sil`` of silence follows at least
        ``min_utt`` of voice. Any voice left when the stream ends is yielded
        as well.
        """
        silence_buffer = pydub.AudioSegment.empty()
        voice_buffer = pydub.AudioSegment.empty()
        async for c in audio_stream:
            voice_frame = is_frame_voice(self.vad, c, self.chunk_dur)
            logger.debug(f"is audio stream voice? {voice_frame}")
            if voice_frame:
                voice_buffer += c
                # Any voiced chunk restarts the silence measurement.
                silence_buffer = pydub.AudioSegment.empty()
            else:
                silence_buffer += c
            voc_dur = voice_buffer.duration_seconds * 1000
            sil_dur = silence_buffer.duration_seconds * 1000

            if voc_dur >= self.max_utt:
                logger.info(
                    f"detected voice overflow: voice duration {voice_buffer.duration_seconds}"
                )
                yield voice_buffer
                voice_buffer = pydub.AudioSegment.empty()

            if sil_dur >= self.max_sil:
                if voc_dur >= self.min_utt:
                    logger.info(
                        f"detected silence: voice duration {voice_buffer.duration_seconds}"
                    )
                    yield voice_buffer
                # ignore/clear voice if silence reached threshold or indent the statement
                voice_buffer = pydub.AudioSegment.empty()

        if voice_buffer:
            yield voice_buffer

    async def stream_events(self, audio_stream):
        """Yield ``(code, payload)`` events while segmenting *audio_stream*.

        yields 0, voice_buffer for SpeechBuffer
        yields 1, None for StartedSpeaking
        yields 2, None for StoppedSpeaking

        Code 4 (the raw chunk) is currently disabled; see the commented-out
        yield at the top of the loop.
        """
        silence_buffer = pydub.AudioSegment.empty()
        voice_buffer = pydub.AudioSegment.empty()
        silence_threshold, started_speaking = False, False
        async for c in audio_stream:
            # yield (4, c)
            voice_frame = is_frame_voice(self.vad, c, self.chunk_dur)
            logger.debug(f"is audio stream voice? {voice_frame}")
            if voice_frame:
                silence_threshold = False
                voice_buffer += c
                silence_buffer = pydub.AudioSegment.empty()
            else:
                silence_buffer += c
            voc_dur = voice_buffer.duration_seconds * 1000
            sil_dur = silence_buffer.duration_seconds * 1000

            if voc_dur >= self.speech_start and not started_speaking:
                started_speaking = True
                yield (1, None)

            if voc_dur >= self.max_utt:
                logger.info(
                    f"detected voice overflow: voice duration {voice_buffer.duration_seconds}"
                )
                yield (0, voice_buffer)
                voice_buffer = pydub.AudioSegment.empty()
                started_speaking = False

            if sil_dur >= self.max_sil:
                if voc_dur >= self.min_utt:
                    logger.info(
                        f"detected silence: voice duration {voice_buffer.duration_seconds}"
                    )
                    yield (0, voice_buffer)
                voice_buffer = pydub.AudioSegment.empty()
                started_speaking = False
                # ignore/clear voice if silence reached threshold or indent the statement
                # The flag makes StoppedSpeaking fire once per silent stretch;
                # it is re-armed by the next voiced chunk.
                if not silence_threshold:
                    silence_threshold = True
                    yield (2, None)

        if voice_buffer:
            yield (0, voice_buffer)

    @classmethod
    async def stream_utterance_file(cls, audio_file):
        """Play back every utterance found in *audio_file*, pausing 1s between.

        The file is resampled to 32 kHz and cut into ``DEFAULT_CHUNK_DUR``
        chunks before being fed through ``stream_utterance`` of a
        default-configured instance.
        """

        async def stream_gen():
            audio_seg = pydub.AudioSegment.from_file(audio_file).set_frame_rate(32000)
            chunks = make_chunks(audio_seg, DEFAULT_CHUNK_DUR)
            for c in chunks:
                yield c

        va_ut = cls()
        buffer_src = va_ut.stream_utterance(stream_gen())
        async for buf in buffer_src:
            play(buf)
            await asyncio.sleep(1)
|
||||||
|
|
||||||
|
|
||||||
|
class VADStreamGen(object):
    """Placeholder class that only stores its constructor argument."""

    def __init__(self, arg):
        """Keep *arg* on the instance as ``self.arg``."""
        super().__init__()
        self.arg = arg
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point.

    Parses ``--audio_file`` (default ``./test_utter2.wav``) and runs
    ``VADUtterance.stream_utterance_file`` on it until the stream ends.
    """
    prog = Path(__file__).stem
    parser = argparse.ArgumentParser(prog=prog, description="transcribes audio file")
    parser.add_argument(
        "--audio_file",
        type=argparse.FileType("rb"),
        help="audio file to transcribe",
        default="./test_utter2.wav",
    )
    args = parser.parse_args()
    # argparse opened the file for us, so close it once streaming is done.
    # asyncio.run() owns the event loop's lifetime, unlike the deprecated
    # get_event_loop()/run_until_complete pairing with no running loop.
    with args.audio_file:
        asyncio.run(VADUtterance.stream_utterance_file(args.audio_file))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
3
setup.py
3
setup.py
|
|
@ -58,6 +58,9 @@ extra_requirements = {
|
||||||
"stringcase~=1.2.0",
|
"stringcase~=1.2.0",
|
||||||
"google-cloud-speech~=1.3.1",
|
"google-cloud-speech~=1.3.1",
|
||||||
],
|
],
|
||||||
|
"ui": [
|
||||||
|
"rangehttpserver~=1.2.0",
|
||||||
|
],
|
||||||
"train": ["torchaudio~=0.6.0", "torch-stft~=0.1.4"],
|
"train": ["torchaudio~=0.6.0", "torch-stft~=0.1.4"],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue