mirror of
https://github.com/malarinv/plume-asr.git
synced 2026-03-07 20:02:34 +00:00
1. refactor package root to src/ layout
2. add framework suffix for models 3. change black max columns to 79 4. add tests 5. integrate vad, encrypt and refactor manifest, regentity, extended_path, audio, parallel utils 6. added ui utils for encrypted preview 7. wip marblenet model 8. added transformers based wav2vec2 inference 9. update readme and manifest 10. add deploy setup target
This commit is contained in:
317
tests/plume/test_entity_replacer_standalone.py
Normal file
317
tests/plume/test_entity_replacer_standalone.py
Normal file
@@ -0,0 +1,317 @@
|
||||
import re
|
||||
|
||||
|
||||
def entity_replacer_keeper(pre_rules=(), entity_rules=(), post_rules=()):
    """Build a ``(replacer, keep_literals)`` pair from regex rules.

    Every rule is a ``(pattern, replacement)`` pair; ``replacement`` is
    anything ``re.Pattern.sub`` accepts (a template string or a callable).

    Args:
        pre_rules: rules applied first; compiled case-sensitively.
        entity_rules: rules describing the entities of interest; compiled
            with ``re.IGNORECASE``.
        post_rules: rules applied last; compiled case-sensitively.

    Returns:
        A tuple ``(replacer, keep_literals)``:

        * ``replacer(text)`` applies the pre, entity and post rules, in
          that order, and returns the rewritten string.
        * ``keep_literals(text)`` applies the pre rules, locates every
          entity match without substituting it, joins the matched
          substrings with single spaces, applies the post rules to the
          joined string and returns ``(kept_text, span_count)``.
    """
    # Immutable defaults avoid the shared-mutable-default pitfall; ``None``
    # is tolerated as "no rules".
    pre_rules_c = [(re.compile(k), v) for (k, v) in (pre_rules or ())]
    entity_rules_c = [
        (re.compile(k, re.IGNORECASE), v) for (k, v) in (entity_rules or ())
    ]
    post_rules_c = [(re.compile(k), v) for (k, v) in (post_rules or ())]

    re_rules = pre_rules_c + entity_rules_c + post_rules_c

    def replacer(w2v_out):
        """Apply every pre, entity and post rule to ``w2v_out`` in order."""
        out = w2v_out
        for pattern, replacement in re_rules:
            out = pattern.sub(replacement, out)
        return out

    def merge_intervals(intervals):
        """Merge overlapping or touching ``(start, end)`` intervals."""
        # Approach from https://codereview.stackexchange.com/a/69249
        merged = []
        for higher in sorted(intervals, key=lambda tup: tup[0]):
            # Sorting guarantees merged[-1][0] <= higher[0], so only the
            # upper bound of the previous interval has to be compared.
            if merged and higher[0] <= merged[-1][1]:
                lower = merged[-1]
                merged[-1] = (lower[0], max(lower[1], higher[1]))
            else:
                merged.append(higher)
        return merged

    # An interval tree could be used for the merging step instead:
    # https://www.geeksforgeeks.org/interval-tree/

    def keep_literals(w2v_out):
        """Return the entity substrings of ``w2v_out`` and their count."""
        out = w2v_out
        for pattern, replacement in pre_rules_c:
            out = pattern.sub(replacement, out)
        # Only the match positions matter here; replacements are ignored.
        spans = [
            match.span()
            for pattern, _ in entity_rules_c
            for match in pattern.finditer(out)
        ]
        merged = merge_intervals(spans)
        keep_out = " ".join(out[start:end] for (start, end) in merged)
        for pattern, replacement in post_rules_c:
            keep_out = pattern.sub(replacement, keep_out)
        return keep_out, len(merged)

    return replacer, keep_literals
|
||||
|
||||
|
||||
def default_num_only_rules(num_range):
    """Return the ``(pattern, replacement)`` rules for plain numbers.

    The list holds, in this order:

    * one rule per spelled-out number from ``ninety-nine`` down to
      ``zero``, mapping the word to its decimal digits;
    * one identity rule per single digit from ``9`` down to ``0``;
    * a rule mapping ``hundred`` to ``"00"``.

    Longer spellings come first so that ``fifty-eight`` is rewritten
    before ``fifty`` or ``eight`` can match on their own.

    ``num_range`` is accepted but not read by this function; the table
    always covers 0-99.
    """
    units = [
        "zero",
        "one",
        "two",
        "three",
        "four",
        "five",
        "six",
        "seven",
        "eight",
        "nine",
    ]
    teens = [
        "ten",
        "eleven",
        "twelve",
        "thirteen",
        "fourteen",
        "fifteen",
        "sixteen",
        "seventeen",
        "eighteen",
        "nineteen",
    ]
    decades = [
        "twenty",
        "thirty",
        "forty",
        "fifty",
        "sixty",
        "seventy",
        "eighty",
        "ninety",
    ]

    def spell(value):
        """Spell ``value`` (0-99) in lower-case hyphenated English."""
        if value < 10:
            return units[value]
        if value < 20:
            return teens[value - 10]
        tens, ones = divmod(value, 10)
        decade = decades[tens - 2]
        return decade if ones == 0 else decade + "-" + units[ones]

    word_rules = [
        (r"\b" + spell(value) + r"\b", str(value))
        for value in range(99, -1, -1)
    ]
    digit_rules = [
        (r"\b" + str(digit) + r"\b", str(digit))
        for digit in range(9, -1, -1)
    ]
    hundred_rules = [(r"\bhundred\b", "00")]
    return word_rules + digit_rules + hundred_rules
|
||||
|
||||
|
||||
def default_num_rules(num_range):
    """Return the number rules plus spoken-digit conveniences.

    Extends ``default_num_only_rules(num_range)`` with rules that:

    * turn a standalone ``oh`` or ``o`` into a space-padded ``0``;
    * expand ``double X`` / ``triple X`` (space or hyphen separated) into
      two / three space-separated copies of ``X``.
    """
    rules = list(default_num_only_rules(num_range))
    rules.extend(
        [
            (r"\boh\b", " 0 "),
            (r"\bo\b", " 0 "),
            (r"\bdouble(?: |-)(\w+|\d+)\b", "\\1 \\1"),
            (r"\btriple(?: |-)(\w+|\d+)\b", "\\1 \\1 \\1"),
        ]
    )
    return rules
|
||||
|
||||
|
||||
def default_alnum_rules(num_range, oh_is_zero):
    """Return the rules for alphanumeric (letters and digits) entities.

    The result is ``default_num_only_rules(num_range)`` followed by:

    * the ``oh`` handling: when ``oh_is_zero`` is true, standalone ``oh``
      and ``o`` both become ``0``; otherwise only ``oh`` is rewritten, to
      the letter ``o``;
    * the ``double X`` / ``triple X`` expansions;
    * an identity rule for any standalone single ASCII letter, so that
      such letters count as entities.
    """
    if oh_is_zero:
        oh_rules = [
            (r"\boh\b", "0"),
            (r"\bo\b", "0"),
        ]
    else:
        oh_rules = [(r"\boh\b", "o")]

    repeat_and_letter_rules = [
        (r"\bdouble(?: |-)(\w+|\d+)\b", "\\1 \\1"),
        (r"\btriple(?: |-)(\w+|\d+)\b", "\\1 \\1 \\1"),
        (r"\b([a-zA-Z])\b", "\\1"),
    ]
    number_rules = default_num_only_rules(num_range)
    return number_rules + oh_rules + repeat_and_letter_rules
|
||||
|
||||
|
||||
def num_replacer(num_range=100, condense=True):
    """Return a function that rewrites spoken numbers as digits.

    With ``condense`` true, a post rule additionally strips every
    character that is not ``0``-``9`` from the result; with ``condense``
    false the surrounding text is left in place.
    """
    strip_non_digits = [(r"[^0-9]", "")]
    replace, _ = entity_replacer_keeper(
        entity_rules=default_num_rules(num_range),
        post_rules=strip_non_digits if condense else [],
    )
    return replace
|
||||
|
||||
|
||||
def num_keeper(num_range=100):
    """Return a function that keeps only the number-like parts of a text.

    The returned callable first replaces spaces, ``;``, ``,`` and ``.``
    with a space, then yields ``(kept_text, entity_count)`` as produced by
    the keeper half of ``entity_replacer_keeper``. No post rules are used.
    """
    separators_to_space = [(r"[ ;,.]", " ")]
    _, keep = entity_replacer_keeper(
        pre_rules=separators_to_space,
        entity_rules=default_num_rules(num_range),
        post_rules=[],
    )
    return keep
|
||||
|
||||
|
||||
def alnum_replacer(num_range=100, oh_is_zero=False, condense=True):
    """Return a function that rewrites spoken alphanumerics.

    Pre rules replace spaces, ``;``, ``,`` and ``.`` with a space and drop
    apostrophes. Entity rules come from
    ``default_alnum_rules(num_range, oh_is_zero)``.

    With ``condense`` true, the post rules then, in order:

    1. delete every run of two or more ASCII letters (ordinary words);
    2. delete every character that is not an ASCII letter or digit;
    3. upper-case everything from the first lower-case letter onward.

    With ``condense`` false no post rules are applied.
    """

    def to_upper(match):
        # Replacement callback: upper-case the whole matched text.
        return match.group(0).upper()

    condense_rules = [
        (r"\b[a-zA-Z]{2,}\b", ""),
        (r"[^a-zA-Z0-9]", ""),
        (r"([a-z].*)", to_upper),
    ]
    replace, _ = entity_replacer_keeper(
        pre_rules=[(r"[ ;,.]", " "), (r"[']", "")],
        entity_rules=default_alnum_rules(num_range, oh_is_zero),
        post_rules=condense_rules if condense else [],
    )
    return replace
|
||||
|
||||
|
||||
def alnum_keeper(num_range=100, oh_is_zero=False):
    """Return a function that keeps only alphanumeric entities of a text.

    Pre rules replace spaces, ``;``, ``,`` and ``.`` with a space and drop
    apostrophes; entity rules come from
    ``default_alnum_rules(num_range, oh_is_zero)``; no post rules are
    used. The returned callable yields ``(kept_text, entity_count)``.
    """
    cleanup_rules = [(r"[ ;,.]", " "), (r"[']", "")]
    _, keep = entity_replacer_keeper(
        pre_rules=cleanup_rules,
        entity_rules=default_alnum_rules(num_range, oh_is_zero),
        post_rules=[],
    )
    return keep
|
||||
|
||||
|
||||
def test_num():
    """Check the number replacer (condensed and not) and the keeper."""
    num_extractor = num_replacer()
    keeper = num_keeper()
    num_only_replacer = num_replacer(condense=False)

    # (callable under test, input text, expected output), in check order.
    replacer_cases = [
        (num_extractor, "thirty-two", "32"),
        (num_extractor, "not thirty-two fifty-nine", "3259"),
        (num_extractor, " triPle 5 fifty 3", "555503"),
        (num_only_replacer, " triPle 5 fifty 3", " 5 5 5 50 3"),
        (num_extractor, "douBle 2 130", "22130"),
        (num_extractor, "It is a One fifty eIght 5 fifty ", "1508550"),
        (
            num_only_replacer,
            " It is a One fifty eIght 5 fifty ",
            " It is a 1 50 8 5 50 ",
        ),
        (num_extractor, "One fifty-eight 5 oh o fifty", "15850050"),
    ]
    for transform, text, expected in replacer_cases:
        assert transform(text) == expected

    kept = keeper(
        "my phone number is One hundred fifty-eight not 5 oh o fifty more"
    )
    assert kept == ("One hundred fifty-eight 5 oh o fifty", 7)
|
||||
|
||||
|
||||
def test_alnum():
    """Check the alphanumeric replacers and keeper defined in this file.

    Covers the condensed replacer with and without ``oh_is_zero``, the
    non-condensing replacer, and the keeper's ``(text, count)`` result.
    """
    extractor_oh = alnum_replacer(oh_is_zero=True)
    extractor = alnum_replacer()
    keeper = alnum_keeper()
    only_replacer = alnum_replacer(condense=False)

    # (callable under test, input text, expected output), in check order.
    replacer_cases = [
        (extractor, "I'm thirty-two", "32"),
        (extractor, "a thirty-two", "A32"),
        (extractor, "not a b thirty-two fifty-nine", "AB3259"),
        (extractor, " triPle 5 fifty 3", "555503"),
        (only_replacer, " triPle 5 fifty 3", " 5 5 5 50 3"),
        (extractor, "douBle 2 130", "22130"),
        (
            extractor,
            "It is a One b fifty eIght A Z 5 fifty ",
            "A1B508AZ550",
        ),
        # The pre rule turns ";" into a space and nothing collapses runs
        # of spaces, so " ; " comes out as three spaces between "a" and
        # "1".
        (
            only_replacer,
            " It's a ; One b fifty eIght A Z 5 fifty ",
            " Its a   1 b 50 8 A Z 5 50 ",
        ),
        (
            only_replacer,
            " I'm is a One b fifty eIght A Z 5 fifty ",
            " Im is a 1 b 50 8 A Z 5 50 ",
        ),
        (extractor, "One Z fifty-eight 5 oh o b fifty", "1Z585OOB50"),
        (extractor_oh, "One Z fifty-eight 5 oh o b fifty", "1Z58500B50"),
    ]
    for transform, text, expected in replacer_cases:
        assert transform(text) == expected

    # With the apostrophe removed, "Ill" / "Im" are ordinary words and are
    # dropped; a standalone "I" is a single letter and is kept.
    keeper_cases = [
        (
            "I'll phone number One hundred n fifty-eight not 5 oh o fifty"
            " A B more",
            ("One hundred n fifty-eight 5 oh o fifty A B", 10),
        ),
        (
            "I'm One hundred n fifty-eight not 5 oh o fifty A B more",
            ("One hundred n fifty-eight 5 oh o fifty A B", 10),
        ),
        (
            "I am One hundred n fifty-eight not 5 oh o fifty A B more",
            ("I One hundred n fifty-eight 5 oh o fifty A B", 11),
        ),
    ]
    for text, expected in keeper_cases:
        assert keeper(text) == expected
|
||||
105
tests/plume/test_utils.py
Normal file
105
tests/plume/test_utils.py
Normal file
@@ -0,0 +1,105 @@
|
||||
from plume.utils import (
|
||||
num_replacer,
|
||||
num_keeper,
|
||||
alnum_replacer,
|
||||
alnum_keeper,
|
||||
random_segs,
|
||||
)
|
||||
import numpy
|
||||
import random as rand
|
||||
import pytest
|
||||
|
||||
|
||||
def test_num_replacer_keeper():
    """Check ``plume.utils`` number replacer and keeper on sample text."""
    num_extractor = num_replacer()
    num_only_replacer = num_replacer(condense=False)

    # (callable under test, input text, expected output), in check order.
    replacer_cases = [
        (num_extractor, "thirty-two", "32"),
        (num_extractor, "not thirty-two fifty-nine", "3259"),
        (num_extractor, " triPle 5 fifty 3", "555503"),
        (num_only_replacer, " triPle 5 fifty 3", " 5 5 5 50 3"),
        (num_extractor, "douBle 2 130", "22130"),
        (num_extractor, "It is a One fifty eIght 5 fifty ", "1508550"),
        (
            num_only_replacer,
            " It is a One fifty eIght 5 fifty ",
            " It is a 1 50 8 5 50 ",
        ),
        (num_extractor, "One fifty-eight 5 oh o fifty", "15850050"),
    ]
    for transform, text, expected in replacer_cases:
        assert transform(text) == expected

    keeper = num_keeper()
    kept = keeper(
        "my phone number is One hundred fifty-eight not 5 oh o fifty more"
    )
    assert kept == ("One hundred fifty-eight 5 oh o fifty", 7)
|
||||
|
||||
|
||||
def test_alnum_replacer():
    """Check ``plume.utils.alnum_replacer`` in its three configurations.

    ``extractor`` is the default (condensing) replacer, ``extractor_oh``
    additionally uses ``oh_is_zero=True`` and ``only_replacer`` disables
    condensing.
    """
    extractor_oh = alnum_replacer(oh_is_zero=True)
    extractor = alnum_replacer()
    only_replacer = alnum_replacer(condense=False)

    # (callable under test, input text, expected output), in check order.
    cases = [
        (extractor, "5 oh i c 3", "5OIC3"),
        (extractor, "I am, oh it is 3. I will", "3"),
        (extractor, "I oh o 3", "IOO3"),
        (extractor, "I will 3 I", "3I"),
        (extractor, "I'm thirty-two", "32"),
        (extractor, "I am thirty-two", "32"),
        (extractor, "I j thirty-two", "IJ32"),
        (extractor, "a thirty-two", "A32"),
        (extractor, "not a b thirty-two fifty-nine", "AB3259"),
        (extractor, " triPle 5 fifty 3", "555503"),
        (only_replacer, " triPle 5 fifty 3", " 5 5 5 50 3"),
        (extractor, "douBle 2 130", "22130"),
        (
            extractor,
            "It is a One b fifty eIght A Z 5 fifty ",
            "A1B508AZ550",
        ),
        # NOTE(review): if plume.utils maps ";" to a space without
        # collapsing runs of spaces, the expected text would contain
        # three spaces between "a" and "1" - confirm against plume.utils.
        (
            only_replacer,
            " It's a ; One b fifty eIght A Z 5 fifty ",
            " Its a 1 b 50 8 A Z 5 50 ",
        ),
        (
            only_replacer,
            " I'm is a One b fifty eIght A Z 5 fifty ",
            " Im is a 1 b 50 8 A Z 5 50 ",
        ),
        (extractor, "One Z fifty-eight 5 oh o b fifty", "1Z585OOB50"),
        (extractor_oh, "One Z fifty-eight 5 oh o b fifty", "1Z58500B50"),
        (
            extractor,
            "I One hundred n fifty-eight not 5 oh o fifty A B more",
            "I100N585OO50AB",
        ),
    ]
    for transform, text, expected in cases:
        assert transform(text) == expected
|
||||
|
||||
|
||||
def test_alnum_keeper():
    """Check the ``(kept_text, count)`` result of ``alnum_keeper``."""
    keeper = alnum_keeper()

    without_leading_i = ("One hundred n fifty-eight 5 oh o fifty A B", 10)
    # (input text, expected keeper result), in check order.
    cases = [
        (
            "I One hundred n fifty-eight not 5 oh o fifty A B more",
            ("I One hundred n fifty-eight 5 oh o fifty A B", 11),
        ),
        (
            "I'll phone number One hundred n fifty-eight not 5 oh o fifty"
            " A B more",
            without_leading_i,
        ),
        (
            "I'm One hundred n fifty-eight not 5 oh o fifty A B more",
            without_leading_i,
        ),
        (
            "I am One hundred n fifty-eight not 5 oh o fifty A B more",
            without_leading_i,
        ),
    ]
    for text, expected in cases:
        assert keeper(text) == expected
|
||||
|
||||
|
||||
@pytest.fixture
def random():
    """Seed the stdlib and NumPy random generators with 0."""
    fixed_seed = 0
    for seed in (rand.seed, numpy.random.seed):
        seed(fixed_seed)
|
||||
|
||||
|
||||
def test_random_segs(random):
    """Check that every segment length lies within the requested bounds.

    The ``random`` fixture is requested only for its seeding side effect.
    The second and third arguments of ``random_segs`` are treated here as
    the minimum and maximum segment length (presumably the first is the
    total length to split; confirm against ``plume.utils.random_segs``).
    """
    min_len, max_len = 1000, 3000
    segs = random_segs(100000, min_len, max_len)

    # Each segment is a (start, end) pair; its length must be in bounds.
    assert all(min_len <= end - start <= max_len for (start, end) in segs)
|
||||
17
tests/plume/utils/test_regentity.py
Normal file
17
tests/plume/utils/test_regentity.py
Normal file
@@ -0,0 +1,17 @@
|
||||
from plume.utils.regentity import infer_num_replacer
|
||||
|
||||
|
||||
def test_infer_num():
    """Check ``infer_num_replacer`` on upper-case, misspelled transcripts."""
    repl = infer_num_replacer()

    # (transcript, expected digit string), in check order.
    cases = [
        (
            "SIX NINE TRIPL EIGHT SIX SIX DOULE NINE THREE ZERO TWO"
            " SEVENT-ONE",
            "69888669930271",
        ),
        (
            "SIX NINE FSIX EIGHT IGSIX SIX NINE NINE THRE ZERO TWO SEVEN"
            " ONE",
            "6968669930271",
        ),
    ]
    for transcript, expected in cases:
        assert repl(transcript) == expected
|
||||
Reference in New Issue
Block a user