plume-asr/tests/plume/test_entity_replacer_standa...

318 lines
11 KiB
Python

import re
def entity_replacer_keeper(pre_rules=None, entity_rules=None, post_rules=None):
    """Build a ``(replacer, keep_literals)`` pair of text-processing closures.

    Every rule is a ``(pattern, replacement)`` pair, where ``replacement`` is
    whatever ``re.Pattern.sub`` accepts (a template string or a callable).
    Entity patterns are compiled with ``re.IGNORECASE``; pre- and post-rule
    patterns are compiled case-sensitively.

    Args:
        pre_rules: Rules applied before the entity rules. ``None`` (the
            default) means "no rules".
        entity_rules: Rules describing the entities to rewrite or keep.
            ``None`` (the default) means "no rules".
        post_rules: Rules applied after the entity rules. ``None`` (the
            default) means "no rules".

    Returns:
        A tuple ``(replacer, keep_literals)``:

        * ``replacer(text)`` applies the pre, entity and post rules in that
          order and returns the rewritten string.
        * ``keep_literals(text)`` applies the pre rules, keeps only the text
          matched by at least one entity pattern (overlapping or touching
          matches are merged), joins the kept pieces with single spaces,
          applies the post rules, and returns ``(kept_text, piece_count)``.
    """

    def compile_rules(rules, flags=0):
        # ``None`` stands in for "no rules" so the signature does not need
        # mutable list defaults shared between calls.
        if rules is None:
            return []
        return [(re.compile(pattern, flags), repl) for pattern, repl in rules]

    pre_rules_c = compile_rules(pre_rules)
    entity_rules_c = compile_rules(entity_rules, re.IGNORECASE)
    post_rules_c = compile_rules(post_rules)
    re_rules = pre_rules_c + entity_rules_c + post_rules_c

    def replacer(w2v_out):
        """Apply every compiled rule, in order, and return the result."""
        out = w2v_out
        for pattern, repl in re_rules:
            out = pattern.sub(repl, out)
        return out

    def merge_intervals(intervals):
        """Merge overlapping or touching ``(start, end)`` spans.

        Adapted from https://codereview.stackexchange.com/a/69249
        """
        merged = []
        for current in sorted(intervals, key=lambda span: span[0]):
            # Sorting guarantees merged[-1][0] <= current[0], so the two spans
            # intersect (or touch) exactly when current starts at or before
            # the end of the previous one.
            if merged and current[0] <= merged[-1][1]:
                previous = merged[-1]
                merged[-1] = (previous[0], max(previous[1], current[1]))
            else:
                merged.append(current)
        return merged

    # merging interval tree for optimal
    # https://www.geeksforgeeks.org/interval-tree/
    def keep_literals(w2v_out):
        """Return ``(kept_text, piece_count)`` for the entity matches in the text."""
        out = w2v_out
        for pattern, repl in pre_rules_c:
            out = pattern.sub(repl, out)

        # Collect the span of every entity match; the replacement values of
        # the entity rules are deliberately not applied here.
        spans = [
            match.span()
            for pattern, _ in entity_rules_c
            for match in pattern.finditer(out)
        ]
        merged = merge_intervals(spans)

        keep_out = " ".join(out[start:end] for start, end in merged)
        for pattern, repl in post_rules_c:
            keep_out = pattern.sub(repl, keep_out)
        return keep_out, len(merged)

    return replacer, keep_literals
def default_num_only_rules(num_range):
    """Return the ordered ``(pattern, replacement)`` rules for plain numbers.

    The list contains, in this order:

    1. one rule per English number word from ninety-nine down to zero, each
       wrapped in ``\\b`` word boundaries and mapped to its decimal string;
    2. one identity rule per single digit, from 9 down to 0;
    3. a rule mapping the word "hundred" to ``"00"``.

    Args:
        num_range: Accepted but not used by this function; the table always
            covers 0-99.

    Returns:
        A new list of ``(regex_pattern, replacement)`` string pairs.
    """
    units = (
        "zero", "one", "two", "three", "four",
        "five", "six", "seven", "eight", "nine",
    )
    teens = (
        "ten", "eleven", "twelve", "thirteen", "fourteen",
        "fifteen", "sixteen", "seventeen", "eighteen", "nineteen",
    )
    tens = {
        2: "twenty", 3: "thirty", 4: "forty", 5: "fifty",
        6: "sixty", 7: "seventy", 8: "eighty", 9: "ninety",
    }

    def spell(number):
        # Spell out 0..99 the way the rule table writes them (hyphenated tens).
        if number < 10:
            return units[number]
        if number < 20:
            return teens[number - 10]
        decade, unit = divmod(number, 10)
        if unit == 0:
            return tens[decade]
        return tens[decade] + "-" + units[unit]

    # Descending order lists compounds such as "ninety-nine" ahead of their
    # component words "ninety" and "nine".
    word_rules = [
        ("\\b" + spell(number) + "\\b", str(number))
        for number in range(99, -1, -1)
    ]
    digit_rules = [
        (r"\b" + str(digit) + r"\b", str(digit))
        for digit in range(9, -1, -1)
    ]
    hundred_rule = [(r"\bhundred\b", "00")]
    return word_rules + digit_rules + hundred_rule
def default_num_rules(num_range):
    """Return the number rules plus spoken-digit conveniences.

    Extends ``default_num_only_rules(num_range)`` with rules that turn "oh"
    and "o" into a space-padded ``0`` and expand "double X" / "triple X"
    (separated by a space or hyphen) into two / three copies of ``X``.

    Args:
        num_range: Forwarded unchanged to ``default_num_only_rules``.

    Returns:
        A new list of ``(regex_pattern, replacement)`` pairs.
    """
    spoken_extras = [
        (r"\boh\b", " 0 "),
        (r"\bo\b", " 0 "),
        (r"\bdouble(?: |-)(\w+|\d+)\b", "\\1 \\1"),
        (r"\btriple(?: |-)(\w+|\d+)\b", "\\1 \\1 \\1"),
    ]
    base_rules = default_num_only_rules(num_range)
    return base_rules + spoken_extras
def default_alnum_rules(num_range, oh_is_zero):
    """Return the rule list used for alphanumeric (letters + digits) entities.

    The list is ``default_num_only_rules(num_range)`` followed by the "oh"
    handling, the "double"/"triple" expansions, and a rule matching any
    single standalone ASCII letter (replaced by itself).

    Args:
        num_range: Forwarded unchanged to ``default_num_only_rules``.
        oh_is_zero: When truthy, both "oh" and "o" become ``"0"``; otherwise
            only "oh" is rewritten, to the letter ``"o"``.

    Returns:
        A new list of ``(regex_pattern, replacement)`` pairs.
    """
    if oh_is_zero:
        oh_rules = [
            (r"\boh\b", "0"),
            (r"\bo\b", "0"),
        ]
    else:
        oh_rules = [(r"\boh\b", "o")]

    repeat_and_letter_rules = [
        (r"\bdouble(?: |-)(\w+|\d+)\b", "\\1 \\1"),
        (r"\btriple(?: |-)(\w+|\d+)\b", "\\1 \\1 \\1"),
        (r"\b([a-zA-Z])\b", "\\1"),
    ]
    return default_num_only_rules(num_range) + oh_rules + repeat_and_letter_rules
def num_replacer(num_range=100, condense=True):
    """Build a function that rewrites spoken numbers in a string as digits.

    Args:
        num_range: Forwarded to ``default_num_rules``.
        condense: When truthy, a post rule strips every character that is
            not ``0-9`` from the result; otherwise no post rules are applied.

    Returns:
        The ``replacer`` closure produced by ``entity_replacer_keeper``.
    """
    post_rules = []
    if condense:
        post_rules.append((r"[^0-9]", ""))

    # Only the replacer half of the pair is needed here.
    replacer, _unused_keeper = entity_replacer_keeper(
        entity_rules=default_num_rules(num_range),
        post_rules=post_rules,
    )
    return replacer
def num_keeper(num_range=100):
    """Build a function that keeps only the number-like parts of a string.

    Spaces, semicolons, commas and periods are normalised to a space before
    matching; no post rules are applied.

    Args:
        num_range: Forwarded to ``default_num_rules``.

    Returns:
        The ``keep_literals`` closure produced by ``entity_replacer_keeper``.
    """
    # Only the keeper half of the pair is needed here.
    _unused_replacer, keeper = entity_replacer_keeper(
        pre_rules=[(r"[ ;,.]", " ")],
        entity_rules=default_num_rules(num_range),
        post_rules=[],
    )
    return keeper
def alnum_replacer(num_range=100, oh_is_zero=False, condense=True):
    """Build a function that rewrites spoken alphanumerics in a string.

    Before matching, spaces/semicolons/commas/periods are normalised to a
    space and apostrophes are removed.

    Args:
        num_range: Forwarded to ``default_alnum_rules``.
        oh_is_zero: Forwarded to ``default_alnum_rules``.
        condense: When truthy, post rules drop every run of two or more
            ASCII letters, strip all non-alphanumeric characters, and
            upper-case the text from the first lowercase letter to the end
            of the match. When falsy, no post rules are applied.

    Returns:
        The ``replacer`` closure produced by ``entity_replacer_keeper``.
    """

    def upper_case(match_obj):
        # Replacement callback: upper-case the whole matched text.
        return match_obj.group(0).upper()

    if condense:
        post_rules = [
            (r"\b[a-zA-Z]{2,}\b", ""),
            (r"[^a-zA-Z0-9]", ""),
            (r"([a-z].*)", upper_case),
        ]
    else:
        post_rules = []

    # Only the replacer half of the pair is needed here.
    replacer, _unused_keeper = entity_replacer_keeper(
        pre_rules=[(r"[ ;,.]", " "), (r"[']", "")],
        entity_rules=default_alnum_rules(num_range, oh_is_zero),
        post_rules=post_rules,
    )
    return replacer
def alnum_keeper(num_range=100, oh_is_zero=False):
    """Build a function that keeps only the alphanumeric-entity parts of a string.

    Before matching, spaces/semicolons/commas/periods are normalised to a
    space and apostrophes are removed; no post rules are applied.

    Args:
        num_range: Forwarded to ``default_alnum_rules``.
        oh_is_zero: Forwarded to ``default_alnum_rules``.

    Returns:
        The ``keep_literals`` closure produced by ``entity_replacer_keeper``.
    """
    # Only the keeper half of the pair is needed here.
    _unused_replacer, keeper = entity_replacer_keeper(
        pre_rules=[(r"[ ;,.]", " "), (r"[']", "")],
        entity_rules=default_alnum_rules(num_range, oh_is_zero),
        post_rules=[],
    )
    return keeper
def test_num():
    """Check the numeric replacer (condensed and not) and the numeric keeper."""
    num_extractor = num_replacer()
    keeper = num_keeper()
    num_only_replacer = num_replacer(condense=False)

    # (callable under test, input text, expected result), checked in order.
    cases = [
        (num_extractor, "thirty-two", "32"),
        (num_extractor, "not thirty-two fifty-nine", "3259"),
        (num_extractor, " triPle 5 fifty 3", "555503"),
        (num_only_replacer, " triPle 5 fifty 3", " 5 5 5 50 3"),
        (num_extractor, "douBle 2 130", "22130"),
        (num_extractor, "It is a One fifty eIght 5 fifty ", "1508550"),
        (
            num_only_replacer,
            " It is a One fifty eIght 5 fifty ",
            " It is a 1 50 8 5 50 ",
        ),
        (num_extractor, "One fifty-eight 5 oh o fifty", "15850050"),
        (
            keeper,
            "my phone number is One hundred fifty-eight not 5 oh o fifty more",
            ("One hundred fifty-eight 5 oh o fifty", 7),
        ),
    ]
    for transform, spoken, expected in cases:
        assert transform(spoken) == expected
def test_alnum():
    """Check the alphanumeric replacers (both "oh" modes) and the keeper."""
    extractor_oh = alnum_replacer(oh_is_zero=True)
    extractor = alnum_replacer()
    keeper = alnum_keeper()
    only_replacer = alnum_replacer(condense=False)

    kept_entities = "One hundred n fifty-eight 5 oh o fifty A B"

    # (callable under test, input text, expected result), checked in order.
    cases = [
        (extractor, "I'm thirty-two", "32"),
        (extractor, "a thirty-two", "A32"),
        (extractor, "not a b thirty-two fifty-nine", "AB3259"),
        (extractor, " triPle 5 fifty 3", "555503"),
        (only_replacer, " triPle 5 fifty 3", " 5 5 5 50 3"),
        (extractor, "douBle 2 130", "22130"),
        (extractor, "It is a One b fifty eIght A Z 5 fifty ", "A1B508AZ550"),
        # NOTE(review): the pre-rule turns ";" into a space, so the output
        # should contain extra spaces where "; " was - confirm the whitespace
        # in this expected string against the canonical file.
        (
            only_replacer,
            " It's a ; One b fifty eIght A Z 5 fifty ",
            " Its a 1 b 50 8 A Z 5 50 ",
        ),
        (
            only_replacer,
            " I'm is a One b fifty eIght A Z 5 fifty ",
            " Im is a 1 b 50 8 A Z 5 50 ",
        ),
        (extractor, "One Z fifty-eight 5 oh o b fifty", "1Z585OOB50"),
        (extractor_oh, "One Z fifty-eight 5 oh o b fifty", "1Z58500B50"),
        (
            keeper,
            "I'll phone number One hundred n fifty-eight not 5 oh o fifty A B more",
            (kept_entities, 10),
        ),
        (
            keeper,
            "I'm One hundred n fifty-eight not 5 oh o fifty A B more",
            (kept_entities, 10),
        ),
        (
            keeper,
            "I am One hundred n fifty-eight not 5 oh o fifty A B more",
            ("I " + kept_entities, 11),
        ),
    ]
    for transform, spoken, expected in cases:
        assert transform(spoken) == expected