318 lines
11 KiB
Python
318 lines
11 KiB
Python
import re
|
|
|
|
|
|
def entity_replacer_keeper(pre_rules=[], entity_rules=[], post_rules=[]):
|
|
# def replacer_keeper_gen():
|
|
pre_rules_c = [(re.compile(k), v) for (k, v) in pre_rules]
|
|
entity_rules_c = [(re.compile(k, re.IGNORECASE), v) for (k, v) in entity_rules]
|
|
post_rules_c = [(re.compile(k), v) for (k, v) in post_rules]
|
|
|
|
re_rules = pre_rules_c + entity_rules_c + post_rules_c
|
|
|
|
def replacer(w2v_out):
|
|
out = w2v_out
|
|
for (k, v) in re_rules:
|
|
out = k.sub(v, out)
|
|
return out
|
|
|
|
def merge_intervals(intervals):
|
|
# https://codereview.stackexchange.com/a/69249
|
|
sorted_by_lower_bound = sorted(intervals, key=lambda tup: tup[0])
|
|
merged = []
|
|
|
|
for higher in sorted_by_lower_bound:
|
|
if not merged:
|
|
merged.append(higher)
|
|
else:
|
|
lower = merged[-1]
|
|
# test for intersection between lower and higher:
|
|
# we know via sorting that lower[0] <= higher[0]
|
|
if higher[0] <= lower[1]:
|
|
upper_bound = max(lower[1], higher[1])
|
|
merged[-1] = (
|
|
lower[0],
|
|
upper_bound,
|
|
) # replace by merged interval
|
|
else:
|
|
merged.append(higher)
|
|
return merged
|
|
|
|
# merging interval tree for optimal # https://www.geeksforgeeks.org/interval-tree/
|
|
|
|
def keep_literals(w2v_out):
|
|
# out = re.sub(r"[ ;,.]", " ", w2v_out).strip()
|
|
out = w2v_out
|
|
for (k, v) in pre_rules_c:
|
|
out = k.sub(v, out)
|
|
num_spans = []
|
|
for (k, v) in entity_rules_c: # [94:]:
|
|
matches = k.finditer(out)
|
|
for m in matches:
|
|
# num_spans.append((k, m.span()))
|
|
num_spans.append(m.span())
|
|
# out = re.sub(k, v, out)
|
|
merged = merge_intervals(num_spans)
|
|
num_ents = len(merged)
|
|
keep_out = " ".join((out[s[0] : s[1]] for s in merged))
|
|
for (k, v) in post_rules_c:
|
|
keep_out = k.sub(v, keep_out)
|
|
return keep_out, num_ents
|
|
|
|
return replacer, keep_literals
|
|
|
|
|
|
def default_num_only_rules(num_range):
|
|
entity_rules = (
|
|
[
|
|
("\\bninety-nine\\b", "99"),
|
|
("\\bninety-eight\\b", "98"),
|
|
("\\bninety-seven\\b", "97"),
|
|
("\\bninety-six\\b", "96"),
|
|
("\\bninety-five\\b", "95"),
|
|
("\\bninety-four\\b", "94"),
|
|
("\\bninety-three\\b", "93"),
|
|
("\\bninety-two\\b", "92"),
|
|
("\\bninety-one\\b", "91"),
|
|
("\\bninety\\b", "90"),
|
|
("\\beighty-nine\\b", "89"),
|
|
("\\beighty-eight\\b", "88"),
|
|
("\\beighty-seven\\b", "87"),
|
|
("\\beighty-six\\b", "86"),
|
|
("\\beighty-five\\b", "85"),
|
|
("\\beighty-four\\b", "84"),
|
|
("\\beighty-three\\b", "83"),
|
|
("\\beighty-two\\b", "82"),
|
|
("\\beighty-one\\b", "81"),
|
|
("\\beighty\\b", "80"),
|
|
("\\bseventy-nine\\b", "79"),
|
|
("\\bseventy-eight\\b", "78"),
|
|
("\\bseventy-seven\\b", "77"),
|
|
("\\bseventy-six\\b", "76"),
|
|
("\\bseventy-five\\b", "75"),
|
|
("\\bseventy-four\\b", "74"),
|
|
("\\bseventy-three\\b", "73"),
|
|
("\\bseventy-two\\b", "72"),
|
|
("\\bseventy-one\\b", "71"),
|
|
("\\bseventy\\b", "70"),
|
|
("\\bsixty-nine\\b", "69"),
|
|
("\\bsixty-eight\\b", "68"),
|
|
("\\bsixty-seven\\b", "67"),
|
|
("\\bsixty-six\\b", "66"),
|
|
("\\bsixty-five\\b", "65"),
|
|
("\\bsixty-four\\b", "64"),
|
|
("\\bsixty-three\\b", "63"),
|
|
("\\bsixty-two\\b", "62"),
|
|
("\\bsixty-one\\b", "61"),
|
|
("\\bsixty\\b", "60"),
|
|
("\\bfifty-nine\\b", "59"),
|
|
("\\bfifty-eight\\b", "58"),
|
|
("\\bfifty-seven\\b", "57"),
|
|
("\\bfifty-six\\b", "56"),
|
|
("\\bfifty-five\\b", "55"),
|
|
("\\bfifty-four\\b", "54"),
|
|
("\\bfifty-three\\b", "53"),
|
|
("\\bfifty-two\\b", "52"),
|
|
("\\bfifty-one\\b", "51"),
|
|
("\\bfifty\\b", "50"),
|
|
("\\bforty-nine\\b", "49"),
|
|
("\\bforty-eight\\b", "48"),
|
|
("\\bforty-seven\\b", "47"),
|
|
("\\bforty-six\\b", "46"),
|
|
("\\bforty-five\\b", "45"),
|
|
("\\bforty-four\\b", "44"),
|
|
("\\bforty-three\\b", "43"),
|
|
("\\bforty-two\\b", "42"),
|
|
("\\bforty-one\\b", "41"),
|
|
("\\bforty\\b", "40"),
|
|
("\\bthirty-nine\\b", "39"),
|
|
("\\bthirty-eight\\b", "38"),
|
|
("\\bthirty-seven\\b", "37"),
|
|
("\\bthirty-six\\b", "36"),
|
|
("\\bthirty-five\\b", "35"),
|
|
("\\bthirty-four\\b", "34"),
|
|
("\\bthirty-three\\b", "33"),
|
|
("\\bthirty-two\\b", "32"),
|
|
("\\bthirty-one\\b", "31"),
|
|
("\\bthirty\\b", "30"),
|
|
("\\btwenty-nine\\b", "29"),
|
|
("\\btwenty-eight\\b", "28"),
|
|
("\\btwenty-seven\\b", "27"),
|
|
("\\btwenty-six\\b", "26"),
|
|
("\\btwenty-five\\b", "25"),
|
|
("\\btwenty-four\\b", "24"),
|
|
("\\btwenty-three\\b", "23"),
|
|
("\\btwenty-two\\b", "22"),
|
|
("\\btwenty-one\\b", "21"),
|
|
("\\btwenty\\b", "20"),
|
|
("\\bnineteen\\b", "19"),
|
|
("\\beighteen\\b", "18"),
|
|
("\\bseventeen\\b", "17"),
|
|
("\\bsixteen\\b", "16"),
|
|
("\\bfifteen\\b", "15"),
|
|
("\\bfourteen\\b", "14"),
|
|
("\\bthirteen\\b", "13"),
|
|
("\\btwelve\\b", "12"),
|
|
("\\beleven\\b", "11"),
|
|
("\\bten\\b", "10"),
|
|
("\\bnine\\b", "9"),
|
|
("\\beight\\b", "8"),
|
|
("\\bseven\\b", "7"),
|
|
("\\bsix\\b", "6"),
|
|
("\\bfive\\b", "5"),
|
|
("\\bfour\\b", "4"),
|
|
("\\bthree\\b", "3"),
|
|
("\\btwo\\b", "2"),
|
|
("\\bone\\b", "1"),
|
|
("\\bzero\\b", "0"),
|
|
]
|
|
+ [
|
|
(
|
|
r"\b" + str(i) + r"\b",
|
|
str(i),
|
|
)
|
|
for i in reversed(range(10))
|
|
]
|
|
+ [
|
|
(r"\bhundred\b", "00"),
|
|
]
|
|
)
|
|
return entity_rules
|
|
|
|
|
|
def default_num_rules(num_range):
|
|
entity_rules = default_num_only_rules(num_range) + [
|
|
(r"\boh\b", " 0 "),
|
|
(r"\bo\b", " 0 "),
|
|
(r"\bdouble(?: |-)(\w+|\d+)\b", "\\1 \\1"),
|
|
(r"\btriple(?: |-)(\w+|\d+)\b", "\\1 \\1 \\1"),
|
|
]
|
|
return entity_rules
|
|
|
|
|
|
def default_alnum_rules(num_range, oh_is_zero):
|
|
oh_is_zero_rules = [
|
|
(r"\boh\b", "0"),
|
|
(r"\bo\b", "0"),
|
|
]
|
|
entity_rules = (
|
|
default_num_only_rules(num_range)
|
|
+ (oh_is_zero_rules if oh_is_zero else [(r"\boh\b", "o")])
|
|
+ [
|
|
(r"\bdouble(?: |-)(\w+|\d+)\b", "\\1 \\1"),
|
|
(r"\btriple(?: |-)(\w+|\d+)\b", "\\1 \\1 \\1"),
|
|
(r"\b([a-zA-Z])\b", "\\1"),
|
|
]
|
|
)
|
|
return entity_rules
|
|
|
|
|
|
def num_replacer(num_range=100, condense=True):
|
|
entity_rules = default_num_rules(num_range)
|
|
post_rules = [(r"[^0-9]", "")] if condense else []
|
|
# post_rules = []
|
|
replacer, keeper = entity_replacer_keeper(
|
|
entity_rules=entity_rules, post_rules=post_rules
|
|
)
|
|
return replacer
|
|
|
|
|
|
def num_keeper(num_range=100):
|
|
entity_rules = default_num_rules(num_range)
|
|
pre_rules = [(r"[ ;,.]", " ")]
|
|
post_rules = []
|
|
replacer, keeper = entity_replacer_keeper(
|
|
pre_rules=pre_rules, entity_rules=entity_rules, post_rules=post_rules
|
|
)
|
|
return keeper
|
|
|
|
|
|
def alnum_replacer(num_range=100, oh_is_zero=False, condense=True):
|
|
entity_rules = default_alnum_rules(num_range, oh_is_zero)
|
|
# entity_rules = default_num_rules(num_range)
|
|
pre_rules = [(r"[ ;,.]", " "), (r"[']", "")]
|
|
|
|
def upper_case(match_obj):
|
|
char_elem = match_obj.group(0)
|
|
return char_elem.upper()
|
|
|
|
post_rules = (
|
|
[
|
|
# (r"\b[a-zA-Z]+\'[a-zA-Z]+\b", ""),
|
|
(r"\b[a-zA-Z]{2,}\b", ""),
|
|
(r"[^a-zA-Z0-9]", ""),
|
|
(r"([a-z].*)", upper_case),
|
|
]
|
|
if condense
|
|
else []
|
|
)
|
|
replacer, keeper = entity_replacer_keeper(
|
|
pre_rules=pre_rules, entity_rules=entity_rules, post_rules=post_rules
|
|
)
|
|
return replacer
|
|
|
|
|
|
def alnum_keeper(num_range=100, oh_is_zero=False):
|
|
entity_rules = default_alnum_rules(num_range, oh_is_zero)
|
|
pre_rules = [(r"[ ;,.]", " "), (r"[']", "")]
|
|
post_rules = []
|
|
replacer, keeper = entity_replacer_keeper(
|
|
pre_rules=pre_rules, entity_rules=entity_rules, post_rules=post_rules
|
|
)
|
|
return keeper
|
|
|
|
|
|
def test_num():
|
|
num_extractor = num_replacer()
|
|
keeper = num_keeper()
|
|
num_only_replacer = num_replacer(condense=False)
|
|
assert num_extractor("thirty-two") == "32"
|
|
assert num_extractor("not thirty-two fifty-nine") == "3259"
|
|
assert num_extractor(" triPle 5 fifty 3") == "555503"
|
|
assert num_only_replacer(" triPle 5 fifty 3") == " 5 5 5 50 3"
|
|
assert num_extractor("douBle 2 130") == "22130"
|
|
assert num_extractor("It is a One fifty eIght 5 fifty ") == "1508550"
|
|
assert (
|
|
num_only_replacer(" It is a One fifty eIght 5 fifty ")
|
|
== " It is a 1 50 8 5 50 "
|
|
)
|
|
assert num_extractor("One fifty-eight 5 oh o fifty") == "15850050"
|
|
assert keeper(
|
|
"my phone number is One hundred fifty-eight not 5 oh o fifty more"
|
|
) == ("One hundred fifty-eight 5 oh o fifty", 7)
|
|
|
|
|
|
def test_alnum():
|
|
extractor_oh = alnum_replacer(oh_is_zero=True)
|
|
extractor = alnum_replacer()
|
|
keeper = alnum_keeper()
|
|
only_replacer = alnum_replacer(condense=False)
|
|
assert extractor("I'm thirty-two") == "32"
|
|
assert extractor("a thirty-two") == "A32"
|
|
assert extractor("not a b thirty-two fifty-nine") == "AB3259"
|
|
assert extractor(" triPle 5 fifty 3") == "555503"
|
|
assert only_replacer(" triPle 5 fifty 3") == " 5 5 5 50 3"
|
|
assert extractor("douBle 2 130") == "22130"
|
|
assert extractor("It is a One b fifty eIght A Z 5 fifty ") == "A1B508AZ550"
|
|
assert (
|
|
only_replacer(" It's a ; One b fifty eIght A Z 5 fifty ")
|
|
== " Its a 1 b 50 8 A Z 5 50 "
|
|
)
|
|
assert (
|
|
only_replacer(" I'm is a One b fifty eIght A Z 5 fifty ")
|
|
== " Im is a 1 b 50 8 A Z 5 50 "
|
|
)
|
|
assert extractor("One Z fifty-eight 5 oh o b fifty") == "1Z585OOB50"
|
|
assert extractor_oh("One Z fifty-eight 5 oh o b fifty") == "1Z58500B50"
|
|
assert keeper(
|
|
"I'll phone number One hundred n fifty-eight not 5 oh o fifty A B more"
|
|
) == ("One hundred n fifty-eight 5 oh o fifty A B", 10)
|
|
assert keeper("I'm One hundred n fifty-eight not 5 oh o fifty A B more") == (
|
|
"One hundred n fifty-eight 5 oh o fifty A B",
|
|
10,
|
|
)
|
|
|
|
assert keeper("I am One hundred n fifty-eight not 5 oh o fifty A B more") == (
|
|
"I One hundred n fifty-eight 5 oh o fifty A B",
|
|
11,
|
|
)
|