Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ pipeline {
AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-24-24-0'
DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-24-0'
EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-06-24-0'
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-27-23-0'
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0'
ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-13-23-2'
FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0'
HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
Expand Down
50 changes: 33 additions & 17 deletions nemo_text_processing/text_normalization/en/graph_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@
NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize()
NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()
NEMO_HEX = pynini.union(*string.hexdigits).optimize()
NEMO_NON_BREAKING_SPACE = u"\u00A0"
NEMO_NON_BREAKING_SPACE = "\u00A0"
NEMO_SPACE = " "
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize()
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize()
NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()

Expand Down Expand Up @@ -79,20 +79,36 @@
delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
delete_preserve_order = pynini.closure(
pynutil.delete(" preserve_order: true")
| (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE + pynutil.delete("\""))
| (pynutil.delete(' field_order: "') + NEMO_NOT_QUOTE + pynutil.delete('"'))
)

# Common string literals shared with the grammars that import this module,
# so field names and punctuation are spelled once instead of being repeated
# as inline literals. Expand as you see fit.
# Field names and the quote character; importing grammars combine them with
# `colon` and NEMO_SPACE to build the `<field>: "` prefix of a serialized field.
username_string = "username"
double_quotes = '"'
domain_string = "domain"
protocol_string = "protocol"
# Path / URL separators.
slash = "/"
double_slash = "//"
triple_slash = "///"
file = "file"
# Single punctuation characters.
period = "."
at = "@"
colon = ":"
# URL scheme names and the "www" host prefix (without the trailing "://" or ".").
https = "https"
http = "http"
www = "www"

suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
# _v = pynini.union("a", "e", "i", "o", "u")
_c = pynini.union(
"b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z"
"b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z",
)
_ies = NEMO_SIGMA + _c + pynini.cross("y", "ies")
_es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es")
_s = NEMO_SIGMA + pynutil.insert("s")

graph_plural = plurals._priority_union(
suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA
suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA,
).optimize()

SINGULAR_TO_PLURAL = graph_plural
Expand All @@ -107,8 +123,8 @@


def capitalized_input_graph(
graph: 'pynini.FstLike', original_graph_weight: float = None, capitalized_graph_weight: float = None
) -> 'pynini.FstLike':
graph: "pynini.FstLike", original_graph_weight: float = None, capitalized_graph_weight: float = None,
) -> "pynini.FstLike":
"""
Allow graph input to be capitalized (e.g. for ITN).

Expand All @@ -129,7 +145,7 @@ def capitalized_input_graph(
return graph


def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']):
def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]):
"""
Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name.

Expand All @@ -141,7 +157,7 @@ def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']):
for rule, graph in graphs.items():
exporter[rule] = graph.optimize()
exporter.close()
logger.info(f'Created {file_name}')
logger.info(f"Created {file_name}")


def get_plurals(fst):
Expand All @@ -168,7 +184,7 @@ def get_singulars(fst):
return PLURAL_TO_SINGULAR @ fst


def convert_space(fst) -> 'pynini.FstLike':
def convert_space(fst) -> "pynini.FstLike":
"""
Converts space to nonbreaking space.
Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty"
Expand All @@ -191,7 +207,7 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED):
written_capitalized = written[0].upper() + written[1:]
additional_labels.extend(
[
[written_capitalized, spoken.capitalize()], # first letter capitalized
[written_capitalized, spoken.capitalize(),], # first letter capitalized
[
written_capitalized,
spoken.upper().replace(" AND ", " and "),
Expand All @@ -205,7 +221,7 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED):
logger.debug(f"This is weight {weight}")
if len(weight) == 0:
additional_labels.extend(
[[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()]]
[[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()],]
)
else:
additional_labels.extend(
Expand Down Expand Up @@ -237,7 +253,7 @@ def __init__(self, name: str, kind: str, deterministic: bool = True):
self._fst = None
self.deterministic = deterministic

self.far_path = Path(os.path.dirname(__file__) + '/grammars/' + kind + '/' + name + '.far')
self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far")
if self.far_exist():
self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst()

Expand All @@ -248,14 +264,14 @@ def far_exist(self) -> bool:
return self.far_path.exists()

@property
def fst(self) -> 'pynini.FstLike':
def fst(self) -> "pynini.FstLike":
return self._fst

@fst.setter
def fst(self, fst):
self._fst = fst

def add_tokens(self, fst) -> 'pynini.FstLike':
def add_tokens(self, fst) -> "pynini.FstLike":
"""
Wraps the class name around the given fst

Expand All @@ -267,7 +283,7 @@ def add_tokens(self, fst) -> 'pynini.FstLike':
"""
return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")

def delete_tokens(self, fst) -> 'pynini.FstLike':
def delete_tokens(self, fst) -> "pynini.FstLike":
"""
Deletes the class-name wrapper around the output of the given fst

Expand All @@ -286,4 +302,4 @@ def delete_tokens(self, fst) -> 'pynini.FstLike':
+ delete_space
+ pynutil.delete("}")
)
return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA)
return res @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", NEMO_SIGMA)
80 changes: 56 additions & 24 deletions nemo_text_processing/text_normalization/es/taggers/electronic.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,30 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, NEMO_DIGIT, GraphFst, insert_space
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_ALPHA,
NEMO_DIGIT,
NEMO_SPACE,
GraphFst,
at,
colon,
domain_string,
double_quotes,
double_slash,
http,
https,
period,
protocol_string,
username_string,
www,
)
from nemo_text_processing.text_normalization.es.utils import get_abs_path, load_labels

common_domains = [x[0] for x in load_labels(get_abs_path("data/electronic/domain.tsv"))]
symbols = [x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv"))]


class ElectronicFst(GraphFst):
Expand All @@ -36,39 +52,55 @@ def __init__(self, deterministic: bool = True):
super().__init__(name="electronic", kind="classify", deterministic=deterministic)

dot = pynini.accep(".")
accepted_common_domains = pynini.union(*common_domains)
accepted_symbols = pynini.union(*symbols) - dot
accepted_characters = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols)
acceepted_characters_with_dot = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols | dot)

symbols = [x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv"))]
symbols = pynini.union(*symbols)
symbols_no_period = pynini.difference(symbols, dot)
accepted_characters = pynini.closure((NEMO_ALPHA | NEMO_DIGIT | symbols_no_period), 1)
all_characters = pynini.closure((NEMO_ALPHA | NEMO_DIGIT | symbols), 1)

# domains
domain = dot + accepted_characters
domain_graph = (
pynutil.insert(domain_string + colon + NEMO_SPACE + double_quotes)
+ (accepted_characters + pynini.closure(domain, 1))
+ pynutil.insert(double_quotes)
)

# email
username = (
pynutil.insert("username: \"")
+ acceepted_characters_with_dot
+ pynutil.insert("\"")
+ pynini.cross('@', ' ')
pynutil.insert(username_string + colon + NEMO_SPACE + double_quotes)
+ all_characters
+ pynutil.insert(double_quotes)
+ pynini.cross(at, NEMO_SPACE)
)
domain_graph = accepted_characters + dot + accepted_characters
domain_graph = pynutil.insert("domain: \"") + domain_graph + pynutil.insert("\"")
domain_common_graph = (
pynutil.insert("domain: \"")
+ accepted_characters
+ accepted_common_domains
+ pynini.closure((accepted_symbols | dot) + pynini.closure(accepted_characters, 1), 0, 1)
+ pynutil.insert("\"")
email = username + domain_graph

# social media tags
tag = (
pynini.cross(at, "")
+ pynutil.insert(username_string + colon + NEMO_SPACE + double_quotes)
+ (accepted_characters | (accepted_characters + pynini.closure(domain, 1)))
+ pynutil.insert(double_quotes)
)
graph = (username + domain_graph) | domain_common_graph

# url
protocol_start = pynini.accep("https://") | pynini.accep("http://")
protocol_start = pynini.accep(https + colon + double_slash) | pynini.accep(http + colon + double_slash)
# protocol_end = pynini.accep("www.")
protocol_end = (
pynini.accep("www.")
pynini.accep(www + period)
if deterministic
else pynini.accep("www.") | pynini.cross("www.", "doble ve doble ve doble ve.")
else pynini.accep(www + period) | pynini.cross(www + period, "doble ve doble ve doble ve.")
)
protocol = protocol_start | protocol_end | (protocol_start + protocol_end)
protocol = pynutil.insert("protocol: \"") + protocol + pynutil.insert("\"")
graph |= protocol + insert_space + (domain_graph | domain_common_graph)
protocol = (
pynutil.insert(protocol_string + colon + NEMO_SPACE + double_quotes)
+ protocol
+ pynutil.insert(double_quotes)
)
url = protocol + pynutil.insert(NEMO_SPACE) + (domain_graph)

graph = url | domain_graph | email | tag
self.graph = graph

final_graph = self.add_tokens(self.graph + pynutil.insert(" preserve_order: true"))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def __init__(
os.makedirs(cache_dir, exist_ok=True)
whitelist_file = os.path.basename(whitelist) if whitelist else ""
far_file = os.path.join(
cache_dir, f"_{input_case}_es_tn_{deterministic}_deterministic{whitelist_file}.far"
cache_dir, f"_{input_case}_es_tn_{deterministic}_deterministic{whitelist_file}.far",
)
if not overwrite_cache and far_file and os.path.exists(far_file):
self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
Expand All @@ -86,10 +86,10 @@ def __init__(
self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic)
decimal_graph = self.decimal.fst

self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic)
self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic,)
fraction_graph = self.fraction.fst
self.measure = MeasureFst(
cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic
cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic,
)
measure_graph = self.measure.fst
self.date = DateFst(cardinal=self.cardinal, deterministic=deterministic)
Expand All @@ -101,7 +101,7 @@ def __init__(
telephone_graph = self.telephone.fst
self.electronic = ElectronicFst(deterministic=deterministic)
electronic_graph = self.electronic.fst
self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic)
self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,)
money_graph = self.money.fst
self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist)
whitelist_graph = self.whitelist.fst
Expand All @@ -118,7 +118,7 @@ def __init__(
| pynutil.add_weight(decimal_graph, 1.1)
| pynutil.add_weight(money_graph, 1.09)
| pynutil.add_weight(telephone_graph, 1.11)
| pynutil.add_weight(electronic_graph, 1.1)
| pynutil.add_weight(electronic_graph, 1.11)
| pynutil.add_weight(word_graph, 200)
)
punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,14 @@
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_NOT_QUOTE,
NEMO_SIGMA,
NEMO_SPACE,
GraphFst,
colon,
delete_preserve_order,
insert_space,
domain_string,
double_quotes,
protocol_string,
username_string,
)
from nemo_text_processing.text_normalization.es.utils import get_abs_path

Expand All @@ -29,6 +34,7 @@
graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv"))
server_common = pynini.string_file(get_abs_path("data/electronic/server_name.tsv"))
domain_common = pynini.string_file(get_abs_path("data/electronic/domain.tsv"))
arroba = "arroba"


class ElectronicFst(GraphFst):
Expand All @@ -45,34 +51,42 @@ class ElectronicFst(GraphFst):

def __init__(self, deterministic: bool = True):
super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)

graph_digit_no_zero = (
digit_no_zero @ pynini.cdrewrite(pynini.cross("un", "uno"), "", "", NEMO_SIGMA).optimize()
)
graph_digit = graph_digit_no_zero | zero

def add_space_after_char():
return pynini.closure(NEMO_NOT_QUOTE - pynini.accep(" ") + insert_space) + (
NEMO_NOT_QUOTE - pynini.accep(" ")
return pynini.closure(NEMO_NOT_QUOTE - pynini.accep(NEMO_SPACE) + pynutil.insert(NEMO_SPACE)) + (
NEMO_NOT_QUOTE - pynini.accep(NEMO_SPACE)
)

verbalize_characters = pynini.cdrewrite(graph_symbols | graph_digit, "", "", NEMO_SIGMA)

user_name = pynutil.delete("username: \"") + add_space_after_char() + pynutil.delete("\"")
user_name = (
pynutil.delete(username_string + colon + NEMO_SPACE + double_quotes)
+ add_space_after_char()
+ pynutil.delete(double_quotes)
)
user_name @= verbalize_characters

convert_defaults = pynutil.add_weight(NEMO_NOT_QUOTE, weight=0.0001) | domain_common | server_common
domain = convert_defaults + pynini.closure(insert_space + convert_defaults)
domain = convert_defaults + pynini.closure(pynutil.insert(NEMO_SPACE) + convert_defaults)
domain @= verbalize_characters

domain = pynutil.delete("domain: \"") + domain + pynutil.delete("\"")
domain = (
pynutil.delete(domain_string + colon + NEMO_SPACE + double_quotes) + domain + pynutil.delete(double_quotes)
)
protocol = (
pynutil.delete("protocol: \"")
pynutil.delete(protocol_string + colon + NEMO_SPACE + double_quotes)
+ add_space_after_char() @ pynini.cdrewrite(graph_symbols, "", "", NEMO_SIGMA)
+ pynutil.delete("\"")
+ pynutil.delete(double_quotes)
)
self.graph = (pynini.closure(protocol + pynini.accep(" "), 0, 1) + domain) | (
user_name + pynini.accep(" ") + pynutil.insert("arroba ") + domain

self.graph = (pynini.closure(protocol + NEMO_SPACE, 0, 1) + domain) | (
user_name + NEMO_SPACE + pynutil.insert(arroba + NEMO_SPACE) + domain
| (pynutil.insert(arroba + NEMO_SPACE) + user_name)
)

delete_tokens = self.delete_tokens(self.graph + delete_preserve_order)
self.fst = delete_tokens.optimize()
Loading
Loading