Merged
Changes from 4 commits
2 changes: 1 addition & 1 deletion Jenkinsfile
@@ -17,7 +17,7 @@ pipeline {
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-27-23-0'
ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-13-23-2'
FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0'
HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0'
PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
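The Jenkinsfile hunk only repoints the Hungarian grammar cache (HU_TN_CACHE) from the 06-08-23-0 snapshot to 07-16-24-0 so CI picks up the regenerated Hungarian grammars. Below is a minimal sketch of how such a cache directory is typically consumed, assuming the standard nemo_text_processing Normalizer API with its cache_dir and overwrite_cache parameters; the path is the CI value from the Jenkinsfile and is not expected to exist locally.

```python
# Illustrative sketch only, not part of the diff. Shows how a prebuilt grammar
# cache such as HU_TN_CACHE is typically handed to the NeMo normalizer so CI
# reuses cached .far files instead of recompiling grammars on every run.
from nemo_text_processing.text_normalization.normalize import Normalizer

normalizer = Normalizer(
    input_case="cased",
    lang="hu",
    cache_dir="/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0",  # HU_TN_CACHE
    overwrite_cache=False,  # keep the prebuilt grammars
)
print(normalizer.normalize("www.nvidia.com", verbose=False))
```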
50 changes: 33 additions & 17 deletions nemo_text_processing/text_normalization/en/graph_utils.py
@@ -35,9 +35,9 @@
NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize()
NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()
NEMO_HEX = pynini.union(*string.hexdigits).optimize()
NEMO_NON_BREAKING_SPACE = u"\u00A0"
NEMO_NON_BREAKING_SPACE = "\u00A0"
NEMO_SPACE = " "
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize()
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize()
NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()

@@ -79,20 +79,36 @@
delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
delete_preserve_order = pynini.closure(
pynutil.delete(" preserve_order: true")
| (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE + pynutil.delete("\""))
| (pynutil.delete(' field_order: "') + NEMO_NOT_QUOTE + pynutil.delete('"'))
)

# Common string literals; expand as you see fit.
username_string = "username"
double_quotes = '"'
domain_string = "domain"
protocol_string = "protocol"
slash = "/"
double_slash = "//"
triple_slash = "///"
file = "file"
period = "."
at = "@"
colon = ":"
https = "https"
http = "http"
www = "www"

suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
# _v = pynini.union("a", "e", "i", "o", "u")
_c = pynini.union(
"b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z"
"b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z",
)
_ies = NEMO_SIGMA + _c + pynini.cross("y", "ies")
_es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es")
_s = NEMO_SIGMA + pynutil.insert("s")

graph_plural = plurals._priority_union(
suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA
suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA,
).optimize()

SINGULAR_TO_PLURAL = graph_plural
@@ -107,8 +123,8 @@


def capitalized_input_graph(
graph: 'pynini.FstLike', original_graph_weight: float = None, capitalized_graph_weight: float = None
) -> 'pynini.FstLike':
graph: "pynini.FstLike", original_graph_weight: float = None, capitalized_graph_weight: float = None,
) -> "pynini.FstLike":
"""
Allow graph input to be capitalized, e.g. for ITN)

@@ -129,7 +145,7 @@ def capitalized_input_graph(
return graph


def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']):
def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]):
"""
Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name.

@@ -141,7 +157,7 @@ def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']):
for rule, graph in graphs.items():
exporter[rule] = graph.optimize()
exporter.close()
logger.info(f'Created {file_name}')
logger.info(f"Created {file_name}")


def get_plurals(fst):
@@ -168,7 +184,7 @@ def get_singulars(fst):
return PLURAL_TO_SINGULAR @ fst


def convert_space(fst) -> 'pynini.FstLike':
def convert_space(fst) -> "pynini.FstLike":
"""
Converts space to nonbreaking space.
Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty"
@@ -191,7 +207,7 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED):
written_capitalized = written[0].upper() + written[1:]
additional_labels.extend(
[
[written_capitalized, spoken.capitalize()], # first letter capitalized
[written_capitalized, spoken.capitalize(),], # first letter capitalized
[
written_capitalized,
spoken.upper().replace(" AND ", " and "),
Expand All @@ -205,7 +221,7 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED):
logger.debug(f"This is weight {weight}")
if len(weight) == 0:
additional_labels.extend(
[[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()]]
[[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()],]
)
else:
additional_labels.extend(
@@ -237,7 +253,7 @@ def __init__(self, name: str, kind: str, deterministic: bool = True):
self._fst = None
self.deterministic = deterministic

self.far_path = Path(os.path.dirname(__file__) + '/grammars/' + kind + '/' + name + '.far')
self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far")
if self.far_exist():
self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst()

@@ -248,14 +264,14 @@ def far_exist(self) -> bool:
return self.far_path.exists()

@property
def fst(self) -> 'pynini.FstLike':
def fst(self) -> "pynini.FstLike":
return self._fst

@fst.setter
def fst(self, fst):
self._fst = fst

def add_tokens(self, fst) -> 'pynini.FstLike':
def add_tokens(self, fst) -> "pynini.FstLike":
"""
Wraps class name around to given fst

@@ -267,7 +283,7 @@ def add_tokens(self, fst) -> 'pynini.FstLike':
"""
return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")

def delete_tokens(self, fst) -> 'pynini.FstLike':
def delete_tokens(self, fst) -> "pynini.FstLike":
"""
Deletes class name wrap around output of given fst

@@ -286,4 +302,4 @@ def delete_tokens(self, fst) -> 'pynini.FstLike':
+ delete_space
+ pynutil.delete("}")
)
return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA)
return res @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", NEMO_SIGMA)
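The graph_utils.py changes are largely mechanical: single quotes become double quotes, trailing commas are added, and a set of shared string literals (username_string, domain_string, protocol_string, colon, double_quotes, and so on) is introduced so language-specific taggers can assemble their key: "value" serializations from one place. The pluralization graph itself is unchanged; below is a minimal sketch of what it does, assuming pynini is installed and graph_plural is importable as defined above (pluralize is a hypothetical helper, not a function in the module).

```python
# Illustrative sketch only, not part of the diff. Applies the unchanged English
# pluralization graph from graph_utils.py to a few written forms.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import graph_plural


def pluralize(word: str) -> str:
    # Compose the input string with the FST and read off the single best path.
    return pynini.shortestpath(word @ graph_plural).string()


for word in ["baby", "church", "cat"]:
    print(word, "->", pluralize(word))  # expected: babies, churches, cats
```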
205 changes: 179 additions & 26 deletions nemo_text_processing/text_normalization/hu/taggers/electronic.py
@@ -11,14 +11,29 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, NEMO_DIGIT, GraphFst, insert_space
from nemo_text_processing.text_normalization.es.utils import get_abs_path, load_labels

common_domains = [x[0] for x in load_labels(get_abs_path("data/electronic/domain.tsv"))]
symbols = [x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv"))]
from nemo_text_processing.text_normalization.de.utils import get_abs_path, load_labels
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_ALPHA,
NEMO_DIGIT,
NEMO_SPACE,
GraphFst,
at,
colon,
domain_string,
double_quotes,
double_slash,
http,
https,
period,
protocol_string,
username_string,
www,
)


class ElectronicFst(GraphFst):
@@ -35,29 +50,162 @@ class ElectronicFst(GraphFst):
def __init__(self, deterministic: bool = True):
super().__init__(name="electronic", kind="classify", deterministic=deterministic)

dot = pynini.accep(".")
accepted_common_domains = pynini.union(*common_domains)
accepted_symbols = pynini.union(*symbols) - dot
accepted_characters = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols)
acceepted_characters_with_dot = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols | dot)
dot = pynini.accep(period)

symbols = [x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv"))]
symbols = pynini.union(*symbols)
# all symbols
symbols_no_period = pynini.difference(symbols, dot) # alphabet of accepted symbols excluding the '.'
accepted_characters = pynini.closure(
(NEMO_ALPHA | NEMO_DIGIT | symbols_no_period), 1
) # alphabet of accepted chars excluding the '.'
all_characters = pynini.closure(
(NEMO_ALPHA | NEMO_DIGIT | symbols), 1
) # alphabet of accepted chars including the '.'

# domains
domain = dot + accepted_characters
domain_graph = (
pynutil.insert(domain_string + colon + NEMO_SPACE + double_quotes)
+ (accepted_characters + pynini.closure(domain, 1))
+ pynutil.insert(double_quotes)
)

# email
username = (
pynutil.insert("username: \"")
+ acceepted_characters_with_dot
+ pynutil.insert("\"")
+ pynini.cross('@', ' ')
pynutil.insert(username_string + colon + NEMO_SPACE + double_quotes)
+ all_characters
+ pynutil.insert(double_quotes)
+ pynini.cross(at, NEMO_SPACE)
)
domain_graph = accepted_characters + dot + accepted_characters
domain_graph = pynutil.insert("domain: \"") + domain_graph + pynutil.insert("\"")
domain_common_graph = (
pynutil.insert("domain: \"")
+ accepted_characters
+ accepted_common_domains
+ pynini.closure((accepted_symbols | dot) + pynini.closure(accepted_characters, 1), 0, 1)
+ pynutil.insert("\"")
email = username + domain_graph

# social media tags
tag = (
pynutil.delete(at)
+ pynutil.insert(username_string + colon + NEMO_SPACE + double_quotes)
+ (accepted_characters | (accepted_characters + pynini.closure(domain, 1)))
+ pynutil.insert(double_quotes)
)

# url
protocol_start = pynini.accep(https + colon + double_slash) | pynini.accep(http + colon + double_slash)
protocol_end = (
pynini.accep(www + period)
if deterministic
else (
pynini.accep(www + period)
| pynini.cross(www + period, "vé vé vé.")
| pynini.cross(www + period, "dupla vé dupla vé dupla vé.")
| pynini.cross(www + period, "kettős vé kettős vé kettős vé.")
)
)
protocol = protocol_start | protocol_end | (protocol_start + protocol_end)
protocol = (
pynutil.insert(protocol_string + colon + NEMO_SPACE + double_quotes)
+ protocol
+ pynutil.insert(double_quotes)
)
url = protocol + pynutil.insert(NEMO_SPACE) + (domain_graph)

graph = url | domain_graph | email | tag
self.graph = graph

final_graph = self.add_tokens(self.graph + pynutil.insert(" preserve_order: true"))
self.fst = final_graph.optimize()


'''
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.de.utils import get_abs_path, load_labels
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_ALPHA,
NEMO_DIGIT,
NEMO_NOT_SPACE,
NEMO_SIGMA,
NEMO_UPPER,
TO_UPPER,
GraphFst,
get_abs_path,
insert_space,
period,
)


class ElectronicFst(GraphFst):
"""
Finite state transducer for classifying electronic: email addresses
e.g. "abc@hotmail.com" -> electronic { username: "abc" domain: "hotmail.com" preserve_order: true }
e.g. "www.abc.com/123" -> electronic { protocol: "www." domain: "abc.com/123" preserve_order: true }

Args:
deterministic: if True will provide a single transduction option,
for False multiple transduction are generated (used for audio-based normalization)
"""

def __init__(self, deterministic: bool = True):
super().__init__(
name="electronic", kind="classify", deterministic=deterministic
)

dot = pynini.accep(period)

symbols = [
x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv"))
]
symbols = pynini.union(*symbols)
# all symbols
symbols_no_period = pynini.difference(
symbols, dot
) # alphabet of accepted symbols excluding the '.'
accepted_characters = pynini.closure(
(NEMO_ALPHA | NEMO_DIGIT | symbols_no_period), 1
) # alphabet of accepted chars excluding the '.'
all_characters = pynini.closure(
(NEMO_ALPHA | NEMO_DIGIT | symbols), 1
) # alphabet of accepted chars including the '.'

# domains
domain = dot + accepted_characters
domain_graph = (
pynutil.insert()
+ (accepted_characters + pynini.closure(domain, 1))
+ insert_double_quotes
)

# email
username = (
insert_username
+ all_characters
+ insert_double_quotes
+ pynini.cross("@", " ")
)
email = username + domain_graph

# social media tags
tag = (
pynutil.delete("@")
+ insert_username
+ (accepted_characters | (accepted_characters + pynini.closure(domain, 1)))
+ insert_double_quotes
)
graph = (username + domain_graph) | domain_common_graph

# url
protocol_start = pynini.accep("https://") | pynini.accep("http://")
@@ -72,9 +220,14 @@ def __init__(self, deterministic: bool = True):
)
)
protocol = protocol_start | protocol_end | (protocol_start + protocol_end)
protocol = pynutil.insert("protocol: \"") + protocol + pynutil.insert("\"")
graph |= protocol + insert_space + (domain_graph | domain_common_graph)
protocol = insert_protocol + protocol + insert_double_quotes
url = protocol + insert_space + (domain_graph)

graph = url | domain_graph | email | tag
self.graph = graph

final_graph = self.add_tokens(self.graph + pynutil.insert(" preserve_order: true"))
final_graph = self.add_tokens(
self.graph + pynutil.insert(" preserve_order: true")
)
self.fst = final_graph.optimize()
'''
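The rewritten Hungarian ElectronicFst tags e-mail addresses, bare domains, social-media handles, and URLs as electronic { ... } tokens, adding Hungarian readings of "www." (vé vé vé, dupla vé, kettős vé) when deterministic=False; the triple-quoted block at the end of the file is a commented-out variant left in the diff. Below is a minimal sketch of exercising the classifier on its own, outside the full tokenize-and-classify pipeline, assuming the grammar compiles and its data files are present; the sample inputs and the expected output shown in comments are illustrative only.

```python
# Illustrative sketch only, not part of the diff. Runs the Hungarian electronic
# classifier defined above on a couple of inputs; exact outputs are approximate.
import pynini
from nemo_text_processing.text_normalization.hu.taggers.electronic import ElectronicFst

electronic = ElectronicFst(deterministic=True)

for text in ["abc@nvidia.com", "https://www.nvidia.com"]:
    lattice = text @ electronic.fst  # compose the input with the classifier
    print(pynini.shortestpath(lattice).string())

# One possible tagging of the first input:
# electronic { username: "abc" domain: "nvidia.com" preserve_order: true }
```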