Merged
Changes from 4 commits
2 changes: 1 addition & 1 deletion Jenkinsfile
@@ -17,7 +17,7 @@ pipeline {
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-27-23-0'
ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-13-23-2'
FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0'
HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0'
PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
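The Jenkinsfile hunk only repoints the Hungarian grammar cache (HU_TN_CACHE) from the 06-08-23-0 snapshot to 07-16-24-0 so CI picks up the regenerated Hungarian grammars. Below is a minimal sketch of how such a cache directory is typically consumed, assuming the standard nemo_text_processing Normalizer API with its cache_dir and overwrite_cache parameters; the path is the CI value from the Jenkinsfile and is not expected to exist locally.

```python
# Illustrative sketch only, not part of the diff. Shows how a prebuilt grammar
# cache such as HU_TN_CACHE is typically handed to the NeMo normalizer so CI
# reuses cached .far files instead of recompiling grammars on every run.
from nemo_text_processing.text_normalization.normalize import Normalizer

normalizer = Normalizer(
    input_case="cased",
    lang="hu",
    cache_dir="/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0",  # HU_TN_CACHE
    overwrite_cache=False,  # keep the prebuilt grammars
)
print(normalizer.normalize("www.nvidia.com", verbose=False))
```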
50 changes: 33 additions & 17 deletions nemo_text_processing/text_normalization/en/graph_utils.py
@@ -35,9 +35,9 @@
NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize()
NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()
NEMO_HEX = pynini.union(*string.hexdigits).optimize()
NEMO_NON_BREAKING_SPACE = u"\u00A0"
NEMO_NON_BREAKING_SPACE = "\u00A0"
NEMO_SPACE = " "
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize()
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize()
NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()

@@ -79,20 +79,36 @@
delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
delete_preserve_order = pynini.closure(
pynutil.delete(" preserve_order: true")
| (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE + pynutil.delete("\""))
| (pynutil.delete(' field_order: "') + NEMO_NOT_QUOTE + pynutil.delete('"'))
)

# Common string literals; expand as you see fit.
username_string = "username"
double_quotes = '"'
domain_string = "domain"
protocol_string = "protocol"
slash = "/"
double_slash = "//"
triple_slash = "///"
file = "file"
period = "."
at = "@"
colon = ":"
https = "https"
http = "http"
www = "www"

suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
# _v = pynini.union("a", "e", "i", "o", "u")
_c = pynini.union(
"b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z"
"b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z",
)
_ies = NEMO_SIGMA + _c + pynini.cross("y", "ies")
_es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es")
_s = NEMO_SIGMA + pynutil.insert("s")

graph_plural = plurals._priority_union(
suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA
suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA,
).optimize()

SINGULAR_TO_PLURAL = graph_plural
@@ -107,8 +123,8 @@


def capitalized_input_graph(
graph: 'pynini.FstLike', original_graph_weight: float = None, capitalized_graph_weight: float = None
) -> 'pynini.FstLike':
graph: "pynini.FstLike", original_graph_weight: float = None, capitalized_graph_weight: float = None,
) -> "pynini.FstLike":
"""
Allow graph input to be capitalized, e.g. for ITN)

@@ -129,7 +145,7 @@ def capitalized_input_graph(
return graph


def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']):
def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]):
"""
Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name.

@@ -141,7 +157,7 @@ def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']):
for rule, graph in graphs.items():
exporter[rule] = graph.optimize()
exporter.close()
logger.info(f'Created {file_name}')
logger.info(f"Created {file_name}")


def get_plurals(fst):
@@ -168,7 +184,7 @@ def get_singulars(fst):
return PLURAL_TO_SINGULAR @ fst


def convert_space(fst) -> 'pynini.FstLike':
def convert_space(fst) -> "pynini.FstLike":
"""
Converts space to nonbreaking space.
Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty"
@@ -191,7 +207,7 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED):
written_capitalized = written[0].upper() + written[1:]
additional_labels.extend(
[
[written_capitalized, spoken.capitalize()], # first letter capitalized
[written_capitalized, spoken.capitalize(),], # first letter capitalized
[
written_capitalized,
spoken.upper().replace(" AND ", " and "),
Expand All @@ -205,7 +221,7 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED):
logger.debug(f"This is weight {weight}")
if len(weight) == 0:
additional_labels.extend(
[[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()]]
[[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()],]
)
else:
additional_labels.extend(
@@ -237,7 +253,7 @@ def __init__(self, name: str, kind: str, deterministic: bool = True):
self._fst = None
self.deterministic = deterministic

self.far_path = Path(os.path.dirname(__file__) + '/grammars/' + kind + '/' + name + '.far')
self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far")
if self.far_exist():
self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst()

@@ -248,14 +264,14 @@ def far_exist(self) -> bool:
return self.far_path.exists()

@property
def fst(self) -> 'pynini.FstLike':
def fst(self) -> "pynini.FstLike":
return self._fst

@fst.setter
def fst(self, fst):
self._fst = fst

def add_tokens(self, fst) -> 'pynini.FstLike':
def add_tokens(self, fst) -> "pynini.FstLike":
"""
Wraps class name around to given fst

@@ -267,7 +283,7 @@ def add_tokens(self, fst) -> 'pynini.FstLike':
"""
return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")

def delete_tokens(self, fst) -> 'pynini.FstLike':
def delete_tokens(self, fst) -> "pynini.FstLike":
"""
Deletes class name wrap around output of given fst

@@ -286,4 +302,4 @@ def delete_tokens(self, fst) -> 'pynini.FstLike':
+ delete_space
+ pynutil.delete("}")
)
return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA)
return res @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", NEMO_SIGMA)
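The graph_utils.py changes are largely mechanical: single quotes become double quotes, trailing commas are added, and a set of shared string literals (username_string, domain_string, protocol_string, colon, double_quotes, and so on) is introduced so language-specific taggers can assemble their key: "value" serializations from one place. The pluralization graph itself is unchanged; below is a minimal sketch of what it does, assuming pynini is installed and graph_plural is importable as defined above (pluralize is a hypothetical helper, not a function in the module).

```python
# Illustrative sketch only, not part of the diff. Applies the unchanged English
# pluralization graph from graph_utils.py to a few written forms.
import pynini
from nemo_text_processing.text_normalization.en.graph_utils import graph_plural


def pluralize(word: str) -> str:
    # Compose the input string with the FST and read off the single best path.
    return pynini.shortestpath(word @ graph_plural).string()


for word in ["baby", "church", "cat"]:
    print(word, "->", pluralize(word))  # expected: babies, churches, cats
```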
205 changes: 179 additions & 26 deletions nemo_text_processing/text_normalization/hu/taggers/electronic.py
@@ -11,14 +11,29 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, NEMO_DIGIT, GraphFst, insert_space
from nemo_text_processing.text_normalization.es.utils import get_abs_path, load_labels

common_domains = [x[0] for x in load_labels(get_abs_path("data/electronic/domain.tsv"))]
symbols = [x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv"))]
from nemo_text_processing.text_normalization.de.utils import get_abs_path, load_labels
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_ALPHA,
NEMO_DIGIT,
NEMO_SPACE,
GraphFst,
at,
colon,
domain_string,
double_quotes,
double_slash,
http,
https,
period,
protocol_string,
username_string,
www,
)


class ElectronicFst(GraphFst):
@@ -35,29 +50,162 @@ class ElectronicFst(GraphFst):
def __init__(self, deterministic: bool = True):
super().__init__(name="electronic", kind="classify", deterministic=deterministic)

dot = pynini.accep(".")
accepted_common_domains = pynini.union(*common_domains)
accepted_symbols = pynini.union(*symbols) - dot
accepted_characters = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols)
acceepted_characters_with_dot = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols | dot)
dot = pynini.accep(period)

symbols = [x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv"))]
symbols = pynini.union(*symbols)
# all symbols
symbols_no_period = pynini.difference(symbols, dot) # alphabet of accepted symbols excluding the '.'
accepted_characters = pynini.closure(
(NEMO_ALPHA | NEMO_DIGIT | symbols_no_period), 1
) # alphabet of accepted chars excluding the '.'
all_characters = pynini.closure(
(NEMO_ALPHA | NEMO_DIGIT | symbols), 1
) # alphabet of accepted chars including the '.'

# domains
domain = dot + accepted_characters
domain_graph = (
pynutil.insert(domain_string + colon + NEMO_SPACE + double_quotes)
+ (accepted_characters + pynini.closure(domain, 1))
+ pynutil.insert(double_quotes)
)

# email
username = (
pynutil.insert("username: \"")
+ acceepted_characters_with_dot
+ pynutil.insert("\"")
+ pynini.cross('@', ' ')
pynutil.insert(username_string + colon + NEMO_SPACE + double_quotes)
+ all_characters
+ pynutil.insert(double_quotes)
+ pynini.cross(at, NEMO_SPACE)
)
domain_graph = accepted_characters + dot + accepted_characters
domain_graph = pynutil.insert("domain: \"") + domain_graph + pynutil.insert("\"")
domain_common_graph = (
pynutil.insert("domain: \"")
+ accepted_characters
+ accepted_common_domains
+ pynini.closure((accepted_symbols | dot) + pynini.closure(accepted_characters, 1), 0, 1)
+ pynutil.insert("\"")
email = username + domain_graph

# social media tags
tag = (
pynutil.delete(at)
+ pynutil.insert(username_string + colon + NEMO_SPACE + double_quotes)
+ (accepted_characters | (accepted_characters + pynini.closure(domain, 1)))
+ pynutil.insert(double_quotes)
)

# url
protocol_start = pynini.accep(https + colon + double_slash) | pynini.accep(http + colon + double_slash)
protocol_end = (
pynini.accep(www + period)
if deterministic
else (
pynini.accep(www + period)
| pynini.cross(www + period, "vé vé vé.")
| pynini.cross(www + period, "dupla vé dupla vé dupla vé.")
| pynini.cross(www + period, "kettős vé kettős vé kettős vé.")
)
)
protocol = protocol_start | protocol_end | (protocol_start + protocol_end)
protocol = (
pynutil.insert(protocol_string + colon + NEMO_SPACE + double_quotes)
+ protocol
+ pynutil.insert(double_quotes)
)
url = protocol + pynutil.insert(NEMO_SPACE) + (domain_graph)

graph = url | domain_graph | email | tag
self.graph = graph

final_graph = self.add_tokens(self.graph + pynutil.insert(" preserve_order: true"))
self.fst = final_graph.optimize()


'''
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.de.utils import get_abs_path, load_labels
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_ALPHA,
NEMO_DIGIT,
NEMO_NOT_SPACE,
NEMO_SIGMA,
NEMO_UPPER,
TO_UPPER,
GraphFst,
get_abs_path,
insert_space,
period,
)


class ElectronicFst(GraphFst):
"""
Finite state transducer for classifying electronic: email addresses
e.g. "abc@hotmail.com" -> electronic { username: "abc" domain: "hotmail.com" preserve_order: true }
e.g. "www.abc.com/123" -> electronic { protocol: "www." domain: "abc.com/123" preserve_order: true }

Args:
deterministic: if True will provide a single transduction option,
for False multiple transduction are generated (used for audio-based normalization)
"""

def __init__(self, deterministic: bool = True):
super().__init__(
name="electronic", kind="classify", deterministic=deterministic
)

dot = pynini.accep(period)

symbols = [
x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv"))
]
symbols = pynini.union(*symbols)
# all symbols
symbols_no_period = pynini.difference(
symbols, dot
) # alphabet of accepted symbols excluding the '.'
accepted_characters = pynini.closure(
(NEMO_ALPHA | NEMO_DIGIT | symbols_no_period), 1
) # alphabet of accepted chars excluding the '.'
all_characters = pynini.closure(
(NEMO_ALPHA | NEMO_DIGIT | symbols), 1
) # alphabet of accepted chars including the '.'

# domains
domain = dot + accepted_characters
domain_graph = (
pynutil.insert()
+ (accepted_characters + pynini.closure(domain, 1))
+ insert_double_quotes
)

# email
username = (
insert_username
+ all_characters
+ insert_double_quotes
+ pynini.cross("@", " ")
)
email = username + domain_graph

# social media tags
tag = (
pynutil.delete("@")
+ insert_username
+ (accepted_characters | (accepted_characters + pynini.closure(domain, 1)))
+ insert_double_quotes
)
graph = (username + domain_graph) | domain_common_graph

# url
protocol_start = pynini.accep("https://") | pynini.accep("http://")
@@ -72,9 +220,14 @@ def __init__(self, deterministic: bool = True):
)
)
protocol = protocol_start | protocol_end | (protocol_start + protocol_end)
protocol = pynutil.insert("protocol: \"") + protocol + pynutil.insert("\"")
graph |= protocol + insert_space + (domain_graph | domain_common_graph)
protocol = insert_protocol + protocol + insert_double_quotes
url = protocol + insert_space + (domain_graph)

graph = url | domain_graph | email | tag
self.graph = graph

final_graph = self.add_tokens(self.graph + pynutil.insert(" preserve_order: true"))
final_graph = self.add_tokens(
self.graph + pynutil.insert(" preserve_order: true")
)
self.fst = final_graph.optimize()
'''
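The rewritten Hungarian ElectronicFst tags e-mail addresses, bare domains, social-media handles, and URLs as electronic { ... } tokens, adding Hungarian readings of "www." (vé vé vé, dupla vé, kettős vé) when deterministic=False; the triple-quoted block at the end of the file is a commented-out variant left in the diff. Below is a minimal sketch of exercising the classifier on its own, outside the full tokenize-and-classify pipeline, assuming the grammar compiles and its data files are present; the sample inputs and the expected output shown in comments are illustrative only.

```python
# Illustrative sketch only, not part of the diff. Runs the Hungarian electronic
# classifier defined above on a couple of inputs; exact outputs are approximate.
import pynini
from nemo_text_processing.text_normalization.hu.taggers.electronic import ElectronicFst

electronic = ElectronicFst(deterministic=True)

for text in ["abc@nvidia.com", "https://www.nvidia.com"]:
    lattice = text @ electronic.fst  # compose the input with the classifier
    print(pynini.shortestpath(lattice).string())

# One possible tagging of the first input:
# electronic { username: "abc" domain: "nvidia.com" preserve_order: true }
```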