Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ pipeline {
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-27-23-0'
ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-13-23-2'
FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0'
HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-14-24-0'
PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
Expand Down
58 changes: 33 additions & 25 deletions nemo_text_processing/text_normalization/hu/taggers/electronic.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.de.utils import get_abs_path, load_labels
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, NEMO_DIGIT, GraphFst, insert_space
from nemo_text_processing.text_normalization.es.utils import get_abs_path, load_labels

common_domains = [x[0] for x in load_labels(get_abs_path("data/electronic/domain.tsv"))]
symbols = [x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv"))]


class ElectronicFst(GraphFst):
Expand All @@ -36,28 +35,35 @@ def __init__(self, deterministic: bool = True):
super().__init__(name="electronic", kind="classify", deterministic=deterministic)

dot = pynini.accep(".")
accepted_common_domains = pynini.union(*common_domains)
accepted_symbols = pynini.union(*symbols) - dot
accepted_characters = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols)
acceepted_characters_with_dot = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols | dot)

# email
username = (
pynutil.insert("username: \"")
+ acceepted_characters_with_dot
+ pynutil.insert("\"")
+ pynini.cross('@', ' ')
symbols = [x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv"))]
symbols = pynini.union(*symbols)
# all symbols
symbols_no_period = pynini.difference(symbols, dot) # alphabet of accepted symbols excluding the '.'
accepted_characters = pynini.closure(
(NEMO_ALPHA | NEMO_DIGIT | symbols_no_period), 1
) # alphabet of accepted chars excluding the '.'
all_characters = pynini.closure(
(NEMO_ALPHA | NEMO_DIGIT | symbols), 1
) # alphabet of accepted chars including the '.'

# domains
domain = dot + accepted_characters
domain_graph = (
pynutil.insert('domain: "') + (accepted_characters + pynini.closure(domain, 1)) + pynutil.insert('"')
)
domain_graph = accepted_characters + dot + accepted_characters
domain_graph = pynutil.insert("domain: \"") + domain_graph + pynutil.insert("\"")
domain_common_graph = (
pynutil.insert("domain: \"")
+ accepted_characters
+ accepted_common_domains
+ pynini.closure((accepted_symbols | dot) + pynini.closure(accepted_characters, 1), 0, 1)
+ pynutil.insert("\"")

# email
username = pynutil.insert('username: "') + all_characters + pynutil.insert('"') + pynini.cross("@", " ")
email = username + domain_graph

# social media tags
tag = (
pynini.cross("@", "")
+ pynutil.insert('username: "')
+ (accepted_characters | (accepted_characters + pynini.closure(domain, 1)))
+ pynutil.insert('"')
)
graph = (username + domain_graph) | domain_common_graph

# url
protocol_start = pynini.accep("https://") | pynini.accep("http://")
Expand All @@ -72,8 +78,10 @@ def __init__(self, deterministic: bool = True):
)
)
protocol = protocol_start | protocol_end | (protocol_start + protocol_end)
protocol = pynutil.insert("protocol: \"") + protocol + pynutil.insert("\"")
graph |= protocol + insert_space + (domain_graph | domain_common_graph)
protocol = pynutil.insert('protocol: "') + protocol + pynutil.insert('"')
url = protocol + insert_space + (domain_graph)

graph = url | domain_graph | email | tag
self.graph = graph

final_graph = self.add_tokens(self.graph + pynutil.insert(" preserve_order: true"))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def __init__(
os.makedirs(cache_dir, exist_ok=True)
whitelist_file = os.path.basename(whitelist) if whitelist else ""
far_file = os.path.join(
cache_dir, f"_{input_case}_hu_tn_{deterministic}_deterministic{whitelist_file}.far"
cache_dir, f"_{input_case}_hu_tn_{deterministic}_deterministic{whitelist_file}.far",
)
if not overwrite_cache and far_file and os.path.exists(far_file):
self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
Expand All @@ -86,10 +86,10 @@ def __init__(
self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic)
decimal_graph = self.decimal.fst

self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic)
self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic,)
fraction_graph = self.fraction.fst
self.measure = MeasureFst(
cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic
cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic,
)
measure_graph = self.measure.fst
self.date = DateFst(cardinal=self.cardinal, deterministic=deterministic)
Expand All @@ -101,7 +101,7 @@ def __init__(
telephone_graph = self.telephone.fst
self.electronic = ElectronicFst(deterministic=deterministic)
electronic_graph = self.electronic.fst
self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic)
self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,)
money_graph = self.money.fst
self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist)
whitelist_graph = self.whitelist.fst
Expand All @@ -118,7 +118,7 @@ def __init__(
| pynutil.add_weight(decimal_graph, 1.1)
| pynutil.add_weight(money_graph, 1.1)
| pynutil.add_weight(telephone_graph, 1.1)
| pynutil.add_weight(electronic_graph, 1.1)
| pynutil.add_weight(electronic_graph, 1.11)
| pynutil.add_weight(word_graph, 200)
)
punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_NOT_QUOTE,
NEMO_SIGMA,
NEMO_SPACE,
GraphFst,
delete_preserve_order,
insert_space,
Expand Down Expand Up @@ -64,21 +65,24 @@ def add_space_after_char():

verbalize_characters = pynini.cdrewrite(graph_symbols | graph_digit, "", "", NEMO_SIGMA)

user_name = pynutil.delete("username: \"") + add_space_after_char() + pynutil.delete("\"")
user_name = pynutil.delete('username: "') + add_space_after_char() + pynutil.delete('"')
user_name @= verbalize_characters

convert_defaults = pynutil.add_weight(NEMO_NOT_QUOTE, weight=0.0001) | domain_common | server_common
domain = convert_defaults + pynini.closure(insert_space + convert_defaults)
domain @= verbalize_characters

domain = pynutil.delete("domain: \"") + domain + pynutil.delete("\"")
domain = pynutil.delete('domain: "') + domain + pynutil.delete('"')
protocol = (
pynutil.delete("protocol: \"")
pynutil.delete('protocol: "')
+ add_space_after_char() @ pynini.cdrewrite(graph_symbols, "", "", NEMO_SIGMA)
+ pynutil.delete("\"")
+ pynutil.delete('"')
)
self.graph = (pynini.closure(protocol + pynini.accep(" "), 0, 1) + domain) | (
user_name + pynini.accep(" ") + at_sign + domain
# self.graph = (pynini.closure(protocol + pynini.accep(" "), 0, 1) + domain) | (
# user_name + pynini.accep(" ") + at_sign + domain
# )
self.graph = (pynini.closure(protocol + NEMO_SPACE, 0, 1) + domain) | (
user_name + NEMO_SPACE + pynutil.insert("kukac ") + domain | (pynutil.insert("kukac ") + user_name)
)
delete_tokens = self.delete_tokens(self.graph + delete_preserve_order)
self.fst = delete_tokens.optimize()
Original file line number Diff line number Diff line change
@@ -1,2 +1,8 @@
abc@def.hu~a b c kukac d e f pont hu
https://www.nvidia.com~h t t p s kettőspont perjel perjel w w w pont nvidia pont com
https://www.nvidia.com.~h t t p s kettőspont perjel perjel w w w pont nvidia pont com .
@jensen~kukac j e n s e n
www.nasa.gov.~w w w pont n a s a pont gov .
www.enveedya.com.hu.~w w w pont e n v e e d y a pont com pont hu .
@jensen.me~kukac j e n s e n pont m e
@wezyr1986~kukac w e z y r egy kilenc nyolc hat
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#! /bin/sh

GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"}
PROJECT_DIR=${2:-"/workspace/tests/en"}
PROJECT_DIR=${2:-"/workspace/tests/"}

runtest () {
input=$1
Expand Down