Skip to content

Commit d7da1ce

Browse files
HU TN Fixes issue NVIDIA#166 (NVIDIA#184)
* Fixes issue NVIDIA#166
  Signed-off-by: Simon Zuberek <szuberek@nvidia.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* Implements aliases for common string literals
  Signed-off-by: Simon Zuberek <szuberek@nvidia.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* Fixes the period variable
  Signed-off-by: Simon Zuberek <szuberek@nvidia.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
---------
Signed-off-by: Simon Zuberek <szuberek@nvidia.com>
Co-authored-by: Simon Zuberek <szuberek@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent eefbc2b commit d7da1ce

File tree

6 files changed

+117
-54
lines changed

6 files changed

+117
-54
lines changed

Jenkinsfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ pipeline {
1717
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-27-23-0'
1818
ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-13-23-2'
1919
FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0'
20-
HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
20+
HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0'
2121
PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
2222
RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
2323
VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'

nemo_text_processing/text_normalization/hu/taggers/electronic.py

Lines changed: 66 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,29 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14+
15+
1416
import pynini
1517
from pynini.lib import pynutil
1618

17-
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, NEMO_DIGIT, GraphFst, insert_space
18-
from nemo_text_processing.text_normalization.es.utils import get_abs_path, load_labels
19-
20-
common_domains = [x[0] for x in load_labels(get_abs_path("data/electronic/domain.tsv"))]
21-
symbols = [x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv"))]
19+
from nemo_text_processing.text_normalization.de.utils import get_abs_path, load_labels
20+
from nemo_text_processing.text_normalization.en.graph_utils import (
21+
NEMO_ALPHA,
22+
NEMO_DIGIT,
23+
NEMO_SPACE,
24+
GraphFst,
25+
at,
26+
colon,
27+
domain_string,
28+
double_quotes,
29+
double_slash,
30+
http,
31+
https,
32+
period,
33+
protocol_string,
34+
username_string,
35+
www,
36+
)
2237

2338

2439
class ElectronicFst(GraphFst):
@@ -35,45 +50,65 @@ class ElectronicFst(GraphFst):
3550
def __init__(self, deterministic: bool = True):
3651
super().__init__(name="electronic", kind="classify", deterministic=deterministic)
3752

38-
dot = pynini.accep(".")
39-
accepted_common_domains = pynini.union(*common_domains)
40-
accepted_symbols = pynini.union(*symbols) - dot
41-
accepted_characters = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols)
42-
acceepted_characters_with_dot = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols | dot)
53+
period_fst = pynini.accep(period)
54+
55+
symbols = [x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv"))]
56+
symbols = pynini.union(*symbols)
57+
# all symbols
58+
symbols_no_period = pynini.difference(symbols, period_fst) # alphabet of accepted symbols excluding the '.'
59+
accepted_characters = pynini.closure(
60+
(NEMO_ALPHA | NEMO_DIGIT | symbols_no_period), 1
61+
) # alphabet of accepted chars excluding the '.'
62+
all_characters = pynini.closure(
63+
(NEMO_ALPHA | NEMO_DIGIT | symbols), 1
64+
) # alphabet of accepted chars including the '.'
65+
66+
# domains
67+
domain = period_fst + accepted_characters
68+
domain_graph = (
69+
pynutil.insert(domain_string + colon + NEMO_SPACE + double_quotes)
70+
+ (accepted_characters + pynini.closure(domain, 1))
71+
+ pynutil.insert(double_quotes)
72+
)
4373

4474
# email
4575
username = (
46-
pynutil.insert("username: \"")
47-
+ acceepted_characters_with_dot
48-
+ pynutil.insert("\"")
49-
+ pynini.cross('@', ' ')
76+
pynutil.insert(username_string + colon + NEMO_SPACE + double_quotes)
77+
+ all_characters
78+
+ pynutil.insert(double_quotes)
79+
+ pynini.cross(at, NEMO_SPACE)
5080
)
51-
domain_graph = accepted_characters + dot + accepted_characters
52-
domain_graph = pynutil.insert("domain: \"") + domain_graph + pynutil.insert("\"")
53-
domain_common_graph = (
54-
pynutil.insert("domain: \"")
55-
+ accepted_characters
56-
+ accepted_common_domains
57-
+ pynini.closure((accepted_symbols | dot) + pynini.closure(accepted_characters, 1), 0, 1)
58-
+ pynutil.insert("\"")
81+
email = username + domain_graph
82+
83+
# social media tags
84+
tag = (
85+
pynutil.delete(at)
86+
+ pynutil.insert(username_string + colon + NEMO_SPACE + double_quotes)
87+
+ (accepted_characters | (accepted_characters + pynini.closure(domain, 1)))
88+
+ pynutil.insert(double_quotes)
5989
)
60-
graph = (username + domain_graph) | domain_common_graph
6190

6291
# url
63-
protocol_start = pynini.accep("https://") | pynini.accep("http://")
92+
protocol_start = pynini.accep(https + colon + double_slash) | pynini.accep(http + colon + double_slash)
6493
protocol_end = (
65-
pynini.accep("www.")
94+
pynini.accep(www + period)
6695
if deterministic
6796
else (
68-
pynini.accep("www.")
69-
| pynini.cross("www.", "vé vé vé.")
70-
| pynini.cross("www.", "dupla vé dupla vé dupla vé.")
71-
| pynini.cross("www.", "kettős vé kettős vé kettős vé.")
97+
pynini.accep(www + period)
98+
| pynini.cross(www + period, "vé vé vé.")
99+
| pynini.cross(www + period, "dupla vé dupla vé dupla vé.")
100+
| pynini.cross(www + period, "kettős vé kettős vé kettős vé.")
72101
)
73102
)
74103
protocol = protocol_start | protocol_end | (protocol_start + protocol_end)
75-
protocol = pynutil.insert("protocol: \"") + protocol + pynutil.insert("\"")
76-
graph |= protocol + insert_space + (domain_graph | domain_common_graph)
104+
protocol = (
105+
pynutil.insert(protocol_string + colon + NEMO_SPACE + double_quotes)
106+
+ protocol
107+
+ pynutil.insert(double_quotes)
108+
)
109+
url = protocol + pynutil.insert(NEMO_SPACE) + (domain_graph)
110+
111+
graph = url | domain_graph | email | tag
77112
self.graph = graph
78113

79114
final_graph = self.add_tokens(self.graph + pynutil.insert(" preserve_order: true"))

nemo_text_processing/text_normalization/hu/taggers/tokenize_and_classify.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ def __init__(
6969
os.makedirs(cache_dir, exist_ok=True)
7070
whitelist_file = os.path.basename(whitelist) if whitelist else ""
7171
far_file = os.path.join(
72-
cache_dir, f"_{input_case}_hu_tn_{deterministic}_deterministic{whitelist_file}.far"
72+
cache_dir, f"_{input_case}_hu_tn_{deterministic}_deterministic{whitelist_file}.far",
7373
)
7474
if not overwrite_cache and far_file and os.path.exists(far_file):
7575
self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
@@ -86,10 +86,10 @@ def __init__(
8686
self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic)
8787
decimal_graph = self.decimal.fst
8888

89-
self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic)
89+
self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic,)
9090
fraction_graph = self.fraction.fst
9191
self.measure = MeasureFst(
92-
cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic
92+
cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic,
9393
)
9494
measure_graph = self.measure.fst
9595
self.date = DateFst(cardinal=self.cardinal, deterministic=deterministic)
@@ -101,7 +101,7 @@ def __init__(
101101
telephone_graph = self.telephone.fst
102102
self.electronic = ElectronicFst(deterministic=deterministic)
103103
electronic_graph = self.electronic.fst
104-
self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic)
104+
self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,)
105105
money_graph = self.money.fst
106106
self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist)
107107
whitelist_graph = self.whitelist.fst
@@ -118,7 +118,7 @@ def __init__(
118118
| pynutil.add_weight(decimal_graph, 1.1)
119119
| pynutil.add_weight(money_graph, 1.1)
120120
| pynutil.add_weight(telephone_graph, 1.1)
121-
| pynutil.add_weight(electronic_graph, 1.1)
121+
| pynutil.add_weight(electronic_graph, 1.11)
122122
| pynutil.add_weight(word_graph, 200)
123123
)
124124
punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")

nemo_text_processing/text_normalization/hu/verbalizers/electronic.py

Lines changed: 38 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,20 @@
1717
from nemo_text_processing.text_normalization.en.graph_utils import (
1818
NEMO_NOT_QUOTE,
1919
NEMO_SIGMA,
20+
NEMO_SPACE,
2021
GraphFst,
22+
at,
23+
colon,
2124
delete_preserve_order,
22-
insert_space,
25+
domain_string,
26+
double_quotes,
27+
double_slash,
28+
http,
29+
https,
30+
period,
31+
protocol_string,
32+
username_string,
33+
www,
2334
)
2435
from nemo_text_processing.text_normalization.hu.utils import get_abs_path
2536

@@ -30,6 +41,12 @@
3041
server_common = pynini.string_file(get_abs_path("data/electronic/server_name.tsv"))
3142
domain_common = pynini.string_file(get_abs_path("data/electronic/domain.tsv"))
3243

44+
accept_space = pynini.accep(NEMO_SPACE)
45+
delete_username = pynutil.delete(username_string + colon + NEMO_SPACE + double_quotes)
46+
delete_double_quotes = pynutil.delete(double_quotes)
47+
delete_domain = pynutil.delete(domain_string + colon + NEMO_SPACE + double_quotes)
48+
delete_protocol = pynutil.delete(protocol_string + colon + NEMO_SPACE + double_quotes)
49+
3350

3451
class ElectronicFst(GraphFst):
3552
"""
@@ -49,36 +66,41 @@ def __init__(self, deterministic: bool = True):
4966
graph_digit = digit_no_zero | zero
5067

5168
def add_space_after_char():
52-
return pynini.closure(NEMO_NOT_QUOTE - pynini.accep(" ") + insert_space) + (
53-
NEMO_NOT_QUOTE - pynini.accep(" ")
69+
return pynini.closure(NEMO_NOT_QUOTE - accept_space + pynutil.insert(NEMO_SPACE)) + (
70+
NEMO_NOT_QUOTE - accept_space
5471
)
5572

73+
hungarian_at = [
74+
"kukacjel ",
75+
"csiga ",
76+
"ormány ",
77+
"farkas á ",
78+
"bejgli ",
79+
"at-jel ",
80+
]
5681
at_sign = pynutil.insert("kukac ")
5782
if not deterministic:
58-
at_sign |= pynutil.insert("kukacjel ")
59-
at_sign |= pynutil.insert("csiga ")
60-
at_sign |= pynutil.insert("ormány ")
61-
at_sign |= pynutil.insert("farkas á ")
62-
at_sign |= pynutil.insert("bejgli ")
63-
at_sign |= pynutil.insert("at-jel ")
83+
for sign in hungarian_at:
84+
at_sign |= pynutil.insert(sign)
6485

6586
verbalize_characters = pynini.cdrewrite(graph_symbols | graph_digit, "", "", NEMO_SIGMA)
6687

67-
user_name = pynutil.delete("username: \"") + add_space_after_char() + pynutil.delete("\"")
88+
user_name = delete_username + add_space_after_char() + delete_double_quotes
6889
user_name @= verbalize_characters
6990

7091
convert_defaults = pynutil.add_weight(NEMO_NOT_QUOTE, weight=0.0001) | domain_common | server_common
71-
domain = convert_defaults + pynini.closure(insert_space + convert_defaults)
92+
domain = convert_defaults + pynini.closure(pynutil.insert(NEMO_SPACE) + convert_defaults)
7293
domain @= verbalize_characters
7394

74-
domain = pynutil.delete("domain: \"") + domain + pynutil.delete("\"")
95+
domain = delete_domain + domain + delete_double_quotes
7596
protocol = (
76-
pynutil.delete("protocol: \"")
97+
delete_protocol
7798
+ add_space_after_char() @ pynini.cdrewrite(graph_symbols, "", "", NEMO_SIGMA)
78-
+ pynutil.delete("\"")
99+
+ delete_double_quotes
79100
)
80-
self.graph = (pynini.closure(protocol + pynini.accep(" "), 0, 1) + domain) | (
81-
user_name + pynini.accep(" ") + at_sign + domain
101+
102+
self.graph = (pynini.closure(protocol + NEMO_SPACE, 0, 1) + domain) | (
103+
user_name + NEMO_SPACE + at_sign + domain | (at_sign + user_name)
82104
)
83105
delete_tokens = self.delete_tokens(self.graph + delete_preserve_order)
84106
self.fst = delete_tokens.optimize()
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,8 @@
11
abc@def.hu~a b c kukac d e f pont hu
22
https://www.nvidia.com~h t t p s kettőspont perjel perjel w w w pont nvidia pont com
3+
https://www.nvidia.com.~h t t p s kettőspont perjel perjel w w w pont nvidia pont com .
4+
@jensen~kukac j e n s e n
5+
www.nasa.gov.~w w w pont n a s a pont gov .
6+
www.enveedya.com.hu.~w w w pont e n v e e d y a pont com pont hu .
7+
@jensen.me~kukac j e n s e n pont m e
8+
@wezyr1986~kukac w e z y r egy kilenc nyolc hat

tests/nemo_text_processing/hu/test_sparrowhawk_normalization.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#! /bin/sh
22

33
GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"}
4-
PROJECT_DIR=${2:-"/workspace/tests/en"}
4+
PROJECT_DIR=${2:-"/workspace/tests/"}
55

66
runtest () {
77
input=$1

0 commit comments

Comments
 (0)