
Commit b32d1b3

Revert "HU TN Fixes issue #166 (#184)"
This reverts commit aa7cf17.

Signed-off-by: Simon Zuberek <szuberek@nvidia.com>

1 parent: 1d1374b · commit: b32d1b3


6 files changed: +54 additions, −117 deletions


Jenkinsfile

Lines changed: 1 addition & 1 deletion

@@ -17,7 +17,7 @@ pipeline {
         ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-27-23-0'
         ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-13-23-2'
         FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0'
-        HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0'
+        HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
         PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
         RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
         VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'

nemo_text_processing/text_normalization/hu/taggers/electronic.py

Lines changed: 31 additions & 66 deletions

@@ -11,29 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-
 import pynini
 from pynini.lib import pynutil
 
-from nemo_text_processing.text_normalization.de.utils import get_abs_path, load_labels
-from nemo_text_processing.text_normalization.en.graph_utils import (
-    NEMO_ALPHA,
-    NEMO_DIGIT,
-    NEMO_SPACE,
-    GraphFst,
-    at,
-    colon,
-    domain_string,
-    double_quotes,
-    double_slash,
-    http,
-    https,
-    period,
-    protocol_string,
-    username_string,
-    www,
-)
+from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, NEMO_DIGIT, GraphFst, insert_space
+from nemo_text_processing.text_normalization.es.utils import get_abs_path, load_labels
+
+common_domains = [x[0] for x in load_labels(get_abs_path("data/electronic/domain.tsv"))]
+symbols = [x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv"))]
 
 
 class ElectronicFst(GraphFst):
@@ -50,65 +35,45 @@ class ElectronicFst(GraphFst):
     def __init__(self, deterministic: bool = True):
         super().__init__(name="electronic", kind="classify", deterministic=deterministic)
 
-        period_fst = pynini.accep(period)
-
-        symbols = [x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv"))]
-        symbols = pynini.union(*symbols)
-        # all symbols
-        symbols_no_period = pynini.difference(symbols, period_fst)  # alphabet of accepted symbols excluding the '.'
-        accepted_characters = pynini.closure(
-            (NEMO_ALPHA | NEMO_DIGIT | symbols_no_period), 1
-        )  # alphabet of accepted chars excluding the '.'
-        all_characters = pynini.closure(
-            (NEMO_ALPHA | NEMO_DIGIT | symbols), 1
-        )  # alphabet of accepted chars including the '.'
-
-        # domains
-        domain = period_fst + accepted_characters
-        domain_graph = (
-            pynutil.insert(domain_string + colon + NEMO_SPACE + double_quotes)
-            + (accepted_characters + pynini.closure(domain, 1))
-            + pynutil.insert(double_quotes)
-        )
+        dot = pynini.accep(".")
+        accepted_common_domains = pynini.union(*common_domains)
+        accepted_symbols = pynini.union(*symbols) - dot
+        accepted_characters = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols)
+        acceepted_characters_with_dot = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols | dot)
 
         # email
         username = (
-            pynutil.insert(username_string + colon + NEMO_SPACE + double_quotes)
-            + all_characters
-            + pynutil.insert(double_quotes)
-            + pynini.cross(at, NEMO_SPACE)
+            pynutil.insert("username: \"")
+            + acceepted_characters_with_dot
+            + pynutil.insert("\"")
+            + pynini.cross('@', ' ')
         )
-        email = username + domain_graph
-
-        # social media tags
-        tag = (
-            pynutil.delete(at)
-            + pynutil.insert(username_string + colon + NEMO_SPACE + double_quotes)
-            + (accepted_characters | (accepted_characters + pynini.closure(domain, 1)))
-            + pynutil.insert(double_quotes)
+        domain_graph = accepted_characters + dot + accepted_characters
+        domain_graph = pynutil.insert("domain: \"") + domain_graph + pynutil.insert("\"")
+        domain_common_graph = (
+            pynutil.insert("domain: \"")
+            + accepted_characters
+            + accepted_common_domains
+            + pynini.closure((accepted_symbols | dot) + pynini.closure(accepted_characters, 1), 0, 1)
+            + pynutil.insert("\"")
         )
+        graph = (username + domain_graph) | domain_common_graph
 
         # url
-        protocol_start = pynini.accep(https + colon + double_slash) | pynini.accep(http + colon + double_slash)
+        protocol_start = pynini.accep("https://") | pynini.accep("http://")
         protocol_end = (
-            pynini.accep(www + period)
+            pynini.accep("www.")
             if deterministic
             else (
-                pynini.accep(www + period)
-                | pynini.cross(www + period, "vé vé vé.")
-                | pynini.cross(www + period, "dupla vé dupla vé dupla vé.")
-                | pynini.cross(www + period, "kettős vé kettős vé kettős vé.")
+                pynini.accep("www.")
+                | pynini.cross("www.", "vé vé vé.")
+                | pynini.cross("www.", "dupla vé dupla vé dupla vé.")
+                | pynini.cross("www.", "kettős vé kettős vé kettős vé.")
             )
         )
         protocol = protocol_start | protocol_end | (protocol_start + protocol_end)
-        protocol = (
-            pynutil.insert(protocol_string + colon + NEMO_SPACE + double_quotes)
-            + protocol
-            + pynutil.insert(double_quotes)
-        )
-        url = protocol + pynutil.insert(NEMO_SPACE) + (domain_graph)
-
-        graph = url | domain_graph | email | tag
+        protocol = pynutil.insert("protocol: \"") + protocol + pynutil.insert("\"")
+        graph |= protocol + insert_space + (domain_graph | domain_common_graph)
         self.graph = graph
 
         final_graph = self.add_tokens(self.graph + pynutil.insert(" preserve_order: true"))
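
As a sanity check on the restored tagger, something like the snippet below could be run; it is an illustrative sketch, not part of this commit, and it assumes a local nemo_text_processing checkout plus the usual GraphFst convention that the class exposes an optimized `fst` attribute (the tagged output shown in the comment is approximate).

# Illustrative sketch only (not from this commit); assumes nemo_text_processing is importable.
import pynini
from nemo_text_processing.text_normalization.hu.taggers.electronic import ElectronicFst

tagger = ElectronicFst(deterministic=True)
# Compose an input string with the classifier FST and keep the lowest-weight path.
lattice = pynini.escape("abc@def.hu") @ tagger.fst
print(pynini.shortestpath(lattice).string())
# Approximate expected shape: electronic { username: "abc" domain: "def.hu" preserve_order: true }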

nemo_text_processing/text_normalization/hu/taggers/tokenize_and_classify.py

Lines changed: 5 additions & 5 deletions

@@ -69,7 +69,7 @@ def __init__(
             os.makedirs(cache_dir, exist_ok=True)
             whitelist_file = os.path.basename(whitelist) if whitelist else ""
             far_file = os.path.join(
-                cache_dir, f"_{input_case}_hu_tn_{deterministic}_deterministic{whitelist_file}.far",
+                cache_dir, f"_{input_case}_hu_tn_{deterministic}_deterministic{whitelist_file}.far"
             )
         if not overwrite_cache and far_file and os.path.exists(far_file):
             self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
@@ -86,10 +86,10 @@ def __init__(
         self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic)
         decimal_graph = self.decimal.fst
 
-        self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic,)
+        self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic)
         fraction_graph = self.fraction.fst
         self.measure = MeasureFst(
-            cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic,
+            cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic
         )
         measure_graph = self.measure.fst
         self.date = DateFst(cardinal=self.cardinal, deterministic=deterministic)
@@ -101,7 +101,7 @@ def __init__(
         telephone_graph = self.telephone.fst
         self.electronic = ElectronicFst(deterministic=deterministic)
         electronic_graph = self.electronic.fst
-        self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,)
+        self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic)
         money_graph = self.money.fst
         self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist)
         whitelist_graph = self.whitelist.fst
@@ -118,7 +118,7 @@ def __init__(
             | pynutil.add_weight(decimal_graph, 1.1)
             | pynutil.add_weight(money_graph, 1.1)
             | pynutil.add_weight(telephone_graph, 1.1)
-            | pynutil.add_weight(electronic_graph, 1.11)
+            | pynutil.add_weight(electronic_graph, 1.1)
             | pynutil.add_weight(word_graph, 200)
         )
         punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")

nemo_text_processing/text_normalization/hu/verbalizers/electronic.py

Lines changed: 16 additions & 38 deletions

@@ -17,20 +17,9 @@
 from nemo_text_processing.text_normalization.en.graph_utils import (
     NEMO_NOT_QUOTE,
     NEMO_SIGMA,
-    NEMO_SPACE,
     GraphFst,
-    at,
-    colon,
     delete_preserve_order,
-    domain_string,
-    double_quotes,
-    double_slash,
-    http,
-    https,
-    period,
-    protocol_string,
-    username_string,
-    www,
+    insert_space,
 )
 from nemo_text_processing.text_normalization.hu.utils import get_abs_path
 
@@ -41,12 +30,6 @@
 server_common = pynini.string_file(get_abs_path("data/electronic/server_name.tsv"))
 domain_common = pynini.string_file(get_abs_path("data/electronic/domain.tsv"))
 
-accept_space = pynini.accep(NEMO_SPACE)
-delete_username = pynutil.delete(username_string + colon + NEMO_SPACE + double_quotes)
-delete_double_quotes = pynutil.delete(double_quotes)
-delete_domain = pynutil.delete(domain_string + colon + NEMO_SPACE + double_quotes)
-delete_protocol = pynutil.delete(protocol_string + colon + NEMO_SPACE + double_quotes)
-
 
 class ElectronicFst(GraphFst):
     """
@@ -66,41 +49,36 @@ def __init__(self, deterministic: bool = True):
         graph_digit = digit_no_zero | zero
 
         def add_space_after_char():
-            return pynini.closure(NEMO_NOT_QUOTE - accept_space + pynutil.insert(NEMO_SPACE)) + (
-                NEMO_NOT_QUOTE - accept_space
+            return pynini.closure(NEMO_NOT_QUOTE - pynini.accep(" ") + insert_space) + (
+                NEMO_NOT_QUOTE - pynini.accep(" ")
             )
 
-        hungarian_at = [
-            "kukacjel ",
-            "csiga ",
-            "ormány ",
-            "farkas á ",
-            "bejgli ",
-            "at-jel ",
-        ]
         at_sign = pynutil.insert("kukac ")
         if not deterministic:
-            for sign in hungarian_at:
-                at_sign |= pynutil.insert(sign)
+            at_sign |= pynutil.insert("kukacjel ")
+            at_sign |= pynutil.insert("csiga ")
+            at_sign |= pynutil.insert("ormány ")
+            at_sign |= pynutil.insert("farkas á ")
+            at_sign |= pynutil.insert("bejgli ")
+            at_sign |= pynutil.insert("at-jel ")
 
         verbalize_characters = pynini.cdrewrite(graph_symbols | graph_digit, "", "", NEMO_SIGMA)
 
-        user_name = delete_username + add_space_after_char() + delete_double_quotes
+        user_name = pynutil.delete("username: \"") + add_space_after_char() + pynutil.delete("\"")
         user_name @= verbalize_characters
 
         convert_defaults = pynutil.add_weight(NEMO_NOT_QUOTE, weight=0.0001) | domain_common | server_common
-        domain = convert_defaults + pynini.closure(pynutil.insert(NEMO_SPACE) + convert_defaults)
+        domain = convert_defaults + pynini.closure(insert_space + convert_defaults)
         domain @= verbalize_characters
 
-        domain = delete_domain + domain + delete_double_quotes
+        domain = pynutil.delete("domain: \"") + domain + pynutil.delete("\"")
         protocol = (
-            delete_protocol
+            pynutil.delete("protocol: \"")
             + add_space_after_char() @ pynini.cdrewrite(graph_symbols, "", "", NEMO_SIGMA)
-            + delete_double_quotes
+            + pynutil.delete("\"")
         )
-
-        self.graph = (pynini.closure(protocol + NEMO_SPACE, 0, 1) + domain) | (
-            user_name + NEMO_SPACE + at_sign + domain | (at_sign + user_name)
+        self.graph = (pynini.closure(protocol + pynini.accep(" "), 0, 1) + domain) | (
+            user_name + pynini.accep(" ") + at_sign + domain
         )
         delete_tokens = self.delete_tokens(self.graph + delete_preserve_order)
         self.fst = delete_tokens.optimize()
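
The verbalizer side can be exercised in a similar, purely illustrative way; the tagged input string below is an assumption about the serialized token format rather than something this commit asserts, and the spoken output in the comment is approximate.

# Illustrative sketch only; the tagged string is an assumed serialization.
import pynini
from nemo_text_processing.text_normalization.hu.verbalizers.electronic import ElectronicFst

verbalizer = ElectronicFst(deterministic=True)
tagged = 'electronic { username: "abc" domain: "def.hu" preserve_order: true }'
# Compose the tagged token with the verbalizer FST and print the best path.
print(pynini.shortestpath(pynini.escape(tagged) @ verbalizer.fst).string())
# Roughly: a b c kukac d e f pont hu
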
Lines changed: 0 additions & 6 deletions

@@ -1,8 +1,2 @@
 abc@def.hu~a b c kukac d e f pont hu
 https://www.nvidia.com~h t t p s kettőspont perjel perjel w w w pont nvidia pont com
-https://www.nvidia.com.~h t t p s kettőspont perjel perjel w w w pont nvidia pont com .
-@jensen~kukac j e n s e n
-www.nasa.gov.~w w w pont n a s a pont gov .
-www.enveedya.com.hu.~w w w pont e n v e e d y a pont com pont hu .
-@jensen.me~kukac j e n s e n pont m e
-@wezyr1986~kukac w e z y r egy kilenc nyolc hat
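
Each line in that file pairs written input with the expected spoken form, separated by `~`. As a rough illustration (assuming the high-level Normalizer API and that the Hungarian grammars build locally), a single case could be checked like this:

# Rough illustration only; the first run compiles the grammars and can take a while.
from nemo_text_processing.text_normalization.normalize import Normalizer

normalizer = Normalizer(input_case="cased", lang="hu")
written, spoken = "abc@def.hu~a b c kukac d e f pont hu".split("~")
assert normalizer.normalize(written) == spoken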

tests/nemo_text_processing/hu/test_sparrowhawk_normalization.sh

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 #! /bin/sh
 
 GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"}
-PROJECT_DIR=${2:-"/workspace/tests/"}
+PROJECT_DIR=${2:-"/workspace/tests/en"}
 
 runtest () {
   input=$1
